chiark / gitweb /
Fix previous commit for !HAVE_AUDIT
[elogind.git] / src / nspawn / nspawn.c
1 /*-*- Mode: C; c-basic-offset: 8; indent-tabs-mode: nil -*-*/
2
3 /***
4   This file is part of systemd.
5
6   Copyright 2010 Lennart Poettering
7
8   systemd is free software; you can redistribute it and/or modify it
9   under the terms of the GNU Lesser General Public License as published by
10   the Free Software Foundation; either version 2.1 of the License, or
11   (at your option) any later version.
12
13   systemd is distributed in the hope that it will be useful, but
14   WITHOUT ANY WARRANTY; without even the implied warranty of
15   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
16   Lesser General Public License for more details.
17
18   You should have received a copy of the GNU Lesser General Public License
19   along with systemd; If not, see <http://www.gnu.org/licenses/>.
20 ***/
21
22 #include <signal.h>
23 #include <sched.h>
24 #include <unistd.h>
25 #include <sys/types.h>
26 #include <sys/syscall.h>
27 #include <sys/mount.h>
28 #include <sys/wait.h>
29 #include <stdlib.h>
30 #include <string.h>
31 #include <stdio.h>
32 #include <errno.h>
33 #include <sys/prctl.h>
34 #include <sys/capability.h>
35 #include <getopt.h>
36 #include <sys/poll.h>
37 #include <sys/epoll.h>
38 #include <termios.h>
39 #include <sys/signalfd.h>
40 #include <grp.h>
41 #include <linux/fs.h>
42 #include <sys/un.h>
43 #include <sys/socket.h>
44
45 #ifdef HAVE_XATTR
46 #include <attr/xattr.h>
47 #endif
48
49 #include <systemd/sd-daemon.h>
50
51 #include "log.h"
52 #include "util.h"
53 #include "mkdir.h"
54 #include "macro.h"
55 #include "audit.h"
56 #include "missing.h"
57 #include "cgroup-util.h"
58 #include "strv.h"
59 #include "path-util.h"
60 #include "loopback-setup.h"
61 #include "sd-id128.h"
62 #include "dev-setup.h"
63 #include "fdset.h"
64 #include "build.h"
65 #include "fileio.h"
66
/* Fallback group id for the "gid=" option of the container's devpts
 * mount (see mount_all()); only used if nothing defined it earlier. */
#if !defined(TTY_GID)
#define TTY_GID 5
#endif
70
/* How the journal directory of the container is tied to the one of the
 * host; selected with --link-journal= or -j. */
typedef enum LinkJournal {
        LINK_NO = 0,    /* setup_journal() does nothing at all */
        LINK_AUTO,      /* default; setup_journal() quietly gives up in cases
                         * where the explicit modes report an error */
        LINK_HOST,      /* host directory is created and bind mounted into the guest */
        LINK_GUEST      /* host path becomes a symlink to the guest directory */
} LinkJournal;
77
78 static char *arg_directory = NULL;
79 static char *arg_user = NULL;
80 static char **arg_controllers = NULL;
81 static char *arg_uuid = NULL;
82 static char *arg_machine = NULL;
83 static bool arg_private_network = false;
84 static bool arg_read_only = false;
85 static bool arg_boot = false;
86 static LinkJournal arg_link_journal = LINK_AUTO;
87 static uint64_t arg_retain =
88         (1ULL << CAP_CHOWN) |
89         (1ULL << CAP_DAC_OVERRIDE) |
90         (1ULL << CAP_DAC_READ_SEARCH) |
91         (1ULL << CAP_FOWNER) |
92         (1ULL << CAP_FSETID) |
93         (1ULL << CAP_IPC_OWNER) |
94         (1ULL << CAP_KILL) |
95         (1ULL << CAP_LEASE) |
96         (1ULL << CAP_LINUX_IMMUTABLE) |
97         (1ULL << CAP_NET_BIND_SERVICE) |
98         (1ULL << CAP_NET_BROADCAST) |
99         (1ULL << CAP_NET_RAW) |
100         (1ULL << CAP_SETGID) |
101         (1ULL << CAP_SETFCAP) |
102         (1ULL << CAP_SETPCAP) |
103         (1ULL << CAP_SETUID) |
104         (1ULL << CAP_SYS_ADMIN) |
105         (1ULL << CAP_SYS_CHROOT) |
106         (1ULL << CAP_SYS_NICE) |
107         (1ULL << CAP_SYS_PTRACE) |
108         (1ULL << CAP_SYS_TTY_CONFIG) |
109         (1ULL << CAP_SYS_RESOURCE) |
110         (1ULL << CAP_SYS_BOOT) |
111         (1ULL << CAP_AUDIT_WRITE) |
112         (1ULL << CAP_AUDIT_CONTROL);
113 static char **arg_bind = NULL;
114 static char **arg_bind_ro = NULL;
115
116 static int help(void) {
117
118         printf("%s [OPTIONS...] [PATH] [ARGUMENTS...]\n\n"
119                "Spawn a minimal namespace container for debugging, testing and building.\n\n"
120                "  -h --help                Show this help\n"
121                "     --version             Print version string\n"
122                "  -D --directory=NAME      Root directory for the container\n"
123                "  -b --boot                Boot up full system (i.e. invoke init)\n"
124                "  -u --user=USER           Run the command under specified user or uid\n"
125                "  -C --controllers=LIST    Put the container in specified comma-separated\n"
126                "                           cgroup hierarchies\n"
127                "     --uuid=UUID           Set a specific machine UUID for the container\n"
128                "  -M --machine=NAME        Set the machine name for the container\n"
129                "     --private-network     Disable network in container\n"
130                "     --read-only           Mount the root directory read-only\n"
131                "     --capability=CAP      In addition to the default, retain specified\n"
132                "                           capability\n"
133                "     --link-journal=MODE   Link up guest journal, one of no, auto, guest, host\n"
134                "  -j                       Equivalent to --link-journal=host\n"
135                "     --bind=PATH[:PATH]    Bind mount a file or directory from the host into\n"
136                "                           the container\n"
137                "     --bind-ro=PATH[:PATH] Similar, but creates a read-only bind mount\n",
138                program_invocation_short_name);
139
140         return 0;
141 }
142
143 static int parse_argv(int argc, char *argv[]) {
144
145         enum {
146                 ARG_VERSION = 0x100,
147                 ARG_PRIVATE_NETWORK,
148                 ARG_UUID,
149                 ARG_READ_ONLY,
150                 ARG_CAPABILITY,
151                 ARG_LINK_JOURNAL,
152                 ARG_BIND,
153                 ARG_BIND_RO
154         };
155
156         static const struct option options[] = {
157                 { "help",            no_argument,       NULL, 'h'                 },
158                 { "version",         no_argument,       NULL, ARG_VERSION         },
159                 { "directory",       required_argument, NULL, 'D'                 },
160                 { "user",            required_argument, NULL, 'u'                 },
161                 { "controllers",     required_argument, NULL, 'C'                 },
162                 { "private-network", no_argument,       NULL, ARG_PRIVATE_NETWORK },
163                 { "boot",            no_argument,       NULL, 'b'                 },
164                 { "uuid",            required_argument, NULL, ARG_UUID            },
165                 { "read-only",       no_argument,       NULL, ARG_READ_ONLY       },
166                 { "capability",      required_argument, NULL, ARG_CAPABILITY      },
167                 { "link-journal",    required_argument, NULL, ARG_LINK_JOURNAL    },
168                 { "bind",            required_argument, NULL, ARG_BIND            },
169                 { "bind-ro",         required_argument, NULL, ARG_BIND_RO         },
170                 { "machine",         required_argument, NULL, 'M'                 },
171                 { NULL,              0,                 NULL, 0                   }
172         };
173
174         int c;
175
176         assert(argc >= 0);
177         assert(argv);
178
179         while ((c = getopt_long(argc, argv, "+hD:u:C:bM:j", options, NULL)) >= 0) {
180
181                 switch (c) {
182
183                 case 'h':
184                         help();
185                         return 0;
186
187                 case ARG_VERSION:
188                         puts(PACKAGE_STRING);
189                         puts(SYSTEMD_FEATURES);
190                         return 0;
191
192                 case 'D':
193                         free(arg_directory);
194                         arg_directory = canonicalize_file_name(optarg);
195                         if (!arg_directory) {
196                                 log_error("Failed to canonicalize root directory.");
197                                 return -ENOMEM;
198                         }
199
200                         break;
201
202                 case 'u':
203                         free(arg_user);
204                         arg_user = strdup(optarg);
205                         if (!arg_user)
206                                 return log_oom();
207
208                         break;
209
210                 case 'C':
211                         strv_free(arg_controllers);
212                         arg_controllers = strv_split(optarg, ",");
213                         if (!arg_controllers)
214                                 return log_oom();
215
216                         cg_shorten_controllers(arg_controllers);
217                         break;
218
219                 case ARG_PRIVATE_NETWORK:
220                         arg_private_network = true;
221                         break;
222
223                 case 'b':
224                         arg_boot = true;
225                         break;
226
227                 case ARG_UUID:
228                         if (!id128_is_valid(optarg)) {
229                                 log_error("Invalid UUID: %s", optarg);
230                                 return -EINVAL;
231                         }
232
233                         arg_uuid = optarg;
234                         break;
235
236                 case 'M':
237                         if (!hostname_is_valid(optarg)) {
238                                 log_error("Invalid machine name: %s", optarg);
239                                 return -EINVAL;
240                         }
241
242                         free(arg_machine);
243                         arg_machine = strdup(optarg);
244                         if (!arg_machine)
245                                 return log_oom();
246
247                         break;
248
249                 case ARG_READ_ONLY:
250                         arg_read_only = true;
251                         break;
252
253                 case ARG_CAPABILITY: {
254                         char *state, *word;
255                         size_t length;
256
257                         FOREACH_WORD_SEPARATOR(word, length, optarg, ",", state) {
258                                 cap_value_t cap;
259                                 char *t;
260
261                                 t = strndup(word, length);
262                                 if (!t)
263                                         return log_oom();
264
265                                 if (cap_from_name(t, &cap) < 0) {
266                                         log_error("Failed to parse capability %s.", t);
267                                         free(t);
268                                         return -EINVAL;
269                                 }
270
271                                 free(t);
272                                 arg_retain |= 1ULL << (uint64_t) cap;
273                         }
274
275                         break;
276                 }
277
278                 case 'j':
279                         arg_link_journal = LINK_GUEST;
280                         break;
281
282                 case ARG_LINK_JOURNAL:
283                         if (streq(optarg, "auto"))
284                                 arg_link_journal = LINK_AUTO;
285                         else if (streq(optarg, "no"))
286                                 arg_link_journal = LINK_NO;
287                         else if (streq(optarg, "guest"))
288                                 arg_link_journal = LINK_GUEST;
289                         else if (streq(optarg, "host"))
290                                 arg_link_journal = LINK_HOST;
291                         else {
292                                 log_error("Failed to parse link journal mode %s", optarg);
293                                 return -EINVAL;
294                         }
295
296                         break;
297
298                 case ARG_BIND:
299                 case ARG_BIND_RO: {
300                         _cleanup_free_ char *a = NULL, *b = NULL;
301                         char *e;
302                         char ***x;
303                         int r;
304
305                         x = c == ARG_BIND ? &arg_bind : &arg_bind_ro;
306
307                         e = strchr(optarg, ':');
308                         if (e) {
309                                 a = strndup(optarg, e - optarg);
310                                 b = strdup(e + 1);
311                         } else {
312                                 a = strdup(optarg);
313                                 b = strdup(optarg);
314                         }
315
316                         if (!a || !b)
317                                 return log_oom();
318
319                         if (!path_is_absolute(a) || !path_is_absolute(b)) {
320                                 log_error("Invalid bind mount specification: %s", optarg);
321                                 return -EINVAL;
322                         }
323
324                         r = strv_extend(x, a);
325                         if (r < 0)
326                                 return r;
327
328                         r = strv_extend(x, b);
329                         if (r < 0)
330                                 return r;
331
332                         break;
333                 }
334
335                 case '?':
336                         return -EINVAL;
337
338                 default:
339                         log_error("Unknown option code %c", c);
340                         return -EINVAL;
341                 }
342         }
343
344         return 1;
345 }
346
347 static int mount_all(const char *dest) {
348
349         typedef struct MountPoint {
350                 const char *what;
351                 const char *where;
352                 const char *type;
353                 const char *options;
354                 unsigned long flags;
355                 bool fatal;
356         } MountPoint;
357
358         static const MountPoint mount_table[] = {
359                 { "proc",      "/proc",     "proc",  NULL,       MS_NOSUID|MS_NOEXEC|MS_NODEV, true  },
360                 { "/proc/sys", "/proc/sys", NULL,    NULL,       MS_BIND, true                       },   /* Bind mount first */
361                 { NULL,        "/proc/sys", NULL,    NULL,       MS_BIND|MS_RDONLY|MS_REMOUNT, true  },   /* Then, make it r/o */
362                 { "sysfs",     "/sys",      "sysfs", NULL,       MS_RDONLY|MS_NOSUID|MS_NOEXEC|MS_NODEV, true  },
363                 { "tmpfs",     "/dev",      "tmpfs", "mode=755", MS_NOSUID|MS_STRICTATIME,     true  },
364                 { "devpts",    "/dev/pts",  "devpts","newinstance,ptmxmode=0666,mode=620,gid=" STRINGIFY(TTY_GID), MS_NOSUID|MS_NOEXEC, true },
365                 { "tmpfs",     "/dev/shm",  "tmpfs", "mode=1777", MS_NOSUID|MS_NODEV|MS_STRICTATIME, true  },
366                 { "tmpfs",     "/run",      "tmpfs", "mode=755", MS_NOSUID|MS_NODEV|MS_STRICTATIME, true  },
367 #ifdef HAVE_SELINUX
368                 { "/sys/fs/selinux", "/sys/fs/selinux", NULL, NULL, MS_BIND,                      false },  /* Bind mount first */
369                 { NULL,              "/sys/fs/selinux", NULL, NULL, MS_BIND|MS_RDONLY|MS_REMOUNT, false },  /* Then, make it r/o */
370 #endif
371         };
372
373         unsigned k;
374         int r = 0;
375
376         for (k = 0; k < ELEMENTSOF(mount_table); k++) {
377                 _cleanup_free_ char *where = NULL;
378                 int t;
379
380                 where = strjoin(dest, "/", mount_table[k].where, NULL);
381                 if (!where)
382                         return log_oom();
383
384                 t = path_is_mount_point(where, true);
385                 if (t < 0) {
386                         log_error("Failed to detect whether %s is a mount point: %s", where, strerror(-t));
387
388                         if (r == 0)
389                                 r = t;
390
391                         continue;
392                 }
393
394                 /* Skip this entry if it is not a remount. */
395                 if (mount_table[k].what && t > 0)
396                         continue;
397
398                 mkdir_p(where, 0755);
399
400                 if (mount(mount_table[k].what,
401                           where,
402                           mount_table[k].type,
403                           mount_table[k].flags,
404                           mount_table[k].options) < 0 &&
405                     mount_table[k].fatal) {
406
407                         log_error("mount(%s) failed: %m", where);
408
409                         if (r == 0)
410                                 r = -errno;
411                 }
412         }
413
414         return r;
415 }
416
417 static int mount_binds(const char *dest, char **l, unsigned long flags) {
418         char **x, **y;
419
420         STRV_FOREACH_PAIR(x, y, l) {
421                 _cleanup_free_ char *where = NULL;
422
423                 where = strjoin(dest, "/", *y, NULL);
424                 if (!where)
425                         return log_oom();
426
427                 mkdir_p_label(where, 0755);
428
429                 if (mount(*x, where, "bind", MS_BIND, NULL) < 0) {
430                         log_error("mount(%s) failed: %m", where);
431                         return -errno;
432                 }
433
434                 if (flags && mount(NULL, where, NULL, MS_REMOUNT|MS_BIND|flags, NULL) < 0) {
435                         log_error("mount(%s) failed: %m", where);
436                         return -errno;
437                 }
438         }
439
440         return 0;
441 }
442
443 static int setup_timezone(const char *dest) {
444         _cleanup_free_ char *where = NULL, *p = NULL, *q = NULL, *check = NULL, *what = NULL;
445         char *z, *y;
446         int r;
447
448         assert(dest);
449
450         /* Fix the timezone, if possible */
451         r = readlink_malloc("/etc/localtime", &p);
452         if (r < 0) {
453                 log_warning("/etc/localtime is not a symlink, not updating container timezone.");
454                 return 0;
455         }
456
457         z = path_startswith(p, "../usr/share/zoneinfo/");
458         if (!z)
459                 z = path_startswith(p, "/usr/share/zoneinfo/");
460         if (!z) {
461                 log_warning("/etc/localtime does not point into /usr/share/zoneinfo/, not updating container timezone.");
462                 return 0;
463         }
464
465         where = strappend(dest, "/etc/localtime");
466         if (!where)
467                 return log_oom();
468
469         r = readlink_malloc(where, &q);
470         if (r >= 0) {
471                 y = path_startswith(q, "../usr/share/zoneinfo/");
472                 if (!y)
473                         y = path_startswith(q, "/usr/share/zoneinfo/");
474
475
476                 /* Already pointing to the right place? Then do nothing .. */
477                 if (y && streq(y, z))
478                         return 0;
479         }
480
481         check = strjoin(dest, "/usr/share/zoneinfo/", z, NULL);
482         if (!check)
483                 return log_oom();
484
485         if (access(check, F_OK) < 0) {
486                 log_warning("Timezone %s does not exist in container, not updating container timezone.", z);
487                 return 0;
488         }
489
490         what = strappend("../usr/share/zoneinfo/", z);
491         if (!what)
492                 return log_oom();
493
494         unlink(where);
495         if (symlink(what, where) < 0) {
496                 log_error("Failed to correct timezone of container: %m");
497                 return 0;
498         }
499
500         return 0;
501 }
502
503 static int setup_resolv_conf(const char *dest) {
504         char _cleanup_free_ *where = NULL;
505         _cleanup_close_ int fd = -1;
506
507         assert(dest);
508
509         if (arg_private_network)
510                 return 0;
511
512         /* Fix resolv.conf, if possible */
513         where = strappend(dest, "/etc/resolv.conf");
514         if (!where)
515                 return log_oom();
516
517         fd = open(where, O_WRONLY|O_CREAT|O_EXCL|O_CLOEXEC|O_NOCTTY|O_NOFOLLOW, 0644);
518
519         /* We don't really care for the results of this really. If it
520          * fails, it fails, but meh... */
521         if (mount("/etc/resolv.conf", where, "bind", MS_BIND, NULL) < 0)
522                 log_warning("Failed to bind mount /etc/resolv.conf: %m");
523         else
524                 if (mount("/etc/resolv.conf", where, "bind",
525                           MS_BIND|MS_REMOUNT|MS_RDONLY, NULL) < 0) {
526                         log_error("Failed to remount /etc/resolv.conf readonly: %m");
527                         return -errno;
528                 }
529
530         return 0;
531 }
532
533 static int setup_boot_id(const char *dest) {
534         _cleanup_free_ char *from = NULL, *to = NULL;
535         sd_id128_t rnd;
536         char as_uuid[37];
537         int r;
538
539         assert(dest);
540
541         /* Generate a new randomized boot ID, so that each boot-up of
542          * the container gets a new one */
543
544         from = strappend(dest, "/dev/proc-sys-kernel-random-boot-id");
545         to = strappend(dest, "/proc/sys/kernel/random/boot_id");
546         if (!from || !to)
547                 return log_oom();
548
549         r = sd_id128_randomize(&rnd);
550         if (r < 0) {
551                 log_error("Failed to generate random boot id: %s", strerror(-r));
552                 return r;
553         }
554
555         snprintf(as_uuid, sizeof(as_uuid),
556                  "%02x%02x%02x%02x-%02x%02x-%02x%02x-%02x%02x-%02x%02x%02x%02x%02x%02x",
557                  SD_ID128_FORMAT_VAL(rnd));
558         char_array_0(as_uuid);
559
560         r = write_string_file(from, as_uuid);
561         if (r < 0) {
562                 log_error("Failed to write boot id: %s", strerror(-r));
563                 return r;
564         }
565
566         if (mount(from, to, "bind", MS_BIND, NULL) < 0) {
567                 log_error("Failed to bind mount boot id: %m");
568                 r = -errno;
569         } else if (mount(from, to, "bind", MS_BIND|MS_REMOUNT|MS_RDONLY, NULL))
570                 log_warning("Failed to make boot id read-only: %m");
571
572         unlink(from);
573         return r;
574 }
575
576 static int copy_devnodes(const char *dest) {
577
578         static const char devnodes[] =
579                 "null\0"
580                 "zero\0"
581                 "full\0"
582                 "random\0"
583                 "urandom\0"
584                 "tty\0";
585
586         const char *d;
587         int r = 0;
588         _cleanup_umask_ mode_t u;
589
590         assert(dest);
591
592         u = umask(0000);
593
594         NULSTR_FOREACH(d, devnodes) {
595                 struct stat st;
596                 _cleanup_free_ char *from = NULL, *to = NULL;
597
598                 asprintf(&from, "/dev/%s", d);
599                 asprintf(&to, "%s/dev/%s", dest, d);
600
601                 if (!from || !to) {
602                         log_oom();
603
604                         if (r == 0)
605                                 r = -ENOMEM;
606
607                         break;
608                 }
609
610                 if (stat(from, &st) < 0) {
611
612                         if (errno != ENOENT) {
613                                 log_error("Failed to stat %s: %m", from);
614                                 if (r == 0)
615                                         r = -errno;
616                         }
617
618                 } else if (!S_ISCHR(st.st_mode) && !S_ISBLK(st.st_mode)) {
619
620                         log_error("%s is not a char or block device, cannot copy", from);
621                         if (r == 0)
622                                 r = -EIO;
623
624                 } else if (mknod(to, st.st_mode, st.st_rdev) < 0) {
625
626                         log_error("mknod(%s) failed: %m", dest);
627                         if (r == 0)
628                                 r = -errno;
629                 }
630         }
631
632         return r;
633 }
634
635 static int setup_ptmx(const char *dest) {
636         _cleanup_free_ char *p = NULL;
637
638         p = strappend(dest, "/dev/ptmx");
639         if (!p)
640                 return log_oom();
641
642         if (symlink("pts/ptmx", p) < 0) {
643                 log_error("Failed to create /dev/ptmx symlink: %m");
644                 return -errno;
645         }
646
647         return 0;
648 }
649
650 static int setup_dev_console(const char *dest, const char *console) {
651         struct stat st;
652         _cleanup_free_ char *to = NULL;
653         int r;
654         _cleanup_umask_ mode_t u;
655
656         assert(dest);
657         assert(console);
658
659         u = umask(0000);
660
661         if (stat(console, &st) < 0) {
662                 log_error("Failed to stat %s: %m", console);
663                 return -errno;
664
665         } else if (!S_ISCHR(st.st_mode)) {
666                 log_error("/dev/console is not a char device");
667                 return -EIO;
668         }
669
670         r = chmod_and_chown(console, 0600, 0, 0);
671         if (r < 0) {
672                 log_error("Failed to correct access mode for TTY: %s", strerror(-r));
673                 return r;
674         }
675
676         if (asprintf(&to, "%s/dev/console", dest) < 0)
677                 return log_oom();
678
679         /* We need to bind mount the right tty to /dev/console since
680          * ptys can only exist on pts file systems. To have something
681          * to bind mount things on we create a device node first, that
682          * has the right major/minor (note that the major minor
683          * doesn't actually matter here, since we mount it over
684          * anyway). */
685
686         if (mknod(to, (st.st_mode & ~07777) | 0600, st.st_rdev) < 0) {
687                 log_error("mknod() for /dev/console failed: %m");
688                 return -errno;
689         }
690
691         if (mount(console, to, "bind", MS_BIND, NULL) < 0) {
692                 log_error("Bind mount for /dev/console failed: %m");
693                 return -errno;
694         }
695
696         return 0;
697 }
698
699 static int setup_kmsg(const char *dest, int kmsg_socket) {
700         _cleanup_free_ char *from = NULL, *to = NULL;
701         int r, fd, k;
702         _cleanup_umask_ mode_t u;
703         union {
704                 struct cmsghdr cmsghdr;
705                 uint8_t buf[CMSG_SPACE(sizeof(int))];
706         } control = {};
707         struct msghdr mh = {
708                 .msg_control = &control,
709                 .msg_controllen = sizeof(control),
710         };
711         struct cmsghdr *cmsg;
712
713         assert(dest);
714         assert(kmsg_socket >= 0);
715
716         u = umask(0000);
717
718         /* We create the kmsg FIFO as /dev/kmsg, but immediately
719          * delete it after bind mounting it to /proc/kmsg. While FIFOs
720          * on the reading side behave very similar to /proc/kmsg,
721          * their writing side behaves differently from /dev/kmsg in
722          * that writing blocks when nothing is reading. In order to
723          * avoid any problems with containers deadlocking due to this
724          * we simply make /dev/kmsg unavailable to the container. */
725         if (asprintf(&from, "%s/dev/kmsg", dest) < 0 ||
726             asprintf(&to, "%s/proc/kmsg", dest) < 0)
727                 return log_oom();
728
729         if (mkfifo(from, 0600) < 0) {
730                 log_error("mkfifo() for /dev/kmsg failed: %m");
731                 return -errno;
732         }
733
734         r = chmod_and_chown(from, 0600, 0, 0);
735         if (r < 0) {
736                 log_error("Failed to correct access mode for /dev/kmsg: %s", strerror(-r));
737                 return r;
738         }
739
740         if (mount(from, to, "bind", MS_BIND, NULL) < 0) {
741                 log_error("Bind mount for /proc/kmsg failed: %m");
742                 return -errno;
743         }
744
745         fd = open(from, O_RDWR|O_NDELAY|O_CLOEXEC);
746         if (fd < 0) {
747                 log_error("Failed to open fifo: %m");
748                 return -errno;
749         }
750
751         cmsg = CMSG_FIRSTHDR(&mh);
752         cmsg->cmsg_level = SOL_SOCKET;
753         cmsg->cmsg_type = SCM_RIGHTS;
754         cmsg->cmsg_len = CMSG_LEN(sizeof(int));
755         memcpy(CMSG_DATA(cmsg), &fd, sizeof(int));
756
757         mh.msg_controllen = cmsg->cmsg_len;
758
759         /* Store away the fd in the socket, so that it stays open as
760          * long as we run the child */
761         k = sendmsg(kmsg_socket, &mh, MSG_DONTWAIT|MSG_NOSIGNAL);
762         close_nointr_nofail(fd);
763
764         if (k < 0) {
765                 log_error("Failed to send FIFO fd: %m");
766                 return -errno;
767         }
768
769         /* And now make the FIFO unavailable as /dev/kmsg... */
770         unlink(from);
771         return 0;
772 }
773
774 static int setup_hostname(void) {
775
776         if (sethostname(arg_machine, strlen(arg_machine)) < 0)
777                 return -errno;
778
779         return 0;
780 }
781
782 static int setup_journal(const char *directory) {
783         sd_id128_t machine_id;
784         _cleanup_free_ char *p = NULL, *b = NULL, *q = NULL, *d = NULL;
785         char *id;
786         int r;
787
788         if (arg_link_journal == LINK_NO)
789                 return 0;
790
791         p = strappend(directory, "/etc/machine-id");
792         if (!p)
793                 return log_oom();
794
795         r = read_one_line_file(p, &b);
796         if (r == -ENOENT && arg_link_journal == LINK_AUTO)
797                 return 0;
798         else if (r < 0) {
799                 log_error("Failed to read machine ID from %s: %s", p, strerror(-r));
800                 return r;
801         }
802
803         id = strstrip(b);
804         if (isempty(id) && arg_link_journal == LINK_AUTO)
805                 return 0;
806
807         /* Verify validity */
808         r = sd_id128_from_string(id, &machine_id);
809         if (r < 0) {
810                 log_error("Failed to parse machine ID from %s: %s", p, strerror(-r));
811                 return r;
812         }
813
814         free(p);
815         p = strappend("/var/log/journal/", id);
816         q = strjoin(directory, "/var/log/journal/", id, NULL);
817         if (!p || !q)
818                 return log_oom();
819
820         if (path_is_mount_point(p, false) > 0) {
821                 if (arg_link_journal != LINK_AUTO) {
822                         log_error("%s: already a mount point, refusing to use for journal", p);
823                         return -EEXIST;
824                 }
825
826                 return 0;
827         }
828
829         if (path_is_mount_point(q, false) > 0) {
830                 if (arg_link_journal != LINK_AUTO) {
831                         log_error("%s: already a mount point, refusing to use for journal", q);
832                         return -EEXIST;
833                 }
834
835                 return 0;
836         }
837
838         r = readlink_and_make_absolute(p, &d);
839         if (r >= 0) {
840                 if ((arg_link_journal == LINK_GUEST ||
841                      arg_link_journal == LINK_AUTO) &&
842                     path_equal(d, q)) {
843
844                         r = mkdir_p(q, 0755);
845                         if (r < 0)
846                                 log_warning("failed to create directory %s: %m", q);
847                         return 0;
848                 }
849
850                 if (unlink(p) < 0) {
851                         log_error("Failed to remove symlink %s: %m", p);
852                         return -errno;
853                 }
854         } else if (r == -EINVAL) {
855
856                 if (arg_link_journal == LINK_GUEST &&
857                     rmdir(p) < 0) {
858
859                         if (errno == ENOTDIR) {
860                                 log_error("%s already exists and is neither a symlink nor a directory", p);
861                                 return r;
862                         } else {
863                                 log_error("Failed to remove %s: %m", p);
864                                 return -errno;
865                         }
866                 }
867         } else if (r != -ENOENT) {
868                 log_error("readlink(%s) failed: %m", p);
869                 return r;
870         }
871
872         if (arg_link_journal == LINK_GUEST) {
873
874                 if (symlink(q, p) < 0) {
875                         log_error("Failed to symlink %s to %s: %m", q, p);
876                         return -errno;
877                 }
878
879                 r = mkdir_p(q, 0755);
880                 if (r < 0)
881                         log_warning("failed to create directory %s: %m", q);
882                 return 0;
883         }
884
885         if (arg_link_journal == LINK_HOST) {
886                 r = mkdir_p(p, 0755);
887                 if (r < 0) {
888                         log_error("Failed to create %s: %m", p);
889                         return r;
890                 }
891
892         } else if (access(p, F_OK) < 0)
893                 return 0;
894
895         if (dir_is_empty(q) == 0) {
896                 log_error("%s not empty.", q);
897                 return -ENOTEMPTY;
898         }
899
900         r = mkdir_p(q, 0755);
901         if (r < 0) {
902                 log_error("Failed to create %s: %m", q);
903                 return r;
904         }
905
906         if (mount(p, q, "bind", MS_BIND, NULL) < 0) {
907                 log_error("Failed to bind mount journal from host into guest: %m");
908                 return -errno;
909         }
910
911         return 0;
912 }
913
914 static int setup_cgroup(const char *path) {
915         char **c;
916         int r;
917
918         r = cg_create_and_attach(SYSTEMD_CGROUP_CONTROLLER, path, 1);
919         if (r < 0) {
920                 log_error("Failed to create cgroup: %s", strerror(-r));
921                 return r;
922         }
923
924         STRV_FOREACH(c, arg_controllers) {
925                 r = cg_create_and_attach(*c, path, 1);
926                 if (r < 0)
927                         log_warning("Failed to create cgroup in controller %s: %s", *c, strerror(-r));
928         }
929
930         return 0;
931 }
932
/* Stores container metadata as extended attributes on the cgroup
 * directory that cg_get_path() resolves for 'cgroup' in the systemd
 * hierarchy:
 *
 *   trusted.init_pid        decimal string of 'pid'
 *   trusted.machine_id      'uuid' (skipped when uuid is NULL)
 *   trusted.root_directory  'directory'
 *
 * All attributes are written with XATTR_CREATE. A failure to set one
 * attribute is logged as a warning and does not prevent the remaining
 * ones from being attempted.
 *
 * Returns 0 on success, the negative error from cg_get_path() if the
 * path could not be determined, or the negative errno of the first
 * setxattr() call that failed. When built without HAVE_XATTR this is
 * a no-op that returns 0. */
static int save_attributes(const char *cgroup, pid_t pid, const char *uuid, const char *directory) {
#ifdef HAVE_XATTR
        _cleanup_free_ char *path = NULL;
        char buf[DECIMAL_STR_MAX(pid_t)];
        int r, k;

        assert(cgroup);
        assert(pid >= 0);
        /* Check the parameter we actually dereference below, not the global */
        assert(directory);

        assert_se(snprintf(buf, sizeof(buf), "%lu", (unsigned long) pid) < (int) sizeof(buf));

        r = cg_get_path(SYSTEMD_CGROUP_CONTROLLER, cgroup, NULL, &path);
        if (r < 0) {
                log_error("Failed to get path: %s", strerror(-r));
                return r;
        }

        /* From here on r remembers the first setxattr() failure.
         * setxattr() reports errors as -1 plus errno, hence convert
         * to a negative errno before logging can touch errno. */
        r = 0;

        if (setxattr(path, "trusted.init_pid", buf, strlen(buf), XATTR_CREATE) < 0) {
                r = -errno;
                log_warning("Failed to set %s attribute on %s: %m", "trusted.init_pid", path);
        }

        if (uuid) {
                if (setxattr(path, "trusted.machine_id", uuid, strlen(uuid), XATTR_CREATE) < 0) {
                        k = -errno;
                        log_warning("Failed to set %s attribute on %s: %m", "trusted.machine_id", path);
                        if (r == 0)
                                r = k;
                }
        }

        if (setxattr(path, "trusted.root_directory", directory, strlen(directory), XATTR_CREATE) < 0) {
                k = -errno;
                log_warning("Failed to set %s attribute on %s: %m", "trusted.root_directory", path);
                if (r == 0)
                        r = k;
        }

        return r;
#else
        return 0;
#endif
}
975
976 static int drop_capabilities(void) {
977         return capability_bounding_set_drop(~arg_retain, false);
978 }
979
980 static int process_pty(int master, pid_t pid, sigset_t *mask) {
981
982         char in_buffer[LINE_MAX], out_buffer[LINE_MAX];
983         size_t in_buffer_full = 0, out_buffer_full = 0;
984         struct epoll_event stdin_ev, stdout_ev, master_ev, signal_ev;
985         bool stdin_readable = false, stdout_writable = false, master_readable = false, master_writable = false;
986         int ep = -1, signal_fd = -1, r;
987         bool tried_orderly_shutdown = false;
988
989         assert(master >= 0);
990         assert(pid > 0);
991         assert(mask);
992
993         fd_nonblock(STDIN_FILENO, 1);
994         fd_nonblock(STDOUT_FILENO, 1);
995         fd_nonblock(master, 1);
996
997         signal_fd = signalfd(-1, mask, SFD_NONBLOCK|SFD_CLOEXEC);
998         if (signal_fd < 0) {
999                 log_error("signalfd(): %m");
1000                 r = -errno;
1001                 goto finish;
1002         }
1003
1004         ep = epoll_create1(EPOLL_CLOEXEC);
1005         if (ep < 0) {
1006                 log_error("Failed to create epoll: %m");
1007                 r = -errno;
1008                 goto finish;
1009         }
1010
1011         /* We read from STDIN only if this is actually a TTY,
1012          * otherwise we assume non-interactivity. */
1013         if (isatty(STDIN_FILENO)) {
1014                 zero(stdin_ev);
1015                 stdin_ev.events = EPOLLIN|EPOLLET;
1016                 stdin_ev.data.fd = STDIN_FILENO;
1017
1018                 if (epoll_ctl(ep, EPOLL_CTL_ADD, STDIN_FILENO, &stdin_ev) < 0) {
1019                         log_error("Failed to register STDIN in epoll: %m");
1020                         r = -errno;
1021                         goto finish;
1022                 }
1023         }
1024
1025         zero(stdout_ev);
1026         stdout_ev.events = EPOLLOUT|EPOLLET;
1027         stdout_ev.data.fd = STDOUT_FILENO;
1028
1029         zero(master_ev);
1030         master_ev.events = EPOLLIN|EPOLLOUT|EPOLLET;
1031         master_ev.data.fd = master;
1032
1033         zero(signal_ev);
1034         signal_ev.events = EPOLLIN;
1035         signal_ev.data.fd = signal_fd;
1036
1037         if (epoll_ctl(ep, EPOLL_CTL_ADD, STDOUT_FILENO, &stdout_ev) < 0) {
1038                 if (errno != EPERM) {
1039                         log_error("Failed to register stdout in epoll: %m");
1040                         r = -errno;
1041                         goto finish;
1042                 }
1043                 /* stdout without epoll support. Likely redirected to regular file. */
1044                 stdout_writable = true;
1045         }
1046
1047         if (epoll_ctl(ep, EPOLL_CTL_ADD, master, &master_ev) < 0 ||
1048             epoll_ctl(ep, EPOLL_CTL_ADD, signal_fd, &signal_ev) < 0) {
1049                 log_error("Failed to register fds in epoll: %m");
1050                 r = -errno;
1051                 goto finish;
1052         }
1053
1054         for (;;) {
1055                 struct epoll_event ev[16];
1056                 ssize_t k;
1057                 int i, nfds;
1058
1059                 nfds = epoll_wait(ep, ev, ELEMENTSOF(ev), -1);
1060                 if (nfds < 0) {
1061
1062                         if (errno == EINTR || errno == EAGAIN)
1063                                 continue;
1064
1065                         log_error("epoll_wait(): %m");
1066                         r = -errno;
1067                         goto finish;
1068                 }
1069
1070                 assert(nfds >= 1);
1071
1072                 for (i = 0; i < nfds; i++) {
1073                         if (ev[i].data.fd == STDIN_FILENO) {
1074
1075                                 if (ev[i].events & (EPOLLIN|EPOLLHUP))
1076                                         stdin_readable = true;
1077
1078                         } else if (ev[i].data.fd == STDOUT_FILENO) {
1079
1080                                 if (ev[i].events & (EPOLLOUT|EPOLLHUP))
1081                                         stdout_writable = true;
1082
1083                         } else if (ev[i].data.fd == master) {
1084
1085                                 if (ev[i].events & (EPOLLIN|EPOLLHUP))
1086                                         master_readable = true;
1087
1088                                 if (ev[i].events & (EPOLLOUT|EPOLLHUP))
1089                                         master_writable = true;
1090
1091                         } else if (ev[i].data.fd == signal_fd) {
1092                                 struct signalfd_siginfo sfsi;
1093                                 ssize_t n;
1094
1095                                 n = read(signal_fd, &sfsi, sizeof(sfsi));
1096                                 if (n != sizeof(sfsi)) {
1097
1098                                         if (n >= 0) {
1099                                                 log_error("Failed to read from signalfd: invalid block size");
1100                                                 r = -EIO;
1101                                                 goto finish;
1102                                         }
1103
1104                                         if (errno != EINTR && errno != EAGAIN) {
1105                                                 log_error("Failed to read from signalfd: %m");
1106                                                 r = -errno;
1107                                                 goto finish;
1108                                         }
1109                                 } else {
1110
1111                                         if (sfsi.ssi_signo == SIGWINCH) {
1112                                                 struct winsize ws;
1113
1114                                                 /* The window size changed, let's forward that. */
1115                                                 if (ioctl(STDIN_FILENO, TIOCGWINSZ, &ws) >= 0)
1116                                                         ioctl(master, TIOCSWINSZ, &ws);
1117                                         } else if (sfsi.ssi_signo == SIGTERM && arg_boot && !tried_orderly_shutdown) {
1118
1119                                                 log_info("Trying to halt container. Send SIGTERM again to trigger immediate termination.");
1120
1121                                                 /* This only works for systemd... */
1122                                                 tried_orderly_shutdown = true;
1123                                                 kill(pid, SIGRTMIN+3);
1124
1125                                         } else {
1126                                                 r = 0;
1127                                                 goto finish;
1128                                         }
1129                                 }
1130                         }
1131                 }
1132
1133                 while ((stdin_readable && in_buffer_full <= 0) ||
1134                        (master_writable && in_buffer_full > 0) ||
1135                        (master_readable && out_buffer_full <= 0) ||
1136                        (stdout_writable && out_buffer_full > 0)) {
1137
1138                         if (stdin_readable && in_buffer_full < LINE_MAX) {
1139
1140                                 k = read(STDIN_FILENO, in_buffer + in_buffer_full, LINE_MAX - in_buffer_full);
1141                                 if (k < 0) {
1142
1143                                         if (errno == EAGAIN || errno == EPIPE || errno == ECONNRESET || errno == EIO)
1144                                                 stdin_readable = false;
1145                                         else {
1146                                                 log_error("read(): %m");
1147                                                 r = -errno;
1148                                                 goto finish;
1149                                         }
1150                                 } else
1151                                         in_buffer_full += (size_t) k;
1152                         }
1153
1154                         if (master_writable && in_buffer_full > 0) {
1155
1156                                 k = write(master, in_buffer, in_buffer_full);
1157                                 if (k < 0) {
1158
1159                                         if (errno == EAGAIN || errno == EPIPE || errno == ECONNRESET || errno == EIO)
1160                                                 master_writable = false;
1161                                         else {
1162                                                 log_error("write(): %m");
1163                                                 r = -errno;
1164                                                 goto finish;
1165                                         }
1166
1167                                 } else {
1168                                         assert(in_buffer_full >= (size_t) k);
1169                                         memmove(in_buffer, in_buffer + k, in_buffer_full - k);
1170                                         in_buffer_full -= k;
1171                                 }
1172                         }
1173
1174                         if (master_readable && out_buffer_full < LINE_MAX) {
1175
1176                                 k = read(master, out_buffer + out_buffer_full, LINE_MAX - out_buffer_full);
1177                                 if (k < 0) {
1178
1179                                         if (errno == EAGAIN || errno == EPIPE || errno == ECONNRESET || errno == EIO)
1180                                                 master_readable = false;
1181                                         else {
1182                                                 log_error("read(): %m");
1183                                                 r = -errno;
1184                                                 goto finish;
1185                                         }
1186                                 }  else
1187                                         out_buffer_full += (size_t) k;
1188                         }
1189
1190                         if (stdout_writable && out_buffer_full > 0) {
1191
1192                                 k = write(STDOUT_FILENO, out_buffer, out_buffer_full);
1193                                 if (k < 0) {
1194
1195                                         if (errno == EAGAIN || errno == EPIPE || errno == ECONNRESET || errno == EIO)
1196                                                 stdout_writable = false;
1197                                         else {
1198                                                 log_error("write(): %m");
1199                                                 r = -errno;
1200                                                 goto finish;
1201                                         }
1202
1203                                 } else {
1204                                         assert(out_buffer_full >= (size_t) k);
1205                                         memmove(out_buffer, out_buffer + k, out_buffer_full - k);
1206                                         out_buffer_full -= k;
1207                                 }
1208                         }
1209                 }
1210         }
1211
1212 finish:
1213         if (ep >= 0)
1214                 close_nointr_nofail(ep);
1215
1216         if (signal_fd >= 0)
1217                 close_nointr_nofail(signal_fd);
1218
1219         return r;
1220 }
1221
/* Probes for the kernel audit subsystem by trying to open a
 * NETLINK_AUDIT netlink socket. Returns true if the socket could be
 * created (it is closed again right away), false otherwise. Always
 * returns false when built without HAVE_AUDIT. */
static bool audit_enabled(void) {
#ifdef HAVE_AUDIT
        int probe;

        probe = socket(AF_NETLINK, SOCK_RAW, NETLINK_AUDIT);
        if (probe < 0)
                return false;

        close_nointr_nofail(probe);
        return true;
#else
        return false;
#endif
}
1234
1235 int main(int argc, char *argv[]) {
1236         pid_t pid = 0;
1237         int r = EXIT_FAILURE, k;
1238         _cleanup_free_ char *newcg = NULL;
1239         _cleanup_close_ int master = -1;
1240         int n_fd_passed;
1241         const char *console = NULL;
1242         struct termios saved_attr, raw_attr;
1243         sigset_t mask;
1244         bool saved_attr_valid = false;
1245         struct winsize ws;
1246         int kmsg_socket_pair[2] = { -1, -1 };
1247         FDSet *fds = NULL;
1248
1249         log_parse_environment();
1250         log_open();
1251
1252         k = parse_argv(argc, argv);
1253         if (k < 0)
1254                 goto finish;
1255         else if (k == 0) {
1256                 r = EXIT_SUCCESS;
1257                 goto finish;
1258         }
1259
1260         if (arg_directory) {
1261                 char *p;
1262
1263                 p = path_make_absolute_cwd(arg_directory);
1264                 free(arg_directory);
1265                 arg_directory = p;
1266         } else
1267                 arg_directory = get_current_dir_name();
1268
1269         if (!arg_directory) {
1270                 log_error("Failed to determine path, please use -D.");
1271                 goto finish;
1272         }
1273
1274         path_kill_slashes(arg_directory);
1275
1276         if (!arg_machine) {
1277                 arg_machine = strdup(path_get_file_name(arg_directory));
1278                 if (!arg_machine) {
1279                         log_oom();
1280                         goto finish;
1281                 }
1282
1283                 hostname_cleanup(arg_machine, false);
1284                 if (isempty(arg_machine)) {
1285                         log_error("Failed to determine machine name automatically, please use -M.");
1286                         goto finish;
1287                 }
1288         }
1289
1290         if (geteuid() != 0) {
1291                 log_error("Need to be root.");
1292                 goto finish;
1293         }
1294
1295         if (sd_booted() <= 0) {
1296                 log_error("Not running on a systemd system.");
1297                 goto finish;
1298         }
1299
1300         if (audit_enabled()) {
1301                 log_warning("The kernel auditing subsystem is known to be incompatible with containers.\n"
1302                             "Please make sure to turn off auditing with 'audit=0' on the kernel command\n"
1303                             "line before using systemd-nspawn. Sleeping for 5s...\n");
1304                 sleep(5);
1305         }
1306
1307         if (path_equal(arg_directory, "/")) {
1308                 log_error("Spawning container on root directory not supported.");
1309                 goto finish;
1310         }
1311
1312         if (path_is_os_tree(arg_directory) <= 0) {
1313                 log_error("Directory %s doesn't look like an OS root directory (/etc/os-release is missing). Refusing.", arg_directory);
1314                 goto finish;
1315         }
1316
1317         log_close();
1318         n_fd_passed = sd_listen_fds(false);
1319         if (n_fd_passed > 0) {
1320                 k = fdset_new_listen_fds(&fds, false);
1321                 if (k < 0) {
1322                         log_error("Failed to collect file descriptors: %s", strerror(-k));
1323                         goto finish;
1324                 }
1325         }
1326         fdset_close_others(fds);
1327         log_open();
1328
1329         k = cg_get_machine_path(arg_machine, &newcg);
1330         if (k < 0) {
1331                 log_error("Failed to determine machine cgroup path: %s", strerror(-k));
1332                 goto finish;
1333         }
1334
1335         k = cg_is_empty_recursive(SYSTEMD_CGROUP_CONTROLLER, newcg, true);
1336         if (k <= 0 && k != -ENOENT) {
1337                 log_error("Container already running.");
1338
1339                 free(newcg);
1340                 newcg = NULL;
1341
1342                 goto finish;
1343         }
1344
1345         master = posix_openpt(O_RDWR|O_NOCTTY|O_CLOEXEC|O_NDELAY);
1346         if (master < 0) {
1347                 log_error("Failed to acquire pseudo tty: %m");
1348                 goto finish;
1349         }
1350
1351         console = ptsname(master);
1352         if (!console) {
1353                 log_error("Failed to determine tty name: %m");
1354                 goto finish;
1355         }
1356
1357         log_info("Spawning namespace container on %s (console is %s).", arg_directory, console);
1358
1359         if (ioctl(STDIN_FILENO, TIOCGWINSZ, &ws) >= 0)
1360                 ioctl(master, TIOCSWINSZ, &ws);
1361
1362         if (unlockpt(master) < 0) {
1363                 log_error("Failed to unlock tty: %m");
1364                 goto finish;
1365         }
1366
1367         if (tcgetattr(STDIN_FILENO, &saved_attr) >= 0) {
1368                 saved_attr_valid = true;
1369
1370                 raw_attr = saved_attr;
1371                 cfmakeraw(&raw_attr);
1372                 raw_attr.c_lflag &= ~ECHO;
1373         }
1374
1375         if (socketpair(AF_UNIX, SOCK_DGRAM|SOCK_NONBLOCK|SOCK_CLOEXEC, 0, kmsg_socket_pair) < 0) {
1376                 log_error("Failed to create kmsg socket pair.");
1377                 goto finish;
1378         }
1379
1380         sd_notify(0, "READY=1");
1381
1382         assert_se(sigemptyset(&mask) == 0);
1383         sigset_add_many(&mask, SIGCHLD, SIGWINCH, SIGTERM, SIGINT, -1);
1384         assert_se(sigprocmask(SIG_BLOCK, &mask, NULL) == 0);
1385
1386         for (;;) {
1387                 siginfo_t status;
1388                 int pipefd[2], pipefd2[2];
1389
1390                 if (pipe2(pipefd, O_NONBLOCK|O_CLOEXEC) < 0) {
1391                         log_error("pipe2(): %m");
1392                         goto finish;
1393                 }
1394
1395                 if (pipe2(pipefd2, O_NONBLOCK|O_CLOEXEC) < 0) {
1396                         log_error("pipe2(): %m");
1397                         close_pipe(pipefd);
1398                         goto finish;
1399                 }
1400
1401                 pid = syscall(__NR_clone, SIGCHLD|CLONE_NEWIPC|CLONE_NEWNS|CLONE_NEWPID|CLONE_NEWUTS|(arg_private_network ? CLONE_NEWNET : 0), NULL);
1402                 if (pid < 0) {
1403                         if (errno == EINVAL)
1404                                 log_error("clone() failed, do you have namespace support enabled in your kernel? (You need UTS, IPC, PID and NET namespacing built in): %m");
1405                         else
1406                                 log_error("clone() failed: %m");
1407
1408                         goto finish;
1409                 }
1410
1411                 if (pid == 0) {
1412                         /* child */
1413                         const char *home = NULL;
1414                         uid_t uid = (uid_t) -1;
1415                         gid_t gid = (gid_t) -1;
1416                         unsigned n_env = 2;
1417                         const char *envp[] = {
1418                                 "PATH=/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin",
1419                                 "container=systemd-nspawn", /* LXC sets container=lxc, so follow the scheme here */
1420                                 NULL, /* TERM */
1421                                 NULL, /* HOME */
1422                                 NULL, /* USER */
1423                                 NULL, /* LOGNAME */
1424                                 NULL, /* container_uuid */
1425                                 NULL, /* LISTEN_FDS */
1426                                 NULL, /* LISTEN_PID */
1427                                 NULL
1428                         };
1429
1430                         envp[n_env] = strv_find_prefix(environ, "TERM=");
1431                         if (envp[n_env])
1432                                 n_env ++;
1433
1434                         /* Wait for the parent process to log our PID */
1435                         close_nointr_nofail(pipefd[1]);
1436                         fd_wait_for_event(pipefd[0], POLLHUP, -1);
1437                         close_nointr_nofail(pipefd[0]);
1438
1439                         close_nointr_nofail(master);
1440                         master = -1;
1441
1442                         if (saved_attr_valid) {
1443                                 if (tcsetattr(STDIN_FILENO, TCSANOW, &raw_attr) < 0) {
1444                                         log_error("Failed to set terminal attributes: %m");
1445                                         goto child_fail;
1446                                 }
1447                         }
1448
1449                         close_nointr(STDIN_FILENO);
1450                         close_nointr(STDOUT_FILENO);
1451                         close_nointr(STDERR_FILENO);
1452
1453                         close_nointr_nofail(kmsg_socket_pair[0]);
1454                         kmsg_socket_pair[0] = -1;
1455
1456                         reset_all_signal_handlers();
1457
1458                         assert_se(sigemptyset(&mask) == 0);
1459                         assert_se(sigprocmask(SIG_SETMASK, &mask, NULL) == 0);
1460
1461                         k = open_terminal(console, O_RDWR);
1462                         if (k != STDIN_FILENO) {
1463                                 if (k >= 0) {
1464                                         close_nointr_nofail(k);
1465                                         k = -EINVAL;
1466                                 }
1467
1468                                 log_error("Failed to open console: %s", strerror(-k));
1469                                 goto child_fail;
1470                         }
1471
1472                         if (dup2(STDIN_FILENO, STDOUT_FILENO) != STDOUT_FILENO ||
1473                             dup2(STDIN_FILENO, STDERR_FILENO) != STDERR_FILENO) {
1474                                 log_error("Failed to duplicate console: %m");
1475                                 goto child_fail;
1476                         }
1477
1478                         if (setsid() < 0) {
1479                                 log_error("setsid() failed: %m");
1480                                 goto child_fail;
1481                         }
1482
1483                         if (prctl(PR_SET_PDEATHSIG, SIGKILL) < 0) {
1484                                 log_error("PR_SET_PDEATHSIG failed: %m");
1485                                 goto child_fail;
1486                         }
1487
1488                         if (setup_cgroup(newcg) < 0)
1489                                 goto child_fail;
1490
1491                         close_pipe(pipefd2);
1492
1493                         /* Mark everything as slave, so that we still
1494                          * receive mounts from the real root, but don't
1495                          * propagate mounts to the real root. */
1496                         if (mount(NULL, "/", NULL, MS_SLAVE|MS_REC, NULL) < 0) {
1497                                 log_error("MS_SLAVE|MS_REC failed: %m");
1498                                 goto child_fail;
1499                         }
1500
1501                         /* Turn directory into bind mount */
1502                         if (mount(arg_directory, arg_directory, "bind", MS_BIND|MS_REC, NULL) < 0) {
1503                                 log_error("Failed to make bind mount.");
1504                                 goto child_fail;
1505                         }
1506
1507                         if (arg_read_only)
1508                                 if (mount(arg_directory, arg_directory, "bind", MS_BIND|MS_REMOUNT|MS_RDONLY|MS_REC, NULL) < 0) {
1509                                         log_error("Failed to make read-only.");
1510                                         goto child_fail;
1511                                 }
1512
1513                         if (mount_all(arg_directory) < 0)
1514                                 goto child_fail;
1515
1516                         if (copy_devnodes(arg_directory) < 0)
1517                                 goto child_fail;
1518
1519                         if (setup_ptmx(arg_directory) < 0)
1520                                 goto child_fail;
1521
1522                         dev_setup(arg_directory);
1523
1524                         if (setup_dev_console(arg_directory, console) < 0)
1525                                 goto child_fail;
1526
1527                         if (setup_kmsg(arg_directory, kmsg_socket_pair[1]) < 0)
1528                                 goto child_fail;
1529
1530                         close_nointr_nofail(kmsg_socket_pair[1]);
1531                         kmsg_socket_pair[1] = -1;
1532
1533                         if (setup_boot_id(arg_directory) < 0)
1534                                 goto child_fail;
1535
1536                         if (setup_timezone(arg_directory) < 0)
1537                                 goto child_fail;
1538
1539                         if (setup_resolv_conf(arg_directory) < 0)
1540                                 goto child_fail;
1541
1542                         if (setup_journal(arg_directory) < 0)
1543                                 goto child_fail;
1544
1545                         if (mount_binds(arg_directory, arg_bind, 0) < 0)
1546                                 goto child_fail;
1547
1548                         if (mount_binds(arg_directory, arg_bind_ro, MS_RDONLY) < 0)
1549                                 goto child_fail;
1550
1551                         if (chdir(arg_directory) < 0) {
1552                                 log_error("chdir(%s) failed: %m", arg_directory);
1553                                 goto child_fail;
1554                         }
1555
1556                         if (mount(arg_directory, "/", NULL, MS_MOVE, NULL) < 0) {
1557                                 log_error("mount(MS_MOVE) failed: %m");
1558                                 goto child_fail;
1559                         }
1560
1561                         if (chroot(".") < 0) {
1562                                 log_error("chroot() failed: %m");
1563                                 goto child_fail;
1564                         }
1565
1566                         if (chdir("/") < 0) {
1567                                 log_error("chdir() failed: %m");
1568                                 goto child_fail;
1569                         }
1570
1571                         umask(0022);
1572
1573                         loopback_setup();
1574
1575                         if (drop_capabilities() < 0) {
1576                                 log_error("drop_capabilities() failed: %m");
1577                                 goto child_fail;
1578                         }
1579
1580                         if (arg_user) {
1581
1582                                 /* Note that this resolves user names
1583                                  * inside the container, and hence
1584                                  * accesses the NSS modules from the
1585                                  * container and not the host. This is
1586                                  * a bit weird... */
1587
1588                                 if (get_user_creds((const char**)&arg_user, &uid, &gid, &home, NULL) < 0) {
1589                                         log_error("get_user_creds() failed: %m");
1590                                         goto child_fail;
1591                                 }
1592
1593                                 if (mkdir_parents_label(home, 0775) < 0) {
1594                                         log_error("mkdir_parents_label() failed: %m");
1595                                         goto child_fail;
1596                                 }
1597
1598                                 if (mkdir_safe_label(home, 0775, uid, gid) < 0) {
1599                                         log_error("mkdir_safe_label() failed: %m");
1600                                         goto child_fail;
1601                                 }
1602
1603                                 if (initgroups((const char*)arg_user, gid) < 0) {
1604                                         log_error("initgroups() failed: %m");
1605                                         goto child_fail;
1606                                 }
1607
1608                                 if (setresgid(gid, gid, gid) < 0) {
1609                                         log_error("setregid() failed: %m");
1610                                         goto child_fail;
1611                                 }
1612
1613                                 if (setresuid(uid, uid, uid) < 0) {
1614                                         log_error("setreuid() failed: %m");
1615                                         goto child_fail;
1616                                 }
1617                         } else {
1618                                 /* Reset everything fully to 0, just in case */
1619
1620                                 if (setgroups(0, NULL) < 0) {
1621                                         log_error("setgroups() failed: %m");
1622                                         goto child_fail;
1623                                 }
1624
1625                                 if (setresgid(0, 0, 0) < 0) {
1626                                         log_error("setregid() failed: %m");
1627                                         goto child_fail;
1628                                 }
1629
1630                                 if (setresuid(0, 0, 0) < 0) {
1631                                         log_error("setreuid() failed: %m");
1632                                         goto child_fail;
1633                                 }
1634                         }
1635
1636                         if ((asprintf((char**)(envp + n_env++), "HOME=%s", home ? home: "/root") < 0) ||
1637                             (asprintf((char**)(envp + n_env++), "USER=%s", arg_user ? arg_user : "root") < 0) ||
1638                             (asprintf((char**)(envp + n_env++), "LOGNAME=%s", arg_user ? arg_user : "root") < 0)) {
1639                                 log_oom();
1640                                 goto child_fail;
1641                         }
1642
1643                         if (arg_uuid) {
1644                                 if (asprintf((char**)(envp + n_env++), "container_uuid=%s", arg_uuid) < 0) {
1645                                         log_oom();
1646                                         goto child_fail;
1647                                 }
1648                         }
1649
1650                         if (fdset_size(fds) > 0) {
1651                                 k = fdset_cloexec(fds, false);
1652                                 if (k < 0) {
1653                                         log_error("Failed to unset O_CLOEXEC for file descriptors.");
1654                                         goto child_fail;
1655                                 }
1656
1657                                 if ((asprintf((char **)(envp + n_env++), "LISTEN_FDS=%u", n_fd_passed) < 0) ||
1658                                     (asprintf((char **)(envp + n_env++), "LISTEN_PID=%lu", (unsigned long) 1) < 0)) {
1659                                         log_oom();
1660                                         goto child_fail;
1661                                 }
1662                         }
1663
1664                         setup_hostname();
1665
1666                         if (arg_boot) {
1667                                 char **a;
1668                                 size_t l;
1669
1670                                 /* Automatically search for the init system */
1671
1672                                 l = 1 + argc - optind;
1673                                 a = newa(char*, l + 1);
1674                                 memcpy(a + 1, argv + optind, l * sizeof(char*));
1675
1676                                 a[0] = (char*) "/usr/lib/systemd/systemd";
1677                                 execve(a[0], a, (char**) envp);
1678
1679                                 a[0] = (char*) "/lib/systemd/systemd";
1680                                 execve(a[0], a, (char**) envp);
1681
1682                                 a[0] = (char*) "/sbin/init";
1683                                 execve(a[0], a, (char**) envp);
1684                         } else if (argc > optind)
1685                                 execvpe(argv[optind], argv + optind, (char**) envp);
1686                         else {
1687                                 chdir(home ? home : "/root");
1688                                 execle("/bin/bash", "-bash", NULL, (char**) envp);
1689                         }
1690
1691                         log_error("execv() failed: %m");
1692
1693                 child_fail:
1694                         _exit(EXIT_FAILURE);
1695                 }
1696
1697                 log_info("Init process in the container running as PID %lu.", (unsigned long) pid);
1698                 close_nointr_nofail(pipefd[0]);
1699                 close_nointr_nofail(pipefd[1]);
1700
1701                 /* Wait for the child process to establish cgroup hierarchy */
1702                 close_nointr_nofail(pipefd2[1]);
1703                 fd_wait_for_event(pipefd2[0], POLLHUP, -1);
1704                 close_nointr_nofail(pipefd2[0]);
1705
1706                 save_attributes(newcg, pid, arg_uuid, arg_directory);
1707
1708                 fdset_free(fds);
1709                 fds = NULL;
1710
1711                 if (process_pty(master, pid, &mask) < 0)
1712                         goto finish;
1713
1714                 if (saved_attr_valid)
1715                         tcsetattr(STDIN_FILENO, TCSANOW, &saved_attr);
1716
1717                 k = wait_for_terminate(pid, &status);
1718                 if (k < 0) {
1719                         r = EXIT_FAILURE;
1720                         break;
1721                 }
1722
1723                 if (status.si_code == CLD_EXITED) {
1724                         r = status.si_status;
1725                         if (status.si_status != 0) {
1726                                 log_error("Container failed with error code %i.", status.si_status);
1727                                 break;
1728                         }
1729
1730                         log_debug("Container exited successfully.");
1731                         break;
1732                 } else if (status.si_code == CLD_KILLED &&
1733                            status.si_status == SIGINT) {
1734                         log_info("Container has been shut down.");
1735                         r = 0;
1736                         break;
1737                 } else if (status.si_code == CLD_KILLED &&
1738                            status.si_status == SIGHUP) {
1739                         log_info("Container is being rebooted.");
1740                         continue;
1741                 } else if (status.si_code == CLD_KILLED ||
1742                            status.si_code == CLD_DUMPED) {
1743
1744                         log_error("Container terminated by signal %s.", signal_to_string(status.si_status));
1745                         r = EXIT_FAILURE;
1746                         break;
1747                 } else {
1748                         log_error("Container failed due to unknown reason.");
1749                         r = EXIT_FAILURE;
1750                         break;
1751                 }
1752         }
1753
1754 finish:
1755         if (saved_attr_valid)
1756                 tcsetattr(STDIN_FILENO, TCSANOW, &saved_attr);
1757
1758         close_pipe(kmsg_socket_pair);
1759
1760         if (newcg)
1761                 cg_kill_recursive_and_wait(SYSTEMD_CGROUP_CONTROLLER, newcg, true);
1762
1763         free(arg_directory);
1764         free(arg_machine);
1765         strv_free(arg_controllers);
1766
1767         fdset_free(fds);
1768
1769         return r;
1770 }