chiark / gitweb /
913e73673a5f5decf7a9cfd4520a5914efbeddbe
[elogind.git] / src / nspawn / nspawn.c
1 /*-*- Mode: C; c-basic-offset: 8; indent-tabs-mode: nil -*-*/
2
3 /***
4   This file is part of systemd.
5
6   Copyright 2010 Lennart Poettering
7
8   systemd is free software; you can redistribute it and/or modify it
9   under the terms of the GNU Lesser General Public License as published by
10   the Free Software Foundation; either version 2.1 of the License, or
11   (at your option) any later version.
12
13   systemd is distributed in the hope that it will be useful, but
14   WITHOUT ANY WARRANTY; without even the implied warranty of
15   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
16   Lesser General Public License for more details.
17
18   You should have received a copy of the GNU Lesser General Public License
19   along with systemd; If not, see <http://www.gnu.org/licenses/>.
20 ***/
21
22 #include <signal.h>
23 #include <sched.h>
24 #include <unistd.h>
25 #include <sys/types.h>
26 #include <sys/syscall.h>
27 #include <sys/mount.h>
28 #include <sys/wait.h>
29 #include <stdlib.h>
30 #include <string.h>
31 #include <stdio.h>
32 #include <errno.h>
33 #include <sys/prctl.h>
34 #include <sys/capability.h>
35 #include <getopt.h>
36 #include <sys/poll.h>
37 #include <sys/epoll.h>
38 #include <termios.h>
39 #include <sys/signalfd.h>
40 #include <grp.h>
41 #include <linux/fs.h>
42 #include <linux/netlink.h>
43 #include <sys/un.h>
44 #include <sys/socket.h>
45
46 #include <systemd/sd-daemon.h>
47 #include <systemd/sd-bus.h>
48
49 #include "log.h"
50 #include "util.h"
51 #include "mkdir.h"
52 #include "macro.h"
53 #include "audit.h"
54 #include "missing.h"
55 #include "cgroup-util.h"
56 #include "strv.h"
57 #include "path-util.h"
58 #include "loopback-setup.h"
59 #include "sd-id128.h"
60 #include "dev-setup.h"
61 #include "fdset.h"
62 #include "build.h"
63 #include "fileio.h"
64 #include "bus-internal.h"
65 #include "bus-message.h"
66
/* Numeric group id handed to the devpts mount as its "gid=" option (see the
 * mount table in mount_all()). May be overridden at build time. */
#if !defined(TTY_GID)
#  define TTY_GID 5
#endif
70
/* Selects how setup_journal() connects the journal directory of the
 * container with the one of the host. */
typedef enum LinkJournal {
        LINK_NO    = 0, /* leave the journal directories alone */
        LINK_AUTO  = 1, /* bind mount host -> guest, but only if the host directory already exists */
        LINK_HOST  = 2, /* create the host directory and bind mount it into the guest */
        LINK_GUEST = 3  /* make the host path a symlink to the guest directory */
} LinkJournal;
77
78 static char *arg_directory = NULL;
79 static char *arg_user = NULL;
80 static sd_id128_t arg_uuid = {};
81 static char *arg_machine = NULL;
82 static const char *arg_slice = NULL;
83 static bool arg_private_network = false;
84 static bool arg_read_only = false;
85 static bool arg_boot = false;
86 static LinkJournal arg_link_journal = LINK_AUTO;
87 static uint64_t arg_retain =
88         (1ULL << CAP_CHOWN) |
89         (1ULL << CAP_DAC_OVERRIDE) |
90         (1ULL << CAP_DAC_READ_SEARCH) |
91         (1ULL << CAP_FOWNER) |
92         (1ULL << CAP_FSETID) |
93         (1ULL << CAP_IPC_OWNER) |
94         (1ULL << CAP_KILL) |
95         (1ULL << CAP_LEASE) |
96         (1ULL << CAP_LINUX_IMMUTABLE) |
97         (1ULL << CAP_NET_BIND_SERVICE) |
98         (1ULL << CAP_NET_BROADCAST) |
99         (1ULL << CAP_NET_RAW) |
100         (1ULL << CAP_SETGID) |
101         (1ULL << CAP_SETFCAP) |
102         (1ULL << CAP_SETPCAP) |
103         (1ULL << CAP_SETUID) |
104         (1ULL << CAP_SYS_ADMIN) |
105         (1ULL << CAP_SYS_CHROOT) |
106         (1ULL << CAP_SYS_NICE) |
107         (1ULL << CAP_SYS_PTRACE) |
108         (1ULL << CAP_SYS_TTY_CONFIG) |
109         (1ULL << CAP_SYS_RESOURCE) |
110         (1ULL << CAP_SYS_BOOT) |
111         (1ULL << CAP_AUDIT_WRITE) |
112         (1ULL << CAP_AUDIT_CONTROL);
113 static char **arg_bind = NULL;
114 static char **arg_bind_ro = NULL;
115
/* Prints the usage summary for the program to stdout.
 *
 * Always returns 0. */
static int help(void) {

        printf("%s [OPTIONS...] [PATH] [ARGUMENTS...]\n\n"
               "Spawn a minimal namespace container for debugging, testing and building.\n\n"
               "  -h --help                Show this help\n"
               "     --version             Print version string\n"
               "  -D --directory=NAME      Root directory for the container\n"
               "  -b --boot                Boot up full system (i.e. invoke init)\n"
               "  -u --user=USER           Run the command under specified user or uid\n"
               "     --uuid=UUID           Set a specific machine UUID for the container\n"
               "  -M --machine=NAME        Set the machine name for the container\n"
               "  -S --slice=SLICE         Place the container in the specified slice\n"
               "     --private-network     Disable network in container\n"
               "     --read-only           Mount the root directory read-only\n"
               "     --capability=CAP      In addition to the default, retain specified\n"
               "                           capability\n"
               "     --link-journal=MODE   Link up guest journal, one of no, auto, guest, host\n"
               /* parse_argv() maps -j to LINK_GUEST, so that is what we advertise */
               "  -j                       Equivalent to --link-journal=guest\n"
               "     --bind=PATH[:PATH]    Bind mount a file or directory from the host into\n"
               "                           the container\n"
               "     --bind-ro=PATH[:PATH] Similar, but creates a read-only bind mount\n",
               program_invocation_short_name);

        return 0;
}
141
142 static int parse_argv(int argc, char *argv[]) {
143
144         enum {
145                 ARG_VERSION = 0x100,
146                 ARG_PRIVATE_NETWORK,
147                 ARG_UUID,
148                 ARG_READ_ONLY,
149                 ARG_CAPABILITY,
150                 ARG_LINK_JOURNAL,
151                 ARG_BIND,
152                 ARG_BIND_RO
153         };
154
155         static const struct option options[] = {
156                 { "help",            no_argument,       NULL, 'h'                 },
157                 { "version",         no_argument,       NULL, ARG_VERSION         },
158                 { "directory",       required_argument, NULL, 'D'                 },
159                 { "user",            required_argument, NULL, 'u'                 },
160                 { "private-network", no_argument,       NULL, ARG_PRIVATE_NETWORK },
161                 { "boot",            no_argument,       NULL, 'b'                 },
162                 { "uuid",            required_argument, NULL, ARG_UUID            },
163                 { "read-only",       no_argument,       NULL, ARG_READ_ONLY       },
164                 { "capability",      required_argument, NULL, ARG_CAPABILITY      },
165                 { "link-journal",    required_argument, NULL, ARG_LINK_JOURNAL    },
166                 { "bind",            required_argument, NULL, ARG_BIND            },
167                 { "bind-ro",         required_argument, NULL, ARG_BIND_RO         },
168                 { "machine",         required_argument, NULL, 'M'                 },
169                 { "slice",           required_argument, NULL, 'S'                 },
170                 { NULL,              0,                 NULL, 0                   }
171         };
172
173         int c, r;
174
175         assert(argc >= 0);
176         assert(argv);
177
178         while ((c = getopt_long(argc, argv, "+hD:u:bM:jS:", options, NULL)) >= 0) {
179
180                 switch (c) {
181
182                 case 'h':
183                         help();
184                         return 0;
185
186                 case ARG_VERSION:
187                         puts(PACKAGE_STRING);
188                         puts(SYSTEMD_FEATURES);
189                         return 0;
190
191                 case 'D':
192                         free(arg_directory);
193                         arg_directory = canonicalize_file_name(optarg);
194                         if (!arg_directory) {
195                                 log_error("Failed to canonicalize root directory.");
196                                 return -ENOMEM;
197                         }
198
199                         break;
200
201                 case 'u':
202                         free(arg_user);
203                         arg_user = strdup(optarg);
204                         if (!arg_user)
205                                 return log_oom();
206
207                         break;
208
209                 case ARG_PRIVATE_NETWORK:
210                         arg_private_network = true;
211                         break;
212
213                 case 'b':
214                         arg_boot = true;
215                         break;
216
217                 case ARG_UUID:
218                         r = sd_id128_from_string(optarg, &arg_uuid);
219                         if (r < 0) {
220                                 log_error("Invalid UUID: %s", optarg);
221                                 return r;
222                         }
223                         break;
224
225                 case 'S':
226                         arg_slice = strdup(optarg);
227                         break;
228
229                 case 'M':
230                         if (!hostname_is_valid(optarg)) {
231                                 log_error("Invalid machine name: %s", optarg);
232                                 return -EINVAL;
233                         }
234
235                         free(arg_machine);
236                         arg_machine = strdup(optarg);
237                         if (!arg_machine)
238                                 return log_oom();
239
240                         break;
241
242                 case ARG_READ_ONLY:
243                         arg_read_only = true;
244                         break;
245
246                 case ARG_CAPABILITY: {
247                         char *state, *word;
248                         size_t length;
249
250                         FOREACH_WORD_SEPARATOR(word, length, optarg, ",", state) {
251                                 cap_value_t cap;
252                                 char *t;
253
254                                 t = strndup(word, length);
255                                 if (!t)
256                                         return log_oom();
257
258                                 if (cap_from_name(t, &cap) < 0) {
259                                         log_error("Failed to parse capability %s.", t);
260                                         free(t);
261                                         return -EINVAL;
262                                 }
263
264                                 free(t);
265                                 arg_retain |= 1ULL << (uint64_t) cap;
266                         }
267
268                         break;
269                 }
270
271                 case 'j':
272                         arg_link_journal = LINK_GUEST;
273                         break;
274
275                 case ARG_LINK_JOURNAL:
276                         if (streq(optarg, "auto"))
277                                 arg_link_journal = LINK_AUTO;
278                         else if (streq(optarg, "no"))
279                                 arg_link_journal = LINK_NO;
280                         else if (streq(optarg, "guest"))
281                                 arg_link_journal = LINK_GUEST;
282                         else if (streq(optarg, "host"))
283                                 arg_link_journal = LINK_HOST;
284                         else {
285                                 log_error("Failed to parse link journal mode %s", optarg);
286                                 return -EINVAL;
287                         }
288
289                         break;
290
291                 case ARG_BIND:
292                 case ARG_BIND_RO: {
293                         _cleanup_free_ char *a = NULL, *b = NULL;
294                         char *e;
295                         char ***x;
296
297                         x = c == ARG_BIND ? &arg_bind : &arg_bind_ro;
298
299                         e = strchr(optarg, ':');
300                         if (e) {
301                                 a = strndup(optarg, e - optarg);
302                                 b = strdup(e + 1);
303                         } else {
304                                 a = strdup(optarg);
305                                 b = strdup(optarg);
306                         }
307
308                         if (!a || !b)
309                                 return log_oom();
310
311                         if (!path_is_absolute(a) || !path_is_absolute(b)) {
312                                 log_error("Invalid bind mount specification: %s", optarg);
313                                 return -EINVAL;
314                         }
315
316                         r = strv_extend(x, a);
317                         if (r < 0)
318                                 return r;
319
320                         r = strv_extend(x, b);
321                         if (r < 0)
322                                 return r;
323
324                         break;
325                 }
326
327                 case '?':
328                         return -EINVAL;
329
330                 default:
331                         log_error("Unknown option code %c", c);
332                         return -EINVAL;
333                 }
334         }
335
336         return 1;
337 }
338
339 static int mount_all(const char *dest) {
340
341         typedef struct MountPoint {
342                 const char *what;
343                 const char *where;
344                 const char *type;
345                 const char *options;
346                 unsigned long flags;
347                 bool fatal;
348         } MountPoint;
349
350         static const MountPoint mount_table[] = {
351                 { "proc",      "/proc",     "proc",  NULL,       MS_NOSUID|MS_NOEXEC|MS_NODEV, true  },
352                 { "/proc/sys", "/proc/sys", NULL,    NULL,       MS_BIND, true                       },   /* Bind mount first */
353                 { NULL,        "/proc/sys", NULL,    NULL,       MS_BIND|MS_RDONLY|MS_REMOUNT, true  },   /* Then, make it r/o */
354                 { "sysfs",     "/sys",      "sysfs", NULL,       MS_RDONLY|MS_NOSUID|MS_NOEXEC|MS_NODEV, true  },
355                 { "tmpfs",     "/dev",      "tmpfs", "mode=755", MS_NOSUID|MS_STRICTATIME,     true  },
356                 { "devpts",    "/dev/pts",  "devpts","newinstance,ptmxmode=0666,mode=620,gid=" STRINGIFY(TTY_GID), MS_NOSUID|MS_NOEXEC, true },
357                 { "tmpfs",     "/dev/shm",  "tmpfs", "mode=1777", MS_NOSUID|MS_NODEV|MS_STRICTATIME, true  },
358                 { "tmpfs",     "/run",      "tmpfs", "mode=755", MS_NOSUID|MS_NODEV|MS_STRICTATIME, true  },
359 #ifdef HAVE_SELINUX
360                 { "/sys/fs/selinux", "/sys/fs/selinux", NULL, NULL, MS_BIND,                      false },  /* Bind mount first */
361                 { NULL,              "/sys/fs/selinux", NULL, NULL, MS_BIND|MS_RDONLY|MS_REMOUNT, false },  /* Then, make it r/o */
362 #endif
363         };
364
365         unsigned k;
366         int r = 0;
367
368         for (k = 0; k < ELEMENTSOF(mount_table); k++) {
369                 _cleanup_free_ char *where = NULL;
370                 int t;
371
372                 where = strjoin(dest, "/", mount_table[k].where, NULL);
373                 if (!where)
374                         return log_oom();
375
376                 t = path_is_mount_point(where, true);
377                 if (t < 0) {
378                         log_error("Failed to detect whether %s is a mount point: %s", where, strerror(-t));
379
380                         if (r == 0)
381                                 r = t;
382
383                         continue;
384                 }
385
386                 /* Skip this entry if it is not a remount. */
387                 if (mount_table[k].what && t > 0)
388                         continue;
389
390                 mkdir_p(where, 0755);
391
392                 if (mount(mount_table[k].what,
393                           where,
394                           mount_table[k].type,
395                           mount_table[k].flags,
396                           mount_table[k].options) < 0 &&
397                     mount_table[k].fatal) {
398
399                         log_error("mount(%s) failed: %m", where);
400
401                         if (r == 0)
402                                 r = -errno;
403                 }
404         }
405
406         return r;
407 }
408
409 static int mount_binds(const char *dest, char **l, unsigned long flags) {
410         char **x, **y;
411
412         STRV_FOREACH_PAIR(x, y, l) {
413                 _cleanup_free_ char *where = NULL;
414
415                 where = strjoin(dest, "/", *y, NULL);
416                 if (!where)
417                         return log_oom();
418
419                 mkdir_p_label(where, 0755);
420
421                 if (mount(*x, where, "bind", MS_BIND, NULL) < 0) {
422                         log_error("mount(%s) failed: %m", where);
423                         return -errno;
424                 }
425
426                 if (flags && mount(NULL, where, NULL, MS_REMOUNT|MS_BIND|flags, NULL) < 0) {
427                         log_error("mount(%s) failed: %m", where);
428                         return -errno;
429                 }
430         }
431
432         return 0;
433 }
434
435 static int setup_timezone(const char *dest) {
436         _cleanup_free_ char *where = NULL, *p = NULL, *q = NULL, *check = NULL, *what = NULL;
437         char *z, *y;
438         int r;
439
440         assert(dest);
441
442         /* Fix the timezone, if possible */
443         r = readlink_malloc("/etc/localtime", &p);
444         if (r < 0) {
445                 log_warning("/etc/localtime is not a symlink, not updating container timezone.");
446                 return 0;
447         }
448
449         z = path_startswith(p, "../usr/share/zoneinfo/");
450         if (!z)
451                 z = path_startswith(p, "/usr/share/zoneinfo/");
452         if (!z) {
453                 log_warning("/etc/localtime does not point into /usr/share/zoneinfo/, not updating container timezone.");
454                 return 0;
455         }
456
457         where = strappend(dest, "/etc/localtime");
458         if (!where)
459                 return log_oom();
460
461         r = readlink_malloc(where, &q);
462         if (r >= 0) {
463                 y = path_startswith(q, "../usr/share/zoneinfo/");
464                 if (!y)
465                         y = path_startswith(q, "/usr/share/zoneinfo/");
466
467
468                 /* Already pointing to the right place? Then do nothing .. */
469                 if (y && streq(y, z))
470                         return 0;
471         }
472
473         check = strjoin(dest, "/usr/share/zoneinfo/", z, NULL);
474         if (!check)
475                 return log_oom();
476
477         if (access(check, F_OK) < 0) {
478                 log_warning("Timezone %s does not exist in container, not updating container timezone.", z);
479                 return 0;
480         }
481
482         what = strappend("../usr/share/zoneinfo/", z);
483         if (!what)
484                 return log_oom();
485
486         unlink(where);
487         if (symlink(what, where) < 0) {
488                 log_error("Failed to correct timezone of container: %m");
489                 return 0;
490         }
491
492         return 0;
493 }
494
495 static int setup_resolv_conf(const char *dest) {
496         char _cleanup_free_ *where = NULL;
497         _cleanup_close_ int fd = -1;
498
499         assert(dest);
500
501         if (arg_private_network)
502                 return 0;
503
504         /* Fix resolv.conf, if possible */
505         where = strappend(dest, "/etc/resolv.conf");
506         if (!where)
507                 return log_oom();
508
509         fd = open(where, O_WRONLY|O_CREAT|O_EXCL|O_CLOEXEC|O_NOCTTY|O_NOFOLLOW, 0644);
510
511         /* We don't really care for the results of this really. If it
512          * fails, it fails, but meh... */
513         if (mount("/etc/resolv.conf", where, "bind", MS_BIND, NULL) < 0)
514                 log_warning("Failed to bind mount /etc/resolv.conf: %m");
515         else
516                 if (mount("/etc/resolv.conf", where, "bind",
517                           MS_BIND|MS_REMOUNT|MS_RDONLY, NULL) < 0) {
518                         log_error("Failed to remount /etc/resolv.conf readonly: %m");
519                         return -errno;
520                 }
521
522         return 0;
523 }
524
525 static int setup_boot_id(const char *dest) {
526         _cleanup_free_ char *from = NULL, *to = NULL;
527         sd_id128_t rnd;
528         char as_uuid[37];
529         int r;
530
531         assert(dest);
532
533         /* Generate a new randomized boot ID, so that each boot-up of
534          * the container gets a new one */
535
536         from = strappend(dest, "/dev/proc-sys-kernel-random-boot-id");
537         to = strappend(dest, "/proc/sys/kernel/random/boot_id");
538         if (!from || !to)
539                 return log_oom();
540
541         r = sd_id128_randomize(&rnd);
542         if (r < 0) {
543                 log_error("Failed to generate random boot id: %s", strerror(-r));
544                 return r;
545         }
546
547         snprintf(as_uuid, sizeof(as_uuid),
548                  "%02x%02x%02x%02x-%02x%02x-%02x%02x-%02x%02x-%02x%02x%02x%02x%02x%02x",
549                  SD_ID128_FORMAT_VAL(rnd));
550         char_array_0(as_uuid);
551
552         r = write_string_file(from, as_uuid);
553         if (r < 0) {
554                 log_error("Failed to write boot id: %s", strerror(-r));
555                 return r;
556         }
557
558         if (mount(from, to, "bind", MS_BIND, NULL) < 0) {
559                 log_error("Failed to bind mount boot id: %m");
560                 r = -errno;
561         } else if (mount(from, to, "bind", MS_BIND|MS_REMOUNT|MS_RDONLY, NULL))
562                 log_warning("Failed to make boot id read-only: %m");
563
564         unlink(from);
565         return r;
566 }
567
568 static int copy_devnodes(const char *dest) {
569
570         static const char devnodes[] =
571                 "null\0"
572                 "zero\0"
573                 "full\0"
574                 "random\0"
575                 "urandom\0"
576                 "tty\0";
577
578         const char *d;
579         int r = 0;
580         _cleanup_umask_ mode_t u;
581
582         assert(dest);
583
584         u = umask(0000);
585
586         NULSTR_FOREACH(d, devnodes) {
587                 struct stat st;
588                 _cleanup_free_ char *from = NULL, *to = NULL;
589
590                 asprintf(&from, "/dev/%s", d);
591                 asprintf(&to, "%s/dev/%s", dest, d);
592
593                 if (!from || !to) {
594                         log_oom();
595
596                         if (r == 0)
597                                 r = -ENOMEM;
598
599                         break;
600                 }
601
602                 if (stat(from, &st) < 0) {
603
604                         if (errno != ENOENT) {
605                                 log_error("Failed to stat %s: %m", from);
606                                 if (r == 0)
607                                         r = -errno;
608                         }
609
610                 } else if (!S_ISCHR(st.st_mode) && !S_ISBLK(st.st_mode)) {
611
612                         log_error("%s is not a char or block device, cannot copy", from);
613                         if (r == 0)
614                                 r = -EIO;
615
616                 } else if (mknod(to, st.st_mode, st.st_rdev) < 0) {
617
618                         log_error("mknod(%s) failed: %m", dest);
619                         if (r == 0)
620                                 r = -errno;
621                 }
622         }
623
624         return r;
625 }
626
627 static int setup_ptmx(const char *dest) {
628         _cleanup_free_ char *p = NULL;
629
630         p = strappend(dest, "/dev/ptmx");
631         if (!p)
632                 return log_oom();
633
634         if (symlink("pts/ptmx", p) < 0) {
635                 log_error("Failed to create /dev/ptmx symlink: %m");
636                 return -errno;
637         }
638
639         return 0;
640 }
641
642 static int setup_dev_console(const char *dest, const char *console) {
643         struct stat st;
644         _cleanup_free_ char *to = NULL;
645         int r;
646         _cleanup_umask_ mode_t u;
647
648         assert(dest);
649         assert(console);
650
651         u = umask(0000);
652
653         if (stat(console, &st) < 0) {
654                 log_error("Failed to stat %s: %m", console);
655                 return -errno;
656
657         } else if (!S_ISCHR(st.st_mode)) {
658                 log_error("/dev/console is not a char device");
659                 return -EIO;
660         }
661
662         r = chmod_and_chown(console, 0600, 0, 0);
663         if (r < 0) {
664                 log_error("Failed to correct access mode for TTY: %s", strerror(-r));
665                 return r;
666         }
667
668         if (asprintf(&to, "%s/dev/console", dest) < 0)
669                 return log_oom();
670
671         /* We need to bind mount the right tty to /dev/console since
672          * ptys can only exist on pts file systems. To have something
673          * to bind mount things on we create a device node first, that
674          * has the right major/minor (note that the major minor
675          * doesn't actually matter here, since we mount it over
676          * anyway). */
677
678         if (mknod(to, (st.st_mode & ~07777) | 0600, st.st_rdev) < 0) {
679                 log_error("mknod() for /dev/console failed: %m");
680                 return -errno;
681         }
682
683         if (mount(console, to, "bind", MS_BIND, NULL) < 0) {
684                 log_error("Bind mount for /dev/console failed: %m");
685                 return -errno;
686         }
687
688         return 0;
689 }
690
691 static int setup_kmsg(const char *dest, int kmsg_socket) {
692         _cleanup_free_ char *from = NULL, *to = NULL;
693         int r, fd, k;
694         _cleanup_umask_ mode_t u;
695         union {
696                 struct cmsghdr cmsghdr;
697                 uint8_t buf[CMSG_SPACE(sizeof(int))];
698         } control = {};
699         struct msghdr mh = {
700                 .msg_control = &control,
701                 .msg_controllen = sizeof(control),
702         };
703         struct cmsghdr *cmsg;
704
705         assert(dest);
706         assert(kmsg_socket >= 0);
707
708         u = umask(0000);
709
710         /* We create the kmsg FIFO as /dev/kmsg, but immediately
711          * delete it after bind mounting it to /proc/kmsg. While FIFOs
712          * on the reading side behave very similar to /proc/kmsg,
713          * their writing side behaves differently from /dev/kmsg in
714          * that writing blocks when nothing is reading. In order to
715          * avoid any problems with containers deadlocking due to this
716          * we simply make /dev/kmsg unavailable to the container. */
717         if (asprintf(&from, "%s/dev/kmsg", dest) < 0 ||
718             asprintf(&to, "%s/proc/kmsg", dest) < 0)
719                 return log_oom();
720
721         if (mkfifo(from, 0600) < 0) {
722                 log_error("mkfifo() for /dev/kmsg failed: %m");
723                 return -errno;
724         }
725
726         r = chmod_and_chown(from, 0600, 0, 0);
727         if (r < 0) {
728                 log_error("Failed to correct access mode for /dev/kmsg: %s", strerror(-r));
729                 return r;
730         }
731
732         if (mount(from, to, "bind", MS_BIND, NULL) < 0) {
733                 log_error("Bind mount for /proc/kmsg failed: %m");
734                 return -errno;
735         }
736
737         fd = open(from, O_RDWR|O_NDELAY|O_CLOEXEC);
738         if (fd < 0) {
739                 log_error("Failed to open fifo: %m");
740                 return -errno;
741         }
742
743         cmsg = CMSG_FIRSTHDR(&mh);
744         cmsg->cmsg_level = SOL_SOCKET;
745         cmsg->cmsg_type = SCM_RIGHTS;
746         cmsg->cmsg_len = CMSG_LEN(sizeof(int));
747         memcpy(CMSG_DATA(cmsg), &fd, sizeof(int));
748
749         mh.msg_controllen = cmsg->cmsg_len;
750
751         /* Store away the fd in the socket, so that it stays open as
752          * long as we run the child */
753         k = sendmsg(kmsg_socket, &mh, MSG_DONTWAIT|MSG_NOSIGNAL);
754         close_nointr_nofail(fd);
755
756         if (k < 0) {
757                 log_error("Failed to send FIFO fd: %m");
758                 return -errno;
759         }
760
761         /* And now make the FIFO unavailable as /dev/kmsg... */
762         unlink(from);
763         return 0;
764 }
765
766 static int setup_hostname(void) {
767
768         if (sethostname(arg_machine, strlen(arg_machine)) < 0)
769                 return -errno;
770
771         return 0;
772 }
773
774 static int setup_journal(const char *directory) {
775         sd_id128_t machine_id;
776         _cleanup_free_ char *p = NULL, *b = NULL, *q = NULL, *d = NULL;
777         char *id;
778         int r;
779
780         if (arg_link_journal == LINK_NO)
781                 return 0;
782
783         p = strappend(directory, "/etc/machine-id");
784         if (!p)
785                 return log_oom();
786
787         r = read_one_line_file(p, &b);
788         if (r == -ENOENT && arg_link_journal == LINK_AUTO)
789                 return 0;
790         else if (r < 0) {
791                 log_error("Failed to read machine ID from %s: %s", p, strerror(-r));
792                 return r;
793         }
794
795         id = strstrip(b);
796         if (isempty(id) && arg_link_journal == LINK_AUTO)
797                 return 0;
798
799         /* Verify validity */
800         r = sd_id128_from_string(id, &machine_id);
801         if (r < 0) {
802                 log_error("Failed to parse machine ID from %s: %s", p, strerror(-r));
803                 return r;
804         }
805
806         free(p);
807         p = strappend("/var/log/journal/", id);
808         q = strjoin(directory, "/var/log/journal/", id, NULL);
809         if (!p || !q)
810                 return log_oom();
811
812         if (path_is_mount_point(p, false) > 0) {
813                 if (arg_link_journal != LINK_AUTO) {
814                         log_error("%s: already a mount point, refusing to use for journal", p);
815                         return -EEXIST;
816                 }
817
818                 return 0;
819         }
820
821         if (path_is_mount_point(q, false) > 0) {
822                 if (arg_link_journal != LINK_AUTO) {
823                         log_error("%s: already a mount point, refusing to use for journal", q);
824                         return -EEXIST;
825                 }
826
827                 return 0;
828         }
829
830         r = readlink_and_make_absolute(p, &d);
831         if (r >= 0) {
832                 if ((arg_link_journal == LINK_GUEST ||
833                      arg_link_journal == LINK_AUTO) &&
834                     path_equal(d, q)) {
835
836                         r = mkdir_p(q, 0755);
837                         if (r < 0)
838                                 log_warning("failed to create directory %s: %m", q);
839                         return 0;
840                 }
841
842                 if (unlink(p) < 0) {
843                         log_error("Failed to remove symlink %s: %m", p);
844                         return -errno;
845                 }
846         } else if (r == -EINVAL) {
847
848                 if (arg_link_journal == LINK_GUEST &&
849                     rmdir(p) < 0) {
850
851                         if (errno == ENOTDIR) {
852                                 log_error("%s already exists and is neither a symlink nor a directory", p);
853                                 return r;
854                         } else {
855                                 log_error("Failed to remove %s: %m", p);
856                                 return -errno;
857                         }
858                 }
859         } else if (r != -ENOENT) {
860                 log_error("readlink(%s) failed: %m", p);
861                 return r;
862         }
863
864         if (arg_link_journal == LINK_GUEST) {
865
866                 if (symlink(q, p) < 0) {
867                         log_error("Failed to symlink %s to %s: %m", q, p);
868                         return -errno;
869                 }
870
871                 r = mkdir_p(q, 0755);
872                 if (r < 0)
873                         log_warning("failed to create directory %s: %m", q);
874                 return 0;
875         }
876
877         if (arg_link_journal == LINK_HOST) {
878                 r = mkdir_p(p, 0755);
879                 if (r < 0) {
880                         log_error("Failed to create %s: %m", p);
881                         return r;
882                 }
883
884         } else if (access(p, F_OK) < 0)
885                 return 0;
886
887         if (dir_is_empty(q) == 0) {
888                 log_error("%s not empty.", q);
889                 return -ENOTEMPTY;
890         }
891
892         r = mkdir_p(q, 0755);
893         if (r < 0) {
894                 log_error("Failed to create %s: %m", q);
895                 return r;
896         }
897
898         if (mount(p, q, "bind", MS_BIND, NULL) < 0) {
899                 log_error("Failed to bind mount journal from host into guest: %m");
900                 return -errno;
901         }
902
903         return 0;
904 }
905
906 static int drop_capabilities(void) {
907         return capability_bounding_set_drop(~arg_retain, false);
908 }
909
910 static int process_pty(int master, pid_t pid, sigset_t *mask) {
911
912         char in_buffer[LINE_MAX], out_buffer[LINE_MAX];
913         size_t in_buffer_full = 0, out_buffer_full = 0;
914         struct epoll_event stdin_ev, stdout_ev, master_ev, signal_ev;
915         bool stdin_readable = false, stdout_writable = false, master_readable = false, master_writable = false;
916         int ep = -1, signal_fd = -1, r;
917         bool tried_orderly_shutdown = false;
918
919         assert(master >= 0);
920         assert(pid > 0);
921         assert(mask);
922
923         fd_nonblock(STDIN_FILENO, 1);
924         fd_nonblock(STDOUT_FILENO, 1);
925         fd_nonblock(master, 1);
926
927         signal_fd = signalfd(-1, mask, SFD_NONBLOCK|SFD_CLOEXEC);
928         if (signal_fd < 0) {
929                 log_error("signalfd(): %m");
930                 r = -errno;
931                 goto finish;
932         }
933
934         ep = epoll_create1(EPOLL_CLOEXEC);
935         if (ep < 0) {
936                 log_error("Failed to create epoll: %m");
937                 r = -errno;
938                 goto finish;
939         }
940
941         /* We read from STDIN only if this is actually a TTY,
942          * otherwise we assume non-interactivity. */
943         if (isatty(STDIN_FILENO)) {
944                 zero(stdin_ev);
945                 stdin_ev.events = EPOLLIN|EPOLLET;
946                 stdin_ev.data.fd = STDIN_FILENO;
947
948                 if (epoll_ctl(ep, EPOLL_CTL_ADD, STDIN_FILENO, &stdin_ev) < 0) {
949                         log_error("Failed to register STDIN in epoll: %m");
950                         r = -errno;
951                         goto finish;
952                 }
953         }
954
955         zero(stdout_ev);
956         stdout_ev.events = EPOLLOUT|EPOLLET;
957         stdout_ev.data.fd = STDOUT_FILENO;
958
959         zero(master_ev);
960         master_ev.events = EPOLLIN|EPOLLOUT|EPOLLET;
961         master_ev.data.fd = master;
962
963         zero(signal_ev);
964         signal_ev.events = EPOLLIN;
965         signal_ev.data.fd = signal_fd;
966
967         if (epoll_ctl(ep, EPOLL_CTL_ADD, STDOUT_FILENO, &stdout_ev) < 0) {
968                 if (errno != EPERM) {
969                         log_error("Failed to register stdout in epoll: %m");
970                         r = -errno;
971                         goto finish;
972                 }
973                 /* stdout without epoll support. Likely redirected to regular file. */
974                 stdout_writable = true;
975         }
976
977         if (epoll_ctl(ep, EPOLL_CTL_ADD, master, &master_ev) < 0 ||
978             epoll_ctl(ep, EPOLL_CTL_ADD, signal_fd, &signal_ev) < 0) {
979                 log_error("Failed to register fds in epoll: %m");
980                 r = -errno;
981                 goto finish;
982         }
983
984         for (;;) {
985                 struct epoll_event ev[16];
986                 ssize_t k;
987                 int i, nfds;
988
989                 nfds = epoll_wait(ep, ev, ELEMENTSOF(ev), -1);
990                 if (nfds < 0) {
991
992                         if (errno == EINTR || errno == EAGAIN)
993                                 continue;
994
995                         log_error("epoll_wait(): %m");
996                         r = -errno;
997                         goto finish;
998                 }
999
1000                 assert(nfds >= 1);
1001
1002                 for (i = 0; i < nfds; i++) {
1003                         if (ev[i].data.fd == STDIN_FILENO) {
1004
1005                                 if (ev[i].events & (EPOLLIN|EPOLLHUP))
1006                                         stdin_readable = true;
1007
1008                         } else if (ev[i].data.fd == STDOUT_FILENO) {
1009
1010                                 if (ev[i].events & (EPOLLOUT|EPOLLHUP))
1011                                         stdout_writable = true;
1012
1013                         } else if (ev[i].data.fd == master) {
1014
1015                                 if (ev[i].events & (EPOLLIN|EPOLLHUP))
1016                                         master_readable = true;
1017
1018                                 if (ev[i].events & (EPOLLOUT|EPOLLHUP))
1019                                         master_writable = true;
1020
1021                         } else if (ev[i].data.fd == signal_fd) {
1022                                 struct signalfd_siginfo sfsi;
1023                                 ssize_t n;
1024
1025                                 n = read(signal_fd, &sfsi, sizeof(sfsi));
1026                                 if (n != sizeof(sfsi)) {
1027
1028                                         if (n >= 0) {
1029                                                 log_error("Failed to read from signalfd: invalid block size");
1030                                                 r = -EIO;
1031                                                 goto finish;
1032                                         }
1033
1034                                         if (errno != EINTR && errno != EAGAIN) {
1035                                                 log_error("Failed to read from signalfd: %m");
1036                                                 r = -errno;
1037                                                 goto finish;
1038                                         }
1039                                 } else {
1040
1041                                         if (sfsi.ssi_signo == SIGWINCH) {
1042                                                 struct winsize ws;
1043
1044                                                 /* The window size changed, let's forward that. */
1045                                                 if (ioctl(STDIN_FILENO, TIOCGWINSZ, &ws) >= 0)
1046                                                         ioctl(master, TIOCSWINSZ, &ws);
1047                                         } else if (sfsi.ssi_signo == SIGTERM && arg_boot && !tried_orderly_shutdown) {
1048
1049                                                 log_info("Trying to halt container. Send SIGTERM again to trigger immediate termination.");
1050
1051                                                 /* This only works for systemd... */
1052                                                 tried_orderly_shutdown = true;
1053                                                 kill(pid, SIGRTMIN+3);
1054
1055                                         } else {
1056                                                 r = 0;
1057                                                 goto finish;
1058                                         }
1059                                 }
1060                         }
1061                 }
1062
1063                 while ((stdin_readable && in_buffer_full <= 0) ||
1064                        (master_writable && in_buffer_full > 0) ||
1065                        (master_readable && out_buffer_full <= 0) ||
1066                        (stdout_writable && out_buffer_full > 0)) {
1067
1068                         if (stdin_readable && in_buffer_full < LINE_MAX) {
1069
1070                                 k = read(STDIN_FILENO, in_buffer + in_buffer_full, LINE_MAX - in_buffer_full);
1071                                 if (k < 0) {
1072
1073                                         if (errno == EAGAIN || errno == EPIPE || errno == ECONNRESET || errno == EIO)
1074                                                 stdin_readable = false;
1075                                         else {
1076                                                 log_error("read(): %m");
1077                                                 r = -errno;
1078                                                 goto finish;
1079                                         }
1080                                 } else
1081                                         in_buffer_full += (size_t) k;
1082                         }
1083
1084                         if (master_writable && in_buffer_full > 0) {
1085
1086                                 k = write(master, in_buffer, in_buffer_full);
1087                                 if (k < 0) {
1088
1089                                         if (errno == EAGAIN || errno == EPIPE || errno == ECONNRESET || errno == EIO)
1090                                                 master_writable = false;
1091                                         else {
1092                                                 log_error("write(): %m");
1093                                                 r = -errno;
1094                                                 goto finish;
1095                                         }
1096
1097                                 } else {
1098                                         assert(in_buffer_full >= (size_t) k);
1099                                         memmove(in_buffer, in_buffer + k, in_buffer_full - k);
1100                                         in_buffer_full -= k;
1101                                 }
1102                         }
1103
1104                         if (master_readable && out_buffer_full < LINE_MAX) {
1105
1106                                 k = read(master, out_buffer + out_buffer_full, LINE_MAX - out_buffer_full);
1107                                 if (k < 0) {
1108
1109                                         if (errno == EAGAIN || errno == EPIPE || errno == ECONNRESET || errno == EIO)
1110                                                 master_readable = false;
1111                                         else {
1112                                                 log_error("read(): %m");
1113                                                 r = -errno;
1114                                                 goto finish;
1115                                         }
1116                                 }  else
1117                                         out_buffer_full += (size_t) k;
1118                         }
1119
1120                         if (stdout_writable && out_buffer_full > 0) {
1121
1122                                 k = write(STDOUT_FILENO, out_buffer, out_buffer_full);
1123                                 if (k < 0) {
1124
1125                                         if (errno == EAGAIN || errno == EPIPE || errno == ECONNRESET || errno == EIO)
1126                                                 stdout_writable = false;
1127                                         else {
1128                                                 log_error("write(): %m");
1129                                                 r = -errno;
1130                                                 goto finish;
1131                                         }
1132
1133                                 } else {
1134                                         assert(out_buffer_full >= (size_t) k);
1135                                         memmove(out_buffer, out_buffer + k, out_buffer_full - k);
1136                                         out_buffer_full -= k;
1137                                 }
1138                         }
1139                 }
1140         }
1141
1142 finish:
1143         if (ep >= 0)
1144                 close_nointr_nofail(ep);
1145
1146         if (signal_fd >= 0)
1147                 close_nointr_nofail(signal_fd);
1148
1149         return r;
1150 }
1151
1152 static int register_machine(void) {
1153         _cleanup_bus_error_free_ sd_bus_error error = SD_BUS_ERROR_NULL;
1154         _cleanup_bus_unref_ sd_bus *bus = NULL;
1155         int r;
1156
1157         r = sd_bus_open_system(&bus);
1158         if (r < 0) {
1159                 log_error("Failed to open system bus: %s", strerror(-r));
1160                 return r;
1161         }
1162
1163         r = sd_bus_call_method(
1164                         bus,
1165                         "org.freedesktop.machine1",
1166                         "/org/freedesktop/machine1",
1167                         "org.freedesktop.machine1.Manager",
1168                         "CreateMachine",
1169                         &error,
1170                         NULL,
1171                         "sayssuss",
1172                         arg_machine,
1173                         SD_BUS_APPEND_ID128(arg_uuid),
1174                         "nspawn",
1175                         "container",
1176                         (uint32_t) 0,
1177                         strempty(arg_slice),
1178                         strempty(arg_directory));
1179         if (r < 0) {
1180                 log_error("Failed to register machine: %s", error.message);
1181                 return r;
1182         }
1183
1184         return 0;
1185 }
1186
/* Probes for the kernel audit subsystem by trying to create a raw
 * NETLINK_AUDIT netlink socket. Returns true if the socket could be
 * created (it is closed again right away), false otherwise. */
static bool audit_enabled(void) {
        int audit_fd = socket(AF_NETLINK, SOCK_RAW, NETLINK_AUDIT);

        if (audit_fd < 0)
                return false;

        close_nointr_nofail(audit_fd);
        return true;
}
1197
1198 int main(int argc, char *argv[]) {
1199         pid_t pid = 0;
1200         int r = EXIT_FAILURE, k;
1201         _cleanup_close_ int master = -1;
1202         int n_fd_passed;
1203         const char *console = NULL;
1204         struct termios saved_attr, raw_attr;
1205         sigset_t mask;
1206         bool saved_attr_valid = false;
1207         struct winsize ws;
1208         int kmsg_socket_pair[2] = { -1, -1 };
1209         FDSet *fds = NULL;
1210
1211         log_parse_environment();
1212         log_open();
1213
1214         k = parse_argv(argc, argv);
1215         if (k < 0)
1216                 goto finish;
1217         else if (k == 0) {
1218                 r = EXIT_SUCCESS;
1219                 goto finish;
1220         }
1221
1222         if (arg_directory) {
1223                 char *p;
1224
1225                 p = path_make_absolute_cwd(arg_directory);
1226                 free(arg_directory);
1227                 arg_directory = p;
1228         } else
1229                 arg_directory = get_current_dir_name();
1230
1231         if (!arg_directory) {
1232                 log_error("Failed to determine path, please use -D.");
1233                 goto finish;
1234         }
1235
1236         path_kill_slashes(arg_directory);
1237
1238         if (!arg_machine) {
1239                 arg_machine = strdup(path_get_file_name(arg_directory));
1240                 if (!arg_machine) {
1241                         log_oom();
1242                         goto finish;
1243                 }
1244
1245                 hostname_cleanup(arg_machine, false);
1246                 if (isempty(arg_machine)) {
1247                         log_error("Failed to determine machine name automatically, please use -M.");
1248                         goto finish;
1249                 }
1250         }
1251
1252         if (geteuid() != 0) {
1253                 log_error("Need to be root.");
1254                 goto finish;
1255         }
1256
1257         if (sd_booted() <= 0) {
1258                 log_error("Not running on a systemd system.");
1259                 goto finish;
1260         }
1261
1262         if (arg_boot && audit_enabled()) {
1263                 log_warning("The kernel auditing subsystem is known to be incompatible with containers.\n"
1264                             "Please make sure to turn off auditing with 'audit=0' on the kernel command\n"
1265                             "line before using systemd-nspawn. Sleeping for 5s...\n");
1266                 sleep(5);
1267         }
1268
1269         if (path_equal(arg_directory, "/")) {
1270                 log_error("Spawning container on root directory not supported.");
1271                 goto finish;
1272         }
1273
1274         if (path_is_os_tree(arg_directory) <= 0) {
1275                 log_error("Directory %s doesn't look like an OS root directory (/etc/os-release is missing). Refusing.", arg_directory);
1276                 goto finish;
1277         }
1278
1279         log_close();
1280         n_fd_passed = sd_listen_fds(false);
1281         if (n_fd_passed > 0) {
1282                 k = fdset_new_listen_fds(&fds, false);
1283                 if (k < 0) {
1284                         log_error("Failed to collect file descriptors: %s", strerror(-k));
1285                         goto finish;
1286                 }
1287         }
1288         fdset_close_others(fds);
1289         log_open();
1290
1291         master = posix_openpt(O_RDWR|O_NOCTTY|O_CLOEXEC|O_NDELAY);
1292         if (master < 0) {
1293                 log_error("Failed to acquire pseudo tty: %m");
1294                 goto finish;
1295         }
1296
1297         console = ptsname(master);
1298         if (!console) {
1299                 log_error("Failed to determine tty name: %m");
1300                 goto finish;
1301         }
1302
1303         log_info("Spawning namespace container on %s (console is %s).", arg_directory, console);
1304
1305         if (ioctl(STDIN_FILENO, TIOCGWINSZ, &ws) >= 0)
1306                 ioctl(master, TIOCSWINSZ, &ws);
1307
1308         if (unlockpt(master) < 0) {
1309                 log_error("Failed to unlock tty: %m");
1310                 goto finish;
1311         }
1312
1313         if (tcgetattr(STDIN_FILENO, &saved_attr) >= 0) {
1314                 saved_attr_valid = true;
1315
1316                 raw_attr = saved_attr;
1317                 cfmakeraw(&raw_attr);
1318                 raw_attr.c_lflag &= ~ECHO;
1319         }
1320
1321         if (socketpair(AF_UNIX, SOCK_DGRAM|SOCK_NONBLOCK|SOCK_CLOEXEC, 0, kmsg_socket_pair) < 0) {
1322                 log_error("Failed to create kmsg socket pair.");
1323                 goto finish;
1324         }
1325
1326         sd_notify(0, "READY=1");
1327
1328         assert_se(sigemptyset(&mask) == 0);
1329         sigset_add_many(&mask, SIGCHLD, SIGWINCH, SIGTERM, SIGINT, -1);
1330         assert_se(sigprocmask(SIG_BLOCK, &mask, NULL) == 0);
1331
1332         for (;;) {
1333                 siginfo_t status;
1334                 int pipefd[2], pipefd2[2];
1335
1336                 if (pipe2(pipefd, O_NONBLOCK|O_CLOEXEC) < 0) {
1337                         log_error("pipe2(): %m");
1338                         goto finish;
1339                 }
1340
1341                 if (pipe2(pipefd2, O_NONBLOCK|O_CLOEXEC) < 0) {
1342                         log_error("pipe2(): %m");
1343                         close_pipe(pipefd);
1344                         goto finish;
1345                 }
1346
1347                 pid = syscall(__NR_clone, SIGCHLD|CLONE_NEWIPC|CLONE_NEWNS|CLONE_NEWPID|CLONE_NEWUTS|(arg_private_network ? CLONE_NEWNET : 0), NULL);
1348                 if (pid < 0) {
1349                         if (errno == EINVAL)
1350                                 log_error("clone() failed, do you have namespace support enabled in your kernel? (You need UTS, IPC, PID and NET namespacing built in): %m");
1351                         else
1352                                 log_error("clone() failed: %m");
1353
1354                         goto finish;
1355                 }
1356
1357                 if (pid == 0) {
1358                         /* child */
1359                         const char *home = NULL;
1360                         uid_t uid = (uid_t) -1;
1361                         gid_t gid = (gid_t) -1;
1362                         unsigned n_env = 2;
1363                         const char *envp[] = {
1364                                 "PATH=/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin",
1365                                 "container=systemd-nspawn", /* LXC sets container=lxc, so follow the scheme here */
1366                                 NULL, /* TERM */
1367                                 NULL, /* HOME */
1368                                 NULL, /* USER */
1369                                 NULL, /* LOGNAME */
1370                                 NULL, /* container_uuid */
1371                                 NULL, /* LISTEN_FDS */
1372                                 NULL, /* LISTEN_PID */
1373                                 NULL
1374                         };
1375
1376                         envp[n_env] = strv_find_prefix(environ, "TERM=");
1377                         if (envp[n_env])
1378                                 n_env ++;
1379
1380                         /* Wait for the parent process to log our PID */
1381                         close_nointr_nofail(pipefd[1]);
1382                         fd_wait_for_event(pipefd[0], POLLHUP, -1);
1383                         close_nointr_nofail(pipefd[0]);
1384
1385                         close_nointr_nofail(master);
1386                         master = -1;
1387
1388                         if (saved_attr_valid) {
1389                                 if (tcsetattr(STDIN_FILENO, TCSANOW, &raw_attr) < 0) {
1390                                         log_error("Failed to set terminal attributes: %m");
1391                                         goto child_fail;
1392                                 }
1393                         }
1394
1395                         close_nointr(STDIN_FILENO);
1396                         close_nointr(STDOUT_FILENO);
1397                         close_nointr(STDERR_FILENO);
1398
1399                         close_nointr_nofail(kmsg_socket_pair[0]);
1400                         kmsg_socket_pair[0] = -1;
1401
1402                         reset_all_signal_handlers();
1403
1404                         assert_se(sigemptyset(&mask) == 0);
1405                         assert_se(sigprocmask(SIG_SETMASK, &mask, NULL) == 0);
1406
1407                         k = open_terminal(console, O_RDWR);
1408                         if (k != STDIN_FILENO) {
1409                                 if (k >= 0) {
1410                                         close_nointr_nofail(k);
1411                                         k = -EINVAL;
1412                                 }
1413
1414                                 log_error("Failed to open console: %s", strerror(-k));
1415                                 goto child_fail;
1416                         }
1417
1418                         if (dup2(STDIN_FILENO, STDOUT_FILENO) != STDOUT_FILENO ||
1419                             dup2(STDIN_FILENO, STDERR_FILENO) != STDERR_FILENO) {
1420                                 log_error("Failed to duplicate console: %m");
1421                                 goto child_fail;
1422                         }
1423
1424                         if (setsid() < 0) {
1425                                 log_error("setsid() failed: %m");
1426                                 goto child_fail;
1427                         }
1428
1429                         if (prctl(PR_SET_PDEATHSIG, SIGKILL) < 0) {
1430                                 log_error("PR_SET_PDEATHSIG failed: %m");
1431                                 goto child_fail;
1432                         }
1433
1434                         close_pipe(pipefd2);
1435
1436                         r = register_machine();
1437                         if (r < 0)
1438                                 goto finish;
1439
1440                         /* Mark everything as slave, so that we still
1441                          * receive mounts from the real root, but don't
1442                          * propagate mounts to the real root. */
1443                         if (mount(NULL, "/", NULL, MS_SLAVE|MS_REC, NULL) < 0) {
1444                                 log_error("MS_SLAVE|MS_REC failed: %m");
1445                                 goto child_fail;
1446                         }
1447
1448                         /* Turn directory into bind mount */
1449                         if (mount(arg_directory, arg_directory, "bind", MS_BIND|MS_REC, NULL) < 0) {
1450                                 log_error("Failed to make bind mount.");
1451                                 goto child_fail;
1452                         }
1453
1454                         if (arg_read_only)
1455                                 if (mount(arg_directory, arg_directory, "bind", MS_BIND|MS_REMOUNT|MS_RDONLY|MS_REC, NULL) < 0) {
1456                                         log_error("Failed to make read-only.");
1457                                         goto child_fail;
1458                                 }
1459
1460                         if (mount_all(arg_directory) < 0)
1461                                 goto child_fail;
1462
1463                         if (copy_devnodes(arg_directory) < 0)
1464                                 goto child_fail;
1465
1466                         if (setup_ptmx(arg_directory) < 0)
1467                                 goto child_fail;
1468
1469                         dev_setup(arg_directory);
1470
1471                         if (setup_dev_console(arg_directory, console) < 0)
1472                                 goto child_fail;
1473
1474                         if (setup_kmsg(arg_directory, kmsg_socket_pair[1]) < 0)
1475                                 goto child_fail;
1476
1477                         close_nointr_nofail(kmsg_socket_pair[1]);
1478                         kmsg_socket_pair[1] = -1;
1479
1480                         if (setup_boot_id(arg_directory) < 0)
1481                                 goto child_fail;
1482
1483                         if (setup_timezone(arg_directory) < 0)
1484                                 goto child_fail;
1485
1486                         if (setup_resolv_conf(arg_directory) < 0)
1487                                 goto child_fail;
1488
1489                         if (setup_journal(arg_directory) < 0)
1490                                 goto child_fail;
1491
1492                         if (mount_binds(arg_directory, arg_bind, 0) < 0)
1493                                 goto child_fail;
1494
1495                         if (mount_binds(arg_directory, arg_bind_ro, MS_RDONLY) < 0)
1496                                 goto child_fail;
1497
1498                         if (chdir(arg_directory) < 0) {
1499                                 log_error("chdir(%s) failed: %m", arg_directory);
1500                                 goto child_fail;
1501                         }
1502
1503                         if (mount(arg_directory, "/", NULL, MS_MOVE, NULL) < 0) {
1504                                 log_error("mount(MS_MOVE) failed: %m");
1505                                 goto child_fail;
1506                         }
1507
1508                         if (chroot(".") < 0) {
1509                                 log_error("chroot() failed: %m");
1510                                 goto child_fail;
1511                         }
1512
1513                         if (chdir("/") < 0) {
1514                                 log_error("chdir() failed: %m");
1515                                 goto child_fail;
1516                         }
1517
1518                         umask(0022);
1519
1520                         loopback_setup();
1521
1522                         if (drop_capabilities() < 0) {
1523                                 log_error("drop_capabilities() failed: %m");
1524                                 goto child_fail;
1525                         }
1526
1527                         if (arg_user) {
1528
1529                                 /* Note that this resolves user names
1530                                  * inside the container, and hence
1531                                  * accesses the NSS modules from the
1532                                  * container and not the host. This is
1533                                  * a bit weird... */
1534
1535                                 if (get_user_creds((const char**)&arg_user, &uid, &gid, &home, NULL) < 0) {
1536                                         log_error("get_user_creds() failed: %m");
1537                                         goto child_fail;
1538                                 }
1539
1540                                 if (mkdir_parents_label(home, 0775) < 0) {
1541                                         log_error("mkdir_parents_label() failed: %m");
1542                                         goto child_fail;
1543                                 }
1544
1545                                 if (mkdir_safe_label(home, 0775, uid, gid) < 0) {
1546                                         log_error("mkdir_safe_label() failed: %m");
1547                                         goto child_fail;
1548                                 }
1549
1550                                 if (initgroups((const char*)arg_user, gid) < 0) {
1551                                         log_error("initgroups() failed: %m");
1552                                         goto child_fail;
1553                                 }
1554
1555                                 if (setresgid(gid, gid, gid) < 0) {
1556                                         log_error("setregid() failed: %m");
1557                                         goto child_fail;
1558                                 }
1559
1560                                 if (setresuid(uid, uid, uid) < 0) {
1561                                         log_error("setreuid() failed: %m");
1562                                         goto child_fail;
1563                                 }
1564                         } else {
1565                                 /* Reset everything fully to 0, just in case */
1566
1567                                 if (setgroups(0, NULL) < 0) {
1568                                         log_error("setgroups() failed: %m");
1569                                         goto child_fail;
1570                                 }
1571
1572                                 if (setresgid(0, 0, 0) < 0) {
1573                                         log_error("setregid() failed: %m");
1574                                         goto child_fail;
1575                                 }
1576
1577                                 if (setresuid(0, 0, 0) < 0) {
1578                                         log_error("setreuid() failed: %m");
1579                                         goto child_fail;
1580                                 }
1581                         }
1582
1583                         if ((asprintf((char**)(envp + n_env++), "HOME=%s", home ? home: "/root") < 0) ||
1584                             (asprintf((char**)(envp + n_env++), "USER=%s", arg_user ? arg_user : "root") < 0) ||
1585                             (asprintf((char**)(envp + n_env++), "LOGNAME=%s", arg_user ? arg_user : "root") < 0)) {
1586                                 log_oom();
1587                                 goto child_fail;
1588                         }
1589
1590                         if (!sd_id128_equal(arg_uuid, SD_ID128_NULL)) {
1591                                 if (asprintf((char**)(envp + n_env++), "container_uuid=" SD_ID128_FORMAT_STR, SD_ID128_FORMAT_VAL(arg_uuid)) < 0) {
1592                                         log_oom();
1593                                         goto child_fail;
1594                                 }
1595                         }
1596
1597                         if (fdset_size(fds) > 0) {
1598                                 k = fdset_cloexec(fds, false);
1599                                 if (k < 0) {
1600                                         log_error("Failed to unset O_CLOEXEC for file descriptors.");
1601                                         goto child_fail;
1602                                 }
1603
1604                                 if ((asprintf((char **)(envp + n_env++), "LISTEN_FDS=%u", n_fd_passed) < 0) ||
1605                                     (asprintf((char **)(envp + n_env++), "LISTEN_PID=%lu", (unsigned long) 1) < 0)) {
1606                                         log_oom();
1607                                         goto child_fail;
1608                                 }
1609                         }
1610
1611                         setup_hostname();
1612
1613                         if (arg_boot) {
1614                                 char **a;
1615                                 size_t l;
1616
1617                                 /* Automatically search for the init system */
1618
1619                                 l = 1 + argc - optind;
1620                                 a = newa(char*, l + 1);
1621                                 memcpy(a + 1, argv + optind, l * sizeof(char*));
1622
1623                                 a[0] = (char*) "/usr/lib/systemd/systemd";
1624                                 execve(a[0], a, (char**) envp);
1625
1626                                 a[0] = (char*) "/lib/systemd/systemd";
1627                                 execve(a[0], a, (char**) envp);
1628
1629                                 a[0] = (char*) "/sbin/init";
1630                                 execve(a[0], a, (char**) envp);
1631                         } else if (argc > optind)
1632                                 execvpe(argv[optind], argv + optind, (char**) envp);
1633                         else {
1634                                 chdir(home ? home : "/root");
1635                                 execle("/bin/bash", "-bash", NULL, (char**) envp);
1636                         }
1637
1638                         log_error("execv() failed: %m");
1639
1640                 child_fail:
1641                         _exit(EXIT_FAILURE);
1642                 }
1643
1644                 log_info("Init process in the container running as PID %lu.", (unsigned long) pid);
1645                 close_nointr_nofail(pipefd[0]);
1646                 close_nointr_nofail(pipefd[1]);
1647
1648                 /* Wait for the child process to establish cgroup hierarchy */
1649                 close_nointr_nofail(pipefd2[1]);
1650                 fd_wait_for_event(pipefd2[0], POLLHUP, -1);
1651                 close_nointr_nofail(pipefd2[0]);
1652
1653                 fdset_free(fds);
1654                 fds = NULL;
1655
1656                 if (process_pty(master, pid, &mask) < 0)
1657                         goto finish;
1658
1659                 if (saved_attr_valid)
1660                         tcsetattr(STDIN_FILENO, TCSANOW, &saved_attr);
1661
1662                 k = wait_for_terminate(pid, &status);
1663                 if (k < 0) {
1664                         r = EXIT_FAILURE;
1665                         break;
1666                 }
1667
1668                 if (status.si_code == CLD_EXITED) {
1669                         r = status.si_status;
1670                         if (status.si_status != 0) {
1671                                 log_error("Container failed with error code %i.", status.si_status);
1672                                 break;
1673                         }
1674
1675                         log_debug("Container exited successfully.");
1676                         break;
1677                 } else if (status.si_code == CLD_KILLED &&
1678                            status.si_status == SIGINT) {
1679                         log_info("Container has been shut down.");
1680                         r = 0;
1681                         break;
1682                 } else if (status.si_code == CLD_KILLED &&
1683                            status.si_status == SIGHUP) {
1684                         log_info("Container is being rebooted.");
1685                         continue;
1686                 } else if (status.si_code == CLD_KILLED ||
1687                            status.si_code == CLD_DUMPED) {
1688
1689                         log_error("Container terminated by signal %s.", signal_to_string(status.si_status));
1690                         r = EXIT_FAILURE;
1691                         break;
1692                 } else {
1693                         log_error("Container failed due to unknown reason.");
1694                         r = EXIT_FAILURE;
1695                         break;
1696                 }
1697         }
1698
1699 finish:
1700         if (saved_attr_valid)
1701                 tcsetattr(STDIN_FILENO, TCSANOW, &saved_attr);
1702
1703         close_pipe(kmsg_socket_pair);
1704
1705         if (pid > 0)
1706                 kill(pid, SIGKILL);
1707
1708         free(arg_directory);
1709         free(arg_machine);
1710
1711         fdset_free(fds);
1712
1713         return r;
1714 }