chiark / gitweb /
cb2f05c02d0c74c1cce1d4db64dce29840c70605
[elogind.git] / src / nspawn / nspawn.c
1 /*-*- Mode: C; c-basic-offset: 8; indent-tabs-mode: nil -*-*/
2
3 /***
4   This file is part of systemd.
5
6   Copyright 2010 Lennart Poettering
7
8   systemd is free software; you can redistribute it and/or modify it
9   under the terms of the GNU Lesser General Public License as published by
10   the Free Software Foundation; either version 2.1 of the License, or
11   (at your option) any later version.
12
13   systemd is distributed in the hope that it will be useful, but
14   WITHOUT ANY WARRANTY; without even the implied warranty of
15   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
16   Lesser General Public License for more details.
17
18   You should have received a copy of the GNU Lesser General Public License
19   along with systemd; If not, see <http://www.gnu.org/licenses/>.
20 ***/
21
22 #include <signal.h>
23 #include <sched.h>
24 #include <unistd.h>
25 #include <sys/types.h>
26 #include <sys/syscall.h>
27 #include <sys/mount.h>
28 #include <sys/wait.h>
29 #include <stdlib.h>
30 #include <string.h>
31 #include <stdio.h>
32 #include <errno.h>
33 #include <sys/prctl.h>
34 #include <sys/capability.h>
35 #include <getopt.h>
36 #include <sys/poll.h>
37 #include <sys/epoll.h>
38 #include <termios.h>
39 #include <sys/signalfd.h>
40 #include <grp.h>
41 #include <linux/fs.h>
42 #include <sys/un.h>
43 #include <sys/socket.h>
44 #include <linux/netlink.h>
45
46 #include <systemd/sd-daemon.h>
47 #include <systemd/sd-bus.h>
48
49 #include "log.h"
50 #include "util.h"
51 #include "mkdir.h"
52 #include "macro.h"
53 #include "audit.h"
54 #include "missing.h"
55 #include "cgroup-util.h"
56 #include "strv.h"
57 #include "path-util.h"
58 #include "loopback-setup.h"
59 #include "sd-id128.h"
60 #include "dev-setup.h"
61 #include "fdset.h"
62 #include "build.h"
63 #include "fileio.h"
64 #include "bus-util.h"
65 #include "ptyfwd.h"
66
/* Group id placed in the "gid=" option of the container's devpts mount,
 * unless the build already supplies a value. */
#if !defined(TTY_GID)
#define TTY_GID 5
#endif

/* Journal linking modes selectable with --link-journal= (and -j). */
typedef enum LinkJournal {
        LINK_NO    = 0,   /* --link-journal=no */
        LINK_AUTO  = 1,   /* --link-journal=auto, the default */
        LINK_HOST  = 2,   /* --link-journal=host */
        LINK_GUEST = 3    /* --link-journal=guest */
} LinkJournal;
77
78 static char *arg_directory = NULL;
79 static char *arg_user = NULL;
80 static sd_id128_t arg_uuid = {};
81 static char *arg_machine = NULL;
82 static const char *arg_slice = NULL;
83 static bool arg_private_network = false;
84 static bool arg_read_only = false;
85 static bool arg_boot = false;
86 static LinkJournal arg_link_journal = LINK_AUTO;
87 static uint64_t arg_retain =
88         (1ULL << CAP_CHOWN) |
89         (1ULL << CAP_DAC_OVERRIDE) |
90         (1ULL << CAP_DAC_READ_SEARCH) |
91         (1ULL << CAP_FOWNER) |
92         (1ULL << CAP_FSETID) |
93         (1ULL << CAP_IPC_OWNER) |
94         (1ULL << CAP_KILL) |
95         (1ULL << CAP_LEASE) |
96         (1ULL << CAP_LINUX_IMMUTABLE) |
97         (1ULL << CAP_NET_BIND_SERVICE) |
98         (1ULL << CAP_NET_BROADCAST) |
99         (1ULL << CAP_NET_RAW) |
100         (1ULL << CAP_SETGID) |
101         (1ULL << CAP_SETFCAP) |
102         (1ULL << CAP_SETPCAP) |
103         (1ULL << CAP_SETUID) |
104         (1ULL << CAP_SYS_ADMIN) |
105         (1ULL << CAP_SYS_CHROOT) |
106         (1ULL << CAP_SYS_NICE) |
107         (1ULL << CAP_SYS_PTRACE) |
108         (1ULL << CAP_SYS_TTY_CONFIG) |
109         (1ULL << CAP_SYS_RESOURCE) |
110         (1ULL << CAP_SYS_BOOT) |
111         (1ULL << CAP_AUDIT_WRITE) |
112         (1ULL << CAP_AUDIT_CONTROL);
113 static char **arg_bind = NULL;
114 static char **arg_bind_ro = NULL;
115
116 static int help(void) {
117
118         printf("%s [OPTIONS...] [PATH] [ARGUMENTS...]\n\n"
119                "Spawn a minimal namespace container for debugging, testing and building.\n\n"
120                "  -h --help                Show this help\n"
121                "     --version             Print version string\n"
122                "  -D --directory=NAME      Root directory for the container\n"
123                "  -b --boot                Boot up full system (i.e. invoke init)\n"
124                "  -u --user=USER           Run the command under specified user or uid\n"
125                "     --uuid=UUID           Set a specific machine UUID for the container\n"
126                "  -M --machine=NAME        Set the machine name for the container\n"
127                "  -S --slice=SLICE         Place the container in the specified slice\n"
128                "     --private-network     Disable network in container\n"
129                "     --read-only           Mount the root directory read-only\n"
130                "     --capability=CAP      In addition to the default, retain specified\n"
131                "                           capability\n"
132                "     --link-journal=MODE   Link up guest journal, one of no, auto, guest, host\n"
133                "  -j                       Equivalent to --link-journal=host\n"
134                "     --bind=PATH[:PATH]    Bind mount a file or directory from the host into\n"
135                "                           the container\n"
136                "     --bind-ro=PATH[:PATH] Similar, but creates a read-only bind mount\n",
137                program_invocation_short_name);
138
139         return 0;
140 }
141
142 static int parse_argv(int argc, char *argv[]) {
143
144         enum {
145                 ARG_VERSION = 0x100,
146                 ARG_PRIVATE_NETWORK,
147                 ARG_UUID,
148                 ARG_READ_ONLY,
149                 ARG_CAPABILITY,
150                 ARG_LINK_JOURNAL,
151                 ARG_BIND,
152                 ARG_BIND_RO
153         };
154
155         static const struct option options[] = {
156                 { "help",            no_argument,       NULL, 'h'                 },
157                 { "version",         no_argument,       NULL, ARG_VERSION         },
158                 { "directory",       required_argument, NULL, 'D'                 },
159                 { "user",            required_argument, NULL, 'u'                 },
160                 { "private-network", no_argument,       NULL, ARG_PRIVATE_NETWORK },
161                 { "boot",            no_argument,       NULL, 'b'                 },
162                 { "uuid",            required_argument, NULL, ARG_UUID            },
163                 { "read-only",       no_argument,       NULL, ARG_READ_ONLY       },
164                 { "capability",      required_argument, NULL, ARG_CAPABILITY      },
165                 { "link-journal",    required_argument, NULL, ARG_LINK_JOURNAL    },
166                 { "bind",            required_argument, NULL, ARG_BIND            },
167                 { "bind-ro",         required_argument, NULL, ARG_BIND_RO         },
168                 { "machine",         required_argument, NULL, 'M'                 },
169                 { "slice",           required_argument, NULL, 'S'                 },
170                 { NULL,              0,                 NULL, 0                   }
171         };
172
173         int c, r;
174
175         assert(argc >= 0);
176         assert(argv);
177
178         while ((c = getopt_long(argc, argv, "+hD:u:bM:jS:", options, NULL)) >= 0) {
179
180                 switch (c) {
181
182                 case 'h':
183                         help();
184                         return 0;
185
186                 case ARG_VERSION:
187                         puts(PACKAGE_STRING);
188                         puts(SYSTEMD_FEATURES);
189                         return 0;
190
191                 case 'D':
192                         free(arg_directory);
193                         arg_directory = canonicalize_file_name(optarg);
194                         if (!arg_directory) {
195                                 log_error("Failed to canonicalize root directory.");
196                                 return -ENOMEM;
197                         }
198
199                         break;
200
201                 case 'u':
202                         free(arg_user);
203                         arg_user = strdup(optarg);
204                         if (!arg_user)
205                                 return log_oom();
206
207                         break;
208
209                 case ARG_PRIVATE_NETWORK:
210                         arg_private_network = true;
211                         break;
212
213                 case 'b':
214                         arg_boot = true;
215                         break;
216
217                 case ARG_UUID:
218                         r = sd_id128_from_string(optarg, &arg_uuid);
219                         if (r < 0) {
220                                 log_error("Invalid UUID: %s", optarg);
221                                 return r;
222                         }
223                         break;
224
225                 case 'S':
226                         arg_slice = strdup(optarg);
227                         break;
228
229                 case 'M':
230                         if (!hostname_is_valid(optarg)) {
231                                 log_error("Invalid machine name: %s", optarg);
232                                 return -EINVAL;
233                         }
234
235                         free(arg_machine);
236                         arg_machine = strdup(optarg);
237                         if (!arg_machine)
238                                 return log_oom();
239
240                         break;
241
242                 case ARG_READ_ONLY:
243                         arg_read_only = true;
244                         break;
245
246                 case ARG_CAPABILITY: {
247                         char *state, *word;
248                         size_t length;
249
250                         FOREACH_WORD_SEPARATOR(word, length, optarg, ",", state) {
251                                 cap_value_t cap;
252                                 char *t;
253
254                                 t = strndup(word, length);
255                                 if (!t)
256                                         return log_oom();
257
258                                 if (cap_from_name(t, &cap) < 0) {
259                                         log_error("Failed to parse capability %s.", t);
260                                         free(t);
261                                         return -EINVAL;
262                                 }
263
264                                 free(t);
265                                 arg_retain |= 1ULL << (uint64_t) cap;
266                         }
267
268                         break;
269                 }
270
271                 case 'j':
272                         arg_link_journal = LINK_GUEST;
273                         break;
274
275                 case ARG_LINK_JOURNAL:
276                         if (streq(optarg, "auto"))
277                                 arg_link_journal = LINK_AUTO;
278                         else if (streq(optarg, "no"))
279                                 arg_link_journal = LINK_NO;
280                         else if (streq(optarg, "guest"))
281                                 arg_link_journal = LINK_GUEST;
282                         else if (streq(optarg, "host"))
283                                 arg_link_journal = LINK_HOST;
284                         else {
285                                 log_error("Failed to parse link journal mode %s", optarg);
286                                 return -EINVAL;
287                         }
288
289                         break;
290
291                 case ARG_BIND:
292                 case ARG_BIND_RO: {
293                         _cleanup_free_ char *a = NULL, *b = NULL;
294                         char *e;
295                         char ***x;
296
297                         x = c == ARG_BIND ? &arg_bind : &arg_bind_ro;
298
299                         e = strchr(optarg, ':');
300                         if (e) {
301                                 a = strndup(optarg, e - optarg);
302                                 b = strdup(e + 1);
303                         } else {
304                                 a = strdup(optarg);
305                                 b = strdup(optarg);
306                         }
307
308                         if (!a || !b)
309                                 return log_oom();
310
311                         if (!path_is_absolute(a) || !path_is_absolute(b)) {
312                                 log_error("Invalid bind mount specification: %s", optarg);
313                                 return -EINVAL;
314                         }
315
316                         r = strv_extend(x, a);
317                         if (r < 0)
318                                 return r;
319
320                         r = strv_extend(x, b);
321                         if (r < 0)
322                                 return r;
323
324                         break;
325                 }
326
327                 case '?':
328                         return -EINVAL;
329
330                 default:
331                         log_error("Unknown option code %c", c);
332                         return -EINVAL;
333                 }
334         }
335
336         return 1;
337 }
338
339 static int mount_all(const char *dest) {
340
341         typedef struct MountPoint {
342                 const char *what;
343                 const char *where;
344                 const char *type;
345                 const char *options;
346                 unsigned long flags;
347                 bool fatal;
348         } MountPoint;
349
350         static const MountPoint mount_table[] = {
351                 { "proc",      "/proc",     "proc",  NULL,       MS_NOSUID|MS_NOEXEC|MS_NODEV, true  },
352                 { "/proc/sys", "/proc/sys", NULL,    NULL,       MS_BIND, true                       },   /* Bind mount first */
353                 { NULL,        "/proc/sys", NULL,    NULL,       MS_BIND|MS_RDONLY|MS_REMOUNT, true  },   /* Then, make it r/o */
354                 { "sysfs",     "/sys",      "sysfs", NULL,       MS_RDONLY|MS_NOSUID|MS_NOEXEC|MS_NODEV, true  },
355                 { "tmpfs",     "/dev",      "tmpfs", "mode=755", MS_NOSUID|MS_STRICTATIME,     true  },
356                 { "devpts",    "/dev/pts",  "devpts","newinstance,ptmxmode=0666,mode=620,gid=" STRINGIFY(TTY_GID), MS_NOSUID|MS_NOEXEC, true },
357                 { "tmpfs",     "/dev/shm",  "tmpfs", "mode=1777", MS_NOSUID|MS_NODEV|MS_STRICTATIME, true  },
358                 { "tmpfs",     "/run",      "tmpfs", "mode=755", MS_NOSUID|MS_NODEV|MS_STRICTATIME, true  },
359 #ifdef HAVE_SELINUX
360                 { "/sys/fs/selinux", "/sys/fs/selinux", NULL, NULL, MS_BIND,                      false },  /* Bind mount first */
361                 { NULL,              "/sys/fs/selinux", NULL, NULL, MS_BIND|MS_RDONLY|MS_REMOUNT, false },  /* Then, make it r/o */
362 #endif
363         };
364
365         unsigned k;
366         int r = 0;
367
368         for (k = 0; k < ELEMENTSOF(mount_table); k++) {
369                 _cleanup_free_ char *where = NULL;
370                 int t;
371
372                 where = strjoin(dest, "/", mount_table[k].where, NULL);
373                 if (!where)
374                         return log_oom();
375
376                 t = path_is_mount_point(where, true);
377                 if (t < 0) {
378                         log_error("Failed to detect whether %s is a mount point: %s", where, strerror(-t));
379
380                         if (r == 0)
381                                 r = t;
382
383                         continue;
384                 }
385
386                 /* Skip this entry if it is not a remount. */
387                 if (mount_table[k].what && t > 0)
388                         continue;
389
390                 mkdir_p(where, 0755);
391
392                 if (mount(mount_table[k].what,
393                           where,
394                           mount_table[k].type,
395                           mount_table[k].flags,
396                           mount_table[k].options) < 0 &&
397                     mount_table[k].fatal) {
398
399                         log_error("mount(%s) failed: %m", where);
400
401                         if (r == 0)
402                                 r = -errno;
403                 }
404         }
405
406         return r;
407 }
408
409 static int mount_binds(const char *dest, char **l, unsigned long flags) {
410         char **x, **y;
411
412         STRV_FOREACH_PAIR(x, y, l) {
413                 _cleanup_free_ char *where = NULL;
414                 struct stat source_st, dest_st;
415
416                 if (stat(*x, &source_st) < 0) {
417                         log_error("failed to stat %s: %m", *x);
418                         return -errno;
419                 }
420
421                 where = strjoin(dest, "/", *y, NULL);
422                 if (!where)
423                         return log_oom();
424
425                 if (stat(where, &dest_st) == 0) {
426                         if ((source_st.st_mode & S_IFMT) != (dest_st.st_mode & S_IFMT)) {
427                                 log_error("The file types of %s and %s do not match. Refusing bind mount",
428                                                 *x, where);
429                                 return -EINVAL;
430                         }
431                 } else {
432                         /* Create the mount point, but be conservative -- refuse to create block
433                          * and char devices. */
434                         if (S_ISDIR(source_st.st_mode))
435                                 mkdir_p_label(where, 0755);
436                         else if (S_ISFIFO(source_st.st_mode))
437                                 mkfifo(where, 0644);
438                         else if (S_ISSOCK(source_st.st_mode))
439                                 mknod(where, 0644 | S_IFSOCK, 0);
440                         else if (S_ISREG(source_st.st_mode))
441                                 touch(where);
442                         else {
443                                 log_error("Refusing to create mountpoint for file: %s", *x);
444                                 return -ENOTSUP;
445                         }
446                 }
447
448                 if (mount(*x, where, "bind", MS_BIND, NULL) < 0) {
449                         log_error("mount(%s) failed: %m", where);
450                         return -errno;
451                 }
452
453                 if (flags && mount(NULL, where, NULL, MS_REMOUNT|MS_BIND|flags, NULL) < 0) {
454                         log_error("mount(%s) failed: %m", where);
455                         return -errno;
456                 }
457         }
458
459         return 0;
460 }
461
462 static int setup_timezone(const char *dest) {
463         _cleanup_free_ char *where = NULL, *p = NULL, *q = NULL, *check = NULL, *what = NULL;
464         char *z, *y;
465         int r;
466
467         assert(dest);
468
469         /* Fix the timezone, if possible */
470         r = readlink_malloc("/etc/localtime", &p);
471         if (r < 0) {
472                 log_warning("/etc/localtime is not a symlink, not updating container timezone.");
473                 return 0;
474         }
475
476         z = path_startswith(p, "../usr/share/zoneinfo/");
477         if (!z)
478                 z = path_startswith(p, "/usr/share/zoneinfo/");
479         if (!z) {
480                 log_warning("/etc/localtime does not point into /usr/share/zoneinfo/, not updating container timezone.");
481                 return 0;
482         }
483
484         where = strappend(dest, "/etc/localtime");
485         if (!where)
486                 return log_oom();
487
488         r = readlink_malloc(where, &q);
489         if (r >= 0) {
490                 y = path_startswith(q, "../usr/share/zoneinfo/");
491                 if (!y)
492                         y = path_startswith(q, "/usr/share/zoneinfo/");
493
494
495                 /* Already pointing to the right place? Then do nothing .. */
496                 if (y && streq(y, z))
497                         return 0;
498         }
499
500         check = strjoin(dest, "/usr/share/zoneinfo/", z, NULL);
501         if (!check)
502                 return log_oom();
503
504         if (access(check, F_OK) < 0) {
505                 log_warning("Timezone %s does not exist in container, not updating container timezone.", z);
506                 return 0;
507         }
508
509         what = strappend("../usr/share/zoneinfo/", z);
510         if (!what)
511                 return log_oom();
512
513         unlink(where);
514         if (symlink(what, where) < 0) {
515                 log_error("Failed to correct timezone of container: %m");
516                 return 0;
517         }
518
519         return 0;
520 }
521
522 static int setup_resolv_conf(const char *dest) {
523         char _cleanup_free_ *where = NULL;
524
525         assert(dest);
526
527         if (arg_private_network)
528                 return 0;
529
530         /* Fix resolv.conf, if possible */
531         where = strappend(dest, "/etc/resolv.conf");
532         if (!where)
533                 return log_oom();
534
535         /* We don't really care for the results of this really. If it
536          * fails, it fails, but meh... */
537         copy_file("/etc/resolv.conf", where, O_TRUNC|O_NOFOLLOW);
538
539         return 0;
540 }
541
542 static int setup_boot_id(const char *dest) {
543         _cleanup_free_ char *from = NULL, *to = NULL;
544         sd_id128_t rnd;
545         char as_uuid[37];
546         int r;
547
548         assert(dest);
549
550         /* Generate a new randomized boot ID, so that each boot-up of
551          * the container gets a new one */
552
553         from = strappend(dest, "/dev/proc-sys-kernel-random-boot-id");
554         to = strappend(dest, "/proc/sys/kernel/random/boot_id");
555         if (!from || !to)
556                 return log_oom();
557
558         r = sd_id128_randomize(&rnd);
559         if (r < 0) {
560                 log_error("Failed to generate random boot id: %s", strerror(-r));
561                 return r;
562         }
563
564         snprintf(as_uuid, sizeof(as_uuid),
565                  "%02x%02x%02x%02x-%02x%02x-%02x%02x-%02x%02x-%02x%02x%02x%02x%02x%02x",
566                  SD_ID128_FORMAT_VAL(rnd));
567         char_array_0(as_uuid);
568
569         r = write_string_file(from, as_uuid);
570         if (r < 0) {
571                 log_error("Failed to write boot id: %s", strerror(-r));
572                 return r;
573         }
574
575         if (mount(from, to, "bind", MS_BIND, NULL) < 0) {
576                 log_error("Failed to bind mount boot id: %m");
577                 r = -errno;
578         } else if (mount(from, to, "bind", MS_BIND|MS_REMOUNT|MS_RDONLY, NULL))
579                 log_warning("Failed to make boot id read-only: %m");
580
581         unlink(from);
582         return r;
583 }
584
585 static int copy_devnodes(const char *dest) {
586
587         static const char devnodes[] =
588                 "null\0"
589                 "zero\0"
590                 "full\0"
591                 "random\0"
592                 "urandom\0"
593                 "tty\0";
594
595         const char *d;
596         int r = 0;
597         _cleanup_umask_ mode_t u;
598
599         assert(dest);
600
601         u = umask(0000);
602
603         NULSTR_FOREACH(d, devnodes) {
604                 struct stat st;
605                 _cleanup_free_ char *from = NULL, *to = NULL;
606
607                 asprintf(&from, "/dev/%s", d);
608                 asprintf(&to, "%s/dev/%s", dest, d);
609
610                 if (!from || !to) {
611                         log_oom();
612
613                         if (r == 0)
614                                 r = -ENOMEM;
615
616                         break;
617                 }
618
619                 if (stat(from, &st) < 0) {
620
621                         if (errno != ENOENT) {
622                                 log_error("Failed to stat %s: %m", from);
623                                 if (r == 0)
624                                         r = -errno;
625                         }
626
627                 } else if (!S_ISCHR(st.st_mode) && !S_ISBLK(st.st_mode)) {
628
629                         log_error("%s is not a char or block device, cannot copy", from);
630                         if (r == 0)
631                                 r = -EIO;
632
633                 } else if (mknod(to, st.st_mode, st.st_rdev) < 0) {
634
635                         log_error("mknod(%s) failed: %m", dest);
636                         if (r == 0)
637                                 r = -errno;
638                 }
639         }
640
641         return r;
642 }
643
644 static int setup_ptmx(const char *dest) {
645         _cleanup_free_ char *p = NULL;
646
647         p = strappend(dest, "/dev/ptmx");
648         if (!p)
649                 return log_oom();
650
651         if (symlink("pts/ptmx", p) < 0) {
652                 log_error("Failed to create /dev/ptmx symlink: %m");
653                 return -errno;
654         }
655
656         return 0;
657 }
658
659 static int setup_dev_console(const char *dest, const char *console) {
660         struct stat st;
661         _cleanup_free_ char *to = NULL;
662         int r;
663         _cleanup_umask_ mode_t u;
664
665         assert(dest);
666         assert(console);
667
668         u = umask(0000);
669
670         if (stat(console, &st) < 0) {
671                 log_error("Failed to stat %s: %m", console);
672                 return -errno;
673
674         } else if (!S_ISCHR(st.st_mode)) {
675                 log_error("/dev/console is not a char device");
676                 return -EIO;
677         }
678
679         r = chmod_and_chown(console, 0600, 0, 0);
680         if (r < 0) {
681                 log_error("Failed to correct access mode for TTY: %s", strerror(-r));
682                 return r;
683         }
684
685         if (asprintf(&to, "%s/dev/console", dest) < 0)
686                 return log_oom();
687
688         /* We need to bind mount the right tty to /dev/console since
689          * ptys can only exist on pts file systems. To have something
690          * to bind mount things on we create a device node first, that
691          * has the right major/minor (note that the major minor
692          * doesn't actually matter here, since we mount it over
693          * anyway). */
694
695         if (mknod(to, (st.st_mode & ~07777) | 0600, st.st_rdev) < 0) {
696                 log_error("mknod() for /dev/console failed: %m");
697                 return -errno;
698         }
699
700         if (mount(console, to, "bind", MS_BIND, NULL) < 0) {
701                 log_error("Bind mount for /dev/console failed: %m");
702                 return -errno;
703         }
704
705         return 0;
706 }
707
708 static int setup_kmsg(const char *dest, int kmsg_socket) {
709         _cleanup_free_ char *from = NULL, *to = NULL;
710         int r, fd, k;
711         _cleanup_umask_ mode_t u;
712         union {
713                 struct cmsghdr cmsghdr;
714                 uint8_t buf[CMSG_SPACE(sizeof(int))];
715         } control = {};
716         struct msghdr mh = {
717                 .msg_control = &control,
718                 .msg_controllen = sizeof(control),
719         };
720         struct cmsghdr *cmsg;
721
722         assert(dest);
723         assert(kmsg_socket >= 0);
724
725         u = umask(0000);
726
727         /* We create the kmsg FIFO as /dev/kmsg, but immediately
728          * delete it after bind mounting it to /proc/kmsg. While FIFOs
729          * on the reading side behave very similar to /proc/kmsg,
730          * their writing side behaves differently from /dev/kmsg in
731          * that writing blocks when nothing is reading. In order to
732          * avoid any problems with containers deadlocking due to this
733          * we simply make /dev/kmsg unavailable to the container. */
734         if (asprintf(&from, "%s/dev/kmsg", dest) < 0 ||
735             asprintf(&to, "%s/proc/kmsg", dest) < 0)
736                 return log_oom();
737
738         if (mkfifo(from, 0600) < 0) {
739                 log_error("mkfifo() for /dev/kmsg failed: %m");
740                 return -errno;
741         }
742
743         r = chmod_and_chown(from, 0600, 0, 0);
744         if (r < 0) {
745                 log_error("Failed to correct access mode for /dev/kmsg: %s", strerror(-r));
746                 return r;
747         }
748
749         if (mount(from, to, "bind", MS_BIND, NULL) < 0) {
750                 log_error("Bind mount for /proc/kmsg failed: %m");
751                 return -errno;
752         }
753
754         fd = open(from, O_RDWR|O_NDELAY|O_CLOEXEC);
755         if (fd < 0) {
756                 log_error("Failed to open fifo: %m");
757                 return -errno;
758         }
759
760         cmsg = CMSG_FIRSTHDR(&mh);
761         cmsg->cmsg_level = SOL_SOCKET;
762         cmsg->cmsg_type = SCM_RIGHTS;
763         cmsg->cmsg_len = CMSG_LEN(sizeof(int));
764         memcpy(CMSG_DATA(cmsg), &fd, sizeof(int));
765
766         mh.msg_controllen = cmsg->cmsg_len;
767
768         /* Store away the fd in the socket, so that it stays open as
769          * long as we run the child */
770         k = sendmsg(kmsg_socket, &mh, MSG_DONTWAIT|MSG_NOSIGNAL);
771         close_nointr_nofail(fd);
772
773         if (k < 0) {
774                 log_error("Failed to send FIFO fd: %m");
775                 return -errno;
776         }
777
778         /* And now make the FIFO unavailable as /dev/kmsg... */
779         unlink(from);
780         return 0;
781 }
782
783 static int setup_hostname(void) {
784
785         if (sethostname(arg_machine, strlen(arg_machine)) < 0)
786                 return -errno;
787
788         return 0;
789 }
790
791 static int setup_journal(const char *directory) {
792         sd_id128_t machine_id;
793         _cleanup_free_ char *p = NULL, *b = NULL, *q = NULL, *d = NULL;
794         char *id;
795         int r;
796
797         if (arg_link_journal == LINK_NO)
798                 return 0;
799
800         p = strappend(directory, "/etc/machine-id");
801         if (!p)
802                 return log_oom();
803
804         r = read_one_line_file(p, &b);
805         if (r == -ENOENT && arg_link_journal == LINK_AUTO)
806                 return 0;
807         else if (r < 0) {
808                 log_error("Failed to read machine ID from %s: %s", p, strerror(-r));
809                 return r;
810         }
811
812         id = strstrip(b);
813         if (isempty(id) && arg_link_journal == LINK_AUTO)
814                 return 0;
815
816         /* Verify validity */
817         r = sd_id128_from_string(id, &machine_id);
818         if (r < 0) {
819                 log_error("Failed to parse machine ID from %s: %s", p, strerror(-r));
820                 return r;
821         }
822
823         free(p);
824         p = strappend("/var/log/journal/", id);
825         q = strjoin(directory, "/var/log/journal/", id, NULL);
826         if (!p || !q)
827                 return log_oom();
828
829         if (path_is_mount_point(p, false) > 0) {
830                 if (arg_link_journal != LINK_AUTO) {
831                         log_error("%s: already a mount point, refusing to use for journal", p);
832                         return -EEXIST;
833                 }
834
835                 return 0;
836         }
837
838         if (path_is_mount_point(q, false) > 0) {
839                 if (arg_link_journal != LINK_AUTO) {
840                         log_error("%s: already a mount point, refusing to use for journal", q);
841                         return -EEXIST;
842                 }
843
844                 return 0;
845         }
846
847         r = readlink_and_make_absolute(p, &d);
848         if (r >= 0) {
849                 if ((arg_link_journal == LINK_GUEST ||
850                      arg_link_journal == LINK_AUTO) &&
851                     path_equal(d, q)) {
852
853                         r = mkdir_p(q, 0755);
854                         if (r < 0)
855                                 log_warning("failed to create directory %s: %m", q);
856                         return 0;
857                 }
858
859                 if (unlink(p) < 0) {
860                         log_error("Failed to remove symlink %s: %m", p);
861                         return -errno;
862                 }
863         } else if (r == -EINVAL) {
864
865                 if (arg_link_journal == LINK_GUEST &&
866                     rmdir(p) < 0) {
867
868                         if (errno == ENOTDIR) {
869                                 log_error("%s already exists and is neither a symlink nor a directory", p);
870                                 return r;
871                         } else {
872                                 log_error("Failed to remove %s: %m", p);
873                                 return -errno;
874                         }
875                 }
876         } else if (r != -ENOENT) {
877                 log_error("readlink(%s) failed: %m", p);
878                 return r;
879         }
880
881         if (arg_link_journal == LINK_GUEST) {
882
883                 if (symlink(q, p) < 0) {
884                         log_error("Failed to symlink %s to %s: %m", q, p);
885                         return -errno;
886                 }
887
888                 r = mkdir_p(q, 0755);
889                 if (r < 0)
890                         log_warning("failed to create directory %s: %m", q);
891                 return 0;
892         }
893
894         if (arg_link_journal == LINK_HOST) {
895                 r = mkdir_p(p, 0755);
896                 if (r < 0) {
897                         log_error("Failed to create %s: %m", p);
898                         return r;
899                 }
900
901         } else if (access(p, F_OK) < 0)
902                 return 0;
903
904         if (dir_is_empty(q) == 0) {
905                 log_error("%s not empty.", q);
906                 return -ENOTEMPTY;
907         }
908
909         r = mkdir_p(q, 0755);
910         if (r < 0) {
911                 log_error("Failed to create %s: %m", q);
912                 return r;
913         }
914
915         if (mount(p, q, "bind", MS_BIND, NULL) < 0) {
916                 log_error("Failed to bind mount journal from host into guest: %m");
917                 return -errno;
918         }
919
920         return 0;
921 }
922
923 static int drop_capabilities(void) {
924         return capability_bounding_set_drop(~arg_retain, false);
925 }
926
927 static int register_machine(void) {
928         _cleanup_bus_error_free_ sd_bus_error error = SD_BUS_ERROR_NULL;
929         _cleanup_bus_unref_ sd_bus *bus = NULL;
930         int r;
931
932         r = sd_bus_open_system(&bus);
933         if (r < 0) {
934                 log_error("Failed to open system bus: %s", strerror(-r));
935                 return r;
936         }
937
938         r = sd_bus_call_method(
939                         bus,
940                         "org.freedesktop.machine1",
941                         "/org/freedesktop/machine1",
942                         "org.freedesktop.machine1.Manager",
943                         "CreateMachine",
944                         &error,
945                         NULL,
946                         "sayssusa(sv)",
947                         arg_machine,
948                         SD_BUS_MESSAGE_APPEND_ID128(arg_uuid),
949                         "nspawn",
950                         "container",
951                         (uint32_t) 0,
952                         strempty(arg_directory),
953                         !isempty(arg_slice), "Slice", "s", arg_slice);
954         if (r < 0) {
955                 log_error("Failed to register machine: %s", error.message ? error.message : strerror(-r));
956                 return r;
957         }
958
959         return 0;
960 }
961
/* Probes for the kernel audit subsystem by trying to open a
 * NETLINK_AUDIT socket; the socket itself is closed again right away. */
static bool audit_enabled(void) {
        int fd = socket(AF_NETLINK, SOCK_RAW, NETLINK_AUDIT);

        if (fd < 0)
                return false;

        close_nointr_nofail(fd);
        return true;
}
972
973 int main(int argc, char *argv[]) {
974         pid_t pid = 0;
975         int r = EXIT_FAILURE, k;
976         _cleanup_close_ int master = -1;
977         int n_fd_passed;
978         const char *console = NULL;
979         struct termios saved_attr, raw_attr;
980         sigset_t mask;
981         bool saved_attr_valid = false;
982         struct winsize ws;
983         int kmsg_socket_pair[2] = { -1, -1 };
984         _cleanup_fdset_free_ FDSet *fds = NULL;
985
986         log_parse_environment();
987         log_open();
988
989         k = parse_argv(argc, argv);
990         if (k < 0)
991                 goto finish;
992         else if (k == 0) {
993                 r = EXIT_SUCCESS;
994                 goto finish;
995         }
996
997         if (arg_directory) {
998                 char *p;
999
1000                 p = path_make_absolute_cwd(arg_directory);
1001                 free(arg_directory);
1002                 arg_directory = p;
1003         } else
1004                 arg_directory = get_current_dir_name();
1005
1006         if (!arg_directory) {
1007                 log_error("Failed to determine path, please use -D.");
1008                 goto finish;
1009         }
1010
1011         path_kill_slashes(arg_directory);
1012
1013         if (!arg_machine) {
1014                 arg_machine = strdup(path_get_file_name(arg_directory));
1015                 if (!arg_machine) {
1016                         log_oom();
1017                         goto finish;
1018                 }
1019
1020                 hostname_cleanup(arg_machine, false);
1021                 if (isempty(arg_machine)) {
1022                         log_error("Failed to determine machine name automatically, please use -M.");
1023                         goto finish;
1024                 }
1025         }
1026
1027         if (geteuid() != 0) {
1028                 log_error("Need to be root.");
1029                 goto finish;
1030         }
1031
1032         if (sd_booted() <= 0) {
1033                 log_error("Not running on a systemd system.");
1034                 goto finish;
1035         }
1036
1037         if (arg_boot && audit_enabled()) {
1038                 log_warning("The kernel auditing subsystem is known to be incompatible with containers.\n"
1039                             "Please make sure to turn off auditing with 'audit=0' on the kernel command\n"
1040                             "line before using systemd-nspawn. Sleeping for 5s...\n");
1041                 sleep(5);
1042         }
1043
1044         if (path_equal(arg_directory, "/")) {
1045                 log_error("Spawning container on root directory not supported.");
1046                 goto finish;
1047         }
1048
1049         if (path_is_os_tree(arg_directory) <= 0) {
1050                 log_error("Directory %s doesn't look like an OS root directory (/etc/os-release is missing). Refusing.", arg_directory);
1051                 goto finish;
1052         }
1053
1054         log_close();
1055         n_fd_passed = sd_listen_fds(false);
1056         if (n_fd_passed > 0) {
1057                 k = fdset_new_listen_fds(&fds, false);
1058                 if (k < 0) {
1059                         log_error("Failed to collect file descriptors: %s", strerror(-k));
1060                         goto finish;
1061                 }
1062         }
1063         fdset_close_others(fds);
1064         log_open();
1065
1066         master = posix_openpt(O_RDWR|O_NOCTTY|O_CLOEXEC|O_NDELAY);
1067         if (master < 0) {
1068                 log_error("Failed to acquire pseudo tty: %m");
1069                 goto finish;
1070         }
1071
1072         console = ptsname(master);
1073         if (!console) {
1074                 log_error("Failed to determine tty name: %m");
1075                 goto finish;
1076         }
1077
1078         log_info("Spawning namespace container on %s (console is %s).", arg_directory, console);
1079
1080         if (ioctl(STDIN_FILENO, TIOCGWINSZ, &ws) >= 0)
1081                 ioctl(master, TIOCSWINSZ, &ws);
1082
1083         if (unlockpt(master) < 0) {
1084                 log_error("Failed to unlock tty: %m");
1085                 goto finish;
1086         }
1087
1088         if (tcgetattr(STDIN_FILENO, &saved_attr) >= 0) {
1089                 saved_attr_valid = true;
1090
1091                 raw_attr = saved_attr;
1092                 cfmakeraw(&raw_attr);
1093                 raw_attr.c_lflag &= ~ECHO;
1094         }
1095
1096         if (socketpair(AF_UNIX, SOCK_DGRAM|SOCK_NONBLOCK|SOCK_CLOEXEC, 0, kmsg_socket_pair) < 0) {
1097                 log_error("Failed to create kmsg socket pair.");
1098                 goto finish;
1099         }
1100
1101         sd_notify(0, "READY=1");
1102
1103         assert_se(sigemptyset(&mask) == 0);
1104         sigset_add_many(&mask, SIGCHLD, SIGWINCH, SIGTERM, SIGINT, -1);
1105         assert_se(sigprocmask(SIG_BLOCK, &mask, NULL) == 0);
1106
1107         for (;;) {
1108                 siginfo_t status;
1109                 int pipefd[2], pipefd2[2];
1110
1111                 if (pipe2(pipefd, O_NONBLOCK|O_CLOEXEC) < 0) {
1112                         log_error("pipe2(): %m");
1113                         goto finish;
1114                 }
1115
1116                 if (pipe2(pipefd2, O_NONBLOCK|O_CLOEXEC) < 0) {
1117                         log_error("pipe2(): %m");
1118                         close_pipe(pipefd);
1119                         goto finish;
1120                 }
1121
1122                 pid = syscall(__NR_clone, SIGCHLD|CLONE_NEWIPC|CLONE_NEWNS|CLONE_NEWPID|CLONE_NEWUTS|(arg_private_network ? CLONE_NEWNET : 0), NULL);
1123                 if (pid < 0) {
1124                         if (errno == EINVAL)
1125                                 log_error("clone() failed, do you have namespace support enabled in your kernel? (You need UTS, IPC, PID and NET namespacing built in): %m");
1126                         else
1127                                 log_error("clone() failed: %m");
1128
1129                         goto finish;
1130                 }
1131
1132                 if (pid == 0) {
1133                         /* child */
1134                         const char *home = NULL;
1135                         uid_t uid = (uid_t) -1;
1136                         gid_t gid = (gid_t) -1;
1137                         unsigned n_env = 2;
1138                         const char *envp[] = {
1139                                 "PATH=/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin",
1140                                 "container=systemd-nspawn", /* LXC sets container=lxc, so follow the scheme here */
1141                                 NULL, /* TERM */
1142                                 NULL, /* HOME */
1143                                 NULL, /* USER */
1144                                 NULL, /* LOGNAME */
1145                                 NULL, /* container_uuid */
1146                                 NULL, /* LISTEN_FDS */
1147                                 NULL, /* LISTEN_PID */
1148                                 NULL
1149                         };
1150
1151                         envp[n_env] = strv_find_prefix(environ, "TERM=");
1152                         if (envp[n_env])
1153                                 n_env ++;
1154
1155                         /* Wait for the parent process to log our PID */
1156                         close_nointr_nofail(pipefd[1]);
1157                         fd_wait_for_event(pipefd[0], POLLHUP, -1);
1158                         close_nointr_nofail(pipefd[0]);
1159
1160                         close_nointr_nofail(master);
1161                         master = -1;
1162
1163                         if (saved_attr_valid) {
1164                                 if (tcsetattr(STDIN_FILENO, TCSANOW, &raw_attr) < 0) {
1165                                         log_error("Failed to set terminal attributes: %m");
1166                                         goto child_fail;
1167                                 }
1168                         }
1169
1170                         close_nointr(STDIN_FILENO);
1171                         close_nointr(STDOUT_FILENO);
1172                         close_nointr(STDERR_FILENO);
1173
1174                         close_nointr_nofail(kmsg_socket_pair[0]);
1175                         kmsg_socket_pair[0] = -1;
1176
1177                         reset_all_signal_handlers();
1178
1179                         assert_se(sigemptyset(&mask) == 0);
1180                         assert_se(sigprocmask(SIG_SETMASK, &mask, NULL) == 0);
1181
1182                         k = open_terminal(console, O_RDWR);
1183                         if (k != STDIN_FILENO) {
1184                                 if (k >= 0) {
1185                                         close_nointr_nofail(k);
1186                                         k = -EINVAL;
1187                                 }
1188
1189                                 log_error("Failed to open console: %s", strerror(-k));
1190                                 goto child_fail;
1191                         }
1192
1193                         if (dup2(STDIN_FILENO, STDOUT_FILENO) != STDOUT_FILENO ||
1194                             dup2(STDIN_FILENO, STDERR_FILENO) != STDERR_FILENO) {
1195                                 log_error("Failed to duplicate console: %m");
1196                                 goto child_fail;
1197                         }
1198
1199                         if (setsid() < 0) {
1200                                 log_error("setsid() failed: %m");
1201                                 goto child_fail;
1202                         }
1203
1204                         if (prctl(PR_SET_PDEATHSIG, SIGKILL) < 0) {
1205                                 log_error("PR_SET_PDEATHSIG failed: %m");
1206                                 goto child_fail;
1207                         }
1208
1209                         close_pipe(pipefd2);
1210
1211                         r = register_machine();
1212                         if (r < 0)
1213                                 goto finish;
1214
1215                         /* Mark everything as slave, so that we still
1216                          * receive mounts from the real root, but don't
1217                          * propagate mounts to the real root. */
1218                         if (mount(NULL, "/", NULL, MS_SLAVE|MS_REC, NULL) < 0) {
1219                                 log_error("MS_SLAVE|MS_REC failed: %m");
1220                                 goto child_fail;
1221                         }
1222
1223                         /* Turn directory into bind mount */
1224                         if (mount(arg_directory, arg_directory, "bind", MS_BIND|MS_REC, NULL) < 0) {
1225                                 log_error("Failed to make bind mount.");
1226                                 goto child_fail;
1227                         }
1228
1229                         if (arg_read_only)
1230                                 if (mount(arg_directory, arg_directory, "bind", MS_BIND|MS_REMOUNT|MS_RDONLY|MS_REC, NULL) < 0) {
1231                                         log_error("Failed to make read-only.");
1232                                         goto child_fail;
1233                                 }
1234
1235                         if (mount_all(arg_directory) < 0)
1236                                 goto child_fail;
1237
1238                         if (copy_devnodes(arg_directory) < 0)
1239                                 goto child_fail;
1240
1241                         if (setup_ptmx(arg_directory) < 0)
1242                                 goto child_fail;
1243
1244                         dev_setup(arg_directory);
1245
1246                         if (setup_dev_console(arg_directory, console) < 0)
1247                                 goto child_fail;
1248
1249                         if (setup_kmsg(arg_directory, kmsg_socket_pair[1]) < 0)
1250                                 goto child_fail;
1251
1252                         close_nointr_nofail(kmsg_socket_pair[1]);
1253                         kmsg_socket_pair[1] = -1;
1254
1255                         if (setup_boot_id(arg_directory) < 0)
1256                                 goto child_fail;
1257
1258                         if (setup_timezone(arg_directory) < 0)
1259                                 goto child_fail;
1260
1261                         if (setup_resolv_conf(arg_directory) < 0)
1262                                 goto child_fail;
1263
1264                         if (setup_journal(arg_directory) < 0)
1265                                 goto child_fail;
1266
1267                         if (mount_binds(arg_directory, arg_bind, 0) < 0)
1268                                 goto child_fail;
1269
1270                         if (mount_binds(arg_directory, arg_bind_ro, MS_RDONLY) < 0)
1271                                 goto child_fail;
1272
1273                         if (chdir(arg_directory) < 0) {
1274                                 log_error("chdir(%s) failed: %m", arg_directory);
1275                                 goto child_fail;
1276                         }
1277
1278                         if (mount(arg_directory, "/", NULL, MS_MOVE, NULL) < 0) {
1279                                 log_error("mount(MS_MOVE) failed: %m");
1280                                 goto child_fail;
1281                         }
1282
1283                         if (chroot(".") < 0) {
1284                                 log_error("chroot() failed: %m");
1285                                 goto child_fail;
1286                         }
1287
1288                         if (chdir("/") < 0) {
1289                                 log_error("chdir() failed: %m");
1290                                 goto child_fail;
1291                         }
1292
1293                         umask(0022);
1294
1295                         loopback_setup();
1296
1297                         if (drop_capabilities() < 0) {
1298                                 log_error("drop_capabilities() failed: %m");
1299                                 goto child_fail;
1300                         }
1301
1302                         if (arg_user) {
1303
1304                                 /* Note that this resolves user names
1305                                  * inside the container, and hence
1306                                  * accesses the NSS modules from the
1307                                  * container and not the host. This is
1308                                  * a bit weird... */
1309
1310                                 if (get_user_creds((const char**)&arg_user, &uid, &gid, &home, NULL) < 0) {
1311                                         log_error("get_user_creds() failed: %m");
1312                                         goto child_fail;
1313                                 }
1314
1315                                 if (mkdir_parents_label(home, 0775) < 0) {
1316                                         log_error("mkdir_parents_label() failed: %m");
1317                                         goto child_fail;
1318                                 }
1319
1320                                 if (mkdir_safe_label(home, 0775, uid, gid) < 0) {
1321                                         log_error("mkdir_safe_label() failed: %m");
1322                                         goto child_fail;
1323                                 }
1324
1325                                 if (initgroups((const char*)arg_user, gid) < 0) {
1326                                         log_error("initgroups() failed: %m");
1327                                         goto child_fail;
1328                                 }
1329
1330                                 if (setresgid(gid, gid, gid) < 0) {
1331                                         log_error("setregid() failed: %m");
1332                                         goto child_fail;
1333                                 }
1334
1335                                 if (setresuid(uid, uid, uid) < 0) {
1336                                         log_error("setreuid() failed: %m");
1337                                         goto child_fail;
1338                                 }
1339                         } else {
1340                                 /* Reset everything fully to 0, just in case */
1341
1342                                 if (setgroups(0, NULL) < 0) {
1343                                         log_error("setgroups() failed: %m");
1344                                         goto child_fail;
1345                                 }
1346
1347                                 if (setresgid(0, 0, 0) < 0) {
1348                                         log_error("setregid() failed: %m");
1349                                         goto child_fail;
1350                                 }
1351
1352                                 if (setresuid(0, 0, 0) < 0) {
1353                                         log_error("setreuid() failed: %m");
1354                                         goto child_fail;
1355                                 }
1356                         }
1357
1358                         if ((asprintf((char**)(envp + n_env++), "HOME=%s", home ? home: "/root") < 0) ||
1359                             (asprintf((char**)(envp + n_env++), "USER=%s", arg_user ? arg_user : "root") < 0) ||
1360                             (asprintf((char**)(envp + n_env++), "LOGNAME=%s", arg_user ? arg_user : "root") < 0)) {
1361                                 log_oom();
1362                                 goto child_fail;
1363                         }
1364
1365                         if (!sd_id128_equal(arg_uuid, SD_ID128_NULL)) {
1366                                 if (asprintf((char**)(envp + n_env++), "container_uuid=" SD_ID128_FORMAT_STR, SD_ID128_FORMAT_VAL(arg_uuid)) < 0) {
1367                                         log_oom();
1368                                         goto child_fail;
1369                                 }
1370                         }
1371
1372                         if (fdset_size(fds) > 0) {
1373                                 k = fdset_cloexec(fds, false);
1374                                 if (k < 0) {
1375                                         log_error("Failed to unset O_CLOEXEC for file descriptors.");
1376                                         goto child_fail;
1377                                 }
1378
1379                                 if ((asprintf((char **)(envp + n_env++), "LISTEN_FDS=%u", n_fd_passed) < 0) ||
1380                                     (asprintf((char **)(envp + n_env++), "LISTEN_PID=1") < 0)) {
1381                                         log_oom();
1382                                         goto child_fail;
1383                                 }
1384                         }
1385
1386                         setup_hostname();
1387
1388                         if (arg_boot) {
1389                                 char **a;
1390                                 size_t l;
1391
1392                                 /* Automatically search for the init system */
1393
1394                                 l = 1 + argc - optind;
1395                                 a = newa(char*, l + 1);
1396                                 memcpy(a + 1, argv + optind, l * sizeof(char*));
1397
1398                                 a[0] = (char*) "/usr/lib/systemd/systemd";
1399                                 execve(a[0], a, (char**) envp);
1400
1401                                 a[0] = (char*) "/lib/systemd/systemd";
1402                                 execve(a[0], a, (char**) envp);
1403
1404                                 a[0] = (char*) "/sbin/init";
1405                                 execve(a[0], a, (char**) envp);
1406                         } else if (argc > optind)
1407                                 execvpe(argv[optind], argv + optind, (char**) envp);
1408                         else {
1409                                 chdir(home ? home : "/root");
1410                                 execle("/bin/bash", "-bash", NULL, (char**) envp);
1411                         }
1412
1413                         log_error("execv() failed: %m");
1414
1415                 child_fail:
1416                         _exit(EXIT_FAILURE);
1417                 }
1418
1419                 log_info("Init process in the container running as PID %lu.", (unsigned long) pid);
1420                 close_nointr_nofail(pipefd[0]);
1421                 close_nointr_nofail(pipefd[1]);
1422
1423                 /* Wait for the child process to establish cgroup hierarchy */
1424                 close_nointr_nofail(pipefd2[1]);
1425                 fd_wait_for_event(pipefd2[0], POLLHUP, -1);
1426                 close_nointr_nofail(pipefd2[0]);
1427
1428                 fdset_free(fds);
1429                 fds = NULL;
1430
1431                 if (process_pty(master, &mask, arg_boot ? pid : 0, SIGRTMIN+3) < 0)
1432                         goto finish;
1433
1434                 if (saved_attr_valid)
1435                         tcsetattr(STDIN_FILENO, TCSANOW, &saved_attr);
1436
1437                 k = wait_for_terminate(pid, &status);
1438                 if (k < 0) {
1439                         r = EXIT_FAILURE;
1440                         break;
1441                 }
1442
1443                 if (status.si_code == CLD_EXITED) {
1444                         r = status.si_status;
1445                         if (status.si_status != 0) {
1446                                 log_error("Container failed with error code %i.", status.si_status);
1447                                 break;
1448                         }
1449
1450                         log_debug("Container exited successfully.");
1451                         break;
1452                 } else if (status.si_code == CLD_KILLED &&
1453                            status.si_status == SIGINT) {
1454                         log_info("Container has been shut down.");
1455                         r = 0;
1456                         break;
1457                 } else if (status.si_code == CLD_KILLED &&
1458                            status.si_status == SIGHUP) {
1459                         log_info("Container is being rebooted.");
1460                         continue;
1461                 } else if (status.si_code == CLD_KILLED ||
1462                            status.si_code == CLD_DUMPED) {
1463
1464                         log_error("Container terminated by signal %s.", signal_to_string(status.si_status));
1465                         r = EXIT_FAILURE;
1466                         break;
1467                 } else {
1468                         log_error("Container failed due to unknown reason.");
1469                         r = EXIT_FAILURE;
1470                         break;
1471                 }
1472         }
1473
1474 finish:
1475         if (saved_attr_valid)
1476                 tcsetattr(STDIN_FILENO, TCSANOW, &saved_attr);
1477
1478         close_pipe(kmsg_socket_pair);
1479
1480         if (pid > 0)
1481                 kill(pid, SIGKILL);
1482
1483         free(arg_directory);
1484         free(arg_machine);
1485
1486         return r;
1487 }