chiark / gitweb /
bus: connect directly via kdbus in sd_bus_open_system_container()
[elogind.git] / src / nspawn / nspawn.c
1 /*-*- Mode: C; c-basic-offset: 8; indent-tabs-mode: nil -*-*/
2
3 /***
4   This file is part of systemd.
5
6   Copyright 2010 Lennart Poettering
7
8   systemd is free software; you can redistribute it and/or modify it
9   under the terms of the GNU Lesser General Public License as published by
10   the Free Software Foundation; either version 2.1 of the License, or
11   (at your option) any later version.
12
13   systemd is distributed in the hope that it will be useful, but
14   WITHOUT ANY WARRANTY; without even the implied warranty of
15   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
16   Lesser General Public License for more details.
17
18   You should have received a copy of the GNU Lesser General Public License
19   along with systemd; If not, see <http://www.gnu.org/licenses/>.
20 ***/
21
22 #include <signal.h>
23 #include <sched.h>
24 #include <unistd.h>
25 #include <sys/types.h>
26 #include <sys/syscall.h>
27 #include <sys/mount.h>
28 #include <sys/wait.h>
29 #include <stdlib.h>
30 #include <string.h>
31 #include <stdio.h>
32 #include <errno.h>
33 #include <sys/prctl.h>
34 #include <sys/capability.h>
35 #include <getopt.h>
36 #include <termios.h>
37 #include <sys/signalfd.h>
38 #include <grp.h>
39 #include <linux/fs.h>
40 #include <sys/un.h>
41 #include <sys/socket.h>
42 #include <linux/netlink.h>
43
44 #include "sd-daemon.h"
45 #include "sd-bus.h"
46 #include "sd-id128.h"
47 #include "log.h"
48 #include "util.h"
49 #include "mkdir.h"
50 #include "macro.h"
51 #include "audit.h"
52 #include "missing.h"
53 #include "cgroup-util.h"
54 #include "strv.h"
55 #include "path-util.h"
56 #include "loopback-setup.h"
57 #include "dev-setup.h"
58 #include "fdset.h"
59 #include "build.h"
60 #include "fileio.h"
61 #include "bus-util.h"
62 #include "bus-error.h"
63 #include "ptyfwd.h"
64 #include "bus-kernel.h"
65
/* Group id handed to the container's devpts mount via its "gid=" option
 * (see the mount table in mount_all()). Only defined here when the build
 * did not already provide a value. */
#if !defined(TTY_GID)
#  define TTY_GID 5
#endif
69
/* Journal linking modes; each value corresponds to one keyword accepted by
 * --link-journal= ("no", "auto", "host", "guest"). */
typedef enum LinkJournal {
        LINK_NO,        /* --link-journal=no */
        LINK_AUTO,      /* --link-journal=auto, the initial value of arg_link_journal */
        LINK_HOST,      /* --link-journal=host */
        LINK_GUEST      /* --link-journal=guest, also selected by -j */
} LinkJournal;
76
77 static char *arg_directory = NULL;
78 static char *arg_user = NULL;
79 static sd_id128_t arg_uuid = {};
80 static char *arg_machine = NULL;
81 static const char *arg_slice = NULL;
82 static bool arg_private_network = false;
83 static bool arg_read_only = false;
84 static bool arg_boot = false;
85 static LinkJournal arg_link_journal = LINK_AUTO;
86 static uint64_t arg_retain =
87         (1ULL << CAP_CHOWN) |
88         (1ULL << CAP_DAC_OVERRIDE) |
89         (1ULL << CAP_DAC_READ_SEARCH) |
90         (1ULL << CAP_FOWNER) |
91         (1ULL << CAP_FSETID) |
92         (1ULL << CAP_IPC_OWNER) |
93         (1ULL << CAP_KILL) |
94         (1ULL << CAP_LEASE) |
95         (1ULL << CAP_LINUX_IMMUTABLE) |
96         (1ULL << CAP_NET_BIND_SERVICE) |
97         (1ULL << CAP_NET_BROADCAST) |
98         (1ULL << CAP_NET_RAW) |
99         (1ULL << CAP_SETGID) |
100         (1ULL << CAP_SETFCAP) |
101         (1ULL << CAP_SETPCAP) |
102         (1ULL << CAP_SETUID) |
103         (1ULL << CAP_SYS_ADMIN) |
104         (1ULL << CAP_SYS_CHROOT) |
105         (1ULL << CAP_SYS_NICE) |
106         (1ULL << CAP_SYS_PTRACE) |
107         (1ULL << CAP_SYS_TTY_CONFIG) |
108         (1ULL << CAP_SYS_RESOURCE) |
109         (1ULL << CAP_SYS_BOOT) |
110         (1ULL << CAP_AUDIT_WRITE) |
111         (1ULL << CAP_AUDIT_CONTROL);
112 static char **arg_bind = NULL;
113 static char **arg_bind_ro = NULL;
114
/* Prints the command line usage summary to stdout. Always returns 0. */
static int help(void) {

        /* Note on -j: parse_argv() maps it to LINK_GUEST, hence the text
         * below advertises it as --link-journal=guest. */
        printf("%s [OPTIONS...] [PATH] [ARGUMENTS...]\n\n"
               "Spawn a minimal namespace container for debugging, testing and building.\n\n"
               "  -h --help                Show this help\n"
               "     --version             Print version string\n"
               "  -D --directory=NAME      Root directory for the container\n"
               "  -b --boot                Boot up full system (i.e. invoke init)\n"
               "  -u --user=USER           Run the command under specified user or uid\n"
               "     --uuid=UUID           Set a specific machine UUID for the container\n"
               "  -M --machine=NAME        Set the machine name for the container\n"
               "  -S --slice=SLICE         Place the container in the specified slice\n"
               "     --private-network     Disable network in container\n"
               "     --read-only           Mount the root directory read-only\n"
               "     --capability=CAP      In addition to the default, retain specified\n"
               "                           capability\n"
               "     --drop-capability=CAP Drop the specified capability from the default set\n"
               "     --link-journal=MODE   Link up guest journal, one of no, auto, guest, host\n"
               "  -j                       Equivalent to --link-journal=guest\n"
               "     --bind=PATH[:PATH]    Bind mount a file or directory from the host into\n"
               "                           the container\n"
               "     --bind-ro=PATH[:PATH] Similar, but creates a read-only bind mount\n",
               program_invocation_short_name);

        return 0;
}
141
142 static int parse_argv(int argc, char *argv[]) {
143
144         enum {
145                 ARG_VERSION = 0x100,
146                 ARG_PRIVATE_NETWORK,
147                 ARG_UUID,
148                 ARG_READ_ONLY,
149                 ARG_CAPABILITY,
150                 ARG_DROP_CAPABILITY,
151                 ARG_LINK_JOURNAL,
152                 ARG_BIND,
153                 ARG_BIND_RO
154         };
155
156         static const struct option options[] = {
157                 { "help",            no_argument,       NULL, 'h'                 },
158                 { "version",         no_argument,       NULL, ARG_VERSION         },
159                 { "directory",       required_argument, NULL, 'D'                 },
160                 { "user",            required_argument, NULL, 'u'                 },
161                 { "private-network", no_argument,       NULL, ARG_PRIVATE_NETWORK },
162                 { "boot",            no_argument,       NULL, 'b'                 },
163                 { "uuid",            required_argument, NULL, ARG_UUID            },
164                 { "read-only",       no_argument,       NULL, ARG_READ_ONLY       },
165                 { "capability",      required_argument, NULL, ARG_CAPABILITY      },
166                 { "drop-capability", required_argument, NULL, ARG_DROP_CAPABILITY },
167                 { "link-journal",    required_argument, NULL, ARG_LINK_JOURNAL    },
168                 { "bind",            required_argument, NULL, ARG_BIND            },
169                 { "bind-ro",         required_argument, NULL, ARG_BIND_RO         },
170                 { "machine",         required_argument, NULL, 'M'                 },
171                 { "slice",           required_argument, NULL, 'S'                 },
172                 {}
173         };
174
175         int c, r;
176
177         assert(argc >= 0);
178         assert(argv);
179
180         while ((c = getopt_long(argc, argv, "+hD:u:bM:jS:", options, NULL)) >= 0) {
181
182                 switch (c) {
183
184                 case 'h':
185                         return help();
186
187                 case ARG_VERSION:
188                         puts(PACKAGE_STRING);
189                         puts(SYSTEMD_FEATURES);
190                         return 0;
191
192                 case 'D':
193                         free(arg_directory);
194                         arg_directory = canonicalize_file_name(optarg);
195                         if (!arg_directory) {
196                                 log_error("Invalid root directory: %m");
197                                 return -ENOMEM;
198                         }
199
200                         break;
201
202                 case 'u':
203                         free(arg_user);
204                         arg_user = strdup(optarg);
205                         if (!arg_user)
206                                 return log_oom();
207
208                         break;
209
210                 case ARG_PRIVATE_NETWORK:
211                         arg_private_network = true;
212                         break;
213
214                 case 'b':
215                         arg_boot = true;
216                         break;
217
218                 case ARG_UUID:
219                         r = sd_id128_from_string(optarg, &arg_uuid);
220                         if (r < 0) {
221                                 log_error("Invalid UUID: %s", optarg);
222                                 return r;
223                         }
224                         break;
225
226                 case 'S':
227                         arg_slice = strdup(optarg);
228                         if (!arg_slice)
229                                 return log_oom();
230
231                         break;
232
233                 case 'M':
234                         if (!hostname_is_valid(optarg)) {
235                                 log_error("Invalid machine name: %s", optarg);
236                                 return -EINVAL;
237                         }
238
239                         free(arg_machine);
240                         arg_machine = strdup(optarg);
241                         if (!arg_machine)
242                                 return log_oom();
243
244                         break;
245
246                 case ARG_READ_ONLY:
247                         arg_read_only = true;
248                         break;
249
250                 case ARG_CAPABILITY:
251                 case ARG_DROP_CAPABILITY: {
252                         char *state, *word;
253                         size_t length;
254
255                         FOREACH_WORD_SEPARATOR(word, length, optarg, ",", state) {
256                                 cap_value_t cap;
257                                 char *t;
258
259                                 t = strndup(word, length);
260                                 if (!t)
261                                         return log_oom();
262
263                                 if (cap_from_name(t, &cap) < 0) {
264                                         log_error("Failed to parse capability %s.", t);
265                                         free(t);
266                                         return -EINVAL;
267                                 }
268
269                                 free(t);
270
271                                 if (c == ARG_CAPABILITY)
272                                         arg_retain |= 1ULL << (uint64_t) cap;
273                                 else
274                                         arg_retain &= ~(1ULL << (uint64_t) cap);
275                         }
276
277                         break;
278                 }
279
280                 case 'j':
281                         arg_link_journal = LINK_GUEST;
282                         break;
283
284                 case ARG_LINK_JOURNAL:
285                         if (streq(optarg, "auto"))
286                                 arg_link_journal = LINK_AUTO;
287                         else if (streq(optarg, "no"))
288                                 arg_link_journal = LINK_NO;
289                         else if (streq(optarg, "guest"))
290                                 arg_link_journal = LINK_GUEST;
291                         else if (streq(optarg, "host"))
292                                 arg_link_journal = LINK_HOST;
293                         else {
294                                 log_error("Failed to parse link journal mode %s", optarg);
295                                 return -EINVAL;
296                         }
297
298                         break;
299
300                 case ARG_BIND:
301                 case ARG_BIND_RO: {
302                         _cleanup_free_ char *a = NULL, *b = NULL;
303                         char *e;
304                         char ***x;
305
306                         x = c == ARG_BIND ? &arg_bind : &arg_bind_ro;
307
308                         e = strchr(optarg, ':');
309                         if (e) {
310                                 a = strndup(optarg, e - optarg);
311                                 b = strdup(e + 1);
312                         } else {
313                                 a = strdup(optarg);
314                                 b = strdup(optarg);
315                         }
316
317                         if (!a || !b)
318                                 return log_oom();
319
320                         if (!path_is_absolute(a) || !path_is_absolute(b)) {
321                                 log_error("Invalid bind mount specification: %s", optarg);
322                                 return -EINVAL;
323                         }
324
325                         r = strv_extend(x, a);
326                         if (r < 0)
327                                 return log_oom();
328
329                         r = strv_extend(x, b);
330                         if (r < 0)
331                                 return log_oom();
332
333                         break;
334                 }
335
336                 case '?':
337                         return -EINVAL;
338
339                 default:
340                         assert_not_reached("Unhandled option");
341                 }
342         }
343
344         return 1;
345 }
346
347 static int mount_all(const char *dest) {
348
349         typedef struct MountPoint {
350                 const char *what;
351                 const char *where;
352                 const char *type;
353                 const char *options;
354                 unsigned long flags;
355                 bool fatal;
356         } MountPoint;
357
358         static const MountPoint mount_table[] = {
359                 { "proc",      "/proc",     "proc",  NULL,       MS_NOSUID|MS_NOEXEC|MS_NODEV, true  },
360                 { "/proc/sys", "/proc/sys", NULL,    NULL,       MS_BIND, true                       },   /* Bind mount first */
361                 { NULL,        "/proc/sys", NULL,    NULL,       MS_BIND|MS_RDONLY|MS_REMOUNT, true  },   /* Then, make it r/o */
362                 { "sysfs",     "/sys",      "sysfs", NULL,       MS_RDONLY|MS_NOSUID|MS_NOEXEC|MS_NODEV, true  },
363                 { "tmpfs",     "/dev",      "tmpfs", "mode=755", MS_NOSUID|MS_STRICTATIME,     true  },
364                 { "devpts",    "/dev/pts",  "devpts","newinstance,ptmxmode=0666,mode=620,gid=" STRINGIFY(TTY_GID), MS_NOSUID|MS_NOEXEC, true },
365                 { "tmpfs",     "/dev/shm",  "tmpfs", "mode=1777", MS_NOSUID|MS_NODEV|MS_STRICTATIME, true  },
366                 { "tmpfs",     "/run",      "tmpfs", "mode=755", MS_NOSUID|MS_NODEV|MS_STRICTATIME, true  },
367 #ifdef HAVE_SELINUX
368                 { "/sys/fs/selinux", "/sys/fs/selinux", NULL, NULL, MS_BIND,                      false },  /* Bind mount first */
369                 { NULL,              "/sys/fs/selinux", NULL, NULL, MS_BIND|MS_RDONLY|MS_REMOUNT, false },  /* Then, make it r/o */
370 #endif
371         };
372
373         unsigned k;
374         int r = 0;
375
376         for (k = 0; k < ELEMENTSOF(mount_table); k++) {
377                 _cleanup_free_ char *where = NULL;
378                 int t;
379
380                 where = strjoin(dest, "/", mount_table[k].where, NULL);
381                 if (!where)
382                         return log_oom();
383
384                 t = path_is_mount_point(where, true);
385                 if (t < 0) {
386                         log_error("Failed to detect whether %s is a mount point: %s", where, strerror(-t));
387
388                         if (r == 0)
389                                 r = t;
390
391                         continue;
392                 }
393
394                 /* Skip this entry if it is not a remount. */
395                 if (mount_table[k].what && t > 0)
396                         continue;
397
398                 mkdir_p(where, 0755);
399
400                 if (mount(mount_table[k].what,
401                           where,
402                           mount_table[k].type,
403                           mount_table[k].flags,
404                           mount_table[k].options) < 0 &&
405                     mount_table[k].fatal) {
406
407                         log_error("mount(%s) failed: %m", where);
408
409                         if (r == 0)
410                                 r = -errno;
411                 }
412         }
413
414         return r;
415 }
416
417 static int mount_binds(const char *dest, char **l, unsigned long flags) {
418         char **x, **y;
419
420         STRV_FOREACH_PAIR(x, y, l) {
421                 char *where;
422                 struct stat source_st, dest_st;
423                 int r;
424
425                 if (stat(*x, &source_st) < 0) {
426                         log_error("failed to stat %s: %m", *x);
427                         return -errno;
428                 }
429
430                 where = strappenda(dest, *y);
431                 r = stat(where, &dest_st);
432                 if (r == 0) {
433                         if ((source_st.st_mode & S_IFMT) != (dest_st.st_mode & S_IFMT)) {
434                                 log_error("The file types of %s and %s do not match. Refusing bind mount",
435                                                 *x, where);
436                                 return -EINVAL;
437                         }
438                 } else if (errno == ENOENT) {
439                         r = mkdir_parents_label(where, 0755);
440                         if (r < 0) {
441                                 log_error("Failed to bind mount %s: %s", *x, strerror(-r));
442                                 return r;
443                         }
444                 } else {
445                         log_error("Failed to bind mount %s: %s", *x, strerror(errno));
446                         return -errno;
447                 }
448                 /* Create the mount point, but be conservative -- refuse to create block
449                 * and char devices. */
450                 if (S_ISDIR(source_st.st_mode))
451                         mkdir_label(where, 0755);
452                 else if (S_ISFIFO(source_st.st_mode))
453                         mkfifo(where, 0644);
454                 else if (S_ISSOCK(source_st.st_mode))
455                         mknod(where, 0644 | S_IFSOCK, 0);
456                 else if (S_ISREG(source_st.st_mode))
457                         touch(where);
458                 else {
459                         log_error("Refusing to create mountpoint for file: %s", *x);
460                         return -ENOTSUP;
461                 }
462
463                 if (mount(*x, where, "bind", MS_BIND, NULL) < 0) {
464                         log_error("mount(%s) failed: %m", where);
465                         return -errno;
466                 }
467
468                 if (flags && mount(NULL, where, NULL, MS_REMOUNT|MS_BIND|flags, NULL) < 0) {
469                         log_error("mount(%s) failed: %m", where);
470                         return -errno;
471                 }
472         }
473
474         return 0;
475 }
476
477 static int setup_timezone(const char *dest) {
478         _cleanup_free_ char *where = NULL, *p = NULL, *q = NULL, *check = NULL, *what = NULL;
479         char *z, *y;
480         int r;
481
482         assert(dest);
483
484         /* Fix the timezone, if possible */
485         r = readlink_malloc("/etc/localtime", &p);
486         if (r < 0) {
487                 log_warning("/etc/localtime is not a symlink, not updating container timezone.");
488                 return 0;
489         }
490
491         z = path_startswith(p, "../usr/share/zoneinfo/");
492         if (!z)
493                 z = path_startswith(p, "/usr/share/zoneinfo/");
494         if (!z) {
495                 log_warning("/etc/localtime does not point into /usr/share/zoneinfo/, not updating container timezone.");
496                 return 0;
497         }
498
499         where = strappend(dest, "/etc/localtime");
500         if (!where)
501                 return log_oom();
502
503         r = readlink_malloc(where, &q);
504         if (r >= 0) {
505                 y = path_startswith(q, "../usr/share/zoneinfo/");
506                 if (!y)
507                         y = path_startswith(q, "/usr/share/zoneinfo/");
508
509
510                 /* Already pointing to the right place? Then do nothing .. */
511                 if (y && streq(y, z))
512                         return 0;
513         }
514
515         check = strjoin(dest, "/usr/share/zoneinfo/", z, NULL);
516         if (!check)
517                 return log_oom();
518
519         if (access(check, F_OK) < 0) {
520                 log_warning("Timezone %s does not exist in container, not updating container timezone.", z);
521                 return 0;
522         }
523
524         what = strappend("../usr/share/zoneinfo/", z);
525         if (!what)
526                 return log_oom();
527
528         unlink(where);
529         if (symlink(what, where) < 0) {
530                 log_error("Failed to correct timezone of container: %m");
531                 return 0;
532         }
533
534         return 0;
535 }
536
537 static int setup_resolv_conf(const char *dest) {
538         char _cleanup_free_ *where = NULL;
539
540         assert(dest);
541
542         if (arg_private_network)
543                 return 0;
544
545         /* Fix resolv.conf, if possible */
546         where = strappend(dest, "/etc/resolv.conf");
547         if (!where)
548                 return log_oom();
549
550         /* We don't really care for the results of this really. If it
551          * fails, it fails, but meh... */
552         copy_file("/etc/resolv.conf", where, O_TRUNC|O_NOFOLLOW);
553
554         return 0;
555 }
556
557 static int setup_boot_id(const char *dest) {
558         _cleanup_free_ char *from = NULL, *to = NULL;
559         sd_id128_t rnd;
560         char as_uuid[37];
561         int r;
562
563         assert(dest);
564
565         /* Generate a new randomized boot ID, so that each boot-up of
566          * the container gets a new one */
567
568         from = strappend(dest, "/dev/proc-sys-kernel-random-boot-id");
569         to = strappend(dest, "/proc/sys/kernel/random/boot_id");
570         if (!from || !to)
571                 return log_oom();
572
573         r = sd_id128_randomize(&rnd);
574         if (r < 0) {
575                 log_error("Failed to generate random boot id: %s", strerror(-r));
576                 return r;
577         }
578
579         snprintf(as_uuid, sizeof(as_uuid),
580                  "%02x%02x%02x%02x-%02x%02x-%02x%02x-%02x%02x-%02x%02x%02x%02x%02x%02x",
581                  SD_ID128_FORMAT_VAL(rnd));
582         char_array_0(as_uuid);
583
584         r = write_string_file(from, as_uuid);
585         if (r < 0) {
586                 log_error("Failed to write boot id: %s", strerror(-r));
587                 return r;
588         }
589
590         if (mount(from, to, "bind", MS_BIND, NULL) < 0) {
591                 log_error("Failed to bind mount boot id: %m");
592                 r = -errno;
593         } else if (mount(from, to, "bind", MS_BIND|MS_REMOUNT|MS_RDONLY, NULL))
594                 log_warning("Failed to make boot id read-only: %m");
595
596         unlink(from);
597         return r;
598 }
599
600 static int copy_devnodes(const char *dest) {
601
602         static const char devnodes[] =
603                 "null\0"
604                 "zero\0"
605                 "full\0"
606                 "random\0"
607                 "urandom\0"
608                 "tty\0";
609
610         const char *d;
611         int r = 0;
612         _cleanup_umask_ mode_t u;
613
614         assert(dest);
615
616         u = umask(0000);
617
618         NULSTR_FOREACH(d, devnodes) {
619                 struct stat st;
620                 _cleanup_free_ char *from = NULL, *to = NULL;
621
622                 asprintf(&from, "/dev/%s", d);
623                 asprintf(&to, "%s/dev/%s", dest, d);
624
625                 if (!from || !to) {
626                         log_oom();
627
628                         if (r == 0)
629                                 r = -ENOMEM;
630
631                         break;
632                 }
633
634                 if (stat(from, &st) < 0) {
635
636                         if (errno != ENOENT) {
637                                 log_error("Failed to stat %s: %m", from);
638                                 if (r == 0)
639                                         r = -errno;
640                         }
641
642                 } else if (!S_ISCHR(st.st_mode) && !S_ISBLK(st.st_mode)) {
643
644                         log_error("%s is not a char or block device, cannot copy", from);
645                         if (r == 0)
646                                 r = -EIO;
647
648                 } else if (mknod(to, st.st_mode, st.st_rdev) < 0) {
649
650                         log_error("mknod(%s) failed: %m", dest);
651                         if (r == 0)
652                                 r = -errno;
653                 }
654         }
655
656         return r;
657 }
658
659 static int setup_ptmx(const char *dest) {
660         _cleanup_free_ char *p = NULL;
661
662         p = strappend(dest, "/dev/ptmx");
663         if (!p)
664                 return log_oom();
665
666         if (symlink("pts/ptmx", p) < 0) {
667                 log_error("Failed to create /dev/ptmx symlink: %m");
668                 return -errno;
669         }
670
671         return 0;
672 }
673
674 static int setup_dev_console(const char *dest, const char *console) {
675         struct stat st;
676         _cleanup_free_ char *to = NULL;
677         int r;
678         _cleanup_umask_ mode_t u;
679
680         assert(dest);
681         assert(console);
682
683         u = umask(0000);
684
685         if (stat(console, &st) < 0) {
686                 log_error("Failed to stat %s: %m", console);
687                 return -errno;
688
689         } else if (!S_ISCHR(st.st_mode)) {
690                 log_error("/dev/console is not a char device");
691                 return -EIO;
692         }
693
694         r = chmod_and_chown(console, 0600, 0, 0);
695         if (r < 0) {
696                 log_error("Failed to correct access mode for TTY: %s", strerror(-r));
697                 return r;
698         }
699
700         if (asprintf(&to, "%s/dev/console", dest) < 0)
701                 return log_oom();
702
703         /* We need to bind mount the right tty to /dev/console since
704          * ptys can only exist on pts file systems. To have something
705          * to bind mount things on we create a device node first, that
706          * has the right major/minor (note that the major minor
707          * doesn't actually matter here, since we mount it over
708          * anyway). */
709
710         if (mknod(to, (st.st_mode & ~07777) | 0600, st.st_rdev) < 0) {
711                 log_error("mknod() for /dev/console failed: %m");
712                 return -errno;
713         }
714
715         if (mount(console, to, "bind", MS_BIND, NULL) < 0) {
716                 log_error("Bind mount for /dev/console failed: %m");
717                 return -errno;
718         }
719
720         return 0;
721 }
722
723 static int setup_kmsg(const char *dest, int kmsg_socket) {
724         _cleanup_free_ char *from = NULL, *to = NULL;
725         int r, fd, k;
726         _cleanup_umask_ mode_t u;
727         union {
728                 struct cmsghdr cmsghdr;
729                 uint8_t buf[CMSG_SPACE(sizeof(int))];
730         } control = {};
731         struct msghdr mh = {
732                 .msg_control = &control,
733                 .msg_controllen = sizeof(control),
734         };
735         struct cmsghdr *cmsg;
736
737         assert(dest);
738         assert(kmsg_socket >= 0);
739
740         u = umask(0000);
741
742         /* We create the kmsg FIFO as /dev/kmsg, but immediately
743          * delete it after bind mounting it to /proc/kmsg. While FIFOs
744          * on the reading side behave very similar to /proc/kmsg,
745          * their writing side behaves differently from /dev/kmsg in
746          * that writing blocks when nothing is reading. In order to
747          * avoid any problems with containers deadlocking due to this
748          * we simply make /dev/kmsg unavailable to the container. */
749         if (asprintf(&from, "%s/dev/kmsg", dest) < 0 ||
750             asprintf(&to, "%s/proc/kmsg", dest) < 0)
751                 return log_oom();
752
753         if (mkfifo(from, 0600) < 0) {
754                 log_error("mkfifo() for /dev/kmsg failed: %m");
755                 return -errno;
756         }
757
758         r = chmod_and_chown(from, 0600, 0, 0);
759         if (r < 0) {
760                 log_error("Failed to correct access mode for /dev/kmsg: %s", strerror(-r));
761                 return r;
762         }
763
764         if (mount(from, to, "bind", MS_BIND, NULL) < 0) {
765                 log_error("Bind mount for /proc/kmsg failed: %m");
766                 return -errno;
767         }
768
769         fd = open(from, O_RDWR|O_NDELAY|O_CLOEXEC);
770         if (fd < 0) {
771                 log_error("Failed to open fifo: %m");
772                 return -errno;
773         }
774
775         cmsg = CMSG_FIRSTHDR(&mh);
776         cmsg->cmsg_level = SOL_SOCKET;
777         cmsg->cmsg_type = SCM_RIGHTS;
778         cmsg->cmsg_len = CMSG_LEN(sizeof(int));
779         memcpy(CMSG_DATA(cmsg), &fd, sizeof(int));
780
781         mh.msg_controllen = cmsg->cmsg_len;
782
783         /* Store away the fd in the socket, so that it stays open as
784          * long as we run the child */
785         k = sendmsg(kmsg_socket, &mh, MSG_DONTWAIT|MSG_NOSIGNAL);
786         close_nointr_nofail(fd);
787
788         if (k < 0) {
789                 log_error("Failed to send FIFO fd: %m");
790                 return -errno;
791         }
792
793         /* And now make the FIFO unavailable as /dev/kmsg... */
794         unlink(from);
795         return 0;
796 }
797
798 static int setup_hostname(void) {
799
800         if (sethostname(arg_machine, strlen(arg_machine)) < 0)
801                 return -errno;
802
803         return 0;
804 }
805
806 static int setup_journal(const char *directory) {
807         sd_id128_t machine_id;
808         _cleanup_free_ char *p = NULL, *b = NULL, *q = NULL, *d = NULL;
809         char *id;
810         int r;
811
812         if (arg_link_journal == LINK_NO)
813                 return 0;
814
815         p = strappend(directory, "/etc/machine-id");
816         if (!p)
817                 return log_oom();
818
819         r = read_one_line_file(p, &b);
820         if (r == -ENOENT && arg_link_journal == LINK_AUTO)
821                 return 0;
822         else if (r < 0) {
823                 log_error("Failed to read machine ID from %s: %s", p, strerror(-r));
824                 return r;
825         }
826
827         id = strstrip(b);
828         if (isempty(id) && arg_link_journal == LINK_AUTO)
829                 return 0;
830
831         /* Verify validity */
832         r = sd_id128_from_string(id, &machine_id);
833         if (r < 0) {
834                 log_error("Failed to parse machine ID from %s: %s", p, strerror(-r));
835                 return r;
836         }
837
838         free(p);
839         p = strappend("/var/log/journal/", id);
840         q = strjoin(directory, "/var/log/journal/", id, NULL);
841         if (!p || !q)
842                 return log_oom();
843
844         if (path_is_mount_point(p, false) > 0) {
845                 if (arg_link_journal != LINK_AUTO) {
846                         log_error("%s: already a mount point, refusing to use for journal", p);
847                         return -EEXIST;
848                 }
849
850                 return 0;
851         }
852
853         if (path_is_mount_point(q, false) > 0) {
854                 if (arg_link_journal != LINK_AUTO) {
855                         log_error("%s: already a mount point, refusing to use for journal", q);
856                         return -EEXIST;
857                 }
858
859                 return 0;
860         }
861
862         r = readlink_and_make_absolute(p, &d);
863         if (r >= 0) {
864                 if ((arg_link_journal == LINK_GUEST ||
865                      arg_link_journal == LINK_AUTO) &&
866                     path_equal(d, q)) {
867
868                         r = mkdir_p(q, 0755);
869                         if (r < 0)
870                                 log_warning("failed to create directory %s: %m", q);
871                         return 0;
872                 }
873
874                 if (unlink(p) < 0) {
875                         log_error("Failed to remove symlink %s: %m", p);
876                         return -errno;
877                 }
878         } else if (r == -EINVAL) {
879
880                 if (arg_link_journal == LINK_GUEST &&
881                     rmdir(p) < 0) {
882
883                         if (errno == ENOTDIR) {
884                                 log_error("%s already exists and is neither a symlink nor a directory", p);
885                                 return r;
886                         } else {
887                                 log_error("Failed to remove %s: %m", p);
888                                 return -errno;
889                         }
890                 }
891         } else if (r != -ENOENT) {
892                 log_error("readlink(%s) failed: %m", p);
893                 return r;
894         }
895
896         if (arg_link_journal == LINK_GUEST) {
897
898                 if (symlink(q, p) < 0) {
899                         log_error("Failed to symlink %s to %s: %m", q, p);
900                         return -errno;
901                 }
902
903                 r = mkdir_p(q, 0755);
904                 if (r < 0)
905                         log_warning("failed to create directory %s: %m", q);
906                 return 0;
907         }
908
909         if (arg_link_journal == LINK_HOST) {
910                 r = mkdir_p(p, 0755);
911                 if (r < 0) {
912                         log_error("Failed to create %s: %m", p);
913                         return r;
914                 }
915
916         } else if (access(p, F_OK) < 0)
917                 return 0;
918
919         if (dir_is_empty(q) == 0) {
920                 log_error("%s not empty.", q);
921                 return -ENOTEMPTY;
922         }
923
924         r = mkdir_p(q, 0755);
925         if (r < 0) {
926                 log_error("Failed to create %s: %m", q);
927                 return r;
928         }
929
930         if (mount(p, q, "bind", MS_BIND, NULL) < 0) {
931                 log_error("Failed to bind mount journal from host into guest: %m");
932                 return -errno;
933         }
934
935         return 0;
936 }
937
938 static int setup_kdbus(const char *dest, const char *path) {
939         const char *p;
940
941         if (!path)
942                 return 0;
943
944         p = strappenda(dest, "/dev/kdbus");
945         if (mkdir(p, 0755) < 0) {
946                 log_error("Failed to create kdbus path: %m");
947                 return  -errno;
948         }
949
950         if (mount(path, p, "bind", MS_BIND, NULL) < 0) {
951                 log_error("Failed to mount kdbus namespace path: %m");
952                 return -errno;
953         }
954
955         return 0;
956 }
957
958 static int drop_capabilities(void) {
959         return capability_bounding_set_drop(~arg_retain, false);
960 }
961
962 static int register_machine(void) {
963         _cleanup_bus_error_free_ sd_bus_error error = SD_BUS_ERROR_NULL;
964         _cleanup_bus_unref_ sd_bus *bus = NULL;
965         int r;
966
967         r = sd_bus_open_system(&bus);
968         if (r < 0) {
969                 log_error("Failed to open system bus: %s", strerror(-r));
970                 return r;
971         }
972
973         r = sd_bus_call_method(
974                         bus,
975                         "org.freedesktop.machine1",
976                         "/org/freedesktop/machine1",
977                         "org.freedesktop.machine1.Manager",
978                         "CreateMachine",
979                         &error,
980                         NULL,
981                         "sayssusa(sv)",
982                         arg_machine,
983                         SD_BUS_MESSAGE_APPEND_ID128(arg_uuid),
984                         "nspawn",
985                         "container",
986                         (uint32_t) 0,
987                         strempty(arg_directory),
988                         !isempty(arg_slice), "Slice", "s", arg_slice);
989         if (r < 0) {
990                 log_error("Failed to register machine: %s", bus_error_message(&error, r));
991                 return r;
992         }
993
994         return 0;
995 }
996
997 static int terminate_machine(pid_t pid) {
998         _cleanup_bus_error_free_ sd_bus_error error = SD_BUS_ERROR_NULL;
999         _cleanup_bus_message_unref_ sd_bus_message *reply = NULL;
1000         _cleanup_bus_unref_ sd_bus *bus = NULL;
1001         const char *path;
1002         int r;
1003
1004         r = sd_bus_default_system(&bus);
1005         if (r < 0) {
1006                 log_error("Failed to open system bus: %s", strerror(-r));
1007                 return r;
1008         }
1009
1010         r = sd_bus_call_method(
1011                         bus,
1012                         "org.freedesktop.machine1",
1013                         "/org/freedesktop/machine1",
1014                         "org.freedesktop.machine1.Manager",
1015                         "GetMachineByPID",
1016                         &error,
1017                         &reply,
1018                         "u",
1019                         (uint32_t) pid);
1020         if (r < 0) {
1021                 /* Note that the machine might already have been
1022                  * cleaned up automatically, hence don't consider it a
1023                  * failure if we cannot get the machine object. */
1024                 log_debug("Failed to get machine: %s", bus_error_message(&error, r));
1025                 return 0;
1026         }
1027
1028         r = sd_bus_message_read(reply, "o", &path);
1029         if (r < 0)
1030                 return bus_log_parse_error(r);
1031
1032         r = sd_bus_call_method(
1033                         bus,
1034                         "org.freedesktop.machine1",
1035                         path,
1036                         "org.freedesktop.machine1.Machine",
1037                         "Terminate",
1038                         &error,
1039                         NULL,
1040                         NULL);
1041         if (r < 0) {
1042                 log_debug("Failed to terminate machine: %s", bus_error_message(&error, r));
1043                 return 0;
1044         }
1045
1046         return 0;
1047 }
1048
/* Probes for the audit subsystem by trying to open a NETLINK_AUDIT
 * socket: returns true if that works (the socket is closed again
 * right away), false otherwise. */
static bool audit_enabled(void) {
        int fd;

        fd = socket(AF_NETLINK, SOCK_RAW, NETLINK_AUDIT);
        if (fd < 0)
                return false;

        close_nointr_nofail(fd);
        return true;
}
1059
1060 int main(int argc, char *argv[]) {
1061         pid_t pid = 0;
1062         int r = EXIT_FAILURE, k;
1063         _cleanup_close_ int master = -1, kdbus_fd = -1;
1064         int n_fd_passed;
1065         const char *console = NULL;
1066         sigset_t mask;
1067         _cleanup_close_pipe_ int kmsg_socket_pair[2] = { -1, -1 };
1068         _cleanup_fdset_free_ FDSet *fds = NULL;
1069         _cleanup_free_ char *kdbus_namespace = NULL;
1070         const char *ns;
1071
1072         log_parse_environment();
1073         log_open();
1074
1075         k = parse_argv(argc, argv);
1076         if (k < 0)
1077                 goto finish;
1078         else if (k == 0) {
1079                 r = EXIT_SUCCESS;
1080                 goto finish;
1081         }
1082
1083         if (arg_directory) {
1084                 char *p;
1085
1086                 p = path_make_absolute_cwd(arg_directory);
1087                 free(arg_directory);
1088                 arg_directory = p;
1089         } else
1090                 arg_directory = get_current_dir_name();
1091
1092         if (!arg_directory) {
1093                 log_error("Failed to determine path, please use -D.");
1094                 goto finish;
1095         }
1096
1097         path_kill_slashes(arg_directory);
1098
1099         if (!arg_machine) {
1100                 arg_machine = strdup(basename(arg_directory));
1101                 if (!arg_machine) {
1102                         log_oom();
1103                         goto finish;
1104                 }
1105
1106                 hostname_cleanup(arg_machine, false);
1107                 if (isempty(arg_machine)) {
1108                         log_error("Failed to determine machine name automatically, please use -M.");
1109                         goto finish;
1110                 }
1111         }
1112
1113         if (geteuid() != 0) {
1114                 log_error("Need to be root.");
1115                 goto finish;
1116         }
1117
1118         if (sd_booted() <= 0) {
1119                 log_error("Not running on a systemd system.");
1120                 goto finish;
1121         }
1122
1123         if (arg_boot && audit_enabled()) {
1124                 log_warning("The kernel auditing subsystem is known to be incompatible with containers.\n"
1125                             "Please make sure to turn off auditing with 'audit=0' on the kernel command\n"
1126                             "line before using systemd-nspawn. Sleeping for 5s...\n");
1127                 sleep(5);
1128         }
1129
1130         if (path_equal(arg_directory, "/")) {
1131                 log_error("Spawning container on root directory not supported.");
1132                 goto finish;
1133         }
1134
1135         if (path_is_os_tree(arg_directory) <= 0) {
1136                 log_error("Directory %s doesn't look like an OS root directory (/etc/os-release is missing). Refusing.", arg_directory);
1137                 goto finish;
1138         }
1139
1140         log_close();
1141         n_fd_passed = sd_listen_fds(false);
1142         if (n_fd_passed > 0) {
1143                 k = fdset_new_listen_fds(&fds, false);
1144                 if (k < 0) {
1145                         log_error("Failed to collect file descriptors: %s", strerror(-k));
1146                         goto finish;
1147                 }
1148         }
1149         fdset_close_others(fds);
1150         log_open();
1151
1152         master = posix_openpt(O_RDWR|O_NOCTTY|O_CLOEXEC|O_NDELAY);
1153         if (master < 0) {
1154                 log_error("Failed to acquire pseudo tty: %m");
1155                 goto finish;
1156         }
1157
1158         console = ptsname(master);
1159         if (!console) {
1160                 log_error("Failed to determine tty name: %m");
1161                 goto finish;
1162         }
1163
1164         log_info("Spawning container %s on %s. Press ^] three times within 1s to abort execution.", arg_machine, arg_directory);
1165
1166         if (unlockpt(master) < 0) {
1167                 log_error("Failed to unlock tty: %m");
1168                 goto finish;
1169         }
1170
1171         ns = strappenda("machine-", arg_machine);
1172         kdbus_fd = bus_kernel_create_namespace(ns, &kdbus_namespace);
1173         if (r < 0)
1174                 log_debug("Failed to create kdbus namespace: %s", strerror(-r));
1175         else
1176                 log_debug("Successfully created kdbus namespace as %s", kdbus_namespace);
1177
1178         if (socketpair(AF_UNIX, SOCK_DGRAM|SOCK_NONBLOCK|SOCK_CLOEXEC, 0, kmsg_socket_pair) < 0) {
1179                 log_error("Failed to create kmsg socket pair.");
1180                 goto finish;
1181         }
1182
1183         sd_notify(0, "READY=1");
1184
1185         assert_se(sigemptyset(&mask) == 0);
1186         sigset_add_many(&mask, SIGCHLD, SIGWINCH, SIGTERM, SIGINT, -1);
1187         assert_se(sigprocmask(SIG_BLOCK, &mask, NULL) == 0);
1188
1189         for (;;) {
1190                 siginfo_t status;
1191
1192                 pid = syscall(__NR_clone, SIGCHLD|CLONE_NEWIPC|CLONE_NEWNS|CLONE_NEWPID|CLONE_NEWUTS|(arg_private_network ? CLONE_NEWNET : 0), NULL);
1193                 if (pid < 0) {
1194                         if (errno == EINVAL)
1195                                 log_error("clone() failed, do you have namespace support enabled in your kernel? (You need UTS, IPC, PID and NET namespacing built in): %m");
1196                         else
1197                                 log_error("clone() failed: %m");
1198
1199                         goto finish;
1200                 }
1201
1202                 if (pid == 0) {
1203                         /* child */
1204                         const char *home = NULL;
1205                         uid_t uid = (uid_t) -1;
1206                         gid_t gid = (gid_t) -1;
1207                         unsigned n_env = 2;
1208                         const char *envp[] = {
1209                                 "PATH=/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin",
1210                                 "container=systemd-nspawn", /* LXC sets container=lxc, so follow the scheme here */
1211                                 NULL, /* TERM */
1212                                 NULL, /* HOME */
1213                                 NULL, /* USER */
1214                                 NULL, /* LOGNAME */
1215                                 NULL, /* container_uuid */
1216                                 NULL, /* LISTEN_FDS */
1217                                 NULL, /* LISTEN_PID */
1218                                 NULL
1219                         };
1220
1221                         envp[n_env] = strv_find_prefix(environ, "TERM=");
1222                         if (envp[n_env])
1223                                 n_env ++;
1224
1225                         close_nointr_nofail(master);
1226                         master = -1;
1227
1228                         close_nointr(STDIN_FILENO);
1229                         close_nointr(STDOUT_FILENO);
1230                         close_nointr(STDERR_FILENO);
1231
1232                         close_nointr_nofail(kmsg_socket_pair[0]);
1233                         kmsg_socket_pair[0] = -1;
1234
1235                         reset_all_signal_handlers();
1236
1237                         assert_se(sigemptyset(&mask) == 0);
1238                         assert_se(sigprocmask(SIG_SETMASK, &mask, NULL) == 0);
1239
1240                         k = open_terminal(console, O_RDWR);
1241                         if (k != STDIN_FILENO) {
1242                                 if (k >= 0) {
1243                                         close_nointr_nofail(k);
1244                                         k = -EINVAL;
1245                                 }
1246
1247                                 log_error("Failed to open console: %s", strerror(-k));
1248                                 goto child_fail;
1249                         }
1250
1251                         if (dup2(STDIN_FILENO, STDOUT_FILENO) != STDOUT_FILENO ||
1252                             dup2(STDIN_FILENO, STDERR_FILENO) != STDERR_FILENO) {
1253                                 log_error("Failed to duplicate console: %m");
1254                                 goto child_fail;
1255                         }
1256
1257                         if (setsid() < 0) {
1258                                 log_error("setsid() failed: %m");
1259                                 goto child_fail;
1260                         }
1261
1262                         if (prctl(PR_SET_PDEATHSIG, SIGKILL) < 0) {
1263                                 log_error("PR_SET_PDEATHSIG failed: %m");
1264                                 goto child_fail;
1265                         }
1266
1267                         r = register_machine();
1268                         if (r < 0)
1269                                 goto finish;
1270
1271                         /* Mark everything as slave, so that we still
1272                          * receive mounts from the real root, but don't
1273                          * propagate mounts to the real root. */
1274                         if (mount(NULL, "/", NULL, MS_SLAVE|MS_REC, NULL) < 0) {
1275                                 log_error("MS_SLAVE|MS_REC failed: %m");
1276                                 goto child_fail;
1277                         }
1278
1279                         /* Turn directory into bind mount */
1280                         if (mount(arg_directory, arg_directory, "bind", MS_BIND|MS_REC, NULL) < 0) {
1281                                 log_error("Failed to make bind mount.");
1282                                 goto child_fail;
1283                         }
1284
1285                         if (arg_read_only)
1286                                 if (mount(arg_directory, arg_directory, "bind", MS_BIND|MS_REMOUNT|MS_RDONLY|MS_REC, NULL) < 0) {
1287                                         log_error("Failed to make read-only.");
1288                                         goto child_fail;
1289                                 }
1290
1291                         if (mount_all(arg_directory) < 0)
1292                                 goto child_fail;
1293
1294                         if (copy_devnodes(arg_directory) < 0)
1295                                 goto child_fail;
1296
1297                         if (setup_ptmx(arg_directory) < 0)
1298                                 goto child_fail;
1299
1300                         dev_setup(arg_directory);
1301
1302                         if (setup_dev_console(arg_directory, console) < 0)
1303                                 goto child_fail;
1304
1305                         if (setup_kmsg(arg_directory, kmsg_socket_pair[1]) < 0)
1306                                 goto child_fail;
1307
1308                         close_nointr_nofail(kmsg_socket_pair[1]);
1309                         kmsg_socket_pair[1] = -1;
1310
1311                         if (setup_boot_id(arg_directory) < 0)
1312                                 goto child_fail;
1313
1314                         if (setup_timezone(arg_directory) < 0)
1315                                 goto child_fail;
1316
1317                         if (setup_resolv_conf(arg_directory) < 0)
1318                                 goto child_fail;
1319
1320                         if (setup_journal(arg_directory) < 0)
1321                                 goto child_fail;
1322
1323                         if (mount_binds(arg_directory, arg_bind, 0) < 0)
1324                                 goto child_fail;
1325
1326                         if (mount_binds(arg_directory, arg_bind_ro, MS_RDONLY) < 0)
1327                                 goto child_fail;
1328
1329                         if (setup_kdbus(arg_directory, kdbus_namespace) < 0)
1330                                 goto child_fail;
1331
1332                         if (chdir(arg_directory) < 0) {
1333                                 log_error("chdir(%s) failed: %m", arg_directory);
1334                                 goto child_fail;
1335                         }
1336
1337                         if (mount(arg_directory, "/", NULL, MS_MOVE, NULL) < 0) {
1338                                 log_error("mount(MS_MOVE) failed: %m");
1339                                 goto child_fail;
1340                         }
1341
1342                         if (chroot(".") < 0) {
1343                                 log_error("chroot() failed: %m");
1344                                 goto child_fail;
1345                         }
1346
1347                         if (chdir("/") < 0) {
1348                                 log_error("chdir() failed: %m");
1349                                 goto child_fail;
1350                         }
1351
1352                         umask(0022);
1353
1354                         loopback_setup();
1355
1356                         if (drop_capabilities() < 0) {
1357                                 log_error("drop_capabilities() failed: %m");
1358                                 goto child_fail;
1359                         }
1360
1361                         if (arg_user) {
1362
1363                                 /* Note that this resolves user names
1364                                  * inside the container, and hence
1365                                  * accesses the NSS modules from the
1366                                  * container and not the host. This is
1367                                  * a bit weird... */
1368
1369                                 if (get_user_creds((const char**)&arg_user, &uid, &gid, &home, NULL) < 0) {
1370                                         log_error("get_user_creds() failed: %m");
1371                                         goto child_fail;
1372                                 }
1373
1374                                 if (mkdir_parents_label(home, 0775) < 0) {
1375                                         log_error("mkdir_parents_label() failed: %m");
1376                                         goto child_fail;
1377                                 }
1378
1379                                 if (mkdir_safe_label(home, 0775, uid, gid) < 0) {
1380                                         log_error("mkdir_safe_label() failed: %m");
1381                                         goto child_fail;
1382                                 }
1383
1384                                 if (initgroups((const char*)arg_user, gid) < 0) {
1385                                         log_error("initgroups() failed: %m");
1386                                         goto child_fail;
1387                                 }
1388
1389                                 if (setresgid(gid, gid, gid) < 0) {
1390                                         log_error("setregid() failed: %m");
1391                                         goto child_fail;
1392                                 }
1393
1394                                 if (setresuid(uid, uid, uid) < 0) {
1395                                         log_error("setreuid() failed: %m");
1396                                         goto child_fail;
1397                                 }
1398                         } else {
1399                                 /* Reset everything fully to 0, just in case */
1400
1401                                 if (setgroups(0, NULL) < 0) {
1402                                         log_error("setgroups() failed: %m");
1403                                         goto child_fail;
1404                                 }
1405
1406                                 if (setresgid(0, 0, 0) < 0) {
1407                                         log_error("setregid() failed: %m");
1408                                         goto child_fail;
1409                                 }
1410
1411                                 if (setresuid(0, 0, 0) < 0) {
1412                                         log_error("setreuid() failed: %m");
1413                                         goto child_fail;
1414                                 }
1415                         }
1416
1417                         if ((asprintf((char**)(envp + n_env++), "HOME=%s", home ? home: "/root") < 0) ||
1418                             (asprintf((char**)(envp + n_env++), "USER=%s", arg_user ? arg_user : "root") < 0) ||
1419                             (asprintf((char**)(envp + n_env++), "LOGNAME=%s", arg_user ? arg_user : "root") < 0)) {
1420                                 log_oom();
1421                                 goto child_fail;
1422                         }
1423
1424                         if (!sd_id128_equal(arg_uuid, SD_ID128_NULL)) {
1425                                 if (asprintf((char**)(envp + n_env++), "container_uuid=" SD_ID128_FORMAT_STR, SD_ID128_FORMAT_VAL(arg_uuid)) < 0) {
1426                                         log_oom();
1427                                         goto child_fail;
1428                                 }
1429                         }
1430
1431                         if (fdset_size(fds) > 0) {
1432                                 k = fdset_cloexec(fds, false);
1433                                 if (k < 0) {
1434                                         log_error("Failed to unset O_CLOEXEC for file descriptors.");
1435                                         goto child_fail;
1436                                 }
1437
1438                                 if ((asprintf((char **)(envp + n_env++), "LISTEN_FDS=%u", n_fd_passed) < 0) ||
1439                                     (asprintf((char **)(envp + n_env++), "LISTEN_PID=1") < 0)) {
1440                                         log_oom();
1441                                         goto child_fail;
1442                                 }
1443                         }
1444
1445                         setup_hostname();
1446
1447                         if (arg_boot) {
1448                                 char **a;
1449                                 size_t l;
1450
1451                                 /* Automatically search for the init system */
1452
1453                                 l = 1 + argc - optind;
1454                                 a = newa(char*, l + 1);
1455                                 memcpy(a + 1, argv + optind, l * sizeof(char*));
1456
1457                                 a[0] = (char*) "/usr/lib/systemd/systemd";
1458                                 execve(a[0], a, (char**) envp);
1459
1460                                 a[0] = (char*) "/lib/systemd/systemd";
1461                                 execve(a[0], a, (char**) envp);
1462
1463                                 a[0] = (char*) "/sbin/init";
1464                                 execve(a[0], a, (char**) envp);
1465                         } else if (argc > optind)
1466                                 execvpe(argv[optind], argv + optind, (char**) envp);
1467                         else {
1468                                 chdir(home ? home : "/root");
1469                                 execle("/bin/bash", "-bash", NULL, (char**) envp);
1470                         }
1471
1472                         log_error("execv() failed: %m");
1473
1474                 child_fail:
1475                         _exit(EXIT_FAILURE);
1476                 }
1477
1478                 fdset_free(fds);
1479                 fds = NULL;
1480
1481                 k = process_pty(master, &mask, arg_boot ? pid : 0, SIGRTMIN+3);
1482                 if (k < 0) {
1483                         r = EXIT_FAILURE;
1484                         break;
1485                 }
1486
1487                 putc('\n', stdout);
1488
1489                 /* Kill if it is not dead yet anyway */
1490                 terminate_machine(pid);
1491
1492                 /* Redundant, but better safe than sorry */
1493                 kill(pid, SIGKILL);
1494
1495                 k = wait_for_terminate(pid, &status);
1496                 pid = 0;
1497
1498                 if (k < 0) {
1499                         r = EXIT_FAILURE;
1500                         break;
1501                 }
1502
1503                 if (status.si_code == CLD_EXITED) {
1504                         r = status.si_status;
1505                         if (status.si_status != 0) {
1506                                 log_error("Container %s failed with error code %i.", arg_machine, status.si_status);
1507                                 break;
1508                         }
1509
1510                         log_debug("Container %s exited successfully.", arg_machine);
1511                         break;
1512                 } else if (status.si_code == CLD_KILLED &&
1513                            status.si_status == SIGINT) {
1514                         log_info("Container %s has been shut down.", arg_machine);
1515                         r = 0;
1516                         break;
1517                 } else if (status.si_code == CLD_KILLED &&
1518                            status.si_status == SIGHUP) {
1519                         log_info("Container %s is being rebooted.", arg_machine);
1520                         continue;
1521                 } else if (status.si_code == CLD_KILLED ||
1522                            status.si_code == CLD_DUMPED) {
1523
1524                         log_error("Container %s terminated by signal %s.", arg_machine,  signal_to_string(status.si_status));
1525                         r = EXIT_FAILURE;
1526                         break;
1527                 } else {
1528                         log_error("Container %s failed due to unknown reason.", arg_machine);
1529                         r = EXIT_FAILURE;
1530                         break;
1531                 }
1532         }
1533
1534 finish:
1535         if (pid > 0)
1536                 kill(pid, SIGKILL);
1537
1538         free(arg_directory);
1539         free(arg_machine);
1540
1541         return r;
1542 }