chiark / gitweb /
nspawn: complain and continue if machine has same id
[elogind.git] / src / nspawn / nspawn.c
1 /*-*- Mode: C; c-basic-offset: 8; indent-tabs-mode: nil -*-*/
2
3 /***
4   This file is part of systemd.
5
6   Copyright 2010 Lennart Poettering
7
8   systemd is free software; you can redistribute it and/or modify it
9   under the terms of the GNU Lesser General Public License as published by
10   the Free Software Foundation; either version 2.1 of the License, or
11   (at your option) any later version.
12
13   systemd is distributed in the hope that it will be useful, but
14   WITHOUT ANY WARRANTY; without even the implied warranty of
15   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
16   Lesser General Public License for more details.
17
18   You should have received a copy of the GNU Lesser General Public License
19   along with systemd; If not, see <http://www.gnu.org/licenses/>.
20 ***/
21
22 #include <signal.h>
23 #include <sched.h>
24 #include <unistd.h>
25 #include <sys/types.h>
26 #include <sys/syscall.h>
27 #include <sys/mount.h>
28 #include <sys/wait.h>
29 #include <stdlib.h>
30 #include <string.h>
31 #include <stdio.h>
32 #include <errno.h>
33 #include <sys/prctl.h>
34 #include <sys/capability.h>
35 #include <getopt.h>
36 #include <termios.h>
37 #include <sys/signalfd.h>
38 #include <grp.h>
39 #include <linux/fs.h>
40 #include <sys/un.h>
41 #include <sys/socket.h>
42 #include <linux/netlink.h>
43
44 #include "sd-daemon.h"
45 #include "sd-bus.h"
46 #include "sd-id128.h"
47 #include "log.h"
48 #include "util.h"
49 #include "mkdir.h"
50 #include "macro.h"
51 #include "audit.h"
52 #include "missing.h"
53 #include "cgroup-util.h"
54 #include "strv.h"
55 #include "path-util.h"
56 #include "loopback-setup.h"
57 #include "dev-setup.h"
58 #include "fdset.h"
59 #include "build.h"
60 #include "fileio.h"
61 #include "bus-util.h"
62 #include "bus-error.h"
63 #include "ptyfwd.h"
64 #include "bus-kernel.h"
65
/* Group id that is spliced (via STRINGIFY) into the "gid=" option of the
 * container's devpts mount in mount_all(). Only defined here if nothing
 * defined it earlier. */
#ifndef TTY_GID
#define TTY_GID 5
#endif
69
/* Journal linking mode, selected on the command line with
 * --link-journal=MODE (or -j) and stored in arg_link_journal. */
typedef enum LinkJournal {
        LINK_NO,        /* --link-journal=no */
        LINK_AUTO,      /* --link-journal=auto; initial value of arg_link_journal */
        LINK_HOST,      /* --link-journal=host */
        LINK_GUEST      /* --link-journal=guest; also what -j selects */
} LinkJournal;
76
/* Command line state, filled in by parse_argv(). */

/* -D/--directory: result of canonicalize_file_name() on the argument (heap allocated) */
static char *arg_directory = NULL;
/* -u/--user: copy of the argument (heap allocated) */
static char *arg_user = NULL;
/* --uuid: parsed with sd_id128_from_string() */
static sd_id128_t arg_uuid = {};
/* -M/--machine: copy of the argument, accepted only if hostname_is_valid() */
static char *arg_machine = NULL;
/* -S/--slice: copy of the argument */
static const char *arg_slice = NULL;
/* --private-network */
static bool arg_private_network = false;
/* --read-only */
static bool arg_read_only = false;
/* -b/--boot */
static bool arg_boot = false;
/* --link-journal=/-j */
static LinkJournal arg_link_journal = LINK_AUTO;
/* Capability bit mask (bit N <=> capability number N). The initializer is
 * the default set; --capability= ORs bits in, --drop-capability= clears
 * them. */
static uint64_t arg_retain =
        (1ULL << CAP_CHOWN) |
        (1ULL << CAP_DAC_OVERRIDE) |
        (1ULL << CAP_DAC_READ_SEARCH) |
        (1ULL << CAP_FOWNER) |
        (1ULL << CAP_FSETID) |
        (1ULL << CAP_IPC_OWNER) |
        (1ULL << CAP_KILL) |
        (1ULL << CAP_LEASE) |
        (1ULL << CAP_LINUX_IMMUTABLE) |
        (1ULL << CAP_NET_BIND_SERVICE) |
        (1ULL << CAP_NET_BROADCAST) |
        (1ULL << CAP_NET_RAW) |
        (1ULL << CAP_SETGID) |
        (1ULL << CAP_SETFCAP) |
        (1ULL << CAP_SETPCAP) |
        (1ULL << CAP_SETUID) |
        (1ULL << CAP_SYS_ADMIN) |
        (1ULL << CAP_SYS_CHROOT) |
        (1ULL << CAP_SYS_NICE) |
        (1ULL << CAP_SYS_PTRACE) |
        (1ULL << CAP_SYS_TTY_CONFIG) |
        (1ULL << CAP_SYS_RESOURCE) |
        (1ULL << CAP_SYS_BOOT) |
        (1ULL << CAP_AUDIT_WRITE) |
        (1ULL << CAP_AUDIT_CONTROL);
/* --bind=: string vector of consecutive (source, destination) path pairs */
static char **arg_bind = NULL;
/* --bind-ro=: same layout as arg_bind, for read-only bind mounts */
static char **arg_bind_ro = NULL;

114
115 static int help(void) {
116
117         printf("%s [OPTIONS...] [PATH] [ARGUMENTS...]\n\n"
118                "Spawn a minimal namespace container for debugging, testing and building.\n\n"
119                "  -h --help                Show this help\n"
120                "     --version             Print version string\n"
121                "  -D --directory=NAME      Root directory for the container\n"
122                "  -b --boot                Boot up full system (i.e. invoke init)\n"
123                "  -u --user=USER           Run the command under specified user or uid\n"
124                "     --uuid=UUID           Set a specific machine UUID for the container\n"
125                "  -M --machine=NAME        Set the machine name for the container\n"
126                "  -S --slice=SLICE         Place the container in the specified slice\n"
127                "     --private-network     Disable network in container\n"
128                "     --read-only           Mount the root directory read-only\n"
129                "     --capability=CAP      In addition to the default, retain specified\n"
130                "                           capability\n"
131                "     --drop-capability=CAP Drop the specified capability from the default set\n"
132                "     --link-journal=MODE   Link up guest journal, one of no, auto, guest, host\n"
133                "  -j                       Equivalent to --link-journal=host\n"
134                "     --bind=PATH[:PATH]    Bind mount a file or directory from the host into\n"
135                "                           the container\n"
136                "     --bind-ro=PATH[:PATH] Similar, but creates a read-only bind mount\n",
137                program_invocation_short_name);
138
139         return 0;
140 }
141
142 static int parse_argv(int argc, char *argv[]) {
143
144         enum {
145                 ARG_VERSION = 0x100,
146                 ARG_PRIVATE_NETWORK,
147                 ARG_UUID,
148                 ARG_READ_ONLY,
149                 ARG_CAPABILITY,
150                 ARG_DROP_CAPABILITY,
151                 ARG_LINK_JOURNAL,
152                 ARG_BIND,
153                 ARG_BIND_RO
154         };
155
156         static const struct option options[] = {
157                 { "help",            no_argument,       NULL, 'h'                 },
158                 { "version",         no_argument,       NULL, ARG_VERSION         },
159                 { "directory",       required_argument, NULL, 'D'                 },
160                 { "user",            required_argument, NULL, 'u'                 },
161                 { "private-network", no_argument,       NULL, ARG_PRIVATE_NETWORK },
162                 { "boot",            no_argument,       NULL, 'b'                 },
163                 { "uuid",            required_argument, NULL, ARG_UUID            },
164                 { "read-only",       no_argument,       NULL, ARG_READ_ONLY       },
165                 { "capability",      required_argument, NULL, ARG_CAPABILITY      },
166                 { "drop-capability", required_argument, NULL, ARG_DROP_CAPABILITY },
167                 { "link-journal",    required_argument, NULL, ARG_LINK_JOURNAL    },
168                 { "bind",            required_argument, NULL, ARG_BIND            },
169                 { "bind-ro",         required_argument, NULL, ARG_BIND_RO         },
170                 { "machine",         required_argument, NULL, 'M'                 },
171                 { "slice",           required_argument, NULL, 'S'                 },
172                 {}
173         };
174
175         int c, r;
176
177         assert(argc >= 0);
178         assert(argv);
179
180         while ((c = getopt_long(argc, argv, "+hD:u:bM:jS:", options, NULL)) >= 0) {
181
182                 switch (c) {
183
184                 case 'h':
185                         return help();
186
187                 case ARG_VERSION:
188                         puts(PACKAGE_STRING);
189                         puts(SYSTEMD_FEATURES);
190                         return 0;
191
192                 case 'D':
193                         free(arg_directory);
194                         arg_directory = canonicalize_file_name(optarg);
195                         if (!arg_directory) {
196                                 log_error("Invalid root directory: %m");
197                                 return -ENOMEM;
198                         }
199
200                         break;
201
202                 case 'u':
203                         free(arg_user);
204                         arg_user = strdup(optarg);
205                         if (!arg_user)
206                                 return log_oom();
207
208                         break;
209
210                 case ARG_PRIVATE_NETWORK:
211                         arg_private_network = true;
212                         break;
213
214                 case 'b':
215                         arg_boot = true;
216                         break;
217
218                 case ARG_UUID:
219                         r = sd_id128_from_string(optarg, &arg_uuid);
220                         if (r < 0) {
221                                 log_error("Invalid UUID: %s", optarg);
222                                 return r;
223                         }
224                         break;
225
226                 case 'S':
227                         arg_slice = strdup(optarg);
228                         if (!arg_slice)
229                                 return log_oom();
230
231                         break;
232
233                 case 'M':
234                         if (!hostname_is_valid(optarg)) {
235                                 log_error("Invalid machine name: %s", optarg);
236                                 return -EINVAL;
237                         }
238
239                         free(arg_machine);
240                         arg_machine = strdup(optarg);
241                         if (!arg_machine)
242                                 return log_oom();
243
244                         break;
245
246                 case ARG_READ_ONLY:
247                         arg_read_only = true;
248                         break;
249
250                 case ARG_CAPABILITY:
251                 case ARG_DROP_CAPABILITY: {
252                         char *state, *word;
253                         size_t length;
254
255                         FOREACH_WORD_SEPARATOR(word, length, optarg, ",", state) {
256                                 cap_value_t cap;
257                                 char *t;
258
259                                 t = strndup(word, length);
260                                 if (!t)
261                                         return log_oom();
262
263                                 if (cap_from_name(t, &cap) < 0) {
264                                         log_error("Failed to parse capability %s.", t);
265                                         free(t);
266                                         return -EINVAL;
267                                 }
268
269                                 free(t);
270
271                                 if (c == ARG_CAPABILITY)
272                                         arg_retain |= 1ULL << (uint64_t) cap;
273                                 else
274                                         arg_retain &= ~(1ULL << (uint64_t) cap);
275                         }
276
277                         break;
278                 }
279
280                 case 'j':
281                         arg_link_journal = LINK_GUEST;
282                         break;
283
284                 case ARG_LINK_JOURNAL:
285                         if (streq(optarg, "auto"))
286                                 arg_link_journal = LINK_AUTO;
287                         else if (streq(optarg, "no"))
288                                 arg_link_journal = LINK_NO;
289                         else if (streq(optarg, "guest"))
290                                 arg_link_journal = LINK_GUEST;
291                         else if (streq(optarg, "host"))
292                                 arg_link_journal = LINK_HOST;
293                         else {
294                                 log_error("Failed to parse link journal mode %s", optarg);
295                                 return -EINVAL;
296                         }
297
298                         break;
299
300                 case ARG_BIND:
301                 case ARG_BIND_RO: {
302                         _cleanup_free_ char *a = NULL, *b = NULL;
303                         char *e;
304                         char ***x;
305
306                         x = c == ARG_BIND ? &arg_bind : &arg_bind_ro;
307
308                         e = strchr(optarg, ':');
309                         if (e) {
310                                 a = strndup(optarg, e - optarg);
311                                 b = strdup(e + 1);
312                         } else {
313                                 a = strdup(optarg);
314                                 b = strdup(optarg);
315                         }
316
317                         if (!a || !b)
318                                 return log_oom();
319
320                         if (!path_is_absolute(a) || !path_is_absolute(b)) {
321                                 log_error("Invalid bind mount specification: %s", optarg);
322                                 return -EINVAL;
323                         }
324
325                         r = strv_extend(x, a);
326                         if (r < 0)
327                                 return log_oom();
328
329                         r = strv_extend(x, b);
330                         if (r < 0)
331                                 return log_oom();
332
333                         break;
334                 }
335
336                 case '?':
337                         return -EINVAL;
338
339                 default:
340                         assert_not_reached("Unhandled option");
341                 }
342         }
343
344         return 1;
345 }
346
347 static int mount_all(const char *dest) {
348
349         typedef struct MountPoint {
350                 const char *what;
351                 const char *where;
352                 const char *type;
353                 const char *options;
354                 unsigned long flags;
355                 bool fatal;
356         } MountPoint;
357
358         static const MountPoint mount_table[] = {
359                 { "proc",      "/proc",     "proc",  NULL,       MS_NOSUID|MS_NOEXEC|MS_NODEV, true  },
360                 { "/proc/sys", "/proc/sys", NULL,    NULL,       MS_BIND, true                       },   /* Bind mount first */
361                 { NULL,        "/proc/sys", NULL,    NULL,       MS_BIND|MS_RDONLY|MS_REMOUNT, true  },   /* Then, make it r/o */
362                 { "sysfs",     "/sys",      "sysfs", NULL,       MS_RDONLY|MS_NOSUID|MS_NOEXEC|MS_NODEV, true  },
363                 { "tmpfs",     "/dev",      "tmpfs", "mode=755", MS_NOSUID|MS_STRICTATIME,     true  },
364                 { "devpts",    "/dev/pts",  "devpts","newinstance,ptmxmode=0666,mode=620,gid=" STRINGIFY(TTY_GID), MS_NOSUID|MS_NOEXEC, true },
365                 { "tmpfs",     "/dev/shm",  "tmpfs", "mode=1777", MS_NOSUID|MS_NODEV|MS_STRICTATIME, true  },
366                 { "tmpfs",     "/run",      "tmpfs", "mode=755", MS_NOSUID|MS_NODEV|MS_STRICTATIME, true  },
367 #ifdef HAVE_SELINUX
368                 { "/sys/fs/selinux", "/sys/fs/selinux", NULL, NULL, MS_BIND,                      false },  /* Bind mount first */
369                 { NULL,              "/sys/fs/selinux", NULL, NULL, MS_BIND|MS_RDONLY|MS_REMOUNT, false },  /* Then, make it r/o */
370 #endif
371         };
372
373         unsigned k;
374         int r = 0;
375
376         for (k = 0; k < ELEMENTSOF(mount_table); k++) {
377                 _cleanup_free_ char *where = NULL;
378                 int t;
379
380                 where = strjoin(dest, "/", mount_table[k].where, NULL);
381                 if (!where)
382                         return log_oom();
383
384                 t = path_is_mount_point(where, true);
385                 if (t < 0) {
386                         log_error("Failed to detect whether %s is a mount point: %s", where, strerror(-t));
387
388                         if (r == 0)
389                                 r = t;
390
391                         continue;
392                 }
393
394                 /* Skip this entry if it is not a remount. */
395                 if (mount_table[k].what && t > 0)
396                         continue;
397
398                 mkdir_p(where, 0755);
399
400                 if (mount(mount_table[k].what,
401                           where,
402                           mount_table[k].type,
403                           mount_table[k].flags,
404                           mount_table[k].options) < 0 &&
405                     mount_table[k].fatal) {
406
407                         log_error("mount(%s) failed: %m", where);
408
409                         if (r == 0)
410                                 r = -errno;
411                 }
412         }
413
414         return r;
415 }
416
/* Bind mount every (source, destination) pair of the string vector l into
 * the container tree rooted at dest.
 *
 * dest:  prefix that is prepended to each destination path
 * l:     vector of consecutive source/destination entries
 * flags: extra mount flags; if non-zero the fresh bind mount is remounted
 *        with MS_REMOUNT|MS_BIND|flags
 *
 * Returns 0 on success, or a negative errno-style value for the first pair
 * that fails (remaining pairs are not processed). */
static int mount_binds(const char *dest, char **l, unsigned long flags) {
        char **x, **y;

        STRV_FOREACH_PAIR(x, y, l) {
                char *where;
                struct stat source_st, dest_st;
                int r;

                /* The source has to exist; its type decides what kind of
                 * mount point gets created further down. */
                if (stat(*x, &source_st) < 0) {
                        log_error("failed to stat %s: %m", *x);
                        return -errno;
                }

                /* NOTE(review): strappenda presumably allocates on the
                 * stack (alloca-style), since where is never freed here --
                 * confirm against its definition. */
                where = strappenda(dest, *y);
                r = stat(where, &dest_st);
                if (r == 0) {
                        /* Destination exists already: it must be of the same file type as the source */
                        if ((source_st.st_mode & S_IFMT) != (dest_st.st_mode & S_IFMT)) {
                                log_error("The file types of %s and %s do not match. Refusing bind mount",
                                                *x, where);
                                return -EINVAL;
                        }
                } else if (errno == ENOENT) {
                        /* Destination is missing: create the directories leading up to it */
                        r = mkdir_parents_label(where, 0755);
                        if (r < 0) {
                                log_error("Failed to bind mount %s: %s", *x, strerror(-r));
                                return r;
                        }
                } else {
                        log_error("Failed to bind mount %s: %s", *x, strerror(errno));
                        return -errno;
                }
                /* Create the mount point, but be conservative -- refuse to
                 * create block and char devices. The results of the
                 * creation calls are not checked here. */
                if (S_ISDIR(source_st.st_mode))
                        mkdir_label(where, 0755);
                else if (S_ISFIFO(source_st.st_mode))
                        mkfifo(where, 0644);
                else if (S_ISSOCK(source_st.st_mode))
                        mknod(where, 0644 | S_IFSOCK, 0);
                else if (S_ISREG(source_st.st_mode))
                        touch(where);
                else {
                        log_error("Refusing to create mountpoint for file: %s", *x);
                        return -ENOTSUP;
                }

                if (mount(*x, where, "bind", MS_BIND, NULL) < 0) {
                        log_error("mount(%s) failed: %m", where);
                        return -errno;
                }

                /* Apply the extra flags in a second step, as a remount of the bind mount */
                if (flags && mount(NULL, where, NULL, MS_REMOUNT|MS_BIND|flags, NULL) < 0) {
                        log_error("mount(%s) failed: %m", where);
                        return -errno;
                }
        }

        return 0;
}
476
477 static int setup_timezone(const char *dest) {
478         _cleanup_free_ char *where = NULL, *p = NULL, *q = NULL, *check = NULL, *what = NULL;
479         char *z, *y;
480         int r;
481
482         assert(dest);
483
484         /* Fix the timezone, if possible */
485         r = readlink_malloc("/etc/localtime", &p);
486         if (r < 0) {
487                 log_warning("/etc/localtime is not a symlink, not updating container timezone.");
488                 return 0;
489         }
490
491         z = path_startswith(p, "../usr/share/zoneinfo/");
492         if (!z)
493                 z = path_startswith(p, "/usr/share/zoneinfo/");
494         if (!z) {
495                 log_warning("/etc/localtime does not point into /usr/share/zoneinfo/, not updating container timezone.");
496                 return 0;
497         }
498
499         where = strappend(dest, "/etc/localtime");
500         if (!where)
501                 return log_oom();
502
503         r = readlink_malloc(where, &q);
504         if (r >= 0) {
505                 y = path_startswith(q, "../usr/share/zoneinfo/");
506                 if (!y)
507                         y = path_startswith(q, "/usr/share/zoneinfo/");
508
509
510                 /* Already pointing to the right place? Then do nothing .. */
511                 if (y && streq(y, z))
512                         return 0;
513         }
514
515         check = strjoin(dest, "/usr/share/zoneinfo/", z, NULL);
516         if (!check)
517                 return log_oom();
518
519         if (access(check, F_OK) < 0) {
520                 log_warning("Timezone %s does not exist in container, not updating container timezone.", z);
521                 return 0;
522         }
523
524         what = strappend("../usr/share/zoneinfo/", z);
525         if (!what)
526                 return log_oom();
527
528         unlink(where);
529         if (symlink(what, where) < 0) {
530                 log_error("Failed to correct timezone of container: %m");
531                 return 0;
532         }
533
534         return 0;
535 }
536
537 static int setup_resolv_conf(const char *dest) {
538         char _cleanup_free_ *where = NULL;
539
540         assert(dest);
541
542         if (arg_private_network)
543                 return 0;
544
545         /* Fix resolv.conf, if possible */
546         where = strappend(dest, "/etc/resolv.conf");
547         if (!where)
548                 return log_oom();
549
550         /* We don't really care for the results of this really. If it
551          * fails, it fails, but meh... */
552         copy_file("/etc/resolv.conf", where, O_TRUNC|O_NOFOLLOW);
553
554         return 0;
555 }
556
/* Give the container its own boot ID: a random 128 bit id is written to a
 * temporary file below <dest>/dev, which is then bind mounted over
 * <dest>/proc/sys/kernel/random/boot_id.
 *
 * Returns 0 on success or a negative errno-style value. Failing to make
 * the bind mount read-only is only logged as a warning. */
static int setup_boot_id(const char *dest) {
        _cleanup_free_ char *from = NULL, *to = NULL;
        sd_id128_t rnd;
        char as_uuid[37]; /* 32 hex digits + 4 dashes + NUL */
        int r;

        assert(dest);

        /* Generate a new randomized boot ID, so that each boot-up of
         * the container gets a new one */

        from = strappend(dest, "/dev/proc-sys-kernel-random-boot-id");
        to = strappend(dest, "/proc/sys/kernel/random/boot_id");
        if (!from || !to)
                return log_oom();

        r = sd_id128_randomize(&rnd);
        if (r < 0) {
                log_error("Failed to generate random boot id: %s", strerror(-r));
                return r;
        }

        /* Format in the dashed 8-4-4-4-12 UUID notation */
        snprintf(as_uuid, sizeof(as_uuid),
                 "%02x%02x%02x%02x-%02x%02x-%02x%02x-%02x%02x-%02x%02x%02x%02x%02x%02x",
                 SD_ID128_FORMAT_VAL(rnd));
        char_array_0(as_uuid);

        r = write_string_file(from, as_uuid);
        if (r < 0) {
                log_error("Failed to write boot id: %s", strerror(-r));
                return r;
        }

        /* Bind mount first, then try to remount read-only */
        if (mount(from, to, "bind", MS_BIND, NULL) < 0) {
                log_error("Failed to bind mount boot id: %m");
                r = -errno;
        } else if (mount(from, to, "bind", MS_BIND|MS_REMOUNT|MS_RDONLY, NULL))
                log_warning("Failed to make boot id read-only: %m");

        /* The temporary file is removed on success and on failure alike */
        unlink(from);
        return r;
}
599
600 static int copy_devnodes(const char *dest) {
601
602         static const char devnodes[] =
603                 "null\0"
604                 "zero\0"
605                 "full\0"
606                 "random\0"
607                 "urandom\0"
608                 "tty\0";
609
610         const char *d;
611         int r = 0;
612         _cleanup_umask_ mode_t u;
613
614         assert(dest);
615
616         u = umask(0000);
617
618         NULSTR_FOREACH(d, devnodes) {
619                 struct stat st;
620                 _cleanup_free_ char *from = NULL, *to = NULL;
621
622                 asprintf(&from, "/dev/%s", d);
623                 asprintf(&to, "%s/dev/%s", dest, d);
624
625                 if (!from || !to) {
626                         log_oom();
627
628                         if (r == 0)
629                                 r = -ENOMEM;
630
631                         break;
632                 }
633
634                 if (stat(from, &st) < 0) {
635
636                         if (errno != ENOENT) {
637                                 log_error("Failed to stat %s: %m", from);
638                                 if (r == 0)
639                                         r = -errno;
640                         }
641
642                 } else if (!S_ISCHR(st.st_mode) && !S_ISBLK(st.st_mode)) {
643
644                         log_error("%s is not a char or block device, cannot copy", from);
645                         if (r == 0)
646                                 r = -EIO;
647
648                 } else if (mknod(to, st.st_mode, st.st_rdev) < 0) {
649
650                         log_error("mknod(%s) failed: %m", dest);
651                         if (r == 0)
652                                 r = -errno;
653                 }
654         }
655
656         return r;
657 }
658
659 static int setup_ptmx(const char *dest) {
660         _cleanup_free_ char *p = NULL;
661
662         p = strappend(dest, "/dev/ptmx");
663         if (!p)
664                 return log_oom();
665
666         if (symlink("pts/ptmx", p) < 0) {
667                 log_error("Failed to create /dev/ptmx symlink: %m");
668                 return -errno;
669         }
670
671         return 0;
672 }
673
674 static int setup_dev_console(const char *dest, const char *console) {
675         struct stat st;
676         _cleanup_free_ char *to = NULL;
677         int r;
678         _cleanup_umask_ mode_t u;
679
680         assert(dest);
681         assert(console);
682
683         u = umask(0000);
684
685         if (stat(console, &st) < 0) {
686                 log_error("Failed to stat %s: %m", console);
687                 return -errno;
688
689         } else if (!S_ISCHR(st.st_mode)) {
690                 log_error("/dev/console is not a char device");
691                 return -EIO;
692         }
693
694         r = chmod_and_chown(console, 0600, 0, 0);
695         if (r < 0) {
696                 log_error("Failed to correct access mode for TTY: %s", strerror(-r));
697                 return r;
698         }
699
700         if (asprintf(&to, "%s/dev/console", dest) < 0)
701                 return log_oom();
702
703         /* We need to bind mount the right tty to /dev/console since
704          * ptys can only exist on pts file systems. To have something
705          * to bind mount things on we create a device node first, that
706          * has the right major/minor (note that the major minor
707          * doesn't actually matter here, since we mount it over
708          * anyway). */
709
710         if (mknod(to, (st.st_mode & ~07777) | 0600, st.st_rdev) < 0) {
711                 log_error("mknod() for /dev/console failed: %m");
712                 return -errno;
713         }
714
715         if (mount(console, to, "bind", MS_BIND, NULL) < 0) {
716                 log_error("Bind mount for /dev/console failed: %m");
717                 return -errno;
718         }
719
720         return 0;
721 }
722
/* Provide a stand-in for /proc/kmsg inside the container.
 *
 * dest:        root of the container tree
 * kmsg_socket: socket over which the opened FIFO fd is sent (SCM_RIGHTS)
 *
 * A FIFO is created as <dest>/dev/kmsg, bind mounted to <dest>/proc/kmsg,
 * opened, its fd passed through kmsg_socket, and finally the /dev/kmsg
 * name is removed again. The umask is cleared while this function runs.
 *
 * Returns 0 on success or a negative errno-style value. */
static int setup_kmsg(const char *dest, int kmsg_socket) {
        _cleanup_free_ char *from = NULL, *to = NULL;
        int r, fd, k;
        _cleanup_umask_ mode_t u;
        /* Control message buffer sized for exactly one file descriptor */
        union {
                struct cmsghdr cmsghdr;
                uint8_t buf[CMSG_SPACE(sizeof(int))];
        } control = {};
        struct msghdr mh = {
                .msg_control = &control,
                .msg_controllen = sizeof(control),
        };
        struct cmsghdr *cmsg;

        assert(dest);
        assert(kmsg_socket >= 0);

        u = umask(0000);

        /* We create the kmsg FIFO as /dev/kmsg, but immediately
         * delete it after bind mounting it to /proc/kmsg. While FIFOs
         * on the reading side behave very similar to /proc/kmsg,
         * their writing side behaves differently from /dev/kmsg in
         * that writing blocks when nothing is reading. In order to
         * avoid any problems with containers deadlocking due to this
         * we simply make /dev/kmsg unavailable to the container. */
        if (asprintf(&from, "%s/dev/kmsg", dest) < 0 ||
            asprintf(&to, "%s/proc/kmsg", dest) < 0)
                return log_oom();

        if (mkfifo(from, 0600) < 0) {
                log_error("mkfifo() for /dev/kmsg failed: %m");
                return -errno;
        }

        r = chmod_and_chown(from, 0600, 0, 0);
        if (r < 0) {
                log_error("Failed to correct access mode for /dev/kmsg: %s", strerror(-r));
                return r;
        }

        if (mount(from, to, "bind", MS_BIND, NULL) < 0) {
                log_error("Bind mount for /proc/kmsg failed: %m");
                return -errno;
        }

        /* Opened read-write and non-blocking (O_NDELAY) */
        fd = open(from, O_RDWR|O_NDELAY|O_CLOEXEC);
        if (fd < 0) {
                log_error("Failed to open fifo: %m");
                return -errno;
        }

        /* Wrap the fd into a single SCM_RIGHTS control message */
        cmsg = CMSG_FIRSTHDR(&mh);
        cmsg->cmsg_level = SOL_SOCKET;
        cmsg->cmsg_type = SCM_RIGHTS;
        cmsg->cmsg_len = CMSG_LEN(sizeof(int));
        memcpy(CMSG_DATA(cmsg), &fd, sizeof(int));

        mh.msg_controllen = cmsg->cmsg_len;

        /* Store away the fd in the socket, so that it stays open as
         * long as we run the child */
        k = sendmsg(kmsg_socket, &mh, MSG_DONTWAIT|MSG_NOSIGNAL);
        /* Our own copy of the fd is closed whether or not sending worked */
        close_nointr_nofail(fd);

        if (k < 0) {
                log_error("Failed to send FIFO fd: %m");
                return -errno;
        }

        /* And now make the FIFO unavailable as /dev/kmsg... */
        unlink(from);
        return 0;
}
797
798 static int setup_hostname(void) {
799
800         if (sethostname(arg_machine, strlen(arg_machine)) < 0)
801                 return -errno;
802
803         return 0;
804 }
805
806 static int setup_journal(const char *directory) {
807         sd_id128_t machine_id, this_id;
808         _cleanup_free_ char *p = NULL, *b = NULL, *q = NULL, *d = NULL;
809         char *id;
810         int r;
811
812         p = strappend(directory, "/etc/machine-id");
813         if (!p)
814                 return log_oom();
815
816         r = read_one_line_file(p, &b);
817         if (r == -ENOENT && arg_link_journal == LINK_AUTO)
818                 return 0;
819         else if (r < 0) {
820                 log_error("Failed to read machine ID from %s: %s", p, strerror(-r));
821                 return r;
822         }
823
824         id = strstrip(b);
825         if (isempty(id) && arg_link_journal == LINK_AUTO)
826                 return 0;
827
828         /* Verify validity */
829         r = sd_id128_from_string(id, &machine_id);
830         if (r < 0) {
831                 log_error("Failed to parse machine ID from %s: %s", p, strerror(-r));
832                 return r;
833         }
834
835         r = sd_id128_get_machine(&this_id);
836         if (r < 0) {
837                 log_error("Failed to retrieve machine ID: %s", strerror(-r));
838                 return r;
839         }
840
841         if (sd_id128_equal(machine_id, this_id)) {
842                 log_full(arg_link_journal == LINK_AUTO ? LOG_WARNING : LOG_ERR,
843                          "Host and machine ids are equal (%s): refusing to link journals", id);
844                 if (arg_link_journal == LINK_AUTO)
845                         return 0;
846                 return
847                         -EEXIST;
848         }
849
850         if (arg_link_journal == LINK_NO)
851                 return 0;
852
853         free(p);
854         p = strappend("/var/log/journal/", id);
855         q = strjoin(directory, "/var/log/journal/", id, NULL);
856         if (!p || !q)
857                 return log_oom();
858
859         if (path_is_mount_point(p, false) > 0) {
860                 if (arg_link_journal != LINK_AUTO) {
861                         log_error("%s: already a mount point, refusing to use for journal", p);
862                         return -EEXIST;
863                 }
864
865                 return 0;
866         }
867
868         if (path_is_mount_point(q, false) > 0) {
869                 if (arg_link_journal != LINK_AUTO) {
870                         log_error("%s: already a mount point, refusing to use for journal", q);
871                         return -EEXIST;
872                 }
873
874                 return 0;
875         }
876
877         r = readlink_and_make_absolute(p, &d);
878         if (r >= 0) {
879                 if ((arg_link_journal == LINK_GUEST ||
880                      arg_link_journal == LINK_AUTO) &&
881                     path_equal(d, q)) {
882
883                         r = mkdir_p(q, 0755);
884                         if (r < 0)
885                                 log_warning("failed to create directory %s: %m", q);
886                         return 0;
887                 }
888
889                 if (unlink(p) < 0) {
890                         log_error("Failed to remove symlink %s: %m", p);
891                         return -errno;
892                 }
893         } else if (r == -EINVAL) {
894
895                 if (arg_link_journal == LINK_GUEST &&
896                     rmdir(p) < 0) {
897
898                         if (errno == ENOTDIR) {
899                                 log_error("%s already exists and is neither a symlink nor a directory", p);
900                                 return r;
901                         } else {
902                                 log_error("Failed to remove %s: %m", p);
903                                 return -errno;
904                         }
905                 }
906         } else if (r != -ENOENT) {
907                 log_error("readlink(%s) failed: %m", p);
908                 return r;
909         }
910
911         if (arg_link_journal == LINK_GUEST) {
912
913                 if (symlink(q, p) < 0) {
914                         log_error("Failed to symlink %s to %s: %m", q, p);
915                         return -errno;
916                 }
917
918                 r = mkdir_p(q, 0755);
919                 if (r < 0)
920                         log_warning("failed to create directory %s: %m", q);
921                 return 0;
922         }
923
924         if (arg_link_journal == LINK_HOST) {
925                 r = mkdir_p(p, 0755);
926                 if (r < 0) {
927                         log_error("Failed to create %s: %m", p);
928                         return r;
929                 }
930
931         } else if (access(p, F_OK) < 0)
932                 return 0;
933
934         if (dir_is_empty(q) == 0) {
935                 log_error("%s not empty.", q);
936                 return -ENOTEMPTY;
937         }
938
939         r = mkdir_p(q, 0755);
940         if (r < 0) {
941                 log_error("Failed to create %s: %m", q);
942                 return r;
943         }
944
945         if (mount(p, q, "bind", MS_BIND, NULL) < 0) {
946                 log_error("Failed to bind mount journal from host into guest: %m");
947                 return -errno;
948         }
949
950         return 0;
951 }
952
/* Bind mount the kdbus namespace directory 'path' to /dev/kdbus below
 * the container tree 'dest'.
 *
 * A NULL 'path' means there is nothing to expose, which is not an
 * error. Returns 0 on success, a negated errno value otherwise. */
static int setup_kdbus(const char *dest, const char *path) {
        const char *target;

        if (path == NULL)
                return 0;

        target = strappenda(dest, "/dev/kdbus");

        if (mkdir(target, 0755) < 0) {
                log_error("Failed to create kdbus path: %m");
                return -errno;
        }

        if (mount(path, target, "bind", MS_BIND, NULL) < 0) {
                log_error("Failed to mount kdbus namespace path: %m");
                return -errno;
        }

        return 0;
}
972
973 static int drop_capabilities(void) {
974         return capability_bounding_set_drop(~arg_retain, false);
975 }
976
977 static int register_machine(void) {
978         _cleanup_bus_error_free_ sd_bus_error error = SD_BUS_ERROR_NULL;
979         _cleanup_bus_unref_ sd_bus *bus = NULL;
980         int r;
981
982         r = sd_bus_open_system(&bus);
983         if (r < 0) {
984                 log_error("Failed to open system bus: %s", strerror(-r));
985                 return r;
986         }
987
988         r = sd_bus_call_method(
989                         bus,
990                         "org.freedesktop.machine1",
991                         "/org/freedesktop/machine1",
992                         "org.freedesktop.machine1.Manager",
993                         "CreateMachine",
994                         &error,
995                         NULL,
996                         "sayssusa(sv)",
997                         arg_machine,
998                         SD_BUS_MESSAGE_APPEND_ID128(arg_uuid),
999                         "nspawn",
1000                         "container",
1001                         (uint32_t) 0,
1002                         strempty(arg_directory),
1003                         !isempty(arg_slice), "Slice", "s", arg_slice);
1004         if (r < 0) {
1005                 log_error("Failed to register machine: %s", bus_error_message(&error, r));
1006                 return r;
1007         }
1008
1009         return 0;
1010 }
1011
1012 static int terminate_machine(pid_t pid) {
1013         _cleanup_bus_error_free_ sd_bus_error error = SD_BUS_ERROR_NULL;
1014         _cleanup_bus_message_unref_ sd_bus_message *reply = NULL;
1015         _cleanup_bus_unref_ sd_bus *bus = NULL;
1016         const char *path;
1017         int r;
1018
1019         r = sd_bus_default_system(&bus);
1020         if (r < 0) {
1021                 log_error("Failed to open system bus: %s", strerror(-r));
1022                 return r;
1023         }
1024
1025         r = sd_bus_call_method(
1026                         bus,
1027                         "org.freedesktop.machine1",
1028                         "/org/freedesktop/machine1",
1029                         "org.freedesktop.machine1.Manager",
1030                         "GetMachineByPID",
1031                         &error,
1032                         &reply,
1033                         "u",
1034                         (uint32_t) pid);
1035         if (r < 0) {
1036                 /* Note that the machine might already have been
1037                  * cleaned up automatically, hence don't consider it a
1038                  * failure if we cannot get the machine object. */
1039                 log_debug("Failed to get machine: %s", bus_error_message(&error, r));
1040                 return 0;
1041         }
1042
1043         r = sd_bus_message_read(reply, "o", &path);
1044         if (r < 0)
1045                 return bus_log_parse_error(r);
1046
1047         r = sd_bus_call_method(
1048                         bus,
1049                         "org.freedesktop.machine1",
1050                         path,
1051                         "org.freedesktop.machine1.Machine",
1052                         "Terminate",
1053                         &error,
1054                         NULL,
1055                         NULL);
1056         if (r < 0) {
1057                 log_debug("Failed to terminate machine: %s", bus_error_message(&error, r));
1058                 return 0;
1059         }
1060
1061         return 0;
1062 }
1063
/* Probe for the kernel audit subsystem: returns true if a
 * NETLINK_AUDIT socket can be created, false otherwise. The probe
 * socket is closed again right away. */
static bool audit_enabled(void) {
        int probe_fd = socket(AF_NETLINK, SOCK_RAW, NETLINK_AUDIT);

        if (probe_fd < 0)
                return false;

        close_nointr_nofail(probe_fd);
        return true;
}
1074
1075 int main(int argc, char *argv[]) {
1076         pid_t pid = 0;
1077         int r = EXIT_FAILURE, k;
1078         _cleanup_close_ int master = -1, kdbus_fd = -1;
1079         int n_fd_passed;
1080         const char *console = NULL;
1081         sigset_t mask;
1082         _cleanup_close_pipe_ int kmsg_socket_pair[2] = { -1, -1 };
1083         _cleanup_fdset_free_ FDSet *fds = NULL;
1084         _cleanup_free_ char *kdbus_namespace = NULL;
1085         const char *ns;
1086
1087         log_parse_environment();
1088         log_open();
1089
1090         k = parse_argv(argc, argv);
1091         if (k < 0)
1092                 goto finish;
1093         else if (k == 0) {
1094                 r = EXIT_SUCCESS;
1095                 goto finish;
1096         }
1097
1098         if (arg_directory) {
1099                 char *p;
1100
1101                 p = path_make_absolute_cwd(arg_directory);
1102                 free(arg_directory);
1103                 arg_directory = p;
1104         } else
1105                 arg_directory = get_current_dir_name();
1106
1107         if (!arg_directory) {
1108                 log_error("Failed to determine path, please use -D.");
1109                 goto finish;
1110         }
1111
1112         path_kill_slashes(arg_directory);
1113
1114         if (!arg_machine) {
1115                 arg_machine = strdup(basename(arg_directory));
1116                 if (!arg_machine) {
1117                         log_oom();
1118                         goto finish;
1119                 }
1120
1121                 hostname_cleanup(arg_machine, false);
1122                 if (isempty(arg_machine)) {
1123                         log_error("Failed to determine machine name automatically, please use -M.");
1124                         goto finish;
1125                 }
1126         }
1127
1128         if (geteuid() != 0) {
1129                 log_error("Need to be root.");
1130                 goto finish;
1131         }
1132
1133         if (sd_booted() <= 0) {
1134                 log_error("Not running on a systemd system.");
1135                 goto finish;
1136         }
1137
1138         if (arg_boot && audit_enabled()) {
1139                 log_warning("The kernel auditing subsystem is known to be incompatible with containers.\n"
1140                             "Please make sure to turn off auditing with 'audit=0' on the kernel command\n"
1141                             "line before using systemd-nspawn. Sleeping for 5s...\n");
1142                 sleep(5);
1143         }
1144
1145         if (path_equal(arg_directory, "/")) {
1146                 log_error("Spawning container on root directory not supported.");
1147                 goto finish;
1148         }
1149
1150         if (path_is_os_tree(arg_directory) <= 0) {
1151                 log_error("Directory %s doesn't look like an OS root directory (/etc/os-release is missing). Refusing.", arg_directory);
1152                 goto finish;
1153         }
1154
1155         log_close();
1156         n_fd_passed = sd_listen_fds(false);
1157         if (n_fd_passed > 0) {
1158                 k = fdset_new_listen_fds(&fds, false);
1159                 if (k < 0) {
1160                         log_error("Failed to collect file descriptors: %s", strerror(-k));
1161                         goto finish;
1162                 }
1163         }
1164         fdset_close_others(fds);
1165         log_open();
1166
1167         master = posix_openpt(O_RDWR|O_NOCTTY|O_CLOEXEC|O_NDELAY);
1168         if (master < 0) {
1169                 log_error("Failed to acquire pseudo tty: %m");
1170                 goto finish;
1171         }
1172
1173         console = ptsname(master);
1174         if (!console) {
1175                 log_error("Failed to determine tty name: %m");
1176                 goto finish;
1177         }
1178
1179         log_info("Spawning container %s on %s. Press ^] three times within 1s to abort execution.", arg_machine, arg_directory);
1180
1181         if (unlockpt(master) < 0) {
1182                 log_error("Failed to unlock tty: %m");
1183                 goto finish;
1184         }
1185
1186         ns = strappenda("machine-", arg_machine);
1187         kdbus_fd = bus_kernel_create_namespace(ns, &kdbus_namespace);
1188         if (r < 0)
1189                 log_debug("Failed to create kdbus namespace: %s", strerror(-r));
1190         else
1191                 log_debug("Successfully created kdbus namespace as %s", kdbus_namespace);
1192
1193         if (socketpair(AF_UNIX, SOCK_DGRAM|SOCK_NONBLOCK|SOCK_CLOEXEC, 0, kmsg_socket_pair) < 0) {
1194                 log_error("Failed to create kmsg socket pair.");
1195                 goto finish;
1196         }
1197
1198         sd_notify(0, "READY=1");
1199
1200         assert_se(sigemptyset(&mask) == 0);
1201         sigset_add_many(&mask, SIGCHLD, SIGWINCH, SIGTERM, SIGINT, -1);
1202         assert_se(sigprocmask(SIG_BLOCK, &mask, NULL) == 0);
1203
1204         for (;;) {
1205                 siginfo_t status;
1206
1207                 pid = syscall(__NR_clone, SIGCHLD|CLONE_NEWIPC|CLONE_NEWNS|CLONE_NEWPID|CLONE_NEWUTS|(arg_private_network ? CLONE_NEWNET : 0), NULL);
1208                 if (pid < 0) {
1209                         if (errno == EINVAL)
1210                                 log_error("clone() failed, do you have namespace support enabled in your kernel? (You need UTS, IPC, PID and NET namespacing built in): %m");
1211                         else
1212                                 log_error("clone() failed: %m");
1213
1214                         goto finish;
1215                 }
1216
1217                 if (pid == 0) {
1218                         /* child */
1219                         const char *home = NULL;
1220                         uid_t uid = (uid_t) -1;
1221                         gid_t gid = (gid_t) -1;
1222                         unsigned n_env = 2;
1223                         const char *envp[] = {
1224                                 "PATH=/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin",
1225                                 "container=systemd-nspawn", /* LXC sets container=lxc, so follow the scheme here */
1226                                 NULL, /* TERM */
1227                                 NULL, /* HOME */
1228                                 NULL, /* USER */
1229                                 NULL, /* LOGNAME */
1230                                 NULL, /* container_uuid */
1231                                 NULL, /* LISTEN_FDS */
1232                                 NULL, /* LISTEN_PID */
1233                                 NULL
1234                         };
1235
1236                         envp[n_env] = strv_find_prefix(environ, "TERM=");
1237                         if (envp[n_env])
1238                                 n_env ++;
1239
1240                         close_nointr_nofail(master);
1241                         master = -1;
1242
1243                         close_nointr(STDIN_FILENO);
1244                         close_nointr(STDOUT_FILENO);
1245                         close_nointr(STDERR_FILENO);
1246
1247                         close_nointr_nofail(kmsg_socket_pair[0]);
1248                         kmsg_socket_pair[0] = -1;
1249
1250                         reset_all_signal_handlers();
1251
1252                         assert_se(sigemptyset(&mask) == 0);
1253                         assert_se(sigprocmask(SIG_SETMASK, &mask, NULL) == 0);
1254
1255                         k = open_terminal(console, O_RDWR);
1256                         if (k != STDIN_FILENO) {
1257                                 if (k >= 0) {
1258                                         close_nointr_nofail(k);
1259                                         k = -EINVAL;
1260                                 }
1261
1262                                 log_error("Failed to open console: %s", strerror(-k));
1263                                 goto child_fail;
1264                         }
1265
1266                         if (dup2(STDIN_FILENO, STDOUT_FILENO) != STDOUT_FILENO ||
1267                             dup2(STDIN_FILENO, STDERR_FILENO) != STDERR_FILENO) {
1268                                 log_error("Failed to duplicate console: %m");
1269                                 goto child_fail;
1270                         }
1271
1272                         if (setsid() < 0) {
1273                                 log_error("setsid() failed: %m");
1274                                 goto child_fail;
1275                         }
1276
1277                         if (prctl(PR_SET_PDEATHSIG, SIGKILL) < 0) {
1278                                 log_error("PR_SET_PDEATHSIG failed: %m");
1279                                 goto child_fail;
1280                         }
1281
1282                         r = register_machine();
1283                         if (r < 0)
1284                                 goto finish;
1285
1286                         /* Mark everything as slave, so that we still
1287                          * receive mounts from the real root, but don't
1288                          * propagate mounts to the real root. */
1289                         if (mount(NULL, "/", NULL, MS_SLAVE|MS_REC, NULL) < 0) {
1290                                 log_error("MS_SLAVE|MS_REC failed: %m");
1291                                 goto child_fail;
1292                         }
1293
1294                         /* Turn directory into bind mount */
1295                         if (mount(arg_directory, arg_directory, "bind", MS_BIND|MS_REC, NULL) < 0) {
1296                                 log_error("Failed to make bind mount.");
1297                                 goto child_fail;
1298                         }
1299
1300                         if (arg_read_only)
1301                                 if (mount(arg_directory, arg_directory, "bind", MS_BIND|MS_REMOUNT|MS_RDONLY|MS_REC, NULL) < 0) {
1302                                         log_error("Failed to make read-only.");
1303                                         goto child_fail;
1304                                 }
1305
1306                         if (mount_all(arg_directory) < 0)
1307                                 goto child_fail;
1308
1309                         if (copy_devnodes(arg_directory) < 0)
1310                                 goto child_fail;
1311
1312                         if (setup_ptmx(arg_directory) < 0)
1313                                 goto child_fail;
1314
1315                         dev_setup(arg_directory);
1316
1317                         if (setup_dev_console(arg_directory, console) < 0)
1318                                 goto child_fail;
1319
1320                         if (setup_kmsg(arg_directory, kmsg_socket_pair[1]) < 0)
1321                                 goto child_fail;
1322
1323                         close_nointr_nofail(kmsg_socket_pair[1]);
1324                         kmsg_socket_pair[1] = -1;
1325
1326                         if (setup_boot_id(arg_directory) < 0)
1327                                 goto child_fail;
1328
1329                         if (setup_timezone(arg_directory) < 0)
1330                                 goto child_fail;
1331
1332                         if (setup_resolv_conf(arg_directory) < 0)
1333                                 goto child_fail;
1334
1335                         if (setup_journal(arg_directory) < 0)
1336                                 goto child_fail;
1337
1338                         if (mount_binds(arg_directory, arg_bind, 0) < 0)
1339                                 goto child_fail;
1340
1341                         if (mount_binds(arg_directory, arg_bind_ro, MS_RDONLY) < 0)
1342                                 goto child_fail;
1343
1344                         if (setup_kdbus(arg_directory, kdbus_namespace) < 0)
1345                                 goto child_fail;
1346
1347                         if (chdir(arg_directory) < 0) {
1348                                 log_error("chdir(%s) failed: %m", arg_directory);
1349                                 goto child_fail;
1350                         }
1351
1352                         if (mount(arg_directory, "/", NULL, MS_MOVE, NULL) < 0) {
1353                                 log_error("mount(MS_MOVE) failed: %m");
1354                                 goto child_fail;
1355                         }
1356
1357                         if (chroot(".") < 0) {
1358                                 log_error("chroot() failed: %m");
1359                                 goto child_fail;
1360                         }
1361
1362                         if (chdir("/") < 0) {
1363                                 log_error("chdir() failed: %m");
1364                                 goto child_fail;
1365                         }
1366
1367                         umask(0022);
1368
1369                         loopback_setup();
1370
1371                         if (drop_capabilities() < 0) {
1372                                 log_error("drop_capabilities() failed: %m");
1373                                 goto child_fail;
1374                         }
1375
1376                         if (arg_user) {
1377
1378                                 /* Note that this resolves user names
1379                                  * inside the container, and hence
1380                                  * accesses the NSS modules from the
1381                                  * container and not the host. This is
1382                                  * a bit weird... */
1383
1384                                 if (get_user_creds((const char**)&arg_user, &uid, &gid, &home, NULL) < 0) {
1385                                         log_error("get_user_creds() failed: %m");
1386                                         goto child_fail;
1387                                 }
1388
1389                                 if (mkdir_parents_label(home, 0775) < 0) {
1390                                         log_error("mkdir_parents_label() failed: %m");
1391                                         goto child_fail;
1392                                 }
1393
1394                                 if (mkdir_safe_label(home, 0775, uid, gid) < 0) {
1395                                         log_error("mkdir_safe_label() failed: %m");
1396                                         goto child_fail;
1397                                 }
1398
1399                                 if (initgroups((const char*)arg_user, gid) < 0) {
1400                                         log_error("initgroups() failed: %m");
1401                                         goto child_fail;
1402                                 }
1403
1404                                 if (setresgid(gid, gid, gid) < 0) {
1405                                         log_error("setregid() failed: %m");
1406                                         goto child_fail;
1407                                 }
1408
1409                                 if (setresuid(uid, uid, uid) < 0) {
1410                                         log_error("setreuid() failed: %m");
1411                                         goto child_fail;
1412                                 }
1413                         } else {
1414                                 /* Reset everything fully to 0, just in case */
1415
1416                                 if (setgroups(0, NULL) < 0) {
1417                                         log_error("setgroups() failed: %m");
1418                                         goto child_fail;
1419                                 }
1420
1421                                 if (setresgid(0, 0, 0) < 0) {
1422                                         log_error("setregid() failed: %m");
1423                                         goto child_fail;
1424                                 }
1425
1426                                 if (setresuid(0, 0, 0) < 0) {
1427                                         log_error("setreuid() failed: %m");
1428                                         goto child_fail;
1429                                 }
1430                         }
1431
1432                         if ((asprintf((char**)(envp + n_env++), "HOME=%s", home ? home: "/root") < 0) ||
1433                             (asprintf((char**)(envp + n_env++), "USER=%s", arg_user ? arg_user : "root") < 0) ||
1434                             (asprintf((char**)(envp + n_env++), "LOGNAME=%s", arg_user ? arg_user : "root") < 0)) {
1435                                 log_oom();
1436                                 goto child_fail;
1437                         }
1438
1439                         if (!sd_id128_equal(arg_uuid, SD_ID128_NULL)) {
1440                                 if (asprintf((char**)(envp + n_env++), "container_uuid=" SD_ID128_FORMAT_STR, SD_ID128_FORMAT_VAL(arg_uuid)) < 0) {
1441                                         log_oom();
1442                                         goto child_fail;
1443                                 }
1444                         }
1445
1446                         if (fdset_size(fds) > 0) {
1447                                 k = fdset_cloexec(fds, false);
1448                                 if (k < 0) {
1449                                         log_error("Failed to unset O_CLOEXEC for file descriptors.");
1450                                         goto child_fail;
1451                                 }
1452
1453                                 if ((asprintf((char **)(envp + n_env++), "LISTEN_FDS=%u", n_fd_passed) < 0) ||
1454                                     (asprintf((char **)(envp + n_env++), "LISTEN_PID=1") < 0)) {
1455                                         log_oom();
1456                                         goto child_fail;
1457                                 }
1458                         }
1459
1460                         setup_hostname();
1461
1462                         if (arg_boot) {
1463                                 char **a;
1464                                 size_t l;
1465
1466                                 /* Automatically search for the init system */
1467
1468                                 l = 1 + argc - optind;
1469                                 a = newa(char*, l + 1);
1470                                 memcpy(a + 1, argv + optind, l * sizeof(char*));
1471
1472                                 a[0] = (char*) "/usr/lib/systemd/systemd";
1473                                 execve(a[0], a, (char**) envp);
1474
1475                                 a[0] = (char*) "/lib/systemd/systemd";
1476                                 execve(a[0], a, (char**) envp);
1477
1478                                 a[0] = (char*) "/sbin/init";
1479                                 execve(a[0], a, (char**) envp);
1480                         } else if (argc > optind)
1481                                 execvpe(argv[optind], argv + optind, (char**) envp);
1482                         else {
1483                                 chdir(home ? home : "/root");
1484                                 execle("/bin/bash", "-bash", NULL, (char**) envp);
1485                         }
1486
1487                         log_error("execv() failed: %m");
1488
1489                 child_fail:
1490                         _exit(EXIT_FAILURE);
1491                 }
1492
1493                 fdset_free(fds);
1494                 fds = NULL;
1495
1496                 k = process_pty(master, &mask, arg_boot ? pid : 0, SIGRTMIN+3);
1497                 if (k < 0) {
1498                         r = EXIT_FAILURE;
1499                         break;
1500                 }
1501
1502                 putc('\n', stdout);
1503
1504                 /* Kill if it is not dead yet anyway */
1505                 terminate_machine(pid);
1506
1507                 /* Redundant, but better safe than sorry */
1508                 kill(pid, SIGKILL);
1509
1510                 k = wait_for_terminate(pid, &status);
1511                 pid = 0;
1512
1513                 if (k < 0) {
1514                         r = EXIT_FAILURE;
1515                         break;
1516                 }
1517
1518                 if (status.si_code == CLD_EXITED) {
1519                         r = status.si_status;
1520                         if (status.si_status != 0) {
1521                                 log_error("Container %s failed with error code %i.", arg_machine, status.si_status);
1522                                 break;
1523                         }
1524
1525                         log_debug("Container %s exited successfully.", arg_machine);
1526                         break;
1527                 } else if (status.si_code == CLD_KILLED &&
1528                            status.si_status == SIGINT) {
1529                         log_info("Container %s has been shut down.", arg_machine);
1530                         r = 0;
1531                         break;
1532                 } else if (status.si_code == CLD_KILLED &&
1533                            status.si_status == SIGHUP) {
1534                         log_info("Container %s is being rebooted.", arg_machine);
1535                         continue;
1536                 } else if (status.si_code == CLD_KILLED ||
1537                            status.si_code == CLD_DUMPED) {
1538
1539                         log_error("Container %s terminated by signal %s.", arg_machine,  signal_to_string(status.si_status));
1540                         r = EXIT_FAILURE;
1541                         break;
1542                 } else {
1543                         log_error("Container %s failed due to unknown reason.", arg_machine);
1544                         r = EXIT_FAILURE;
1545                         break;
1546                 }
1547         }
1548
1549 finish:
1550         if (pid > 0)
1551                 kill(pid, SIGKILL);
1552
1553         free(arg_directory);
1554         free(arg_machine);
1555
1556         return r;
1557 }