chiark / gitweb /
Get rid of our reimplementation of basename
[elogind.git] / src / nspawn / nspawn.c
1 /*-*- Mode: C; c-basic-offset: 8; indent-tabs-mode: nil -*-*/
2
3 /***
4   This file is part of systemd.
5
6   Copyright 2010 Lennart Poettering
7
8   systemd is free software; you can redistribute it and/or modify it
9   under the terms of the GNU Lesser General Public License as published by
10   the Free Software Foundation; either version 2.1 of the License, or
11   (at your option) any later version.
12
13   systemd is distributed in the hope that it will be useful, but
14   WITHOUT ANY WARRANTY; without even the implied warranty of
15   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
16   Lesser General Public License for more details.
17
18   You should have received a copy of the GNU Lesser General Public License
19   along with systemd; If not, see <http://www.gnu.org/licenses/>.
20 ***/
21
22 #include <signal.h>
23 #include <sched.h>
24 #include <unistd.h>
25 #include <sys/types.h>
26 #include <sys/syscall.h>
27 #include <sys/mount.h>
28 #include <sys/wait.h>
29 #include <stdlib.h>
30 #include <string.h>
31 #include <stdio.h>
32 #include <errno.h>
33 #include <sys/prctl.h>
34 #include <sys/capability.h>
35 #include <getopt.h>
36 #include <termios.h>
37 #include <sys/signalfd.h>
38 #include <grp.h>
39 #include <linux/fs.h>
40 #include <sys/un.h>
41 #include <sys/socket.h>
42 #include <linux/netlink.h>
43
44 #include "sd-daemon.h"
45 #include "sd-bus.h"
46 #include "sd-id128.h"
47 #include "log.h"
48 #include "util.h"
49 #include "mkdir.h"
50 #include "macro.h"
51 #include "audit.h"
52 #include "missing.h"
53 #include "cgroup-util.h"
54 #include "strv.h"
55 #include "path-util.h"
56 #include "loopback-setup.h"
57 #include "dev-setup.h"
58 #include "fdset.h"
59 #include "build.h"
60 #include "fileio.h"
61 #include "bus-util.h"
62 #include "bus-error.h"
63 #include "ptyfwd.h"
64 #include "bus-kernel.h"
65
66 #ifndef TTY_GID
67 #define TTY_GID 5
68 #endif
69
/* Journal linking mode, as selected with --link-journal= (or -j).
 * The enumerators correspond to the option values "no", "auto",
 * "host" and "guest" respectively. */
typedef enum LinkJournal {
        LINK_NO    = 0,
        LINK_AUTO  = 1,
        LINK_HOST  = 2,
        LINK_GUEST = 3
} LinkJournal;
76
77 static char *arg_directory = NULL;
78 static char *arg_user = NULL;
79 static sd_id128_t arg_uuid = {};
80 static char *arg_machine = NULL;
81 static const char *arg_slice = NULL;
82 static bool arg_private_network = false;
83 static bool arg_read_only = false;
84 static bool arg_boot = false;
85 static LinkJournal arg_link_journal = LINK_AUTO;
86 static uint64_t arg_retain =
87         (1ULL << CAP_CHOWN) |
88         (1ULL << CAP_DAC_OVERRIDE) |
89         (1ULL << CAP_DAC_READ_SEARCH) |
90         (1ULL << CAP_FOWNER) |
91         (1ULL << CAP_FSETID) |
92         (1ULL << CAP_IPC_OWNER) |
93         (1ULL << CAP_KILL) |
94         (1ULL << CAP_LEASE) |
95         (1ULL << CAP_LINUX_IMMUTABLE) |
96         (1ULL << CAP_NET_BIND_SERVICE) |
97         (1ULL << CAP_NET_BROADCAST) |
98         (1ULL << CAP_NET_RAW) |
99         (1ULL << CAP_SETGID) |
100         (1ULL << CAP_SETFCAP) |
101         (1ULL << CAP_SETPCAP) |
102         (1ULL << CAP_SETUID) |
103         (1ULL << CAP_SYS_ADMIN) |
104         (1ULL << CAP_SYS_CHROOT) |
105         (1ULL << CAP_SYS_NICE) |
106         (1ULL << CAP_SYS_PTRACE) |
107         (1ULL << CAP_SYS_TTY_CONFIG) |
108         (1ULL << CAP_SYS_RESOURCE) |
109         (1ULL << CAP_SYS_BOOT) |
110         (1ULL << CAP_AUDIT_WRITE) |
111         (1ULL << CAP_AUDIT_CONTROL);
112 static char **arg_bind = NULL;
113 static char **arg_bind_ro = NULL;
114
/* Prints the usage summary to stdout. Always returns 0. */
static int help(void) {

        /* Note: -j is documented as "guest" because that is the mode
         * parse_argv() actually selects for it (LINK_GUEST). */
        printf("%s [OPTIONS...] [PATH] [ARGUMENTS...]\n\n"
               "Spawn a minimal namespace container for debugging, testing and building.\n\n"
               "  -h --help                Show this help\n"
               "     --version             Print version string\n"
               "  -D --directory=NAME      Root directory for the container\n"
               "  -b --boot                Boot up full system (i.e. invoke init)\n"
               "  -u --user=USER           Run the command under specified user or uid\n"
               "     --uuid=UUID           Set a specific machine UUID for the container\n"
               "  -M --machine=NAME        Set the machine name for the container\n"
               "  -S --slice=SLICE         Place the container in the specified slice\n"
               "     --private-network     Disable network in container\n"
               "     --read-only           Mount the root directory read-only\n"
               "     --capability=CAP      In addition to the default, retain specified\n"
               "                           capability\n"
               "     --drop-capability=CAP Drop the specified capability from the default set\n"
               "     --link-journal=MODE   Link up guest journal, one of no, auto, guest, host\n"
               "  -j                       Equivalent to --link-journal=guest\n"
               "     --bind=PATH[:PATH]    Bind mount a file or directory from the host into\n"
               "                           the container\n"
               "     --bind-ro=PATH[:PATH] Similar, but creates a read-only bind mount\n",
               program_invocation_short_name);

        return 0;
}
141
142 static int parse_argv(int argc, char *argv[]) {
143
144         enum {
145                 ARG_VERSION = 0x100,
146                 ARG_PRIVATE_NETWORK,
147                 ARG_UUID,
148                 ARG_READ_ONLY,
149                 ARG_CAPABILITY,
150                 ARG_DROP_CAPABILITY,
151                 ARG_LINK_JOURNAL,
152                 ARG_BIND,
153                 ARG_BIND_RO
154         };
155
156         static const struct option options[] = {
157                 { "help",            no_argument,       NULL, 'h'                 },
158                 { "version",         no_argument,       NULL, ARG_VERSION         },
159                 { "directory",       required_argument, NULL, 'D'                 },
160                 { "user",            required_argument, NULL, 'u'                 },
161                 { "private-network", no_argument,       NULL, ARG_PRIVATE_NETWORK },
162                 { "boot",            no_argument,       NULL, 'b'                 },
163                 { "uuid",            required_argument, NULL, ARG_UUID            },
164                 { "read-only",       no_argument,       NULL, ARG_READ_ONLY       },
165                 { "capability",      required_argument, NULL, ARG_CAPABILITY      },
166                 { "drop-capability", required_argument, NULL, ARG_DROP_CAPABILITY },
167                 { "link-journal",    required_argument, NULL, ARG_LINK_JOURNAL    },
168                 { "bind",            required_argument, NULL, ARG_BIND            },
169                 { "bind-ro",         required_argument, NULL, ARG_BIND_RO         },
170                 { "machine",         required_argument, NULL, 'M'                 },
171                 { "slice",           required_argument, NULL, 'S'                 },
172                 {}
173         };
174
175         int c, r;
176
177         assert(argc >= 0);
178         assert(argv);
179
180         while ((c = getopt_long(argc, argv, "+hD:u:bM:jS:", options, NULL)) >= 0) {
181
182                 switch (c) {
183
184                 case 'h':
185                         return help();
186
187                 case ARG_VERSION:
188                         puts(PACKAGE_STRING);
189                         puts(SYSTEMD_FEATURES);
190                         return 0;
191
192                 case 'D':
193                         free(arg_directory);
194                         arg_directory = canonicalize_file_name(optarg);
195                         if (!arg_directory) {
196                                 log_error("Invalid root directory: %m");
197                                 return -ENOMEM;
198                         }
199
200                         break;
201
202                 case 'u':
203                         free(arg_user);
204                         arg_user = strdup(optarg);
205                         if (!arg_user)
206                                 return log_oom();
207
208                         break;
209
210                 case ARG_PRIVATE_NETWORK:
211                         arg_private_network = true;
212                         break;
213
214                 case 'b':
215                         arg_boot = true;
216                         break;
217
218                 case ARG_UUID:
219                         r = sd_id128_from_string(optarg, &arg_uuid);
220                         if (r < 0) {
221                                 log_error("Invalid UUID: %s", optarg);
222                                 return r;
223                         }
224                         break;
225
226                 case 'S':
227                         arg_slice = strdup(optarg);
228                         if (!arg_slice)
229                                 return log_oom();
230
231                         break;
232
233                 case 'M':
234                         if (!hostname_is_valid(optarg)) {
235                                 log_error("Invalid machine name: %s", optarg);
236                                 return -EINVAL;
237                         }
238
239                         free(arg_machine);
240                         arg_machine = strdup(optarg);
241                         if (!arg_machine)
242                                 return log_oom();
243
244                         break;
245
246                 case ARG_READ_ONLY:
247                         arg_read_only = true;
248                         break;
249
250                 case ARG_CAPABILITY:
251                 case ARG_DROP_CAPABILITY: {
252                         char *state, *word;
253                         size_t length;
254
255                         FOREACH_WORD_SEPARATOR(word, length, optarg, ",", state) {
256                                 cap_value_t cap;
257                                 char *t;
258
259                                 t = strndup(word, length);
260                                 if (!t)
261                                         return log_oom();
262
263                                 if (cap_from_name(t, &cap) < 0) {
264                                         log_error("Failed to parse capability %s.", t);
265                                         free(t);
266                                         return -EINVAL;
267                                 }
268
269                                 free(t);
270
271                                 if (c == ARG_CAPABILITY)
272                                         arg_retain |= 1ULL << (uint64_t) cap;
273                                 else
274                                         arg_retain &= ~(1ULL << (uint64_t) cap);
275                         }
276
277                         break;
278                 }
279
280                 case 'j':
281                         arg_link_journal = LINK_GUEST;
282                         break;
283
284                 case ARG_LINK_JOURNAL:
285                         if (streq(optarg, "auto"))
286                                 arg_link_journal = LINK_AUTO;
287                         else if (streq(optarg, "no"))
288                                 arg_link_journal = LINK_NO;
289                         else if (streq(optarg, "guest"))
290                                 arg_link_journal = LINK_GUEST;
291                         else if (streq(optarg, "host"))
292                                 arg_link_journal = LINK_HOST;
293                         else {
294                                 log_error("Failed to parse link journal mode %s", optarg);
295                                 return -EINVAL;
296                         }
297
298                         break;
299
300                 case ARG_BIND:
301                 case ARG_BIND_RO: {
302                         _cleanup_free_ char *a = NULL, *b = NULL;
303                         char *e;
304                         char ***x;
305
306                         x = c == ARG_BIND ? &arg_bind : &arg_bind_ro;
307
308                         e = strchr(optarg, ':');
309                         if (e) {
310                                 a = strndup(optarg, e - optarg);
311                                 b = strdup(e + 1);
312                         } else {
313                                 a = strdup(optarg);
314                                 b = strdup(optarg);
315                         }
316
317                         if (!a || !b)
318                                 return log_oom();
319
320                         if (!path_is_absolute(a) || !path_is_absolute(b)) {
321                                 log_error("Invalid bind mount specification: %s", optarg);
322                                 return -EINVAL;
323                         }
324
325                         r = strv_extend(x, a);
326                         if (r < 0)
327                                 return log_oom();
328
329                         r = strv_extend(x, b);
330                         if (r < 0)
331                                 return log_oom();
332
333                         break;
334                 }
335
336                 case '?':
337                         return -EINVAL;
338
339                 default:
340                         assert_not_reached("Unhandled option");
341                 }
342         }
343
344         return 1;
345 }
346
347 static int mount_all(const char *dest) {
348
349         typedef struct MountPoint {
350                 const char *what;
351                 const char *where;
352                 const char *type;
353                 const char *options;
354                 unsigned long flags;
355                 bool fatal;
356         } MountPoint;
357
358         static const MountPoint mount_table[] = {
359                 { "proc",      "/proc",     "proc",  NULL,       MS_NOSUID|MS_NOEXEC|MS_NODEV, true  },
360                 { "/proc/sys", "/proc/sys", NULL,    NULL,       MS_BIND, true                       },   /* Bind mount first */
361                 { NULL,        "/proc/sys", NULL,    NULL,       MS_BIND|MS_RDONLY|MS_REMOUNT, true  },   /* Then, make it r/o */
362                 { "sysfs",     "/sys",      "sysfs", NULL,       MS_RDONLY|MS_NOSUID|MS_NOEXEC|MS_NODEV, true  },
363                 { "tmpfs",     "/dev",      "tmpfs", "mode=755", MS_NOSUID|MS_STRICTATIME,     true  },
364                 { "devpts",    "/dev/pts",  "devpts","newinstance,ptmxmode=0666,mode=620,gid=" STRINGIFY(TTY_GID), MS_NOSUID|MS_NOEXEC, true },
365                 { "tmpfs",     "/dev/shm",  "tmpfs", "mode=1777", MS_NOSUID|MS_NODEV|MS_STRICTATIME, true  },
366                 { "tmpfs",     "/run",      "tmpfs", "mode=755", MS_NOSUID|MS_NODEV|MS_STRICTATIME, true  },
367 #ifdef HAVE_SELINUX
368                 { "/sys/fs/selinux", "/sys/fs/selinux", NULL, NULL, MS_BIND,                      false },  /* Bind mount first */
369                 { NULL,              "/sys/fs/selinux", NULL, NULL, MS_BIND|MS_RDONLY|MS_REMOUNT, false },  /* Then, make it r/o */
370 #endif
371         };
372
373         unsigned k;
374         int r = 0;
375
376         for (k = 0; k < ELEMENTSOF(mount_table); k++) {
377                 _cleanup_free_ char *where = NULL;
378                 int t;
379
380                 where = strjoin(dest, "/", mount_table[k].where, NULL);
381                 if (!where)
382                         return log_oom();
383
384                 t = path_is_mount_point(where, true);
385                 if (t < 0) {
386                         log_error("Failed to detect whether %s is a mount point: %s", where, strerror(-t));
387
388                         if (r == 0)
389                                 r = t;
390
391                         continue;
392                 }
393
394                 /* Skip this entry if it is not a remount. */
395                 if (mount_table[k].what && t > 0)
396                         continue;
397
398                 mkdir_p(where, 0755);
399
400                 if (mount(mount_table[k].what,
401                           where,
402                           mount_table[k].type,
403                           mount_table[k].flags,
404                           mount_table[k].options) < 0 &&
405                     mount_table[k].fatal) {
406
407                         log_error("mount(%s) failed: %m", where);
408
409                         if (r == 0)
410                                 r = -errno;
411                 }
412         }
413
414         return r;
415 }
416
417 static int mount_binds(const char *dest, char **l, unsigned long flags) {
418         char **x, **y;
419
420         STRV_FOREACH_PAIR(x, y, l) {
421                 char *where;
422                 struct stat source_st, dest_st;
423                 int r;
424
425                 if (stat(*x, &source_st) < 0) {
426                         log_error("failed to stat %s: %m", *x);
427                         return -errno;
428                 }
429
430                 where = strappenda(dest, *y);
431                 r = stat(where, &dest_st);
432                 if (r == 0) {
433                         if ((source_st.st_mode & S_IFMT) != (dest_st.st_mode & S_IFMT)) {
434                                 log_error("The file types of %s and %s do not match. Refusing bind mount",
435                                                 *x, where);
436                                 return -EINVAL;
437                         }
438                 } else if (errno == ENOENT) {
439                         r = mkdir_parents_label(where, 0755);
440                         if (r < 0) {
441                                 log_error("Failed to bind mount %s: %s", *x, strerror(-r));
442                                 return r;
443                         }
444                 } else {
445                         log_error("Failed to bind mount %s: %s", *x, strerror(errno));
446                         return -errno;
447                 }
448                 /* Create the mount point, but be conservative -- refuse to create block
449                 * and char devices. */
450                 if (S_ISDIR(source_st.st_mode))
451                         mkdir_label(where, 0755);
452                 else if (S_ISFIFO(source_st.st_mode))
453                         mkfifo(where, 0644);
454                 else if (S_ISSOCK(source_st.st_mode))
455                         mknod(where, 0644 | S_IFSOCK, 0);
456                 else if (S_ISREG(source_st.st_mode))
457                         touch(where);
458                 else {
459                         log_error("Refusing to create mountpoint for file: %s", *x);
460                         return -ENOTSUP;
461                 }
462
463                 if (mount(*x, where, "bind", MS_BIND, NULL) < 0) {
464                         log_error("mount(%s) failed: %m", where);
465                         return -errno;
466                 }
467
468                 if (flags && mount(NULL, where, NULL, MS_REMOUNT|MS_BIND|flags, NULL) < 0) {
469                         log_error("mount(%s) failed: %m", where);
470                         return -errno;
471                 }
472         }
473
474         return 0;
475 }
476
477 static int setup_timezone(const char *dest) {
478         _cleanup_free_ char *where = NULL, *p = NULL, *q = NULL, *check = NULL, *what = NULL;
479         char *z, *y;
480         int r;
481
482         assert(dest);
483
484         /* Fix the timezone, if possible */
485         r = readlink_malloc("/etc/localtime", &p);
486         if (r < 0) {
487                 log_warning("/etc/localtime is not a symlink, not updating container timezone.");
488                 return 0;
489         }
490
491         z = path_startswith(p, "../usr/share/zoneinfo/");
492         if (!z)
493                 z = path_startswith(p, "/usr/share/zoneinfo/");
494         if (!z) {
495                 log_warning("/etc/localtime does not point into /usr/share/zoneinfo/, not updating container timezone.");
496                 return 0;
497         }
498
499         where = strappend(dest, "/etc/localtime");
500         if (!where)
501                 return log_oom();
502
503         r = readlink_malloc(where, &q);
504         if (r >= 0) {
505                 y = path_startswith(q, "../usr/share/zoneinfo/");
506                 if (!y)
507                         y = path_startswith(q, "/usr/share/zoneinfo/");
508
509
510                 /* Already pointing to the right place? Then do nothing .. */
511                 if (y && streq(y, z))
512                         return 0;
513         }
514
515         check = strjoin(dest, "/usr/share/zoneinfo/", z, NULL);
516         if (!check)
517                 return log_oom();
518
519         if (access(check, F_OK) < 0) {
520                 log_warning("Timezone %s does not exist in container, not updating container timezone.", z);
521                 return 0;
522         }
523
524         what = strappend("../usr/share/zoneinfo/", z);
525         if (!what)
526                 return log_oom();
527
528         unlink(where);
529         if (symlink(what, where) < 0) {
530                 log_error("Failed to correct timezone of container: %m");
531                 return 0;
532         }
533
534         return 0;
535 }
536
537 static int setup_resolv_conf(const char *dest) {
538         char _cleanup_free_ *where = NULL;
539
540         assert(dest);
541
542         if (arg_private_network)
543                 return 0;
544
545         /* Fix resolv.conf, if possible */
546         where = strappend(dest, "/etc/resolv.conf");
547         if (!where)
548                 return log_oom();
549
550         /* We don't really care for the results of this really. If it
551          * fails, it fails, but meh... */
552         copy_file("/etc/resolv.conf", where, O_TRUNC|O_NOFOLLOW);
553
554         return 0;
555 }
556
557 static int setup_boot_id(const char *dest) {
558         _cleanup_free_ char *from = NULL, *to = NULL;
559         sd_id128_t rnd;
560         char as_uuid[37];
561         int r;
562
563         assert(dest);
564
565         /* Generate a new randomized boot ID, so that each boot-up of
566          * the container gets a new one */
567
568         from = strappend(dest, "/dev/proc-sys-kernel-random-boot-id");
569         to = strappend(dest, "/proc/sys/kernel/random/boot_id");
570         if (!from || !to)
571                 return log_oom();
572
573         r = sd_id128_randomize(&rnd);
574         if (r < 0) {
575                 log_error("Failed to generate random boot id: %s", strerror(-r));
576                 return r;
577         }
578
579         snprintf(as_uuid, sizeof(as_uuid),
580                  "%02x%02x%02x%02x-%02x%02x-%02x%02x-%02x%02x-%02x%02x%02x%02x%02x%02x",
581                  SD_ID128_FORMAT_VAL(rnd));
582         char_array_0(as_uuid);
583
584         r = write_string_file(from, as_uuid);
585         if (r < 0) {
586                 log_error("Failed to write boot id: %s", strerror(-r));
587                 return r;
588         }
589
590         if (mount(from, to, "bind", MS_BIND, NULL) < 0) {
591                 log_error("Failed to bind mount boot id: %m");
592                 r = -errno;
593         } else if (mount(from, to, "bind", MS_BIND|MS_REMOUNT|MS_RDONLY, NULL))
594                 log_warning("Failed to make boot id read-only: %m");
595
596         unlink(from);
597         return r;
598 }
599
600 static int copy_devnodes(const char *dest) {
601
602         static const char devnodes[] =
603                 "null\0"
604                 "zero\0"
605                 "full\0"
606                 "random\0"
607                 "urandom\0"
608                 "tty\0";
609
610         const char *d;
611         int r = 0;
612         _cleanup_umask_ mode_t u;
613
614         assert(dest);
615
616         u = umask(0000);
617
618         NULSTR_FOREACH(d, devnodes) {
619                 struct stat st;
620                 _cleanup_free_ char *from = NULL, *to = NULL;
621
622                 asprintf(&from, "/dev/%s", d);
623                 asprintf(&to, "%s/dev/%s", dest, d);
624
625                 if (!from || !to) {
626                         log_oom();
627
628                         if (r == 0)
629                                 r = -ENOMEM;
630
631                         break;
632                 }
633
634                 if (stat(from, &st) < 0) {
635
636                         if (errno != ENOENT) {
637                                 log_error("Failed to stat %s: %m", from);
638                                 if (r == 0)
639                                         r = -errno;
640                         }
641
642                 } else if (!S_ISCHR(st.st_mode) && !S_ISBLK(st.st_mode)) {
643
644                         log_error("%s is not a char or block device, cannot copy", from);
645                         if (r == 0)
646                                 r = -EIO;
647
648                 } else if (mknod(to, st.st_mode, st.st_rdev) < 0) {
649
650                         log_error("mknod(%s) failed: %m", dest);
651                         if (r == 0)
652                                 r = -errno;
653                 }
654         }
655
656         return r;
657 }
658
659 static int setup_ptmx(const char *dest) {
660         _cleanup_free_ char *p = NULL;
661
662         p = strappend(dest, "/dev/ptmx");
663         if (!p)
664                 return log_oom();
665
666         if (symlink("pts/ptmx", p) < 0) {
667                 log_error("Failed to create /dev/ptmx symlink: %m");
668                 return -errno;
669         }
670
671         return 0;
672 }
673
674 static int setup_dev_console(const char *dest, const char *console) {
675         struct stat st;
676         _cleanup_free_ char *to = NULL;
677         int r;
678         _cleanup_umask_ mode_t u;
679
680         assert(dest);
681         assert(console);
682
683         u = umask(0000);
684
685         if (stat(console, &st) < 0) {
686                 log_error("Failed to stat %s: %m", console);
687                 return -errno;
688
689         } else if (!S_ISCHR(st.st_mode)) {
690                 log_error("/dev/console is not a char device");
691                 return -EIO;
692         }
693
694         r = chmod_and_chown(console, 0600, 0, 0);
695         if (r < 0) {
696                 log_error("Failed to correct access mode for TTY: %s", strerror(-r));
697                 return r;
698         }
699
700         if (asprintf(&to, "%s/dev/console", dest) < 0)
701                 return log_oom();
702
703         /* We need to bind mount the right tty to /dev/console since
704          * ptys can only exist on pts file systems. To have something
705          * to bind mount things on we create a device node first, that
706          * has the right major/minor (note that the major minor
707          * doesn't actually matter here, since we mount it over
708          * anyway). */
709
710         if (mknod(to, (st.st_mode & ~07777) | 0600, st.st_rdev) < 0) {
711                 log_error("mknod() for /dev/console failed: %m");
712                 return -errno;
713         }
714
715         if (mount(console, to, "bind", MS_BIND, NULL) < 0) {
716                 log_error("Bind mount for /dev/console failed: %m");
717                 return -errno;
718         }
719
720         return 0;
721 }
722
723 static int setup_kmsg(const char *dest, int kmsg_socket) {
724         _cleanup_free_ char *from = NULL, *to = NULL;
725         int r, fd, k;
726         _cleanup_umask_ mode_t u;
727         union {
728                 struct cmsghdr cmsghdr;
729                 uint8_t buf[CMSG_SPACE(sizeof(int))];
730         } control = {};
731         struct msghdr mh = {
732                 .msg_control = &control,
733                 .msg_controllen = sizeof(control),
734         };
735         struct cmsghdr *cmsg;
736
737         assert(dest);
738         assert(kmsg_socket >= 0);
739
740         u = umask(0000);
741
742         /* We create the kmsg FIFO as /dev/kmsg, but immediately
743          * delete it after bind mounting it to /proc/kmsg. While FIFOs
744          * on the reading side behave very similar to /proc/kmsg,
745          * their writing side behaves differently from /dev/kmsg in
746          * that writing blocks when nothing is reading. In order to
747          * avoid any problems with containers deadlocking due to this
748          * we simply make /dev/kmsg unavailable to the container. */
749         if (asprintf(&from, "%s/dev/kmsg", dest) < 0 ||
750             asprintf(&to, "%s/proc/kmsg", dest) < 0)
751                 return log_oom();
752
753         if (mkfifo(from, 0600) < 0) {
754                 log_error("mkfifo() for /dev/kmsg failed: %m");
755                 return -errno;
756         }
757
758         r = chmod_and_chown(from, 0600, 0, 0);
759         if (r < 0) {
760                 log_error("Failed to correct access mode for /dev/kmsg: %s", strerror(-r));
761                 return r;
762         }
763
764         if (mount(from, to, "bind", MS_BIND, NULL) < 0) {
765                 log_error("Bind mount for /proc/kmsg failed: %m");
766                 return -errno;
767         }
768
769         fd = open(from, O_RDWR|O_NDELAY|O_CLOEXEC);
770         if (fd < 0) {
771                 log_error("Failed to open fifo: %m");
772                 return -errno;
773         }
774
775         cmsg = CMSG_FIRSTHDR(&mh);
776         cmsg->cmsg_level = SOL_SOCKET;
777         cmsg->cmsg_type = SCM_RIGHTS;
778         cmsg->cmsg_len = CMSG_LEN(sizeof(int));
779         memcpy(CMSG_DATA(cmsg), &fd, sizeof(int));
780
781         mh.msg_controllen = cmsg->cmsg_len;
782
783         /* Store away the fd in the socket, so that it stays open as
784          * long as we run the child */
785         k = sendmsg(kmsg_socket, &mh, MSG_DONTWAIT|MSG_NOSIGNAL);
786         close_nointr_nofail(fd);
787
788         if (k < 0) {
789                 log_error("Failed to send FIFO fd: %m");
790                 return -errno;
791         }
792
793         /* And now make the FIFO unavailable as /dev/kmsg... */
794         unlink(from);
795         return 0;
796 }
797
798 static int setup_hostname(void) {
799
800         if (sethostname(arg_machine, strlen(arg_machine)) < 0)
801                 return -errno;
802
803         return 0;
804 }
805
806 static int setup_journal(const char *directory) {
807         sd_id128_t machine_id;
808         _cleanup_free_ char *p = NULL, *b = NULL, *q = NULL, *d = NULL;
809         char *id;
810         int r;
811
812         if (arg_link_journal == LINK_NO)
813                 return 0;
814
815         p = strappend(directory, "/etc/machine-id");
816         if (!p)
817                 return log_oom();
818
819         r = read_one_line_file(p, &b);
820         if (r == -ENOENT && arg_link_journal == LINK_AUTO)
821                 return 0;
822         else if (r < 0) {
823                 log_error("Failed to read machine ID from %s: %s", p, strerror(-r));
824                 return r;
825         }
826
827         id = strstrip(b);
828         if (isempty(id) && arg_link_journal == LINK_AUTO)
829                 return 0;
830
831         /* Verify validity */
832         r = sd_id128_from_string(id, &machine_id);
833         if (r < 0) {
834                 log_error("Failed to parse machine ID from %s: %s", p, strerror(-r));
835                 return r;
836         }
837
838         free(p);
839         p = strappend("/var/log/journal/", id);
840         q = strjoin(directory, "/var/log/journal/", id, NULL);
841         if (!p || !q)
842                 return log_oom();
843
844         if (path_is_mount_point(p, false) > 0) {
845                 if (arg_link_journal != LINK_AUTO) {
846                         log_error("%s: already a mount point, refusing to use for journal", p);
847                         return -EEXIST;
848                 }
849
850                 return 0;
851         }
852
853         if (path_is_mount_point(q, false) > 0) {
854                 if (arg_link_journal != LINK_AUTO) {
855                         log_error("%s: already a mount point, refusing to use for journal", q);
856                         return -EEXIST;
857                 }
858
859                 return 0;
860         }
861
862         r = readlink_and_make_absolute(p, &d);
863         if (r >= 0) {
864                 if ((arg_link_journal == LINK_GUEST ||
865                      arg_link_journal == LINK_AUTO) &&
866                     path_equal(d, q)) {
867
868                         r = mkdir_p(q, 0755);
869                         if (r < 0)
870                                 log_warning("failed to create directory %s: %m", q);
871                         return 0;
872                 }
873
874                 if (unlink(p) < 0) {
875                         log_error("Failed to remove symlink %s: %m", p);
876                         return -errno;
877                 }
878         } else if (r == -EINVAL) {
879
880                 if (arg_link_journal == LINK_GUEST &&
881                     rmdir(p) < 0) {
882
883                         if (errno == ENOTDIR) {
884                                 log_error("%s already exists and is neither a symlink nor a directory", p);
885                                 return r;
886                         } else {
887                                 log_error("Failed to remove %s: %m", p);
888                                 return -errno;
889                         }
890                 }
891         } else if (r != -ENOENT) {
892                 log_error("readlink(%s) failed: %m", p);
893                 return r;
894         }
895
896         if (arg_link_journal == LINK_GUEST) {
897
898                 if (symlink(q, p) < 0) {
899                         log_error("Failed to symlink %s to %s: %m", q, p);
900                         return -errno;
901                 }
902
903                 r = mkdir_p(q, 0755);
904                 if (r < 0)
905                         log_warning("failed to create directory %s: %m", q);
906                 return 0;
907         }
908
909         if (arg_link_journal == LINK_HOST) {
910                 r = mkdir_p(p, 0755);
911                 if (r < 0) {
912                         log_error("Failed to create %s: %m", p);
913                         return r;
914                 }
915
916         } else if (access(p, F_OK) < 0)
917                 return 0;
918
919         if (dir_is_empty(q) == 0) {
920                 log_error("%s not empty.", q);
921                 return -ENOTEMPTY;
922         }
923
924         r = mkdir_p(q, 0755);
925         if (r < 0) {
926                 log_error("Failed to create %s: %m", q);
927                 return r;
928         }
929
930         if (mount(p, q, "bind", MS_BIND, NULL) < 0) {
931                 log_error("Failed to bind mount journal from host into guest: %m");
932                 return -errno;
933         }
934
935         return 0;
936 }
937
/* Creates <dest>/dev/kdbus and bind mounts the kdbus namespace
 * directory 'path' onto it. Does nothing when 'path' is NULL.
 * Returns 0 on success, a negated errno on failure. */
static int setup_kdbus(const char *dest, const char *path) {
        const char *target;

        if (path == NULL)
                return 0;

        target = strappenda(dest, "/dev/kdbus");

        if (mkdir(target, 0755) >= 0) {
                if (mount(path, target, "bind", MS_BIND, NULL) >= 0)
                        return 0;

                log_error("Failed to mount kdbus namespace path: %m");
                return -errno;
        }

        log_error("Failed to create kdbus path: %m");
        return -errno;
}
957
958 static int drop_capabilities(void) {
959         return capability_bounding_set_drop(~arg_retain, false);
960 }
961
962 static int register_machine(void) {
963         _cleanup_bus_error_free_ sd_bus_error error = SD_BUS_ERROR_NULL;
964         _cleanup_bus_unref_ sd_bus *bus = NULL;
965         int r;
966
967         r = sd_bus_open_system(&bus);
968         if (r < 0) {
969                 log_error("Failed to open system bus: %s", strerror(-r));
970                 return r;
971         }
972
973         r = sd_bus_call_method(
974                         bus,
975                         "org.freedesktop.machine1",
976                         "/org/freedesktop/machine1",
977                         "org.freedesktop.machine1.Manager",
978                         "CreateMachine",
979                         &error,
980                         NULL,
981                         "sayssusa(sv)",
982                         arg_machine,
983                         SD_BUS_MESSAGE_APPEND_ID128(arg_uuid),
984                         "nspawn",
985                         "container",
986                         (uint32_t) 0,
987                         strempty(arg_directory),
988                         !isempty(arg_slice), "Slice", "s", arg_slice);
989         if (r < 0) {
990                 log_error("Failed to register machine: %s", bus_error_message(&error, r));
991                 return r;
992         }
993
994         return 0;
995 }
996
997 static int terminate_machine(pid_t pid) {
998         _cleanup_bus_error_free_ sd_bus_error error = SD_BUS_ERROR_NULL;
999         _cleanup_bus_message_unref_ sd_bus_message *reply = NULL;
1000         _cleanup_bus_unref_ sd_bus *bus = NULL;
1001         const char *path;
1002         int r;
1003
1004         r = sd_bus_default_system(&bus);
1005         if (r < 0) {
1006                 log_error("Failed to open system bus: %s", strerror(-r));
1007                 return r;
1008         }
1009
1010         r = sd_bus_call_method(
1011                         bus,
1012                         "org.freedesktop.machine1",
1013                         "/org/freedesktop/machine1",
1014                         "org.freedesktop.machine1.Manager",
1015                         "GetMachineByPID",
1016                         &error,
1017                         &reply,
1018                         "u",
1019                         (uint32_t) pid);
1020         if (r < 0) {
1021                 /* Note that the machine might already have been
1022                  * cleaned up automatically, hence don't consider it a
1023                  * failure if we cannot get the machine object. */
1024                 log_debug("Failed to get machine: %s", bus_error_message(&error, r));
1025                 return 0;
1026         }
1027
1028         r = sd_bus_message_read(reply, "o", &path);
1029         if (r < 0)
1030                 return bus_log_parse_error(r);
1031
1032         r = sd_bus_call_method(
1033                         bus,
1034                         "org.freedesktop.machine1",
1035                         path,
1036                         "org.freedesktop.machine1.Machine",
1037                         "Terminate",
1038                         &error,
1039                         NULL,
1040                         NULL);
1041         if (r < 0) {
1042                 log_debug("Failed to terminate machine: %s", bus_error_message(&error, r));
1043                 return 0;
1044         }
1045
1046         return 0;
1047 }
1048
/* Probes for the audit subsystem by trying to open a NETLINK_AUDIT
 * socket: returns true if that succeeds, false otherwise. */
static bool audit_enabled(void) {
        int fd = socket(AF_NETLINK, SOCK_RAW, NETLINK_AUDIT);

        if (fd < 0)
                return false;

        close_nointr_nofail(fd);
        return true;
}
1059
1060 int main(int argc, char *argv[]) {
1061         pid_t pid = 0;
1062         int r = EXIT_FAILURE, k;
1063         _cleanup_close_ int master = -1, kdbus_fd = -1;
1064         int n_fd_passed;
1065         const char *console = NULL;
1066         sigset_t mask;
1067         _cleanup_close_pipe_ int kmsg_socket_pair[2] = { -1, -1 };
1068         _cleanup_fdset_free_ FDSet *fds = NULL;
1069         _cleanup_free_ char *kdbus_namespace = NULL;
1070
1071         log_parse_environment();
1072         log_open();
1073
1074         k = parse_argv(argc, argv);
1075         if (k < 0)
1076                 goto finish;
1077         else if (k == 0) {
1078                 r = EXIT_SUCCESS;
1079                 goto finish;
1080         }
1081
1082         if (arg_directory) {
1083                 char *p;
1084
1085                 p = path_make_absolute_cwd(arg_directory);
1086                 free(arg_directory);
1087                 arg_directory = p;
1088         } else
1089                 arg_directory = get_current_dir_name();
1090
1091         if (!arg_directory) {
1092                 log_error("Failed to determine path, please use -D.");
1093                 goto finish;
1094         }
1095
1096         path_kill_slashes(arg_directory);
1097
1098         if (!arg_machine) {
1099                 arg_machine = strdup(basename(arg_directory));
1100                 if (!arg_machine) {
1101                         log_oom();
1102                         goto finish;
1103                 }
1104
1105                 hostname_cleanup(arg_machine, false);
1106                 if (isempty(arg_machine)) {
1107                         log_error("Failed to determine machine name automatically, please use -M.");
1108                         goto finish;
1109                 }
1110         }
1111
1112         if (geteuid() != 0) {
1113                 log_error("Need to be root.");
1114                 goto finish;
1115         }
1116
1117         if (sd_booted() <= 0) {
1118                 log_error("Not running on a systemd system.");
1119                 goto finish;
1120         }
1121
1122         if (arg_boot && audit_enabled()) {
1123                 log_warning("The kernel auditing subsystem is known to be incompatible with containers.\n"
1124                             "Please make sure to turn off auditing with 'audit=0' on the kernel command\n"
1125                             "line before using systemd-nspawn. Sleeping for 5s...\n");
1126                 sleep(5);
1127         }
1128
1129         if (path_equal(arg_directory, "/")) {
1130                 log_error("Spawning container on root directory not supported.");
1131                 goto finish;
1132         }
1133
1134         if (path_is_os_tree(arg_directory) <= 0) {
1135                 log_error("Directory %s doesn't look like an OS root directory (/etc/os-release is missing). Refusing.", arg_directory);
1136                 goto finish;
1137         }
1138
1139         log_close();
1140         n_fd_passed = sd_listen_fds(false);
1141         if (n_fd_passed > 0) {
1142                 k = fdset_new_listen_fds(&fds, false);
1143                 if (k < 0) {
1144                         log_error("Failed to collect file descriptors: %s", strerror(-k));
1145                         goto finish;
1146                 }
1147         }
1148         fdset_close_others(fds);
1149         log_open();
1150
1151         master = posix_openpt(O_RDWR|O_NOCTTY|O_CLOEXEC|O_NDELAY);
1152         if (master < 0) {
1153                 log_error("Failed to acquire pseudo tty: %m");
1154                 goto finish;
1155         }
1156
1157         console = ptsname(master);
1158         if (!console) {
1159                 log_error("Failed to determine tty name: %m");
1160                 goto finish;
1161         }
1162
1163         log_info("Spawning container %s on %s. Press ^] three times within 1s to abort execution.", arg_machine, arg_directory);
1164
1165         if (unlockpt(master) < 0) {
1166                 log_error("Failed to unlock tty: %m");
1167                 goto finish;
1168         }
1169
1170         kdbus_fd = bus_kernel_create_namespace(arg_machine, &kdbus_namespace);
1171         if (r < 0)
1172                 log_debug("Failed to create kdbus namespace: %s", strerror(-r));
1173         else
1174                 log_debug("Successfully created kdbus namespace as %s", kdbus_namespace);
1175
1176         if (socketpair(AF_UNIX, SOCK_DGRAM|SOCK_NONBLOCK|SOCK_CLOEXEC, 0, kmsg_socket_pair) < 0) {
1177                 log_error("Failed to create kmsg socket pair.");
1178                 goto finish;
1179         }
1180
1181         sd_notify(0, "READY=1");
1182
1183         assert_se(sigemptyset(&mask) == 0);
1184         sigset_add_many(&mask, SIGCHLD, SIGWINCH, SIGTERM, SIGINT, -1);
1185         assert_se(sigprocmask(SIG_BLOCK, &mask, NULL) == 0);
1186
1187         for (;;) {
1188                 siginfo_t status;
1189
1190                 pid = syscall(__NR_clone, SIGCHLD|CLONE_NEWIPC|CLONE_NEWNS|CLONE_NEWPID|CLONE_NEWUTS|(arg_private_network ? CLONE_NEWNET : 0), NULL);
1191                 if (pid < 0) {
1192                         if (errno == EINVAL)
1193                                 log_error("clone() failed, do you have namespace support enabled in your kernel? (You need UTS, IPC, PID and NET namespacing built in): %m");
1194                         else
1195                                 log_error("clone() failed: %m");
1196
1197                         goto finish;
1198                 }
1199
1200                 if (pid == 0) {
1201                         /* child */
1202                         const char *home = NULL;
1203                         uid_t uid = (uid_t) -1;
1204                         gid_t gid = (gid_t) -1;
1205                         unsigned n_env = 2;
1206                         const char *envp[] = {
1207                                 "PATH=/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin",
1208                                 "container=systemd-nspawn", /* LXC sets container=lxc, so follow the scheme here */
1209                                 NULL, /* TERM */
1210                                 NULL, /* HOME */
1211                                 NULL, /* USER */
1212                                 NULL, /* LOGNAME */
1213                                 NULL, /* container_uuid */
1214                                 NULL, /* LISTEN_FDS */
1215                                 NULL, /* LISTEN_PID */
1216                                 NULL
1217                         };
1218
1219                         envp[n_env] = strv_find_prefix(environ, "TERM=");
1220                         if (envp[n_env])
1221                                 n_env ++;
1222
1223                         close_nointr_nofail(master);
1224                         master = -1;
1225
1226                         close_nointr(STDIN_FILENO);
1227                         close_nointr(STDOUT_FILENO);
1228                         close_nointr(STDERR_FILENO);
1229
1230                         close_nointr_nofail(kmsg_socket_pair[0]);
1231                         kmsg_socket_pair[0] = -1;
1232
1233                         reset_all_signal_handlers();
1234
1235                         assert_se(sigemptyset(&mask) == 0);
1236                         assert_se(sigprocmask(SIG_SETMASK, &mask, NULL) == 0);
1237
1238                         k = open_terminal(console, O_RDWR);
1239                         if (k != STDIN_FILENO) {
1240                                 if (k >= 0) {
1241                                         close_nointr_nofail(k);
1242                                         k = -EINVAL;
1243                                 }
1244
1245                                 log_error("Failed to open console: %s", strerror(-k));
1246                                 goto child_fail;
1247                         }
1248
1249                         if (dup2(STDIN_FILENO, STDOUT_FILENO) != STDOUT_FILENO ||
1250                             dup2(STDIN_FILENO, STDERR_FILENO) != STDERR_FILENO) {
1251                                 log_error("Failed to duplicate console: %m");
1252                                 goto child_fail;
1253                         }
1254
1255                         if (setsid() < 0) {
1256                                 log_error("setsid() failed: %m");
1257                                 goto child_fail;
1258                         }
1259
1260                         if (prctl(PR_SET_PDEATHSIG, SIGKILL) < 0) {
1261                                 log_error("PR_SET_PDEATHSIG failed: %m");
1262                                 goto child_fail;
1263                         }
1264
1265                         r = register_machine();
1266                         if (r < 0)
1267                                 goto finish;
1268
1269                         /* Mark everything as slave, so that we still
1270                          * receive mounts from the real root, but don't
1271                          * propagate mounts to the real root. */
1272                         if (mount(NULL, "/", NULL, MS_SLAVE|MS_REC, NULL) < 0) {
1273                                 log_error("MS_SLAVE|MS_REC failed: %m");
1274                                 goto child_fail;
1275                         }
1276
1277                         /* Turn directory into bind mount */
1278                         if (mount(arg_directory, arg_directory, "bind", MS_BIND|MS_REC, NULL) < 0) {
1279                                 log_error("Failed to make bind mount.");
1280                                 goto child_fail;
1281                         }
1282
1283                         if (arg_read_only)
1284                                 if (mount(arg_directory, arg_directory, "bind", MS_BIND|MS_REMOUNT|MS_RDONLY|MS_REC, NULL) < 0) {
1285                                         log_error("Failed to make read-only.");
1286                                         goto child_fail;
1287                                 }
1288
1289                         if (mount_all(arg_directory) < 0)
1290                                 goto child_fail;
1291
1292                         if (copy_devnodes(arg_directory) < 0)
1293                                 goto child_fail;
1294
1295                         if (setup_ptmx(arg_directory) < 0)
1296                                 goto child_fail;
1297
1298                         dev_setup(arg_directory);
1299
1300                         if (setup_dev_console(arg_directory, console) < 0)
1301                                 goto child_fail;
1302
1303                         if (setup_kmsg(arg_directory, kmsg_socket_pair[1]) < 0)
1304                                 goto child_fail;
1305
1306                         close_nointr_nofail(kmsg_socket_pair[1]);
1307                         kmsg_socket_pair[1] = -1;
1308
1309                         if (setup_boot_id(arg_directory) < 0)
1310                                 goto child_fail;
1311
1312                         if (setup_timezone(arg_directory) < 0)
1313                                 goto child_fail;
1314
1315                         if (setup_resolv_conf(arg_directory) < 0)
1316                                 goto child_fail;
1317
1318                         if (setup_journal(arg_directory) < 0)
1319                                 goto child_fail;
1320
1321                         if (mount_binds(arg_directory, arg_bind, 0) < 0)
1322                                 goto child_fail;
1323
1324                         if (mount_binds(arg_directory, arg_bind_ro, MS_RDONLY) < 0)
1325                                 goto child_fail;
1326
1327                         if (setup_kdbus(arg_directory, kdbus_namespace) < 0)
1328                                 goto child_fail;
1329
1330                         if (chdir(arg_directory) < 0) {
1331                                 log_error("chdir(%s) failed: %m", arg_directory);
1332                                 goto child_fail;
1333                         }
1334
1335                         if (mount(arg_directory, "/", NULL, MS_MOVE, NULL) < 0) {
1336                                 log_error("mount(MS_MOVE) failed: %m");
1337                                 goto child_fail;
1338                         }
1339
1340                         if (chroot(".") < 0) {
1341                                 log_error("chroot() failed: %m");
1342                                 goto child_fail;
1343                         }
1344
1345                         if (chdir("/") < 0) {
1346                                 log_error("chdir() failed: %m");
1347                                 goto child_fail;
1348                         }
1349
1350                         umask(0022);
1351
1352                         loopback_setup();
1353
1354                         if (drop_capabilities() < 0) {
1355                                 log_error("drop_capabilities() failed: %m");
1356                                 goto child_fail;
1357                         }
1358
1359                         if (arg_user) {
1360
1361                                 /* Note that this resolves user names
1362                                  * inside the container, and hence
1363                                  * accesses the NSS modules from the
1364                                  * container and not the host. This is
1365                                  * a bit weird... */
1366
1367                                 if (get_user_creds((const char**)&arg_user, &uid, &gid, &home, NULL) < 0) {
1368                                         log_error("get_user_creds() failed: %m");
1369                                         goto child_fail;
1370                                 }
1371
1372                                 if (mkdir_parents_label(home, 0775) < 0) {
1373                                         log_error("mkdir_parents_label() failed: %m");
1374                                         goto child_fail;
1375                                 }
1376
1377                                 if (mkdir_safe_label(home, 0775, uid, gid) < 0) {
1378                                         log_error("mkdir_safe_label() failed: %m");
1379                                         goto child_fail;
1380                                 }
1381
1382                                 if (initgroups((const char*)arg_user, gid) < 0) {
1383                                         log_error("initgroups() failed: %m");
1384                                         goto child_fail;
1385                                 }
1386
1387                                 if (setresgid(gid, gid, gid) < 0) {
1388                                         log_error("setregid() failed: %m");
1389                                         goto child_fail;
1390                                 }
1391
1392                                 if (setresuid(uid, uid, uid) < 0) {
1393                                         log_error("setreuid() failed: %m");
1394                                         goto child_fail;
1395                                 }
1396                         } else {
1397                                 /* Reset everything fully to 0, just in case */
1398
1399                                 if (setgroups(0, NULL) < 0) {
1400                                         log_error("setgroups() failed: %m");
1401                                         goto child_fail;
1402                                 }
1403
1404                                 if (setresgid(0, 0, 0) < 0) {
1405                                         log_error("setregid() failed: %m");
1406                                         goto child_fail;
1407                                 }
1408
1409                                 if (setresuid(0, 0, 0) < 0) {
1410                                         log_error("setreuid() failed: %m");
1411                                         goto child_fail;
1412                                 }
1413                         }
1414
1415                         if ((asprintf((char**)(envp + n_env++), "HOME=%s", home ? home: "/root") < 0) ||
1416                             (asprintf((char**)(envp + n_env++), "USER=%s", arg_user ? arg_user : "root") < 0) ||
1417                             (asprintf((char**)(envp + n_env++), "LOGNAME=%s", arg_user ? arg_user : "root") < 0)) {
1418                                 log_oom();
1419                                 goto child_fail;
1420                         }
1421
1422                         if (!sd_id128_equal(arg_uuid, SD_ID128_NULL)) {
1423                                 if (asprintf((char**)(envp + n_env++), "container_uuid=" SD_ID128_FORMAT_STR, SD_ID128_FORMAT_VAL(arg_uuid)) < 0) {
1424                                         log_oom();
1425                                         goto child_fail;
1426                                 }
1427                         }
1428
1429                         if (fdset_size(fds) > 0) {
1430                                 k = fdset_cloexec(fds, false);
1431                                 if (k < 0) {
1432                                         log_error("Failed to unset O_CLOEXEC for file descriptors.");
1433                                         goto child_fail;
1434                                 }
1435
1436                                 if ((asprintf((char **)(envp + n_env++), "LISTEN_FDS=%u", n_fd_passed) < 0) ||
1437                                     (asprintf((char **)(envp + n_env++), "LISTEN_PID=1") < 0)) {
1438                                         log_oom();
1439                                         goto child_fail;
1440                                 }
1441                         }
1442
1443                         setup_hostname();
1444
1445                         if (arg_boot) {
1446                                 char **a;
1447                                 size_t l;
1448
1449                                 /* Automatically search for the init system */
1450
1451                                 l = 1 + argc - optind;
1452                                 a = newa(char*, l + 1);
1453                                 memcpy(a + 1, argv + optind, l * sizeof(char*));
1454
1455                                 a[0] = (char*) "/usr/lib/systemd/systemd";
1456                                 execve(a[0], a, (char**) envp);
1457
1458                                 a[0] = (char*) "/lib/systemd/systemd";
1459                                 execve(a[0], a, (char**) envp);
1460
1461                                 a[0] = (char*) "/sbin/init";
1462                                 execve(a[0], a, (char**) envp);
1463                         } else if (argc > optind)
1464                                 execvpe(argv[optind], argv + optind, (char**) envp);
1465                         else {
1466                                 chdir(home ? home : "/root");
1467                                 execle("/bin/bash", "-bash", NULL, (char**) envp);
1468                         }
1469
1470                         log_error("execv() failed: %m");
1471
1472                 child_fail:
1473                         _exit(EXIT_FAILURE);
1474                 }
1475
1476                 fdset_free(fds);
1477                 fds = NULL;
1478
1479                 k = process_pty(master, &mask, arg_boot ? pid : 0, SIGRTMIN+3);
1480                 if (k < 0) {
1481                         r = EXIT_FAILURE;
1482                         break;
1483                 }
1484
1485                 putc('\n', stdout);
1486
1487                 /* Kill if it is not dead yet anyway */
1488                 terminate_machine(pid);
1489
1490                 /* Redundant, but better safe than sorry */
1491                 kill(pid, SIGKILL);
1492
1493                 k = wait_for_terminate(pid, &status);
1494                 pid = 0;
1495
1496                 if (k < 0) {
1497                         r = EXIT_FAILURE;
1498                         break;
1499                 }
1500
1501                 if (status.si_code == CLD_EXITED) {
1502                         r = status.si_status;
1503                         if (status.si_status != 0) {
1504                                 log_error("Container %s failed with error code %i.", arg_machine, status.si_status);
1505                                 break;
1506                         }
1507
1508                         log_debug("Container %s exited successfully.", arg_machine);
1509                         break;
1510                 } else if (status.si_code == CLD_KILLED &&
1511                            status.si_status == SIGINT) {
1512                         log_info("Container %s has been shut down.", arg_machine);
1513                         r = 0;
1514                         break;
1515                 } else if (status.si_code == CLD_KILLED &&
1516                            status.si_status == SIGHUP) {
1517                         log_info("Container %s is being rebooted.", arg_machine);
1518                         continue;
1519                 } else if (status.si_code == CLD_KILLED ||
1520                            status.si_code == CLD_DUMPED) {
1521
1522                         log_error("Container %s terminated by signal %s.", arg_machine,  signal_to_string(status.si_status));
1523                         r = EXIT_FAILURE;
1524                         break;
1525                 } else {
1526                         log_error("Container %s failed due to unknown reason.", arg_machine);
1527                         r = EXIT_FAILURE;
1528                         break;
1529                 }
1530         }
1531
1532 finish:
1533         if (pid > 0)
1534                 kill(pid, SIGKILL);
1535
1536         free(arg_directory);
1537         free(arg_machine);
1538
1539         return r;
1540 }