chiark / gitweb /
nspawn: set up a kdbus namespace when starting a container
[elogind.git] / src / nspawn / nspawn.c
1 /*-*- Mode: C; c-basic-offset: 8; indent-tabs-mode: nil -*-*/
2
3 /***
4   This file is part of systemd.
5
6   Copyright 2010 Lennart Poettering
7
8   systemd is free software; you can redistribute it and/or modify it
9   under the terms of the GNU Lesser General Public License as published by
10   the Free Software Foundation; either version 2.1 of the License, or
11   (at your option) any later version.
12
13   systemd is distributed in the hope that it will be useful, but
14   WITHOUT ANY WARRANTY; without even the implied warranty of
15   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
16   Lesser General Public License for more details.
17
18   You should have received a copy of the GNU Lesser General Public License
19   along with systemd; If not, see <http://www.gnu.org/licenses/>.
20 ***/
21
22 #include <signal.h>
23 #include <sched.h>
24 #include <unistd.h>
25 #include <sys/types.h>
26 #include <sys/syscall.h>
27 #include <sys/mount.h>
28 #include <sys/wait.h>
29 #include <stdlib.h>
30 #include <string.h>
31 #include <stdio.h>
32 #include <errno.h>
33 #include <sys/prctl.h>
34 #include <sys/capability.h>
35 #include <getopt.h>
36 #include <termios.h>
37 #include <sys/signalfd.h>
38 #include <grp.h>
39 #include <linux/fs.h>
40 #include <sys/un.h>
41 #include <sys/socket.h>
42 #include <linux/netlink.h>
43
44 #include "sd-daemon.h"
45 #include "sd-bus.h"
46 #include "sd-id128.h"
47 #include "log.h"
48 #include "util.h"
49 #include "mkdir.h"
50 #include "macro.h"
51 #include "audit.h"
52 #include "missing.h"
53 #include "cgroup-util.h"
54 #include "strv.h"
55 #include "path-util.h"
56 #include "loopback-setup.h"
57 #include "dev-setup.h"
58 #include "fdset.h"
59 #include "build.h"
60 #include "fileio.h"
61 #include "bus-util.h"
62 #include "bus-error.h"
63 #include "ptyfwd.h"
64 #include "bus-kernel.h"
65
/* Group id passed to devpts through its "gid=" mount option. A definition
 * supplied earlier (e.g. by the build) takes precedence over this fallback. */
#if !defined(TTY_GID)
#define TTY_GID 5
#endif
69
/* Journal linking mode, chosen with --link-journal=MODE (or -j). */
typedef enum LinkJournal {
        LINK_NO = 0,    /* --link-journal=no */
        LINK_AUTO,      /* --link-journal=auto, the default */
        LINK_HOST,      /* --link-journal=host */
        LINK_GUEST,     /* --link-journal=guest */
} LinkJournal;
76
77 static char *arg_directory = NULL;
78 static char *arg_user = NULL;
79 static sd_id128_t arg_uuid = {};
80 static char *arg_machine = NULL;
81 static const char *arg_slice = NULL;
82 static bool arg_private_network = false;
83 static bool arg_read_only = false;
84 static bool arg_boot = false;
85 static LinkJournal arg_link_journal = LINK_AUTO;
86 static uint64_t arg_retain =
87         (1ULL << CAP_CHOWN) |
88         (1ULL << CAP_DAC_OVERRIDE) |
89         (1ULL << CAP_DAC_READ_SEARCH) |
90         (1ULL << CAP_FOWNER) |
91         (1ULL << CAP_FSETID) |
92         (1ULL << CAP_IPC_OWNER) |
93         (1ULL << CAP_KILL) |
94         (1ULL << CAP_LEASE) |
95         (1ULL << CAP_LINUX_IMMUTABLE) |
96         (1ULL << CAP_NET_BIND_SERVICE) |
97         (1ULL << CAP_NET_BROADCAST) |
98         (1ULL << CAP_NET_RAW) |
99         (1ULL << CAP_SETGID) |
100         (1ULL << CAP_SETFCAP) |
101         (1ULL << CAP_SETPCAP) |
102         (1ULL << CAP_SETUID) |
103         (1ULL << CAP_SYS_ADMIN) |
104         (1ULL << CAP_SYS_CHROOT) |
105         (1ULL << CAP_SYS_NICE) |
106         (1ULL << CAP_SYS_PTRACE) |
107         (1ULL << CAP_SYS_TTY_CONFIG) |
108         (1ULL << CAP_SYS_RESOURCE) |
109         (1ULL << CAP_SYS_BOOT) |
110         (1ULL << CAP_AUDIT_WRITE) |
111         (1ULL << CAP_AUDIT_CONTROL);
112 static char **arg_bind = NULL;
113 static char **arg_bind_ro = NULL;
114
/* Prints the usage summary to stdout. Always returns 0. */
static int help(void) {

        /* Note: -j is documented as "guest" because parse_argv() maps it to
         * LINK_GUEST. */
        printf("%s [OPTIONS...] [PATH] [ARGUMENTS...]\n\n"
               "Spawn a minimal namespace container for debugging, testing and building.\n\n"
               "  -h --help                Show this help\n"
               "     --version             Print version string\n"
               "  -D --directory=NAME      Root directory for the container\n"
               "  -b --boot                Boot up full system (i.e. invoke init)\n"
               "  -u --user=USER           Run the command under specified user or uid\n"
               "     --uuid=UUID           Set a specific machine UUID for the container\n"
               "  -M --machine=NAME        Set the machine name for the container\n"
               "  -S --slice=SLICE         Place the container in the specified slice\n"
               "     --private-network     Disable network in container\n"
               "     --read-only           Mount the root directory read-only\n"
               "     --capability=CAP      In addition to the default, retain specified\n"
               "                           capability\n"
               "     --drop-capability=CAP Drop the specified capability from the default set\n"
               "     --link-journal=MODE   Link up guest journal, one of no, auto, guest, host\n"
               "  -j                       Equivalent to --link-journal=guest\n"
               "     --bind=PATH[:PATH]    Bind mount a file or directory from the host into\n"
               "                           the container\n"
               "     --bind-ro=PATH[:PATH] Similar, but creates a read-only bind mount\n",
               program_invocation_short_name);

        return 0;
}
141
142 static int parse_argv(int argc, char *argv[]) {
143
144         enum {
145                 ARG_VERSION = 0x100,
146                 ARG_PRIVATE_NETWORK,
147                 ARG_UUID,
148                 ARG_READ_ONLY,
149                 ARG_CAPABILITY,
150                 ARG_DROP_CAPABILITY,
151                 ARG_LINK_JOURNAL,
152                 ARG_BIND,
153                 ARG_BIND_RO
154         };
155
156         static const struct option options[] = {
157                 { "help",            no_argument,       NULL, 'h'                 },
158                 { "version",         no_argument,       NULL, ARG_VERSION         },
159                 { "directory",       required_argument, NULL, 'D'                 },
160                 { "user",            required_argument, NULL, 'u'                 },
161                 { "private-network", no_argument,       NULL, ARG_PRIVATE_NETWORK },
162                 { "boot",            no_argument,       NULL, 'b'                 },
163                 { "uuid",            required_argument, NULL, ARG_UUID            },
164                 { "read-only",       no_argument,       NULL, ARG_READ_ONLY       },
165                 { "capability",      required_argument, NULL, ARG_CAPABILITY      },
166                 { "drop-capability", required_argument, NULL, ARG_DROP_CAPABILITY },
167                 { "link-journal",    required_argument, NULL, ARG_LINK_JOURNAL    },
168                 { "bind",            required_argument, NULL, ARG_BIND            },
169                 { "bind-ro",         required_argument, NULL, ARG_BIND_RO         },
170                 { "machine",         required_argument, NULL, 'M'                 },
171                 { "slice",           required_argument, NULL, 'S'                 },
172                 {}
173         };
174
175         int c, r;
176
177         assert(argc >= 0);
178         assert(argv);
179
180         while ((c = getopt_long(argc, argv, "+hD:u:bM:jS:", options, NULL)) >= 0) {
181
182                 switch (c) {
183
184                 case 'h':
185                         return help();
186
187                 case ARG_VERSION:
188                         puts(PACKAGE_STRING);
189                         puts(SYSTEMD_FEATURES);
190                         return 0;
191
192                 case 'D':
193                         free(arg_directory);
194                         arg_directory = canonicalize_file_name(optarg);
195                         if (!arg_directory) {
196                                 log_error("Invalid root directory: %m");
197                                 return -ENOMEM;
198                         }
199
200                         break;
201
202                 case 'u':
203                         free(arg_user);
204                         arg_user = strdup(optarg);
205                         if (!arg_user)
206                                 return log_oom();
207
208                         break;
209
210                 case ARG_PRIVATE_NETWORK:
211                         arg_private_network = true;
212                         break;
213
214                 case 'b':
215                         arg_boot = true;
216                         break;
217
218                 case ARG_UUID:
219                         r = sd_id128_from_string(optarg, &arg_uuid);
220                         if (r < 0) {
221                                 log_error("Invalid UUID: %s", optarg);
222                                 return r;
223                         }
224                         break;
225
226                 case 'S':
227                         arg_slice = strdup(optarg);
228                         if (!arg_slice)
229                                 return log_oom();
230
231                         break;
232
233                 case 'M':
234                         if (!hostname_is_valid(optarg)) {
235                                 log_error("Invalid machine name: %s", optarg);
236                                 return -EINVAL;
237                         }
238
239                         free(arg_machine);
240                         arg_machine = strdup(optarg);
241                         if (!arg_machine)
242                                 return log_oom();
243
244                         break;
245
246                 case ARG_READ_ONLY:
247                         arg_read_only = true;
248                         break;
249
250                 case ARG_CAPABILITY:
251                 case ARG_DROP_CAPABILITY: {
252                         char *state, *word;
253                         size_t length;
254
255                         FOREACH_WORD_SEPARATOR(word, length, optarg, ",", state) {
256                                 cap_value_t cap;
257                                 char *t;
258
259                                 t = strndup(word, length);
260                                 if (!t)
261                                         return log_oom();
262
263                                 if (cap_from_name(t, &cap) < 0) {
264                                         log_error("Failed to parse capability %s.", t);
265                                         free(t);
266                                         return -EINVAL;
267                                 }
268
269                                 free(t);
270
271                                 if (c == ARG_CAPABILITY)
272                                         arg_retain |= 1ULL << (uint64_t) cap;
273                                 else
274                                         arg_retain &= ~(1ULL << (uint64_t) cap);
275                         }
276
277                         break;
278                 }
279
280                 case 'j':
281                         arg_link_journal = LINK_GUEST;
282                         break;
283
284                 case ARG_LINK_JOURNAL:
285                         if (streq(optarg, "auto"))
286                                 arg_link_journal = LINK_AUTO;
287                         else if (streq(optarg, "no"))
288                                 arg_link_journal = LINK_NO;
289                         else if (streq(optarg, "guest"))
290                                 arg_link_journal = LINK_GUEST;
291                         else if (streq(optarg, "host"))
292                                 arg_link_journal = LINK_HOST;
293                         else {
294                                 log_error("Failed to parse link journal mode %s", optarg);
295                                 return -EINVAL;
296                         }
297
298                         break;
299
300                 case ARG_BIND:
301                 case ARG_BIND_RO: {
302                         _cleanup_free_ char *a = NULL, *b = NULL;
303                         char *e;
304                         char ***x;
305
306                         x = c == ARG_BIND ? &arg_bind : &arg_bind_ro;
307
308                         e = strchr(optarg, ':');
309                         if (e) {
310                                 a = strndup(optarg, e - optarg);
311                                 b = strdup(e + 1);
312                         } else {
313                                 a = strdup(optarg);
314                                 b = strdup(optarg);
315                         }
316
317                         if (!a || !b)
318                                 return log_oom();
319
320                         if (!path_is_absolute(a) || !path_is_absolute(b)) {
321                                 log_error("Invalid bind mount specification: %s", optarg);
322                                 return -EINVAL;
323                         }
324
325                         r = strv_extend(x, a);
326                         if (r < 0)
327                                 return log_oom();
328
329                         r = strv_extend(x, b);
330                         if (r < 0)
331                                 return log_oom();
332
333                         break;
334                 }
335
336                 case '?':
337                         return -EINVAL;
338
339                 default:
340                         assert_not_reached("Unhandled option");
341                 }
342         }
343
344         return 1;
345 }
346
347 static int mount_all(const char *dest) {
348
349         typedef struct MountPoint {
350                 const char *what;
351                 const char *where;
352                 const char *type;
353                 const char *options;
354                 unsigned long flags;
355                 bool fatal;
356         } MountPoint;
357
358         static const MountPoint mount_table[] = {
359                 { "proc",      "/proc",     "proc",  NULL,       MS_NOSUID|MS_NOEXEC|MS_NODEV, true  },
360                 { "/proc/sys", "/proc/sys", NULL,    NULL,       MS_BIND, true                       },   /* Bind mount first */
361                 { NULL,        "/proc/sys", NULL,    NULL,       MS_BIND|MS_RDONLY|MS_REMOUNT, true  },   /* Then, make it r/o */
362                 { "sysfs",     "/sys",      "sysfs", NULL,       MS_RDONLY|MS_NOSUID|MS_NOEXEC|MS_NODEV, true  },
363                 { "tmpfs",     "/dev",      "tmpfs", "mode=755", MS_NOSUID|MS_STRICTATIME,     true  },
364                 { "devpts",    "/dev/pts",  "devpts","newinstance,ptmxmode=0666,mode=620,gid=" STRINGIFY(TTY_GID), MS_NOSUID|MS_NOEXEC, true },
365                 { "tmpfs",     "/dev/shm",  "tmpfs", "mode=1777", MS_NOSUID|MS_NODEV|MS_STRICTATIME, true  },
366                 { "tmpfs",     "/run",      "tmpfs", "mode=755", MS_NOSUID|MS_NODEV|MS_STRICTATIME, true  },
367 #ifdef HAVE_SELINUX
368                 { "/sys/fs/selinux", "/sys/fs/selinux", NULL, NULL, MS_BIND,                      false },  /* Bind mount first */
369                 { NULL,              "/sys/fs/selinux", NULL, NULL, MS_BIND|MS_RDONLY|MS_REMOUNT, false },  /* Then, make it r/o */
370 #endif
371         };
372
373         unsigned k;
374         int r = 0;
375
376         for (k = 0; k < ELEMENTSOF(mount_table); k++) {
377                 _cleanup_free_ char *where = NULL;
378                 int t;
379
380                 where = strjoin(dest, "/", mount_table[k].where, NULL);
381                 if (!where)
382                         return log_oom();
383
384                 t = path_is_mount_point(where, true);
385                 if (t < 0) {
386                         log_error("Failed to detect whether %s is a mount point: %s", where, strerror(-t));
387
388                         if (r == 0)
389                                 r = t;
390
391                         continue;
392                 }
393
394                 /* Skip this entry if it is not a remount. */
395                 if (mount_table[k].what && t > 0)
396                         continue;
397
398                 mkdir_p(where, 0755);
399
400                 if (mount(mount_table[k].what,
401                           where,
402                           mount_table[k].type,
403                           mount_table[k].flags,
404                           mount_table[k].options) < 0 &&
405                     mount_table[k].fatal) {
406
407                         log_error("mount(%s) failed: %m", where);
408
409                         if (r == 0)
410                                 r = -errno;
411                 }
412         }
413
414         return r;
415 }
416
417 static int mount_binds(const char *dest, char **l, unsigned long flags) {
418         char **x, **y;
419
420         STRV_FOREACH_PAIR(x, y, l) {
421                 _cleanup_free_ char *where = NULL;
422                 struct stat source_st, dest_st;
423
424                 if (stat(*x, &source_st) < 0) {
425                         log_error("failed to stat %s: %m", *x);
426                         return -errno;
427                 }
428
429                 where = strjoin(dest, "/", *y, NULL);
430                 if (!where)
431                         return log_oom();
432
433                 if (stat(where, &dest_st) == 0) {
434                         if ((source_st.st_mode & S_IFMT) != (dest_st.st_mode & S_IFMT)) {
435                                 log_error("The file types of %s and %s do not match. Refusing bind mount",
436                                                 *x, where);
437                                 return -EINVAL;
438                         }
439                 } else {
440                         /* Create the mount point, but be conservative -- refuse to create block
441                          * and char devices. */
442                         if (S_ISDIR(source_st.st_mode))
443                                 mkdir_p_label(where, 0755);
444                         else if (S_ISFIFO(source_st.st_mode))
445                                 mkfifo(where, 0644);
446                         else if (S_ISSOCK(source_st.st_mode))
447                                 mknod(where, 0644 | S_IFSOCK, 0);
448                         else if (S_ISREG(source_st.st_mode))
449                                 touch(where);
450                         else {
451                                 log_error("Refusing to create mountpoint for file: %s", *x);
452                                 return -ENOTSUP;
453                         }
454                 }
455
456                 if (mount(*x, where, "bind", MS_BIND, NULL) < 0) {
457                         log_error("mount(%s) failed: %m", where);
458                         return -errno;
459                 }
460
461                 if (flags && mount(NULL, where, NULL, MS_REMOUNT|MS_BIND|flags, NULL) < 0) {
462                         log_error("mount(%s) failed: %m", where);
463                         return -errno;
464                 }
465         }
466
467         return 0;
468 }
469
470 static int setup_timezone(const char *dest) {
471         _cleanup_free_ char *where = NULL, *p = NULL, *q = NULL, *check = NULL, *what = NULL;
472         char *z, *y;
473         int r;
474
475         assert(dest);
476
477         /* Fix the timezone, if possible */
478         r = readlink_malloc("/etc/localtime", &p);
479         if (r < 0) {
480                 log_warning("/etc/localtime is not a symlink, not updating container timezone.");
481                 return 0;
482         }
483
484         z = path_startswith(p, "../usr/share/zoneinfo/");
485         if (!z)
486                 z = path_startswith(p, "/usr/share/zoneinfo/");
487         if (!z) {
488                 log_warning("/etc/localtime does not point into /usr/share/zoneinfo/, not updating container timezone.");
489                 return 0;
490         }
491
492         where = strappend(dest, "/etc/localtime");
493         if (!where)
494                 return log_oom();
495
496         r = readlink_malloc(where, &q);
497         if (r >= 0) {
498                 y = path_startswith(q, "../usr/share/zoneinfo/");
499                 if (!y)
500                         y = path_startswith(q, "/usr/share/zoneinfo/");
501
502
503                 /* Already pointing to the right place? Then do nothing .. */
504                 if (y && streq(y, z))
505                         return 0;
506         }
507
508         check = strjoin(dest, "/usr/share/zoneinfo/", z, NULL);
509         if (!check)
510                 return log_oom();
511
512         if (access(check, F_OK) < 0) {
513                 log_warning("Timezone %s does not exist in container, not updating container timezone.", z);
514                 return 0;
515         }
516
517         what = strappend("../usr/share/zoneinfo/", z);
518         if (!what)
519                 return log_oom();
520
521         unlink(where);
522         if (symlink(what, where) < 0) {
523                 log_error("Failed to correct timezone of container: %m");
524                 return 0;
525         }
526
527         return 0;
528 }
529
530 static int setup_resolv_conf(const char *dest) {
531         char _cleanup_free_ *where = NULL;
532
533         assert(dest);
534
535         if (arg_private_network)
536                 return 0;
537
538         /* Fix resolv.conf, if possible */
539         where = strappend(dest, "/etc/resolv.conf");
540         if (!where)
541                 return log_oom();
542
543         /* We don't really care for the results of this really. If it
544          * fails, it fails, but meh... */
545         copy_file("/etc/resolv.conf", where, O_TRUNC|O_NOFOLLOW);
546
547         return 0;
548 }
549
550 static int setup_boot_id(const char *dest) {
551         _cleanup_free_ char *from = NULL, *to = NULL;
552         sd_id128_t rnd;
553         char as_uuid[37];
554         int r;
555
556         assert(dest);
557
558         /* Generate a new randomized boot ID, so that each boot-up of
559          * the container gets a new one */
560
561         from = strappend(dest, "/dev/proc-sys-kernel-random-boot-id");
562         to = strappend(dest, "/proc/sys/kernel/random/boot_id");
563         if (!from || !to)
564                 return log_oom();
565
566         r = sd_id128_randomize(&rnd);
567         if (r < 0) {
568                 log_error("Failed to generate random boot id: %s", strerror(-r));
569                 return r;
570         }
571
572         snprintf(as_uuid, sizeof(as_uuid),
573                  "%02x%02x%02x%02x-%02x%02x-%02x%02x-%02x%02x-%02x%02x%02x%02x%02x%02x",
574                  SD_ID128_FORMAT_VAL(rnd));
575         char_array_0(as_uuid);
576
577         r = write_string_file(from, as_uuid);
578         if (r < 0) {
579                 log_error("Failed to write boot id: %s", strerror(-r));
580                 return r;
581         }
582
583         if (mount(from, to, "bind", MS_BIND, NULL) < 0) {
584                 log_error("Failed to bind mount boot id: %m");
585                 r = -errno;
586         } else if (mount(from, to, "bind", MS_BIND|MS_REMOUNT|MS_RDONLY, NULL))
587                 log_warning("Failed to make boot id read-only: %m");
588
589         unlink(from);
590         return r;
591 }
592
593 static int copy_devnodes(const char *dest) {
594
595         static const char devnodes[] =
596                 "null\0"
597                 "zero\0"
598                 "full\0"
599                 "random\0"
600                 "urandom\0"
601                 "tty\0";
602
603         const char *d;
604         int r = 0;
605         _cleanup_umask_ mode_t u;
606
607         assert(dest);
608
609         u = umask(0000);
610
611         NULSTR_FOREACH(d, devnodes) {
612                 struct stat st;
613                 _cleanup_free_ char *from = NULL, *to = NULL;
614
615                 asprintf(&from, "/dev/%s", d);
616                 asprintf(&to, "%s/dev/%s", dest, d);
617
618                 if (!from || !to) {
619                         log_oom();
620
621                         if (r == 0)
622                                 r = -ENOMEM;
623
624                         break;
625                 }
626
627                 if (stat(from, &st) < 0) {
628
629                         if (errno != ENOENT) {
630                                 log_error("Failed to stat %s: %m", from);
631                                 if (r == 0)
632                                         r = -errno;
633                         }
634
635                 } else if (!S_ISCHR(st.st_mode) && !S_ISBLK(st.st_mode)) {
636
637                         log_error("%s is not a char or block device, cannot copy", from);
638                         if (r == 0)
639                                 r = -EIO;
640
641                 } else if (mknod(to, st.st_mode, st.st_rdev) < 0) {
642
643                         log_error("mknod(%s) failed: %m", dest);
644                         if (r == 0)
645                                 r = -errno;
646                 }
647         }
648
649         return r;
650 }
651
652 static int setup_ptmx(const char *dest) {
653         _cleanup_free_ char *p = NULL;
654
655         p = strappend(dest, "/dev/ptmx");
656         if (!p)
657                 return log_oom();
658
659         if (symlink("pts/ptmx", p) < 0) {
660                 log_error("Failed to create /dev/ptmx symlink: %m");
661                 return -errno;
662         }
663
664         return 0;
665 }
666
667 static int setup_dev_console(const char *dest, const char *console) {
668         struct stat st;
669         _cleanup_free_ char *to = NULL;
670         int r;
671         _cleanup_umask_ mode_t u;
672
673         assert(dest);
674         assert(console);
675
676         u = umask(0000);
677
678         if (stat(console, &st) < 0) {
679                 log_error("Failed to stat %s: %m", console);
680                 return -errno;
681
682         } else if (!S_ISCHR(st.st_mode)) {
683                 log_error("/dev/console is not a char device");
684                 return -EIO;
685         }
686
687         r = chmod_and_chown(console, 0600, 0, 0);
688         if (r < 0) {
689                 log_error("Failed to correct access mode for TTY: %s", strerror(-r));
690                 return r;
691         }
692
693         if (asprintf(&to, "%s/dev/console", dest) < 0)
694                 return log_oom();
695
696         /* We need to bind mount the right tty to /dev/console since
697          * ptys can only exist on pts file systems. To have something
698          * to bind mount things on we create a device node first, that
699          * has the right major/minor (note that the major minor
700          * doesn't actually matter here, since we mount it over
701          * anyway). */
702
703         if (mknod(to, (st.st_mode & ~07777) | 0600, st.st_rdev) < 0) {
704                 log_error("mknod() for /dev/console failed: %m");
705                 return -errno;
706         }
707
708         if (mount(console, to, "bind", MS_BIND, NULL) < 0) {
709                 log_error("Bind mount for /dev/console failed: %m");
710                 return -errno;
711         }
712
713         return 0;
714 }
715
716 static int setup_kmsg(const char *dest, int kmsg_socket) {
717         _cleanup_free_ char *from = NULL, *to = NULL;
718         int r, fd, k;
719         _cleanup_umask_ mode_t u;
720         union {
721                 struct cmsghdr cmsghdr;
722                 uint8_t buf[CMSG_SPACE(sizeof(int))];
723         } control = {};
724         struct msghdr mh = {
725                 .msg_control = &control,
726                 .msg_controllen = sizeof(control),
727         };
728         struct cmsghdr *cmsg;
729
730         assert(dest);
731         assert(kmsg_socket >= 0);
732
733         u = umask(0000);
734
735         /* We create the kmsg FIFO as /dev/kmsg, but immediately
736          * delete it after bind mounting it to /proc/kmsg. While FIFOs
737          * on the reading side behave very similar to /proc/kmsg,
738          * their writing side behaves differently from /dev/kmsg in
739          * that writing blocks when nothing is reading. In order to
740          * avoid any problems with containers deadlocking due to this
741          * we simply make /dev/kmsg unavailable to the container. */
742         if (asprintf(&from, "%s/dev/kmsg", dest) < 0 ||
743             asprintf(&to, "%s/proc/kmsg", dest) < 0)
744                 return log_oom();
745
746         if (mkfifo(from, 0600) < 0) {
747                 log_error("mkfifo() for /dev/kmsg failed: %m");
748                 return -errno;
749         }
750
751         r = chmod_and_chown(from, 0600, 0, 0);
752         if (r < 0) {
753                 log_error("Failed to correct access mode for /dev/kmsg: %s", strerror(-r));
754                 return r;
755         }
756
757         if (mount(from, to, "bind", MS_BIND, NULL) < 0) {
758                 log_error("Bind mount for /proc/kmsg failed: %m");
759                 return -errno;
760         }
761
762         fd = open(from, O_RDWR|O_NDELAY|O_CLOEXEC);
763         if (fd < 0) {
764                 log_error("Failed to open fifo: %m");
765                 return -errno;
766         }
767
768         cmsg = CMSG_FIRSTHDR(&mh);
769         cmsg->cmsg_level = SOL_SOCKET;
770         cmsg->cmsg_type = SCM_RIGHTS;
771         cmsg->cmsg_len = CMSG_LEN(sizeof(int));
772         memcpy(CMSG_DATA(cmsg), &fd, sizeof(int));
773
774         mh.msg_controllen = cmsg->cmsg_len;
775
776         /* Store away the fd in the socket, so that it stays open as
777          * long as we run the child */
778         k = sendmsg(kmsg_socket, &mh, MSG_DONTWAIT|MSG_NOSIGNAL);
779         close_nointr_nofail(fd);
780
781         if (k < 0) {
782                 log_error("Failed to send FIFO fd: %m");
783                 return -errno;
784         }
785
786         /* And now make the FIFO unavailable as /dev/kmsg... */
787         unlink(from);
788         return 0;
789 }
790
791 static int setup_hostname(void) {
792
793         if (sethostname(arg_machine, strlen(arg_machine)) < 0)
794                 return -errno;
795
796         return 0;
797 }
798
799 static int setup_journal(const char *directory) {
800         sd_id128_t machine_id;
801         _cleanup_free_ char *p = NULL, *b = NULL, *q = NULL, *d = NULL;
802         char *id;
803         int r;
804
805         if (arg_link_journal == LINK_NO)
806                 return 0;
807
808         p = strappend(directory, "/etc/machine-id");
809         if (!p)
810                 return log_oom();
811
812         r = read_one_line_file(p, &b);
813         if (r == -ENOENT && arg_link_journal == LINK_AUTO)
814                 return 0;
815         else if (r < 0) {
816                 log_error("Failed to read machine ID from %s: %s", p, strerror(-r));
817                 return r;
818         }
819
820         id = strstrip(b);
821         if (isempty(id) && arg_link_journal == LINK_AUTO)
822                 return 0;
823
824         /* Verify validity */
825         r = sd_id128_from_string(id, &machine_id);
826         if (r < 0) {
827                 log_error("Failed to parse machine ID from %s: %s", p, strerror(-r));
828                 return r;
829         }
830
831         free(p);
832         p = strappend("/var/log/journal/", id);
833         q = strjoin(directory, "/var/log/journal/", id, NULL);
834         if (!p || !q)
835                 return log_oom();
836
837         if (path_is_mount_point(p, false) > 0) {
838                 if (arg_link_journal != LINK_AUTO) {
839                         log_error("%s: already a mount point, refusing to use for journal", p);
840                         return -EEXIST;
841                 }
842
843                 return 0;
844         }
845
846         if (path_is_mount_point(q, false) > 0) {
847                 if (arg_link_journal != LINK_AUTO) {
848                         log_error("%s: already a mount point, refusing to use for journal", q);
849                         return -EEXIST;
850                 }
851
852                 return 0;
853         }
854
855         r = readlink_and_make_absolute(p, &d);
856         if (r >= 0) {
857                 if ((arg_link_journal == LINK_GUEST ||
858                      arg_link_journal == LINK_AUTO) &&
859                     path_equal(d, q)) {
860
861                         r = mkdir_p(q, 0755);
862                         if (r < 0)
863                                 log_warning("failed to create directory %s: %m", q);
864                         return 0;
865                 }
866
867                 if (unlink(p) < 0) {
868                         log_error("Failed to remove symlink %s: %m", p);
869                         return -errno;
870                 }
871         } else if (r == -EINVAL) {
872
873                 if (arg_link_journal == LINK_GUEST &&
874                     rmdir(p) < 0) {
875
876                         if (errno == ENOTDIR) {
877                                 log_error("%s already exists and is neither a symlink nor a directory", p);
878                                 return r;
879                         } else {
880                                 log_error("Failed to remove %s: %m", p);
881                                 return -errno;
882                         }
883                 }
884         } else if (r != -ENOENT) {
885                 log_error("readlink(%s) failed: %m", p);
886                 return r;
887         }
888
889         if (arg_link_journal == LINK_GUEST) {
890
891                 if (symlink(q, p) < 0) {
892                         log_error("Failed to symlink %s to %s: %m", q, p);
893                         return -errno;
894                 }
895
896                 r = mkdir_p(q, 0755);
897                 if (r < 0)
898                         log_warning("failed to create directory %s: %m", q);
899                 return 0;
900         }
901
902         if (arg_link_journal == LINK_HOST) {
903                 r = mkdir_p(p, 0755);
904                 if (r < 0) {
905                         log_error("Failed to create %s: %m", p);
906                         return r;
907                 }
908
909         } else if (access(p, F_OK) < 0)
910                 return 0;
911
912         if (dir_is_empty(q) == 0) {
913                 log_error("%s not empty.", q);
914                 return -ENOTEMPTY;
915         }
916
917         r = mkdir_p(q, 0755);
918         if (r < 0) {
919                 log_error("Failed to create %s: %m", q);
920                 return r;
921         }
922
923         if (mount(p, q, "bind", MS_BIND, NULL) < 0) {
924                 log_error("Failed to bind mount journal from host into guest: %m");
925                 return -errno;
926         }
927
928         return 0;
929 }
930
/* Makes the kdbus namespace directory "path" available inside the
 * container tree rooted at "dest" by bind mounting it to <dest>/dev/kdbus.
 * A NULL path means there is no namespace to expose, in which case this is
 * a NOP. Returns 0 on success, -errno on failure. */
static int setup_kdbus(const char *dest, const char *path) {
        const char *target;

        /* Nothing to expose */
        if (path == NULL)
                return 0;

        /* Mount point inside the container tree */
        target = strappenda(dest, "/dev/kdbus");

        if (mkdir(target, 0755) < 0) {
                log_error("Failed to create kdbus path: %m");
                return -errno;
        }

        if (mount(path, target, "bind", MS_BIND, NULL) < 0) {
                log_error("Failed to mount kdbus namespace path: %m");
                return -errno;
        }

        return 0;
}
950
951 static int drop_capabilities(void) {
952         return capability_bounding_set_drop(~arg_retain, false);
953 }
954
955 static int register_machine(void) {
956         _cleanup_bus_error_free_ sd_bus_error error = SD_BUS_ERROR_NULL;
957         _cleanup_bus_unref_ sd_bus *bus = NULL;
958         int r;
959
960         r = sd_bus_open_system(&bus);
961         if (r < 0) {
962                 log_error("Failed to open system bus: %s", strerror(-r));
963                 return r;
964         }
965
966         r = sd_bus_call_method(
967                         bus,
968                         "org.freedesktop.machine1",
969                         "/org/freedesktop/machine1",
970                         "org.freedesktop.machine1.Manager",
971                         "CreateMachine",
972                         &error,
973                         NULL,
974                         "sayssusa(sv)",
975                         arg_machine,
976                         SD_BUS_MESSAGE_APPEND_ID128(arg_uuid),
977                         "nspawn",
978                         "container",
979                         (uint32_t) 0,
980                         strempty(arg_directory),
981                         !isempty(arg_slice), "Slice", "s", arg_slice);
982         if (r < 0) {
983                 log_error("Failed to register machine: %s", bus_error_message(&error, r));
984                 return r;
985         }
986
987         return 0;
988 }
989
990 static int terminate_machine(pid_t pid) {
991         _cleanup_bus_error_free_ sd_bus_error error = SD_BUS_ERROR_NULL;
992         _cleanup_bus_message_unref_ sd_bus_message *reply = NULL;
993         _cleanup_bus_unref_ sd_bus *bus = NULL;
994         const char *path;
995         int r;
996
997         r = sd_bus_default_system(&bus);
998         if (r < 0) {
999                 log_error("Failed to open system bus: %s", strerror(-r));
1000                 return r;
1001         }
1002
1003         r = sd_bus_call_method(
1004                         bus,
1005                         "org.freedesktop.machine1",
1006                         "/org/freedesktop/machine1",
1007                         "org.freedesktop.machine1.Manager",
1008                         "GetMachineByPID",
1009                         &error,
1010                         &reply,
1011                         "u",
1012                         (uint32_t) pid);
1013         if (r < 0) {
1014                 /* Note that the machine might already have been
1015                  * cleaned up automatically, hence don't consider it a
1016                  * failure if we cannot get the machine object. */
1017                 log_debug("Failed to get machine: %s", bus_error_message(&error, r));
1018                 return 0;
1019         }
1020
1021         r = sd_bus_message_read(reply, "o", &path);
1022         if (r < 0)
1023                 return bus_log_parse_error(r);
1024
1025         r = sd_bus_call_method(
1026                         bus,
1027                         "org.freedesktop.machine1",
1028                         path,
1029                         "org.freedesktop.machine1.Machine",
1030                         "Terminate",
1031                         &error,
1032                         NULL,
1033                         NULL);
1034         if (r < 0) {
1035                 log_debug("Failed to terminate machine: %s", bus_error_message(&error, r));
1036                 return 0;
1037         }
1038
1039         return 0;
1040 }
1041
/* Probes for the kernel audit subsystem by trying to open a NETLINK_AUDIT
 * socket. Returns true if the socket could be created, false otherwise. */
static bool audit_enabled(void) {
        int audit_fd = socket(AF_NETLINK, SOCK_RAW, NETLINK_AUDIT);

        if (audit_fd < 0)
                return false;

        /* We only wanted to know whether it can be opened */
        close_nointr_nofail(audit_fd);
        return true;
}
1052
1053 int main(int argc, char *argv[]) {
1054         pid_t pid = 0;
1055         int r = EXIT_FAILURE, k;
1056         _cleanup_close_ int master = -1, kdbus_fd = -1;
1057         int n_fd_passed;
1058         const char *console = NULL;
1059         sigset_t mask;
1060         _cleanup_close_pipe_ int kmsg_socket_pair[2] = { -1, -1 };
1061         _cleanup_fdset_free_ FDSet *fds = NULL;
1062         _cleanup_free_ char *kdbus_namespace = NULL;
1063
1064         log_parse_environment();
1065         log_open();
1066
1067         k = parse_argv(argc, argv);
1068         if (k < 0)
1069                 goto finish;
1070         else if (k == 0) {
1071                 r = EXIT_SUCCESS;
1072                 goto finish;
1073         }
1074
1075         if (arg_directory) {
1076                 char *p;
1077
1078                 p = path_make_absolute_cwd(arg_directory);
1079                 free(arg_directory);
1080                 arg_directory = p;
1081         } else
1082                 arg_directory = get_current_dir_name();
1083
1084         if (!arg_directory) {
1085                 log_error("Failed to determine path, please use -D.");
1086                 goto finish;
1087         }
1088
1089         path_kill_slashes(arg_directory);
1090
1091         if (!arg_machine) {
1092                 arg_machine = strdup(path_get_file_name(arg_directory));
1093                 if (!arg_machine) {
1094                         log_oom();
1095                         goto finish;
1096                 }
1097
1098                 hostname_cleanup(arg_machine, false);
1099                 if (isempty(arg_machine)) {
1100                         log_error("Failed to determine machine name automatically, please use -M.");
1101                         goto finish;
1102                 }
1103         }
1104
1105         if (geteuid() != 0) {
1106                 log_error("Need to be root.");
1107                 goto finish;
1108         }
1109
1110         if (sd_booted() <= 0) {
1111                 log_error("Not running on a systemd system.");
1112                 goto finish;
1113         }
1114
1115         if (arg_boot && audit_enabled()) {
1116                 log_warning("The kernel auditing subsystem is known to be incompatible with containers.\n"
1117                             "Please make sure to turn off auditing with 'audit=0' on the kernel command\n"
1118                             "line before using systemd-nspawn. Sleeping for 5s...\n");
1119                 sleep(5);
1120         }
1121
1122         if (path_equal(arg_directory, "/")) {
1123                 log_error("Spawning container on root directory not supported.");
1124                 goto finish;
1125         }
1126
1127         if (path_is_os_tree(arg_directory) <= 0) {
1128                 log_error("Directory %s doesn't look like an OS root directory (/etc/os-release is missing). Refusing.", arg_directory);
1129                 goto finish;
1130         }
1131
1132         log_close();
1133         n_fd_passed = sd_listen_fds(false);
1134         if (n_fd_passed > 0) {
1135                 k = fdset_new_listen_fds(&fds, false);
1136                 if (k < 0) {
1137                         log_error("Failed to collect file descriptors: %s", strerror(-k));
1138                         goto finish;
1139                 }
1140         }
1141         fdset_close_others(fds);
1142         log_open();
1143
1144         master = posix_openpt(O_RDWR|O_NOCTTY|O_CLOEXEC|O_NDELAY);
1145         if (master < 0) {
1146                 log_error("Failed to acquire pseudo tty: %m");
1147                 goto finish;
1148         }
1149
1150         console = ptsname(master);
1151         if (!console) {
1152                 log_error("Failed to determine tty name: %m");
1153                 goto finish;
1154         }
1155
1156         log_info("Spawning container %s on %s. Press ^] three times within 1s to abort execution.", arg_machine, arg_directory);
1157
1158         if (unlockpt(master) < 0) {
1159                 log_error("Failed to unlock tty: %m");
1160                 goto finish;
1161         }
1162
1163         kdbus_fd = bus_kernel_create_namespace(arg_machine, &kdbus_namespace);
1164         if (r < 0)
1165                 log_debug("Failed to create kdbus namespace: %s", strerror(-r));
1166         else
1167                 log_debug("Successfully created kdbus namespace as %s", kdbus_namespace);
1168
1169         if (socketpair(AF_UNIX, SOCK_DGRAM|SOCK_NONBLOCK|SOCK_CLOEXEC, 0, kmsg_socket_pair) < 0) {
1170                 log_error("Failed to create kmsg socket pair.");
1171                 goto finish;
1172         }
1173
1174         sd_notify(0, "READY=1");
1175
1176         assert_se(sigemptyset(&mask) == 0);
1177         sigset_add_many(&mask, SIGCHLD, SIGWINCH, SIGTERM, SIGINT, -1);
1178         assert_se(sigprocmask(SIG_BLOCK, &mask, NULL) == 0);
1179
1180         for (;;) {
1181                 siginfo_t status;
1182
1183                 pid = syscall(__NR_clone, SIGCHLD|CLONE_NEWIPC|CLONE_NEWNS|CLONE_NEWPID|CLONE_NEWUTS|(arg_private_network ? CLONE_NEWNET : 0), NULL);
1184                 if (pid < 0) {
1185                         if (errno == EINVAL)
1186                                 log_error("clone() failed, do you have namespace support enabled in your kernel? (You need UTS, IPC, PID and NET namespacing built in): %m");
1187                         else
1188                                 log_error("clone() failed: %m");
1189
1190                         goto finish;
1191                 }
1192
1193                 if (pid == 0) {
1194                         /* child */
1195                         const char *home = NULL;
1196                         uid_t uid = (uid_t) -1;
1197                         gid_t gid = (gid_t) -1;
1198                         unsigned n_env = 2;
1199                         const char *envp[] = {
1200                                 "PATH=/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin",
1201                                 "container=systemd-nspawn", /* LXC sets container=lxc, so follow the scheme here */
1202                                 NULL, /* TERM */
1203                                 NULL, /* HOME */
1204                                 NULL, /* USER */
1205                                 NULL, /* LOGNAME */
1206                                 NULL, /* container_uuid */
1207                                 NULL, /* LISTEN_FDS */
1208                                 NULL, /* LISTEN_PID */
1209                                 NULL
1210                         };
1211
1212                         envp[n_env] = strv_find_prefix(environ, "TERM=");
1213                         if (envp[n_env])
1214                                 n_env ++;
1215
1216                         close_nointr_nofail(master);
1217                         master = -1;
1218
1219                         close_nointr(STDIN_FILENO);
1220                         close_nointr(STDOUT_FILENO);
1221                         close_nointr(STDERR_FILENO);
1222
1223                         close_nointr_nofail(kmsg_socket_pair[0]);
1224                         kmsg_socket_pair[0] = -1;
1225
1226                         reset_all_signal_handlers();
1227
1228                         assert_se(sigemptyset(&mask) == 0);
1229                         assert_se(sigprocmask(SIG_SETMASK, &mask, NULL) == 0);
1230
1231                         k = open_terminal(console, O_RDWR);
1232                         if (k != STDIN_FILENO) {
1233                                 if (k >= 0) {
1234                                         close_nointr_nofail(k);
1235                                         k = -EINVAL;
1236                                 }
1237
1238                                 log_error("Failed to open console: %s", strerror(-k));
1239                                 goto child_fail;
1240                         }
1241
1242                         if (dup2(STDIN_FILENO, STDOUT_FILENO) != STDOUT_FILENO ||
1243                             dup2(STDIN_FILENO, STDERR_FILENO) != STDERR_FILENO) {
1244                                 log_error("Failed to duplicate console: %m");
1245                                 goto child_fail;
1246                         }
1247
1248                         if (setsid() < 0) {
1249                                 log_error("setsid() failed: %m");
1250                                 goto child_fail;
1251                         }
1252
1253                         if (prctl(PR_SET_PDEATHSIG, SIGKILL) < 0) {
1254                                 log_error("PR_SET_PDEATHSIG failed: %m");
1255                                 goto child_fail;
1256                         }
1257
1258                         r = register_machine();
1259                         if (r < 0)
1260                                 goto finish;
1261
1262                         /* Mark everything as slave, so that we still
1263                          * receive mounts from the real root, but don't
1264                          * propagate mounts to the real root. */
1265                         if (mount(NULL, "/", NULL, MS_SLAVE|MS_REC, NULL) < 0) {
1266                                 log_error("MS_SLAVE|MS_REC failed: %m");
1267                                 goto child_fail;
1268                         }
1269
1270                         /* Turn directory into bind mount */
1271                         if (mount(arg_directory, arg_directory, "bind", MS_BIND|MS_REC, NULL) < 0) {
1272                                 log_error("Failed to make bind mount.");
1273                                 goto child_fail;
1274                         }
1275
1276                         if (arg_read_only)
1277                                 if (mount(arg_directory, arg_directory, "bind", MS_BIND|MS_REMOUNT|MS_RDONLY|MS_REC, NULL) < 0) {
1278                                         log_error("Failed to make read-only.");
1279                                         goto child_fail;
1280                                 }
1281
1282                         if (mount_all(arg_directory) < 0)
1283                                 goto child_fail;
1284
1285                         if (copy_devnodes(arg_directory) < 0)
1286                                 goto child_fail;
1287
1288                         if (setup_ptmx(arg_directory) < 0)
1289                                 goto child_fail;
1290
1291                         dev_setup(arg_directory);
1292
1293                         if (setup_dev_console(arg_directory, console) < 0)
1294                                 goto child_fail;
1295
1296                         if (setup_kmsg(arg_directory, kmsg_socket_pair[1]) < 0)
1297                                 goto child_fail;
1298
1299                         close_nointr_nofail(kmsg_socket_pair[1]);
1300                         kmsg_socket_pair[1] = -1;
1301
1302                         if (setup_boot_id(arg_directory) < 0)
1303                                 goto child_fail;
1304
1305                         if (setup_timezone(arg_directory) < 0)
1306                                 goto child_fail;
1307
1308                         if (setup_resolv_conf(arg_directory) < 0)
1309                                 goto child_fail;
1310
1311                         if (setup_journal(arg_directory) < 0)
1312                                 goto child_fail;
1313
1314                         if (mount_binds(arg_directory, arg_bind, 0) < 0)
1315                                 goto child_fail;
1316
1317                         if (mount_binds(arg_directory, arg_bind_ro, MS_RDONLY) < 0)
1318                                 goto child_fail;
1319
1320                         if (setup_kdbus(arg_directory, kdbus_namespace) < 0)
1321                                 goto child_fail;
1322
1323                         if (chdir(arg_directory) < 0) {
1324                                 log_error("chdir(%s) failed: %m", arg_directory);
1325                                 goto child_fail;
1326                         }
1327
1328                         if (mount(arg_directory, "/", NULL, MS_MOVE, NULL) < 0) {
1329                                 log_error("mount(MS_MOVE) failed: %m");
1330                                 goto child_fail;
1331                         }
1332
1333                         if (chroot(".") < 0) {
1334                                 log_error("chroot() failed: %m");
1335                                 goto child_fail;
1336                         }
1337
1338                         if (chdir("/") < 0) {
1339                                 log_error("chdir() failed: %m");
1340                                 goto child_fail;
1341                         }
1342
1343                         umask(0022);
1344
1345                         loopback_setup();
1346
1347                         if (drop_capabilities() < 0) {
1348                                 log_error("drop_capabilities() failed: %m");
1349                                 goto child_fail;
1350                         }
1351
1352                         if (arg_user) {
1353
1354                                 /* Note that this resolves user names
1355                                  * inside the container, and hence
1356                                  * accesses the NSS modules from the
1357                                  * container and not the host. This is
1358                                  * a bit weird... */
1359
1360                                 if (get_user_creds((const char**)&arg_user, &uid, &gid, &home, NULL) < 0) {
1361                                         log_error("get_user_creds() failed: %m");
1362                                         goto child_fail;
1363                                 }
1364
1365                                 if (mkdir_parents_label(home, 0775) < 0) {
1366                                         log_error("mkdir_parents_label() failed: %m");
1367                                         goto child_fail;
1368                                 }
1369
1370                                 if (mkdir_safe_label(home, 0775, uid, gid) < 0) {
1371                                         log_error("mkdir_safe_label() failed: %m");
1372                                         goto child_fail;
1373                                 }
1374
1375                                 if (initgroups((const char*)arg_user, gid) < 0) {
1376                                         log_error("initgroups() failed: %m");
1377                                         goto child_fail;
1378                                 }
1379
1380                                 if (setresgid(gid, gid, gid) < 0) {
1381                                         log_error("setregid() failed: %m");
1382                                         goto child_fail;
1383                                 }
1384
1385                                 if (setresuid(uid, uid, uid) < 0) {
1386                                         log_error("setreuid() failed: %m");
1387                                         goto child_fail;
1388                                 }
1389                         } else {
1390                                 /* Reset everything fully to 0, just in case */
1391
1392                                 if (setgroups(0, NULL) < 0) {
1393                                         log_error("setgroups() failed: %m");
1394                                         goto child_fail;
1395                                 }
1396
1397                                 if (setresgid(0, 0, 0) < 0) {
1398                                         log_error("setregid() failed: %m");
1399                                         goto child_fail;
1400                                 }
1401
1402                                 if (setresuid(0, 0, 0) < 0) {
1403                                         log_error("setreuid() failed: %m");
1404                                         goto child_fail;
1405                                 }
1406                         }
1407
1408                         if ((asprintf((char**)(envp + n_env++), "HOME=%s", home ? home: "/root") < 0) ||
1409                             (asprintf((char**)(envp + n_env++), "USER=%s", arg_user ? arg_user : "root") < 0) ||
1410                             (asprintf((char**)(envp + n_env++), "LOGNAME=%s", arg_user ? arg_user : "root") < 0)) {
1411                                 log_oom();
1412                                 goto child_fail;
1413                         }
1414
1415                         if (!sd_id128_equal(arg_uuid, SD_ID128_NULL)) {
1416                                 if (asprintf((char**)(envp + n_env++), "container_uuid=" SD_ID128_FORMAT_STR, SD_ID128_FORMAT_VAL(arg_uuid)) < 0) {
1417                                         log_oom();
1418                                         goto child_fail;
1419                                 }
1420                         }
1421
1422                         if (fdset_size(fds) > 0) {
1423                                 k = fdset_cloexec(fds, false);
1424                                 if (k < 0) {
1425                                         log_error("Failed to unset O_CLOEXEC for file descriptors.");
1426                                         goto child_fail;
1427                                 }
1428
1429                                 if ((asprintf((char **)(envp + n_env++), "LISTEN_FDS=%u", n_fd_passed) < 0) ||
1430                                     (asprintf((char **)(envp + n_env++), "LISTEN_PID=1") < 0)) {
1431                                         log_oom();
1432                                         goto child_fail;
1433                                 }
1434                         }
1435
1436                         setup_hostname();
1437
1438                         if (arg_boot) {
1439                                 char **a;
1440                                 size_t l;
1441
1442                                 /* Automatically search for the init system */
1443
1444                                 l = 1 + argc - optind;
1445                                 a = newa(char*, l + 1);
1446                                 memcpy(a + 1, argv + optind, l * sizeof(char*));
1447
1448                                 a[0] = (char*) "/usr/lib/systemd/systemd";
1449                                 execve(a[0], a, (char**) envp);
1450
1451                                 a[0] = (char*) "/lib/systemd/systemd";
1452                                 execve(a[0], a, (char**) envp);
1453
1454                                 a[0] = (char*) "/sbin/init";
1455                                 execve(a[0], a, (char**) envp);
1456                         } else if (argc > optind)
1457                                 execvpe(argv[optind], argv + optind, (char**) envp);
1458                         else {
1459                                 chdir(home ? home : "/root");
1460                                 execle("/bin/bash", "-bash", NULL, (char**) envp);
1461                         }
1462
1463                         log_error("execv() failed: %m");
1464
1465                 child_fail:
1466                         _exit(EXIT_FAILURE);
1467                 }
1468
1469                 fdset_free(fds);
1470                 fds = NULL;
1471
1472                 k = process_pty(master, &mask, arg_boot ? pid : 0, SIGRTMIN+3);
1473                 if (k < 0) {
1474                         r = EXIT_FAILURE;
1475                         break;
1476                 }
1477
1478                 putc('\n', stdout);
1479
1480                 /* Kill if it is not dead yet anyway */
1481                 terminate_machine(pid);
1482
1483                 /* Redundant, but better safe than sorry */
1484                 kill(pid, SIGKILL);
1485
1486                 k = wait_for_terminate(pid, &status);
1487                 pid = 0;
1488
1489                 if (k < 0) {
1490                         r = EXIT_FAILURE;
1491                         break;
1492                 }
1493
1494                 if (status.si_code == CLD_EXITED) {
1495                         r = status.si_status;
1496                         if (status.si_status != 0) {
1497                                 log_error("Container %s failed with error code %i.", arg_machine, status.si_status);
1498                                 break;
1499                         }
1500
1501                         log_debug("Container %s exited successfully.", arg_machine);
1502                         break;
1503                 } else if (status.si_code == CLD_KILLED &&
1504                            status.si_status == SIGINT) {
1505                         log_info("Container %s has been shut down.", arg_machine);
1506                         r = 0;
1507                         break;
1508                 } else if (status.si_code == CLD_KILLED &&
1509                            status.si_status == SIGHUP) {
1510                         log_info("Container %s is being rebooted.", arg_machine);
1511                         continue;
1512                 } else if (status.si_code == CLD_KILLED ||
1513                            status.si_code == CLD_DUMPED) {
1514
1515                         log_error("Container %s terminated by signal %s.", arg_machine,  signal_to_string(status.si_status));
1516                         r = EXIT_FAILURE;
1517                         break;
1518                 } else {
1519                         log_error("Container %s failed due to unknown reason.", arg_machine);
1520                         r = EXIT_FAILURE;
1521                         break;
1522                 }
1523         }
1524
1525 finish:
1526         if (pid > 0)
1527                 kill(pid, SIGKILL);
1528
1529         free(arg_directory);
1530         free(arg_machine);
1531
1532         return r;
1533 }