chiark / gitweb /
nspawn: do not invoke RegisterMachine on machined from inside the new PID namespace
[elogind.git] / src / nspawn / nspawn.c
1 /*-*- Mode: C; c-basic-offset: 8; indent-tabs-mode: nil -*-*/
2
3 /***
4   This file is part of systemd.
5
6   Copyright 2010 Lennart Poettering
7
8   systemd is free software; you can redistribute it and/or modify it
9   under the terms of the GNU Lesser General Public License as published by
10   the Free Software Foundation; either version 2.1 of the License, or
11   (at your option) any later version.
12
13   systemd is distributed in the hope that it will be useful, but
14   WITHOUT ANY WARRANTY; without even the implied warranty of
15   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
16   Lesser General Public License for more details.
17
18   You should have received a copy of the GNU Lesser General Public License
19   along with systemd; If not, see <http://www.gnu.org/licenses/>.
20 ***/
21
22 #include <signal.h>
23 #include <sched.h>
24 #include <unistd.h>
25 #include <sys/types.h>
26 #include <sys/syscall.h>
27 #include <sys/mount.h>
28 #include <sys/wait.h>
29 #include <stdlib.h>
30 #include <string.h>
31 #include <stdio.h>
32 #include <errno.h>
33 #include <sys/prctl.h>
34 #include <sys/capability.h>
35 #include <getopt.h>
36 #include <termios.h>
37 #include <sys/signalfd.h>
38 #include <grp.h>
39 #include <linux/fs.h>
40 #include <sys/un.h>
41 #include <sys/socket.h>
42 #include <linux/netlink.h>
43 #include <sys/eventfd.h>
44
45 #include "sd-daemon.h"
46 #include "sd-bus.h"
47 #include "sd-id128.h"
48 #include "log.h"
49 #include "util.h"
50 #include "mkdir.h"
51 #include "macro.h"
52 #include "audit.h"
53 #include "missing.h"
54 #include "cgroup-util.h"
55 #include "strv.h"
56 #include "path-util.h"
57 #include "loopback-setup.h"
58 #include "dev-setup.h"
59 #include "fdset.h"
60 #include "build.h"
61 #include "fileio.h"
62 #include "bus-util.h"
63 #include "bus-error.h"
64 #include "ptyfwd.h"
65 #include "bus-kernel.h"
66 #include "env-util.h"
67
/* Group ID passed to devpts through its "gid=" mount option in
 * mount_all(). Only defined here if the build did not already provide
 * a value. */
#ifndef TTY_GID
#define TTY_GID 5
#endif
71
/* Journal linking mode. Each value corresponds to one of the keywords
 * accepted by --link-journal= in parse_argv(). */
typedef enum LinkJournal {
        LINK_NO    = 0,  /* --link-journal=no */
        LINK_AUTO  = 1,  /* --link-journal=auto; initial value of arg_link_journal */
        LINK_HOST  = 2,  /* --link-journal=host */
        LINK_GUEST = 3   /* --link-journal=guest, also selected by -j */
} LinkJournal;
78
/* Command line state, filled in by parse_argv(). */

/* -D/--directory: container root, stored in canonicalized form. */
static char *arg_directory = NULL;
/* -u/--user: user name or uid string, stored as given. */
static char *arg_user = NULL;
/* --uuid: machine UUID for the container; all zero unless set. */
static sd_id128_t arg_uuid = {};
/* -M/--machine: machine name, checked with hostname_is_valid(); also used
 * as the hostname by setup_hostname(). */
static char *arg_machine = NULL;
/* -S/--slice: slice name; holds a strdup()ed copy once set. */
static const char *arg_slice = NULL;
/* --private-network: when set, setup_resolv_conf() does not copy resolv.conf. */
static bool arg_private_network = false;
/* --read-only */
static bool arg_read_only = false;
/* -b/--boot */
static bool arg_boot = false;
/* --link-journal=/-j: journal linking mode. */
static LinkJournal arg_link_journal = LINK_AUTO;
/* Bit mask of capabilities to retain, one bit per CAP_* value. This is the
 * default set; --capability= adds bits and --drop-capability= clears them. */
static uint64_t arg_retain =
        (1ULL << CAP_CHOWN) |
        (1ULL << CAP_DAC_OVERRIDE) |
        (1ULL << CAP_DAC_READ_SEARCH) |
        (1ULL << CAP_FOWNER) |
        (1ULL << CAP_FSETID) |
        (1ULL << CAP_IPC_OWNER) |
        (1ULL << CAP_KILL) |
        (1ULL << CAP_LEASE) |
        (1ULL << CAP_LINUX_IMMUTABLE) |
        (1ULL << CAP_NET_BIND_SERVICE) |
        (1ULL << CAP_NET_BROADCAST) |
        (1ULL << CAP_NET_RAW) |
        (1ULL << CAP_SETGID) |
        (1ULL << CAP_SETFCAP) |
        (1ULL << CAP_SETPCAP) |
        (1ULL << CAP_SETUID) |
        (1ULL << CAP_SYS_ADMIN) |
        (1ULL << CAP_SYS_CHROOT) |
        (1ULL << CAP_SYS_NICE) |
        (1ULL << CAP_SYS_PTRACE) |
        (1ULL << CAP_SYS_TTY_CONFIG) |
        (1ULL << CAP_SYS_RESOURCE) |
        (1ULL << CAP_SYS_BOOT) |
        (1ULL << CAP_AUDIT_WRITE) |
        (1ULL << CAP_AUDIT_CONTROL);
/* --bind=/--bind-ro=: flat string lists holding (source, destination)
 * pairs; each option appends two entries. */
static char **arg_bind = NULL;
static char **arg_bind_ro = NULL;
/* --setenv=: NAME=VALUE assignments. */
static char **arg_setenv = NULL;
117
/* Prints the usage summary to stdout.
 *
 * Always returns 0, which parse_argv() passes straight back to its
 * caller. */
static int help(void) {

        /* Note: -j is documented as "guest" because that is the mode
         * parse_argv() actually selects for it (LINK_GUEST). */
        printf("%s [OPTIONS...] [PATH] [ARGUMENTS...]\n\n"
               "Spawn a minimal namespace container for debugging, testing and building.\n\n"
               "  -h --help                Show this help\n"
               "     --version             Print version string\n"
               "  -D --directory=NAME      Root directory for the container\n"
               "  -b --boot                Boot up full system (i.e. invoke init)\n"
               "  -u --user=USER           Run the command under specified user or uid\n"
               "     --uuid=UUID           Set a specific machine UUID for the container\n"
               "  -M --machine=NAME        Set the machine name for the container\n"
               "  -S --slice=SLICE         Place the container in the specified slice\n"
               "     --private-network     Disable network in container\n"
               "     --read-only           Mount the root directory read-only\n"
               "     --capability=CAP      In addition to the default, retain specified\n"
               "                           capability\n"
               "     --drop-capability=CAP Drop the specified capability from the default set\n"
               "     --link-journal=MODE   Link up guest journal, one of no, auto, guest, host\n"
               "  -j                       Equivalent to --link-journal=guest\n"
               "     --bind=PATH[:PATH]    Bind mount a file or directory from the host into\n"
               "                           the container\n"
               "     --bind-ro=PATH[:PATH] Similar, but creates a read-only bind mount\n"
               "     --setenv=NAME=VALUE   Pass an environment variable to PID 1\n",
               program_invocation_short_name);

        return 0;
}
145
146 static int parse_argv(int argc, char *argv[]) {
147
148         enum {
149                 ARG_VERSION = 0x100,
150                 ARG_PRIVATE_NETWORK,
151                 ARG_UUID,
152                 ARG_READ_ONLY,
153                 ARG_CAPABILITY,
154                 ARG_DROP_CAPABILITY,
155                 ARG_LINK_JOURNAL,
156                 ARG_BIND,
157                 ARG_BIND_RO,
158                 ARG_SETENV,
159         };
160
161         static const struct option options[] = {
162                 { "help",            no_argument,       NULL, 'h'                 },
163                 { "version",         no_argument,       NULL, ARG_VERSION         },
164                 { "directory",       required_argument, NULL, 'D'                 },
165                 { "user",            required_argument, NULL, 'u'                 },
166                 { "private-network", no_argument,       NULL, ARG_PRIVATE_NETWORK },
167                 { "boot",            no_argument,       NULL, 'b'                 },
168                 { "uuid",            required_argument, NULL, ARG_UUID            },
169                 { "read-only",       no_argument,       NULL, ARG_READ_ONLY       },
170                 { "capability",      required_argument, NULL, ARG_CAPABILITY      },
171                 { "drop-capability", required_argument, NULL, ARG_DROP_CAPABILITY },
172                 { "link-journal",    required_argument, NULL, ARG_LINK_JOURNAL    },
173                 { "bind",            required_argument, NULL, ARG_BIND            },
174                 { "bind-ro",         required_argument, NULL, ARG_BIND_RO         },
175                 { "machine",         required_argument, NULL, 'M'                 },
176                 { "slice",           required_argument, NULL, 'S'                 },
177                 { "setenv",          required_argument, NULL, ARG_SETENV          },
178                 {}
179         };
180
181         int c, r;
182
183         assert(argc >= 0);
184         assert(argv);
185
186         while ((c = getopt_long(argc, argv, "+hD:u:bM:jS:", options, NULL)) >= 0) {
187
188                 switch (c) {
189
190                 case 'h':
191                         return help();
192
193                 case ARG_VERSION:
194                         puts(PACKAGE_STRING);
195                         puts(SYSTEMD_FEATURES);
196                         return 0;
197
198                 case 'D':
199                         free(arg_directory);
200                         arg_directory = canonicalize_file_name(optarg);
201                         if (!arg_directory) {
202                                 log_error("Invalid root directory: %m");
203                                 return -ENOMEM;
204                         }
205
206                         break;
207
208                 case 'u':
209                         free(arg_user);
210                         arg_user = strdup(optarg);
211                         if (!arg_user)
212                                 return log_oom();
213
214                         break;
215
216                 case ARG_PRIVATE_NETWORK:
217                         arg_private_network = true;
218                         break;
219
220                 case 'b':
221                         arg_boot = true;
222                         break;
223
224                 case ARG_UUID:
225                         r = sd_id128_from_string(optarg, &arg_uuid);
226                         if (r < 0) {
227                                 log_error("Invalid UUID: %s", optarg);
228                                 return r;
229                         }
230                         break;
231
232                 case 'S':
233                         arg_slice = strdup(optarg);
234                         if (!arg_slice)
235                                 return log_oom();
236
237                         break;
238
239                 case 'M':
240                         if (!hostname_is_valid(optarg)) {
241                                 log_error("Invalid machine name: %s", optarg);
242                                 return -EINVAL;
243                         }
244
245                         free(arg_machine);
246                         arg_machine = strdup(optarg);
247                         if (!arg_machine)
248                                 return log_oom();
249
250                         break;
251
252                 case ARG_READ_ONLY:
253                         arg_read_only = true;
254                         break;
255
256                 case ARG_CAPABILITY:
257                 case ARG_DROP_CAPABILITY: {
258                         char *state, *word;
259                         size_t length;
260
261                         FOREACH_WORD_SEPARATOR(word, length, optarg, ",", state) {
262                                 cap_value_t cap;
263                                 char *t;
264
265                                 t = strndup(word, length);
266                                 if (!t)
267                                         return log_oom();
268
269                                 if (cap_from_name(t, &cap) < 0) {
270                                         log_error("Failed to parse capability %s.", t);
271                                         free(t);
272                                         return -EINVAL;
273                                 }
274
275                                 free(t);
276
277                                 if (c == ARG_CAPABILITY)
278                                         arg_retain |= 1ULL << (uint64_t) cap;
279                                 else
280                                         arg_retain &= ~(1ULL << (uint64_t) cap);
281                         }
282
283                         break;
284                 }
285
286                 case 'j':
287                         arg_link_journal = LINK_GUEST;
288                         break;
289
290                 case ARG_LINK_JOURNAL:
291                         if (streq(optarg, "auto"))
292                                 arg_link_journal = LINK_AUTO;
293                         else if (streq(optarg, "no"))
294                                 arg_link_journal = LINK_NO;
295                         else if (streq(optarg, "guest"))
296                                 arg_link_journal = LINK_GUEST;
297                         else if (streq(optarg, "host"))
298                                 arg_link_journal = LINK_HOST;
299                         else {
300                                 log_error("Failed to parse link journal mode %s", optarg);
301                                 return -EINVAL;
302                         }
303
304                         break;
305
306                 case ARG_BIND:
307                 case ARG_BIND_RO: {
308                         _cleanup_free_ char *a = NULL, *b = NULL;
309                         char *e;
310                         char ***x;
311
312                         x = c == ARG_BIND ? &arg_bind : &arg_bind_ro;
313
314                         e = strchr(optarg, ':');
315                         if (e) {
316                                 a = strndup(optarg, e - optarg);
317                                 b = strdup(e + 1);
318                         } else {
319                                 a = strdup(optarg);
320                                 b = strdup(optarg);
321                         }
322
323                         if (!a || !b)
324                                 return log_oom();
325
326                         if (!path_is_absolute(a) || !path_is_absolute(b)) {
327                                 log_error("Invalid bind mount specification: %s", optarg);
328                                 return -EINVAL;
329                         }
330
331                         r = strv_extend(x, a);
332                         if (r < 0)
333                                 return log_oom();
334
335                         r = strv_extend(x, b);
336                         if (r < 0)
337                                 return log_oom();
338
339                         break;
340                 }
341
342                 case ARG_SETENV: {
343                         char **n;
344
345                         if (!env_assignment_is_valid(optarg)) {
346                                 log_error("Environment variable assignment '%s' is not valid.", optarg);
347                                 return -EINVAL;
348                         }
349
350                         n = strv_env_set(arg_setenv, optarg);
351                         if (!n)
352                                 return log_oom();
353
354                         strv_free(arg_setenv);
355                         arg_setenv = n;
356                         break;
357                 }
358
359                 case '?':
360                         return -EINVAL;
361
362                 default:
363                         assert_not_reached("Unhandled option");
364                 }
365         }
366
367         return 1;
368 }
369
370 static int mount_all(const char *dest) {
371
372         typedef struct MountPoint {
373                 const char *what;
374                 const char *where;
375                 const char *type;
376                 const char *options;
377                 unsigned long flags;
378                 bool fatal;
379         } MountPoint;
380
381         static const MountPoint mount_table[] = {
382                 { "proc",      "/proc",     "proc",  NULL,       MS_NOSUID|MS_NOEXEC|MS_NODEV, true  },
383                 { "/proc/sys", "/proc/sys", NULL,    NULL,       MS_BIND, true                       },   /* Bind mount first */
384                 { NULL,        "/proc/sys", NULL,    NULL,       MS_BIND|MS_RDONLY|MS_REMOUNT, true  },   /* Then, make it r/o */
385                 { "sysfs",     "/sys",      "sysfs", NULL,       MS_RDONLY|MS_NOSUID|MS_NOEXEC|MS_NODEV, true  },
386                 { "tmpfs",     "/dev",      "tmpfs", "mode=755", MS_NOSUID|MS_STRICTATIME,     true  },
387                 { "devpts",    "/dev/pts",  "devpts","newinstance,ptmxmode=0666,mode=620,gid=" STRINGIFY(TTY_GID), MS_NOSUID|MS_NOEXEC, true },
388                 { "tmpfs",     "/dev/shm",  "tmpfs", "mode=1777", MS_NOSUID|MS_NODEV|MS_STRICTATIME, true  },
389                 { "tmpfs",     "/run",      "tmpfs", "mode=755", MS_NOSUID|MS_NODEV|MS_STRICTATIME, true  },
390 #ifdef HAVE_SELINUX
391                 { "/sys/fs/selinux", "/sys/fs/selinux", NULL, NULL, MS_BIND,                      false },  /* Bind mount first */
392                 { NULL,              "/sys/fs/selinux", NULL, NULL, MS_BIND|MS_RDONLY|MS_REMOUNT, false },  /* Then, make it r/o */
393 #endif
394         };
395
396         unsigned k;
397         int r = 0;
398
399         for (k = 0; k < ELEMENTSOF(mount_table); k++) {
400                 _cleanup_free_ char *where = NULL;
401                 int t;
402
403                 where = strjoin(dest, "/", mount_table[k].where, NULL);
404                 if (!where)
405                         return log_oom();
406
407                 t = path_is_mount_point(where, true);
408                 if (t < 0) {
409                         log_error("Failed to detect whether %s is a mount point: %s", where, strerror(-t));
410
411                         if (r == 0)
412                                 r = t;
413
414                         continue;
415                 }
416
417                 /* Skip this entry if it is not a remount. */
418                 if (mount_table[k].what && t > 0)
419                         continue;
420
421                 mkdir_p(where, 0755);
422
423                 if (mount(mount_table[k].what,
424                           where,
425                           mount_table[k].type,
426                           mount_table[k].flags,
427                           mount_table[k].options) < 0 &&
428                     mount_table[k].fatal) {
429
430                         log_error("mount(%s) failed: %m", where);
431
432                         if (r == 0)
433                                 r = -errno;
434                 }
435         }
436
437         return r;
438 }
439
440 static int mount_binds(const char *dest, char **l, unsigned long flags) {
441         char **x, **y;
442
443         STRV_FOREACH_PAIR(x, y, l) {
444                 char *where;
445                 struct stat source_st, dest_st;
446                 int r;
447
448                 if (stat(*x, &source_st) < 0) {
449                         log_error("failed to stat %s: %m", *x);
450                         return -errno;
451                 }
452
453                 where = strappenda(dest, *y);
454                 r = stat(where, &dest_st);
455                 if (r == 0) {
456                         if ((source_st.st_mode & S_IFMT) != (dest_st.st_mode & S_IFMT)) {
457                                 log_error("The file types of %s and %s do not match. Refusing bind mount",
458                                                 *x, where);
459                                 return -EINVAL;
460                         }
461                 } else if (errno == ENOENT) {
462                         r = mkdir_parents_label(where, 0755);
463                         if (r < 0) {
464                                 log_error("Failed to bind mount %s: %s", *x, strerror(-r));
465                                 return r;
466                         }
467                 } else {
468                         log_error("Failed to bind mount %s: %s", *x, strerror(errno));
469                         return -errno;
470                 }
471                 /* Create the mount point, but be conservative -- refuse to create block
472                 * and char devices. */
473                 if (S_ISDIR(source_st.st_mode))
474                         mkdir_label(where, 0755);
475                 else if (S_ISFIFO(source_st.st_mode))
476                         mkfifo(where, 0644);
477                 else if (S_ISSOCK(source_st.st_mode))
478                         mknod(where, 0644 | S_IFSOCK, 0);
479                 else if (S_ISREG(source_st.st_mode))
480                         touch(where);
481                 else {
482                         log_error("Refusing to create mountpoint for file: %s", *x);
483                         return -ENOTSUP;
484                 }
485
486                 if (mount(*x, where, "bind", MS_BIND, NULL) < 0) {
487                         log_error("mount(%s) failed: %m", where);
488                         return -errno;
489                 }
490
491                 if (flags && mount(NULL, where, NULL, MS_REMOUNT|MS_BIND|flags, NULL) < 0) {
492                         log_error("mount(%s) failed: %m", where);
493                         return -errno;
494                 }
495         }
496
497         return 0;
498 }
499
500 static int setup_timezone(const char *dest) {
501         _cleanup_free_ char *where = NULL, *p = NULL, *q = NULL, *check = NULL, *what = NULL;
502         char *z, *y;
503         int r;
504
505         assert(dest);
506
507         /* Fix the timezone, if possible */
508         r = readlink_malloc("/etc/localtime", &p);
509         if (r < 0) {
510                 log_warning("/etc/localtime is not a symlink, not updating container timezone.");
511                 return 0;
512         }
513
514         z = path_startswith(p, "../usr/share/zoneinfo/");
515         if (!z)
516                 z = path_startswith(p, "/usr/share/zoneinfo/");
517         if (!z) {
518                 log_warning("/etc/localtime does not point into /usr/share/zoneinfo/, not updating container timezone.");
519                 return 0;
520         }
521
522         where = strappend(dest, "/etc/localtime");
523         if (!where)
524                 return log_oom();
525
526         r = readlink_malloc(where, &q);
527         if (r >= 0) {
528                 y = path_startswith(q, "../usr/share/zoneinfo/");
529                 if (!y)
530                         y = path_startswith(q, "/usr/share/zoneinfo/");
531
532
533                 /* Already pointing to the right place? Then do nothing .. */
534                 if (y && streq(y, z))
535                         return 0;
536         }
537
538         check = strjoin(dest, "/usr/share/zoneinfo/", z, NULL);
539         if (!check)
540                 return log_oom();
541
542         if (access(check, F_OK) < 0) {
543                 log_warning("Timezone %s does not exist in container, not updating container timezone.", z);
544                 return 0;
545         }
546
547         what = strappend("../usr/share/zoneinfo/", z);
548         if (!what)
549                 return log_oom();
550
551         unlink(where);
552         if (symlink(what, where) < 0) {
553                 log_error("Failed to correct timezone of container: %m");
554                 return 0;
555         }
556
557         return 0;
558 }
559
560 static int setup_resolv_conf(const char *dest) {
561         char _cleanup_free_ *where = NULL;
562
563         assert(dest);
564
565         if (arg_private_network)
566                 return 0;
567
568         /* Fix resolv.conf, if possible */
569         where = strappend(dest, "/etc/resolv.conf");
570         if (!where)
571                 return log_oom();
572
573         /* We don't really care for the results of this really. If it
574          * fails, it fails, but meh... */
575         copy_file("/etc/resolv.conf", where, O_TRUNC|O_NOFOLLOW);
576
577         return 0;
578 }
579
580 static int setup_boot_id(const char *dest) {
581         _cleanup_free_ char *from = NULL, *to = NULL;
582         sd_id128_t rnd;
583         char as_uuid[37];
584         int r;
585
586         assert(dest);
587
588         /* Generate a new randomized boot ID, so that each boot-up of
589          * the container gets a new one */
590
591         from = strappend(dest, "/dev/proc-sys-kernel-random-boot-id");
592         to = strappend(dest, "/proc/sys/kernel/random/boot_id");
593         if (!from || !to)
594                 return log_oom();
595
596         r = sd_id128_randomize(&rnd);
597         if (r < 0) {
598                 log_error("Failed to generate random boot id: %s", strerror(-r));
599                 return r;
600         }
601
602         snprintf(as_uuid, sizeof(as_uuid),
603                  "%02x%02x%02x%02x-%02x%02x-%02x%02x-%02x%02x-%02x%02x%02x%02x%02x%02x",
604                  SD_ID128_FORMAT_VAL(rnd));
605         char_array_0(as_uuid);
606
607         r = write_string_file(from, as_uuid);
608         if (r < 0) {
609                 log_error("Failed to write boot id: %s", strerror(-r));
610                 return r;
611         }
612
613         if (mount(from, to, "bind", MS_BIND, NULL) < 0) {
614                 log_error("Failed to bind mount boot id: %m");
615                 r = -errno;
616         } else if (mount(from, to, "bind", MS_BIND|MS_REMOUNT|MS_RDONLY, NULL))
617                 log_warning("Failed to make boot id read-only: %m");
618
619         unlink(from);
620         return r;
621 }
622
623 static int copy_devnodes(const char *dest) {
624
625         static const char devnodes[] =
626                 "null\0"
627                 "zero\0"
628                 "full\0"
629                 "random\0"
630                 "urandom\0"
631                 "tty\0";
632
633         const char *d;
634         int r = 0;
635         _cleanup_umask_ mode_t u;
636
637         assert(dest);
638
639         u = umask(0000);
640
641         NULSTR_FOREACH(d, devnodes) {
642                 struct stat st;
643                 _cleanup_free_ char *from = NULL, *to = NULL;
644
645                 asprintf(&from, "/dev/%s", d);
646                 asprintf(&to, "%s/dev/%s", dest, d);
647
648                 if (!from || !to) {
649                         log_oom();
650
651                         if (r == 0)
652                                 r = -ENOMEM;
653
654                         break;
655                 }
656
657                 if (stat(from, &st) < 0) {
658
659                         if (errno != ENOENT) {
660                                 log_error("Failed to stat %s: %m", from);
661                                 if (r == 0)
662                                         r = -errno;
663                         }
664
665                 } else if (!S_ISCHR(st.st_mode) && !S_ISBLK(st.st_mode)) {
666
667                         log_error("%s is not a char or block device, cannot copy", from);
668                         if (r == 0)
669                                 r = -EIO;
670
671                 } else if (mknod(to, st.st_mode, st.st_rdev) < 0) {
672
673                         log_error("mknod(%s) failed: %m", dest);
674                         if (r == 0)
675                                 r = -errno;
676                 }
677         }
678
679         return r;
680 }
681
682 static int setup_ptmx(const char *dest) {
683         _cleanup_free_ char *p = NULL;
684
685         p = strappend(dest, "/dev/ptmx");
686         if (!p)
687                 return log_oom();
688
689         if (symlink("pts/ptmx", p) < 0) {
690                 log_error("Failed to create /dev/ptmx symlink: %m");
691                 return -errno;
692         }
693
694         return 0;
695 }
696
697 static int setup_dev_console(const char *dest, const char *console) {
698         struct stat st;
699         _cleanup_free_ char *to = NULL;
700         int r;
701         _cleanup_umask_ mode_t u;
702
703         assert(dest);
704         assert(console);
705
706         u = umask(0000);
707
708         if (stat(console, &st) < 0) {
709                 log_error("Failed to stat %s: %m", console);
710                 return -errno;
711
712         } else if (!S_ISCHR(st.st_mode)) {
713                 log_error("/dev/console is not a char device");
714                 return -EIO;
715         }
716
717         r = chmod_and_chown(console, 0600, 0, 0);
718         if (r < 0) {
719                 log_error("Failed to correct access mode for TTY: %s", strerror(-r));
720                 return r;
721         }
722
723         if (asprintf(&to, "%s/dev/console", dest) < 0)
724                 return log_oom();
725
726         /* We need to bind mount the right tty to /dev/console since
727          * ptys can only exist on pts file systems. To have something
728          * to bind mount things on we create a device node first, that
729          * has the right major/minor (note that the major minor
730          * doesn't actually matter here, since we mount it over
731          * anyway). */
732
733         if (mknod(to, (st.st_mode & ~07777) | 0600, st.st_rdev) < 0) {
734                 log_error("mknod() for /dev/console failed: %m");
735                 return -errno;
736         }
737
738         if (mount(console, to, "bind", MS_BIND, NULL) < 0) {
739                 log_error("Bind mount for /dev/console failed: %m");
740                 return -errno;
741         }
742
743         return 0;
744 }
745
746 static int setup_kmsg(const char *dest, int kmsg_socket) {
747         _cleanup_free_ char *from = NULL, *to = NULL;
748         int r, fd, k;
749         _cleanup_umask_ mode_t u;
750         union {
751                 struct cmsghdr cmsghdr;
752                 uint8_t buf[CMSG_SPACE(sizeof(int))];
753         } control = {};
754         struct msghdr mh = {
755                 .msg_control = &control,
756                 .msg_controllen = sizeof(control),
757         };
758         struct cmsghdr *cmsg;
759
760         assert(dest);
761         assert(kmsg_socket >= 0);
762
763         u = umask(0000);
764
765         /* We create the kmsg FIFO as /dev/kmsg, but immediately
766          * delete it after bind mounting it to /proc/kmsg. While FIFOs
767          * on the reading side behave very similar to /proc/kmsg,
768          * their writing side behaves differently from /dev/kmsg in
769          * that writing blocks when nothing is reading. In order to
770          * avoid any problems with containers deadlocking due to this
771          * we simply make /dev/kmsg unavailable to the container. */
772         if (asprintf(&from, "%s/dev/kmsg", dest) < 0 ||
773             asprintf(&to, "%s/proc/kmsg", dest) < 0)
774                 return log_oom();
775
776         if (mkfifo(from, 0600) < 0) {
777                 log_error("mkfifo() for /dev/kmsg failed: %m");
778                 return -errno;
779         }
780
781         r = chmod_and_chown(from, 0600, 0, 0);
782         if (r < 0) {
783                 log_error("Failed to correct access mode for /dev/kmsg: %s", strerror(-r));
784                 return r;
785         }
786
787         if (mount(from, to, "bind", MS_BIND, NULL) < 0) {
788                 log_error("Bind mount for /proc/kmsg failed: %m");
789                 return -errno;
790         }
791
792         fd = open(from, O_RDWR|O_NDELAY|O_CLOEXEC);
793         if (fd < 0) {
794                 log_error("Failed to open fifo: %m");
795                 return -errno;
796         }
797
798         cmsg = CMSG_FIRSTHDR(&mh);
799         cmsg->cmsg_level = SOL_SOCKET;
800         cmsg->cmsg_type = SCM_RIGHTS;
801         cmsg->cmsg_len = CMSG_LEN(sizeof(int));
802         memcpy(CMSG_DATA(cmsg), &fd, sizeof(int));
803
804         mh.msg_controllen = cmsg->cmsg_len;
805
806         /* Store away the fd in the socket, so that it stays open as
807          * long as we run the child */
808         k = sendmsg(kmsg_socket, &mh, MSG_DONTWAIT|MSG_NOSIGNAL);
809         close_nointr_nofail(fd);
810
811         if (k < 0) {
812                 log_error("Failed to send FIFO fd: %m");
813                 return -errno;
814         }
815
816         /* And now make the FIFO unavailable as /dev/kmsg... */
817         unlink(from);
818         return 0;
819 }
820
821 static int setup_hostname(void) {
822
823         if (sethostname(arg_machine, strlen(arg_machine)) < 0)
824                 return -errno;
825
826         return 0;
827 }
828
829 static int setup_journal(const char *directory) {
830         sd_id128_t machine_id, this_id;
831         _cleanup_free_ char *p = NULL, *b = NULL, *q = NULL, *d = NULL;
832         char *id;
833         int r;
834
835         p = strappend(directory, "/etc/machine-id");
836         if (!p)
837                 return log_oom();
838
839         r = read_one_line_file(p, &b);
840         if (r == -ENOENT && arg_link_journal == LINK_AUTO)
841                 return 0;
842         else if (r < 0) {
843                 log_error("Failed to read machine ID from %s: %s", p, strerror(-r));
844                 return r;
845         }
846
847         id = strstrip(b);
848         if (isempty(id) && arg_link_journal == LINK_AUTO)
849                 return 0;
850
851         /* Verify validity */
852         r = sd_id128_from_string(id, &machine_id);
853         if (r < 0) {
854                 log_error("Failed to parse machine ID from %s: %s", p, strerror(-r));
855                 return r;
856         }
857
858         r = sd_id128_get_machine(&this_id);
859         if (r < 0) {
860                 log_error("Failed to retrieve machine ID: %s", strerror(-r));
861                 return r;
862         }
863
864         if (sd_id128_equal(machine_id, this_id)) {
865                 log_full(arg_link_journal == LINK_AUTO ? LOG_WARNING : LOG_ERR,
866                          "Host and machine ids are equal (%s): refusing to link journals", id);
867                 if (arg_link_journal == LINK_AUTO)
868                         return 0;
869                 return
870                         -EEXIST;
871         }
872
873         if (arg_link_journal == LINK_NO)
874                 return 0;
875
876         free(p);
877         p = strappend("/var/log/journal/", id);
878         q = strjoin(directory, "/var/log/journal/", id, NULL);
879         if (!p || !q)
880                 return log_oom();
881
882         if (path_is_mount_point(p, false) > 0) {
883                 if (arg_link_journal != LINK_AUTO) {
884                         log_error("%s: already a mount point, refusing to use for journal", p);
885                         return -EEXIST;
886                 }
887
888                 return 0;
889         }
890
891         if (path_is_mount_point(q, false) > 0) {
892                 if (arg_link_journal != LINK_AUTO) {
893                         log_error("%s: already a mount point, refusing to use for journal", q);
894                         return -EEXIST;
895                 }
896
897                 return 0;
898         }
899
900         r = readlink_and_make_absolute(p, &d);
901         if (r >= 0) {
902                 if ((arg_link_journal == LINK_GUEST ||
903                      arg_link_journal == LINK_AUTO) &&
904                     path_equal(d, q)) {
905
906                         r = mkdir_p(q, 0755);
907                         if (r < 0)
908                                 log_warning("failed to create directory %s: %m", q);
909                         return 0;
910                 }
911
912                 if (unlink(p) < 0) {
913                         log_error("Failed to remove symlink %s: %m", p);
914                         return -errno;
915                 }
916         } else if (r == -EINVAL) {
917
918                 if (arg_link_journal == LINK_GUEST &&
919                     rmdir(p) < 0) {
920
921                         if (errno == ENOTDIR) {
922                                 log_error("%s already exists and is neither a symlink nor a directory", p);
923                                 return r;
924                         } else {
925                                 log_error("Failed to remove %s: %m", p);
926                                 return -errno;
927                         }
928                 }
929         } else if (r != -ENOENT) {
930                 log_error("readlink(%s) failed: %m", p);
931                 return r;
932         }
933
934         if (arg_link_journal == LINK_GUEST) {
935
936                 if (symlink(q, p) < 0) {
937                         log_error("Failed to symlink %s to %s: %m", q, p);
938                         return -errno;
939                 }
940
941                 r = mkdir_p(q, 0755);
942                 if (r < 0)
943                         log_warning("failed to create directory %s: %m", q);
944                 return 0;
945         }
946
947         if (arg_link_journal == LINK_HOST) {
948                 r = mkdir_p(p, 0755);
949                 if (r < 0) {
950                         log_error("Failed to create %s: %m", p);
951                         return r;
952                 }
953
954         } else if (access(p, F_OK) < 0)
955                 return 0;
956
957         if (dir_is_empty(q) == 0) {
958                 log_error("%s not empty.", q);
959                 return -ENOTEMPTY;
960         }
961
962         r = mkdir_p(q, 0755);
963         if (r < 0) {
964                 log_error("Failed to create %s: %m", q);
965                 return r;
966         }
967
968         if (mount(p, q, "bind", MS_BIND, NULL) < 0) {
969                 log_error("Failed to bind mount journal from host into guest: %m");
970                 return -errno;
971         }
972
973         return 0;
974 }
975
976 static int setup_kdbus(const char *dest, const char *path) {
977         const char *p;
978
979         if (!path)
980                 return 0;
981
982         p = strappenda(dest, "/dev/kdbus");
983         if (mkdir(p, 0755) < 0) {
984                 log_error("Failed to create kdbus path: %m");
985                 return  -errno;
986         }
987
988         if (mount(path, p, "bind", MS_BIND, NULL) < 0) {
989                 log_error("Failed to mount kdbus namespace path: %m");
990                 return -errno;
991         }
992
993         return 0;
994 }
995
996 static int drop_capabilities(void) {
997         return capability_bounding_set_drop(~arg_retain, false);
998 }
999
1000 static int register_machine(pid_t pid) {
1001         _cleanup_bus_error_free_ sd_bus_error error = SD_BUS_ERROR_NULL;
1002         _cleanup_bus_unref_ sd_bus *bus = NULL;
1003         int r;
1004
1005         r = sd_bus_open_system(&bus);
1006         if (r < 0) {
1007                 log_error("Failed to open system bus: %s", strerror(-r));
1008                 return r;
1009         }
1010
1011         r = sd_bus_call_method(
1012                         bus,
1013                         "org.freedesktop.machine1",
1014                         "/org/freedesktop/machine1",
1015                         "org.freedesktop.machine1.Manager",
1016                         "CreateMachine",
1017                         &error,
1018                         NULL,
1019                         "sayssusa(sv)",
1020                         arg_machine,
1021                         SD_BUS_MESSAGE_APPEND_ID128(arg_uuid),
1022                         "nspawn",
1023                         "container",
1024                         (uint32_t) pid,
1025                         strempty(arg_directory),
1026                         !isempty(arg_slice), "Slice", "s", arg_slice);
1027         if (r < 0) {
1028                 log_error("Failed to register machine: %s", bus_error_message(&error, r));
1029                 return r;
1030         }
1031
1032         return 0;
1033 }
1034
1035 static int terminate_machine(pid_t pid) {
1036         _cleanup_bus_error_free_ sd_bus_error error = SD_BUS_ERROR_NULL;
1037         _cleanup_bus_message_unref_ sd_bus_message *reply = NULL;
1038         _cleanup_bus_unref_ sd_bus *bus = NULL;
1039         const char *path;
1040         int r;
1041
1042         r = sd_bus_default_system(&bus);
1043         if (r < 0) {
1044                 log_error("Failed to open system bus: %s", strerror(-r));
1045                 return r;
1046         }
1047
1048         r = sd_bus_call_method(
1049                         bus,
1050                         "org.freedesktop.machine1",
1051                         "/org/freedesktop/machine1",
1052                         "org.freedesktop.machine1.Manager",
1053                         "GetMachineByPID",
1054                         &error,
1055                         &reply,
1056                         "u",
1057                         (uint32_t) pid);
1058         if (r < 0) {
1059                 /* Note that the machine might already have been
1060                  * cleaned up automatically, hence don't consider it a
1061                  * failure if we cannot get the machine object. */
1062                 log_debug("Failed to get machine: %s", bus_error_message(&error, r));
1063                 return 0;
1064         }
1065
1066         r = sd_bus_message_read(reply, "o", &path);
1067         if (r < 0)
1068                 return bus_log_parse_error(r);
1069
1070         r = sd_bus_call_method(
1071                         bus,
1072                         "org.freedesktop.machine1",
1073                         path,
1074                         "org.freedesktop.machine1.Machine",
1075                         "Terminate",
1076                         &error,
1077                         NULL,
1078                         NULL);
1079         if (r < 0) {
1080                 log_debug("Failed to terminate machine: %s", bus_error_message(&error, r));
1081                 return 0;
1082         }
1083
1084         return 0;
1085 }
1086
/* Probes for the kernel audit subsystem by trying to create a
 * NETLINK_AUDIT socket. Returns true if the socket could be created
 * (it is closed again right away), false otherwise. */
static bool audit_enabled(void) {
        int probe;

        probe = socket(AF_NETLINK, SOCK_RAW, NETLINK_AUDIT);
        if (probe < 0)
                return false;

        close_nointr_nofail(probe);
        return true;
}
1097
1098 int main(int argc, char *argv[]) {
1099         pid_t pid = 0;
1100         int r = EXIT_FAILURE, k;
1101         _cleanup_close_ int master = -1, kdbus_fd = -1, sync_fd = -1;
1102         int n_fd_passed;
1103         const char *console = NULL;
1104         sigset_t mask;
1105         _cleanup_close_pipe_ int kmsg_socket_pair[2] = { -1, -1 };
1106         _cleanup_fdset_free_ FDSet *fds = NULL;
1107         _cleanup_free_ char *kdbus_namespace = NULL;
1108         const char *ns;
1109
1110         log_parse_environment();
1111         log_open();
1112
1113         k = parse_argv(argc, argv);
1114         if (k < 0)
1115                 goto finish;
1116         else if (k == 0) {
1117                 r = EXIT_SUCCESS;
1118                 goto finish;
1119         }
1120
1121         if (arg_directory) {
1122                 char *p;
1123
1124                 p = path_make_absolute_cwd(arg_directory);
1125                 free(arg_directory);
1126                 arg_directory = p;
1127         } else
1128                 arg_directory = get_current_dir_name();
1129
1130         if (!arg_directory) {
1131                 log_error("Failed to determine path, please use -D.");
1132                 goto finish;
1133         }
1134
1135         path_kill_slashes(arg_directory);
1136
1137         if (!arg_machine) {
1138                 arg_machine = strdup(basename(arg_directory));
1139                 if (!arg_machine) {
1140                         log_oom();
1141                         goto finish;
1142                 }
1143
1144                 hostname_cleanup(arg_machine, false);
1145                 if (isempty(arg_machine)) {
1146                         log_error("Failed to determine machine name automatically, please use -M.");
1147                         goto finish;
1148                 }
1149         }
1150
1151         if (geteuid() != 0) {
1152                 log_error("Need to be root.");
1153                 goto finish;
1154         }
1155
1156         if (sd_booted() <= 0) {
1157                 log_error("Not running on a systemd system.");
1158                 goto finish;
1159         }
1160
1161         if (arg_boot && audit_enabled()) {
1162                 log_warning("The kernel auditing subsystem is known to be incompatible with containers.\n"
1163                             "Please make sure to turn off auditing with 'audit=0' on the kernel command\n"
1164                             "line before using systemd-nspawn. Sleeping for 5s...\n");
1165                 sleep(5);
1166         }
1167
1168         if (path_equal(arg_directory, "/")) {
1169                 log_error("Spawning container on root directory not supported.");
1170                 goto finish;
1171         }
1172
1173         if (path_is_os_tree(arg_directory) <= 0) {
1174                 log_error("Directory %s doesn't look like an OS root directory (/etc/os-release is missing). Refusing.", arg_directory);
1175                 goto finish;
1176         }
1177
1178         log_close();
1179         n_fd_passed = sd_listen_fds(false);
1180         if (n_fd_passed > 0) {
1181                 k = fdset_new_listen_fds(&fds, false);
1182                 if (k < 0) {
1183                         log_error("Failed to collect file descriptors: %s", strerror(-k));
1184                         goto finish;
1185                 }
1186         }
1187         fdset_close_others(fds);
1188         log_open();
1189
1190         master = posix_openpt(O_RDWR|O_NOCTTY|O_CLOEXEC|O_NDELAY);
1191         if (master < 0) {
1192                 log_error("Failed to acquire pseudo tty: %m");
1193                 goto finish;
1194         }
1195
1196         console = ptsname(master);
1197         if (!console) {
1198                 log_error("Failed to determine tty name: %m");
1199                 goto finish;
1200         }
1201
1202         log_info("Spawning container %s on %s. Press ^] three times within 1s to abort execution.", arg_machine, arg_directory);
1203
1204         if (unlockpt(master) < 0) {
1205                 log_error("Failed to unlock tty: %m");
1206                 goto finish;
1207         }
1208
1209         ns = strappenda("machine-", arg_machine);
1210         kdbus_fd = bus_kernel_create_namespace(ns, &kdbus_namespace);
1211         if (r < 0)
1212                 log_debug("Failed to create kdbus namespace: %s", strerror(-r));
1213         else
1214                 log_debug("Successfully created kdbus namespace as %s", kdbus_namespace);
1215
1216         if (socketpair(AF_UNIX, SOCK_DGRAM|SOCK_NONBLOCK|SOCK_CLOEXEC, 0, kmsg_socket_pair) < 0) {
1217                 log_error("Failed to create kmsg socket pair: %m");
1218                 goto finish;
1219         }
1220
1221         sync_fd = eventfd(0, EFD_CLOEXEC);
1222         if (sync_fd < 0) {
1223                 log_error("Failed to create event fd: %m");
1224                 goto finish;
1225         }
1226
1227         sd_notify(0, "READY=1");
1228
1229         assert_se(sigemptyset(&mask) == 0);
1230         sigset_add_many(&mask, SIGCHLD, SIGWINCH, SIGTERM, SIGINT, -1);
1231         assert_se(sigprocmask(SIG_BLOCK, &mask, NULL) == 0);
1232
1233         for (;;) {
1234                 siginfo_t status;
1235
1236                 pid = syscall(__NR_clone, SIGCHLD|CLONE_NEWIPC|CLONE_NEWNS|CLONE_NEWPID|CLONE_NEWUTS|(arg_private_network ? CLONE_NEWNET : 0), NULL);
1237                 if (pid < 0) {
1238                         if (errno == EINVAL)
1239                                 log_error("clone() failed, do you have namespace support enabled in your kernel? (You need UTS, IPC, PID and NET namespacing built in): %m");
1240                         else
1241                                 log_error("clone() failed: %m");
1242
1243                         goto finish;
1244                 }
1245
1246                 if (pid == 0) {
1247                         /* child */
1248                         const char *home = NULL;
1249                         uid_t uid = (uid_t) -1;
1250                         gid_t gid = (gid_t) -1;
1251                         unsigned n_env = 2;
1252                         const char *envp[] = {
1253                                 "PATH=" DEFAULT_PATH_SPLIT_USR,
1254                                 "container=systemd-nspawn", /* LXC sets container=lxc, so follow the scheme here */
1255                                 NULL, /* TERM */
1256                                 NULL, /* HOME */
1257                                 NULL, /* USER */
1258                                 NULL, /* LOGNAME */
1259                                 NULL, /* container_uuid */
1260                                 NULL, /* LISTEN_FDS */
1261                                 NULL, /* LISTEN_PID */
1262                                 NULL
1263                         };
1264                         char **env_use;
1265                         eventfd_t x;
1266
1267                         envp[n_env] = strv_find_prefix(environ, "TERM=");
1268                         if (envp[n_env])
1269                                 n_env ++;
1270
1271                         close_nointr_nofail(master);
1272                         master = -1;
1273
1274                         close_nointr(STDIN_FILENO);
1275                         close_nointr(STDOUT_FILENO);
1276                         close_nointr(STDERR_FILENO);
1277
1278                         close_nointr_nofail(kmsg_socket_pair[0]);
1279                         kmsg_socket_pair[0] = -1;
1280
1281                         reset_all_signal_handlers();
1282
1283                         assert_se(sigemptyset(&mask) == 0);
1284                         assert_se(sigprocmask(SIG_SETMASK, &mask, NULL) == 0);
1285
1286                         k = open_terminal(console, O_RDWR);
1287                         if (k != STDIN_FILENO) {
1288                                 if (k >= 0) {
1289                                         close_nointr_nofail(k);
1290                                         k = -EINVAL;
1291                                 }
1292
1293                                 log_error("Failed to open console: %s", strerror(-k));
1294                                 goto child_fail;
1295                         }
1296
1297                         if (dup2(STDIN_FILENO, STDOUT_FILENO) != STDOUT_FILENO ||
1298                             dup2(STDIN_FILENO, STDERR_FILENO) != STDERR_FILENO) {
1299                                 log_error("Failed to duplicate console: %m");
1300                                 goto child_fail;
1301                         }
1302
1303                         if (setsid() < 0) {
1304                                 log_error("setsid() failed: %m");
1305                                 goto child_fail;
1306                         }
1307
1308                         if (prctl(PR_SET_PDEATHSIG, SIGKILL) < 0) {
1309                                 log_error("PR_SET_PDEATHSIG failed: %m");
1310                                 goto child_fail;
1311                         }
1312
1313                         /* Mark everything as slave, so that we still
1314                          * receive mounts from the real root, but don't
1315                          * propagate mounts to the real root. */
1316                         if (mount(NULL, "/", NULL, MS_SLAVE|MS_REC, NULL) < 0) {
1317                                 log_error("MS_SLAVE|MS_REC failed: %m");
1318                                 goto child_fail;
1319                         }
1320
1321                         /* Turn directory into bind mount */
1322                         if (mount(arg_directory, arg_directory, "bind", MS_BIND|MS_REC, NULL) < 0) {
1323                                 log_error("Failed to make bind mount.");
1324                                 goto child_fail;
1325                         }
1326
1327                         if (arg_read_only)
1328                                 if (mount(arg_directory, arg_directory, "bind", MS_BIND|MS_REMOUNT|MS_RDONLY|MS_REC, NULL) < 0) {
1329                                         log_error("Failed to make read-only.");
1330                                         goto child_fail;
1331                                 }
1332
1333                         if (mount_all(arg_directory) < 0)
1334                                 goto child_fail;
1335
1336                         if (copy_devnodes(arg_directory) < 0)
1337                                 goto child_fail;
1338
1339                         if (setup_ptmx(arg_directory) < 0)
1340                                 goto child_fail;
1341
1342                         dev_setup(arg_directory);
1343
1344                         if (setup_dev_console(arg_directory, console) < 0)
1345                                 goto child_fail;
1346
1347                         if (setup_kmsg(arg_directory, kmsg_socket_pair[1]) < 0)
1348                                 goto child_fail;
1349
1350                         close_nointr_nofail(kmsg_socket_pair[1]);
1351                         kmsg_socket_pair[1] = -1;
1352
1353                         if (setup_boot_id(arg_directory) < 0)
1354                                 goto child_fail;
1355
1356                         if (setup_timezone(arg_directory) < 0)
1357                                 goto child_fail;
1358
1359                         if (setup_resolv_conf(arg_directory) < 0)
1360                                 goto child_fail;
1361
1362                         if (setup_journal(arg_directory) < 0)
1363                                 goto child_fail;
1364
1365                         if (mount_binds(arg_directory, arg_bind, 0) < 0)
1366                                 goto child_fail;
1367
1368                         if (mount_binds(arg_directory, arg_bind_ro, MS_RDONLY) < 0)
1369                                 goto child_fail;
1370
1371                         if (setup_kdbus(arg_directory, kdbus_namespace) < 0)
1372                                 goto child_fail;
1373
1374                         if (chdir(arg_directory) < 0) {
1375                                 log_error("chdir(%s) failed: %m", arg_directory);
1376                                 goto child_fail;
1377                         }
1378
1379                         if (mount(arg_directory, "/", NULL, MS_MOVE, NULL) < 0) {
1380                                 log_error("mount(MS_MOVE) failed: %m");
1381                                 goto child_fail;
1382                         }
1383
1384                         if (chroot(".") < 0) {
1385                                 log_error("chroot() failed: %m");
1386                                 goto child_fail;
1387                         }
1388
1389                         if (chdir("/") < 0) {
1390                                 log_error("chdir() failed: %m");
1391                                 goto child_fail;
1392                         }
1393
1394                         umask(0022);
1395
1396                         loopback_setup();
1397
1398                         if (drop_capabilities() < 0) {
1399                                 log_error("drop_capabilities() failed: %m");
1400                                 goto child_fail;
1401                         }
1402
1403                         if (arg_user) {
1404
1405                                 /* Note that this resolves user names
1406                                  * inside the container, and hence
1407                                  * accesses the NSS modules from the
1408                                  * container and not the host. This is
1409                                  * a bit weird... */
1410
1411                                 if (get_user_creds((const char**)&arg_user, &uid, &gid, &home, NULL) < 0) {
1412                                         log_error("get_user_creds() failed: %m");
1413                                         goto child_fail;
1414                                 }
1415
1416                                 if (mkdir_parents_label(home, 0775) < 0) {
1417                                         log_error("mkdir_parents_label() failed: %m");
1418                                         goto child_fail;
1419                                 }
1420
1421                                 if (mkdir_safe_label(home, 0775, uid, gid) < 0) {
1422                                         log_error("mkdir_safe_label() failed: %m");
1423                                         goto child_fail;
1424                                 }
1425
1426                                 if (initgroups((const char*)arg_user, gid) < 0) {
1427                                         log_error("initgroups() failed: %m");
1428                                         goto child_fail;
1429                                 }
1430
1431                                 if (setresgid(gid, gid, gid) < 0) {
1432                                         log_error("setregid() failed: %m");
1433                                         goto child_fail;
1434                                 }
1435
1436                                 if (setresuid(uid, uid, uid) < 0) {
1437                                         log_error("setreuid() failed: %m");
1438                                         goto child_fail;
1439                                 }
1440                         } else {
1441                                 /* Reset everything fully to 0, just in case */
1442
1443                                 if (setgroups(0, NULL) < 0) {
1444                                         log_error("setgroups() failed: %m");
1445                                         goto child_fail;
1446                                 }
1447
1448                                 if (setresgid(0, 0, 0) < 0) {
1449                                         log_error("setregid() failed: %m");
1450                                         goto child_fail;
1451                                 }
1452
1453                                 if (setresuid(0, 0, 0) < 0) {
1454                                         log_error("setreuid() failed: %m");
1455                                         goto child_fail;
1456                                 }
1457                         }
1458
1459                         if ((asprintf((char**)(envp + n_env++), "HOME=%s", home ? home: "/root") < 0) ||
1460                             (asprintf((char**)(envp + n_env++), "USER=%s", arg_user ? arg_user : "root") < 0) ||
1461                             (asprintf((char**)(envp + n_env++), "LOGNAME=%s", arg_user ? arg_user : "root") < 0)) {
1462                                 log_oom();
1463                                 goto child_fail;
1464                         }
1465
1466                         if (!sd_id128_equal(arg_uuid, SD_ID128_NULL)) {
1467                                 if (asprintf((char**)(envp + n_env++), "container_uuid=" SD_ID128_FORMAT_STR, SD_ID128_FORMAT_VAL(arg_uuid)) < 0) {
1468                                         log_oom();
1469                                         goto child_fail;
1470                                 }
1471                         }
1472
1473                         if (fdset_size(fds) > 0) {
1474                                 k = fdset_cloexec(fds, false);
1475                                 if (k < 0) {
1476                                         log_error("Failed to unset O_CLOEXEC for file descriptors.");
1477                                         goto child_fail;
1478                                 }
1479
1480                                 if ((asprintf((char **)(envp + n_env++), "LISTEN_FDS=%u", n_fd_passed) < 0) ||
1481                                     (asprintf((char **)(envp + n_env++), "LISTEN_PID=1") < 0)) {
1482                                         log_oom();
1483                                         goto child_fail;
1484                                 }
1485                         }
1486
1487                         setup_hostname();
1488
1489                         eventfd_read(sync_fd, &x);
1490                         close_nointr_nofail(sync_fd);
1491                         sync_fd = -1;
1492
1493                         if (!strv_isempty(arg_setenv)) {
1494                                 char **n;
1495
1496                                 n = strv_env_merge(2, envp, arg_setenv);
1497                                 if (!n) {
1498                                         log_oom();
1499                                         goto child_fail;
1500                                 }
1501
1502                                 env_use = n;
1503                         } else
1504                                 env_use = (char**) envp;
1505
1506                         if (arg_boot) {
1507                                 char **a;
1508                                 size_t l;
1509
1510                                 /* Automatically search for the init system */
1511
1512                                 l = 1 + argc - optind;
1513                                 a = newa(char*, l + 1);
1514                                 memcpy(a + 1, argv + optind, l * sizeof(char*));
1515
1516                                 a[0] = (char*) "/usr/lib/systemd/systemd";
1517                                 execve(a[0], a, env_use);
1518
1519                                 a[0] = (char*) "/lib/systemd/systemd";
1520                                 execve(a[0], a, env_use);
1521
1522                                 a[0] = (char*) "/sbin/init";
1523                                 execve(a[0], a, env_use);
1524                         } else if (argc > optind)
1525                                 execvpe(argv[optind], argv + optind, env_use);
1526                         else {
1527                                 chdir(home ? home : "/root");
1528                                 execle("/bin/bash", "-bash", NULL, env_use);
1529                         }
1530
1531                         log_error("execv() failed: %m");
1532
1533                 child_fail:
1534                         _exit(EXIT_FAILURE);
1535                 }
1536
1537                 fdset_free(fds);
1538                 fds = NULL;
1539
1540                 r = register_machine(pid);
1541                 if (r < 0)
1542                         goto finish;
1543
1544                 eventfd_write(sync_fd, 1);
1545                 close_nointr_nofail(sync_fd);
1546                 sync_fd = -1;
1547
1548                 k = process_pty(master, &mask, arg_boot ? pid : 0, SIGRTMIN+3);
1549                 if (k < 0) {
1550                         r = EXIT_FAILURE;
1551                         break;
1552                 }
1553
1554                 putc('\n', stdout);
1555
1556                 /* Kill if it is not dead yet anyway */
1557                 terminate_machine(pid);
1558
1559                 /* Redundant, but better safe than sorry */
1560                 kill(pid, SIGKILL);
1561
1562                 k = wait_for_terminate(pid, &status);
1563                 pid = 0;
1564
1565                 if (k < 0) {
1566                         r = EXIT_FAILURE;
1567                         break;
1568                 }
1569
1570                 if (status.si_code == CLD_EXITED) {
1571                         r = status.si_status;
1572                         if (status.si_status != 0) {
1573                                 log_error("Container %s failed with error code %i.", arg_machine, status.si_status);
1574                                 break;
1575                         }
1576
1577                         log_debug("Container %s exited successfully.", arg_machine);
1578                         break;
1579                 } else if (status.si_code == CLD_KILLED &&
1580                            status.si_status == SIGINT) {
1581                         log_info("Container %s has been shut down.", arg_machine);
1582                         r = 0;
1583                         break;
1584                 } else if (status.si_code == CLD_KILLED &&
1585                            status.si_status == SIGHUP) {
1586                         log_info("Container %s is being rebooted.", arg_machine);
1587                         continue;
1588                 } else if (status.si_code == CLD_KILLED ||
1589                            status.si_code == CLD_DUMPED) {
1590
1591                         log_error("Container %s terminated by signal %s.", arg_machine,  signal_to_string(status.si_status));
1592                         r = EXIT_FAILURE;
1593                         break;
1594                 } else {
1595                         log_error("Container %s failed due to unknown reason.", arg_machine);
1596                         r = EXIT_FAILURE;
1597                         break;
1598                 }
1599         }
1600
1601 finish:
1602         if (pid > 0)
1603                 kill(pid, SIGKILL);
1604
1605         free(arg_directory);
1606         free(arg_machine);
1607         free(arg_setenv);
1608
1609         return r;
1610 }