chiark / gitweb /
journal: Drop pkgconfig reference to libsystemd-id128.
[elogind.git] / src / nspawn / nspawn.c
1 /*-*- Mode: C; c-basic-offset: 8; indent-tabs-mode: nil -*-*/
2
3 /***
4   This file is part of systemd.
5
6   Copyright 2010 Lennart Poettering
7
8   systemd is free software; you can redistribute it and/or modify it
9   under the terms of the GNU Lesser General Public License as published by
10   the Free Software Foundation; either version 2.1 of the License, or
11   (at your option) any later version.
12
13   systemd is distributed in the hope that it will be useful, but
14   WITHOUT ANY WARRANTY; without even the implied warranty of
15   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
16   Lesser General Public License for more details.
17
18   You should have received a copy of the GNU Lesser General Public License
19   along with systemd; If not, see <http://www.gnu.org/licenses/>.
20 ***/
21
22 #include <signal.h>
23 #include <sched.h>
24 #include <unistd.h>
25 #include <sys/types.h>
26 #include <sys/syscall.h>
27 #include <sys/mount.h>
28 #include <sys/wait.h>
29 #include <stdlib.h>
30 #include <string.h>
31 #include <stdio.h>
32 #include <errno.h>
33 #include <sys/prctl.h>
34 #include <sys/capability.h>
35 #include <getopt.h>
36 #include <termios.h>
37 #include <sys/signalfd.h>
38 #include <grp.h>
39 #include <linux/fs.h>
40 #include <sys/un.h>
41 #include <sys/socket.h>
42 #include <linux/netlink.h>
43 #include <sys/eventfd.h>
44
45 #include "sd-daemon.h"
46 #include "sd-bus.h"
47 #include "sd-id128.h"
48 #include "log.h"
49 #include "util.h"
50 #include "mkdir.h"
51 #include "macro.h"
52 #include "audit.h"
53 #include "missing.h"
54 #include "cgroup-util.h"
55 #include "strv.h"
56 #include "path-util.h"
57 #include "loopback-setup.h"
58 #include "dev-setup.h"
59 #include "fdset.h"
60 #include "build.h"
61 #include "fileio.h"
62 #include "bus-util.h"
63 #include "bus-error.h"
64 #include "ptyfwd.h"
65 #include "bus-kernel.h"
66 #include "env-util.h"
67 #include "def.h"
68
/* Journal linking mode, chosen with --link-journal= (or -j) on the
 * command line and stored in arg_link_journal. */
typedef enum LinkJournal {
        LINK_NO = 0,  /* --link-journal=no */
        LINK_AUTO,    /* --link-journal=auto; initial value of arg_link_journal */
        LINK_HOST,    /* --link-journal=host */
        LINK_GUEST    /* --link-journal=guest, also what -j selects */
} LinkJournal;
75
76 static char *arg_directory = NULL;
77 static char *arg_user = NULL;
78 static sd_id128_t arg_uuid = {};
79 static char *arg_machine = NULL;
80 static const char *arg_slice = NULL;
81 static bool arg_private_network = false;
82 static bool arg_read_only = false;
83 static bool arg_boot = false;
84 static LinkJournal arg_link_journal = LINK_AUTO;
85 static uint64_t arg_retain =
86         (1ULL << CAP_CHOWN) |
87         (1ULL << CAP_DAC_OVERRIDE) |
88         (1ULL << CAP_DAC_READ_SEARCH) |
89         (1ULL << CAP_FOWNER) |
90         (1ULL << CAP_FSETID) |
91         (1ULL << CAP_IPC_OWNER) |
92         (1ULL << CAP_KILL) |
93         (1ULL << CAP_LEASE) |
94         (1ULL << CAP_LINUX_IMMUTABLE) |
95         (1ULL << CAP_NET_BIND_SERVICE) |
96         (1ULL << CAP_NET_BROADCAST) |
97         (1ULL << CAP_NET_RAW) |
98         (1ULL << CAP_SETGID) |
99         (1ULL << CAP_SETFCAP) |
100         (1ULL << CAP_SETPCAP) |
101         (1ULL << CAP_SETUID) |
102         (1ULL << CAP_SYS_ADMIN) |
103         (1ULL << CAP_SYS_CHROOT) |
104         (1ULL << CAP_SYS_NICE) |
105         (1ULL << CAP_SYS_PTRACE) |
106         (1ULL << CAP_SYS_TTY_CONFIG) |
107         (1ULL << CAP_SYS_RESOURCE) |
108         (1ULL << CAP_SYS_BOOT) |
109         (1ULL << CAP_AUDIT_WRITE) |
110         (1ULL << CAP_AUDIT_CONTROL) |
111         (1ULL << CAP_MKNOD);
112 static char **arg_bind = NULL;
113 static char **arg_bind_ro = NULL;
114 static char **arg_setenv = NULL;
115
/* Print the usage summary to stdout.
 *
 * Always returns 0.
 *
 * The -j line says "guest" because parse_argv() maps -j to LINK_GUEST, the
 * same value --link-journal=guest selects. */
static int help(void) {

        printf("%s [OPTIONS...] [PATH] [ARGUMENTS...]\n\n"
               "Spawn a minimal namespace container for debugging, testing and building.\n\n"
               "  -h --help                Show this help\n"
               "     --version             Print version string\n"
               "  -D --directory=NAME      Root directory for the container\n"
               "  -b --boot                Boot up full system (i.e. invoke init)\n"
               "  -u --user=USER           Run the command under specified user or uid\n"
               "     --uuid=UUID           Set a specific machine UUID for the container\n"
               "  -M --machine=NAME        Set the machine name for the container\n"
               "  -S --slice=SLICE         Place the container in the specified slice\n"
               "     --private-network     Disable network in container\n"
               "     --read-only           Mount the root directory read-only\n"
               "     --capability=CAP      In addition to the default, retain specified\n"
               "                           capability\n"
               "     --drop-capability=CAP Drop the specified capability from the default set\n"
               "     --link-journal=MODE   Link up guest journal, one of no, auto, guest, host\n"
               "  -j                       Equivalent to --link-journal=guest\n"
               "     --bind=PATH[:PATH]    Bind mount a file or directory from the host into\n"
               "                           the container\n"
               "     --bind-ro=PATH[:PATH] Similar, but creates a read-only bind mount\n"
               "     --setenv=NAME=VALUE   Pass an environment variable to PID 1\n",
               program_invocation_short_name);

        return 0;
}
143
144 static int parse_argv(int argc, char *argv[]) {
145
146         enum {
147                 ARG_VERSION = 0x100,
148                 ARG_PRIVATE_NETWORK,
149                 ARG_UUID,
150                 ARG_READ_ONLY,
151                 ARG_CAPABILITY,
152                 ARG_DROP_CAPABILITY,
153                 ARG_LINK_JOURNAL,
154                 ARG_BIND,
155                 ARG_BIND_RO,
156                 ARG_SETENV,
157         };
158
159         static const struct option options[] = {
160                 { "help",            no_argument,       NULL, 'h'                 },
161                 { "version",         no_argument,       NULL, ARG_VERSION         },
162                 { "directory",       required_argument, NULL, 'D'                 },
163                 { "user",            required_argument, NULL, 'u'                 },
164                 { "private-network", no_argument,       NULL, ARG_PRIVATE_NETWORK },
165                 { "boot",            no_argument,       NULL, 'b'                 },
166                 { "uuid",            required_argument, NULL, ARG_UUID            },
167                 { "read-only",       no_argument,       NULL, ARG_READ_ONLY       },
168                 { "capability",      required_argument, NULL, ARG_CAPABILITY      },
169                 { "drop-capability", required_argument, NULL, ARG_DROP_CAPABILITY },
170                 { "link-journal",    required_argument, NULL, ARG_LINK_JOURNAL    },
171                 { "bind",            required_argument, NULL, ARG_BIND            },
172                 { "bind-ro",         required_argument, NULL, ARG_BIND_RO         },
173                 { "machine",         required_argument, NULL, 'M'                 },
174                 { "slice",           required_argument, NULL, 'S'                 },
175                 { "setenv",          required_argument, NULL, ARG_SETENV          },
176                 {}
177         };
178
179         int c, r;
180
181         assert(argc >= 0);
182         assert(argv);
183
184         while ((c = getopt_long(argc, argv, "+hD:u:bM:jS:", options, NULL)) >= 0) {
185
186                 switch (c) {
187
188                 case 'h':
189                         return help();
190
191                 case ARG_VERSION:
192                         puts(PACKAGE_STRING);
193                         puts(SYSTEMD_FEATURES);
194                         return 0;
195
196                 case 'D':
197                         free(arg_directory);
198                         arg_directory = canonicalize_file_name(optarg);
199                         if (!arg_directory) {
200                                 log_error("Invalid root directory: %m");
201                                 return -ENOMEM;
202                         }
203
204                         break;
205
206                 case 'u':
207                         free(arg_user);
208                         arg_user = strdup(optarg);
209                         if (!arg_user)
210                                 return log_oom();
211
212                         break;
213
214                 case ARG_PRIVATE_NETWORK:
215                         arg_private_network = true;
216                         break;
217
218                 case 'b':
219                         arg_boot = true;
220                         break;
221
222                 case ARG_UUID:
223                         r = sd_id128_from_string(optarg, &arg_uuid);
224                         if (r < 0) {
225                                 log_error("Invalid UUID: %s", optarg);
226                                 return r;
227                         }
228                         break;
229
230                 case 'S':
231                         arg_slice = strdup(optarg);
232                         if (!arg_slice)
233                                 return log_oom();
234
235                         break;
236
237                 case 'M':
238                         if (!hostname_is_valid(optarg)) {
239                                 log_error("Invalid machine name: %s", optarg);
240                                 return -EINVAL;
241                         }
242
243                         free(arg_machine);
244                         arg_machine = strdup(optarg);
245                         if (!arg_machine)
246                                 return log_oom();
247
248                         break;
249
250                 case ARG_READ_ONLY:
251                         arg_read_only = true;
252                         break;
253
254                 case ARG_CAPABILITY:
255                 case ARG_DROP_CAPABILITY: {
256                         char *state, *word;
257                         size_t length;
258
259                         FOREACH_WORD_SEPARATOR(word, length, optarg, ",", state) {
260                                 cap_value_t cap;
261                                 char *t;
262
263                                 t = strndup(word, length);
264                                 if (!t)
265                                         return log_oom();
266
267                                 if (cap_from_name(t, &cap) < 0) {
268                                         log_error("Failed to parse capability %s.", t);
269                                         free(t);
270                                         return -EINVAL;
271                                 }
272
273                                 free(t);
274
275                                 if (c == ARG_CAPABILITY)
276                                         arg_retain |= 1ULL << (uint64_t) cap;
277                                 else
278                                         arg_retain &= ~(1ULL << (uint64_t) cap);
279                         }
280
281                         break;
282                 }
283
284                 case 'j':
285                         arg_link_journal = LINK_GUEST;
286                         break;
287
288                 case ARG_LINK_JOURNAL:
289                         if (streq(optarg, "auto"))
290                                 arg_link_journal = LINK_AUTO;
291                         else if (streq(optarg, "no"))
292                                 arg_link_journal = LINK_NO;
293                         else if (streq(optarg, "guest"))
294                                 arg_link_journal = LINK_GUEST;
295                         else if (streq(optarg, "host"))
296                                 arg_link_journal = LINK_HOST;
297                         else {
298                                 log_error("Failed to parse link journal mode %s", optarg);
299                                 return -EINVAL;
300                         }
301
302                         break;
303
304                 case ARG_BIND:
305                 case ARG_BIND_RO: {
306                         _cleanup_free_ char *a = NULL, *b = NULL;
307                         char *e;
308                         char ***x;
309
310                         x = c == ARG_BIND ? &arg_bind : &arg_bind_ro;
311
312                         e = strchr(optarg, ':');
313                         if (e) {
314                                 a = strndup(optarg, e - optarg);
315                                 b = strdup(e + 1);
316                         } else {
317                                 a = strdup(optarg);
318                                 b = strdup(optarg);
319                         }
320
321                         if (!a || !b)
322                                 return log_oom();
323
324                         if (!path_is_absolute(a) || !path_is_absolute(b)) {
325                                 log_error("Invalid bind mount specification: %s", optarg);
326                                 return -EINVAL;
327                         }
328
329                         r = strv_extend(x, a);
330                         if (r < 0)
331                                 return log_oom();
332
333                         r = strv_extend(x, b);
334                         if (r < 0)
335                                 return log_oom();
336
337                         break;
338                 }
339
340                 case ARG_SETENV: {
341                         char **n;
342
343                         if (!env_assignment_is_valid(optarg)) {
344                                 log_error("Environment variable assignment '%s' is not valid.", optarg);
345                                 return -EINVAL;
346                         }
347
348                         n = strv_env_set(arg_setenv, optarg);
349                         if (!n)
350                                 return log_oom();
351
352                         strv_free(arg_setenv);
353                         arg_setenv = n;
354                         break;
355                 }
356
357                 case '?':
358                         return -EINVAL;
359
360                 default:
361                         assert_not_reached("Unhandled option");
362                 }
363         }
364
365         return 1;
366 }
367
368 static int mount_all(const char *dest) {
369
370         typedef struct MountPoint {
371                 const char *what;
372                 const char *where;
373                 const char *type;
374                 const char *options;
375                 unsigned long flags;
376                 bool fatal;
377         } MountPoint;
378
379         static const MountPoint mount_table[] = {
380                 { "proc",      "/proc",     "proc",  NULL,       MS_NOSUID|MS_NOEXEC|MS_NODEV, true  },
381                 { "/proc/sys", "/proc/sys", NULL,    NULL,       MS_BIND, true                       },   /* Bind mount first */
382                 { NULL,        "/proc/sys", NULL,    NULL,       MS_BIND|MS_RDONLY|MS_REMOUNT, true  },   /* Then, make it r/o */
383                 { "sysfs",     "/sys",      "sysfs", NULL,       MS_RDONLY|MS_NOSUID|MS_NOEXEC|MS_NODEV, true  },
384                 { "tmpfs",     "/dev",      "tmpfs", "mode=755", MS_NOSUID|MS_STRICTATIME,     true  },
385                 { "devpts",    "/dev/pts",  "devpts","newinstance,ptmxmode=0666,mode=620,gid=" STRINGIFY(TTY_GID), MS_NOSUID|MS_NOEXEC, true },
386                 { "tmpfs",     "/dev/shm",  "tmpfs", "mode=1777", MS_NOSUID|MS_NODEV|MS_STRICTATIME, true  },
387                 { "tmpfs",     "/run",      "tmpfs", "mode=755", MS_NOSUID|MS_NODEV|MS_STRICTATIME, true  },
388 #ifdef HAVE_SELINUX
389                 { "/sys/fs/selinux", "/sys/fs/selinux", NULL, NULL, MS_BIND,                      false },  /* Bind mount first */
390                 { NULL,              "/sys/fs/selinux", NULL, NULL, MS_BIND|MS_RDONLY|MS_REMOUNT, false },  /* Then, make it r/o */
391 #endif
392         };
393
394         unsigned k;
395         int r = 0;
396
397         for (k = 0; k < ELEMENTSOF(mount_table); k++) {
398                 _cleanup_free_ char *where = NULL;
399                 int t;
400
401                 where = strjoin(dest, "/", mount_table[k].where, NULL);
402                 if (!where)
403                         return log_oom();
404
405                 t = path_is_mount_point(where, true);
406                 if (t < 0) {
407                         log_error("Failed to detect whether %s is a mount point: %s", where, strerror(-t));
408
409                         if (r == 0)
410                                 r = t;
411
412                         continue;
413                 }
414
415                 /* Skip this entry if it is not a remount. */
416                 if (mount_table[k].what && t > 0)
417                         continue;
418
419                 mkdir_p(where, 0755);
420
421                 if (mount(mount_table[k].what,
422                           where,
423                           mount_table[k].type,
424                           mount_table[k].flags,
425                           mount_table[k].options) < 0 &&
426                     mount_table[k].fatal) {
427
428                         log_error("mount(%s) failed: %m", where);
429
430                         if (r == 0)
431                                 r = -errno;
432                 }
433         }
434
435         return r;
436 }
437
438 static int mount_binds(const char *dest, char **l, unsigned long flags) {
439         char **x, **y;
440
441         STRV_FOREACH_PAIR(x, y, l) {
442                 char *where;
443                 struct stat source_st, dest_st;
444                 int r;
445
446                 if (stat(*x, &source_st) < 0) {
447                         log_error("failed to stat %s: %m", *x);
448                         return -errno;
449                 }
450
451                 where = strappenda(dest, *y);
452                 r = stat(where, &dest_st);
453                 if (r == 0) {
454                         if ((source_st.st_mode & S_IFMT) != (dest_st.st_mode & S_IFMT)) {
455                                 log_error("The file types of %s and %s do not match. Refusing bind mount",
456                                                 *x, where);
457                                 return -EINVAL;
458                         }
459                 } else if (errno == ENOENT) {
460                         r = mkdir_parents_label(where, 0755);
461                         if (r < 0) {
462                                 log_error("Failed to bind mount %s: %s", *x, strerror(-r));
463                                 return r;
464                         }
465                 } else {
466                         log_error("Failed to bind mount %s: %s", *x, strerror(errno));
467                         return -errno;
468                 }
469                 /* Create the mount point, but be conservative -- refuse to create block
470                 * and char devices. */
471                 if (S_ISDIR(source_st.st_mode))
472                         mkdir_label(where, 0755);
473                 else if (S_ISFIFO(source_st.st_mode))
474                         mkfifo(where, 0644);
475                 else if (S_ISSOCK(source_st.st_mode))
476                         mknod(where, 0644 | S_IFSOCK, 0);
477                 else if (S_ISREG(source_st.st_mode))
478                         touch(where);
479                 else {
480                         log_error("Refusing to create mountpoint for file: %s", *x);
481                         return -ENOTSUP;
482                 }
483
484                 if (mount(*x, where, "bind", MS_BIND, NULL) < 0) {
485                         log_error("mount(%s) failed: %m", where);
486                         return -errno;
487                 }
488
489                 if (flags && mount(NULL, where, NULL, MS_REMOUNT|MS_BIND|flags, NULL) < 0) {
490                         log_error("mount(%s) failed: %m", where);
491                         return -errno;
492                 }
493         }
494
495         return 0;
496 }
497
498 static int setup_timezone(const char *dest) {
499         _cleanup_free_ char *where = NULL, *p = NULL, *q = NULL, *check = NULL, *what = NULL;
500         char *z, *y;
501         int r;
502
503         assert(dest);
504
505         /* Fix the timezone, if possible */
506         r = readlink_malloc("/etc/localtime", &p);
507         if (r < 0) {
508                 log_warning("/etc/localtime is not a symlink, not updating container timezone.");
509                 return 0;
510         }
511
512         z = path_startswith(p, "../usr/share/zoneinfo/");
513         if (!z)
514                 z = path_startswith(p, "/usr/share/zoneinfo/");
515         if (!z) {
516                 log_warning("/etc/localtime does not point into /usr/share/zoneinfo/, not updating container timezone.");
517                 return 0;
518         }
519
520         where = strappend(dest, "/etc/localtime");
521         if (!where)
522                 return log_oom();
523
524         r = readlink_malloc(where, &q);
525         if (r >= 0) {
526                 y = path_startswith(q, "../usr/share/zoneinfo/");
527                 if (!y)
528                         y = path_startswith(q, "/usr/share/zoneinfo/");
529
530
531                 /* Already pointing to the right place? Then do nothing .. */
532                 if (y && streq(y, z))
533                         return 0;
534         }
535
536         check = strjoin(dest, "/usr/share/zoneinfo/", z, NULL);
537         if (!check)
538                 return log_oom();
539
540         if (access(check, F_OK) < 0) {
541                 log_warning("Timezone %s does not exist in container, not updating container timezone.", z);
542                 return 0;
543         }
544
545         what = strappend("../usr/share/zoneinfo/", z);
546         if (!what)
547                 return log_oom();
548
549         unlink(where);
550         if (symlink(what, where) < 0) {
551                 log_error("Failed to correct timezone of container: %m");
552                 return 0;
553         }
554
555         return 0;
556 }
557
558 static int setup_resolv_conf(const char *dest) {
559         char _cleanup_free_ *where = NULL;
560
561         assert(dest);
562
563         if (arg_private_network)
564                 return 0;
565
566         /* Fix resolv.conf, if possible */
567         where = strappend(dest, "/etc/resolv.conf");
568         if (!where)
569                 return log_oom();
570
571         /* We don't really care for the results of this really. If it
572          * fails, it fails, but meh... */
573         copy_file("/etc/resolv.conf", where, O_TRUNC|O_NOFOLLOW);
574
575         return 0;
576 }
577
578 static int setup_boot_id(const char *dest) {
579         _cleanup_free_ char *from = NULL, *to = NULL;
580         sd_id128_t rnd;
581         char as_uuid[37];
582         int r;
583
584         assert(dest);
585
586         /* Generate a new randomized boot ID, so that each boot-up of
587          * the container gets a new one */
588
589         from = strappend(dest, "/dev/proc-sys-kernel-random-boot-id");
590         to = strappend(dest, "/proc/sys/kernel/random/boot_id");
591         if (!from || !to)
592                 return log_oom();
593
594         r = sd_id128_randomize(&rnd);
595         if (r < 0) {
596                 log_error("Failed to generate random boot id: %s", strerror(-r));
597                 return r;
598         }
599
600         snprintf(as_uuid, sizeof(as_uuid),
601                  "%02x%02x%02x%02x-%02x%02x-%02x%02x-%02x%02x-%02x%02x%02x%02x%02x%02x",
602                  SD_ID128_FORMAT_VAL(rnd));
603         char_array_0(as_uuid);
604
605         r = write_string_file(from, as_uuid);
606         if (r < 0) {
607                 log_error("Failed to write boot id: %s", strerror(-r));
608                 return r;
609         }
610
611         if (mount(from, to, "bind", MS_BIND, NULL) < 0) {
612                 log_error("Failed to bind mount boot id: %m");
613                 r = -errno;
614         } else if (mount(from, to, "bind", MS_BIND|MS_REMOUNT|MS_RDONLY, NULL))
615                 log_warning("Failed to make boot id read-only: %m");
616
617         unlink(from);
618         return r;
619 }
620
621 static int copy_devnodes(const char *dest) {
622
623         static const char devnodes[] =
624                 "null\0"
625                 "zero\0"
626                 "full\0"
627                 "random\0"
628                 "urandom\0"
629                 "tty\0";
630
631         const char *d;
632         int r = 0;
633         _cleanup_umask_ mode_t u;
634
635         assert(dest);
636
637         u = umask(0000);
638
639         NULSTR_FOREACH(d, devnodes) {
640                 _cleanup_free_ char *from = NULL, *to = NULL;
641                 struct stat st;
642
643                 from = strappend("/dev/", d);
644                 to = strjoin(dest, "/dev/", d, NULL);
645                 if (!from || !to)
646                         return log_oom();
647
648                 if (stat(from, &st) < 0) {
649
650                         if (errno != ENOENT) {
651                                 log_error("Failed to stat %s: %m", from);
652                                 return -errno;
653                         }
654
655                 } else if (!S_ISCHR(st.st_mode) && !S_ISBLK(st.st_mode)) {
656
657                         log_error("%s is not a char or block device, cannot copy", from);
658                         return -EIO;
659
660                 } else if (mknod(to, st.st_mode, st.st_rdev) < 0) {
661
662                         log_error("mknod(%s) failed: %m", dest);
663                         return  -errno;
664                 }
665         }
666
667         return r;
668 }
669
670 static int setup_ptmx(const char *dest) {
671         _cleanup_free_ char *p = NULL;
672
673         p = strappend(dest, "/dev/ptmx");
674         if (!p)
675                 return log_oom();
676
677         if (symlink("pts/ptmx", p) < 0) {
678                 log_error("Failed to create /dev/ptmx symlink: %m");
679                 return -errno;
680         }
681
682         return 0;
683 }
684
685 static int setup_dev_console(const char *dest, const char *console) {
686         struct stat st;
687         _cleanup_free_ char *to = NULL;
688         int r;
689         _cleanup_umask_ mode_t u;
690
691         assert(dest);
692         assert(console);
693
694         u = umask(0000);
695
696         if (stat(console, &st) < 0) {
697                 log_error("Failed to stat %s: %m", console);
698                 return -errno;
699
700         } else if (!S_ISCHR(st.st_mode)) {
701                 log_error("/dev/console is not a char device");
702                 return -EIO;
703         }
704
705         r = chmod_and_chown(console, 0600, 0, 0);
706         if (r < 0) {
707                 log_error("Failed to correct access mode for TTY: %s", strerror(-r));
708                 return r;
709         }
710
711         if (asprintf(&to, "%s/dev/console", dest) < 0)
712                 return log_oom();
713
714         /* We need to bind mount the right tty to /dev/console since
715          * ptys can only exist on pts file systems. To have something
716          * to bind mount things on we create a device node first, that
717          * has the right major/minor (note that the major minor
718          * doesn't actually matter here, since we mount it over
719          * anyway). */
720
721         if (mknod(to, (st.st_mode & ~07777) | 0600, st.st_rdev) < 0) {
722                 log_error("mknod() for /dev/console failed: %m");
723                 return -errno;
724         }
725
726         if (mount(console, to, "bind", MS_BIND, NULL) < 0) {
727                 log_error("Bind mount for /dev/console failed: %m");
728                 return -errno;
729         }
730
731         return 0;
732 }
733
734 static int setup_kmsg(const char *dest, int kmsg_socket) {
735         _cleanup_free_ char *from = NULL, *to = NULL;
736         int r, fd, k;
737         _cleanup_umask_ mode_t u;
738         union {
739                 struct cmsghdr cmsghdr;
740                 uint8_t buf[CMSG_SPACE(sizeof(int))];
741         } control = {};
742         struct msghdr mh = {
743                 .msg_control = &control,
744                 .msg_controllen = sizeof(control),
745         };
746         struct cmsghdr *cmsg;
747
748         assert(dest);
749         assert(kmsg_socket >= 0);
750
751         u = umask(0000);
752
753         /* We create the kmsg FIFO as /dev/kmsg, but immediately
754          * delete it after bind mounting it to /proc/kmsg. While FIFOs
755          * on the reading side behave very similar to /proc/kmsg,
756          * their writing side behaves differently from /dev/kmsg in
757          * that writing blocks when nothing is reading. In order to
758          * avoid any problems with containers deadlocking due to this
759          * we simply make /dev/kmsg unavailable to the container. */
760         if (asprintf(&from, "%s/dev/kmsg", dest) < 0 ||
761             asprintf(&to, "%s/proc/kmsg", dest) < 0)
762                 return log_oom();
763
764         if (mkfifo(from, 0600) < 0) {
765                 log_error("mkfifo() for /dev/kmsg failed: %m");
766                 return -errno;
767         }
768
769         r = chmod_and_chown(from, 0600, 0, 0);
770         if (r < 0) {
771                 log_error("Failed to correct access mode for /dev/kmsg: %s", strerror(-r));
772                 return r;
773         }
774
775         if (mount(from, to, "bind", MS_BIND, NULL) < 0) {
776                 log_error("Bind mount for /proc/kmsg failed: %m");
777                 return -errno;
778         }
779
780         fd = open(from, O_RDWR|O_NDELAY|O_CLOEXEC);
781         if (fd < 0) {
782                 log_error("Failed to open fifo: %m");
783                 return -errno;
784         }
785
786         cmsg = CMSG_FIRSTHDR(&mh);
787         cmsg->cmsg_level = SOL_SOCKET;
788         cmsg->cmsg_type = SCM_RIGHTS;
789         cmsg->cmsg_len = CMSG_LEN(sizeof(int));
790         memcpy(CMSG_DATA(cmsg), &fd, sizeof(int));
791
792         mh.msg_controllen = cmsg->cmsg_len;
793
794         /* Store away the fd in the socket, so that it stays open as
795          * long as we run the child */
796         k = sendmsg(kmsg_socket, &mh, MSG_DONTWAIT|MSG_NOSIGNAL);
797         close_nointr_nofail(fd);
798
799         if (k < 0) {
800                 log_error("Failed to send FIFO fd: %m");
801                 return -errno;
802         }
803
804         /* And now make the FIFO unavailable as /dev/kmsg... */
805         unlink(from);
806         return 0;
807 }
808
809 static int setup_hostname(void) {
810
811         if (sethostname(arg_machine, strlen(arg_machine)) < 0)
812                 return -errno;
813
814         return 0;
815 }
816
817 static int setup_journal(const char *directory) {
818         sd_id128_t machine_id, this_id;
819         _cleanup_free_ char *p = NULL, *b = NULL, *q = NULL, *d = NULL;
820         char *id;
821         int r;
822
823         p = strappend(directory, "/etc/machine-id");
824         if (!p)
825                 return log_oom();
826
827         r = read_one_line_file(p, &b);
828         if (r == -ENOENT && arg_link_journal == LINK_AUTO)
829                 return 0;
830         else if (r < 0) {
831                 log_error("Failed to read machine ID from %s: %s", p, strerror(-r));
832                 return r;
833         }
834
835         id = strstrip(b);
836         if (isempty(id) && arg_link_journal == LINK_AUTO)
837                 return 0;
838
839         /* Verify validity */
840         r = sd_id128_from_string(id, &machine_id);
841         if (r < 0) {
842                 log_error("Failed to parse machine ID from %s: %s", p, strerror(-r));
843                 return r;
844         }
845
846         r = sd_id128_get_machine(&this_id);
847         if (r < 0) {
848                 log_error("Failed to retrieve machine ID: %s", strerror(-r));
849                 return r;
850         }
851
852         if (sd_id128_equal(machine_id, this_id)) {
853                 log_full(arg_link_journal == LINK_AUTO ? LOG_WARNING : LOG_ERR,
854                          "Host and machine ids are equal (%s): refusing to link journals", id);
855                 if (arg_link_journal == LINK_AUTO)
856                         return 0;
857                 return
858                         -EEXIST;
859         }
860
861         if (arg_link_journal == LINK_NO)
862                 return 0;
863
864         free(p);
865         p = strappend("/var/log/journal/", id);
866         q = strjoin(directory, "/var/log/journal/", id, NULL);
867         if (!p || !q)
868                 return log_oom();
869
870         if (path_is_mount_point(p, false) > 0) {
871                 if (arg_link_journal != LINK_AUTO) {
872                         log_error("%s: already a mount point, refusing to use for journal", p);
873                         return -EEXIST;
874                 }
875
876                 return 0;
877         }
878
879         if (path_is_mount_point(q, false) > 0) {
880                 if (arg_link_journal != LINK_AUTO) {
881                         log_error("%s: already a mount point, refusing to use for journal", q);
882                         return -EEXIST;
883                 }
884
885                 return 0;
886         }
887
888         r = readlink_and_make_absolute(p, &d);
889         if (r >= 0) {
890                 if ((arg_link_journal == LINK_GUEST ||
891                      arg_link_journal == LINK_AUTO) &&
892                     path_equal(d, q)) {
893
894                         r = mkdir_p(q, 0755);
895                         if (r < 0)
896                                 log_warning("failed to create directory %s: %m", q);
897                         return 0;
898                 }
899
900                 if (unlink(p) < 0) {
901                         log_error("Failed to remove symlink %s: %m", p);
902                         return -errno;
903                 }
904         } else if (r == -EINVAL) {
905
906                 if (arg_link_journal == LINK_GUEST &&
907                     rmdir(p) < 0) {
908
909                         if (errno == ENOTDIR) {
910                                 log_error("%s already exists and is neither a symlink nor a directory", p);
911                                 return r;
912                         } else {
913                                 log_error("Failed to remove %s: %m", p);
914                                 return -errno;
915                         }
916                 }
917         } else if (r != -ENOENT) {
918                 log_error("readlink(%s) failed: %m", p);
919                 return r;
920         }
921
922         if (arg_link_journal == LINK_GUEST) {
923
924                 if (symlink(q, p) < 0) {
925                         log_error("Failed to symlink %s to %s: %m", q, p);
926                         return -errno;
927                 }
928
929                 r = mkdir_p(q, 0755);
930                 if (r < 0)
931                         log_warning("failed to create directory %s: %m", q);
932                 return 0;
933         }
934
935         if (arg_link_journal == LINK_HOST) {
936                 r = mkdir_p(p, 0755);
937                 if (r < 0) {
938                         log_error("Failed to create %s: %m", p);
939                         return r;
940                 }
941
942         } else if (access(p, F_OK) < 0)
943                 return 0;
944
945         if (dir_is_empty(q) == 0) {
946                 log_error("%s not empty.", q);
947                 return -ENOTEMPTY;
948         }
949
950         r = mkdir_p(q, 0755);
951         if (r < 0) {
952                 log_error("Failed to create %s: %m", q);
953                 return r;
954         }
955
956         if (mount(p, q, "bind", MS_BIND, NULL) < 0) {
957                 log_error("Failed to bind mount journal from host into guest: %m");
958                 return -errno;
959         }
960
961         return 0;
962 }
963
964 static int setup_kdbus(const char *dest, const char *path) {
965         const char *p;
966
967         if (!path)
968                 return 0;
969
970         p = strappenda(dest, "/dev/kdbus");
971         if (mkdir(p, 0755) < 0) {
972                 log_error("Failed to create kdbus path: %m");
973                 return  -errno;
974         }
975
976         if (mount(path, p, "bind", MS_BIND, NULL) < 0) {
977                 log_error("Failed to mount kdbus domain path: %m");
978                 return -errno;
979         }
980
981         return 0;
982 }
983
984 static int drop_capabilities(void) {
985         return capability_bounding_set_drop(~arg_retain, false);
986 }
987
988 static int register_machine(pid_t pid) {
989         _cleanup_bus_error_free_ sd_bus_error error = SD_BUS_ERROR_NULL;
990         _cleanup_bus_unref_ sd_bus *bus = NULL;
991         int r;
992
993         r = sd_bus_open_system(&bus);
994         if (r < 0) {
995                 log_error("Failed to open system bus: %s", strerror(-r));
996                 return r;
997         }
998
999         r = sd_bus_call_method(
1000                         bus,
1001                         "org.freedesktop.machine1",
1002                         "/org/freedesktop/machine1",
1003                         "org.freedesktop.machine1.Manager",
1004                         "CreateMachine",
1005                         &error,
1006                         NULL,
1007                         "sayssusa(sv)",
1008                         arg_machine,
1009                         SD_BUS_MESSAGE_APPEND_ID128(arg_uuid),
1010                         "nspawn",
1011                         "container",
1012                         (uint32_t) pid,
1013                         strempty(arg_directory),
1014                         !isempty(arg_slice), "Slice", "s", arg_slice);
1015         if (r < 0) {
1016                 log_error("Failed to register machine: %s", bus_error_message(&error, r));
1017                 return r;
1018         }
1019
1020         return 0;
1021 }
1022
1023 static int terminate_machine(pid_t pid) {
1024         _cleanup_bus_error_free_ sd_bus_error error = SD_BUS_ERROR_NULL;
1025         _cleanup_bus_message_unref_ sd_bus_message *reply = NULL;
1026         _cleanup_bus_unref_ sd_bus *bus = NULL;
1027         const char *path;
1028         int r;
1029
1030         r = sd_bus_default_system(&bus);
1031         if (r < 0) {
1032                 log_error("Failed to open system bus: %s", strerror(-r));
1033                 return r;
1034         }
1035
1036         r = sd_bus_call_method(
1037                         bus,
1038                         "org.freedesktop.machine1",
1039                         "/org/freedesktop/machine1",
1040                         "org.freedesktop.machine1.Manager",
1041                         "GetMachineByPID",
1042                         &error,
1043                         &reply,
1044                         "u",
1045                         (uint32_t) pid);
1046         if (r < 0) {
1047                 /* Note that the machine might already have been
1048                  * cleaned up automatically, hence don't consider it a
1049                  * failure if we cannot get the machine object. */
1050                 log_debug("Failed to get machine: %s", bus_error_message(&error, r));
1051                 return 0;
1052         }
1053
1054         r = sd_bus_message_read(reply, "o", &path);
1055         if (r < 0)
1056                 return bus_log_parse_error(r);
1057
1058         r = sd_bus_call_method(
1059                         bus,
1060                         "org.freedesktop.machine1",
1061                         path,
1062                         "org.freedesktop.machine1.Machine",
1063                         "Terminate",
1064                         &error,
1065                         NULL,
1066                         NULL);
1067         if (r < 0) {
1068                 log_debug("Failed to terminate machine: %s", bus_error_message(&error, r));
1069                 return 0;
1070         }
1071
1072         return 0;
1073 }
1074
/* Probes for kernel audit support by trying to open a NETLINK_AUDIT
 * socket. Returns true if the socket could be created (it is closed
 * again right away), false otherwise. */
static bool audit_enabled(void) {
        int fd = socket(AF_NETLINK, SOCK_RAW, NETLINK_AUDIT);

        if (fd < 0)
                return false;

        close_nointr_nofail(fd);
        return true;
}
1085
1086 int main(int argc, char *argv[]) {
1087         pid_t pid = 0;
1088         int r = EXIT_FAILURE, k;
1089         _cleanup_close_ int master = -1, kdbus_fd = -1, sync_fd = -1;
1090         int n_fd_passed;
1091         const char *console = NULL;
1092         sigset_t mask;
1093         _cleanup_close_pipe_ int kmsg_socket_pair[2] = { -1, -1 };
1094         _cleanup_fdset_free_ FDSet *fds = NULL;
1095         _cleanup_free_ char *kdbus_domain = NULL;
1096         const char *ns;
1097
1098         log_parse_environment();
1099         log_open();
1100
1101         k = parse_argv(argc, argv);
1102         if (k < 0)
1103                 goto finish;
1104         else if (k == 0) {
1105                 r = EXIT_SUCCESS;
1106                 goto finish;
1107         }
1108
1109         if (arg_directory) {
1110                 char *p;
1111
1112                 p = path_make_absolute_cwd(arg_directory);
1113                 free(arg_directory);
1114                 arg_directory = p;
1115         } else
1116                 arg_directory = get_current_dir_name();
1117
1118         if (!arg_directory) {
1119                 log_error("Failed to determine path, please use -D.");
1120                 goto finish;
1121         }
1122
1123         path_kill_slashes(arg_directory);
1124
1125         if (!arg_machine) {
1126                 arg_machine = strdup(basename(arg_directory));
1127                 if (!arg_machine) {
1128                         log_oom();
1129                         goto finish;
1130                 }
1131
1132                 hostname_cleanup(arg_machine, false);
1133                 if (isempty(arg_machine)) {
1134                         log_error("Failed to determine machine name automatically, please use -M.");
1135                         goto finish;
1136                 }
1137         }
1138
1139         if (geteuid() != 0) {
1140                 log_error("Need to be root.");
1141                 goto finish;
1142         }
1143
1144         if (sd_booted() <= 0) {
1145                 log_error("Not running on a systemd system.");
1146                 goto finish;
1147         }
1148
1149         if (arg_boot && audit_enabled()) {
1150                 log_warning("The kernel auditing subsystem is known to be incompatible with containers.\n"
1151                             "Please make sure to turn off auditing with 'audit=0' on the kernel command\n"
1152                             "line before using systemd-nspawn. Sleeping for 5s...\n");
1153                 sleep(5);
1154         }
1155
1156         if (path_equal(arg_directory, "/")) {
1157                 log_error("Spawning container on root directory not supported.");
1158                 goto finish;
1159         }
1160
1161         if (path_is_os_tree(arg_directory) <= 0) {
1162                 log_error("Directory %s doesn't look like an OS root directory (/etc/os-release is missing). Refusing.", arg_directory);
1163                 goto finish;
1164         }
1165
1166         log_close();
1167         n_fd_passed = sd_listen_fds(false);
1168         if (n_fd_passed > 0) {
1169                 k = fdset_new_listen_fds(&fds, false);
1170                 if (k < 0) {
1171                         log_error("Failed to collect file descriptors: %s", strerror(-k));
1172                         goto finish;
1173                 }
1174         }
1175         fdset_close_others(fds);
1176         log_open();
1177
1178         master = posix_openpt(O_RDWR|O_NOCTTY|O_CLOEXEC|O_NDELAY);
1179         if (master < 0) {
1180                 log_error("Failed to acquire pseudo tty: %m");
1181                 goto finish;
1182         }
1183
1184         console = ptsname(master);
1185         if (!console) {
1186                 log_error("Failed to determine tty name: %m");
1187                 goto finish;
1188         }
1189
1190         log_info("Spawning container %s on %s. Press ^] three times within 1s to abort execution.", arg_machine, arg_directory);
1191
1192         if (unlockpt(master) < 0) {
1193                 log_error("Failed to unlock tty: %m");
1194                 goto finish;
1195         }
1196
1197         ns = strappenda("machine-", arg_machine);
1198         kdbus_fd = bus_kernel_create_domain(ns, &kdbus_domain);
1199         if (r < 0)
1200                 log_debug("Failed to create kdbus domain: %s", strerror(-r));
1201         else
1202                 log_debug("Successfully created kdbus domain as %s", kdbus_domain);
1203
1204         if (socketpair(AF_UNIX, SOCK_DGRAM|SOCK_NONBLOCK|SOCK_CLOEXEC, 0, kmsg_socket_pair) < 0) {
1205                 log_error("Failed to create kmsg socket pair: %m");
1206                 goto finish;
1207         }
1208
1209         sd_notify(0, "READY=1");
1210
1211         assert_se(sigemptyset(&mask) == 0);
1212         sigset_add_many(&mask, SIGCHLD, SIGWINCH, SIGTERM, SIGINT, -1);
1213         assert_se(sigprocmask(SIG_BLOCK, &mask, NULL) == 0);
1214
1215         for (;;) {
1216                 siginfo_t status;
1217
1218                 sync_fd = eventfd(0, EFD_CLOEXEC);
1219                 if (sync_fd < 0) {
1220                         log_error("Failed to create event fd: %m");
1221                         goto finish;
1222                 }
1223
1224                 pid = syscall(__NR_clone, SIGCHLD|CLONE_NEWIPC|CLONE_NEWNS|CLONE_NEWPID|CLONE_NEWUTS|(arg_private_network ? CLONE_NEWNET : 0), NULL);
1225                 if (pid < 0) {
1226                         if (errno == EINVAL)
1227                                 log_error("clone() failed, do you have namespace support enabled in your kernel? (You need UTS, IPC, PID and NET namespacing built in): %m");
1228                         else
1229                                 log_error("clone() failed: %m");
1230
1231                         goto finish;
1232                 }
1233
1234                 if (pid == 0) {
1235                         /* child */
1236                         const char *home = NULL;
1237                         uid_t uid = (uid_t) -1;
1238                         gid_t gid = (gid_t) -1;
1239                         unsigned n_env = 2;
1240                         const char *envp[] = {
1241                                 "PATH=" DEFAULT_PATH_SPLIT_USR,
1242                                 "container=systemd-nspawn", /* LXC sets container=lxc, so follow the scheme here */
1243                                 NULL, /* TERM */
1244                                 NULL, /* HOME */
1245                                 NULL, /* USER */
1246                                 NULL, /* LOGNAME */
1247                                 NULL, /* container_uuid */
1248                                 NULL, /* LISTEN_FDS */
1249                                 NULL, /* LISTEN_PID */
1250                                 NULL
1251                         };
1252                         char **env_use;
1253                         eventfd_t x;
1254
1255                         envp[n_env] = strv_find_prefix(environ, "TERM=");
1256                         if (envp[n_env])
1257                                 n_env ++;
1258
1259                         close_nointr_nofail(master);
1260                         master = -1;
1261
1262                         close_nointr(STDIN_FILENO);
1263                         close_nointr(STDOUT_FILENO);
1264                         close_nointr(STDERR_FILENO);
1265
1266                         close_nointr_nofail(kmsg_socket_pair[0]);
1267                         kmsg_socket_pair[0] = -1;
1268
1269                         reset_all_signal_handlers();
1270
1271                         assert_se(sigemptyset(&mask) == 0);
1272                         assert_se(sigprocmask(SIG_SETMASK, &mask, NULL) == 0);
1273
1274                         k = open_terminal(console, O_RDWR);
1275                         if (k != STDIN_FILENO) {
1276                                 if (k >= 0) {
1277                                         close_nointr_nofail(k);
1278                                         k = -EINVAL;
1279                                 }
1280
1281                                 log_error("Failed to open console: %s", strerror(-k));
1282                                 goto child_fail;
1283                         }
1284
1285                         if (dup2(STDIN_FILENO, STDOUT_FILENO) != STDOUT_FILENO ||
1286                             dup2(STDIN_FILENO, STDERR_FILENO) != STDERR_FILENO) {
1287                                 log_error("Failed to duplicate console: %m");
1288                                 goto child_fail;
1289                         }
1290
1291                         if (setsid() < 0) {
1292                                 log_error("setsid() failed: %m");
1293                                 goto child_fail;
1294                         }
1295
1296                         if (prctl(PR_SET_PDEATHSIG, SIGKILL) < 0) {
1297                                 log_error("PR_SET_PDEATHSIG failed: %m");
1298                                 goto child_fail;
1299                         }
1300
1301                         /* Mark everything as slave, so that we still
1302                          * receive mounts from the real root, but don't
1303                          * propagate mounts to the real root. */
1304                         if (mount(NULL, "/", NULL, MS_SLAVE|MS_REC, NULL) < 0) {
1305                                 log_error("MS_SLAVE|MS_REC failed: %m");
1306                                 goto child_fail;
1307                         }
1308
1309                         /* Turn directory into bind mount */
1310                         if (mount(arg_directory, arg_directory, "bind", MS_BIND|MS_REC, NULL) < 0) {
1311                                 log_error("Failed to make bind mount.");
1312                                 goto child_fail;
1313                         }
1314
1315                         if (arg_read_only)
1316                                 if (mount(arg_directory, arg_directory, "bind", MS_BIND|MS_REMOUNT|MS_RDONLY|MS_REC, NULL) < 0) {
1317                                         log_error("Failed to make read-only.");
1318                                         goto child_fail;
1319                                 }
1320
1321                         if (mount_all(arg_directory) < 0)
1322                                 goto child_fail;
1323
1324                         if (copy_devnodes(arg_directory) < 0)
1325                                 goto child_fail;
1326
1327                         if (setup_ptmx(arg_directory) < 0)
1328                                 goto child_fail;
1329
1330                         dev_setup(arg_directory);
1331
1332                         if (setup_dev_console(arg_directory, console) < 0)
1333                                 goto child_fail;
1334
1335                         if (setup_kmsg(arg_directory, kmsg_socket_pair[1]) < 0)
1336                                 goto child_fail;
1337
1338                         close_nointr_nofail(kmsg_socket_pair[1]);
1339                         kmsg_socket_pair[1] = -1;
1340
1341                         if (setup_boot_id(arg_directory) < 0)
1342                                 goto child_fail;
1343
1344                         if (setup_timezone(arg_directory) < 0)
1345                                 goto child_fail;
1346
1347                         if (setup_resolv_conf(arg_directory) < 0)
1348                                 goto child_fail;
1349
1350                         if (setup_journal(arg_directory) < 0)
1351                                 goto child_fail;
1352
1353                         if (mount_binds(arg_directory, arg_bind, 0) < 0)
1354                                 goto child_fail;
1355
1356                         if (mount_binds(arg_directory, arg_bind_ro, MS_RDONLY) < 0)
1357                                 goto child_fail;
1358
1359                         if (setup_kdbus(arg_directory, kdbus_domain) < 0)
1360                                 goto child_fail;
1361
1362                         if (chdir(arg_directory) < 0) {
1363                                 log_error("chdir(%s) failed: %m", arg_directory);
1364                                 goto child_fail;
1365                         }
1366
1367                         if (mount(arg_directory, "/", NULL, MS_MOVE, NULL) < 0) {
1368                                 log_error("mount(MS_MOVE) failed: %m");
1369                                 goto child_fail;
1370                         }
1371
1372                         if (chroot(".") < 0) {
1373                                 log_error("chroot() failed: %m");
1374                                 goto child_fail;
1375                         }
1376
1377                         if (chdir("/") < 0) {
1378                                 log_error("chdir() failed: %m");
1379                                 goto child_fail;
1380                         }
1381
1382                         umask(0022);
1383
1384                         loopback_setup();
1385
1386                         if (drop_capabilities() < 0) {
1387                                 log_error("drop_capabilities() failed: %m");
1388                                 goto child_fail;
1389                         }
1390
1391                         if (arg_user) {
1392
1393                                 /* Note that this resolves user names
1394                                  * inside the container, and hence
1395                                  * accesses the NSS modules from the
1396                                  * container and not the host. This is
1397                                  * a bit weird... */
1398
1399                                 if (get_user_creds((const char**)&arg_user, &uid, &gid, &home, NULL) < 0) {
1400                                         log_error("get_user_creds() failed: %m");
1401                                         goto child_fail;
1402                                 }
1403
1404                                 if (mkdir_parents_label(home, 0775) < 0) {
1405                                         log_error("mkdir_parents_label() failed: %m");
1406                                         goto child_fail;
1407                                 }
1408
1409                                 if (mkdir_safe_label(home, 0775, uid, gid) < 0) {
1410                                         log_error("mkdir_safe_label() failed: %m");
1411                                         goto child_fail;
1412                                 }
1413
1414                                 if (initgroups((const char*)arg_user, gid) < 0) {
1415                                         log_error("initgroups() failed: %m");
1416                                         goto child_fail;
1417                                 }
1418
1419                                 if (setresgid(gid, gid, gid) < 0) {
1420                                         log_error("setregid() failed: %m");
1421                                         goto child_fail;
1422                                 }
1423
1424                                 if (setresuid(uid, uid, uid) < 0) {
1425                                         log_error("setreuid() failed: %m");
1426                                         goto child_fail;
1427                                 }
1428                         } else {
1429                                 /* Reset everything fully to 0, just in case */
1430
1431                                 if (setgroups(0, NULL) < 0) {
1432                                         log_error("setgroups() failed: %m");
1433                                         goto child_fail;
1434                                 }
1435
1436                                 if (setresgid(0, 0, 0) < 0) {
1437                                         log_error("setregid() failed: %m");
1438                                         goto child_fail;
1439                                 }
1440
1441                                 if (setresuid(0, 0, 0) < 0) {
1442                                         log_error("setreuid() failed: %m");
1443                                         goto child_fail;
1444                                 }
1445                         }
1446
1447                         if ((asprintf((char**)(envp + n_env++), "HOME=%s", home ? home: "/root") < 0) ||
1448                             (asprintf((char**)(envp + n_env++), "USER=%s", arg_user ? arg_user : "root") < 0) ||
1449                             (asprintf((char**)(envp + n_env++), "LOGNAME=%s", arg_user ? arg_user : "root") < 0)) {
1450                                 log_oom();
1451                                 goto child_fail;
1452                         }
1453
1454                         if (!sd_id128_equal(arg_uuid, SD_ID128_NULL)) {
1455                                 if (asprintf((char**)(envp + n_env++), "container_uuid=" SD_ID128_FORMAT_STR, SD_ID128_FORMAT_VAL(arg_uuid)) < 0) {
1456                                         log_oom();
1457                                         goto child_fail;
1458                                 }
1459                         }
1460
1461                         if (fdset_size(fds) > 0) {
1462                                 k = fdset_cloexec(fds, false);
1463                                 if (k < 0) {
1464                                         log_error("Failed to unset O_CLOEXEC for file descriptors.");
1465                                         goto child_fail;
1466                                 }
1467
1468                                 if ((asprintf((char **)(envp + n_env++), "LISTEN_FDS=%u", n_fd_passed) < 0) ||
1469                                     (asprintf((char **)(envp + n_env++), "LISTEN_PID=1") < 0)) {
1470                                         log_oom();
1471                                         goto child_fail;
1472                                 }
1473                         }
1474
1475                         setup_hostname();
1476
1477                         eventfd_read(sync_fd, &x);
1478                         close_nointr_nofail(sync_fd);
1479                         sync_fd = -1;
1480
1481                         if (!strv_isempty(arg_setenv)) {
1482                                 char **n;
1483
1484                                 n = strv_env_merge(2, envp, arg_setenv);
1485                                 if (!n) {
1486                                         log_oom();
1487                                         goto child_fail;
1488                                 }
1489
1490                                 env_use = n;
1491                         } else
1492                                 env_use = (char**) envp;
1493
1494                         if (arg_boot) {
1495                                 char **a;
1496                                 size_t l;
1497
1498                                 /* Automatically search for the init system */
1499
1500                                 l = 1 + argc - optind;
1501                                 a = newa(char*, l + 1);
1502                                 memcpy(a + 1, argv + optind, l * sizeof(char*));
1503
1504                                 a[0] = (char*) "/usr/lib/systemd/systemd";
1505                                 execve(a[0], a, env_use);
1506
1507                                 a[0] = (char*) "/lib/systemd/systemd";
1508                                 execve(a[0], a, env_use);
1509
1510                                 a[0] = (char*) "/sbin/init";
1511                                 execve(a[0], a, env_use);
1512                         } else if (argc > optind)
1513                                 execvpe(argv[optind], argv + optind, env_use);
1514                         else {
1515                                 chdir(home ? home : "/root");
1516                                 execle("/bin/bash", "-bash", NULL, env_use);
1517                         }
1518
1519                         log_error("execv() failed: %m");
1520
1521                 child_fail:
1522                         _exit(EXIT_FAILURE);
1523                 }
1524
1525                 fdset_free(fds);
1526                 fds = NULL;
1527
1528                 r = register_machine(pid);
1529                 if (r < 0)
1530                         goto finish;
1531
1532                 eventfd_write(sync_fd, 1);
1533                 close_nointr_nofail(sync_fd);
1534                 sync_fd = -1;
1535
1536                 k = process_pty(master, &mask, arg_boot ? pid : 0, SIGRTMIN+3);
1537                 if (k < 0) {
1538                         r = EXIT_FAILURE;
1539                         break;
1540                 }
1541
1542                 putc('\n', stdout);
1543
1544                 /* Kill if it is not dead yet anyway */
1545                 terminate_machine(pid);
1546
1547                 /* Redundant, but better safe than sorry */
1548                 kill(pid, SIGKILL);
1549
1550                 k = wait_for_terminate(pid, &status);
1551                 pid = 0;
1552
1553                 if (k < 0) {
1554                         r = EXIT_FAILURE;
1555                         break;
1556                 }
1557
1558                 if (status.si_code == CLD_EXITED) {
1559                         r = status.si_status;
1560                         if (status.si_status != 0) {
1561                                 log_error("Container %s failed with error code %i.", arg_machine, status.si_status);
1562                                 break;
1563                         }
1564
1565                         log_debug("Container %s exited successfully.", arg_machine);
1566                         break;
1567                 } else if (status.si_code == CLD_KILLED &&
1568                            status.si_status == SIGINT) {
1569                         log_info("Container %s has been shut down.", arg_machine);
1570                         r = 0;
1571                         break;
1572                 } else if (status.si_code == CLD_KILLED &&
1573                            status.si_status == SIGHUP) {
1574                         log_info("Container %s is being rebooted.", arg_machine);
1575                         continue;
1576                 } else if (status.si_code == CLD_KILLED ||
1577                            status.si_code == CLD_DUMPED) {
1578
1579                         log_error("Container %s terminated by signal %s.", arg_machine,  signal_to_string(status.si_status));
1580                         r = EXIT_FAILURE;
1581                         break;
1582                 } else {
1583                         log_error("Container %s failed due to unknown reason.", arg_machine);
1584                         r = EXIT_FAILURE;
1585                         break;
1586                 }
1587         }
1588
1589 finish:
1590         if (pid > 0)
1591                 kill(pid, SIGKILL);
1592
1593         free(arg_directory);
1594         free(arg_machine);
1595         free(arg_setenv);
1596
1597         return r;
1598 }