chiark / gitweb /
241b4b5393a1f81988a277583db61fd722cd50b4
[elogind.git] / src / nspawn / nspawn.c
1 /*-*- Mode: C; c-basic-offset: 8; indent-tabs-mode: nil -*-*/
2
3 /***
4   This file is part of systemd.
5
6   Copyright 2010 Lennart Poettering
7
8   systemd is free software; you can redistribute it and/or modify it
9   under the terms of the GNU Lesser General Public License as published by
10   the Free Software Foundation; either version 2.1 of the License, or
11   (at your option) any later version.
12
13   systemd is distributed in the hope that it will be useful, but
14   WITHOUT ANY WARRANTY; without even the implied warranty of
15   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
16   Lesser General Public License for more details.
17
18   You should have received a copy of the GNU Lesser General Public License
19   along with systemd; If not, see <http://www.gnu.org/licenses/>.
20 ***/
21
22 #include <signal.h>
23 #include <sched.h>
24 #include <unistd.h>
25 #include <sys/types.h>
26 #include <sys/syscall.h>
27 #include <sys/mount.h>
28 #include <sys/wait.h>
29 #include <stdlib.h>
30 #include <string.h>
31 #include <stdio.h>
32 #include <errno.h>
33 #include <sys/prctl.h>
34 #include <sys/capability.h>
35 #include <getopt.h>
36 #include <termios.h>
37 #include <sys/signalfd.h>
38 #include <grp.h>
39 #include <linux/fs.h>
40 #include <sys/un.h>
41 #include <sys/socket.h>
42 #include <linux/netlink.h>
43
44 #include "sd-daemon.h"
45 #include "sd-bus.h"
46 #include "sd-id128.h"
47 #include "log.h"
48 #include "util.h"
49 #include "mkdir.h"
50 #include "macro.h"
51 #include "audit.h"
52 #include "missing.h"
53 #include "cgroup-util.h"
54 #include "strv.h"
55 #include "path-util.h"
56 #include "loopback-setup.h"
57 #include "dev-setup.h"
58 #include "fdset.h"
59 #include "build.h"
60 #include "fileio.h"
61 #include "bus-util.h"
62 #include "bus-error.h"
63 #include "ptyfwd.h"
64 #include "bus-kernel.h"
65 #include "env-util.h"
66
67 #ifndef TTY_GID
68 #define TTY_GID 5
69 #endif
70
/* Modes accepted by --link-journal= (see parse_argv()). */
typedef enum LinkJournal {
        LINK_NO,     /* --link-journal=no */
        LINK_AUTO,   /* --link-journal=auto; also the initial value of arg_link_journal */
        LINK_HOST,   /* --link-journal=host */
        LINK_GUEST,  /* --link-journal=guest, and the -j switch */
} LinkJournal;
77
78 static char *arg_directory = NULL;
79 static char *arg_user = NULL;
80 static sd_id128_t arg_uuid = {};
81 static char *arg_machine = NULL;
82 static const char *arg_slice = NULL;
83 static bool arg_private_network = false;
84 static bool arg_read_only = false;
85 static bool arg_boot = false;
86 static LinkJournal arg_link_journal = LINK_AUTO;
87 static uint64_t arg_retain =
88         (1ULL << CAP_CHOWN) |
89         (1ULL << CAP_DAC_OVERRIDE) |
90         (1ULL << CAP_DAC_READ_SEARCH) |
91         (1ULL << CAP_FOWNER) |
92         (1ULL << CAP_FSETID) |
93         (1ULL << CAP_IPC_OWNER) |
94         (1ULL << CAP_KILL) |
95         (1ULL << CAP_LEASE) |
96         (1ULL << CAP_LINUX_IMMUTABLE) |
97         (1ULL << CAP_NET_BIND_SERVICE) |
98         (1ULL << CAP_NET_BROADCAST) |
99         (1ULL << CAP_NET_RAW) |
100         (1ULL << CAP_SETGID) |
101         (1ULL << CAP_SETFCAP) |
102         (1ULL << CAP_SETPCAP) |
103         (1ULL << CAP_SETUID) |
104         (1ULL << CAP_SYS_ADMIN) |
105         (1ULL << CAP_SYS_CHROOT) |
106         (1ULL << CAP_SYS_NICE) |
107         (1ULL << CAP_SYS_PTRACE) |
108         (1ULL << CAP_SYS_TTY_CONFIG) |
109         (1ULL << CAP_SYS_RESOURCE) |
110         (1ULL << CAP_SYS_BOOT) |
111         (1ULL << CAP_AUDIT_WRITE) |
112         (1ULL << CAP_AUDIT_CONTROL);
113 static char **arg_bind = NULL;
114 static char **arg_bind_ro = NULL;
115 static char **arg_setenv = NULL;
116
117 static int help(void) {
118
119         printf("%s [OPTIONS...] [PATH] [ARGUMENTS...]\n\n"
120                "Spawn a minimal namespace container for debugging, testing and building.\n\n"
121                "  -h --help                Show this help\n"
122                "     --version             Print version string\n"
123                "  -D --directory=NAME      Root directory for the container\n"
124                "  -b --boot                Boot up full system (i.e. invoke init)\n"
125                "  -u --user=USER           Run the command under specified user or uid\n"
126                "     --uuid=UUID           Set a specific machine UUID for the container\n"
127                "  -M --machine=NAME        Set the machine name for the container\n"
128                "  -S --slice=SLICE         Place the container in the specified slice\n"
129                "     --private-network     Disable network in container\n"
130                "     --read-only           Mount the root directory read-only\n"
131                "     --capability=CAP      In addition to the default, retain specified\n"
132                "                           capability\n"
133                "     --drop-capability=CAP Drop the specified capability from the default set\n"
134                "     --link-journal=MODE   Link up guest journal, one of no, auto, guest, host\n"
135                "  -j                       Equivalent to --link-journal=host\n"
136                "     --bind=PATH[:PATH]    Bind mount a file or directory from the host into\n"
137                "                           the container\n"
138                "     --bind-ro=PATH[:PATH] Similar, but creates a read-only bind mount\n"
139                "     --setenv=NAME=VALUE   Pass an environment variable to PID 1\n",
140                program_invocation_short_name);
141
142         return 0;
143 }
144
145 static int parse_argv(int argc, char *argv[]) {
146
147         enum {
148                 ARG_VERSION = 0x100,
149                 ARG_PRIVATE_NETWORK,
150                 ARG_UUID,
151                 ARG_READ_ONLY,
152                 ARG_CAPABILITY,
153                 ARG_DROP_CAPABILITY,
154                 ARG_LINK_JOURNAL,
155                 ARG_BIND,
156                 ARG_BIND_RO,
157                 ARG_SETENV,
158         };
159
160         static const struct option options[] = {
161                 { "help",            no_argument,       NULL, 'h'                 },
162                 { "version",         no_argument,       NULL, ARG_VERSION         },
163                 { "directory",       required_argument, NULL, 'D'                 },
164                 { "user",            required_argument, NULL, 'u'                 },
165                 { "private-network", no_argument,       NULL, ARG_PRIVATE_NETWORK },
166                 { "boot",            no_argument,       NULL, 'b'                 },
167                 { "uuid",            required_argument, NULL, ARG_UUID            },
168                 { "read-only",       no_argument,       NULL, ARG_READ_ONLY       },
169                 { "capability",      required_argument, NULL, ARG_CAPABILITY      },
170                 { "drop-capability", required_argument, NULL, ARG_DROP_CAPABILITY },
171                 { "link-journal",    required_argument, NULL, ARG_LINK_JOURNAL    },
172                 { "bind",            required_argument, NULL, ARG_BIND            },
173                 { "bind-ro",         required_argument, NULL, ARG_BIND_RO         },
174                 { "machine",         required_argument, NULL, 'M'                 },
175                 { "slice",           required_argument, NULL, 'S'                 },
176                 { "setenv",          required_argument, NULL, ARG_SETENV          },
177                 {}
178         };
179
180         int c, r;
181
182         assert(argc >= 0);
183         assert(argv);
184
185         while ((c = getopt_long(argc, argv, "+hD:u:bM:jS:", options, NULL)) >= 0) {
186
187                 switch (c) {
188
189                 case 'h':
190                         return help();
191
192                 case ARG_VERSION:
193                         puts(PACKAGE_STRING);
194                         puts(SYSTEMD_FEATURES);
195                         return 0;
196
197                 case 'D':
198                         free(arg_directory);
199                         arg_directory = canonicalize_file_name(optarg);
200                         if (!arg_directory) {
201                                 log_error("Invalid root directory: %m");
202                                 return -ENOMEM;
203                         }
204
205                         break;
206
207                 case 'u':
208                         free(arg_user);
209                         arg_user = strdup(optarg);
210                         if (!arg_user)
211                                 return log_oom();
212
213                         break;
214
215                 case ARG_PRIVATE_NETWORK:
216                         arg_private_network = true;
217                         break;
218
219                 case 'b':
220                         arg_boot = true;
221                         break;
222
223                 case ARG_UUID:
224                         r = sd_id128_from_string(optarg, &arg_uuid);
225                         if (r < 0) {
226                                 log_error("Invalid UUID: %s", optarg);
227                                 return r;
228                         }
229                         break;
230
231                 case 'S':
232                         arg_slice = strdup(optarg);
233                         if (!arg_slice)
234                                 return log_oom();
235
236                         break;
237
238                 case 'M':
239                         if (!hostname_is_valid(optarg)) {
240                                 log_error("Invalid machine name: %s", optarg);
241                                 return -EINVAL;
242                         }
243
244                         free(arg_machine);
245                         arg_machine = strdup(optarg);
246                         if (!arg_machine)
247                                 return log_oom();
248
249                         break;
250
251                 case ARG_READ_ONLY:
252                         arg_read_only = true;
253                         break;
254
255                 case ARG_CAPABILITY:
256                 case ARG_DROP_CAPABILITY: {
257                         char *state, *word;
258                         size_t length;
259
260                         FOREACH_WORD_SEPARATOR(word, length, optarg, ",", state) {
261                                 cap_value_t cap;
262                                 char *t;
263
264                                 t = strndup(word, length);
265                                 if (!t)
266                                         return log_oom();
267
268                                 if (cap_from_name(t, &cap) < 0) {
269                                         log_error("Failed to parse capability %s.", t);
270                                         free(t);
271                                         return -EINVAL;
272                                 }
273
274                                 free(t);
275
276                                 if (c == ARG_CAPABILITY)
277                                         arg_retain |= 1ULL << (uint64_t) cap;
278                                 else
279                                         arg_retain &= ~(1ULL << (uint64_t) cap);
280                         }
281
282                         break;
283                 }
284
285                 case 'j':
286                         arg_link_journal = LINK_GUEST;
287                         break;
288
289                 case ARG_LINK_JOURNAL:
290                         if (streq(optarg, "auto"))
291                                 arg_link_journal = LINK_AUTO;
292                         else if (streq(optarg, "no"))
293                                 arg_link_journal = LINK_NO;
294                         else if (streq(optarg, "guest"))
295                                 arg_link_journal = LINK_GUEST;
296                         else if (streq(optarg, "host"))
297                                 arg_link_journal = LINK_HOST;
298                         else {
299                                 log_error("Failed to parse link journal mode %s", optarg);
300                                 return -EINVAL;
301                         }
302
303                         break;
304
305                 case ARG_BIND:
306                 case ARG_BIND_RO: {
307                         _cleanup_free_ char *a = NULL, *b = NULL;
308                         char *e;
309                         char ***x;
310
311                         x = c == ARG_BIND ? &arg_bind : &arg_bind_ro;
312
313                         e = strchr(optarg, ':');
314                         if (e) {
315                                 a = strndup(optarg, e - optarg);
316                                 b = strdup(e + 1);
317                         } else {
318                                 a = strdup(optarg);
319                                 b = strdup(optarg);
320                         }
321
322                         if (!a || !b)
323                                 return log_oom();
324
325                         if (!path_is_absolute(a) || !path_is_absolute(b)) {
326                                 log_error("Invalid bind mount specification: %s", optarg);
327                                 return -EINVAL;
328                         }
329
330                         r = strv_extend(x, a);
331                         if (r < 0)
332                                 return log_oom();
333
334                         r = strv_extend(x, b);
335                         if (r < 0)
336                                 return log_oom();
337
338                         break;
339                 }
340
341                 case ARG_SETENV: {
342                         char **n;
343
344                         if (!env_assignment_is_valid(optarg)) {
345                                 log_error("Environment variable assignment '%s' is not valid.", optarg);
346                                 return -EINVAL;
347                         }
348
349                         n = strv_env_set(arg_setenv, optarg);
350                         if (!n)
351                                 return log_oom();
352
353                         strv_free(arg_setenv);
354                         arg_setenv = n;
355                         break;
356                 }
357
358                 case '?':
359                         return -EINVAL;
360
361                 default:
362                         assert_not_reached("Unhandled option");
363                 }
364         }
365
366         return 1;
367 }
368
369 static int mount_all(const char *dest) {
370
371         typedef struct MountPoint {
372                 const char *what;
373                 const char *where;
374                 const char *type;
375                 const char *options;
376                 unsigned long flags;
377                 bool fatal;
378         } MountPoint;
379
380         static const MountPoint mount_table[] = {
381                 { "proc",      "/proc",     "proc",  NULL,       MS_NOSUID|MS_NOEXEC|MS_NODEV, true  },
382                 { "/proc/sys", "/proc/sys", NULL,    NULL,       MS_BIND, true                       },   /* Bind mount first */
383                 { NULL,        "/proc/sys", NULL,    NULL,       MS_BIND|MS_RDONLY|MS_REMOUNT, true  },   /* Then, make it r/o */
384                 { "sysfs",     "/sys",      "sysfs", NULL,       MS_RDONLY|MS_NOSUID|MS_NOEXEC|MS_NODEV, true  },
385                 { "tmpfs",     "/dev",      "tmpfs", "mode=755", MS_NOSUID|MS_STRICTATIME,     true  },
386                 { "devpts",    "/dev/pts",  "devpts","newinstance,ptmxmode=0666,mode=620,gid=" STRINGIFY(TTY_GID), MS_NOSUID|MS_NOEXEC, true },
387                 { "tmpfs",     "/dev/shm",  "tmpfs", "mode=1777", MS_NOSUID|MS_NODEV|MS_STRICTATIME, true  },
388                 { "tmpfs",     "/run",      "tmpfs", "mode=755", MS_NOSUID|MS_NODEV|MS_STRICTATIME, true  },
389 #ifdef HAVE_SELINUX
390                 { "/sys/fs/selinux", "/sys/fs/selinux", NULL, NULL, MS_BIND,                      false },  /* Bind mount first */
391                 { NULL,              "/sys/fs/selinux", NULL, NULL, MS_BIND|MS_RDONLY|MS_REMOUNT, false },  /* Then, make it r/o */
392 #endif
393         };
394
395         unsigned k;
396         int r = 0;
397
398         for (k = 0; k < ELEMENTSOF(mount_table); k++) {
399                 _cleanup_free_ char *where = NULL;
400                 int t;
401
402                 where = strjoin(dest, "/", mount_table[k].where, NULL);
403                 if (!where)
404                         return log_oom();
405
406                 t = path_is_mount_point(where, true);
407                 if (t < 0) {
408                         log_error("Failed to detect whether %s is a mount point: %s", where, strerror(-t));
409
410                         if (r == 0)
411                                 r = t;
412
413                         continue;
414                 }
415
416                 /* Skip this entry if it is not a remount. */
417                 if (mount_table[k].what && t > 0)
418                         continue;
419
420                 mkdir_p(where, 0755);
421
422                 if (mount(mount_table[k].what,
423                           where,
424                           mount_table[k].type,
425                           mount_table[k].flags,
426                           mount_table[k].options) < 0 &&
427                     mount_table[k].fatal) {
428
429                         log_error("mount(%s) failed: %m", where);
430
431                         if (r == 0)
432                                 r = -errno;
433                 }
434         }
435
436         return r;
437 }
438
439 static int mount_binds(const char *dest, char **l, unsigned long flags) {
440         char **x, **y;
441
442         STRV_FOREACH_PAIR(x, y, l) {
443                 char *where;
444                 struct stat source_st, dest_st;
445                 int r;
446
447                 if (stat(*x, &source_st) < 0) {
448                         log_error("failed to stat %s: %m", *x);
449                         return -errno;
450                 }
451
452                 where = strappenda(dest, *y);
453                 r = stat(where, &dest_st);
454                 if (r == 0) {
455                         if ((source_st.st_mode & S_IFMT) != (dest_st.st_mode & S_IFMT)) {
456                                 log_error("The file types of %s and %s do not match. Refusing bind mount",
457                                                 *x, where);
458                                 return -EINVAL;
459                         }
460                 } else if (errno == ENOENT) {
461                         r = mkdir_parents_label(where, 0755);
462                         if (r < 0) {
463                                 log_error("Failed to bind mount %s: %s", *x, strerror(-r));
464                                 return r;
465                         }
466                 } else {
467                         log_error("Failed to bind mount %s: %s", *x, strerror(errno));
468                         return -errno;
469                 }
470                 /* Create the mount point, but be conservative -- refuse to create block
471                 * and char devices. */
472                 if (S_ISDIR(source_st.st_mode))
473                         mkdir_label(where, 0755);
474                 else if (S_ISFIFO(source_st.st_mode))
475                         mkfifo(where, 0644);
476                 else if (S_ISSOCK(source_st.st_mode))
477                         mknod(where, 0644 | S_IFSOCK, 0);
478                 else if (S_ISREG(source_st.st_mode))
479                         touch(where);
480                 else {
481                         log_error("Refusing to create mountpoint for file: %s", *x);
482                         return -ENOTSUP;
483                 }
484
485                 if (mount(*x, where, "bind", MS_BIND, NULL) < 0) {
486                         log_error("mount(%s) failed: %m", where);
487                         return -errno;
488                 }
489
490                 if (flags && mount(NULL, where, NULL, MS_REMOUNT|MS_BIND|flags, NULL) < 0) {
491                         log_error("mount(%s) failed: %m", where);
492                         return -errno;
493                 }
494         }
495
496         return 0;
497 }
498
499 static int setup_timezone(const char *dest) {
500         _cleanup_free_ char *where = NULL, *p = NULL, *q = NULL, *check = NULL, *what = NULL;
501         char *z, *y;
502         int r;
503
504         assert(dest);
505
506         /* Fix the timezone, if possible */
507         r = readlink_malloc("/etc/localtime", &p);
508         if (r < 0) {
509                 log_warning("/etc/localtime is not a symlink, not updating container timezone.");
510                 return 0;
511         }
512
513         z = path_startswith(p, "../usr/share/zoneinfo/");
514         if (!z)
515                 z = path_startswith(p, "/usr/share/zoneinfo/");
516         if (!z) {
517                 log_warning("/etc/localtime does not point into /usr/share/zoneinfo/, not updating container timezone.");
518                 return 0;
519         }
520
521         where = strappend(dest, "/etc/localtime");
522         if (!where)
523                 return log_oom();
524
525         r = readlink_malloc(where, &q);
526         if (r >= 0) {
527                 y = path_startswith(q, "../usr/share/zoneinfo/");
528                 if (!y)
529                         y = path_startswith(q, "/usr/share/zoneinfo/");
530
531
532                 /* Already pointing to the right place? Then do nothing .. */
533                 if (y && streq(y, z))
534                         return 0;
535         }
536
537         check = strjoin(dest, "/usr/share/zoneinfo/", z, NULL);
538         if (!check)
539                 return log_oom();
540
541         if (access(check, F_OK) < 0) {
542                 log_warning("Timezone %s does not exist in container, not updating container timezone.", z);
543                 return 0;
544         }
545
546         what = strappend("../usr/share/zoneinfo/", z);
547         if (!what)
548                 return log_oom();
549
550         unlink(where);
551         if (symlink(what, where) < 0) {
552                 log_error("Failed to correct timezone of container: %m");
553                 return 0;
554         }
555
556         return 0;
557 }
558
559 static int setup_resolv_conf(const char *dest) {
560         char _cleanup_free_ *where = NULL;
561
562         assert(dest);
563
564         if (arg_private_network)
565                 return 0;
566
567         /* Fix resolv.conf, if possible */
568         where = strappend(dest, "/etc/resolv.conf");
569         if (!where)
570                 return log_oom();
571
572         /* We don't really care for the results of this really. If it
573          * fails, it fails, but meh... */
574         copy_file("/etc/resolv.conf", where, O_TRUNC|O_NOFOLLOW);
575
576         return 0;
577 }
578
579 static int setup_boot_id(const char *dest) {
580         _cleanup_free_ char *from = NULL, *to = NULL;
581         sd_id128_t rnd;
582         char as_uuid[37];
583         int r;
584
585         assert(dest);
586
587         /* Generate a new randomized boot ID, so that each boot-up of
588          * the container gets a new one */
589
590         from = strappend(dest, "/dev/proc-sys-kernel-random-boot-id");
591         to = strappend(dest, "/proc/sys/kernel/random/boot_id");
592         if (!from || !to)
593                 return log_oom();
594
595         r = sd_id128_randomize(&rnd);
596         if (r < 0) {
597                 log_error("Failed to generate random boot id: %s", strerror(-r));
598                 return r;
599         }
600
601         snprintf(as_uuid, sizeof(as_uuid),
602                  "%02x%02x%02x%02x-%02x%02x-%02x%02x-%02x%02x-%02x%02x%02x%02x%02x%02x",
603                  SD_ID128_FORMAT_VAL(rnd));
604         char_array_0(as_uuid);
605
606         r = write_string_file(from, as_uuid);
607         if (r < 0) {
608                 log_error("Failed to write boot id: %s", strerror(-r));
609                 return r;
610         }
611
612         if (mount(from, to, "bind", MS_BIND, NULL) < 0) {
613                 log_error("Failed to bind mount boot id: %m");
614                 r = -errno;
615         } else if (mount(from, to, "bind", MS_BIND|MS_REMOUNT|MS_RDONLY, NULL))
616                 log_warning("Failed to make boot id read-only: %m");
617
618         unlink(from);
619         return r;
620 }
621
622 static int copy_devnodes(const char *dest) {
623
624         static const char devnodes[] =
625                 "null\0"
626                 "zero\0"
627                 "full\0"
628                 "random\0"
629                 "urandom\0"
630                 "tty\0";
631
632         const char *d;
633         int r = 0;
634         _cleanup_umask_ mode_t u;
635
636         assert(dest);
637
638         u = umask(0000);
639
640         NULSTR_FOREACH(d, devnodes) {
641                 struct stat st;
642                 _cleanup_free_ char *from = NULL, *to = NULL;
643
644                 asprintf(&from, "/dev/%s", d);
645                 asprintf(&to, "%s/dev/%s", dest, d);
646
647                 if (!from || !to) {
648                         log_oom();
649
650                         if (r == 0)
651                                 r = -ENOMEM;
652
653                         break;
654                 }
655
656                 if (stat(from, &st) < 0) {
657
658                         if (errno != ENOENT) {
659                                 log_error("Failed to stat %s: %m", from);
660                                 if (r == 0)
661                                         r = -errno;
662                         }
663
664                 } else if (!S_ISCHR(st.st_mode) && !S_ISBLK(st.st_mode)) {
665
666                         log_error("%s is not a char or block device, cannot copy", from);
667                         if (r == 0)
668                                 r = -EIO;
669
670                 } else if (mknod(to, st.st_mode, st.st_rdev) < 0) {
671
672                         log_error("mknod(%s) failed: %m", dest);
673                         if (r == 0)
674                                 r = -errno;
675                 }
676         }
677
678         return r;
679 }
680
681 static int setup_ptmx(const char *dest) {
682         _cleanup_free_ char *p = NULL;
683
684         p = strappend(dest, "/dev/ptmx");
685         if (!p)
686                 return log_oom();
687
688         if (symlink("pts/ptmx", p) < 0) {
689                 log_error("Failed to create /dev/ptmx symlink: %m");
690                 return -errno;
691         }
692
693         return 0;
694 }
695
696 static int setup_dev_console(const char *dest, const char *console) {
697         struct stat st;
698         _cleanup_free_ char *to = NULL;
699         int r;
700         _cleanup_umask_ mode_t u;
701
702         assert(dest);
703         assert(console);
704
705         u = umask(0000);
706
707         if (stat(console, &st) < 0) {
708                 log_error("Failed to stat %s: %m", console);
709                 return -errno;
710
711         } else if (!S_ISCHR(st.st_mode)) {
712                 log_error("/dev/console is not a char device");
713                 return -EIO;
714         }
715
716         r = chmod_and_chown(console, 0600, 0, 0);
717         if (r < 0) {
718                 log_error("Failed to correct access mode for TTY: %s", strerror(-r));
719                 return r;
720         }
721
722         if (asprintf(&to, "%s/dev/console", dest) < 0)
723                 return log_oom();
724
725         /* We need to bind mount the right tty to /dev/console since
726          * ptys can only exist on pts file systems. To have something
727          * to bind mount things on we create a device node first, that
728          * has the right major/minor (note that the major minor
729          * doesn't actually matter here, since we mount it over
730          * anyway). */
731
732         if (mknod(to, (st.st_mode & ~07777) | 0600, st.st_rdev) < 0) {
733                 log_error("mknod() for /dev/console failed: %m");
734                 return -errno;
735         }
736
737         if (mount(console, to, "bind", MS_BIND, NULL) < 0) {
738                 log_error("Bind mount for /dev/console failed: %m");
739                 return -errno;
740         }
741
742         return 0;
743 }
744
745 static int setup_kmsg(const char *dest, int kmsg_socket) {
746         _cleanup_free_ char *from = NULL, *to = NULL;
747         int r, fd, k;
748         _cleanup_umask_ mode_t u;
749         union {
750                 struct cmsghdr cmsghdr;
751                 uint8_t buf[CMSG_SPACE(sizeof(int))];
752         } control = {};
753         struct msghdr mh = {
754                 .msg_control = &control,
755                 .msg_controllen = sizeof(control),
756         };
757         struct cmsghdr *cmsg;
758
759         assert(dest);
760         assert(kmsg_socket >= 0);
761
762         u = umask(0000);
763
764         /* We create the kmsg FIFO as /dev/kmsg, but immediately
765          * delete it after bind mounting it to /proc/kmsg. While FIFOs
766          * on the reading side behave very similar to /proc/kmsg,
767          * their writing side behaves differently from /dev/kmsg in
768          * that writing blocks when nothing is reading. In order to
769          * avoid any problems with containers deadlocking due to this
770          * we simply make /dev/kmsg unavailable to the container. */
771         if (asprintf(&from, "%s/dev/kmsg", dest) < 0 ||
772             asprintf(&to, "%s/proc/kmsg", dest) < 0)
773                 return log_oom();
774
775         if (mkfifo(from, 0600) < 0) {
776                 log_error("mkfifo() for /dev/kmsg failed: %m");
777                 return -errno;
778         }
779
780         r = chmod_and_chown(from, 0600, 0, 0);
781         if (r < 0) {
782                 log_error("Failed to correct access mode for /dev/kmsg: %s", strerror(-r));
783                 return r;
784         }
785
786         if (mount(from, to, "bind", MS_BIND, NULL) < 0) {
787                 log_error("Bind mount for /proc/kmsg failed: %m");
788                 return -errno;
789         }
790
791         fd = open(from, O_RDWR|O_NDELAY|O_CLOEXEC);
792         if (fd < 0) {
793                 log_error("Failed to open fifo: %m");
794                 return -errno;
795         }
796
797         cmsg = CMSG_FIRSTHDR(&mh);
798         cmsg->cmsg_level = SOL_SOCKET;
799         cmsg->cmsg_type = SCM_RIGHTS;
800         cmsg->cmsg_len = CMSG_LEN(sizeof(int));
801         memcpy(CMSG_DATA(cmsg), &fd, sizeof(int));
802
803         mh.msg_controllen = cmsg->cmsg_len;
804
805         /* Store away the fd in the socket, so that it stays open as
806          * long as we run the child */
807         k = sendmsg(kmsg_socket, &mh, MSG_DONTWAIT|MSG_NOSIGNAL);
808         close_nointr_nofail(fd);
809
810         if (k < 0) {
811                 log_error("Failed to send FIFO fd: %m");
812                 return -errno;
813         }
814
815         /* And now make the FIFO unavailable as /dev/kmsg... */
816         unlink(from);
817         return 0;
818 }
819
820 static int setup_hostname(void) {
821
822         if (sethostname(arg_machine, strlen(arg_machine)) < 0)
823                 return -errno;
824
825         return 0;
826 }
827
828 static int setup_journal(const char *directory) {
829         sd_id128_t machine_id, this_id;
830         _cleanup_free_ char *p = NULL, *b = NULL, *q = NULL, *d = NULL;
831         char *id;
832         int r;
833
834         p = strappend(directory, "/etc/machine-id");
835         if (!p)
836                 return log_oom();
837
838         r = read_one_line_file(p, &b);
839         if (r == -ENOENT && arg_link_journal == LINK_AUTO)
840                 return 0;
841         else if (r < 0) {
842                 log_error("Failed to read machine ID from %s: %s", p, strerror(-r));
843                 return r;
844         }
845
846         id = strstrip(b);
847         if (isempty(id) && arg_link_journal == LINK_AUTO)
848                 return 0;
849
850         /* Verify validity */
851         r = sd_id128_from_string(id, &machine_id);
852         if (r < 0) {
853                 log_error("Failed to parse machine ID from %s: %s", p, strerror(-r));
854                 return r;
855         }
856
857         r = sd_id128_get_machine(&this_id);
858         if (r < 0) {
859                 log_error("Failed to retrieve machine ID: %s", strerror(-r));
860                 return r;
861         }
862
863         if (sd_id128_equal(machine_id, this_id)) {
864                 log_full(arg_link_journal == LINK_AUTO ? LOG_WARNING : LOG_ERR,
865                          "Host and machine ids are equal (%s): refusing to link journals", id);
866                 if (arg_link_journal == LINK_AUTO)
867                         return 0;
868                 return
869                         -EEXIST;
870         }
871
872         if (arg_link_journal == LINK_NO)
873                 return 0;
874
875         free(p);
876         p = strappend("/var/log/journal/", id);
877         q = strjoin(directory, "/var/log/journal/", id, NULL);
878         if (!p || !q)
879                 return log_oom();
880
881         if (path_is_mount_point(p, false) > 0) {
882                 if (arg_link_journal != LINK_AUTO) {
883                         log_error("%s: already a mount point, refusing to use for journal", p);
884                         return -EEXIST;
885                 }
886
887                 return 0;
888         }
889
890         if (path_is_mount_point(q, false) > 0) {
891                 if (arg_link_journal != LINK_AUTO) {
892                         log_error("%s: already a mount point, refusing to use for journal", q);
893                         return -EEXIST;
894                 }
895
896                 return 0;
897         }
898
899         r = readlink_and_make_absolute(p, &d);
900         if (r >= 0) {
901                 if ((arg_link_journal == LINK_GUEST ||
902                      arg_link_journal == LINK_AUTO) &&
903                     path_equal(d, q)) {
904
905                         r = mkdir_p(q, 0755);
906                         if (r < 0)
907                                 log_warning("failed to create directory %s: %m", q);
908                         return 0;
909                 }
910
911                 if (unlink(p) < 0) {
912                         log_error("Failed to remove symlink %s: %m", p);
913                         return -errno;
914                 }
915         } else if (r == -EINVAL) {
916
917                 if (arg_link_journal == LINK_GUEST &&
918                     rmdir(p) < 0) {
919
920                         if (errno == ENOTDIR) {
921                                 log_error("%s already exists and is neither a symlink nor a directory", p);
922                                 return r;
923                         } else {
924                                 log_error("Failed to remove %s: %m", p);
925                                 return -errno;
926                         }
927                 }
928         } else if (r != -ENOENT) {
929                 log_error("readlink(%s) failed: %m", p);
930                 return r;
931         }
932
933         if (arg_link_journal == LINK_GUEST) {
934
935                 if (symlink(q, p) < 0) {
936                         log_error("Failed to symlink %s to %s: %m", q, p);
937                         return -errno;
938                 }
939
940                 r = mkdir_p(q, 0755);
941                 if (r < 0)
942                         log_warning("failed to create directory %s: %m", q);
943                 return 0;
944         }
945
946         if (arg_link_journal == LINK_HOST) {
947                 r = mkdir_p(p, 0755);
948                 if (r < 0) {
949                         log_error("Failed to create %s: %m", p);
950                         return r;
951                 }
952
953         } else if (access(p, F_OK) < 0)
954                 return 0;
955
956         if (dir_is_empty(q) == 0) {
957                 log_error("%s not empty.", q);
958                 return -ENOTEMPTY;
959         }
960
961         r = mkdir_p(q, 0755);
962         if (r < 0) {
963                 log_error("Failed to create %s: %m", q);
964                 return r;
965         }
966
967         if (mount(p, q, "bind", MS_BIND, NULL) < 0) {
968                 log_error("Failed to bind mount journal from host into guest: %m");
969                 return -errno;
970         }
971
972         return 0;
973 }
974
975 static int setup_kdbus(const char *dest, const char *path) {
976         const char *p;
977
978         if (!path)
979                 return 0;
980
981         p = strappenda(dest, "/dev/kdbus");
982         if (mkdir(p, 0755) < 0) {
983                 log_error("Failed to create kdbus path: %m");
984                 return  -errno;
985         }
986
987         if (mount(path, p, "bind", MS_BIND, NULL) < 0) {
988                 log_error("Failed to mount kdbus namespace path: %m");
989                 return -errno;
990         }
991
992         return 0;
993 }
994
995 static int drop_capabilities(void) {
996         return capability_bounding_set_drop(~arg_retain, false);
997 }
998
999 static int register_machine(void) {
1000         _cleanup_bus_error_free_ sd_bus_error error = SD_BUS_ERROR_NULL;
1001         _cleanup_bus_unref_ sd_bus *bus = NULL;
1002         int r;
1003
1004         r = sd_bus_open_system(&bus);
1005         if (r < 0) {
1006                 log_error("Failed to open system bus: %s", strerror(-r));
1007                 return r;
1008         }
1009
1010         r = sd_bus_call_method(
1011                         bus,
1012                         "org.freedesktop.machine1",
1013                         "/org/freedesktop/machine1",
1014                         "org.freedesktop.machine1.Manager",
1015                         "CreateMachine",
1016                         &error,
1017                         NULL,
1018                         "sayssusa(sv)",
1019                         arg_machine,
1020                         SD_BUS_MESSAGE_APPEND_ID128(arg_uuid),
1021                         "nspawn",
1022                         "container",
1023                         (uint32_t) 0,
1024                         strempty(arg_directory),
1025                         !isempty(arg_slice), "Slice", "s", arg_slice);
1026         if (r < 0) {
1027                 log_error("Failed to register machine: %s", bus_error_message(&error, r));
1028                 return r;
1029         }
1030
1031         return 0;
1032 }
1033
1034 static int terminate_machine(pid_t pid) {
1035         _cleanup_bus_error_free_ sd_bus_error error = SD_BUS_ERROR_NULL;
1036         _cleanup_bus_message_unref_ sd_bus_message *reply = NULL;
1037         _cleanup_bus_unref_ sd_bus *bus = NULL;
1038         const char *path;
1039         int r;
1040
1041         r = sd_bus_default_system(&bus);
1042         if (r < 0) {
1043                 log_error("Failed to open system bus: %s", strerror(-r));
1044                 return r;
1045         }
1046
1047         r = sd_bus_call_method(
1048                         bus,
1049                         "org.freedesktop.machine1",
1050                         "/org/freedesktop/machine1",
1051                         "org.freedesktop.machine1.Manager",
1052                         "GetMachineByPID",
1053                         &error,
1054                         &reply,
1055                         "u",
1056                         (uint32_t) pid);
1057         if (r < 0) {
1058                 /* Note that the machine might already have been
1059                  * cleaned up automatically, hence don't consider it a
1060                  * failure if we cannot get the machine object. */
1061                 log_debug("Failed to get machine: %s", bus_error_message(&error, r));
1062                 return 0;
1063         }
1064
1065         r = sd_bus_message_read(reply, "o", &path);
1066         if (r < 0)
1067                 return bus_log_parse_error(r);
1068
1069         r = sd_bus_call_method(
1070                         bus,
1071                         "org.freedesktop.machine1",
1072                         path,
1073                         "org.freedesktop.machine1.Machine",
1074                         "Terminate",
1075                         &error,
1076                         NULL,
1077                         NULL);
1078         if (r < 0) {
1079                 log_debug("Failed to terminate machine: %s", bus_error_message(&error, r));
1080                 return 0;
1081         }
1082
1083         return 0;
1084 }
1085
/* Probes for the audit subsystem by trying to open a NETLINK_AUDIT socket.
 * Returns true if the socket could be created (it is closed again right
 * away), false otherwise. */
static bool audit_enabled(void) {
        int probe = socket(AF_NETLINK, SOCK_RAW, NETLINK_AUDIT);

        if (probe < 0)
                return false;

        close_nointr_nofail(probe);
        return true;
}
1096
1097 int main(int argc, char *argv[]) {
1098         pid_t pid = 0;
1099         int r = EXIT_FAILURE, k;
1100         _cleanup_close_ int master = -1, kdbus_fd = -1;
1101         int n_fd_passed;
1102         const char *console = NULL;
1103         sigset_t mask;
1104         _cleanup_close_pipe_ int kmsg_socket_pair[2] = { -1, -1 };
1105         _cleanup_fdset_free_ FDSet *fds = NULL;
1106         _cleanup_free_ char *kdbus_namespace = NULL;
1107         const char *ns;
1108
1109         log_parse_environment();
1110         log_open();
1111
1112         k = parse_argv(argc, argv);
1113         if (k < 0)
1114                 goto finish;
1115         else if (k == 0) {
1116                 r = EXIT_SUCCESS;
1117                 goto finish;
1118         }
1119
1120         if (arg_directory) {
1121                 char *p;
1122
1123                 p = path_make_absolute_cwd(arg_directory);
1124                 free(arg_directory);
1125                 arg_directory = p;
1126         } else
1127                 arg_directory = get_current_dir_name();
1128
1129         if (!arg_directory) {
1130                 log_error("Failed to determine path, please use -D.");
1131                 goto finish;
1132         }
1133
1134         path_kill_slashes(arg_directory);
1135
1136         if (!arg_machine) {
1137                 arg_machine = strdup(basename(arg_directory));
1138                 if (!arg_machine) {
1139                         log_oom();
1140                         goto finish;
1141                 }
1142
1143                 hostname_cleanup(arg_machine, false);
1144                 if (isempty(arg_machine)) {
1145                         log_error("Failed to determine machine name automatically, please use -M.");
1146                         goto finish;
1147                 }
1148         }
1149
1150         if (geteuid() != 0) {
1151                 log_error("Need to be root.");
1152                 goto finish;
1153         }
1154
1155         if (sd_booted() <= 0) {
1156                 log_error("Not running on a systemd system.");
1157                 goto finish;
1158         }
1159
1160         if (arg_boot && audit_enabled()) {
1161                 log_warning("The kernel auditing subsystem is known to be incompatible with containers.\n"
1162                             "Please make sure to turn off auditing with 'audit=0' on the kernel command\n"
1163                             "line before using systemd-nspawn. Sleeping for 5s...\n");
1164                 sleep(5);
1165         }
1166
1167         if (path_equal(arg_directory, "/")) {
1168                 log_error("Spawning container on root directory not supported.");
1169                 goto finish;
1170         }
1171
1172         if (path_is_os_tree(arg_directory) <= 0) {
1173                 log_error("Directory %s doesn't look like an OS root directory (/etc/os-release is missing). Refusing.", arg_directory);
1174                 goto finish;
1175         }
1176
1177         log_close();
1178         n_fd_passed = sd_listen_fds(false);
1179         if (n_fd_passed > 0) {
1180                 k = fdset_new_listen_fds(&fds, false);
1181                 if (k < 0) {
1182                         log_error("Failed to collect file descriptors: %s", strerror(-k));
1183                         goto finish;
1184                 }
1185         }
1186         fdset_close_others(fds);
1187         log_open();
1188
1189         master = posix_openpt(O_RDWR|O_NOCTTY|O_CLOEXEC|O_NDELAY);
1190         if (master < 0) {
1191                 log_error("Failed to acquire pseudo tty: %m");
1192                 goto finish;
1193         }
1194
1195         console = ptsname(master);
1196         if (!console) {
1197                 log_error("Failed to determine tty name: %m");
1198                 goto finish;
1199         }
1200
1201         log_info("Spawning container %s on %s. Press ^] three times within 1s to abort execution.", arg_machine, arg_directory);
1202
1203         if (unlockpt(master) < 0) {
1204                 log_error("Failed to unlock tty: %m");
1205                 goto finish;
1206         }
1207
1208         ns = strappenda("machine-", arg_machine);
1209         kdbus_fd = bus_kernel_create_namespace(ns, &kdbus_namespace);
1210         if (r < 0)
1211                 log_debug("Failed to create kdbus namespace: %s", strerror(-r));
1212         else
1213                 log_debug("Successfully created kdbus namespace as %s", kdbus_namespace);
1214
1215         if (socketpair(AF_UNIX, SOCK_DGRAM|SOCK_NONBLOCK|SOCK_CLOEXEC, 0, kmsg_socket_pair) < 0) {
1216                 log_error("Failed to create kmsg socket pair.");
1217                 goto finish;
1218         }
1219
1220         sd_notify(0, "READY=1");
1221
1222         assert_se(sigemptyset(&mask) == 0);
1223         sigset_add_many(&mask, SIGCHLD, SIGWINCH, SIGTERM, SIGINT, -1);
1224         assert_se(sigprocmask(SIG_BLOCK, &mask, NULL) == 0);
1225
1226         for (;;) {
1227                 siginfo_t status;
1228
1229                 pid = syscall(__NR_clone, SIGCHLD|CLONE_NEWIPC|CLONE_NEWNS|CLONE_NEWPID|CLONE_NEWUTS|(arg_private_network ? CLONE_NEWNET : 0), NULL);
1230                 if (pid < 0) {
1231                         if (errno == EINVAL)
1232                                 log_error("clone() failed, do you have namespace support enabled in your kernel? (You need UTS, IPC, PID and NET namespacing built in): %m");
1233                         else
1234                                 log_error("clone() failed: %m");
1235
1236                         goto finish;
1237                 }
1238
1239                 if (pid == 0) {
1240                         /* child */
1241                         const char *home = NULL;
1242                         uid_t uid = (uid_t) -1;
1243                         gid_t gid = (gid_t) -1;
1244                         unsigned n_env = 2;
1245                         const char *envp[] = {
1246                                 "PATH=" DEFAULT_PATH_SPLIT_USR,
1247                                 "container=systemd-nspawn", /* LXC sets container=lxc, so follow the scheme here */
1248                                 NULL, /* TERM */
1249                                 NULL, /* HOME */
1250                                 NULL, /* USER */
1251                                 NULL, /* LOGNAME */
1252                                 NULL, /* container_uuid */
1253                                 NULL, /* LISTEN_FDS */
1254                                 NULL, /* LISTEN_PID */
1255                                 NULL
1256                         };
1257                         char **env_use;
1258
1259                         envp[n_env] = strv_find_prefix(environ, "TERM=");
1260                         if (envp[n_env])
1261                                 n_env ++;
1262
1263                         close_nointr_nofail(master);
1264                         master = -1;
1265
1266                         close_nointr(STDIN_FILENO);
1267                         close_nointr(STDOUT_FILENO);
1268                         close_nointr(STDERR_FILENO);
1269
1270                         close_nointr_nofail(kmsg_socket_pair[0]);
1271                         kmsg_socket_pair[0] = -1;
1272
1273                         reset_all_signal_handlers();
1274
1275                         assert_se(sigemptyset(&mask) == 0);
1276                         assert_se(sigprocmask(SIG_SETMASK, &mask, NULL) == 0);
1277
1278                         k = open_terminal(console, O_RDWR);
1279                         if (k != STDIN_FILENO) {
1280                                 if (k >= 0) {
1281                                         close_nointr_nofail(k);
1282                                         k = -EINVAL;
1283                                 }
1284
1285                                 log_error("Failed to open console: %s", strerror(-k));
1286                                 goto child_fail;
1287                         }
1288
1289                         if (dup2(STDIN_FILENO, STDOUT_FILENO) != STDOUT_FILENO ||
1290                             dup2(STDIN_FILENO, STDERR_FILENO) != STDERR_FILENO) {
1291                                 log_error("Failed to duplicate console: %m");
1292                                 goto child_fail;
1293                         }
1294
1295                         if (setsid() < 0) {
1296                                 log_error("setsid() failed: %m");
1297                                 goto child_fail;
1298                         }
1299
1300                         if (prctl(PR_SET_PDEATHSIG, SIGKILL) < 0) {
1301                                 log_error("PR_SET_PDEATHSIG failed: %m");
1302                                 goto child_fail;
1303                         }
1304
1305                         r = register_machine();
1306                         if (r < 0)
1307                                 goto finish;
1308
1309                         /* Mark everything as slave, so that we still
1310                          * receive mounts from the real root, but don't
1311                          * propagate mounts to the real root. */
1312                         if (mount(NULL, "/", NULL, MS_SLAVE|MS_REC, NULL) < 0) {
1313                                 log_error("MS_SLAVE|MS_REC failed: %m");
1314                                 goto child_fail;
1315                         }
1316
1317                         /* Turn directory into bind mount */
1318                         if (mount(arg_directory, arg_directory, "bind", MS_BIND|MS_REC, NULL) < 0) {
1319                                 log_error("Failed to make bind mount.");
1320                                 goto child_fail;
1321                         }
1322
1323                         if (arg_read_only)
1324                                 if (mount(arg_directory, arg_directory, "bind", MS_BIND|MS_REMOUNT|MS_RDONLY|MS_REC, NULL) < 0) {
1325                                         log_error("Failed to make read-only.");
1326                                         goto child_fail;
1327                                 }
1328
1329                         if (mount_all(arg_directory) < 0)
1330                                 goto child_fail;
1331
1332                         if (copy_devnodes(arg_directory) < 0)
1333                                 goto child_fail;
1334
1335                         if (setup_ptmx(arg_directory) < 0)
1336                                 goto child_fail;
1337
1338                         dev_setup(arg_directory);
1339
1340                         if (setup_dev_console(arg_directory, console) < 0)
1341                                 goto child_fail;
1342
1343                         if (setup_kmsg(arg_directory, kmsg_socket_pair[1]) < 0)
1344                                 goto child_fail;
1345
1346                         close_nointr_nofail(kmsg_socket_pair[1]);
1347                         kmsg_socket_pair[1] = -1;
1348
1349                         if (setup_boot_id(arg_directory) < 0)
1350                                 goto child_fail;
1351
1352                         if (setup_timezone(arg_directory) < 0)
1353                                 goto child_fail;
1354
1355                         if (setup_resolv_conf(arg_directory) < 0)
1356                                 goto child_fail;
1357
1358                         if (setup_journal(arg_directory) < 0)
1359                                 goto child_fail;
1360
1361                         if (mount_binds(arg_directory, arg_bind, 0) < 0)
1362                                 goto child_fail;
1363
1364                         if (mount_binds(arg_directory, arg_bind_ro, MS_RDONLY) < 0)
1365                                 goto child_fail;
1366
1367                         if (setup_kdbus(arg_directory, kdbus_namespace) < 0)
1368                                 goto child_fail;
1369
1370                         if (chdir(arg_directory) < 0) {
1371                                 log_error("chdir(%s) failed: %m", arg_directory);
1372                                 goto child_fail;
1373                         }
1374
1375                         if (mount(arg_directory, "/", NULL, MS_MOVE, NULL) < 0) {
1376                                 log_error("mount(MS_MOVE) failed: %m");
1377                                 goto child_fail;
1378                         }
1379
1380                         if (chroot(".") < 0) {
1381                                 log_error("chroot() failed: %m");
1382                                 goto child_fail;
1383                         }
1384
1385                         if (chdir("/") < 0) {
1386                                 log_error("chdir() failed: %m");
1387                                 goto child_fail;
1388                         }
1389
1390                         umask(0022);
1391
1392                         loopback_setup();
1393
1394                         if (drop_capabilities() < 0) {
1395                                 log_error("drop_capabilities() failed: %m");
1396                                 goto child_fail;
1397                         }
1398
1399                         if (arg_user) {
1400
1401                                 /* Note that this resolves user names
1402                                  * inside the container, and hence
1403                                  * accesses the NSS modules from the
1404                                  * container and not the host. This is
1405                                  * a bit weird... */
1406
1407                                 if (get_user_creds((const char**)&arg_user, &uid, &gid, &home, NULL) < 0) {
1408                                         log_error("get_user_creds() failed: %m");
1409                                         goto child_fail;
1410                                 }
1411
1412                                 if (mkdir_parents_label(home, 0775) < 0) {
1413                                         log_error("mkdir_parents_label() failed: %m");
1414                                         goto child_fail;
1415                                 }
1416
1417                                 if (mkdir_safe_label(home, 0775, uid, gid) < 0) {
1418                                         log_error("mkdir_safe_label() failed: %m");
1419                                         goto child_fail;
1420                                 }
1421
1422                                 if (initgroups((const char*)arg_user, gid) < 0) {
1423                                         log_error("initgroups() failed: %m");
1424                                         goto child_fail;
1425                                 }
1426
1427                                 if (setresgid(gid, gid, gid) < 0) {
1428                                         log_error("setregid() failed: %m");
1429                                         goto child_fail;
1430                                 }
1431
1432                                 if (setresuid(uid, uid, uid) < 0) {
1433                                         log_error("setreuid() failed: %m");
1434                                         goto child_fail;
1435                                 }
1436                         } else {
1437                                 /* Reset everything fully to 0, just in case */
1438
1439                                 if (setgroups(0, NULL) < 0) {
1440                                         log_error("setgroups() failed: %m");
1441                                         goto child_fail;
1442                                 }
1443
1444                                 if (setresgid(0, 0, 0) < 0) {
1445                                         log_error("setregid() failed: %m");
1446                                         goto child_fail;
1447                                 }
1448
1449                                 if (setresuid(0, 0, 0) < 0) {
1450                                         log_error("setreuid() failed: %m");
1451                                         goto child_fail;
1452                                 }
1453                         }
1454
1455                         if ((asprintf((char**)(envp + n_env++), "HOME=%s", home ? home: "/root") < 0) ||
1456                             (asprintf((char**)(envp + n_env++), "USER=%s", arg_user ? arg_user : "root") < 0) ||
1457                             (asprintf((char**)(envp + n_env++), "LOGNAME=%s", arg_user ? arg_user : "root") < 0)) {
1458                                 log_oom();
1459                                 goto child_fail;
1460                         }
1461
1462                         if (!sd_id128_equal(arg_uuid, SD_ID128_NULL)) {
1463                                 if (asprintf((char**)(envp + n_env++), "container_uuid=" SD_ID128_FORMAT_STR, SD_ID128_FORMAT_VAL(arg_uuid)) < 0) {
1464                                         log_oom();
1465                                         goto child_fail;
1466                                 }
1467                         }
1468
1469                         if (fdset_size(fds) > 0) {
1470                                 k = fdset_cloexec(fds, false);
1471                                 if (k < 0) {
1472                                         log_error("Failed to unset O_CLOEXEC for file descriptors.");
1473                                         goto child_fail;
1474                                 }
1475
1476                                 if ((asprintf((char **)(envp + n_env++), "LISTEN_FDS=%u", n_fd_passed) < 0) ||
1477                                     (asprintf((char **)(envp + n_env++), "LISTEN_PID=1") < 0)) {
1478                                         log_oom();
1479                                         goto child_fail;
1480                                 }
1481                         }
1482
1483                         setup_hostname();
1484
1485                         if (!strv_isempty(arg_setenv)) {
1486                                 char **n;
1487
1488                                 n = strv_env_merge(2, envp, arg_setenv);
1489                                 if (!n) {
1490                                         log_oom();
1491                                         goto child_fail;
1492                                 }
1493
1494                                 env_use = n;
1495                         } else
1496                                 env_use = (char**) envp;
1497
1498                         if (arg_boot) {
1499                                 char **a;
1500                                 size_t l;
1501
1502                                 /* Automatically search for the init system */
1503
1504                                 l = 1 + argc - optind;
1505                                 a = newa(char*, l + 1);
1506                                 memcpy(a + 1, argv + optind, l * sizeof(char*));
1507
1508                                 a[0] = (char*) "/usr/lib/systemd/systemd";
1509                                 execve(a[0], a, env_use);
1510
1511                                 a[0] = (char*) "/lib/systemd/systemd";
1512                                 execve(a[0], a, env_use);
1513
1514                                 a[0] = (char*) "/sbin/init";
1515                                 execve(a[0], a, env_use);
1516                         } else if (argc > optind)
1517                                 execvpe(argv[optind], argv + optind, env_use);
1518                         else {
1519                                 chdir(home ? home : "/root");
1520                                 execle("/bin/bash", "-bash", NULL, env_use);
1521                         }
1522
1523                         log_error("execv() failed: %m");
1524
1525                 child_fail:
1526                         _exit(EXIT_FAILURE);
1527                 }
1528
1529                 fdset_free(fds);
1530                 fds = NULL;
1531
1532                 k = process_pty(master, &mask, arg_boot ? pid : 0, SIGRTMIN+3);
1533                 if (k < 0) {
1534                         r = EXIT_FAILURE;
1535                         break;
1536                 }
1537
1538                 putc('\n', stdout);
1539
1540                 /* Kill if it is not dead yet anyway */
1541                 terminate_machine(pid);
1542
1543                 /* Redundant, but better safe than sorry */
1544                 kill(pid, SIGKILL);
1545
1546                 k = wait_for_terminate(pid, &status);
1547                 pid = 0;
1548
1549                 if (k < 0) {
1550                         r = EXIT_FAILURE;
1551                         break;
1552                 }
1553
1554                 if (status.si_code == CLD_EXITED) {
1555                         r = status.si_status;
1556                         if (status.si_status != 0) {
1557                                 log_error("Container %s failed with error code %i.", arg_machine, status.si_status);
1558                                 break;
1559                         }
1560
1561                         log_debug("Container %s exited successfully.", arg_machine);
1562                         break;
1563                 } else if (status.si_code == CLD_KILLED &&
1564                            status.si_status == SIGINT) {
1565                         log_info("Container %s has been shut down.", arg_machine);
1566                         r = 0;
1567                         break;
1568                 } else if (status.si_code == CLD_KILLED &&
1569                            status.si_status == SIGHUP) {
1570                         log_info("Container %s is being rebooted.", arg_machine);
1571                         continue;
1572                 } else if (status.si_code == CLD_KILLED ||
1573                            status.si_code == CLD_DUMPED) {
1574
1575                         log_error("Container %s terminated by signal %s.", arg_machine,  signal_to_string(status.si_status));
1576                         r = EXIT_FAILURE;
1577                         break;
1578                 } else {
1579                         log_error("Container %s failed due to unknown reason.", arg_machine);
1580                         r = EXIT_FAILURE;
1581                         break;
1582                 }
1583         }
1584
1585 finish:
1586         if (pid > 0)
1587                 kill(pid, SIGKILL);
1588
1589         free(arg_directory);
1590         free(arg_machine);
1591         free(arg_setenv);
1592
1593         return r;
1594 }