chiark / gitweb /
57818f9bd044b472a63791dd521dd9e3004a3825
[elogind.git] / src / nspawn / nspawn.c
1 /*-*- Mode: C; c-basic-offset: 8; indent-tabs-mode: nil -*-*/
2
3 /***
4   This file is part of systemd.
5
6   Copyright 2010 Lennart Poettering
7
8   systemd is free software; you can redistribute it and/or modify it
9   under the terms of the GNU Lesser General Public License as published by
10   the Free Software Foundation; either version 2.1 of the License, or
11   (at your option) any later version.
12
13   systemd is distributed in the hope that it will be useful, but
14   WITHOUT ANY WARRANTY; without even the implied warranty of
15   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
16   Lesser General Public License for more details.
17
18   You should have received a copy of the GNU Lesser General Public License
19   along with systemd; If not, see <http://www.gnu.org/licenses/>.
20 ***/
21
22 #include <signal.h>
23 #include <sched.h>
24 #include <unistd.h>
25 #include <sys/types.h>
26 #include <sys/syscall.h>
27 #include <sys/mount.h>
28 #include <sys/wait.h>
29 #include <stdlib.h>
30 #include <string.h>
31 #include <stdio.h>
32 #include <errno.h>
33 #include <sys/prctl.h>
34 #include <sys/capability.h>
35 #include <getopt.h>
36 #include <termios.h>
37 #include <sys/signalfd.h>
38 #include <grp.h>
39 #include <linux/fs.h>
40 #include <sys/un.h>
41 #include <sys/socket.h>
42 #include <linux/netlink.h>
43 #include <sys/eventfd.h>
44 #if HAVE_SELINUX
45 #include <selinux/selinux.h>
46 #endif
47
48 #include "sd-daemon.h"
49 #include "sd-bus.h"
50 #include "sd-id128.h"
51 #include "log.h"
52 #include "util.h"
53 #include "mkdir.h"
54 #include "macro.h"
55 #include "audit.h"
56 #include "missing.h"
57 #include "cgroup-util.h"
58 #include "strv.h"
59 #include "path-util.h"
60 #include "loopback-setup.h"
61 #include "dev-setup.h"
62 #include "fdset.h"
63 #include "build.h"
64 #include "fileio.h"
65 #include "bus-util.h"
66 #include "bus-error.h"
67 #include "ptyfwd.h"
68 #include "bus-kernel.h"
69 #include "env-util.h"
70 #include "def.h"
71
/* Modes accepted by --link-journal= (see parse_argv()). */
typedef enum LinkJournal {
        LINK_NO = 0,    /* --link-journal=no */
        LINK_AUTO,      /* --link-journal=auto; initial value of arg_link_journal */
        LINK_HOST,      /* --link-journal=host */
        LINK_GUEST      /* --link-journal=guest, also selected by -j */
} LinkJournal;
78
79 static char *arg_directory = NULL;
80 static char *arg_user = NULL;
81 static sd_id128_t arg_uuid = {};
82 static char *arg_machine = NULL;
83 static char *arg_process_label = NULL;
84 static char *arg_file_label = NULL;
85 static const char *arg_slice = NULL;
86 static bool arg_private_network = false;
87 static bool arg_read_only = false;
88 static bool arg_boot = false;
89 static LinkJournal arg_link_journal = LINK_AUTO;
90 static uint64_t arg_retain =
91         (1ULL << CAP_CHOWN) |
92         (1ULL << CAP_DAC_OVERRIDE) |
93         (1ULL << CAP_DAC_READ_SEARCH) |
94         (1ULL << CAP_FOWNER) |
95         (1ULL << CAP_FSETID) |
96         (1ULL << CAP_IPC_OWNER) |
97         (1ULL << CAP_KILL) |
98         (1ULL << CAP_LEASE) |
99         (1ULL << CAP_LINUX_IMMUTABLE) |
100         (1ULL << CAP_NET_BIND_SERVICE) |
101         (1ULL << CAP_NET_BROADCAST) |
102         (1ULL << CAP_NET_RAW) |
103         (1ULL << CAP_SETGID) |
104         (1ULL << CAP_SETFCAP) |
105         (1ULL << CAP_SETPCAP) |
106         (1ULL << CAP_SETUID) |
107         (1ULL << CAP_SYS_ADMIN) |
108         (1ULL << CAP_SYS_CHROOT) |
109         (1ULL << CAP_SYS_NICE) |
110         (1ULL << CAP_SYS_PTRACE) |
111         (1ULL << CAP_SYS_TTY_CONFIG) |
112         (1ULL << CAP_SYS_RESOURCE) |
113         (1ULL << CAP_SYS_BOOT) |
114         (1ULL << CAP_AUDIT_WRITE) |
115         (1ULL << CAP_AUDIT_CONTROL) |
116         (1ULL << CAP_MKNOD);
117 static char **arg_bind = NULL;
118 static char **arg_bind_ro = NULL;
119 static char **arg_setenv = NULL;
120 static bool arg_quiet = false;
121
/* Prints the usage text to stdout. Always returns 0.
 *
 * Keep this text in sync with the option handling in parse_argv(): -j
 * selects LINK_GUEST there, hence it is documented as
 * --link-journal=guest here. */
static int help(void) {

        printf("%s [OPTIONS...] [PATH] [ARGUMENTS...]\n\n"
               "Spawn a minimal namespace container for debugging, testing and building.\n\n"
               "  -h --help                 Show this help\n"
               "     --version              Print version string\n"
               "  -D --directory=NAME       Root directory for the container\n"
               "  -b --boot                 Boot up full system (i.e. invoke init)\n"
               "  -u --user=USER            Run the command under specified user or uid\n"
               "     --uuid=UUID            Set a specific machine UUID for the container\n"
               "  -M --machine=NAME         Set the machine name for the container\n"
               "  -S --slice=SLICE          Place the container in the specified slice\n"
               "  -L --file-label=LABEL     Set the MAC file label to be used by tmpfs file\n"
               "                            systems in the container\n"
               "  -Z --process-label=LABEL  Set the MAC label to be used by processes in\n"
               "                            the container\n"
               "     --private-network      Disable network in container\n"
               "     --read-only            Mount the root directory read-only\n"
               "     --capability=CAP       In addition to the default, retain specified\n"
               "                            capability\n"
               "     --drop-capability=CAP  Drop the specified capability from the default set\n"
               "     --link-journal=MODE    Link up guest journal, one of no, auto, guest, host\n"
               "  -j                        Equivalent to --link-journal=guest\n"
               "     --bind=PATH[:PATH]     Bind mount a file or directory from the host into\n"
               "                            the container\n"
               "     --bind-ro=PATH[:PATH]  Similar, but creates a read-only bind mount\n"
               "     --setenv=NAME=VALUE    Pass an environment variable to PID 1\n"
               "  -q --quiet                Do not show status information\n",
               program_invocation_short_name);

        return 0;
}
154
155 static int parse_argv(int argc, char *argv[]) {
156
157         enum {
158                 ARG_VERSION = 0x100,
159                 ARG_PRIVATE_NETWORK,
160                 ARG_UUID,
161                 ARG_READ_ONLY,
162                 ARG_CAPABILITY,
163                 ARG_DROP_CAPABILITY,
164                 ARG_LINK_JOURNAL,
165                 ARG_BIND,
166                 ARG_BIND_RO,
167                 ARG_SETENV,
168         };
169
170         static const struct option options[] = {
171                 { "help",            no_argument,       NULL, 'h'                 },
172                 { "version",         no_argument,       NULL, ARG_VERSION         },
173                 { "directory",       required_argument, NULL, 'D'                 },
174                 { "user",            required_argument, NULL, 'u'                 },
175                 { "private-network", no_argument,       NULL, ARG_PRIVATE_NETWORK },
176                 { "boot",            no_argument,       NULL, 'b'                 },
177                 { "uuid",            required_argument, NULL, ARG_UUID            },
178                 { "read-only",       no_argument,       NULL, ARG_READ_ONLY       },
179                 { "capability",      required_argument, NULL, ARG_CAPABILITY      },
180                 { "drop-capability", required_argument, NULL, ARG_DROP_CAPABILITY },
181                 { "link-journal",    required_argument, NULL, ARG_LINK_JOURNAL    },
182                 { "bind",            required_argument, NULL, ARG_BIND            },
183                 { "bind-ro",         required_argument, NULL, ARG_BIND_RO         },
184                 { "machine",         required_argument, NULL, 'M'                 },
185                 { "slice",           required_argument, NULL, 'S'                 },
186                 { "setenv",          required_argument, NULL, ARG_SETENV          },
187                 { "process-label",   required_argument, NULL, 'Z'                 },
188                 { "file-label",      required_argument, NULL, 'L'                 },
189                 { "quiet",           no_argument,       NULL, 'q'                 },
190                 {}
191         };
192
193         int c, r;
194
195         assert(argc >= 0);
196         assert(argv);
197
198         while ((c = getopt_long(argc, argv, "+hD:u:bL:M:jS:Z:q", options, NULL)) >= 0) {
199
200                 switch (c) {
201
202                 case 'h':
203                         return help();
204
205                 case ARG_VERSION:
206                         puts(PACKAGE_STRING);
207                         puts(SYSTEMD_FEATURES);
208                         return 0;
209
210                 case 'D':
211                         free(arg_directory);
212                         arg_directory = canonicalize_file_name(optarg);
213                         if (!arg_directory) {
214                                 log_error("Invalid root directory: %m");
215                                 return -ENOMEM;
216                         }
217
218                         break;
219
220                 case 'u':
221                         free(arg_user);
222                         arg_user = strdup(optarg);
223                         if (!arg_user)
224                                 return log_oom();
225
226                         break;
227
228                 case ARG_PRIVATE_NETWORK:
229                         arg_private_network = true;
230                         break;
231
232                 case 'b':
233                         arg_boot = true;
234                         break;
235
236                 case ARG_UUID:
237                         r = sd_id128_from_string(optarg, &arg_uuid);
238                         if (r < 0) {
239                                 log_error("Invalid UUID: %s", optarg);
240                                 return r;
241                         }
242                         break;
243
244                 case 'S':
245                         arg_slice = strdup(optarg);
246                         if (!arg_slice)
247                                 return log_oom();
248
249                         break;
250
251                 case 'M':
252                         if (!hostname_is_valid(optarg)) {
253                                 log_error("Invalid machine name: %s", optarg);
254                                 return -EINVAL;
255                         }
256
257                         free(arg_machine);
258                         arg_machine = strdup(optarg);
259                         if (!arg_machine)
260                                 return log_oom();
261
262                         break;
263
264                 case 'L':
265                         arg_file_label = optarg;
266                         break;
267
268                 case 'Z':
269                         arg_process_label = optarg;
270                         break;
271
272                 case ARG_READ_ONLY:
273                         arg_read_only = true;
274                         break;
275
276                 case ARG_CAPABILITY:
277                 case ARG_DROP_CAPABILITY: {
278                         char *state, *word;
279                         size_t length;
280
281                         FOREACH_WORD_SEPARATOR(word, length, optarg, ",", state) {
282                                 cap_value_t cap;
283                                 char *t;
284
285                                 t = strndup(word, length);
286                                 if (!t)
287                                         return log_oom();
288
289                                 if (cap_from_name(t, &cap) < 0) {
290                                         log_error("Failed to parse capability %s.", t);
291                                         free(t);
292                                         return -EINVAL;
293                                 }
294
295                                 free(t);
296
297                                 if (c == ARG_CAPABILITY)
298                                         arg_retain |= 1ULL << (uint64_t) cap;
299                                 else
300                                         arg_retain &= ~(1ULL << (uint64_t) cap);
301                         }
302
303                         break;
304                 }
305
306                 case 'j':
307                         arg_link_journal = LINK_GUEST;
308                         break;
309
310                 case ARG_LINK_JOURNAL:
311                         if (streq(optarg, "auto"))
312                                 arg_link_journal = LINK_AUTO;
313                         else if (streq(optarg, "no"))
314                                 arg_link_journal = LINK_NO;
315                         else if (streq(optarg, "guest"))
316                                 arg_link_journal = LINK_GUEST;
317                         else if (streq(optarg, "host"))
318                                 arg_link_journal = LINK_HOST;
319                         else {
320                                 log_error("Failed to parse link journal mode %s", optarg);
321                                 return -EINVAL;
322                         }
323
324                         break;
325
326                 case ARG_BIND:
327                 case ARG_BIND_RO: {
328                         _cleanup_free_ char *a = NULL, *b = NULL;
329                         char *e;
330                         char ***x;
331
332                         x = c == ARG_BIND ? &arg_bind : &arg_bind_ro;
333
334                         e = strchr(optarg, ':');
335                         if (e) {
336                                 a = strndup(optarg, e - optarg);
337                                 b = strdup(e + 1);
338                         } else {
339                                 a = strdup(optarg);
340                                 b = strdup(optarg);
341                         }
342
343                         if (!a || !b)
344                                 return log_oom();
345
346                         if (!path_is_absolute(a) || !path_is_absolute(b)) {
347                                 log_error("Invalid bind mount specification: %s", optarg);
348                                 return -EINVAL;
349                         }
350
351                         r = strv_extend(x, a);
352                         if (r < 0)
353                                 return log_oom();
354
355                         r = strv_extend(x, b);
356                         if (r < 0)
357                                 return log_oom();
358
359                         break;
360                 }
361
362                 case ARG_SETENV: {
363                         char **n;
364
365                         if (!env_assignment_is_valid(optarg)) {
366                                 log_error("Environment variable assignment '%s' is not valid.", optarg);
367                                 return -EINVAL;
368                         }
369
370                         n = strv_env_set(arg_setenv, optarg);
371                         if (!n)
372                                 return log_oom();
373
374                         strv_free(arg_setenv);
375                         arg_setenv = n;
376                         break;
377                 }
378
379                 case 'q':
380                         arg_quiet = true;
381                         break;
382
383                 case '?':
384                         return -EINVAL;
385
386                 default:
387                         assert_not_reached("Unhandled option");
388                 }
389         }
390
391         return 1;
392 }
393
394 static int mount_all(const char *dest) {
395
396         typedef struct MountPoint {
397                 const char *what;
398                 const char *where;
399                 const char *type;
400                 const char *options;
401                 unsigned long flags;
402                 bool fatal;
403         } MountPoint;
404
405         static const MountPoint mount_table[] = {
406                 { "proc",      "/proc",     "proc",  NULL,       MS_NOSUID|MS_NOEXEC|MS_NODEV, true  },
407                 { "/proc/sys", "/proc/sys", NULL,    NULL,       MS_BIND, true                       },   /* Bind mount first */
408                 { NULL,        "/proc/sys", NULL,    NULL,       MS_BIND|MS_RDONLY|MS_REMOUNT, true  },   /* Then, make it r/o */
409                 { "sysfs",     "/sys",      "sysfs", NULL,       MS_RDONLY|MS_NOSUID|MS_NOEXEC|MS_NODEV, true  },
410                 { "tmpfs",     "/dev",      "tmpfs", "mode=755", MS_NOSUID|MS_STRICTATIME,     true  },
411                 { "devpts",    "/dev/pts",  "devpts","newinstance,ptmxmode=0666,mode=620,gid=" STRINGIFY(TTY_GID), MS_NOSUID|MS_NOEXEC, true },
412                 { "tmpfs",     "/dev/shm",  "tmpfs", "mode=1777", MS_NOSUID|MS_NODEV|MS_STRICTATIME, true  },
413                 { "tmpfs",     "/run",      "tmpfs", "mode=755", MS_NOSUID|MS_NODEV|MS_STRICTATIME, true  },
414 #ifdef HAVE_SELINUX
415                 { "/sys/fs/selinux", "/sys/fs/selinux", NULL, NULL, MS_BIND,                      false },  /* Bind mount first */
416                 { NULL,              "/sys/fs/selinux", NULL, NULL, MS_BIND|MS_RDONLY|MS_REMOUNT, false },  /* Then, make it r/o */
417 #endif
418         };
419
420         unsigned k;
421         int r = 0;
422
423         for (k = 0; k < ELEMENTSOF(mount_table); k++) {
424                 _cleanup_free_ char *where = NULL;
425 #ifdef HAVE_SELINUX
426                 _cleanup_free_ char *options = NULL;
427 #endif
428                 const char *o;
429                 int t;
430
431                 where = strjoin(dest, "/", mount_table[k].where, NULL);
432                 if (!where)
433                         return log_oom();
434
435                 t = path_is_mount_point(where, true);
436                 if (t < 0) {
437                         log_error("Failed to detect whether %s is a mount point: %s", where, strerror(-t));
438
439                         if (r == 0)
440                                 r = t;
441
442                         continue;
443                 }
444
445                 /* Skip this entry if it is not a remount. */
446                 if (mount_table[k].what && t > 0)
447                         continue;
448
449                 mkdir_p(where, 0755);
450
451 #ifdef HAVE_SELINUX
452                 if (arg_file_label && (streq_ptr(mount_table[k].what, "tmpfs") || streq_ptr(mount_table[k].what, "devpts"))) {
453                         options = strjoin(mount_table[k].options, ",context=\"", arg_file_label, "\"", NULL);
454                         if (!options)
455                                 return log_oom();
456
457                         o = options;
458                 } else
459 #endif
460                         o = mount_table[k].options;
461
462
463                 if (mount(mount_table[k].what,
464                           where,
465                           mount_table[k].type,
466                           mount_table[k].flags,
467                           o) < 0 &&
468                     mount_table[k].fatal) {
469
470                         log_error("mount(%s) failed: %m", where);
471
472                         if (r == 0)
473                                 r = -errno;
474                 }
475         }
476
477         return r;
478 }
479
480 static int mount_binds(const char *dest, char **l, unsigned long flags) {
481         char **x, **y;
482
483         STRV_FOREACH_PAIR(x, y, l) {
484                 char *where;
485                 struct stat source_st, dest_st;
486                 int r;
487
488                 if (stat(*x, &source_st) < 0) {
489                         log_error("failed to stat %s: %m", *x);
490                         return -errno;
491                 }
492
493                 where = strappenda(dest, *y);
494                 r = stat(where, &dest_st);
495                 if (r == 0) {
496                         if ((source_st.st_mode & S_IFMT) != (dest_st.st_mode & S_IFMT)) {
497                                 log_error("The file types of %s and %s do not match. Refusing bind mount",
498                                                 *x, where);
499                                 return -EINVAL;
500                         }
501                 } else if (errno == ENOENT) {
502                         r = mkdir_parents_label(where, 0755);
503                         if (r < 0) {
504                                 log_error("Failed to bind mount %s: %s", *x, strerror(-r));
505                                 return r;
506                         }
507                 } else {
508                         log_error("Failed to bind mount %s: %s", *x, strerror(errno));
509                         return -errno;
510                 }
511                 /* Create the mount point, but be conservative -- refuse to create block
512                 * and char devices. */
513                 if (S_ISDIR(source_st.st_mode))
514                         mkdir_label(where, 0755);
515                 else if (S_ISFIFO(source_st.st_mode))
516                         mkfifo(where, 0644);
517                 else if (S_ISSOCK(source_st.st_mode))
518                         mknod(where, 0644 | S_IFSOCK, 0);
519                 else if (S_ISREG(source_st.st_mode))
520                         touch(where);
521                 else {
522                         log_error("Refusing to create mountpoint for file: %s", *x);
523                         return -ENOTSUP;
524                 }
525
526                 if (mount(*x, where, "bind", MS_BIND, NULL) < 0) {
527                         log_error("mount(%s) failed: %m", where);
528                         return -errno;
529                 }
530
531                 if (flags && mount(NULL, where, NULL, MS_REMOUNT|MS_BIND|flags, NULL) < 0) {
532                         log_error("mount(%s) failed: %m", where);
533                         return -errno;
534                 }
535         }
536
537         return 0;
538 }
539
540 static int setup_timezone(const char *dest) {
541         _cleanup_free_ char *where = NULL, *p = NULL, *q = NULL, *check = NULL, *what = NULL;
542         char *z, *y;
543         int r;
544
545         assert(dest);
546
547         /* Fix the timezone, if possible */
548         r = readlink_malloc("/etc/localtime", &p);
549         if (r < 0) {
550                 log_warning("/etc/localtime is not a symlink, not updating container timezone.");
551                 return 0;
552         }
553
554         z = path_startswith(p, "../usr/share/zoneinfo/");
555         if (!z)
556                 z = path_startswith(p, "/usr/share/zoneinfo/");
557         if (!z) {
558                 log_warning("/etc/localtime does not point into /usr/share/zoneinfo/, not updating container timezone.");
559                 return 0;
560         }
561
562         where = strappend(dest, "/etc/localtime");
563         if (!where)
564                 return log_oom();
565
566         r = readlink_malloc(where, &q);
567         if (r >= 0) {
568                 y = path_startswith(q, "../usr/share/zoneinfo/");
569                 if (!y)
570                         y = path_startswith(q, "/usr/share/zoneinfo/");
571
572
573                 /* Already pointing to the right place? Then do nothing .. */
574                 if (y && streq(y, z))
575                         return 0;
576         }
577
578         check = strjoin(dest, "/usr/share/zoneinfo/", z, NULL);
579         if (!check)
580                 return log_oom();
581
582         if (access(check, F_OK) < 0) {
583                 log_warning("Timezone %s does not exist in container, not updating container timezone.", z);
584                 return 0;
585         }
586
587         what = strappend("../usr/share/zoneinfo/", z);
588         if (!what)
589                 return log_oom();
590
591         unlink(where);
592         if (symlink(what, where) < 0) {
593                 log_error("Failed to correct timezone of container: %m");
594                 return 0;
595         }
596
597         return 0;
598 }
599
600 static int setup_resolv_conf(const char *dest) {
601         char _cleanup_free_ *where = NULL;
602
603         assert(dest);
604
605         if (arg_private_network)
606                 return 0;
607
608         /* Fix resolv.conf, if possible */
609         where = strappend(dest, "/etc/resolv.conf");
610         if (!where)
611                 return log_oom();
612
613         /* We don't really care for the results of this really. If it
614          * fails, it fails, but meh... */
615         copy_file("/etc/resolv.conf", where, O_TRUNC|O_NOFOLLOW);
616
617         return 0;
618 }
619
620 static int setup_boot_id(const char *dest) {
621         _cleanup_free_ char *from = NULL, *to = NULL;
622         sd_id128_t rnd;
623         char as_uuid[37];
624         int r;
625
626         assert(dest);
627
628         /* Generate a new randomized boot ID, so that each boot-up of
629          * the container gets a new one */
630
631         from = strappend(dest, "/dev/proc-sys-kernel-random-boot-id");
632         to = strappend(dest, "/proc/sys/kernel/random/boot_id");
633         if (!from || !to)
634                 return log_oom();
635
636         r = sd_id128_randomize(&rnd);
637         if (r < 0) {
638                 log_error("Failed to generate random boot id: %s", strerror(-r));
639                 return r;
640         }
641
642         snprintf(as_uuid, sizeof(as_uuid),
643                  "%02x%02x%02x%02x-%02x%02x-%02x%02x-%02x%02x-%02x%02x%02x%02x%02x%02x",
644                  SD_ID128_FORMAT_VAL(rnd));
645         char_array_0(as_uuid);
646
647         r = write_string_file(from, as_uuid);
648         if (r < 0) {
649                 log_error("Failed to write boot id: %s", strerror(-r));
650                 return r;
651         }
652
653         if (mount(from, to, "bind", MS_BIND, NULL) < 0) {
654                 log_error("Failed to bind mount boot id: %m");
655                 r = -errno;
656         } else if (mount(from, to, "bind", MS_BIND|MS_REMOUNT|MS_RDONLY, NULL))
657                 log_warning("Failed to make boot id read-only: %m");
658
659         unlink(from);
660         return r;
661 }
662
663 static int copy_devnodes(const char *dest) {
664
665         static const char devnodes[] =
666                 "null\0"
667                 "zero\0"
668                 "full\0"
669                 "random\0"
670                 "urandom\0"
671                 "tty\0";
672
673         const char *d;
674         int r = 0;
675         _cleanup_umask_ mode_t u;
676
677         assert(dest);
678
679         u = umask(0000);
680
681         NULSTR_FOREACH(d, devnodes) {
682                 _cleanup_free_ char *from = NULL, *to = NULL;
683                 struct stat st;
684
685                 from = strappend("/dev/", d);
686                 to = strjoin(dest, "/dev/", d, NULL);
687                 if (!from || !to)
688                         return log_oom();
689
690                 if (stat(from, &st) < 0) {
691
692                         if (errno != ENOENT) {
693                                 log_error("Failed to stat %s: %m", from);
694                                 return -errno;
695                         }
696
697                 } else if (!S_ISCHR(st.st_mode) && !S_ISBLK(st.st_mode)) {
698
699                         log_error("%s is not a char or block device, cannot copy", from);
700                         return -EIO;
701
702                 } else if (mknod(to, st.st_mode, st.st_rdev) < 0) {
703
704                         log_error("mknod(%s) failed: %m", dest);
705                         return  -errno;
706                 }
707         }
708
709         return r;
710 }
711
712 static int setup_ptmx(const char *dest) {
713         _cleanup_free_ char *p = NULL;
714
715         p = strappend(dest, "/dev/ptmx");
716         if (!p)
717                 return log_oom();
718
719         if (symlink("pts/ptmx", p) < 0) {
720                 log_error("Failed to create /dev/ptmx symlink: %m");
721                 return -errno;
722         }
723
724         return 0;
725 }
726
727 static int setup_dev_console(const char *dest, const char *console) {
728         struct stat st;
729         _cleanup_free_ char *to = NULL;
730         int r;
731         _cleanup_umask_ mode_t u;
732
733         assert(dest);
734         assert(console);
735
736         u = umask(0000);
737
738         if (stat(console, &st) < 0) {
739                 log_error("Failed to stat %s: %m", console);
740                 return -errno;
741
742         } else if (!S_ISCHR(st.st_mode)) {
743                 log_error("/dev/console is not a char device");
744                 return -EIO;
745         }
746
747         r = chmod_and_chown(console, 0600, 0, 0);
748         if (r < 0) {
749                 log_error("Failed to correct access mode for TTY: %s", strerror(-r));
750                 return r;
751         }
752
753         if (asprintf(&to, "%s/dev/console", dest) < 0)
754                 return log_oom();
755
756         /* We need to bind mount the right tty to /dev/console since
757          * ptys can only exist on pts file systems. To have something
758          * to bind mount things on we create a device node first, that
759          * has the right major/minor (note that the major minor
760          * doesn't actually matter here, since we mount it over
761          * anyway). */
762
763         if (mknod(to, (st.st_mode & ~07777) | 0600, st.st_rdev) < 0) {
764                 log_error("mknod() for /dev/console failed: %m");
765                 return -errno;
766         }
767
768         if (mount(console, to, "bind", MS_BIND, NULL) < 0) {
769                 log_error("Bind mount for /dev/console failed: %m");
770                 return -errno;
771         }
772
773         return 0;
774 }
775
776 static int setup_kmsg(const char *dest, int kmsg_socket) {
777         _cleanup_free_ char *from = NULL, *to = NULL;
778         int r, fd, k;
779         _cleanup_umask_ mode_t u;
780         union {
781                 struct cmsghdr cmsghdr;
782                 uint8_t buf[CMSG_SPACE(sizeof(int))];
783         } control = {};
784         struct msghdr mh = {
785                 .msg_control = &control,
786                 .msg_controllen = sizeof(control),
787         };
788         struct cmsghdr *cmsg;
789
790         assert(dest);
791         assert(kmsg_socket >= 0);
792
793         u = umask(0000);
794
795         /* We create the kmsg FIFO as /dev/kmsg, but immediately
796          * delete it after bind mounting it to /proc/kmsg. While FIFOs
797          * on the reading side behave very similar to /proc/kmsg,
798          * their writing side behaves differently from /dev/kmsg in
799          * that writing blocks when nothing is reading. In order to
800          * avoid any problems with containers deadlocking due to this
801          * we simply make /dev/kmsg unavailable to the container. */
802         if (asprintf(&from, "%s/dev/kmsg", dest) < 0 ||
803             asprintf(&to, "%s/proc/kmsg", dest) < 0)
804                 return log_oom();
805
806         if (mkfifo(from, 0600) < 0) {
807                 log_error("mkfifo() for /dev/kmsg failed: %m");
808                 return -errno;
809         }
810
811         r = chmod_and_chown(from, 0600, 0, 0);
812         if (r < 0) {
813                 log_error("Failed to correct access mode for /dev/kmsg: %s", strerror(-r));
814                 return r;
815         }
816
817         if (mount(from, to, "bind", MS_BIND, NULL) < 0) {
818                 log_error("Bind mount for /proc/kmsg failed: %m");
819                 return -errno;
820         }
821
822         fd = open(from, O_RDWR|O_NDELAY|O_CLOEXEC);
823         if (fd < 0) {
824                 log_error("Failed to open fifo: %m");
825                 return -errno;
826         }
827
828         cmsg = CMSG_FIRSTHDR(&mh);
829         cmsg->cmsg_level = SOL_SOCKET;
830         cmsg->cmsg_type = SCM_RIGHTS;
831         cmsg->cmsg_len = CMSG_LEN(sizeof(int));
832         memcpy(CMSG_DATA(cmsg), &fd, sizeof(int));
833
834         mh.msg_controllen = cmsg->cmsg_len;
835
836         /* Store away the fd in the socket, so that it stays open as
837          * long as we run the child */
838         k = sendmsg(kmsg_socket, &mh, MSG_DONTWAIT|MSG_NOSIGNAL);
839         close_nointr_nofail(fd);
840
841         if (k < 0) {
842                 log_error("Failed to send FIFO fd: %m");
843                 return -errno;
844         }
845
846         /* And now make the FIFO unavailable as /dev/kmsg... */
847         unlink(from);
848         return 0;
849 }
850
851 static int setup_hostname(void) {
852
853         if (sethostname(arg_machine, strlen(arg_machine)) < 0)
854                 return -errno;
855
856         return 0;
857 }
858
859 static int setup_journal(const char *directory) {
860         sd_id128_t machine_id, this_id;
861         _cleanup_free_ char *p = NULL, *b = NULL, *q = NULL, *d = NULL;
862         char *id;
863         int r;
864
865         p = strappend(directory, "/etc/machine-id");
866         if (!p)
867                 return log_oom();
868
869         r = read_one_line_file(p, &b);
870         if (r == -ENOENT && arg_link_journal == LINK_AUTO)
871                 return 0;
872         else if (r < 0) {
873                 log_error("Failed to read machine ID from %s: %s", p, strerror(-r));
874                 return r;
875         }
876
877         id = strstrip(b);
878         if (isempty(id) && arg_link_journal == LINK_AUTO)
879                 return 0;
880
881         /* Verify validity */
882         r = sd_id128_from_string(id, &machine_id);
883         if (r < 0) {
884                 log_error("Failed to parse machine ID from %s: %s", p, strerror(-r));
885                 return r;
886         }
887
888         r = sd_id128_get_machine(&this_id);
889         if (r < 0) {
890                 log_error("Failed to retrieve machine ID: %s", strerror(-r));
891                 return r;
892         }
893
894         if (sd_id128_equal(machine_id, this_id)) {
895                 log_full(arg_link_journal == LINK_AUTO ? LOG_WARNING : LOG_ERR,
896                          "Host and machine ids are equal (%s): refusing to link journals", id);
897                 if (arg_link_journal == LINK_AUTO)
898                         return 0;
899                 return
900                         -EEXIST;
901         }
902
903         if (arg_link_journal == LINK_NO)
904                 return 0;
905
906         free(p);
907         p = strappend("/var/log/journal/", id);
908         q = strjoin(directory, "/var/log/journal/", id, NULL);
909         if (!p || !q)
910                 return log_oom();
911
912         if (path_is_mount_point(p, false) > 0) {
913                 if (arg_link_journal != LINK_AUTO) {
914                         log_error("%s: already a mount point, refusing to use for journal", p);
915                         return -EEXIST;
916                 }
917
918                 return 0;
919         }
920
921         if (path_is_mount_point(q, false) > 0) {
922                 if (arg_link_journal != LINK_AUTO) {
923                         log_error("%s: already a mount point, refusing to use for journal", q);
924                         return -EEXIST;
925                 }
926
927                 return 0;
928         }
929
930         r = readlink_and_make_absolute(p, &d);
931         if (r >= 0) {
932                 if ((arg_link_journal == LINK_GUEST ||
933                      arg_link_journal == LINK_AUTO) &&
934                     path_equal(d, q)) {
935
936                         r = mkdir_p(q, 0755);
937                         if (r < 0)
938                                 log_warning("failed to create directory %s: %m", q);
939                         return 0;
940                 }
941
942                 if (unlink(p) < 0) {
943                         log_error("Failed to remove symlink %s: %m", p);
944                         return -errno;
945                 }
946         } else if (r == -EINVAL) {
947
948                 if (arg_link_journal == LINK_GUEST &&
949                     rmdir(p) < 0) {
950
951                         if (errno == ENOTDIR) {
952                                 log_error("%s already exists and is neither a symlink nor a directory", p);
953                                 return r;
954                         } else {
955                                 log_error("Failed to remove %s: %m", p);
956                                 return -errno;
957                         }
958                 }
959         } else if (r != -ENOENT) {
960                 log_error("readlink(%s) failed: %m", p);
961                 return r;
962         }
963
964         if (arg_link_journal == LINK_GUEST) {
965
966                 if (symlink(q, p) < 0) {
967                         log_error("Failed to symlink %s to %s: %m", q, p);
968                         return -errno;
969                 }
970
971                 r = mkdir_p(q, 0755);
972                 if (r < 0)
973                         log_warning("failed to create directory %s: %m", q);
974                 return 0;
975         }
976
977         if (arg_link_journal == LINK_HOST) {
978                 r = mkdir_p(p, 0755);
979                 if (r < 0) {
980                         log_error("Failed to create %s: %m", p);
981                         return r;
982                 }
983
984         } else if (access(p, F_OK) < 0)
985                 return 0;
986
987         if (dir_is_empty(q) == 0) {
988                 log_error("%s not empty.", q);
989                 return -ENOTEMPTY;
990         }
991
992         r = mkdir_p(q, 0755);
993         if (r < 0) {
994                 log_error("Failed to create %s: %m", q);
995                 return r;
996         }
997
998         if (mount(p, q, "bind", MS_BIND, NULL) < 0) {
999                 log_error("Failed to bind mount journal from host into guest: %m");
1000                 return -errno;
1001         }
1002
1003         return 0;
1004 }
1005
/* Bind mounts the kdbus domain directory given in path onto
 * <dest>/dev/kdbus. Does nothing if path is NULL. Returns 0 on
 * success, -errno on failure. */
static int setup_kdbus(const char *dest, const char *path) {
        const char *target;

        /* No domain path given, nothing to set up */
        if (path == NULL)
                return 0;

        target = strappenda(dest, "/dev/kdbus");

        if (mkdir(target, 0755) < 0) {
                log_error("Failed to create kdbus path: %m");
                return -errno;
        }

        if (mount(path, target, "bind", MS_BIND, NULL) < 0) {
                log_error("Failed to mount kdbus domain path: %m");
                return -errno;
        }

        return 0;
}
1025
1026 static int drop_capabilities(void) {
1027         return capability_bounding_set_drop(~arg_retain, false);
1028 }
1029
1030 static int register_machine(pid_t pid) {
1031         _cleanup_bus_error_free_ sd_bus_error error = SD_BUS_ERROR_NULL;
1032         _cleanup_bus_unref_ sd_bus *bus = NULL;
1033         int r;
1034
1035         r = sd_bus_default_system(&bus);
1036         if (r < 0) {
1037                 log_error("Failed to open system bus: %s", strerror(-r));
1038                 return r;
1039         }
1040
1041         r = sd_bus_call_method(
1042                         bus,
1043                         "org.freedesktop.machine1",
1044                         "/org/freedesktop/machine1",
1045                         "org.freedesktop.machine1.Manager",
1046                         "CreateMachine",
1047                         &error,
1048                         NULL,
1049                         "sayssusa(sv)",
1050                         arg_machine,
1051                         SD_BUS_MESSAGE_APPEND_ID128(arg_uuid),
1052                         "nspawn",
1053                         "container",
1054                         (uint32_t) pid,
1055                         strempty(arg_directory),
1056                         !isempty(arg_slice), "Slice", "s", arg_slice);
1057         if (r < 0) {
1058                 log_error("Failed to register machine: %s", bus_error_message(&error, r));
1059                 return r;
1060         }
1061
1062         return 0;
1063 }
1064
1065 static int terminate_machine(pid_t pid) {
1066         _cleanup_bus_error_free_ sd_bus_error error = SD_BUS_ERROR_NULL;
1067         _cleanup_bus_message_unref_ sd_bus_message *reply = NULL;
1068         _cleanup_bus_unref_ sd_bus *bus = NULL;
1069         const char *path;
1070         int r;
1071
1072         r = sd_bus_default_system(&bus);
1073         if (r < 0) {
1074                 log_error("Failed to open system bus: %s", strerror(-r));
1075                 return r;
1076         }
1077
1078         r = sd_bus_call_method(
1079                         bus,
1080                         "org.freedesktop.machine1",
1081                         "/org/freedesktop/machine1",
1082                         "org.freedesktop.machine1.Manager",
1083                         "GetMachineByPID",
1084                         &error,
1085                         &reply,
1086                         "u",
1087                         (uint32_t) pid);
1088         if (r < 0) {
1089                 /* Note that the machine might already have been
1090                  * cleaned up automatically, hence don't consider it a
1091                  * failure if we cannot get the machine object. */
1092                 log_debug("Failed to get machine: %s", bus_error_message(&error, r));
1093                 return 0;
1094         }
1095
1096         r = sd_bus_message_read(reply, "o", &path);
1097         if (r < 0)
1098                 return bus_log_parse_error(r);
1099
1100         r = sd_bus_call_method(
1101                         bus,
1102                         "org.freedesktop.machine1",
1103                         path,
1104                         "org.freedesktop.machine1.Machine",
1105                         "Terminate",
1106                         &error,
1107                         NULL,
1108                         NULL);
1109         if (r < 0) {
1110                 log_debug("Failed to terminate machine: %s", bus_error_message(&error, r));
1111                 return 0;
1112         }
1113
1114         return 0;
1115 }
1116
/* Probes for kernel audit support by trying to open a NETLINK_AUDIT
 * socket. Returns true if the socket could be created, false
 * otherwise. */
static bool audit_enabled(void) {
        int fd = socket(AF_NETLINK, SOCK_RAW, NETLINK_AUDIT);

        if (fd < 0)
                return false;

        /* The socket was only needed as a probe */
        close_nointr_nofail(fd);
        return true;
}
1127
1128 int main(int argc, char *argv[]) {
1129         pid_t pid = 0;
1130         int r = EXIT_FAILURE, k;
1131         _cleanup_close_ int master = -1, kdbus_fd = -1, sync_fd = -1;
1132         int n_fd_passed;
1133         const char *console = NULL;
1134         sigset_t mask;
1135         _cleanup_close_pipe_ int kmsg_socket_pair[2] = { -1, -1 };
1136         _cleanup_fdset_free_ FDSet *fds = NULL;
1137         _cleanup_free_ char *kdbus_domain = NULL;
1138         const char *ns;
1139
1140         log_parse_environment();
1141         log_open();
1142
1143         k = parse_argv(argc, argv);
1144         if (k < 0)
1145                 goto finish;
1146         else if (k == 0) {
1147                 r = EXIT_SUCCESS;
1148                 goto finish;
1149         }
1150
1151         if (arg_directory) {
1152                 char *p;
1153
1154                 p = path_make_absolute_cwd(arg_directory);
1155                 free(arg_directory);
1156                 arg_directory = p;
1157         } else
1158                 arg_directory = get_current_dir_name();
1159
1160         if (!arg_directory) {
1161                 log_error("Failed to determine path, please use -D.");
1162                 goto finish;
1163         }
1164
1165         path_kill_slashes(arg_directory);
1166
1167         if (!arg_machine) {
1168                 arg_machine = strdup(basename(arg_directory));
1169                 if (!arg_machine) {
1170                         log_oom();
1171                         goto finish;
1172                 }
1173
1174                 hostname_cleanup(arg_machine, false);
1175                 if (isempty(arg_machine)) {
1176                         log_error("Failed to determine machine name automatically, please use -M.");
1177                         goto finish;
1178                 }
1179         }
1180
1181         if (geteuid() != 0) {
1182                 log_error("Need to be root.");
1183                 goto finish;
1184         }
1185
1186         if (sd_booted() <= 0) {
1187                 log_error("Not running on a systemd system.");
1188                 goto finish;
1189         }
1190
1191         if (arg_boot && audit_enabled()) {
1192                 log_warning("The kernel auditing subsystem is known to be incompatible with containers.\n"
1193                             "Please make sure to turn off auditing with 'audit=0' on the kernel command\n"
1194                             "line before using systemd-nspawn. Sleeping for 5s...\n");
1195                 sleep(5);
1196         }
1197
1198         if (path_equal(arg_directory, "/")) {
1199                 log_error("Spawning container on root directory not supported.");
1200                 goto finish;
1201         }
1202
1203         if (path_is_os_tree(arg_directory) <= 0) {
1204                 log_error("Directory %s doesn't look like an OS root directory (/etc/os-release is missing). Refusing.", arg_directory);
1205                 goto finish;
1206         }
1207
1208         log_close();
1209         n_fd_passed = sd_listen_fds(false);
1210         if (n_fd_passed > 0) {
1211                 k = fdset_new_listen_fds(&fds, false);
1212                 if (k < 0) {
1213                         log_error("Failed to collect file descriptors: %s", strerror(-k));
1214                         goto finish;
1215                 }
1216         }
1217         fdset_close_others(fds);
1218         log_open();
1219
1220         master = posix_openpt(O_RDWR|O_NOCTTY|O_CLOEXEC|O_NDELAY);
1221         if (master < 0) {
1222                 log_error("Failed to acquire pseudo tty: %m");
1223                 goto finish;
1224         }
1225
1226         console = ptsname(master);
1227         if (!console) {
1228                 log_error("Failed to determine tty name: %m");
1229                 goto finish;
1230         }
1231
1232         if (!arg_quiet)
1233                 log_info("Spawning container %s on %s. Press ^] three times within 1s to abort execution.", arg_machine, arg_directory);
1234
1235         if (unlockpt(master) < 0) {
1236                 log_error("Failed to unlock tty: %m");
1237                 goto finish;
1238         }
1239
1240         ns = strappenda("machine-", arg_machine);
1241         kdbus_fd = bus_kernel_create_domain(ns, &kdbus_domain);
1242         if (r < 0)
1243                 log_debug("Failed to create kdbus domain: %s", strerror(-r));
1244         else
1245                 log_debug("Successfully created kdbus domain as %s", kdbus_domain);
1246
1247         if (socketpair(AF_UNIX, SOCK_DGRAM|SOCK_NONBLOCK|SOCK_CLOEXEC, 0, kmsg_socket_pair) < 0) {
1248                 log_error("Failed to create kmsg socket pair: %m");
1249                 goto finish;
1250         }
1251
1252         sd_notify(0, "READY=1");
1253
1254         assert_se(sigemptyset(&mask) == 0);
1255         sigset_add_many(&mask, SIGCHLD, SIGWINCH, SIGTERM, SIGINT, -1);
1256         assert_se(sigprocmask(SIG_BLOCK, &mask, NULL) == 0);
1257
1258         for (;;) {
1259                 siginfo_t status;
1260
1261                 sync_fd = eventfd(0, EFD_CLOEXEC);
1262                 if (sync_fd < 0) {
1263                         log_error("Failed to create event fd: %m");
1264                         goto finish;
1265                 }
1266
1267                 pid = syscall(__NR_clone, SIGCHLD|CLONE_NEWIPC|CLONE_NEWNS|CLONE_NEWPID|CLONE_NEWUTS|(arg_private_network ? CLONE_NEWNET : 0), NULL);
1268                 if (pid < 0) {
1269                         if (errno == EINVAL)
1270                                 log_error("clone() failed, do you have namespace support enabled in your kernel? (You need UTS, IPC, PID and NET namespacing built in): %m");
1271                         else
1272                                 log_error("clone() failed: %m");
1273
1274                         goto finish;
1275                 }
1276
1277                 if (pid == 0) {
1278                         /* child */
1279                         const char *home = NULL;
1280                         uid_t uid = (uid_t) -1;
1281                         gid_t gid = (gid_t) -1;
1282                         unsigned n_env = 2;
1283                         const char *envp[] = {
1284                                 "PATH=" DEFAULT_PATH_SPLIT_USR,
1285                                 "container=systemd-nspawn", /* LXC sets container=lxc, so follow the scheme here */
1286                                 NULL, /* TERM */
1287                                 NULL, /* HOME */
1288                                 NULL, /* USER */
1289                                 NULL, /* LOGNAME */
1290                                 NULL, /* container_uuid */
1291                                 NULL, /* LISTEN_FDS */
1292                                 NULL, /* LISTEN_PID */
1293                                 NULL
1294                         };
1295                         char **env_use;
1296                         eventfd_t x;
1297
1298                         envp[n_env] = strv_find_prefix(environ, "TERM=");
1299                         if (envp[n_env])
1300                                 n_env ++;
1301
1302                         close_nointr_nofail(master);
1303                         master = -1;
1304
1305                         close_nointr(STDIN_FILENO);
1306                         close_nointr(STDOUT_FILENO);
1307                         close_nointr(STDERR_FILENO);
1308
1309                         close_nointr_nofail(kmsg_socket_pair[0]);
1310                         kmsg_socket_pair[0] = -1;
1311
1312                         reset_all_signal_handlers();
1313
1314                         assert_se(sigemptyset(&mask) == 0);
1315                         assert_se(sigprocmask(SIG_SETMASK, &mask, NULL) == 0);
1316
1317                         k = open_terminal(console, O_RDWR);
1318                         if (k != STDIN_FILENO) {
1319                                 if (k >= 0) {
1320                                         close_nointr_nofail(k);
1321                                         k = -EINVAL;
1322                                 }
1323
1324                                 log_error("Failed to open console: %s", strerror(-k));
1325                                 goto child_fail;
1326                         }
1327
1328                         if (dup2(STDIN_FILENO, STDOUT_FILENO) != STDOUT_FILENO ||
1329                             dup2(STDIN_FILENO, STDERR_FILENO) != STDERR_FILENO) {
1330                                 log_error("Failed to duplicate console: %m");
1331                                 goto child_fail;
1332                         }
1333
1334                         if (setsid() < 0) {
1335                                 log_error("setsid() failed: %m");
1336                                 goto child_fail;
1337                         }
1338
1339                         if (prctl(PR_SET_PDEATHSIG, SIGKILL) < 0) {
1340                                 log_error("PR_SET_PDEATHSIG failed: %m");
1341                                 goto child_fail;
1342                         }
1343
1344                         /* Mark everything as slave, so that we still
1345                          * receive mounts from the real root, but don't
1346                          * propagate mounts to the real root. */
1347                         if (mount(NULL, "/", NULL, MS_SLAVE|MS_REC, NULL) < 0) {
1348                                 log_error("MS_SLAVE|MS_REC failed: %m");
1349                                 goto child_fail;
1350                         }
1351
1352                         /* Turn directory into bind mount */
1353                         if (mount(arg_directory, arg_directory, "bind", MS_BIND|MS_REC, NULL) < 0) {
1354                                 log_error("Failed to make bind mount.");
1355                                 goto child_fail;
1356                         }
1357
1358                         if (arg_read_only)
1359                                 if (mount(arg_directory, arg_directory, "bind", MS_BIND|MS_REMOUNT|MS_RDONLY|MS_REC, NULL) < 0) {
1360                                         log_error("Failed to make read-only.");
1361                                         goto child_fail;
1362                                 }
1363
1364                         if (mount_all(arg_directory) < 0)
1365                                 goto child_fail;
1366
1367                         if (copy_devnodes(arg_directory) < 0)
1368                                 goto child_fail;
1369
1370                         if (setup_ptmx(arg_directory) < 0)
1371                                 goto child_fail;
1372
1373                         dev_setup(arg_directory);
1374
1375                         if (setup_dev_console(arg_directory, console) < 0)
1376                                 goto child_fail;
1377
1378                         if (setup_kmsg(arg_directory, kmsg_socket_pair[1]) < 0)
1379                                 goto child_fail;
1380
1381                         close_nointr_nofail(kmsg_socket_pair[1]);
1382                         kmsg_socket_pair[1] = -1;
1383
1384                         if (setup_boot_id(arg_directory) < 0)
1385                                 goto child_fail;
1386
1387                         if (setup_timezone(arg_directory) < 0)
1388                                 goto child_fail;
1389
1390                         if (setup_resolv_conf(arg_directory) < 0)
1391                                 goto child_fail;
1392
1393                         if (setup_journal(arg_directory) < 0)
1394                                 goto child_fail;
1395
1396                         if (mount_binds(arg_directory, arg_bind, 0) < 0)
1397                                 goto child_fail;
1398
1399                         if (mount_binds(arg_directory, arg_bind_ro, MS_RDONLY) < 0)
1400                                 goto child_fail;
1401
1402                         if (setup_kdbus(arg_directory, kdbus_domain) < 0)
1403                                 goto child_fail;
1404
1405                         if (chdir(arg_directory) < 0) {
1406                                 log_error("chdir(%s) failed: %m", arg_directory);
1407                                 goto child_fail;
1408                         }
1409
1410                         if (mount(arg_directory, "/", NULL, MS_MOVE, NULL) < 0) {
1411                                 log_error("mount(MS_MOVE) failed: %m");
1412                                 goto child_fail;
1413                         }
1414
1415                         if (chroot(".") < 0) {
1416                                 log_error("chroot() failed: %m");
1417                                 goto child_fail;
1418                         }
1419
1420                         if (chdir("/") < 0) {
1421                                 log_error("chdir() failed: %m");
1422                                 goto child_fail;
1423                         }
1424
1425                         umask(0022);
1426
1427                         loopback_setup();
1428
1429                         if (drop_capabilities() < 0) {
1430                                 log_error("drop_capabilities() failed: %m");
1431                                 goto child_fail;
1432                         }
1433
1434                         if (arg_user) {
1435
1436                                 /* Note that this resolves user names
1437                                  * inside the container, and hence
1438                                  * accesses the NSS modules from the
1439                                  * container and not the host. This is
1440                                  * a bit weird... */
1441
1442                                 if (get_user_creds((const char**)&arg_user, &uid, &gid, &home, NULL) < 0) {
1443                                         log_error("get_user_creds() failed: %m");
1444                                         goto child_fail;
1445                                 }
1446
1447                                 if (mkdir_parents_label(home, 0775) < 0) {
1448                                         log_error("mkdir_parents_label() failed: %m");
1449                                         goto child_fail;
1450                                 }
1451
1452                                 if (mkdir_safe_label(home, 0775, uid, gid) < 0) {
1453                                         log_error("mkdir_safe_label() failed: %m");
1454                                         goto child_fail;
1455                                 }
1456
1457                                 if (initgroups((const char*)arg_user, gid) < 0) {
1458                                         log_error("initgroups() failed: %m");
1459                                         goto child_fail;
1460                                 }
1461
1462                                 if (setresgid(gid, gid, gid) < 0) {
1463                                         log_error("setregid() failed: %m");
1464                                         goto child_fail;
1465                                 }
1466
1467                                 if (setresuid(uid, uid, uid) < 0) {
1468                                         log_error("setreuid() failed: %m");
1469                                         goto child_fail;
1470                                 }
1471                         } else {
1472                                 /* Reset everything fully to 0, just in case */
1473
1474                                 if (setgroups(0, NULL) < 0) {
1475                                         log_error("setgroups() failed: %m");
1476                                         goto child_fail;
1477                                 }
1478
1479                                 if (setresgid(0, 0, 0) < 0) {
1480                                         log_error("setregid() failed: %m");
1481                                         goto child_fail;
1482                                 }
1483
1484                                 if (setresuid(0, 0, 0) < 0) {
1485                                         log_error("setreuid() failed: %m");
1486                                         goto child_fail;
1487                                 }
1488                         }
1489
1490                         if ((asprintf((char**)(envp + n_env++), "HOME=%s", home ? home: "/root") < 0) ||
1491                             (asprintf((char**)(envp + n_env++), "USER=%s", arg_user ? arg_user : "root") < 0) ||
1492                             (asprintf((char**)(envp + n_env++), "LOGNAME=%s", arg_user ? arg_user : "root") < 0)) {
1493                                 log_oom();
1494                                 goto child_fail;
1495                         }
1496
1497                         if (!sd_id128_equal(arg_uuid, SD_ID128_NULL)) {
1498                                 if (asprintf((char**)(envp + n_env++), "container_uuid=" SD_ID128_FORMAT_STR, SD_ID128_FORMAT_VAL(arg_uuid)) < 0) {
1499                                         log_oom();
1500                                         goto child_fail;
1501                                 }
1502                         }
1503
1504                         if (fdset_size(fds) > 0) {
1505                                 k = fdset_cloexec(fds, false);
1506                                 if (k < 0) {
1507                                         log_error("Failed to unset O_CLOEXEC for file descriptors.");
1508                                         goto child_fail;
1509                                 }
1510
1511                                 if ((asprintf((char **)(envp + n_env++), "LISTEN_FDS=%u", n_fd_passed) < 0) ||
1512                                     (asprintf((char **)(envp + n_env++), "LISTEN_PID=1") < 0)) {
1513                                         log_oom();
1514                                         goto child_fail;
1515                                 }
1516                         }
1517
1518                         setup_hostname();
1519
1520                         eventfd_read(sync_fd, &x);
1521                         close_nointr_nofail(sync_fd);
1522                         sync_fd = -1;
1523
1524                         if (!strv_isempty(arg_setenv)) {
1525                                 char **n;
1526
1527                                 n = strv_env_merge(2, envp, arg_setenv);
1528                                 if (!n) {
1529                                         log_oom();
1530                                         goto child_fail;
1531                                 }
1532
1533                                 env_use = n;
1534                         } else
1535                                 env_use = (char**) envp;
1536
1537 #if HAVE_SELINUX
1538                         if (arg_process_label)
1539                                 if (setexeccon(arg_process_label) < 0)
1540                                         log_error("setexeccon(\"%s\") failed: %m", arg_process_label);
1541 #endif
1542                         if (arg_boot) {
1543                                 char **a;
1544                                 size_t l;
1545
1546                                 /* Automatically search for the init system */
1547
1548                                 l = 1 + argc - optind;
1549                                 a = newa(char*, l + 1);
1550                                 memcpy(a + 1, argv + optind, l * sizeof(char*));
1551
1552                                 a[0] = (char*) "/usr/lib/systemd/systemd";
1553                                 execve(a[0], a, env_use);
1554
1555                                 a[0] = (char*) "/lib/systemd/systemd";
1556                                 execve(a[0], a, env_use);
1557
1558                                 a[0] = (char*) "/sbin/init";
1559                                 execve(a[0], a, env_use);
1560                         } else if (argc > optind)
1561                                 execvpe(argv[optind], argv + optind, env_use);
1562                         else {
1563                                 chdir(home ? home : "/root");
1564                                 execle("/bin/bash", "-bash", NULL, env_use);
1565                         }
1566
1567                         log_error("execv() failed: %m");
1568
1569                 child_fail:
1570                         _exit(EXIT_FAILURE);
1571                 }
1572
1573                 fdset_free(fds);
1574                 fds = NULL;
1575
1576                 r = register_machine(pid);
1577                 if (r < 0)
1578                         goto finish;
1579
1580                 eventfd_write(sync_fd, 1);
1581                 close_nointr_nofail(sync_fd);
1582                 sync_fd = -1;
1583
1584                 k = process_pty(master, &mask, arg_boot ? pid : 0, SIGRTMIN+3);
1585                 if (k < 0) {
1586                         r = EXIT_FAILURE;
1587                         break;
1588                 }
1589
1590                 if (!arg_quiet)
1591                         putc('\n', stdout);
1592
1593                 /* Kill if it is not dead yet anyway */
1594                 terminate_machine(pid);
1595
1596                 /* Redundant, but better safe than sorry */
1597                 kill(pid, SIGKILL);
1598
1599                 k = wait_for_terminate(pid, &status);
1600                 pid = 0;
1601
1602                 if (k < 0) {
1603                         r = EXIT_FAILURE;
1604                         break;
1605                 }
1606
1607                 if (status.si_code == CLD_EXITED) {
1608                         r = status.si_status;
1609                         if (status.si_status != 0) {
1610                                 log_error("Container %s failed with error code %i.", arg_machine, status.si_status);
1611                                 break;
1612                         }
1613
1614                         if (!arg_quiet)
1615                                 log_debug("Container %s exited successfully.", arg_machine);
1616                         break;
1617                 } else if (status.si_code == CLD_KILLED &&
1618                            status.si_status == SIGINT) {
1619
1620                         if (!arg_quiet)
1621                                 log_info("Container %s has been shut down.", arg_machine);
1622                         r = 0;
1623                         break;
1624                 } else if (status.si_code == CLD_KILLED &&
1625                            status.si_status == SIGHUP) {
1626
1627                         if (!arg_quiet)
1628                                 log_info("Container %s is being rebooted.", arg_machine);
1629                         continue;
1630                 } else if (status.si_code == CLD_KILLED ||
1631                            status.si_code == CLD_DUMPED) {
1632
1633                         log_error("Container %s terminated by signal %s.", arg_machine,  signal_to_string(status.si_status));
1634                         r = EXIT_FAILURE;
1635                         break;
1636                 } else {
1637                         log_error("Container %s failed due to unknown reason.", arg_machine);
1638                         r = EXIT_FAILURE;
1639                         break;
1640                 }
1641         }
1642
1643 finish:
1644         if (pid > 0)
1645                 kill(pid, SIGKILL);
1646
1647         free(arg_directory);
1648         free(arg_machine);
1649         free(arg_setenv);
1650
1651         return r;
1652 }