chiark / gitweb /
util: modernize readlink_malloc() a bit
[elogind.git] / src / nspawn / nspawn.c
1 /*-*- Mode: C; c-basic-offset: 8; indent-tabs-mode: nil -*-*/
2
3 /***
4   This file is part of systemd.
5
6   Copyright 2010 Lennart Poettering
7
8   systemd is free software; you can redistribute it and/or modify it
9   under the terms of the GNU Lesser General Public License as published by
10   the Free Software Foundation; either version 2.1 of the License, or
11   (at your option) any later version.
12
13   systemd is distributed in the hope that it will be useful, but
14   WITHOUT ANY WARRANTY; without even the implied warranty of
15   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
16   Lesser General Public License for more details.
17
18   You should have received a copy of the GNU Lesser General Public License
19   along with systemd; If not, see <http://www.gnu.org/licenses/>.
20 ***/
21
22 #include <signal.h>
23 #include <sched.h>
24 #include <unistd.h>
25 #include <sys/types.h>
26 #include <sys/syscall.h>
27 #include <sys/mount.h>
28 #include <sys/wait.h>
29 #include <stdlib.h>
30 #include <string.h>
31 #include <stdio.h>
32 #include <errno.h>
33 #include <sys/prctl.h>
34 #include <sys/capability.h>
35 #include <getopt.h>
36 #include <termios.h>
37 #include <sys/signalfd.h>
38 #include <grp.h>
39 #include <linux/fs.h>
40 #include <sys/un.h>
41 #include <sys/socket.h>
42 #include <linux/netlink.h>
43 #include <sys/eventfd.h>
44 #ifdef HAVE_SELINUX
45 #include <selinux/selinux.h>
46 #endif
47
48 #include "sd-daemon.h"
49 #include "sd-bus.h"
50 #include "sd-id128.h"
51 #include "log.h"
52 #include "util.h"
53 #include "mkdir.h"
54 #include "macro.h"
55 #include "audit.h"
56 #include "missing.h"
57 #include "cgroup-util.h"
58 #include "strv.h"
59 #include "path-util.h"
60 #include "loopback-setup.h"
61 #include "dev-setup.h"
62 #include "fdset.h"
63 #include "build.h"
64 #include "fileio.h"
65 #include "bus-util.h"
66 #include "bus-error.h"
67 #include "ptyfwd.h"
68 #include "bus-kernel.h"
69 #include "env-util.h"
70 #include "def.h"
71
/* Journal linking mode, chosen with --link-journal=MODE (or -j) in
 * parse_argv(). Each value corresponds to one accepted MODE keyword. */
typedef enum LinkJournal {
        LINK_NO,        /* --link-journal=no */
        LINK_AUTO,      /* --link-journal=auto; also the initial value of arg_link_journal */
        LINK_HOST,      /* --link-journal=host */
        LINK_GUEST      /* --link-journal=guest; also what -j selects */
} LinkJournal;
78
/* Command line state, filled in by parse_argv(). */

/* -D/--directory: canonicalized container root (heap allocated) */
static char *arg_directory = NULL;
/* -u/--user: copy of the user name or uid string */
static char *arg_user = NULL;
/* --uuid: machine UUID for the container; all-zero unless set */
static sd_id128_t arg_uuid = {};
/* -M/--machine: validated machine name, NULL if unset or reset with an empty argument */
static char *arg_machine = NULL;
/* -Z/--selinux-context: points directly into argv (not copied) */
static char *arg_selinux_context = NULL;
/* -L/--selinux-apifs-context: points directly into argv (not copied) */
static char *arg_selinux_apifs_context = NULL;
/* -S/--slice: strdup()ed copy of the slice name */
static const char *arg_slice = NULL;
/* --private-network */
static bool arg_private_network = false;
/* --read-only */
static bool arg_read_only = false;
/* -b/--boot */
static bool arg_boot = false;
/* --link-journal= / -j */
static LinkJournal arg_link_journal = LINK_AUTO;
/* Bit mask (1ULL << CAP_xxx) of capabilities retained by default;
 * --capability= adds bits and --drop-capability= clears bits. */
static uint64_t arg_retain =
        (1ULL << CAP_CHOWN) |
        (1ULL << CAP_DAC_OVERRIDE) |
        (1ULL << CAP_DAC_READ_SEARCH) |
        (1ULL << CAP_FOWNER) |
        (1ULL << CAP_FSETID) |
        (1ULL << CAP_IPC_OWNER) |
        (1ULL << CAP_KILL) |
        (1ULL << CAP_LEASE) |
        (1ULL << CAP_LINUX_IMMUTABLE) |
        (1ULL << CAP_NET_BIND_SERVICE) |
        (1ULL << CAP_NET_BROADCAST) |
        (1ULL << CAP_NET_RAW) |
        (1ULL << CAP_SETGID) |
        (1ULL << CAP_SETFCAP) |
        (1ULL << CAP_SETPCAP) |
        (1ULL << CAP_SETUID) |
        (1ULL << CAP_SYS_ADMIN) |
        (1ULL << CAP_SYS_CHROOT) |
        (1ULL << CAP_SYS_NICE) |
        (1ULL << CAP_SYS_PTRACE) |
        (1ULL << CAP_SYS_TTY_CONFIG) |
        (1ULL << CAP_SYS_RESOURCE) |
        (1ULL << CAP_SYS_BOOT) |
        (1ULL << CAP_AUDIT_WRITE) |
        (1ULL << CAP_AUDIT_CONTROL) |
        (1ULL << CAP_MKNOD);
/* --bind=: flat list of (source, destination) path pairs */
static char **arg_bind = NULL;
/* --bind-ro=: same layout as arg_bind, for read-only bind mounts */
static char **arg_bind_ro = NULL;
/* --setenv=: NAME=VALUE assignments */
static char **arg_setenv = NULL;
/* -q/--quiet */
static bool arg_quiet = false;
/* --share-system; parse_argv() forces arg_register to false when this is set */
static bool arg_share_system = false;
/* --register=BOOLEAN */
static bool arg_register = true;
123
/* Prints the usage summary for all supported options to stdout, using
 * program_invocation_short_name as the program name.
 *
 * Always returns 0.
 *
 * Note on -j: parse_argv() maps -j to LINK_GUEST, so the text below
 * documents it as equivalent to --link-journal=guest. */
static int help(void) {

        printf("%s [OPTIONS...] [PATH] [ARGUMENTS...]\n\n"
               "Spawn a minimal namespace container for debugging, testing and building.\n\n"
               "  -h --help                 Show this help\n"
               "     --version              Print version string\n"
               "  -D --directory=NAME       Root directory for the container\n"
               "  -b --boot                 Boot up full system (i.e. invoke init)\n"
               "  -u --user=USER            Run the command under specified user or uid\n"
               "     --uuid=UUID            Set a specific machine UUID for the container\n"
               "  -M --machine=NAME         Set the machine name for the container\n"
               "  -S --slice=SLICE          Place the container in the specified slice\n"
               "  -Z --selinux-context=SECLABEL\n"
               "                            Set the SELinux security context to be used by\n"
               "                            processes in the container\n"
               "  -L --selinux-apifs-context=SECLABEL\n"
               "                            Set the SELinux security context to be used by\n"
               "                            API/tmpfs file systems in the container\n"
               "     --private-network      Disable network in container\n"
               "     --share-system         Share system namespaces with host\n"
               "     --read-only            Mount the root directory read-only\n"
               "     --capability=CAP       In addition to the default, retain specified\n"
               "                            capability\n"
               "     --drop-capability=CAP  Drop the specified capability from the default set\n"
               "     --link-journal=MODE    Link up guest journal, one of no, auto, guest, host\n"
               "  -j                        Equivalent to --link-journal=guest\n"
               "     --bind=PATH[:PATH]     Bind mount a file or directory from the host into\n"
               "                            the container\n"
               "     --bind-ro=PATH[:PATH]  Similar, but creates a read-only bind mount\n"
               "     --setenv=NAME=VALUE    Pass an environment variable to PID 1\n"
               "     --register=BOOLEAN     Register container as machine\n"
               "  -q --quiet                Do not show status information\n",
               program_invocation_short_name);

        return 0;
}
160
161 static int parse_argv(int argc, char *argv[]) {
162
163         enum {
164                 ARG_VERSION = 0x100,
165                 ARG_PRIVATE_NETWORK,
166                 ARG_UUID,
167                 ARG_READ_ONLY,
168                 ARG_CAPABILITY,
169                 ARG_DROP_CAPABILITY,
170                 ARG_LINK_JOURNAL,
171                 ARG_BIND,
172                 ARG_BIND_RO,
173                 ARG_SETENV,
174                 ARG_SHARE_SYSTEM,
175                 ARG_REGISTER
176         };
177
178         static const struct option options[] = {
179                 { "help",                  no_argument,       NULL, 'h'                 },
180                 { "version",               no_argument,       NULL, ARG_VERSION         },
181                 { "directory",             required_argument, NULL, 'D'                 },
182                 { "user",                  required_argument, NULL, 'u'                 },
183                 { "private-network",       no_argument,       NULL, ARG_PRIVATE_NETWORK },
184                 { "boot",                  no_argument,       NULL, 'b'                 },
185                 { "uuid",                  required_argument, NULL, ARG_UUID            },
186                 { "read-only",             no_argument,       NULL, ARG_READ_ONLY       },
187                 { "capability",            required_argument, NULL, ARG_CAPABILITY      },
188                 { "drop-capability",       required_argument, NULL, ARG_DROP_CAPABILITY },
189                 { "link-journal",          required_argument, NULL, ARG_LINK_JOURNAL    },
190                 { "bind",                  required_argument, NULL, ARG_BIND            },
191                 { "bind-ro",               required_argument, NULL, ARG_BIND_RO         },
192                 { "machine",               required_argument, NULL, 'M'                 },
193                 { "slice",                 required_argument, NULL, 'S'                 },
194                 { "setenv",                required_argument, NULL, ARG_SETENV          },
195                 { "selinux-context",       required_argument, NULL, 'Z'                 },
196                 { "selinux-apifs-context", required_argument, NULL, 'L'                 },
197                 { "quiet",                 no_argument,       NULL, 'q'                 },
198                 { "share-system",          no_argument,       NULL, ARG_SHARE_SYSTEM    },
199                 { "register",              required_argument, NULL, ARG_REGISTER        },
200                 {}
201         };
202
203         int c, r;
204
205         assert(argc >= 0);
206         assert(argv);
207
208         while ((c = getopt_long(argc, argv, "+hD:u:bL:M:jS:Z:q", options, NULL)) >= 0) {
209
210                 switch (c) {
211
212                 case 'h':
213                         return help();
214
215                 case ARG_VERSION:
216                         puts(PACKAGE_STRING);
217                         puts(SYSTEMD_FEATURES);
218                         return 0;
219
220                 case 'D':
221                         free(arg_directory);
222                         arg_directory = canonicalize_file_name(optarg);
223                         if (!arg_directory) {
224                                 log_error("Invalid root directory: %m");
225                                 return -ENOMEM;
226                         }
227
228                         break;
229
230                 case 'u':
231                         free(arg_user);
232                         arg_user = strdup(optarg);
233                         if (!arg_user)
234                                 return log_oom();
235
236                         break;
237
238                 case ARG_PRIVATE_NETWORK:
239                         arg_private_network = true;
240                         break;
241
242                 case 'b':
243                         arg_boot = true;
244                         break;
245
246                 case ARG_UUID:
247                         r = sd_id128_from_string(optarg, &arg_uuid);
248                         if (r < 0) {
249                                 log_error("Invalid UUID: %s", optarg);
250                                 return r;
251                         }
252                         break;
253
254                 case 'S':
255                         arg_slice = strdup(optarg);
256                         if (!arg_slice)
257                                 return log_oom();
258
259                         break;
260
261                 case 'M':
262                         if (isempty(optarg)) {
263                                 free(arg_machine);
264                                 arg_machine = NULL;
265                         } else {
266
267                                 if (!hostname_is_valid(optarg)) {
268                                         log_error("Invalid machine name: %s", optarg);
269                                         return -EINVAL;
270                                 }
271
272                                 free(arg_machine);
273                                 arg_machine = strdup(optarg);
274                                 if (!arg_machine)
275                                         return log_oom();
276
277                                 break;
278                         }
279
280                 case 'Z':
281                         arg_selinux_context = optarg;
282                         break;
283
284                 case 'L':
285                         arg_selinux_apifs_context = optarg;
286                         break;
287
288                 case ARG_READ_ONLY:
289                         arg_read_only = true;
290                         break;
291
292                 case ARG_CAPABILITY:
293                 case ARG_DROP_CAPABILITY: {
294                         char *state, *word;
295                         size_t length;
296
297                         FOREACH_WORD_SEPARATOR(word, length, optarg, ",", state) {
298                                 cap_value_t cap;
299                                 char *t;
300
301                                 t = strndup(word, length);
302                                 if (!t)
303                                         return log_oom();
304
305                                 if (cap_from_name(t, &cap) < 0) {
306                                         log_error("Failed to parse capability %s.", t);
307                                         free(t);
308                                         return -EINVAL;
309                                 }
310
311                                 free(t);
312
313                                 if (c == ARG_CAPABILITY)
314                                         arg_retain |= 1ULL << (uint64_t) cap;
315                                 else
316                                         arg_retain &= ~(1ULL << (uint64_t) cap);
317                         }
318
319                         break;
320                 }
321
322                 case 'j':
323                         arg_link_journal = LINK_GUEST;
324                         break;
325
326                 case ARG_LINK_JOURNAL:
327                         if (streq(optarg, "auto"))
328                                 arg_link_journal = LINK_AUTO;
329                         else if (streq(optarg, "no"))
330                                 arg_link_journal = LINK_NO;
331                         else if (streq(optarg, "guest"))
332                                 arg_link_journal = LINK_GUEST;
333                         else if (streq(optarg, "host"))
334                                 arg_link_journal = LINK_HOST;
335                         else {
336                                 log_error("Failed to parse link journal mode %s", optarg);
337                                 return -EINVAL;
338                         }
339
340                         break;
341
342                 case ARG_BIND:
343                 case ARG_BIND_RO: {
344                         _cleanup_free_ char *a = NULL, *b = NULL;
345                         char *e;
346                         char ***x;
347
348                         x = c == ARG_BIND ? &arg_bind : &arg_bind_ro;
349
350                         e = strchr(optarg, ':');
351                         if (e) {
352                                 a = strndup(optarg, e - optarg);
353                                 b = strdup(e + 1);
354                         } else {
355                                 a = strdup(optarg);
356                                 b = strdup(optarg);
357                         }
358
359                         if (!a || !b)
360                                 return log_oom();
361
362                         if (!path_is_absolute(a) || !path_is_absolute(b)) {
363                                 log_error("Invalid bind mount specification: %s", optarg);
364                                 return -EINVAL;
365                         }
366
367                         r = strv_extend(x, a);
368                         if (r < 0)
369                                 return log_oom();
370
371                         r = strv_extend(x, b);
372                         if (r < 0)
373                                 return log_oom();
374
375                         break;
376                 }
377
378                 case ARG_SETENV: {
379                         char **n;
380
381                         if (!env_assignment_is_valid(optarg)) {
382                                 log_error("Environment variable assignment '%s' is not valid.", optarg);
383                                 return -EINVAL;
384                         }
385
386                         n = strv_env_set(arg_setenv, optarg);
387                         if (!n)
388                                 return log_oom();
389
390                         strv_free(arg_setenv);
391                         arg_setenv = n;
392                         break;
393                 }
394
395                 case 'q':
396                         arg_quiet = true;
397                         break;
398
399                 case ARG_SHARE_SYSTEM:
400                         arg_share_system = true;
401                         break;
402
403                 case ARG_REGISTER:
404                         r = parse_boolean(optarg);
405                         if (r < 0) {
406                                 log_error("Failed to parse --register= argument: %s", optarg);
407                                 return r;
408                         }
409
410                         arg_register = r;
411                         break;
412
413                 case '?':
414                         return -EINVAL;
415
416                 default:
417                         assert_not_reached("Unhandled option");
418                 }
419         }
420
421         if (arg_share_system)
422                 arg_register = false;
423
424         if (arg_boot && arg_share_system) {
425                 log_error("--boot and --share-system may not be combined.");
426                 return -EINVAL;
427         }
428
429         return 1;
430 }
431
432 static int mount_all(const char *dest) {
433
434         typedef struct MountPoint {
435                 const char *what;
436                 const char *where;
437                 const char *type;
438                 const char *options;
439                 unsigned long flags;
440                 bool fatal;
441         } MountPoint;
442
443         static const MountPoint mount_table[] = {
444                 { "proc",      "/proc",     "proc",  NULL,       MS_NOSUID|MS_NOEXEC|MS_NODEV, true  },
445                 { "/proc/sys", "/proc/sys", NULL,    NULL,       MS_BIND, true                       },   /* Bind mount first */
446                 { NULL,        "/proc/sys", NULL,    NULL,       MS_BIND|MS_RDONLY|MS_REMOUNT, true  },   /* Then, make it r/o */
447                 { "sysfs",     "/sys",      "sysfs", NULL,       MS_RDONLY|MS_NOSUID|MS_NOEXEC|MS_NODEV, true  },
448                 { "tmpfs",     "/dev",      "tmpfs", "mode=755", MS_NOSUID|MS_STRICTATIME,     true  },
449                 { "devpts",    "/dev/pts",  "devpts","newinstance,ptmxmode=0666,mode=620,gid=" STRINGIFY(TTY_GID), MS_NOSUID|MS_NOEXEC, true },
450                 { "tmpfs",     "/dev/shm",  "tmpfs", "mode=1777", MS_NOSUID|MS_NODEV|MS_STRICTATIME, true  },
451                 { "tmpfs",     "/run",      "tmpfs", "mode=755", MS_NOSUID|MS_NODEV|MS_STRICTATIME, true  },
452 #ifdef HAVE_SELINUX
453                 { "/sys/fs/selinux", "/sys/fs/selinux", NULL, NULL, MS_BIND,                      false },  /* Bind mount first */
454                 { NULL,              "/sys/fs/selinux", NULL, NULL, MS_BIND|MS_RDONLY|MS_REMOUNT, false },  /* Then, make it r/o */
455 #endif
456         };
457
458         unsigned k;
459         int r = 0;
460
461         for (k = 0; k < ELEMENTSOF(mount_table); k++) {
462                 _cleanup_free_ char *where = NULL;
463 #ifdef HAVE_SELINUX
464                 _cleanup_free_ char *options = NULL;
465 #endif
466                 const char *o;
467                 int t;
468
469                 where = strjoin(dest, "/", mount_table[k].where, NULL);
470                 if (!where)
471                         return log_oom();
472
473                 t = path_is_mount_point(where, true);
474                 if (t < 0) {
475                         log_error("Failed to detect whether %s is a mount point: %s", where, strerror(-t));
476
477                         if (r == 0)
478                                 r = t;
479
480                         continue;
481                 }
482
483                 /* Skip this entry if it is not a remount. */
484                 if (mount_table[k].what && t > 0)
485                         continue;
486
487                 mkdir_p(where, 0755);
488
489 #ifdef HAVE_SELINUX
490                 if (arg_selinux_apifs_context &&
491                     (streq_ptr(mount_table[k].what, "tmpfs") || streq_ptr(mount_table[k].what, "devpts"))) {
492                         options = strjoin(mount_table[k].options, ",context=\"", arg_selinux_apifs_context, "\"", NULL);
493                         if (!options)
494                                 return log_oom();
495
496                         o = options;
497                 } else
498 #endif
499                         o = mount_table[k].options;
500
501
502                 if (mount(mount_table[k].what,
503                           where,
504                           mount_table[k].type,
505                           mount_table[k].flags,
506                           o) < 0 &&
507                     mount_table[k].fatal) {
508
509                         log_error("mount(%s) failed: %m", where);
510
511                         if (r == 0)
512                                 r = -errno;
513                 }
514         }
515
516         return r;
517 }
518
519 static int mount_binds(const char *dest, char **l, unsigned long flags) {
520         char **x, **y;
521
522         STRV_FOREACH_PAIR(x, y, l) {
523                 char *where;
524                 struct stat source_st, dest_st;
525                 int r;
526
527                 if (stat(*x, &source_st) < 0) {
528                         log_error("failed to stat %s: %m", *x);
529                         return -errno;
530                 }
531
532                 where = strappenda(dest, *y);
533                 r = stat(where, &dest_st);
534                 if (r == 0) {
535                         if ((source_st.st_mode & S_IFMT) != (dest_st.st_mode & S_IFMT)) {
536                                 log_error("The file types of %s and %s do not match. Refusing bind mount",
537                                                 *x, where);
538                                 return -EINVAL;
539                         }
540                 } else if (errno == ENOENT) {
541                         r = mkdir_parents_label(where, 0755);
542                         if (r < 0) {
543                                 log_error("Failed to bind mount %s: %s", *x, strerror(-r));
544                                 return r;
545                         }
546                 } else {
547                         log_error("Failed to bind mount %s: %s", *x, strerror(errno));
548                         return -errno;
549                 }
550                 /* Create the mount point, but be conservative -- refuse to create block
551                 * and char devices. */
552                 if (S_ISDIR(source_st.st_mode))
553                         mkdir_label(where, 0755);
554                 else if (S_ISFIFO(source_st.st_mode))
555                         mkfifo(where, 0644);
556                 else if (S_ISSOCK(source_st.st_mode))
557                         mknod(where, 0644 | S_IFSOCK, 0);
558                 else if (S_ISREG(source_st.st_mode))
559                         touch(where);
560                 else {
561                         log_error("Refusing to create mountpoint for file: %s", *x);
562                         return -ENOTSUP;
563                 }
564
565                 if (mount(*x, where, "bind", MS_BIND, NULL) < 0) {
566                         log_error("mount(%s) failed: %m", where);
567                         return -errno;
568                 }
569
570                 if (flags && mount(NULL, where, NULL, MS_REMOUNT|MS_BIND|flags, NULL) < 0) {
571                         log_error("mount(%s) failed: %m", where);
572                         return -errno;
573                 }
574         }
575
576         return 0;
577 }
578
579 static int setup_timezone(const char *dest) {
580         _cleanup_free_ char *where = NULL, *p = NULL, *q = NULL, *check = NULL, *what = NULL;
581         char *z, *y;
582         int r;
583
584         assert(dest);
585
586         /* Fix the timezone, if possible */
587         r = readlink_malloc("/etc/localtime", &p);
588         if (r < 0) {
589                 log_warning("/etc/localtime is not a symlink, not updating container timezone.");
590                 return 0;
591         }
592
593         z = path_startswith(p, "../usr/share/zoneinfo/");
594         if (!z)
595                 z = path_startswith(p, "/usr/share/zoneinfo/");
596         if (!z) {
597                 log_warning("/etc/localtime does not point into /usr/share/zoneinfo/, not updating container timezone.");
598                 return 0;
599         }
600
601         where = strappend(dest, "/etc/localtime");
602         if (!where)
603                 return log_oom();
604
605         r = readlink_malloc(where, &q);
606         if (r >= 0) {
607                 y = path_startswith(q, "../usr/share/zoneinfo/");
608                 if (!y)
609                         y = path_startswith(q, "/usr/share/zoneinfo/");
610
611
612                 /* Already pointing to the right place? Then do nothing .. */
613                 if (y && streq(y, z))
614                         return 0;
615         }
616
617         check = strjoin(dest, "/usr/share/zoneinfo/", z, NULL);
618         if (!check)
619                 return log_oom();
620
621         if (access(check, F_OK) < 0) {
622                 log_warning("Timezone %s does not exist in container, not updating container timezone.", z);
623                 return 0;
624         }
625
626         what = strappend("../usr/share/zoneinfo/", z);
627         if (!what)
628                 return log_oom();
629
630         unlink(where);
631         if (symlink(what, where) < 0) {
632                 log_error("Failed to correct timezone of container: %m");
633                 return 0;
634         }
635
636         return 0;
637 }
638
639 static int setup_resolv_conf(const char *dest) {
640         char _cleanup_free_ *where = NULL;
641
642         assert(dest);
643
644         if (arg_private_network)
645                 return 0;
646
647         /* Fix resolv.conf, if possible */
648         where = strappend(dest, "/etc/resolv.conf");
649         if (!where)
650                 return log_oom();
651
652         /* We don't really care for the results of this really. If it
653          * fails, it fails, but meh... */
654         copy_file("/etc/resolv.conf", where, O_TRUNC|O_NOFOLLOW);
655
656         return 0;
657 }
658
659 static int setup_boot_id(const char *dest) {
660         _cleanup_free_ char *from = NULL, *to = NULL;
661         sd_id128_t rnd;
662         char as_uuid[37];
663         int r;
664
665         assert(dest);
666
667         if (arg_share_system)
668                 return 0;
669
670         /* Generate a new randomized boot ID, so that each boot-up of
671          * the container gets a new one */
672
673         from = strappend(dest, "/dev/proc-sys-kernel-random-boot-id");
674         to = strappend(dest, "/proc/sys/kernel/random/boot_id");
675         if (!from || !to)
676                 return log_oom();
677
678         r = sd_id128_randomize(&rnd);
679         if (r < 0) {
680                 log_error("Failed to generate random boot id: %s", strerror(-r));
681                 return r;
682         }
683
684         snprintf(as_uuid, sizeof(as_uuid),
685                  "%02x%02x%02x%02x-%02x%02x-%02x%02x-%02x%02x-%02x%02x%02x%02x%02x%02x",
686                  SD_ID128_FORMAT_VAL(rnd));
687         char_array_0(as_uuid);
688
689         r = write_string_file(from, as_uuid);
690         if (r < 0) {
691                 log_error("Failed to write boot id: %s", strerror(-r));
692                 return r;
693         }
694
695         if (mount(from, to, "bind", MS_BIND, NULL) < 0) {
696                 log_error("Failed to bind mount boot id: %m");
697                 r = -errno;
698         } else if (mount(from, to, "bind", MS_BIND|MS_REMOUNT|MS_RDONLY, NULL))
699                 log_warning("Failed to make boot id read-only: %m");
700
701         unlink(from);
702         return r;
703 }
704
705 static int copy_devnodes(const char *dest) {
706
707         static const char devnodes[] =
708                 "null\0"
709                 "zero\0"
710                 "full\0"
711                 "random\0"
712                 "urandom\0"
713                 "tty\0";
714
715         const char *d;
716         int r = 0;
717         _cleanup_umask_ mode_t u;
718
719         assert(dest);
720
721         u = umask(0000);
722
723         NULSTR_FOREACH(d, devnodes) {
724                 _cleanup_free_ char *from = NULL, *to = NULL;
725                 struct stat st;
726
727                 from = strappend("/dev/", d);
728                 to = strjoin(dest, "/dev/", d, NULL);
729                 if (!from || !to)
730                         return log_oom();
731
732                 if (stat(from, &st) < 0) {
733
734                         if (errno != ENOENT) {
735                                 log_error("Failed to stat %s: %m", from);
736                                 return -errno;
737                         }
738
739                 } else if (!S_ISCHR(st.st_mode) && !S_ISBLK(st.st_mode)) {
740
741                         log_error("%s is not a char or block device, cannot copy", from);
742                         return -EIO;
743
744                 } else if (mknod(to, st.st_mode, st.st_rdev) < 0) {
745
746                         log_error("mknod(%s) failed: %m", dest);
747                         return  -errno;
748                 }
749         }
750
751         return r;
752 }
753
754 static int setup_ptmx(const char *dest) {
755         _cleanup_free_ char *p = NULL;
756
757         p = strappend(dest, "/dev/ptmx");
758         if (!p)
759                 return log_oom();
760
761         if (symlink("pts/ptmx", p) < 0) {
762                 log_error("Failed to create /dev/ptmx symlink: %m");
763                 return -errno;
764         }
765
766         return 0;
767 }
768
769 static int setup_dev_console(const char *dest, const char *console) {
770         struct stat st;
771         _cleanup_free_ char *to = NULL;
772         int r;
773         _cleanup_umask_ mode_t u;
774
775         assert(dest);
776         assert(console);
777
778         u = umask(0000);
779
780         if (stat(console, &st) < 0) {
781                 log_error("Failed to stat %s: %m", console);
782                 return -errno;
783
784         } else if (!S_ISCHR(st.st_mode)) {
785                 log_error("/dev/console is not a char device");
786                 return -EIO;
787         }
788
789         r = chmod_and_chown(console, 0600, 0, 0);
790         if (r < 0) {
791                 log_error("Failed to correct access mode for TTY: %s", strerror(-r));
792                 return r;
793         }
794
795         if (asprintf(&to, "%s/dev/console", dest) < 0)
796                 return log_oom();
797
798         /* We need to bind mount the right tty to /dev/console since
799          * ptys can only exist on pts file systems. To have something
800          * to bind mount things on we create a device node first, that
801          * has the right major/minor (note that the major minor
802          * doesn't actually matter here, since we mount it over
803          * anyway). */
804
805         if (mknod(to, (st.st_mode & ~07777) | 0600, st.st_rdev) < 0) {
806                 log_error("mknod() for /dev/console failed: %m");
807                 return -errno;
808         }
809
810         if (mount(console, to, "bind", MS_BIND, NULL) < 0) {
811                 log_error("Bind mount for /dev/console failed: %m");
812                 return -errno;
813         }
814
815         return 0;
816 }
817
818 static int setup_kmsg(const char *dest, int kmsg_socket) {
819         _cleanup_free_ char *from = NULL, *to = NULL;
820         int r, fd, k;
821         _cleanup_umask_ mode_t u;
822         union {
823                 struct cmsghdr cmsghdr;
824                 uint8_t buf[CMSG_SPACE(sizeof(int))];
825         } control = {};
826         struct msghdr mh = {
827                 .msg_control = &control,
828                 .msg_controllen = sizeof(control),
829         };
830         struct cmsghdr *cmsg;
831
832         assert(dest);
833         assert(kmsg_socket >= 0);
834
835         u = umask(0000);
836
837         /* We create the kmsg FIFO as /dev/kmsg, but immediately
838          * delete it after bind mounting it to /proc/kmsg. While FIFOs
839          * on the reading side behave very similar to /proc/kmsg,
840          * their writing side behaves differently from /dev/kmsg in
841          * that writing blocks when nothing is reading. In order to
842          * avoid any problems with containers deadlocking due to this
843          * we simply make /dev/kmsg unavailable to the container. */
844         if (asprintf(&from, "%s/dev/kmsg", dest) < 0 ||
845             asprintf(&to, "%s/proc/kmsg", dest) < 0)
846                 return log_oom();
847
848         if (mkfifo(from, 0600) < 0) {
849                 log_error("mkfifo() for /dev/kmsg failed: %m");
850                 return -errno;
851         }
852
853         r = chmod_and_chown(from, 0600, 0, 0);
854         if (r < 0) {
855                 log_error("Failed to correct access mode for /dev/kmsg: %s", strerror(-r));
856                 return r;
857         }
858
859         if (mount(from, to, "bind", MS_BIND, NULL) < 0) {
860                 log_error("Bind mount for /proc/kmsg failed: %m");
861                 return -errno;
862         }
863
864         fd = open(from, O_RDWR|O_NDELAY|O_CLOEXEC);
865         if (fd < 0) {
866                 log_error("Failed to open fifo: %m");
867                 return -errno;
868         }
869
870         cmsg = CMSG_FIRSTHDR(&mh);
871         cmsg->cmsg_level = SOL_SOCKET;
872         cmsg->cmsg_type = SCM_RIGHTS;
873         cmsg->cmsg_len = CMSG_LEN(sizeof(int));
874         memcpy(CMSG_DATA(cmsg), &fd, sizeof(int));
875
876         mh.msg_controllen = cmsg->cmsg_len;
877
878         /* Store away the fd in the socket, so that it stays open as
879          * long as we run the child */
880         k = sendmsg(kmsg_socket, &mh, MSG_DONTWAIT|MSG_NOSIGNAL);
881         close_nointr_nofail(fd);
882
883         if (k < 0) {
884                 log_error("Failed to send FIFO fd: %m");
885                 return -errno;
886         }
887
888         /* And now make the FIFO unavailable as /dev/kmsg... */
889         unlink(from);
890         return 0;
891 }
892
893 static int setup_hostname(void) {
894
895         if (arg_share_system)
896                 return 0;
897
898         if (sethostname(arg_machine, strlen(arg_machine)) < 0)
899                 return -errno;
900
901         return 0;
902 }
903
904 static int setup_journal(const char *directory) {
905         sd_id128_t machine_id, this_id;
906         _cleanup_free_ char *p = NULL, *b = NULL, *q = NULL, *d = NULL;
907         char *id;
908         int r;
909
910         p = strappend(directory, "/etc/machine-id");
911         if (!p)
912                 return log_oom();
913
914         r = read_one_line_file(p, &b);
915         if (r == -ENOENT && arg_link_journal == LINK_AUTO)
916                 return 0;
917         else if (r < 0) {
918                 log_error("Failed to read machine ID from %s: %s", p, strerror(-r));
919                 return r;
920         }
921
922         id = strstrip(b);
923         if (isempty(id) && arg_link_journal == LINK_AUTO)
924                 return 0;
925
926         /* Verify validity */
927         r = sd_id128_from_string(id, &machine_id);
928         if (r < 0) {
929                 log_error("Failed to parse machine ID from %s: %s", p, strerror(-r));
930                 return r;
931         }
932
933         r = sd_id128_get_machine(&this_id);
934         if (r < 0) {
935                 log_error("Failed to retrieve machine ID: %s", strerror(-r));
936                 return r;
937         }
938
939         if (sd_id128_equal(machine_id, this_id)) {
940                 log_full(arg_link_journal == LINK_AUTO ? LOG_WARNING : LOG_ERR,
941                          "Host and machine ids are equal (%s): refusing to link journals", id);
942                 if (arg_link_journal == LINK_AUTO)
943                         return 0;
944                 return
945                         -EEXIST;
946         }
947
948         if (arg_link_journal == LINK_NO)
949                 return 0;
950
951         free(p);
952         p = strappend("/var/log/journal/", id);
953         q = strjoin(directory, "/var/log/journal/", id, NULL);
954         if (!p || !q)
955                 return log_oom();
956
957         if (path_is_mount_point(p, false) > 0) {
958                 if (arg_link_journal != LINK_AUTO) {
959                         log_error("%s: already a mount point, refusing to use for journal", p);
960                         return -EEXIST;
961                 }
962
963                 return 0;
964         }
965
966         if (path_is_mount_point(q, false) > 0) {
967                 if (arg_link_journal != LINK_AUTO) {
968                         log_error("%s: already a mount point, refusing to use for journal", q);
969                         return -EEXIST;
970                 }
971
972                 return 0;
973         }
974
975         r = readlink_and_make_absolute(p, &d);
976         if (r >= 0) {
977                 if ((arg_link_journal == LINK_GUEST ||
978                      arg_link_journal == LINK_AUTO) &&
979                     path_equal(d, q)) {
980
981                         r = mkdir_p(q, 0755);
982                         if (r < 0)
983                                 log_warning("failed to create directory %s: %m", q);
984                         return 0;
985                 }
986
987                 if (unlink(p) < 0) {
988                         log_error("Failed to remove symlink %s: %m", p);
989                         return -errno;
990                 }
991         } else if (r == -EINVAL) {
992
993                 if (arg_link_journal == LINK_GUEST &&
994                     rmdir(p) < 0) {
995
996                         if (errno == ENOTDIR) {
997                                 log_error("%s already exists and is neither a symlink nor a directory", p);
998                                 return r;
999                         } else {
1000                                 log_error("Failed to remove %s: %m", p);
1001                                 return -errno;
1002                         }
1003                 }
1004         } else if (r != -ENOENT) {
1005                 log_error("readlink(%s) failed: %m", p);
1006                 return r;
1007         }
1008
1009         if (arg_link_journal == LINK_GUEST) {
1010
1011                 if (symlink(q, p) < 0) {
1012                         log_error("Failed to symlink %s to %s: %m", q, p);
1013                         return -errno;
1014                 }
1015
1016                 r = mkdir_p(q, 0755);
1017                 if (r < 0)
1018                         log_warning("failed to create directory %s: %m", q);
1019                 return 0;
1020         }
1021
1022         if (arg_link_journal == LINK_HOST) {
1023                 r = mkdir_p(p, 0755);
1024                 if (r < 0) {
1025                         log_error("Failed to create %s: %m", p);
1026                         return r;
1027                 }
1028
1029         } else if (access(p, F_OK) < 0)
1030                 return 0;
1031
1032         if (dir_is_empty(q) == 0) {
1033                 log_error("%s not empty.", q);
1034                 return -ENOTEMPTY;
1035         }
1036
1037         r = mkdir_p(q, 0755);
1038         if (r < 0) {
1039                 log_error("Failed to create %s: %m", q);
1040                 return r;
1041         }
1042
1043         if (mount(p, q, "bind", MS_BIND, NULL) < 0) {
1044                 log_error("Failed to bind mount journal from host into guest: %m");
1045                 return -errno;
1046         }
1047
1048         return 0;
1049 }
1050
1051 static int setup_kdbus(const char *dest, const char *path) {
1052         const char *p;
1053
1054         if (!path)
1055                 return 0;
1056
1057         p = strappenda(dest, "/dev/kdbus");
1058         if (mkdir(p, 0755) < 0) {
1059                 log_error("Failed to create kdbus path: %m");
1060                 return  -errno;
1061         }
1062
1063         if (mount(path, p, "bind", MS_BIND, NULL) < 0) {
1064                 log_error("Failed to mount kdbus domain path: %m");
1065                 return -errno;
1066         }
1067
1068         return 0;
1069 }
1070
1071 static int drop_capabilities(void) {
1072         return capability_bounding_set_drop(~arg_retain, false);
1073 }
1074
1075 static int register_machine(pid_t pid) {
1076         _cleanup_bus_error_free_ sd_bus_error error = SD_BUS_ERROR_NULL;
1077         _cleanup_bus_unref_ sd_bus *bus = NULL;
1078         int r;
1079
1080         if (!arg_register)
1081                 return 0;
1082
1083         r = sd_bus_default_system(&bus);
1084         if (r < 0) {
1085                 log_error("Failed to open system bus: %s", strerror(-r));
1086                 return r;
1087         }
1088
1089         r = sd_bus_call_method(
1090                         bus,
1091                         "org.freedesktop.machine1",
1092                         "/org/freedesktop/machine1",
1093                         "org.freedesktop.machine1.Manager",
1094                         "CreateMachine",
1095                         &error,
1096                         NULL,
1097                         "sayssusa(sv)",
1098                         arg_machine,
1099                         SD_BUS_MESSAGE_APPEND_ID128(arg_uuid),
1100                         "nspawn",
1101                         "container",
1102                         (uint32_t) pid,
1103                         strempty(arg_directory),
1104                         !isempty(arg_slice), "Slice", "s", arg_slice);
1105         if (r < 0) {
1106                 log_error("Failed to register machine: %s", bus_error_message(&error, r));
1107                 return r;
1108         }
1109
1110         return 0;
1111 }
1112
1113 static int terminate_machine(pid_t pid) {
1114         _cleanup_bus_error_free_ sd_bus_error error = SD_BUS_ERROR_NULL;
1115         _cleanup_bus_message_unref_ sd_bus_message *reply = NULL;
1116         _cleanup_bus_unref_ sd_bus *bus = NULL;
1117         const char *path;
1118         int r;
1119
1120         if (!arg_register)
1121                 return 0;
1122
1123         r = sd_bus_default_system(&bus);
1124         if (r < 0) {
1125                 log_error("Failed to open system bus: %s", strerror(-r));
1126                 return r;
1127         }
1128
1129         r = sd_bus_call_method(
1130                         bus,
1131                         "org.freedesktop.machine1",
1132                         "/org/freedesktop/machine1",
1133                         "org.freedesktop.machine1.Manager",
1134                         "GetMachineByPID",
1135                         &error,
1136                         &reply,
1137                         "u",
1138                         (uint32_t) pid);
1139         if (r < 0) {
1140                 /* Note that the machine might already have been
1141                  * cleaned up automatically, hence don't consider it a
1142                  * failure if we cannot get the machine object. */
1143                 log_debug("Failed to get machine: %s", bus_error_message(&error, r));
1144                 return 0;
1145         }
1146
1147         r = sd_bus_message_read(reply, "o", &path);
1148         if (r < 0)
1149                 return bus_log_parse_error(r);
1150
1151         r = sd_bus_call_method(
1152                         bus,
1153                         "org.freedesktop.machine1",
1154                         path,
1155                         "org.freedesktop.machine1.Machine",
1156                         "Terminate",
1157                         &error,
1158                         NULL,
1159                         NULL);
1160         if (r < 0) {
1161                 log_debug("Failed to terminate machine: %s", bus_error_message(&error, r));
1162                 return 0;
1163         }
1164
1165         return 0;
1166 }
1167
1168 static bool audit_enabled(void) {
1169         int fd;
1170
1171         fd = socket(AF_NETLINK, SOCK_RAW, NETLINK_AUDIT);
1172         if (fd >= 0) {
1173                 close_nointr_nofail(fd);
1174                 return true;
1175         }
1176         return false;
1177 }
1178
1179 int main(int argc, char *argv[]) {
1180         pid_t pid = 0;
1181         int r = EXIT_FAILURE, k;
1182         _cleanup_close_ int master = -1, kdbus_fd = -1, sync_fd = -1;
1183         int n_fd_passed;
1184         const char *console = NULL;
1185         sigset_t mask;
1186         _cleanup_close_pipe_ int kmsg_socket_pair[2] = { -1, -1 };
1187         _cleanup_fdset_free_ FDSet *fds = NULL;
1188         _cleanup_free_ char *kdbus_domain = NULL;
1189
1190         log_parse_environment();
1191         log_open();
1192
1193         k = parse_argv(argc, argv);
1194         if (k < 0)
1195                 goto finish;
1196         else if (k == 0) {
1197                 r = EXIT_SUCCESS;
1198                 goto finish;
1199         }
1200
1201         if (arg_directory) {
1202                 char *p;
1203
1204                 p = path_make_absolute_cwd(arg_directory);
1205                 free(arg_directory);
1206                 arg_directory = p;
1207         } else
1208                 arg_directory = get_current_dir_name();
1209
1210         if (!arg_directory) {
1211                 log_error("Failed to determine path, please use -D.");
1212                 goto finish;
1213         }
1214
1215         path_kill_slashes(arg_directory);
1216
1217         if (!arg_machine) {
1218                 arg_machine = strdup(basename(arg_directory));
1219                 if (!arg_machine) {
1220                         log_oom();
1221                         goto finish;
1222                 }
1223
1224                 hostname_cleanup(arg_machine, false);
1225                 if (isempty(arg_machine)) {
1226                         log_error("Failed to determine machine name automatically, please use -M.");
1227                         goto finish;
1228                 }
1229         }
1230
1231         if (geteuid() != 0) {
1232                 log_error("Need to be root.");
1233                 goto finish;
1234         }
1235
1236         if (sd_booted() <= 0) {
1237                 log_error("Not running on a systemd system.");
1238                 goto finish;
1239         }
1240
1241         if (arg_boot && audit_enabled()) {
1242                 log_warning("The kernel auditing subsystem is known to be incompatible with containers.\n"
1243                             "Please make sure to turn off auditing with 'audit=0' on the kernel command\n"
1244                             "line before using systemd-nspawn. Sleeping for 5s...\n");
1245                 sleep(5);
1246         }
1247
1248         if (path_equal(arg_directory, "/")) {
1249                 log_error("Spawning container on root directory not supported.");
1250                 goto finish;
1251         }
1252
1253         if (arg_boot && path_is_os_tree(arg_directory) <= 0) {
1254                 log_error("Directory %s doesn't look like an OS root directory (/etc/os-release is missing). Refusing.", arg_directory);
1255                 goto finish;
1256         }
1257
1258         log_close();
1259         n_fd_passed = sd_listen_fds(false);
1260         if (n_fd_passed > 0) {
1261                 k = fdset_new_listen_fds(&fds, false);
1262                 if (k < 0) {
1263                         log_error("Failed to collect file descriptors: %s", strerror(-k));
1264                         goto finish;
1265                 }
1266         }
1267         fdset_close_others(fds);
1268         log_open();
1269
1270         master = posix_openpt(O_RDWR|O_NOCTTY|O_CLOEXEC|O_NDELAY);
1271         if (master < 0) {
1272                 log_error("Failed to acquire pseudo tty: %m");
1273                 goto finish;
1274         }
1275
1276         console = ptsname(master);
1277         if (!console) {
1278                 log_error("Failed to determine tty name: %m");
1279                 goto finish;
1280         }
1281
1282         if (!arg_quiet)
1283                 log_info("Spawning container %s on %s. Press ^] three times within 1s to abort execution.", arg_machine, arg_directory);
1284
1285         if (unlockpt(master) < 0) {
1286                 log_error("Failed to unlock tty: %m");
1287                 goto finish;
1288         }
1289
1290
1291         if (access("/dev/kdbus/control", F_OK) >= 0) {
1292
1293                 if (arg_share_system) {
1294                         kdbus_domain = strdup("/dev/kdbus");
1295                         if (!kdbus_domain) {
1296                                 log_oom();
1297                                 goto finish;
1298                         }
1299                 } else {
1300                         const char *ns;
1301
1302                         ns = strappenda("machine-", arg_machine);
1303                         kdbus_fd = bus_kernel_create_domain(ns, &kdbus_domain);
1304                         if (r < 0)
1305                                 log_debug("Failed to create kdbus domain: %s", strerror(-r));
1306                         else
1307                                 log_debug("Successfully created kdbus domain as %s", kdbus_domain);
1308                 }
1309         }
1310
1311         if (socketpair(AF_UNIX, SOCK_DGRAM|SOCK_NONBLOCK|SOCK_CLOEXEC, 0, kmsg_socket_pair) < 0) {
1312                 log_error("Failed to create kmsg socket pair: %m");
1313                 goto finish;
1314         }
1315
1316         sd_notify(0, "READY=1");
1317
1318         assert_se(sigemptyset(&mask) == 0);
1319         sigset_add_many(&mask, SIGCHLD, SIGWINCH, SIGTERM, SIGINT, -1);
1320         assert_se(sigprocmask(SIG_BLOCK, &mask, NULL) == 0);
1321
1322         for (;;) {
1323                 siginfo_t status;
1324
1325                 sync_fd = eventfd(0, EFD_CLOEXEC);
1326                 if (sync_fd < 0) {
1327                         log_error("Failed to create event fd: %m");
1328                         goto finish;
1329                 }
1330
1331                 pid = syscall(__NR_clone,
1332                               SIGCHLD|CLONE_NEWNS|
1333                               (arg_share_system ? 0 : CLONE_NEWIPC|CLONE_NEWPID|CLONE_NEWUTS)|
1334                               (arg_private_network ? CLONE_NEWNET : 0), NULL);
1335                 if (pid < 0) {
1336                         if (errno == EINVAL)
1337                                 log_error("clone() failed, do you have namespace support enabled in your kernel? (You need UTS, IPC, PID and NET namespacing built in): %m");
1338                         else
1339                                 log_error("clone() failed: %m");
1340
1341                         goto finish;
1342                 }
1343
1344                 if (pid == 0) {
1345                         /* child */
1346                         const char *home = NULL;
1347                         uid_t uid = (uid_t) -1;
1348                         gid_t gid = (gid_t) -1;
1349                         unsigned n_env = 2;
1350                         const char *envp[] = {
1351                                 "PATH=" DEFAULT_PATH_SPLIT_USR,
1352                                 "container=systemd-nspawn", /* LXC sets container=lxc, so follow the scheme here */
1353                                 NULL, /* TERM */
1354                                 NULL, /* HOME */
1355                                 NULL, /* USER */
1356                                 NULL, /* LOGNAME */
1357                                 NULL, /* container_uuid */
1358                                 NULL, /* LISTEN_FDS */
1359                                 NULL, /* LISTEN_PID */
1360                                 NULL
1361                         };
1362                         char **env_use;
1363                         eventfd_t x;
1364
1365                         envp[n_env] = strv_find_prefix(environ, "TERM=");
1366                         if (envp[n_env])
1367                                 n_env ++;
1368
1369                         close_nointr_nofail(master);
1370                         master = -1;
1371
1372                         close_nointr(STDIN_FILENO);
1373                         close_nointr(STDOUT_FILENO);
1374                         close_nointr(STDERR_FILENO);
1375
1376                         close_nointr_nofail(kmsg_socket_pair[0]);
1377                         kmsg_socket_pair[0] = -1;
1378
1379                         reset_all_signal_handlers();
1380
1381                         assert_se(sigemptyset(&mask) == 0);
1382                         assert_se(sigprocmask(SIG_SETMASK, &mask, NULL) == 0);
1383
1384                         k = open_terminal(console, O_RDWR);
1385                         if (k != STDIN_FILENO) {
1386                                 if (k >= 0) {
1387                                         close_nointr_nofail(k);
1388                                         k = -EINVAL;
1389                                 }
1390
1391                                 log_error("Failed to open console: %s", strerror(-k));
1392                                 goto child_fail;
1393                         }
1394
1395                         if (dup2(STDIN_FILENO, STDOUT_FILENO) != STDOUT_FILENO ||
1396                             dup2(STDIN_FILENO, STDERR_FILENO) != STDERR_FILENO) {
1397                                 log_error("Failed to duplicate console: %m");
1398                                 goto child_fail;
1399                         }
1400
1401                         if (setsid() < 0) {
1402                                 log_error("setsid() failed: %m");
1403                                 goto child_fail;
1404                         }
1405
1406                         if (prctl(PR_SET_PDEATHSIG, SIGKILL) < 0) {
1407                                 log_error("PR_SET_PDEATHSIG failed: %m");
1408                                 goto child_fail;
1409                         }
1410
1411                         /* Mark everything as slave, so that we still
1412                          * receive mounts from the real root, but don't
1413                          * propagate mounts to the real root. */
1414                         if (mount(NULL, "/", NULL, MS_SLAVE|MS_REC, NULL) < 0) {
1415                                 log_error("MS_SLAVE|MS_REC failed: %m");
1416                                 goto child_fail;
1417                         }
1418
1419                         /* Turn directory into bind mount */
1420                         if (mount(arg_directory, arg_directory, "bind", MS_BIND|MS_REC, NULL) < 0) {
1421                                 log_error("Failed to make bind mount.");
1422                                 goto child_fail;
1423                         }
1424
1425                         if (arg_read_only)
1426                                 if (mount(arg_directory, arg_directory, "bind", MS_BIND|MS_REMOUNT|MS_RDONLY|MS_REC, NULL) < 0) {
1427                                         log_error("Failed to make read-only.");
1428                                         goto child_fail;
1429                                 }
1430
1431                         if (mount_all(arg_directory) < 0)
1432                                 goto child_fail;
1433
1434                         if (copy_devnodes(arg_directory) < 0)
1435                                 goto child_fail;
1436
1437                         if (setup_ptmx(arg_directory) < 0)
1438                                 goto child_fail;
1439
1440                         dev_setup(arg_directory);
1441
1442                         if (setup_dev_console(arg_directory, console) < 0)
1443                                 goto child_fail;
1444
1445                         if (setup_kmsg(arg_directory, kmsg_socket_pair[1]) < 0)
1446                                 goto child_fail;
1447
1448                         close_nointr_nofail(kmsg_socket_pair[1]);
1449                         kmsg_socket_pair[1] = -1;
1450
1451                         if (setup_boot_id(arg_directory) < 0)
1452                                 goto child_fail;
1453
1454                         if (setup_timezone(arg_directory) < 0)
1455                                 goto child_fail;
1456
1457                         if (setup_resolv_conf(arg_directory) < 0)
1458                                 goto child_fail;
1459
1460                         if (setup_journal(arg_directory) < 0)
1461                                 goto child_fail;
1462
1463                         if (mount_binds(arg_directory, arg_bind, 0) < 0)
1464                                 goto child_fail;
1465
1466                         if (mount_binds(arg_directory, arg_bind_ro, MS_RDONLY) < 0)
1467                                 goto child_fail;
1468
1469                         if (setup_kdbus(arg_directory, kdbus_domain) < 0)
1470                                 goto child_fail;
1471
1472                         if (chdir(arg_directory) < 0) {
1473                                 log_error("chdir(%s) failed: %m", arg_directory);
1474                                 goto child_fail;
1475                         }
1476
1477                         if (mount(arg_directory, "/", NULL, MS_MOVE, NULL) < 0) {
1478                                 log_error("mount(MS_MOVE) failed: %m");
1479                                 goto child_fail;
1480                         }
1481
1482                         if (chroot(".") < 0) {
1483                                 log_error("chroot() failed: %m");
1484                                 goto child_fail;
1485                         }
1486
1487                         if (chdir("/") < 0) {
1488                                 log_error("chdir() failed: %m");
1489                                 goto child_fail;
1490                         }
1491
1492                         umask(0022);
1493
1494                         if (arg_private_network)
1495                                 loopback_setup();
1496
1497                         if (drop_capabilities() < 0) {
1498                                 log_error("drop_capabilities() failed: %m");
1499                                 goto child_fail;
1500                         }
1501
1502                         if (arg_user) {
1503
1504                                 /* Note that this resolves user names
1505                                  * inside the container, and hence
1506                                  * accesses the NSS modules from the
1507                                  * container and not the host. This is
1508                                  * a bit weird... */
1509
1510                                 if (get_user_creds((const char**)&arg_user, &uid, &gid, &home, NULL) < 0) {
1511                                         log_error("get_user_creds() failed: %m");
1512                                         goto child_fail;
1513                                 }
1514
1515                                 if (mkdir_parents_label(home, 0775) < 0) {
1516                                         log_error("mkdir_parents_label() failed: %m");
1517                                         goto child_fail;
1518                                 }
1519
1520                                 if (mkdir_safe_label(home, 0775, uid, gid) < 0) {
1521                                         log_error("mkdir_safe_label() failed: %m");
1522                                         goto child_fail;
1523                                 }
1524
1525                                 if (initgroups((const char*)arg_user, gid) < 0) {
1526                                         log_error("initgroups() failed: %m");
1527                                         goto child_fail;
1528                                 }
1529
1530                                 if (setresgid(gid, gid, gid) < 0) {
1531                                         log_error("setregid() failed: %m");
1532                                         goto child_fail;
1533                                 }
1534
1535                                 if (setresuid(uid, uid, uid) < 0) {
1536                                         log_error("setreuid() failed: %m");
1537                                         goto child_fail;
1538                                 }
1539                         } else {
1540                                 /* Reset everything fully to 0, just in case */
1541
1542                                 if (setgroups(0, NULL) < 0) {
1543                                         log_error("setgroups() failed: %m");
1544                                         goto child_fail;
1545                                 }
1546
1547                                 if (setresgid(0, 0, 0) < 0) {
1548                                         log_error("setregid() failed: %m");
1549                                         goto child_fail;
1550                                 }
1551
1552                                 if (setresuid(0, 0, 0) < 0) {
1553                                         log_error("setreuid() failed: %m");
1554                                         goto child_fail;
1555                                 }
1556                         }
1557
1558                         if ((asprintf((char**)(envp + n_env++), "HOME=%s", home ? home: "/root") < 0) ||
1559                             (asprintf((char**)(envp + n_env++), "USER=%s", arg_user ? arg_user : "root") < 0) ||
1560                             (asprintf((char**)(envp + n_env++), "LOGNAME=%s", arg_user ? arg_user : "root") < 0)) {
1561                                 log_oom();
1562                                 goto child_fail;
1563                         }
1564
1565                         if (!sd_id128_equal(arg_uuid, SD_ID128_NULL)) {
1566                                 if (asprintf((char**)(envp + n_env++), "container_uuid=" SD_ID128_FORMAT_STR, SD_ID128_FORMAT_VAL(arg_uuid)) < 0) {
1567                                         log_oom();
1568                                         goto child_fail;
1569                                 }
1570                         }
1571
1572                         if (fdset_size(fds) > 0) {
1573                                 k = fdset_cloexec(fds, false);
1574                                 if (k < 0) {
1575                                         log_error("Failed to unset O_CLOEXEC for file descriptors.");
1576                                         goto child_fail;
1577                                 }
1578
1579                                 if ((asprintf((char **)(envp + n_env++), "LISTEN_FDS=%u", n_fd_passed) < 0) ||
1580                                     (asprintf((char **)(envp + n_env++), "LISTEN_PID=1") < 0)) {
1581                                         log_oom();
1582                                         goto child_fail;
1583                                 }
1584                         }
1585
1586                         setup_hostname();
1587
1588                         eventfd_read(sync_fd, &x);
1589                         close_nointr_nofail(sync_fd);
1590                         sync_fd = -1;
1591
1592                         if (!strv_isempty(arg_setenv)) {
1593                                 char **n;
1594
1595                                 n = strv_env_merge(2, envp, arg_setenv);
1596                                 if (!n) {
1597                                         log_oom();
1598                                         goto child_fail;
1599                                 }
1600
1601                                 env_use = n;
1602                         } else
1603                                 env_use = (char**) envp;
1604
1605 #ifdef HAVE_SELINUX
1606                         if (arg_selinux_context)
1607                                 if (setexeccon(arg_selinux_context) < 0)
1608                                         log_error("setexeccon(\"%s\") failed: %m", arg_selinux_context);
1609 #endif
1610                         if (arg_boot) {
1611                                 char **a;
1612                                 size_t l;
1613
1614                                 /* Automatically search for the init system */
1615
1616                                 l = 1 + argc - optind;
1617                                 a = newa(char*, l + 1);
1618                                 memcpy(a + 1, argv + optind, l * sizeof(char*));
1619
1620                                 a[0] = (char*) "/usr/lib/systemd/systemd";
1621                                 execve(a[0], a, env_use);
1622
1623                                 a[0] = (char*) "/lib/systemd/systemd";
1624                                 execve(a[0], a, env_use);
1625
1626                                 a[0] = (char*) "/sbin/init";
1627                                 execve(a[0], a, env_use);
1628                         } else if (argc > optind)
1629                                 execvpe(argv[optind], argv + optind, env_use);
1630                         else {
1631                                 chdir(home ? home : "/root");
1632                                 execle("/bin/bash", "-bash", NULL, env_use);
1633                         }
1634
1635                         log_error("execv() failed: %m");
1636
1637                 child_fail:
1638                         _exit(EXIT_FAILURE);
1639                 }
1640
1641                 fdset_free(fds);
1642                 fds = NULL;
1643
1644                 r = register_machine(pid);
1645                 if (r < 0)
1646                         goto finish;
1647
1648                 eventfd_write(sync_fd, 1);
1649                 close_nointr_nofail(sync_fd);
1650                 sync_fd = -1;
1651
1652                 k = process_pty(master, &mask, arg_boot ? pid : 0, SIGRTMIN+3);
1653                 if (k < 0) {
1654                         r = EXIT_FAILURE;
1655                         break;
1656                 }
1657
1658                 if (!arg_quiet)
1659                         putc('\n', stdout);
1660
1661                 /* Kill if it is not dead yet anyway */
1662                 terminate_machine(pid);
1663
1664                 /* Redundant, but better safe than sorry */
1665                 kill(pid, SIGKILL);
1666
1667                 k = wait_for_terminate(pid, &status);
1668                 pid = 0;
1669
1670                 if (k < 0) {
1671                         r = EXIT_FAILURE;
1672                         break;
1673                 }
1674
1675                 if (status.si_code == CLD_EXITED) {
1676                         r = status.si_status;
1677                         if (status.si_status != 0) {
1678                                 log_error("Container %s failed with error code %i.", arg_machine, status.si_status);
1679                                 break;
1680                         }
1681
1682                         if (!arg_quiet)
1683                                 log_debug("Container %s exited successfully.", arg_machine);
1684                         break;
1685                 } else if (status.si_code == CLD_KILLED &&
1686                            status.si_status == SIGINT) {
1687
1688                         if (!arg_quiet)
1689                                 log_info("Container %s has been shut down.", arg_machine);
1690                         r = 0;
1691                         break;
1692                 } else if (status.si_code == CLD_KILLED &&
1693                            status.si_status == SIGHUP) {
1694
1695                         if (!arg_quiet)
1696                                 log_info("Container %s is being rebooted.", arg_machine);
1697                         continue;
1698                 } else if (status.si_code == CLD_KILLED ||
1699                            status.si_code == CLD_DUMPED) {
1700
1701                         log_error("Container %s terminated by signal %s.", arg_machine, signal_to_string(status.si_status));
1702                         r = EXIT_FAILURE;
1703                         break;
1704                 } else {
1705                         log_error("Container %s failed due to unknown reason.", arg_machine);
1706                         r = EXIT_FAILURE;
1707                         break;
1708                 }
1709         }
1710
1711 finish:
1712         if (pid > 0)
1713                 kill(pid, SIGKILL);
1714
1715         free(arg_directory);
1716         free(arg_machine);
1717         free(arg_setenv);
1718
1719         return r;
1720 }