chiark / gitweb /
nspawn: introduce --capability=all for retaining all capabilities
[elogind.git] / src / nspawn / nspawn.c
1 /*-*- Mode: C; c-basic-offset: 8; indent-tabs-mode: nil -*-*/
2
3 /***
4   This file is part of systemd.
5
6   Copyright 2010 Lennart Poettering
7
8   systemd is free software; you can redistribute it and/or modify it
9   under the terms of the GNU Lesser General Public License as published by
10   the Free Software Foundation; either version 2.1 of the License, or
11   (at your option) any later version.
12
13   systemd is distributed in the hope that it will be useful, but
14   WITHOUT ANY WARRANTY; without even the implied warranty of
15   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
16   Lesser General Public License for more details.
17
18   You should have received a copy of the GNU Lesser General Public License
19   along with systemd; If not, see <http://www.gnu.org/licenses/>.
20 ***/
21
22 #include <signal.h>
23 #include <sched.h>
24 #include <unistd.h>
25 #include <sys/types.h>
26 #include <sys/syscall.h>
27 #include <sys/mount.h>
28 #include <sys/wait.h>
29 #include <stdlib.h>
30 #include <string.h>
31 #include <stdio.h>
32 #include <errno.h>
33 #include <sys/prctl.h>
34 #include <sys/capability.h>
35 #include <getopt.h>
36 #include <termios.h>
37 #include <sys/signalfd.h>
38 #include <grp.h>
39 #include <linux/fs.h>
40 #include <sys/un.h>
41 #include <sys/socket.h>
42 #include <linux/netlink.h>
43 #include <sys/eventfd.h>
44 #ifdef HAVE_SELINUX
45 #include <selinux/selinux.h>
46 #endif
47
48 #include "sd-daemon.h"
49 #include "sd-bus.h"
50 #include "sd-id128.h"
51 #include "log.h"
52 #include "util.h"
53 #include "mkdir.h"
54 #include "macro.h"
55 #include "audit.h"
56 #include "missing.h"
57 #include "cgroup-util.h"
58 #include "strv.h"
59 #include "path-util.h"
60 #include "loopback-setup.h"
61 #include "dev-setup.h"
62 #include "fdset.h"
63 #include "build.h"
64 #include "fileio.h"
65 #include "bus-util.h"
66 #include "bus-error.h"
67 #include "ptyfwd.h"
68 #include "bus-kernel.h"
69 #include "env-util.h"
70 #include "def.h"
71
/* Journal linking mode, selected with --link-journal= (or -j). */
typedef enum LinkJournal {
        LINK_NO,        /* --link-journal=no */
        LINK_AUTO,      /* --link-journal=auto */
        LINK_HOST,      /* --link-journal=host */
        LINK_GUEST,     /* --link-journal=guest, also what -j selects */
} LinkJournal;
78
79 static char *arg_directory = NULL;
80 static char *arg_user = NULL;
81 static sd_id128_t arg_uuid = {};
82 static char *arg_machine = NULL;
83 static char *arg_selinux_context = NULL;
84 static char *arg_selinux_apifs_context = NULL;
85 static const char *arg_slice = NULL;
86 static bool arg_private_network = false;
87 static bool arg_read_only = false;
88 static bool arg_boot = false;
89 static LinkJournal arg_link_journal = LINK_AUTO;
90 static uint64_t arg_retain =
91         (1ULL << CAP_CHOWN) |
92         (1ULL << CAP_DAC_OVERRIDE) |
93         (1ULL << CAP_DAC_READ_SEARCH) |
94         (1ULL << CAP_FOWNER) |
95         (1ULL << CAP_FSETID) |
96         (1ULL << CAP_IPC_OWNER) |
97         (1ULL << CAP_KILL) |
98         (1ULL << CAP_LEASE) |
99         (1ULL << CAP_LINUX_IMMUTABLE) |
100         (1ULL << CAP_NET_BIND_SERVICE) |
101         (1ULL << CAP_NET_BROADCAST) |
102         (1ULL << CAP_NET_RAW) |
103         (1ULL << CAP_SETGID) |
104         (1ULL << CAP_SETFCAP) |
105         (1ULL << CAP_SETPCAP) |
106         (1ULL << CAP_SETUID) |
107         (1ULL << CAP_SYS_ADMIN) |
108         (1ULL << CAP_SYS_CHROOT) |
109         (1ULL << CAP_SYS_NICE) |
110         (1ULL << CAP_SYS_PTRACE) |
111         (1ULL << CAP_SYS_TTY_CONFIG) |
112         (1ULL << CAP_SYS_RESOURCE) |
113         (1ULL << CAP_SYS_BOOT) |
114         (1ULL << CAP_AUDIT_WRITE) |
115         (1ULL << CAP_AUDIT_CONTROL) |
116         (1ULL << CAP_MKNOD);
117 static char **arg_bind = NULL;
118 static char **arg_bind_ro = NULL;
119 static char **arg_setenv = NULL;
120 static bool arg_quiet = false;
121 static bool arg_share_system = false;
122 static bool arg_register = true;
123 static bool arg_keep_unit = false;
124
125 static int help(void) {
126
127         printf("%s [OPTIONS...] [PATH] [ARGUMENTS...]\n\n"
128                "Spawn a minimal namespace container for debugging, testing and building.\n\n"
129                "  -h --help                 Show this help\n"
130                "     --version              Print version string\n"
131                "  -D --directory=NAME       Root directory for the container\n"
132                "  -b --boot                 Boot up full system (i.e. invoke init)\n"
133                "  -u --user=USER            Run the command under specified user or uid\n"
134                "     --uuid=UUID            Set a specific machine UUID for the container\n"
135                "  -M --machine=NAME         Set the machine name for the container\n"
136                "  -S --slice=SLICE          Place the container in the specified slice\n"
137                "  -Z --selinux-context=SECLABEL\n"
138                "                            Set the SELinux security context to be used by\n"
139                "                            processes in the container\n"
140                "  -L --selinux-apifs-context=SECLABEL\n"
141                "                            Set the SELinux security context to be used by\n"
142                "                            API/tmpfs file systems in the container\n"
143                "     --private-network      Disable network in container\n"
144                "     --share-system         Share system namespaces with host\n"
145                "     --read-only            Mount the root directory read-only\n"
146                "     --capability=CAP       In addition to the default, retain specified\n"
147                "                            capability\n"
148                "     --drop-capability=CAP  Drop the specified capability from the default set\n"
149                "     --link-journal=MODE    Link up guest journal, one of no, auto, guest, host\n"
150                "  -j                        Equivalent to --link-journal=host\n"
151                "     --bind=PATH[:PATH]     Bind mount a file or directory from the host into\n"
152                "                            the container\n"
153                "     --bind-ro=PATH[:PATH]  Similar, but creates a read-only bind mount\n"
154                "     --setenv=NAME=VALUE    Pass an environment variable to PID 1\n"
155                "     --register=BOOLEAN     Register container as machine\n"
156                "     --keep-unit            Do not register a scope for the machine, reuse\n"
157                "                            the service unit nspawn is running in\n"
158                "  -q --quiet                Do not show status information\n",
159                program_invocation_short_name);
160
161         return 0;
162 }
163
164 static int parse_argv(int argc, char *argv[]) {
165
166         enum {
167                 ARG_VERSION = 0x100,
168                 ARG_PRIVATE_NETWORK,
169                 ARG_UUID,
170                 ARG_READ_ONLY,
171                 ARG_CAPABILITY,
172                 ARG_DROP_CAPABILITY,
173                 ARG_LINK_JOURNAL,
174                 ARG_BIND,
175                 ARG_BIND_RO,
176                 ARG_SETENV,
177                 ARG_SHARE_SYSTEM,
178                 ARG_REGISTER,
179                 ARG_KEEP_UNIT
180         };
181
182         static const struct option options[] = {
183                 { "help",                  no_argument,       NULL, 'h'                 },
184                 { "version",               no_argument,       NULL, ARG_VERSION         },
185                 { "directory",             required_argument, NULL, 'D'                 },
186                 { "user",                  required_argument, NULL, 'u'                 },
187                 { "private-network",       no_argument,       NULL, ARG_PRIVATE_NETWORK },
188                 { "boot",                  no_argument,       NULL, 'b'                 },
189                 { "uuid",                  required_argument, NULL, ARG_UUID            },
190                 { "read-only",             no_argument,       NULL, ARG_READ_ONLY       },
191                 { "capability",            required_argument, NULL, ARG_CAPABILITY      },
192                 { "drop-capability",       required_argument, NULL, ARG_DROP_CAPABILITY },
193                 { "link-journal",          required_argument, NULL, ARG_LINK_JOURNAL    },
194                 { "bind",                  required_argument, NULL, ARG_BIND            },
195                 { "bind-ro",               required_argument, NULL, ARG_BIND_RO         },
196                 { "machine",               required_argument, NULL, 'M'                 },
197                 { "slice",                 required_argument, NULL, 'S'                 },
198                 { "setenv",                required_argument, NULL, ARG_SETENV          },
199                 { "selinux-context",       required_argument, NULL, 'Z'                 },
200                 { "selinux-apifs-context", required_argument, NULL, 'L'                 },
201                 { "quiet",                 no_argument,       NULL, 'q'                 },
202                 { "share-system",          no_argument,       NULL, ARG_SHARE_SYSTEM    },
203                 { "register",              required_argument, NULL, ARG_REGISTER        },
204                 { "keep-unit",             no_argument,       NULL, ARG_KEEP_UNIT       },
205                 {}
206         };
207
208         int c, r;
209
210         assert(argc >= 0);
211         assert(argv);
212
213         while ((c = getopt_long(argc, argv, "+hD:u:bL:M:jS:Z:q", options, NULL)) >= 0) {
214
215                 switch (c) {
216
217                 case 'h':
218                         return help();
219
220                 case ARG_VERSION:
221                         puts(PACKAGE_STRING);
222                         puts(SYSTEMD_FEATURES);
223                         return 0;
224
225                 case 'D':
226                         free(arg_directory);
227                         arg_directory = canonicalize_file_name(optarg);
228                         if (!arg_directory) {
229                                 log_error("Invalid root directory: %m");
230                                 return -ENOMEM;
231                         }
232
233                         break;
234
235                 case 'u':
236                         free(arg_user);
237                         arg_user = strdup(optarg);
238                         if (!arg_user)
239                                 return log_oom();
240
241                         break;
242
243                 case ARG_PRIVATE_NETWORK:
244                         arg_private_network = true;
245                         break;
246
247                 case 'b':
248                         arg_boot = true;
249                         break;
250
251                 case ARG_UUID:
252                         r = sd_id128_from_string(optarg, &arg_uuid);
253                         if (r < 0) {
254                                 log_error("Invalid UUID: %s", optarg);
255                                 return r;
256                         }
257                         break;
258
259                 case 'S':
260                         arg_slice = strdup(optarg);
261                         if (!arg_slice)
262                                 return log_oom();
263
264                         break;
265
266                 case 'M':
267                         if (isempty(optarg)) {
268                                 free(arg_machine);
269                                 arg_machine = NULL;
270                         } else {
271
272                                 if (!hostname_is_valid(optarg)) {
273                                         log_error("Invalid machine name: %s", optarg);
274                                         return -EINVAL;
275                                 }
276
277                                 free(arg_machine);
278                                 arg_machine = strdup(optarg);
279                                 if (!arg_machine)
280                                         return log_oom();
281
282                                 break;
283                         }
284
285                 case 'Z':
286                         arg_selinux_context = optarg;
287                         break;
288
289                 case 'L':
290                         arg_selinux_apifs_context = optarg;
291                         break;
292
293                 case ARG_READ_ONLY:
294                         arg_read_only = true;
295                         break;
296
297                 case ARG_CAPABILITY:
298                 case ARG_DROP_CAPABILITY: {
299                         char *state, *word;
300                         size_t length;
301
302                         FOREACH_WORD_SEPARATOR(word, length, optarg, ",", state) {
303                                 _cleanup_free_ char *t;
304                                 cap_value_t cap;
305
306                                 t = strndup(word, length);
307                                 if (!t)
308                                         return log_oom();
309
310                                 if (streq(t, "all")) {
311                                         if (c == ARG_CAPABILITY)
312                                                 arg_retain = (uint64_t) -1;
313                                         else
314                                                 arg_retain = 0;
315                                 } else {
316                                         if (cap_from_name(t, &cap) < 0) {
317                                                 log_error("Failed to parse capability %s.", t);
318                                                 return -EINVAL;
319                                         }
320
321                                         if (c == ARG_CAPABILITY)
322                                                 arg_retain |= 1ULL << (uint64_t) cap;
323                                         else
324                                                 arg_retain &= ~(1ULL << (uint64_t) cap);
325                                 }
326                         }
327
328                         break;
329                 }
330
331                 case 'j':
332                         arg_link_journal = LINK_GUEST;
333                         break;
334
335                 case ARG_LINK_JOURNAL:
336                         if (streq(optarg, "auto"))
337                                 arg_link_journal = LINK_AUTO;
338                         else if (streq(optarg, "no"))
339                                 arg_link_journal = LINK_NO;
340                         else if (streq(optarg, "guest"))
341                                 arg_link_journal = LINK_GUEST;
342                         else if (streq(optarg, "host"))
343                                 arg_link_journal = LINK_HOST;
344                         else {
345                                 log_error("Failed to parse link journal mode %s", optarg);
346                                 return -EINVAL;
347                         }
348
349                         break;
350
351                 case ARG_BIND:
352                 case ARG_BIND_RO: {
353                         _cleanup_free_ char *a = NULL, *b = NULL;
354                         char *e;
355                         char ***x;
356
357                         x = c == ARG_BIND ? &arg_bind : &arg_bind_ro;
358
359                         e = strchr(optarg, ':');
360                         if (e) {
361                                 a = strndup(optarg, e - optarg);
362                                 b = strdup(e + 1);
363                         } else {
364                                 a = strdup(optarg);
365                                 b = strdup(optarg);
366                         }
367
368                         if (!a || !b)
369                                 return log_oom();
370
371                         if (!path_is_absolute(a) || !path_is_absolute(b)) {
372                                 log_error("Invalid bind mount specification: %s", optarg);
373                                 return -EINVAL;
374                         }
375
376                         r = strv_extend(x, a);
377                         if (r < 0)
378                                 return log_oom();
379
380                         r = strv_extend(x, b);
381                         if (r < 0)
382                                 return log_oom();
383
384                         break;
385                 }
386
387                 case ARG_SETENV: {
388                         char **n;
389
390                         if (!env_assignment_is_valid(optarg)) {
391                                 log_error("Environment variable assignment '%s' is not valid.", optarg);
392                                 return -EINVAL;
393                         }
394
395                         n = strv_env_set(arg_setenv, optarg);
396                         if (!n)
397                                 return log_oom();
398
399                         strv_free(arg_setenv);
400                         arg_setenv = n;
401                         break;
402                 }
403
404                 case 'q':
405                         arg_quiet = true;
406                         break;
407
408                 case ARG_SHARE_SYSTEM:
409                         arg_share_system = true;
410                         break;
411
412                 case ARG_REGISTER:
413                         r = parse_boolean(optarg);
414                         if (r < 0) {
415                                 log_error("Failed to parse --register= argument: %s", optarg);
416                                 return r;
417                         }
418
419                         arg_register = r;
420                         break;
421
422                 case ARG_KEEP_UNIT:
423                         arg_keep_unit = true;
424                         break;
425
426                 case '?':
427                         return -EINVAL;
428
429                 default:
430                         assert_not_reached("Unhandled option");
431                 }
432         }
433
434         if (arg_share_system)
435                 arg_register = false;
436
437         if (arg_boot && arg_share_system) {
438                 log_error("--boot and --share-system may not be combined.");
439                 return -EINVAL;
440         }
441
442         if (arg_keep_unit && cg_pid_get_owner_uid(0, NULL) >= 0) {
443                 log_error("--keep-unit may not be used when invoked from a user session.");
444                 return -EINVAL;
445         }
446
447         return 1;
448 }
449
450 static int mount_all(const char *dest) {
451
452         typedef struct MountPoint {
453                 const char *what;
454                 const char *where;
455                 const char *type;
456                 const char *options;
457                 unsigned long flags;
458                 bool fatal;
459         } MountPoint;
460
461         static const MountPoint mount_table[] = {
462                 { "proc",      "/proc",     "proc",  NULL,       MS_NOSUID|MS_NOEXEC|MS_NODEV, true  },
463                 { "/proc/sys", "/proc/sys", NULL,    NULL,       MS_BIND, true                       },   /* Bind mount first */
464                 { NULL,        "/proc/sys", NULL,    NULL,       MS_BIND|MS_RDONLY|MS_REMOUNT, true  },   /* Then, make it r/o */
465                 { "sysfs",     "/sys",      "sysfs", NULL,       MS_RDONLY|MS_NOSUID|MS_NOEXEC|MS_NODEV, true  },
466                 { "tmpfs",     "/dev",      "tmpfs", "mode=755", MS_NOSUID|MS_STRICTATIME,     true  },
467                 { "devpts",    "/dev/pts",  "devpts","newinstance,ptmxmode=0666,mode=620,gid=" STRINGIFY(TTY_GID), MS_NOSUID|MS_NOEXEC, true },
468                 { "tmpfs",     "/dev/shm",  "tmpfs", "mode=1777", MS_NOSUID|MS_NODEV|MS_STRICTATIME, true  },
469                 { "tmpfs",     "/run",      "tmpfs", "mode=755", MS_NOSUID|MS_NODEV|MS_STRICTATIME, true  },
470 #ifdef HAVE_SELINUX
471                 { "/sys/fs/selinux", "/sys/fs/selinux", NULL, NULL, MS_BIND,                      false },  /* Bind mount first */
472                 { NULL,              "/sys/fs/selinux", NULL, NULL, MS_BIND|MS_RDONLY|MS_REMOUNT, false },  /* Then, make it r/o */
473 #endif
474         };
475
476         unsigned k;
477         int r = 0;
478
479         for (k = 0; k < ELEMENTSOF(mount_table); k++) {
480                 _cleanup_free_ char *where = NULL;
481 #ifdef HAVE_SELINUX
482                 _cleanup_free_ char *options = NULL;
483 #endif
484                 const char *o;
485                 int t;
486
487                 where = strjoin(dest, "/", mount_table[k].where, NULL);
488                 if (!where)
489                         return log_oom();
490
491                 t = path_is_mount_point(where, true);
492                 if (t < 0) {
493                         log_error("Failed to detect whether %s is a mount point: %s", where, strerror(-t));
494
495                         if (r == 0)
496                                 r = t;
497
498                         continue;
499                 }
500
501                 /* Skip this entry if it is not a remount. */
502                 if (mount_table[k].what && t > 0)
503                         continue;
504
505                 mkdir_p(where, 0755);
506
507 #ifdef HAVE_SELINUX
508                 if (arg_selinux_apifs_context &&
509                     (streq_ptr(mount_table[k].what, "tmpfs") || streq_ptr(mount_table[k].what, "devpts"))) {
510                         options = strjoin(mount_table[k].options, ",context=\"", arg_selinux_apifs_context, "\"", NULL);
511                         if (!options)
512                                 return log_oom();
513
514                         o = options;
515                 } else
516 #endif
517                         o = mount_table[k].options;
518
519
520                 if (mount(mount_table[k].what,
521                           where,
522                           mount_table[k].type,
523                           mount_table[k].flags,
524                           o) < 0 &&
525                     mount_table[k].fatal) {
526
527                         log_error("mount(%s) failed: %m", where);
528
529                         if (r == 0)
530                                 r = -errno;
531                 }
532         }
533
534         return r;
535 }
536
537 static int mount_binds(const char *dest, char **l, unsigned long flags) {
538         char **x, **y;
539
540         STRV_FOREACH_PAIR(x, y, l) {
541                 char *where;
542                 struct stat source_st, dest_st;
543                 int r;
544
545                 if (stat(*x, &source_st) < 0) {
546                         log_error("failed to stat %s: %m", *x);
547                         return -errno;
548                 }
549
550                 where = strappenda(dest, *y);
551                 r = stat(where, &dest_st);
552                 if (r == 0) {
553                         if ((source_st.st_mode & S_IFMT) != (dest_st.st_mode & S_IFMT)) {
554                                 log_error("The file types of %s and %s do not match. Refusing bind mount",
555                                                 *x, where);
556                                 return -EINVAL;
557                         }
558                 } else if (errno == ENOENT) {
559                         r = mkdir_parents_label(where, 0755);
560                         if (r < 0) {
561                                 log_error("Failed to bind mount %s: %s", *x, strerror(-r));
562                                 return r;
563                         }
564                 } else {
565                         log_error("Failed to bind mount %s: %s", *x, strerror(errno));
566                         return -errno;
567                 }
568                 /* Create the mount point, but be conservative -- refuse to create block
569                 * and char devices. */
570                 if (S_ISDIR(source_st.st_mode))
571                         mkdir_label(where, 0755);
572                 else if (S_ISFIFO(source_st.st_mode))
573                         mkfifo(where, 0644);
574                 else if (S_ISSOCK(source_st.st_mode))
575                         mknod(where, 0644 | S_IFSOCK, 0);
576                 else if (S_ISREG(source_st.st_mode))
577                         touch(where);
578                 else {
579                         log_error("Refusing to create mountpoint for file: %s", *x);
580                         return -ENOTSUP;
581                 }
582
583                 if (mount(*x, where, "bind", MS_BIND, NULL) < 0) {
584                         log_error("mount(%s) failed: %m", where);
585                         return -errno;
586                 }
587
588                 if (flags && mount(NULL, where, NULL, MS_REMOUNT|MS_BIND|flags, NULL) < 0) {
589                         log_error("mount(%s) failed: %m", where);
590                         return -errno;
591                 }
592         }
593
594         return 0;
595 }
596
597 static int setup_timezone(const char *dest) {
598         _cleanup_free_ char *where = NULL, *p = NULL, *q = NULL, *check = NULL, *what = NULL;
599         char *z, *y;
600         int r;
601
602         assert(dest);
603
604         /* Fix the timezone, if possible */
605         r = readlink_malloc("/etc/localtime", &p);
606         if (r < 0) {
607                 log_warning("/etc/localtime is not a symlink, not updating container timezone.");
608                 return 0;
609         }
610
611         z = path_startswith(p, "../usr/share/zoneinfo/");
612         if (!z)
613                 z = path_startswith(p, "/usr/share/zoneinfo/");
614         if (!z) {
615                 log_warning("/etc/localtime does not point into /usr/share/zoneinfo/, not updating container timezone.");
616                 return 0;
617         }
618
619         where = strappend(dest, "/etc/localtime");
620         if (!where)
621                 return log_oom();
622
623         r = readlink_malloc(where, &q);
624         if (r >= 0) {
625                 y = path_startswith(q, "../usr/share/zoneinfo/");
626                 if (!y)
627                         y = path_startswith(q, "/usr/share/zoneinfo/");
628
629
630                 /* Already pointing to the right place? Then do nothing .. */
631                 if (y && streq(y, z))
632                         return 0;
633         }
634
635         check = strjoin(dest, "/usr/share/zoneinfo/", z, NULL);
636         if (!check)
637                 return log_oom();
638
639         if (access(check, F_OK) < 0) {
640                 log_warning("Timezone %s does not exist in container, not updating container timezone.", z);
641                 return 0;
642         }
643
644         what = strappend("../usr/share/zoneinfo/", z);
645         if (!what)
646                 return log_oom();
647
648         unlink(where);
649         if (symlink(what, where) < 0) {
650                 log_error("Failed to correct timezone of container: %m");
651                 return 0;
652         }
653
654         return 0;
655 }
656
657 static int setup_resolv_conf(const char *dest) {
658         char _cleanup_free_ *where = NULL;
659
660         assert(dest);
661
662         if (arg_private_network)
663                 return 0;
664
665         /* Fix resolv.conf, if possible */
666         where = strappend(dest, "/etc/resolv.conf");
667         if (!where)
668                 return log_oom();
669
670         /* We don't really care for the results of this really. If it
671          * fails, it fails, but meh... */
672         copy_file("/etc/resolv.conf", where, O_TRUNC|O_NOFOLLOW);
673
674         return 0;
675 }
676
677 static int setup_boot_id(const char *dest) {
678         _cleanup_free_ char *from = NULL, *to = NULL;
679         sd_id128_t rnd;
680         char as_uuid[37];
681         int r;
682
683         assert(dest);
684
685         if (arg_share_system)
686                 return 0;
687
688         /* Generate a new randomized boot ID, so that each boot-up of
689          * the container gets a new one */
690
691         from = strappend(dest, "/dev/proc-sys-kernel-random-boot-id");
692         to = strappend(dest, "/proc/sys/kernel/random/boot_id");
693         if (!from || !to)
694                 return log_oom();
695
696         r = sd_id128_randomize(&rnd);
697         if (r < 0) {
698                 log_error("Failed to generate random boot id: %s", strerror(-r));
699                 return r;
700         }
701
702         snprintf(as_uuid, sizeof(as_uuid),
703                  "%02x%02x%02x%02x-%02x%02x-%02x%02x-%02x%02x-%02x%02x%02x%02x%02x%02x",
704                  SD_ID128_FORMAT_VAL(rnd));
705         char_array_0(as_uuid);
706
707         r = write_string_file(from, as_uuid);
708         if (r < 0) {
709                 log_error("Failed to write boot id: %s", strerror(-r));
710                 return r;
711         }
712
713         if (mount(from, to, "bind", MS_BIND, NULL) < 0) {
714                 log_error("Failed to bind mount boot id: %m");
715                 r = -errno;
716         } else if (mount(from, to, "bind", MS_BIND|MS_REMOUNT|MS_RDONLY, NULL))
717                 log_warning("Failed to make boot id read-only: %m");
718
719         unlink(from);
720         return r;
721 }
722
723 static int copy_devnodes(const char *dest) {
724
725         static const char devnodes[] =
726                 "null\0"
727                 "zero\0"
728                 "full\0"
729                 "random\0"
730                 "urandom\0"
731                 "tty\0";
732
733         const char *d;
734         int r = 0;
735         _cleanup_umask_ mode_t u;
736
737         assert(dest);
738
739         u = umask(0000);
740
741         NULSTR_FOREACH(d, devnodes) {
742                 _cleanup_free_ char *from = NULL, *to = NULL;
743                 struct stat st;
744
745                 from = strappend("/dev/", d);
746                 to = strjoin(dest, "/dev/", d, NULL);
747                 if (!from || !to)
748                         return log_oom();
749
750                 if (stat(from, &st) < 0) {
751
752                         if (errno != ENOENT) {
753                                 log_error("Failed to stat %s: %m", from);
754                                 return -errno;
755                         }
756
757                 } else if (!S_ISCHR(st.st_mode) && !S_ISBLK(st.st_mode)) {
758
759                         log_error("%s is not a char or block device, cannot copy", from);
760                         return -EIO;
761
762                 } else if (mknod(to, st.st_mode, st.st_rdev) < 0) {
763
764                         log_error("mknod(%s) failed: %m", dest);
765                         return  -errno;
766                 }
767         }
768
769         return r;
770 }
771
772 static int setup_ptmx(const char *dest) {
773         _cleanup_free_ char *p = NULL;
774
775         p = strappend(dest, "/dev/ptmx");
776         if (!p)
777                 return log_oom();
778
779         if (symlink("pts/ptmx", p) < 0) {
780                 log_error("Failed to create /dev/ptmx symlink: %m");
781                 return -errno;
782         }
783
784         return 0;
785 }
786
787 static int setup_dev_console(const char *dest, const char *console) {
788         struct stat st;
789         _cleanup_free_ char *to = NULL;
790         int r;
791         _cleanup_umask_ mode_t u;
792
793         assert(dest);
794         assert(console);
795
796         u = umask(0000);
797
798         if (stat(console, &st) < 0) {
799                 log_error("Failed to stat %s: %m", console);
800                 return -errno;
801
802         } else if (!S_ISCHR(st.st_mode)) {
803                 log_error("/dev/console is not a char device");
804                 return -EIO;
805         }
806
807         r = chmod_and_chown(console, 0600, 0, 0);
808         if (r < 0) {
809                 log_error("Failed to correct access mode for TTY: %s", strerror(-r));
810                 return r;
811         }
812
813         if (asprintf(&to, "%s/dev/console", dest) < 0)
814                 return log_oom();
815
816         /* We need to bind mount the right tty to /dev/console since
817          * ptys can only exist on pts file systems. To have something
818          * to bind mount things on we create a device node first, that
819          * has the right major/minor (note that the major minor
820          * doesn't actually matter here, since we mount it over
821          * anyway). */
822
823         if (mknod(to, (st.st_mode & ~07777) | 0600, st.st_rdev) < 0) {
824                 log_error("mknod() for /dev/console failed: %m");
825                 return -errno;
826         }
827
828         if (mount(console, to, "bind", MS_BIND, NULL) < 0) {
829                 log_error("Bind mount for /dev/console failed: %m");
830                 return -errno;
831         }
832
833         return 0;
834 }
835
836 static int setup_kmsg(const char *dest, int kmsg_socket) {
837         _cleanup_free_ char *from = NULL, *to = NULL;
838         int r, fd, k;
839         _cleanup_umask_ mode_t u;
840         union {
841                 struct cmsghdr cmsghdr;
842                 uint8_t buf[CMSG_SPACE(sizeof(int))];
843         } control = {};
844         struct msghdr mh = {
845                 .msg_control = &control,
846                 .msg_controllen = sizeof(control),
847         };
848         struct cmsghdr *cmsg;
849
850         assert(dest);
851         assert(kmsg_socket >= 0);
852
853         u = umask(0000);
854
855         /* We create the kmsg FIFO as /dev/kmsg, but immediately
856          * delete it after bind mounting it to /proc/kmsg. While FIFOs
857          * on the reading side behave very similar to /proc/kmsg,
858          * their writing side behaves differently from /dev/kmsg in
859          * that writing blocks when nothing is reading. In order to
860          * avoid any problems with containers deadlocking due to this
861          * we simply make /dev/kmsg unavailable to the container. */
862         if (asprintf(&from, "%s/dev/kmsg", dest) < 0 ||
863             asprintf(&to, "%s/proc/kmsg", dest) < 0)
864                 return log_oom();
865
866         if (mkfifo(from, 0600) < 0) {
867                 log_error("mkfifo() for /dev/kmsg failed: %m");
868                 return -errno;
869         }
870
871         r = chmod_and_chown(from, 0600, 0, 0);
872         if (r < 0) {
873                 log_error("Failed to correct access mode for /dev/kmsg: %s", strerror(-r));
874                 return r;
875         }
876
877         if (mount(from, to, "bind", MS_BIND, NULL) < 0) {
878                 log_error("Bind mount for /proc/kmsg failed: %m");
879                 return -errno;
880         }
881
882         fd = open(from, O_RDWR|O_NDELAY|O_CLOEXEC);
883         if (fd < 0) {
884                 log_error("Failed to open fifo: %m");
885                 return -errno;
886         }
887
888         cmsg = CMSG_FIRSTHDR(&mh);
889         cmsg->cmsg_level = SOL_SOCKET;
890         cmsg->cmsg_type = SCM_RIGHTS;
891         cmsg->cmsg_len = CMSG_LEN(sizeof(int));
892         memcpy(CMSG_DATA(cmsg), &fd, sizeof(int));
893
894         mh.msg_controllen = cmsg->cmsg_len;
895
896         /* Store away the fd in the socket, so that it stays open as
897          * long as we run the child */
898         k = sendmsg(kmsg_socket, &mh, MSG_DONTWAIT|MSG_NOSIGNAL);
899         close_nointr_nofail(fd);
900
901         if (k < 0) {
902                 log_error("Failed to send FIFO fd: %m");
903                 return -errno;
904         }
905
906         /* And now make the FIFO unavailable as /dev/kmsg... */
907         unlink(from);
908         return 0;
909 }
910
911 static int setup_hostname(void) {
912
913         if (arg_share_system)
914                 return 0;
915
916         if (sethostname(arg_machine, strlen(arg_machine)) < 0)
917                 return -errno;
918
919         return 0;
920 }
921
922 static int setup_journal(const char *directory) {
923         sd_id128_t machine_id, this_id;
924         _cleanup_free_ char *p = NULL, *b = NULL, *q = NULL, *d = NULL;
925         char *id;
926         int r;
927
928         p = strappend(directory, "/etc/machine-id");
929         if (!p)
930                 return log_oom();
931
932         r = read_one_line_file(p, &b);
933         if (r == -ENOENT && arg_link_journal == LINK_AUTO)
934                 return 0;
935         else if (r < 0) {
936                 log_error("Failed to read machine ID from %s: %s", p, strerror(-r));
937                 return r;
938         }
939
940         id = strstrip(b);
941         if (isempty(id) && arg_link_journal == LINK_AUTO)
942                 return 0;
943
944         /* Verify validity */
945         r = sd_id128_from_string(id, &machine_id);
946         if (r < 0) {
947                 log_error("Failed to parse machine ID from %s: %s", p, strerror(-r));
948                 return r;
949         }
950
951         r = sd_id128_get_machine(&this_id);
952         if (r < 0) {
953                 log_error("Failed to retrieve machine ID: %s", strerror(-r));
954                 return r;
955         }
956
957         if (sd_id128_equal(machine_id, this_id)) {
958                 log_full(arg_link_journal == LINK_AUTO ? LOG_WARNING : LOG_ERR,
959                          "Host and machine ids are equal (%s): refusing to link journals", id);
960                 if (arg_link_journal == LINK_AUTO)
961                         return 0;
962                 return
963                         -EEXIST;
964         }
965
966         if (arg_link_journal == LINK_NO)
967                 return 0;
968
969         free(p);
970         p = strappend("/var/log/journal/", id);
971         q = strjoin(directory, "/var/log/journal/", id, NULL);
972         if (!p || !q)
973                 return log_oom();
974
975         if (path_is_mount_point(p, false) > 0) {
976                 if (arg_link_journal != LINK_AUTO) {
977                         log_error("%s: already a mount point, refusing to use for journal", p);
978                         return -EEXIST;
979                 }
980
981                 return 0;
982         }
983
984         if (path_is_mount_point(q, false) > 0) {
985                 if (arg_link_journal != LINK_AUTO) {
986                         log_error("%s: already a mount point, refusing to use for journal", q);
987                         return -EEXIST;
988                 }
989
990                 return 0;
991         }
992
993         r = readlink_and_make_absolute(p, &d);
994         if (r >= 0) {
995                 if ((arg_link_journal == LINK_GUEST ||
996                      arg_link_journal == LINK_AUTO) &&
997                     path_equal(d, q)) {
998
999                         r = mkdir_p(q, 0755);
1000                         if (r < 0)
1001                                 log_warning("failed to create directory %s: %m", q);
1002                         return 0;
1003                 }
1004
1005                 if (unlink(p) < 0) {
1006                         log_error("Failed to remove symlink %s: %m", p);
1007                         return -errno;
1008                 }
1009         } else if (r == -EINVAL) {
1010
1011                 if (arg_link_journal == LINK_GUEST &&
1012                     rmdir(p) < 0) {
1013
1014                         if (errno == ENOTDIR) {
1015                                 log_error("%s already exists and is neither a symlink nor a directory", p);
1016                                 return r;
1017                         } else {
1018                                 log_error("Failed to remove %s: %m", p);
1019                                 return -errno;
1020                         }
1021                 }
1022         } else if (r != -ENOENT) {
1023                 log_error("readlink(%s) failed: %m", p);
1024                 return r;
1025         }
1026
1027         if (arg_link_journal == LINK_GUEST) {
1028
1029                 if (symlink(q, p) < 0) {
1030                         log_error("Failed to symlink %s to %s: %m", q, p);
1031                         return -errno;
1032                 }
1033
1034                 r = mkdir_p(q, 0755);
1035                 if (r < 0)
1036                         log_warning("failed to create directory %s: %m", q);
1037                 return 0;
1038         }
1039
1040         if (arg_link_journal == LINK_HOST) {
1041                 r = mkdir_p(p, 0755);
1042                 if (r < 0) {
1043                         log_error("Failed to create %s: %m", p);
1044                         return r;
1045                 }
1046
1047         } else if (access(p, F_OK) < 0)
1048                 return 0;
1049
1050         if (dir_is_empty(q) == 0) {
1051                 log_error("%s not empty.", q);
1052                 return -ENOTEMPTY;
1053         }
1054
1055         r = mkdir_p(q, 0755);
1056         if (r < 0) {
1057                 log_error("Failed to create %s: %m", q);
1058                 return r;
1059         }
1060
1061         if (mount(p, q, "bind", MS_BIND, NULL) < 0) {
1062                 log_error("Failed to bind mount journal from host into guest: %m");
1063                 return -errno;
1064         }
1065
1066         return 0;
1067 }
1068
1069 static int setup_kdbus(const char *dest, const char *path) {
1070         const char *p;
1071
1072         if (!path)
1073                 return 0;
1074
1075         p = strappenda(dest, "/dev/kdbus");
1076         if (mkdir(p, 0755) < 0) {
1077                 log_error("Failed to create kdbus path: %m");
1078                 return  -errno;
1079         }
1080
1081         if (mount(path, p, "bind", MS_BIND, NULL) < 0) {
1082                 log_error("Failed to mount kdbus domain path: %m");
1083                 return -errno;
1084         }
1085
1086         return 0;
1087 }
1088
1089 static int drop_capabilities(void) {
1090         return capability_bounding_set_drop(~arg_retain, false);
1091 }
1092
1093 static int register_machine(pid_t pid) {
1094         _cleanup_bus_error_free_ sd_bus_error error = SD_BUS_ERROR_NULL;
1095         _cleanup_bus_unref_ sd_bus *bus = NULL;
1096         int r;
1097
1098         if (!arg_register)
1099                 return 0;
1100
1101         r = sd_bus_default_system(&bus);
1102         if (r < 0) {
1103                 log_error("Failed to open system bus: %s", strerror(-r));
1104                 return r;
1105         }
1106
1107         if (arg_keep_unit) {
1108                 r = sd_bus_call_method(
1109                                 bus,
1110                                 "org.freedesktop.machine1",
1111                                 "/org/freedesktop/machine1",
1112                                 "org.freedesktop.machine1.Manager",
1113                                 "RegisterMachine",
1114                                 &error,
1115                                 NULL,
1116                                 "sayssus",
1117                                 arg_machine,
1118                                 SD_BUS_MESSAGE_APPEND_ID128(arg_uuid),
1119                                 "nspawn",
1120                                 "container",
1121                                 (uint32_t) pid,
1122                                 strempty(arg_directory));
1123         } else {
1124                 r = sd_bus_call_method(
1125                                 bus,
1126                                 "org.freedesktop.machine1",
1127                                 "/org/freedesktop/machine1",
1128                                 "org.freedesktop.machine1.Manager",
1129                                 "CreateMachine",
1130                                 &error,
1131                                 NULL,
1132                                 "sayssusa(sv)",
1133                                 arg_machine,
1134                                 SD_BUS_MESSAGE_APPEND_ID128(arg_uuid),
1135                                 "nspawn",
1136                                 "container",
1137                                 (uint32_t) pid,
1138                                 strempty(arg_directory),
1139                                 !isempty(arg_slice), "Slice", "s", arg_slice);
1140         }
1141
1142         if (r < 0) {
1143                 log_error("Failed to register machine: %s", bus_error_message(&error, r));
1144                 return r;
1145         }
1146
1147         return 0;
1148 }
1149
1150 static int terminate_machine(pid_t pid) {
1151         _cleanup_bus_error_free_ sd_bus_error error = SD_BUS_ERROR_NULL;
1152         _cleanup_bus_message_unref_ sd_bus_message *reply = NULL;
1153         _cleanup_bus_unref_ sd_bus *bus = NULL;
1154         const char *path;
1155         int r;
1156
1157         if (!arg_register)
1158                 return 0;
1159
1160         r = sd_bus_default_system(&bus);
1161         if (r < 0) {
1162                 log_error("Failed to open system bus: %s", strerror(-r));
1163                 return r;
1164         }
1165
1166         r = sd_bus_call_method(
1167                         bus,
1168                         "org.freedesktop.machine1",
1169                         "/org/freedesktop/machine1",
1170                         "org.freedesktop.machine1.Manager",
1171                         "GetMachineByPID",
1172                         &error,
1173                         &reply,
1174                         "u",
1175                         (uint32_t) pid);
1176         if (r < 0) {
1177                 /* Note that the machine might already have been
1178                  * cleaned up automatically, hence don't consider it a
1179                  * failure if we cannot get the machine object. */
1180                 log_debug("Failed to get machine: %s", bus_error_message(&error, r));
1181                 return 0;
1182         }
1183
1184         r = sd_bus_message_read(reply, "o", &path);
1185         if (r < 0)
1186                 return bus_log_parse_error(r);
1187
1188         r = sd_bus_call_method(
1189                         bus,
1190                         "org.freedesktop.machine1",
1191                         path,
1192                         "org.freedesktop.machine1.Machine",
1193                         "Terminate",
1194                         &error,
1195                         NULL,
1196                         NULL);
1197         if (r < 0) {
1198                 log_debug("Failed to terminate machine: %s", bus_error_message(&error, r));
1199                 return 0;
1200         }
1201
1202         return 0;
1203 }
1204
1205 static int reset_audit_loginuid(void) {
1206         _cleanup_free_ char *p = NULL;
1207         int r;
1208
1209         if (arg_share_system)
1210                 return 0;
1211
1212         r = read_one_line_file("/proc/self/loginuid", &p);
1213         if (r == -EEXIST)
1214                 return 0;
1215         if (r < 0) {
1216                 log_error("Failed to read /proc/self/loginuid: %s", strerror(-r));
1217                 return r;
1218         }
1219
1220         /* Already reset? */
1221         if (streq(p, "4294967295"))
1222                 return 0;
1223
1224         r = write_string_file("/proc/self/loginuid", "4294967295");
1225         if (r < 0) {
1226                 log_error("Failed to reset audit login UID. This probably means that your kernel is too\n"
1227                           "old and you have audit enabled. Note that the auditing subsystem is known to\n"
1228                           "be incompatible with containers on old kernels. Please make sure to upgrade\n"
1229                           "your kernel or to off auditing with 'audit=0' on the kernel command line before\n"
1230                           "using systemd-nspawn. Sleeping for 5s... (%s)\n", strerror(-r));
1231
1232                 sleep(5);
1233         }
1234
1235         return 0;
1236 }
1237
1238 int main(int argc, char *argv[]) {
1239         pid_t pid = 0;
1240         int r = EXIT_FAILURE, k;
1241         _cleanup_close_ int master = -1, kdbus_fd = -1, sync_fd = -1;
1242         int n_fd_passed;
1243         const char *console = NULL;
1244         sigset_t mask;
1245         _cleanup_close_pipe_ int kmsg_socket_pair[2] = { -1, -1 };
1246         _cleanup_fdset_free_ FDSet *fds = NULL;
1247         _cleanup_free_ char *kdbus_domain = NULL;
1248
1249         log_parse_environment();
1250         log_open();
1251
1252         k = parse_argv(argc, argv);
1253         if (k < 0)
1254                 goto finish;
1255         else if (k == 0) {
1256                 r = EXIT_SUCCESS;
1257                 goto finish;
1258         }
1259
1260         if (arg_directory) {
1261                 char *p;
1262
1263                 p = path_make_absolute_cwd(arg_directory);
1264                 free(arg_directory);
1265                 arg_directory = p;
1266         } else
1267                 arg_directory = get_current_dir_name();
1268
1269         if (!arg_directory) {
1270                 log_error("Failed to determine path, please use -D.");
1271                 goto finish;
1272         }
1273
1274         path_kill_slashes(arg_directory);
1275
1276         if (!arg_machine) {
1277                 arg_machine = strdup(basename(arg_directory));
1278                 if (!arg_machine) {
1279                         log_oom();
1280                         goto finish;
1281                 }
1282
1283                 hostname_cleanup(arg_machine, false);
1284                 if (isempty(arg_machine)) {
1285                         log_error("Failed to determine machine name automatically, please use -M.");
1286                         goto finish;
1287                 }
1288         }
1289
1290         if (geteuid() != 0) {
1291                 log_error("Need to be root.");
1292                 goto finish;
1293         }
1294
1295         if (sd_booted() <= 0) {
1296                 log_error("Not running on a systemd system.");
1297                 goto finish;
1298         }
1299
1300         if (path_equal(arg_directory, "/")) {
1301                 log_error("Spawning container on root directory not supported.");
1302                 goto finish;
1303         }
1304
1305         if (arg_boot && path_is_os_tree(arg_directory) <= 0) {
1306                 log_error("Directory %s doesn't look like an OS root directory (/etc/os-release is missing). Refusing.", arg_directory);
1307                 goto finish;
1308         }
1309
1310         log_close();
1311         n_fd_passed = sd_listen_fds(false);
1312         if (n_fd_passed > 0) {
1313                 k = fdset_new_listen_fds(&fds, false);
1314                 if (k < 0) {
1315                         log_error("Failed to collect file descriptors: %s", strerror(-k));
1316                         goto finish;
1317                 }
1318         }
1319         fdset_close_others(fds);
1320         log_open();
1321
1322         master = posix_openpt(O_RDWR|O_NOCTTY|O_CLOEXEC|O_NDELAY);
1323         if (master < 0) {
1324                 log_error("Failed to acquire pseudo tty: %m");
1325                 goto finish;
1326         }
1327
1328         console = ptsname(master);
1329         if (!console) {
1330                 log_error("Failed to determine tty name: %m");
1331                 goto finish;
1332         }
1333
1334         if (!arg_quiet)
1335                 log_info("Spawning container %s on %s. Press ^] three times within 1s to abort execution.", arg_machine, arg_directory);
1336
1337         if (unlockpt(master) < 0) {
1338                 log_error("Failed to unlock tty: %m");
1339                 goto finish;
1340         }
1341
1342
1343         if (access("/dev/kdbus/control", F_OK) >= 0) {
1344
1345                 if (arg_share_system) {
1346                         kdbus_domain = strdup("/dev/kdbus");
1347                         if (!kdbus_domain) {
1348                                 log_oom();
1349                                 goto finish;
1350                         }
1351                 } else {
1352                         const char *ns;
1353
1354                         ns = strappenda("machine-", arg_machine);
1355                         kdbus_fd = bus_kernel_create_domain(ns, &kdbus_domain);
1356                         if (r < 0)
1357                                 log_debug("Failed to create kdbus domain: %s", strerror(-r));
1358                         else
1359                                 log_debug("Successfully created kdbus domain as %s", kdbus_domain);
1360                 }
1361         }
1362
1363         if (socketpair(AF_UNIX, SOCK_DGRAM|SOCK_NONBLOCK|SOCK_CLOEXEC, 0, kmsg_socket_pair) < 0) {
1364                 log_error("Failed to create kmsg socket pair: %m");
1365                 goto finish;
1366         }
1367
1368         sd_notify(0, "READY=1");
1369
1370         assert_se(sigemptyset(&mask) == 0);
1371         sigset_add_many(&mask, SIGCHLD, SIGWINCH, SIGTERM, SIGINT, -1);
1372         assert_se(sigprocmask(SIG_BLOCK, &mask, NULL) == 0);
1373
1374         for (;;) {
1375                 siginfo_t status;
1376
1377                 sync_fd = eventfd(0, EFD_CLOEXEC);
1378                 if (sync_fd < 0) {
1379                         log_error("Failed to create event fd: %m");
1380                         goto finish;
1381                 }
1382
1383                 pid = syscall(__NR_clone,
1384                               SIGCHLD|CLONE_NEWNS|
1385                               (arg_share_system ? 0 : CLONE_NEWIPC|CLONE_NEWPID|CLONE_NEWUTS)|
1386                               (arg_private_network ? CLONE_NEWNET : 0), NULL);
1387                 if (pid < 0) {
1388                         if (errno == EINVAL)
1389                                 log_error("clone() failed, do you have namespace support enabled in your kernel? (You need UTS, IPC, PID and NET namespacing built in): %m");
1390                         else
1391                                 log_error("clone() failed: %m");
1392
1393                         goto finish;
1394                 }
1395
1396                 if (pid == 0) {
1397                         /* child */
1398                         const char *home = NULL;
1399                         uid_t uid = (uid_t) -1;
1400                         gid_t gid = (gid_t) -1;
1401                         unsigned n_env = 2;
1402                         const char *envp[] = {
1403                                 "PATH=" DEFAULT_PATH_SPLIT_USR,
1404                                 "container=systemd-nspawn", /* LXC sets container=lxc, so follow the scheme here */
1405                                 NULL, /* TERM */
1406                                 NULL, /* HOME */
1407                                 NULL, /* USER */
1408                                 NULL, /* LOGNAME */
1409                                 NULL, /* container_uuid */
1410                                 NULL, /* LISTEN_FDS */
1411                                 NULL, /* LISTEN_PID */
1412                                 NULL
1413                         };
1414                         char **env_use;
1415                         eventfd_t x;
1416
1417                         envp[n_env] = strv_find_prefix(environ, "TERM=");
1418                         if (envp[n_env])
1419                                 n_env ++;
1420
1421                         close_nointr_nofail(master);
1422                         master = -1;
1423
1424                         close_nointr(STDIN_FILENO);
1425                         close_nointr(STDOUT_FILENO);
1426                         close_nointr(STDERR_FILENO);
1427
1428                         close_nointr_nofail(kmsg_socket_pair[0]);
1429                         kmsg_socket_pair[0] = -1;
1430
1431                         reset_all_signal_handlers();
1432
1433                         assert_se(sigemptyset(&mask) == 0);
1434                         assert_se(sigprocmask(SIG_SETMASK, &mask, NULL) == 0);
1435
1436                         k = open_terminal(console, O_RDWR);
1437                         if (k != STDIN_FILENO) {
1438                                 if (k >= 0) {
1439                                         close_nointr_nofail(k);
1440                                         k = -EINVAL;
1441                                 }
1442
1443                                 log_error("Failed to open console: %s", strerror(-k));
1444                                 goto child_fail;
1445                         }
1446
1447                         if (dup2(STDIN_FILENO, STDOUT_FILENO) != STDOUT_FILENO ||
1448                             dup2(STDIN_FILENO, STDERR_FILENO) != STDERR_FILENO) {
1449                                 log_error("Failed to duplicate console: %m");
1450                                 goto child_fail;
1451                         }
1452
1453                         if (setsid() < 0) {
1454                                 log_error("setsid() failed: %m");
1455                                 goto child_fail;
1456                         }
1457
1458                         if (reset_audit_loginuid() < 0)
1459                                 goto child_fail;
1460
1461                         if (prctl(PR_SET_PDEATHSIG, SIGKILL) < 0) {
1462                                 log_error("PR_SET_PDEATHSIG failed: %m");
1463                                 goto child_fail;
1464                         }
1465
1466                         /* Mark everything as slave, so that we still
1467                          * receive mounts from the real root, but don't
1468                          * propagate mounts to the real root. */
1469                         if (mount(NULL, "/", NULL, MS_SLAVE|MS_REC, NULL) < 0) {
1470                                 log_error("MS_SLAVE|MS_REC failed: %m");
1471                                 goto child_fail;
1472                         }
1473
1474                         /* Turn directory into bind mount */
1475                         if (mount(arg_directory, arg_directory, "bind", MS_BIND|MS_REC, NULL) < 0) {
1476                                 log_error("Failed to make bind mount.");
1477                                 goto child_fail;
1478                         }
1479
1480                         if (arg_read_only)
1481                                 if (mount(arg_directory, arg_directory, "bind", MS_BIND|MS_REMOUNT|MS_RDONLY|MS_REC, NULL) < 0) {
1482                                         log_error("Failed to make read-only.");
1483                                         goto child_fail;
1484                                 }
1485
1486                         if (mount_all(arg_directory) < 0)
1487                                 goto child_fail;
1488
1489                         if (copy_devnodes(arg_directory) < 0)
1490                                 goto child_fail;
1491
1492                         if (setup_ptmx(arg_directory) < 0)
1493                                 goto child_fail;
1494
1495                         dev_setup(arg_directory);
1496
1497                         if (setup_dev_console(arg_directory, console) < 0)
1498                                 goto child_fail;
1499
1500                         if (setup_kmsg(arg_directory, kmsg_socket_pair[1]) < 0)
1501                                 goto child_fail;
1502
1503                         close_nointr_nofail(kmsg_socket_pair[1]);
1504                         kmsg_socket_pair[1] = -1;
1505
1506                         if (setup_boot_id(arg_directory) < 0)
1507                                 goto child_fail;
1508
1509                         if (setup_timezone(arg_directory) < 0)
1510                                 goto child_fail;
1511
1512                         if (setup_resolv_conf(arg_directory) < 0)
1513                                 goto child_fail;
1514
1515                         if (setup_journal(arg_directory) < 0)
1516                                 goto child_fail;
1517
1518                         if (mount_binds(arg_directory, arg_bind, 0) < 0)
1519                                 goto child_fail;
1520
1521                         if (mount_binds(arg_directory, arg_bind_ro, MS_RDONLY) < 0)
1522                                 goto child_fail;
1523
1524                         if (setup_kdbus(arg_directory, kdbus_domain) < 0)
1525                                 goto child_fail;
1526
1527                         if (chdir(arg_directory) < 0) {
1528                                 log_error("chdir(%s) failed: %m", arg_directory);
1529                                 goto child_fail;
1530                         }
1531
1532                         if (mount(arg_directory, "/", NULL, MS_MOVE, NULL) < 0) {
1533                                 log_error("mount(MS_MOVE) failed: %m");
1534                                 goto child_fail;
1535                         }
1536
1537                         if (chroot(".") < 0) {
1538                                 log_error("chroot() failed: %m");
1539                                 goto child_fail;
1540                         }
1541
1542                         if (chdir("/") < 0) {
1543                                 log_error("chdir() failed: %m");
1544                                 goto child_fail;
1545                         }
1546
1547                         umask(0022);
1548
1549                         if (arg_private_network)
1550                                 loopback_setup();
1551
1552                         if (drop_capabilities() < 0) {
1553                                 log_error("drop_capabilities() failed: %m");
1554                                 goto child_fail;
1555                         }
1556
1557                         if (arg_user) {
1558
1559                                 /* Note that this resolves user names
1560                                  * inside the container, and hence
1561                                  * accesses the NSS modules from the
1562                                  * container and not the host. This is
1563                                  * a bit weird... */
1564
1565                                 if (get_user_creds((const char**)&arg_user, &uid, &gid, &home, NULL) < 0) {
1566                                         log_error("get_user_creds() failed: %m");
1567                                         goto child_fail;
1568                                 }
1569
1570                                 if (mkdir_parents_label(home, 0775) < 0) {
1571                                         log_error("mkdir_parents_label() failed: %m");
1572                                         goto child_fail;
1573                                 }
1574
1575                                 if (mkdir_safe_label(home, 0775, uid, gid) < 0) {
1576                                         log_error("mkdir_safe_label() failed: %m");
1577                                         goto child_fail;
1578                                 }
1579
1580                                 if (initgroups((const char*)arg_user, gid) < 0) {
1581                                         log_error("initgroups() failed: %m");
1582                                         goto child_fail;
1583                                 }
1584
1585                                 if (setresgid(gid, gid, gid) < 0) {
1586                                         log_error("setregid() failed: %m");
1587                                         goto child_fail;
1588                                 }
1589
1590                                 if (setresuid(uid, uid, uid) < 0) {
1591                                         log_error("setreuid() failed: %m");
1592                                         goto child_fail;
1593                                 }
1594                         } else {
1595                                 /* Reset everything fully to 0, just in case */
1596
1597                                 if (setgroups(0, NULL) < 0) {
1598                                         log_error("setgroups() failed: %m");
1599                                         goto child_fail;
1600                                 }
1601
1602                                 if (setresgid(0, 0, 0) < 0) {
1603                                         log_error("setregid() failed: %m");
1604                                         goto child_fail;
1605                                 }
1606
1607                                 if (setresuid(0, 0, 0) < 0) {
1608                                         log_error("setreuid() failed: %m");
1609                                         goto child_fail;
1610                                 }
1611                         }
1612
1613                         if ((asprintf((char**)(envp + n_env++), "HOME=%s", home ? home: "/root") < 0) ||
1614                             (asprintf((char**)(envp + n_env++), "USER=%s", arg_user ? arg_user : "root") < 0) ||
1615                             (asprintf((char**)(envp + n_env++), "LOGNAME=%s", arg_user ? arg_user : "root") < 0)) {
1616                                 log_oom();
1617                                 goto child_fail;
1618                         }
1619
1620                         if (!sd_id128_equal(arg_uuid, SD_ID128_NULL)) {
1621                                 if (asprintf((char**)(envp + n_env++), "container_uuid=" SD_ID128_FORMAT_STR, SD_ID128_FORMAT_VAL(arg_uuid)) < 0) {
1622                                         log_oom();
1623                                         goto child_fail;
1624                                 }
1625                         }
1626
1627                         if (fdset_size(fds) > 0) {
1628                                 k = fdset_cloexec(fds, false);
1629                                 if (k < 0) {
1630                                         log_error("Failed to unset O_CLOEXEC for file descriptors.");
1631                                         goto child_fail;
1632                                 }
1633
1634                                 if ((asprintf((char **)(envp + n_env++), "LISTEN_FDS=%u", n_fd_passed) < 0) ||
1635                                     (asprintf((char **)(envp + n_env++), "LISTEN_PID=1") < 0)) {
1636                                         log_oom();
1637                                         goto child_fail;
1638                                 }
1639                         }
1640
1641                         setup_hostname();
1642
1643                         eventfd_read(sync_fd, &x);
1644                         close_nointr_nofail(sync_fd);
1645                         sync_fd = -1;
1646
1647                         if (!strv_isempty(arg_setenv)) {
1648                                 char **n;
1649
1650                                 n = strv_env_merge(2, envp, arg_setenv);
1651                                 if (!n) {
1652                                         log_oom();
1653                                         goto child_fail;
1654                                 }
1655
1656                                 env_use = n;
1657                         } else
1658                                 env_use = (char**) envp;
1659
1660 #ifdef HAVE_SELINUX
1661                         if (arg_selinux_context)
1662                                 if (setexeccon(arg_selinux_context) < 0)
1663                                         log_error("setexeccon(\"%s\") failed: %m", arg_selinux_context);
1664 #endif
1665                         if (arg_boot) {
1666                                 char **a;
1667                                 size_t l;
1668
1669                                 /* Automatically search for the init system */
1670
1671                                 l = 1 + argc - optind;
1672                                 a = newa(char*, l + 1);
1673                                 memcpy(a + 1, argv + optind, l * sizeof(char*));
1674
1675                                 a[0] = (char*) "/usr/lib/systemd/systemd";
1676                                 execve(a[0], a, env_use);
1677
1678                                 a[0] = (char*) "/lib/systemd/systemd";
1679                                 execve(a[0], a, env_use);
1680
1681                                 a[0] = (char*) "/sbin/init";
1682                                 execve(a[0], a, env_use);
1683                         } else if (argc > optind)
1684                                 execvpe(argv[optind], argv + optind, env_use);
1685                         else {
1686                                 chdir(home ? home : "/root");
1687                                 execle("/bin/bash", "-bash", NULL, env_use);
1688                         }
1689
1690                         log_error("execv() failed: %m");
1691
1692                 child_fail:
1693                         _exit(EXIT_FAILURE);
1694                 }
1695
1696                 fdset_free(fds);
1697                 fds = NULL;
1698
1699                 r = register_machine(pid);
1700                 if (r < 0)
1701                         goto finish;
1702
1703                 eventfd_write(sync_fd, 1);
1704                 close_nointr_nofail(sync_fd);
1705                 sync_fd = -1;
1706
1707                 k = process_pty(master, &mask, arg_boot ? pid : 0, SIGRTMIN+3);
1708                 if (k < 0) {
1709                         r = EXIT_FAILURE;
1710                         break;
1711                 }
1712
1713                 if (!arg_quiet)
1714                         putc('\n', stdout);
1715
1716                 /* Kill if it is not dead yet anyway */
1717                 terminate_machine(pid);
1718
1719                 /* Redundant, but better safe than sorry */
1720                 kill(pid, SIGKILL);
1721
1722                 k = wait_for_terminate(pid, &status);
1723                 pid = 0;
1724
1725                 if (k < 0) {
1726                         r = EXIT_FAILURE;
1727                         break;
1728                 }
1729
1730                 if (status.si_code == CLD_EXITED) {
1731                         r = status.si_status;
1732                         if (status.si_status != 0) {
1733                                 log_error("Container %s failed with error code %i.", arg_machine, status.si_status);
1734                                 break;
1735                         }
1736
1737                         if (!arg_quiet)
1738                                 log_debug("Container %s exited successfully.", arg_machine);
1739                         break;
1740                 } else if (status.si_code == CLD_KILLED &&
1741                            status.si_status == SIGINT) {
1742
1743                         if (!arg_quiet)
1744                                 log_info("Container %s has been shut down.", arg_machine);
1745                         r = 0;
1746                         break;
1747                 } else if (status.si_code == CLD_KILLED &&
1748                            status.si_status == SIGHUP) {
1749
1750                         if (!arg_quiet)
1751                                 log_info("Container %s is being rebooted.", arg_machine);
1752                         continue;
1753                 } else if (status.si_code == CLD_KILLED ||
1754                            status.si_code == CLD_DUMPED) {
1755
1756                         log_error("Container %s terminated by signal %s.", arg_machine, signal_to_string(status.si_status));
1757                         r = EXIT_FAILURE;
1758                         break;
1759                 } else {
1760                         log_error("Container %s failed due to unknown reason.", arg_machine);
1761                         r = EXIT_FAILURE;
1762                         break;
1763                 }
1764         }
1765
1766 finish:
1767         if (pid > 0)
1768                 kill(pid, SIGKILL);
1769
1770         free(arg_directory);
1771         free(arg_machine);
1772         free(arg_setenv);
1773
1774         return r;
1775 }