chiark / gitweb /
97ef6c799d4281705e97e4533c6f60feb7bda604
[elogind.git] / src / nspawn / nspawn.c
1 /*-*- Mode: C; c-basic-offset: 8; indent-tabs-mode: nil -*-*/
2
3 /***
4   This file is part of systemd.
5
6   Copyright 2010 Lennart Poettering
7
8   systemd is free software; you can redistribute it and/or modify it
9   under the terms of the GNU Lesser General Public License as published by
10   the Free Software Foundation; either version 2.1 of the License, or
11   (at your option) any later version.
12
13   systemd is distributed in the hope that it will be useful, but
14   WITHOUT ANY WARRANTY; without even the implied warranty of
15   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
16   Lesser General Public License for more details.
17
18   You should have received a copy of the GNU Lesser General Public License
19   along with systemd; If not, see <http://www.gnu.org/licenses/>.
20 ***/
21
22 #include <signal.h>
23 #include <sched.h>
24 #include <unistd.h>
25 #include <sys/types.h>
26 #include <sys/syscall.h>
27 #include <sys/mount.h>
28 #include <sys/wait.h>
29 #include <stdlib.h>
30 #include <string.h>
31 #include <stdio.h>
32 #include <errno.h>
33 #include <sys/prctl.h>
34 #include <sys/capability.h>
35 #include <getopt.h>
36 #include <termios.h>
37 #include <sys/signalfd.h>
38 #include <grp.h>
39 #include <linux/fs.h>
40 #include <sys/un.h>
41 #include <sys/socket.h>
42 #include <linux/netlink.h>
43 #include <sys/eventfd.h>
44 #ifdef HAVE_SELINUX
45 #include <selinux/selinux.h>
46 #endif
47
48 #include "sd-daemon.h"
49 #include "sd-bus.h"
50 #include "sd-id128.h"
51 #include "log.h"
52 #include "util.h"
53 #include "mkdir.h"
54 #include "macro.h"
55 #include "audit.h"
56 #include "missing.h"
57 #include "cgroup-util.h"
58 #include "strv.h"
59 #include "path-util.h"
60 #include "loopback-setup.h"
61 #include "dev-setup.h"
62 #include "fdset.h"
63 #include "build.h"
64 #include "fileio.h"
65 #include "bus-util.h"
66 #include "bus-error.h"
67 #include "ptyfwd.h"
68 #include "bus-kernel.h"
69 #include "env-util.h"
70 #include "def.h"
71
/* Journal linking mode, selected with --link-journal= (see parse_argv()).
 * The numeric values are the ones the compiler would assign implicitly;
 * they are spelled out only for readability. */
typedef enum LinkJournal {
        LINK_NO    = 0,  /* --link-journal=no */
        LINK_AUTO  = 1,  /* --link-journal=auto; initial value of arg_link_journal */
        LINK_HOST  = 2,  /* --link-journal=host */
        LINK_GUEST = 3   /* --link-journal=guest, also selected by -j */
} LinkJournal;
78
/* Command-line state. Everything below is filled in by parse_argv() and
 * read by the rest of the file. */

/* -D/--directory=: canonicalized container root (heap allocated). */
static char *arg_directory = NULL;
/* -u/--user=: heap-allocated copy of the option argument. */
static char *arg_user = NULL;
/* --uuid=: parsed with sd_id128_from_string(). */
static sd_id128_t arg_uuid = {};
/* -M/--machine=: validated host name, or NULL if unset/reset with "". */
static char *arg_machine = NULL;
/* -Z and -L: these point straight at optarg (i.e. into argv), they are
 * not copied and must not be freed. */
static char *arg_selinux_context = NULL;
static char *arg_selinux_apifs_context = NULL;
/* -S/--slice=: heap-allocated copy of the option argument. */
static const char *arg_slice = NULL;
static bool arg_private_network = false;
static bool arg_read_only = false;
static bool arg_boot = false;
static LinkJournal arg_link_journal = LINK_AUTO;
/* Bit mask of capabilities to retain, one bit per capability number.
 * --capability= sets additional bits, --drop-capability= clears bits. */
static uint64_t arg_retain =
        (1ULL << CAP_CHOWN) |
        (1ULL << CAP_DAC_OVERRIDE) |
        (1ULL << CAP_DAC_READ_SEARCH) |
        (1ULL << CAP_FOWNER) |
        (1ULL << CAP_FSETID) |
        (1ULL << CAP_IPC_OWNER) |
        (1ULL << CAP_KILL) |
        (1ULL << CAP_LEASE) |
        (1ULL << CAP_LINUX_IMMUTABLE) |
        (1ULL << CAP_NET_BIND_SERVICE) |
        (1ULL << CAP_NET_BROADCAST) |
        (1ULL << CAP_NET_RAW) |
        (1ULL << CAP_SETGID) |
        (1ULL << CAP_SETFCAP) |
        (1ULL << CAP_SETPCAP) |
        (1ULL << CAP_SETUID) |
        (1ULL << CAP_SYS_ADMIN) |
        (1ULL << CAP_SYS_CHROOT) |
        (1ULL << CAP_SYS_NICE) |
        (1ULL << CAP_SYS_PTRACE) |
        (1ULL << CAP_SYS_TTY_CONFIG) |
        (1ULL << CAP_SYS_RESOURCE) |
        (1ULL << CAP_SYS_BOOT) |
        (1ULL << CAP_AUDIT_WRITE) |
        (1ULL << CAP_AUDIT_CONTROL) |
        (1ULL << CAP_MKNOD);
/* --bind= / --bind-ro=: string vectors holding consecutive
 * (source, destination) pairs; consumed pairwise by mount_binds(). */
static char **arg_bind = NULL;
static char **arg_bind_ro = NULL;
/* --setenv=: NAME=VALUE assignments. */
static char **arg_setenv = NULL;
static bool arg_quiet = false;
static bool arg_share_system = false;
/* --register=; forced to false by parse_argv() when --share-system is set. */
static bool arg_register = true;
static bool arg_keep_unit = false;
124
/* Print the command-line usage summary to stdout.
 *
 * Always returns 0, which parse_argv() passes on to its caller when -h is
 * given.
 *
 * Note: -j is documented as --link-journal=guest because that is what
 * parse_argv() actually selects (LINK_GUEST). */
static int help(void) {

        printf("%s [OPTIONS...] [PATH] [ARGUMENTS...]\n\n"
               "Spawn a minimal namespace container for debugging, testing and building.\n\n"
               "  -h --help                 Show this help\n"
               "     --version              Print version string\n"
               "  -D --directory=NAME       Root directory for the container\n"
               "  -b --boot                 Boot up full system (i.e. invoke init)\n"
               "  -u --user=USER            Run the command under specified user or uid\n"
               "     --uuid=UUID            Set a specific machine UUID for the container\n"
               "  -M --machine=NAME         Set the machine name for the container\n"
               "  -S --slice=SLICE          Place the container in the specified slice\n"
               "  -Z --selinux-context=SECLABEL\n"
               "                            Set the SELinux security context to be used by\n"
               "                            processes in the container\n"
               "  -L --selinux-apifs-context=SECLABEL\n"
               "                            Set the SELinux security context to be used by\n"
               "                            API/tmpfs file systems in the container\n"
               "     --private-network      Disable network in container\n"
               "     --share-system         Share system namespaces with host\n"
               "     --read-only            Mount the root directory read-only\n"
               "     --capability=CAP       In addition to the default, retain specified\n"
               "                            capability\n"
               "     --drop-capability=CAP  Drop the specified capability from the default set\n"
               "     --link-journal=MODE    Link up guest journal, one of no, auto, guest, host\n"
               "  -j                        Equivalent to --link-journal=guest\n"
               "     --bind=PATH[:PATH]     Bind mount a file or directory from the host into\n"
               "                            the container\n"
               "     --bind-ro=PATH[:PATH]  Similar, but creates a read-only bind mount\n"
               "     --setenv=NAME=VALUE    Pass an environment variable to PID 1\n"
               "     --register=BOOLEAN     Register container as machine\n"
               "     --keep-unit            Do not register a scope for the machine, reuse\n"
               "                            the service unit nspawn is running in\n"
               "  -q --quiet                Do not show status information\n",
               program_invocation_short_name);

        return 0;
}
163
164 static int parse_argv(int argc, char *argv[]) {
165
166         enum {
167                 ARG_VERSION = 0x100,
168                 ARG_PRIVATE_NETWORK,
169                 ARG_UUID,
170                 ARG_READ_ONLY,
171                 ARG_CAPABILITY,
172                 ARG_DROP_CAPABILITY,
173                 ARG_LINK_JOURNAL,
174                 ARG_BIND,
175                 ARG_BIND_RO,
176                 ARG_SETENV,
177                 ARG_SHARE_SYSTEM,
178                 ARG_REGISTER,
179                 ARG_KEEP_UNIT
180         };
181
182         static const struct option options[] = {
183                 { "help",                  no_argument,       NULL, 'h'                 },
184                 { "version",               no_argument,       NULL, ARG_VERSION         },
185                 { "directory",             required_argument, NULL, 'D'                 },
186                 { "user",                  required_argument, NULL, 'u'                 },
187                 { "private-network",       no_argument,       NULL, ARG_PRIVATE_NETWORK },
188                 { "boot",                  no_argument,       NULL, 'b'                 },
189                 { "uuid",                  required_argument, NULL, ARG_UUID            },
190                 { "read-only",             no_argument,       NULL, ARG_READ_ONLY       },
191                 { "capability",            required_argument, NULL, ARG_CAPABILITY      },
192                 { "drop-capability",       required_argument, NULL, ARG_DROP_CAPABILITY },
193                 { "link-journal",          required_argument, NULL, ARG_LINK_JOURNAL    },
194                 { "bind",                  required_argument, NULL, ARG_BIND            },
195                 { "bind-ro",               required_argument, NULL, ARG_BIND_RO         },
196                 { "machine",               required_argument, NULL, 'M'                 },
197                 { "slice",                 required_argument, NULL, 'S'                 },
198                 { "setenv",                required_argument, NULL, ARG_SETENV          },
199                 { "selinux-context",       required_argument, NULL, 'Z'                 },
200                 { "selinux-apifs-context", required_argument, NULL, 'L'                 },
201                 { "quiet",                 no_argument,       NULL, 'q'                 },
202                 { "share-system",          no_argument,       NULL, ARG_SHARE_SYSTEM    },
203                 { "register",              required_argument, NULL, ARG_REGISTER        },
204                 { "keep-unit",             no_argument,       NULL, ARG_KEEP_UNIT       },
205                 {}
206         };
207
208         int c, r;
209
210         assert(argc >= 0);
211         assert(argv);
212
213         while ((c = getopt_long(argc, argv, "+hD:u:bL:M:jS:Z:q", options, NULL)) >= 0) {
214
215                 switch (c) {
216
217                 case 'h':
218                         return help();
219
220                 case ARG_VERSION:
221                         puts(PACKAGE_STRING);
222                         puts(SYSTEMD_FEATURES);
223                         return 0;
224
225                 case 'D':
226                         free(arg_directory);
227                         arg_directory = canonicalize_file_name(optarg);
228                         if (!arg_directory) {
229                                 log_error("Invalid root directory: %m");
230                                 return -ENOMEM;
231                         }
232
233                         break;
234
235                 case 'u':
236                         free(arg_user);
237                         arg_user = strdup(optarg);
238                         if (!arg_user)
239                                 return log_oom();
240
241                         break;
242
243                 case ARG_PRIVATE_NETWORK:
244                         arg_private_network = true;
245                         break;
246
247                 case 'b':
248                         arg_boot = true;
249                         break;
250
251                 case ARG_UUID:
252                         r = sd_id128_from_string(optarg, &arg_uuid);
253                         if (r < 0) {
254                                 log_error("Invalid UUID: %s", optarg);
255                                 return r;
256                         }
257                         break;
258
259                 case 'S':
260                         arg_slice = strdup(optarg);
261                         if (!arg_slice)
262                                 return log_oom();
263
264                         break;
265
266                 case 'M':
267                         if (isempty(optarg)) {
268                                 free(arg_machine);
269                                 arg_machine = NULL;
270                         } else {
271
272                                 if (!hostname_is_valid(optarg)) {
273                                         log_error("Invalid machine name: %s", optarg);
274                                         return -EINVAL;
275                                 }
276
277                                 free(arg_machine);
278                                 arg_machine = strdup(optarg);
279                                 if (!arg_machine)
280                                         return log_oom();
281
282                                 break;
283                         }
284
285                 case 'Z':
286                         arg_selinux_context = optarg;
287                         break;
288
289                 case 'L':
290                         arg_selinux_apifs_context = optarg;
291                         break;
292
293                 case ARG_READ_ONLY:
294                         arg_read_only = true;
295                         break;
296
297                 case ARG_CAPABILITY:
298                 case ARG_DROP_CAPABILITY: {
299                         char *state, *word;
300                         size_t length;
301
302                         FOREACH_WORD_SEPARATOR(word, length, optarg, ",", state) {
303                                 cap_value_t cap;
304                                 char *t;
305
306                                 t = strndup(word, length);
307                                 if (!t)
308                                         return log_oom();
309
310                                 if (cap_from_name(t, &cap) < 0) {
311                                         log_error("Failed to parse capability %s.", t);
312                                         free(t);
313                                         return -EINVAL;
314                                 }
315
316                                 free(t);
317
318                                 if (c == ARG_CAPABILITY)
319                                         arg_retain |= 1ULL << (uint64_t) cap;
320                                 else
321                                         arg_retain &= ~(1ULL << (uint64_t) cap);
322                         }
323
324                         break;
325                 }
326
327                 case 'j':
328                         arg_link_journal = LINK_GUEST;
329                         break;
330
331                 case ARG_LINK_JOURNAL:
332                         if (streq(optarg, "auto"))
333                                 arg_link_journal = LINK_AUTO;
334                         else if (streq(optarg, "no"))
335                                 arg_link_journal = LINK_NO;
336                         else if (streq(optarg, "guest"))
337                                 arg_link_journal = LINK_GUEST;
338                         else if (streq(optarg, "host"))
339                                 arg_link_journal = LINK_HOST;
340                         else {
341                                 log_error("Failed to parse link journal mode %s", optarg);
342                                 return -EINVAL;
343                         }
344
345                         break;
346
347                 case ARG_BIND:
348                 case ARG_BIND_RO: {
349                         _cleanup_free_ char *a = NULL, *b = NULL;
350                         char *e;
351                         char ***x;
352
353                         x = c == ARG_BIND ? &arg_bind : &arg_bind_ro;
354
355                         e = strchr(optarg, ':');
356                         if (e) {
357                                 a = strndup(optarg, e - optarg);
358                                 b = strdup(e + 1);
359                         } else {
360                                 a = strdup(optarg);
361                                 b = strdup(optarg);
362                         }
363
364                         if (!a || !b)
365                                 return log_oom();
366
367                         if (!path_is_absolute(a) || !path_is_absolute(b)) {
368                                 log_error("Invalid bind mount specification: %s", optarg);
369                                 return -EINVAL;
370                         }
371
372                         r = strv_extend(x, a);
373                         if (r < 0)
374                                 return log_oom();
375
376                         r = strv_extend(x, b);
377                         if (r < 0)
378                                 return log_oom();
379
380                         break;
381                 }
382
383                 case ARG_SETENV: {
384                         char **n;
385
386                         if (!env_assignment_is_valid(optarg)) {
387                                 log_error("Environment variable assignment '%s' is not valid.", optarg);
388                                 return -EINVAL;
389                         }
390
391                         n = strv_env_set(arg_setenv, optarg);
392                         if (!n)
393                                 return log_oom();
394
395                         strv_free(arg_setenv);
396                         arg_setenv = n;
397                         break;
398                 }
399
400                 case 'q':
401                         arg_quiet = true;
402                         break;
403
404                 case ARG_SHARE_SYSTEM:
405                         arg_share_system = true;
406                         break;
407
408                 case ARG_REGISTER:
409                         r = parse_boolean(optarg);
410                         if (r < 0) {
411                                 log_error("Failed to parse --register= argument: %s", optarg);
412                                 return r;
413                         }
414
415                         arg_register = r;
416                         break;
417
418                 case ARG_KEEP_UNIT:
419                         arg_keep_unit = true;
420                         break;
421
422                 case '?':
423                         return -EINVAL;
424
425                 default:
426                         assert_not_reached("Unhandled option");
427                 }
428         }
429
430         if (arg_share_system)
431                 arg_register = false;
432
433         if (arg_boot && arg_share_system) {
434                 log_error("--boot and --share-system may not be combined.");
435                 return -EINVAL;
436         }
437
438         if (arg_keep_unit && cg_pid_get_owner_uid(0, NULL) >= 0) {
439                 log_error("--keep-unit may not be used when invoked from a user session.");
440                 return -EINVAL;
441         }
442
443         return 1;
444 }
445
446 static int mount_all(const char *dest) {
447
448         typedef struct MountPoint {
449                 const char *what;
450                 const char *where;
451                 const char *type;
452                 const char *options;
453                 unsigned long flags;
454                 bool fatal;
455         } MountPoint;
456
457         static const MountPoint mount_table[] = {
458                 { "proc",      "/proc",     "proc",  NULL,       MS_NOSUID|MS_NOEXEC|MS_NODEV, true  },
459                 { "/proc/sys", "/proc/sys", NULL,    NULL,       MS_BIND, true                       },   /* Bind mount first */
460                 { NULL,        "/proc/sys", NULL,    NULL,       MS_BIND|MS_RDONLY|MS_REMOUNT, true  },   /* Then, make it r/o */
461                 { "sysfs",     "/sys",      "sysfs", NULL,       MS_RDONLY|MS_NOSUID|MS_NOEXEC|MS_NODEV, true  },
462                 { "tmpfs",     "/dev",      "tmpfs", "mode=755", MS_NOSUID|MS_STRICTATIME,     true  },
463                 { "devpts",    "/dev/pts",  "devpts","newinstance,ptmxmode=0666,mode=620,gid=" STRINGIFY(TTY_GID), MS_NOSUID|MS_NOEXEC, true },
464                 { "tmpfs",     "/dev/shm",  "tmpfs", "mode=1777", MS_NOSUID|MS_NODEV|MS_STRICTATIME, true  },
465                 { "tmpfs",     "/run",      "tmpfs", "mode=755", MS_NOSUID|MS_NODEV|MS_STRICTATIME, true  },
466 #ifdef HAVE_SELINUX
467                 { "/sys/fs/selinux", "/sys/fs/selinux", NULL, NULL, MS_BIND,                      false },  /* Bind mount first */
468                 { NULL,              "/sys/fs/selinux", NULL, NULL, MS_BIND|MS_RDONLY|MS_REMOUNT, false },  /* Then, make it r/o */
469 #endif
470         };
471
472         unsigned k;
473         int r = 0;
474
475         for (k = 0; k < ELEMENTSOF(mount_table); k++) {
476                 _cleanup_free_ char *where = NULL;
477 #ifdef HAVE_SELINUX
478                 _cleanup_free_ char *options = NULL;
479 #endif
480                 const char *o;
481                 int t;
482
483                 where = strjoin(dest, "/", mount_table[k].where, NULL);
484                 if (!where)
485                         return log_oom();
486
487                 t = path_is_mount_point(where, true);
488                 if (t < 0) {
489                         log_error("Failed to detect whether %s is a mount point: %s", where, strerror(-t));
490
491                         if (r == 0)
492                                 r = t;
493
494                         continue;
495                 }
496
497                 /* Skip this entry if it is not a remount. */
498                 if (mount_table[k].what && t > 0)
499                         continue;
500
501                 mkdir_p(where, 0755);
502
503 #ifdef HAVE_SELINUX
504                 if (arg_selinux_apifs_context &&
505                     (streq_ptr(mount_table[k].what, "tmpfs") || streq_ptr(mount_table[k].what, "devpts"))) {
506                         options = strjoin(mount_table[k].options, ",context=\"", arg_selinux_apifs_context, "\"", NULL);
507                         if (!options)
508                                 return log_oom();
509
510                         o = options;
511                 } else
512 #endif
513                         o = mount_table[k].options;
514
515
516                 if (mount(mount_table[k].what,
517                           where,
518                           mount_table[k].type,
519                           mount_table[k].flags,
520                           o) < 0 &&
521                     mount_table[k].fatal) {
522
523                         log_error("mount(%s) failed: %m", where);
524
525                         if (r == 0)
526                                 r = -errno;
527                 }
528         }
529
530         return r;
531 }
532
533 static int mount_binds(const char *dest, char **l, unsigned long flags) {
534         char **x, **y;
535
536         STRV_FOREACH_PAIR(x, y, l) {
537                 char *where;
538                 struct stat source_st, dest_st;
539                 int r;
540
541                 if (stat(*x, &source_st) < 0) {
542                         log_error("failed to stat %s: %m", *x);
543                         return -errno;
544                 }
545
546                 where = strappenda(dest, *y);
547                 r = stat(where, &dest_st);
548                 if (r == 0) {
549                         if ((source_st.st_mode & S_IFMT) != (dest_st.st_mode & S_IFMT)) {
550                                 log_error("The file types of %s and %s do not match. Refusing bind mount",
551                                                 *x, where);
552                                 return -EINVAL;
553                         }
554                 } else if (errno == ENOENT) {
555                         r = mkdir_parents_label(where, 0755);
556                         if (r < 0) {
557                                 log_error("Failed to bind mount %s: %s", *x, strerror(-r));
558                                 return r;
559                         }
560                 } else {
561                         log_error("Failed to bind mount %s: %s", *x, strerror(errno));
562                         return -errno;
563                 }
564                 /* Create the mount point, but be conservative -- refuse to create block
565                 * and char devices. */
566                 if (S_ISDIR(source_st.st_mode))
567                         mkdir_label(where, 0755);
568                 else if (S_ISFIFO(source_st.st_mode))
569                         mkfifo(where, 0644);
570                 else if (S_ISSOCK(source_st.st_mode))
571                         mknod(where, 0644 | S_IFSOCK, 0);
572                 else if (S_ISREG(source_st.st_mode))
573                         touch(where);
574                 else {
575                         log_error("Refusing to create mountpoint for file: %s", *x);
576                         return -ENOTSUP;
577                 }
578
579                 if (mount(*x, where, "bind", MS_BIND, NULL) < 0) {
580                         log_error("mount(%s) failed: %m", where);
581                         return -errno;
582                 }
583
584                 if (flags && mount(NULL, where, NULL, MS_REMOUNT|MS_BIND|flags, NULL) < 0) {
585                         log_error("mount(%s) failed: %m", where);
586                         return -errno;
587                 }
588         }
589
590         return 0;
591 }
592
593 static int setup_timezone(const char *dest) {
594         _cleanup_free_ char *where = NULL, *p = NULL, *q = NULL, *check = NULL, *what = NULL;
595         char *z, *y;
596         int r;
597
598         assert(dest);
599
600         /* Fix the timezone, if possible */
601         r = readlink_malloc("/etc/localtime", &p);
602         if (r < 0) {
603                 log_warning("/etc/localtime is not a symlink, not updating container timezone.");
604                 return 0;
605         }
606
607         z = path_startswith(p, "../usr/share/zoneinfo/");
608         if (!z)
609                 z = path_startswith(p, "/usr/share/zoneinfo/");
610         if (!z) {
611                 log_warning("/etc/localtime does not point into /usr/share/zoneinfo/, not updating container timezone.");
612                 return 0;
613         }
614
615         where = strappend(dest, "/etc/localtime");
616         if (!where)
617                 return log_oom();
618
619         r = readlink_malloc(where, &q);
620         if (r >= 0) {
621                 y = path_startswith(q, "../usr/share/zoneinfo/");
622                 if (!y)
623                         y = path_startswith(q, "/usr/share/zoneinfo/");
624
625
626                 /* Already pointing to the right place? Then do nothing .. */
627                 if (y && streq(y, z))
628                         return 0;
629         }
630
631         check = strjoin(dest, "/usr/share/zoneinfo/", z, NULL);
632         if (!check)
633                 return log_oom();
634
635         if (access(check, F_OK) < 0) {
636                 log_warning("Timezone %s does not exist in container, not updating container timezone.", z);
637                 return 0;
638         }
639
640         what = strappend("../usr/share/zoneinfo/", z);
641         if (!what)
642                 return log_oom();
643
644         unlink(where);
645         if (symlink(what, where) < 0) {
646                 log_error("Failed to correct timezone of container: %m");
647                 return 0;
648         }
649
650         return 0;
651 }
652
653 static int setup_resolv_conf(const char *dest) {
654         char _cleanup_free_ *where = NULL;
655
656         assert(dest);
657
658         if (arg_private_network)
659                 return 0;
660
661         /* Fix resolv.conf, if possible */
662         where = strappend(dest, "/etc/resolv.conf");
663         if (!where)
664                 return log_oom();
665
666         /* We don't really care for the results of this really. If it
667          * fails, it fails, but meh... */
668         copy_file("/etc/resolv.conf", where, O_TRUNC|O_NOFOLLOW);
669
670         return 0;
671 }
672
673 static int setup_boot_id(const char *dest) {
674         _cleanup_free_ char *from = NULL, *to = NULL;
675         sd_id128_t rnd;
676         char as_uuid[37];
677         int r;
678
679         assert(dest);
680
681         if (arg_share_system)
682                 return 0;
683
684         /* Generate a new randomized boot ID, so that each boot-up of
685          * the container gets a new one */
686
687         from = strappend(dest, "/dev/proc-sys-kernel-random-boot-id");
688         to = strappend(dest, "/proc/sys/kernel/random/boot_id");
689         if (!from || !to)
690                 return log_oom();
691
692         r = sd_id128_randomize(&rnd);
693         if (r < 0) {
694                 log_error("Failed to generate random boot id: %s", strerror(-r));
695                 return r;
696         }
697
698         snprintf(as_uuid, sizeof(as_uuid),
699                  "%02x%02x%02x%02x-%02x%02x-%02x%02x-%02x%02x-%02x%02x%02x%02x%02x%02x",
700                  SD_ID128_FORMAT_VAL(rnd));
701         char_array_0(as_uuid);
702
703         r = write_string_file(from, as_uuid);
704         if (r < 0) {
705                 log_error("Failed to write boot id: %s", strerror(-r));
706                 return r;
707         }
708
709         if (mount(from, to, "bind", MS_BIND, NULL) < 0) {
710                 log_error("Failed to bind mount boot id: %m");
711                 r = -errno;
712         } else if (mount(from, to, "bind", MS_BIND|MS_REMOUNT|MS_RDONLY, NULL))
713                 log_warning("Failed to make boot id read-only: %m");
714
715         unlink(from);
716         return r;
717 }
718
719 static int copy_devnodes(const char *dest) {
720
721         static const char devnodes[] =
722                 "null\0"
723                 "zero\0"
724                 "full\0"
725                 "random\0"
726                 "urandom\0"
727                 "tty\0";
728
729         const char *d;
730         int r = 0;
731         _cleanup_umask_ mode_t u;
732
733         assert(dest);
734
735         u = umask(0000);
736
737         NULSTR_FOREACH(d, devnodes) {
738                 _cleanup_free_ char *from = NULL, *to = NULL;
739                 struct stat st;
740
741                 from = strappend("/dev/", d);
742                 to = strjoin(dest, "/dev/", d, NULL);
743                 if (!from || !to)
744                         return log_oom();
745
746                 if (stat(from, &st) < 0) {
747
748                         if (errno != ENOENT) {
749                                 log_error("Failed to stat %s: %m", from);
750                                 return -errno;
751                         }
752
753                 } else if (!S_ISCHR(st.st_mode) && !S_ISBLK(st.st_mode)) {
754
755                         log_error("%s is not a char or block device, cannot copy", from);
756                         return -EIO;
757
758                 } else if (mknod(to, st.st_mode, st.st_rdev) < 0) {
759
760                         log_error("mknod(%s) failed: %m", dest);
761                         return  -errno;
762                 }
763         }
764
765         return r;
766 }
767
768 static int setup_ptmx(const char *dest) {
769         _cleanup_free_ char *p = NULL;
770
771         p = strappend(dest, "/dev/ptmx");
772         if (!p)
773                 return log_oom();
774
775         if (symlink("pts/ptmx", p) < 0) {
776                 log_error("Failed to create /dev/ptmx symlink: %m");
777                 return -errno;
778         }
779
780         return 0;
781 }
782
783 static int setup_dev_console(const char *dest, const char *console) {
784         struct stat st;
785         _cleanup_free_ char *to = NULL;
786         int r;
787         _cleanup_umask_ mode_t u;
788
789         assert(dest);
790         assert(console);
791
792         u = umask(0000);
793
794         if (stat(console, &st) < 0) {
795                 log_error("Failed to stat %s: %m", console);
796                 return -errno;
797
798         } else if (!S_ISCHR(st.st_mode)) {
799                 log_error("/dev/console is not a char device");
800                 return -EIO;
801         }
802
803         r = chmod_and_chown(console, 0600, 0, 0);
804         if (r < 0) {
805                 log_error("Failed to correct access mode for TTY: %s", strerror(-r));
806                 return r;
807         }
808
809         if (asprintf(&to, "%s/dev/console", dest) < 0)
810                 return log_oom();
811
812         /* We need to bind mount the right tty to /dev/console since
813          * ptys can only exist on pts file systems. To have something
814          * to bind mount things on we create a device node first, that
815          * has the right major/minor (note that the major minor
816          * doesn't actually matter here, since we mount it over
817          * anyway). */
818
819         if (mknod(to, (st.st_mode & ~07777) | 0600, st.st_rdev) < 0) {
820                 log_error("mknod() for /dev/console failed: %m");
821                 return -errno;
822         }
823
824         if (mount(console, to, "bind", MS_BIND, NULL) < 0) {
825                 log_error("Bind mount for /dev/console failed: %m");
826                 return -errno;
827         }
828
829         return 0;
830 }
831
832 static int setup_kmsg(const char *dest, int kmsg_socket) {
833         _cleanup_free_ char *from = NULL, *to = NULL;
834         int r, fd, k;
835         _cleanup_umask_ mode_t u;
836         union {
837                 struct cmsghdr cmsghdr;
838                 uint8_t buf[CMSG_SPACE(sizeof(int))];
839         } control = {};
840         struct msghdr mh = {
841                 .msg_control = &control,
842                 .msg_controllen = sizeof(control),
843         };
844         struct cmsghdr *cmsg;
845
846         assert(dest);
847         assert(kmsg_socket >= 0);
848
849         u = umask(0000);
850
851         /* We create the kmsg FIFO as /dev/kmsg, but immediately
852          * delete it after bind mounting it to /proc/kmsg. While FIFOs
853          * on the reading side behave very similar to /proc/kmsg,
854          * their writing side behaves differently from /dev/kmsg in
855          * that writing blocks when nothing is reading. In order to
856          * avoid any problems with containers deadlocking due to this
857          * we simply make /dev/kmsg unavailable to the container. */
858         if (asprintf(&from, "%s/dev/kmsg", dest) < 0 ||
859             asprintf(&to, "%s/proc/kmsg", dest) < 0)
860                 return log_oom();
861
862         if (mkfifo(from, 0600) < 0) {
863                 log_error("mkfifo() for /dev/kmsg failed: %m");
864                 return -errno;
865         }
866
867         r = chmod_and_chown(from, 0600, 0, 0);
868         if (r < 0) {
869                 log_error("Failed to correct access mode for /dev/kmsg: %s", strerror(-r));
870                 return r;
871         }
872
873         if (mount(from, to, "bind", MS_BIND, NULL) < 0) {
874                 log_error("Bind mount for /proc/kmsg failed: %m");
875                 return -errno;
876         }
877
878         fd = open(from, O_RDWR|O_NDELAY|O_CLOEXEC);
879         if (fd < 0) {
880                 log_error("Failed to open fifo: %m");
881                 return -errno;
882         }
883
884         cmsg = CMSG_FIRSTHDR(&mh);
885         cmsg->cmsg_level = SOL_SOCKET;
886         cmsg->cmsg_type = SCM_RIGHTS;
887         cmsg->cmsg_len = CMSG_LEN(sizeof(int));
888         memcpy(CMSG_DATA(cmsg), &fd, sizeof(int));
889
890         mh.msg_controllen = cmsg->cmsg_len;
891
892         /* Store away the fd in the socket, so that it stays open as
893          * long as we run the child */
894         k = sendmsg(kmsg_socket, &mh, MSG_DONTWAIT|MSG_NOSIGNAL);
895         close_nointr_nofail(fd);
896
897         if (k < 0) {
898                 log_error("Failed to send FIFO fd: %m");
899                 return -errno;
900         }
901
902         /* And now make the FIFO unavailable as /dev/kmsg... */
903         unlink(from);
904         return 0;
905 }
906
907 static int setup_hostname(void) {
908
909         if (arg_share_system)
910                 return 0;
911
912         if (sethostname(arg_machine, strlen(arg_machine)) < 0)
913                 return -errno;
914
915         return 0;
916 }
917
918 static int setup_journal(const char *directory) {
919         sd_id128_t machine_id, this_id;
920         _cleanup_free_ char *p = NULL, *b = NULL, *q = NULL, *d = NULL;
921         char *id;
922         int r;
923
924         p = strappend(directory, "/etc/machine-id");
925         if (!p)
926                 return log_oom();
927
928         r = read_one_line_file(p, &b);
929         if (r == -ENOENT && arg_link_journal == LINK_AUTO)
930                 return 0;
931         else if (r < 0) {
932                 log_error("Failed to read machine ID from %s: %s", p, strerror(-r));
933                 return r;
934         }
935
936         id = strstrip(b);
937         if (isempty(id) && arg_link_journal == LINK_AUTO)
938                 return 0;
939
940         /* Verify validity */
941         r = sd_id128_from_string(id, &machine_id);
942         if (r < 0) {
943                 log_error("Failed to parse machine ID from %s: %s", p, strerror(-r));
944                 return r;
945         }
946
947         r = sd_id128_get_machine(&this_id);
948         if (r < 0) {
949                 log_error("Failed to retrieve machine ID: %s", strerror(-r));
950                 return r;
951         }
952
953         if (sd_id128_equal(machine_id, this_id)) {
954                 log_full(arg_link_journal == LINK_AUTO ? LOG_WARNING : LOG_ERR,
955                          "Host and machine ids are equal (%s): refusing to link journals", id);
956                 if (arg_link_journal == LINK_AUTO)
957                         return 0;
958                 return
959                         -EEXIST;
960         }
961
962         if (arg_link_journal == LINK_NO)
963                 return 0;
964
965         free(p);
966         p = strappend("/var/log/journal/", id);
967         q = strjoin(directory, "/var/log/journal/", id, NULL);
968         if (!p || !q)
969                 return log_oom();
970
971         if (path_is_mount_point(p, false) > 0) {
972                 if (arg_link_journal != LINK_AUTO) {
973                         log_error("%s: already a mount point, refusing to use for journal", p);
974                         return -EEXIST;
975                 }
976
977                 return 0;
978         }
979
980         if (path_is_mount_point(q, false) > 0) {
981                 if (arg_link_journal != LINK_AUTO) {
982                         log_error("%s: already a mount point, refusing to use for journal", q);
983                         return -EEXIST;
984                 }
985
986                 return 0;
987         }
988
989         r = readlink_and_make_absolute(p, &d);
990         if (r >= 0) {
991                 if ((arg_link_journal == LINK_GUEST ||
992                      arg_link_journal == LINK_AUTO) &&
993                     path_equal(d, q)) {
994
995                         r = mkdir_p(q, 0755);
996                         if (r < 0)
997                                 log_warning("failed to create directory %s: %m", q);
998                         return 0;
999                 }
1000
1001                 if (unlink(p) < 0) {
1002                         log_error("Failed to remove symlink %s: %m", p);
1003                         return -errno;
1004                 }
1005         } else if (r == -EINVAL) {
1006
1007                 if (arg_link_journal == LINK_GUEST &&
1008                     rmdir(p) < 0) {
1009
1010                         if (errno == ENOTDIR) {
1011                                 log_error("%s already exists and is neither a symlink nor a directory", p);
1012                                 return r;
1013                         } else {
1014                                 log_error("Failed to remove %s: %m", p);
1015                                 return -errno;
1016                         }
1017                 }
1018         } else if (r != -ENOENT) {
1019                 log_error("readlink(%s) failed: %m", p);
1020                 return r;
1021         }
1022
1023         if (arg_link_journal == LINK_GUEST) {
1024
1025                 if (symlink(q, p) < 0) {
1026                         log_error("Failed to symlink %s to %s: %m", q, p);
1027                         return -errno;
1028                 }
1029
1030                 r = mkdir_p(q, 0755);
1031                 if (r < 0)
1032                         log_warning("failed to create directory %s: %m", q);
1033                 return 0;
1034         }
1035
1036         if (arg_link_journal == LINK_HOST) {
1037                 r = mkdir_p(p, 0755);
1038                 if (r < 0) {
1039                         log_error("Failed to create %s: %m", p);
1040                         return r;
1041                 }
1042
1043         } else if (access(p, F_OK) < 0)
1044                 return 0;
1045
1046         if (dir_is_empty(q) == 0) {
1047                 log_error("%s not empty.", q);
1048                 return -ENOTEMPTY;
1049         }
1050
1051         r = mkdir_p(q, 0755);
1052         if (r < 0) {
1053                 log_error("Failed to create %s: %m", q);
1054                 return r;
1055         }
1056
1057         if (mount(p, q, "bind", MS_BIND, NULL) < 0) {
1058                 log_error("Failed to bind mount journal from host into guest: %m");
1059                 return -errno;
1060         }
1061
1062         return 0;
1063 }
1064
/* Bind mounts the kdbus domain directory 'path' to <dest>/dev/kdbus.
 *
 * Does nothing and returns 0 when 'path' is NULL. Otherwise the mount
 * point is created with mode 0755 first. Returns 0 on success, or
 * -errno if mkdir() or mount() fails. */
static int setup_kdbus(const char *dest, const char *path) {
        const char *mount_point;

        if (path == NULL)
                return 0;

        mount_point = strappenda(dest, "/dev/kdbus");

        if (mkdir(mount_point, 0755) < 0) {
                log_error("Failed to create kdbus path: %m");
                return -errno;
        }

        if (mount(path, mount_point, "bind", MS_BIND, NULL) < 0) {
                log_error("Failed to mount kdbus domain path: %m");
                return -errno;
        }

        return 0;
}
1084
1085 static int drop_capabilities(void) {
1086         return capability_bounding_set_drop(~arg_retain, false);
1087 }
1088
1089 static int register_machine(pid_t pid) {
1090         _cleanup_bus_error_free_ sd_bus_error error = SD_BUS_ERROR_NULL;
1091         _cleanup_bus_unref_ sd_bus *bus = NULL;
1092         int r;
1093
1094         if (!arg_register)
1095                 return 0;
1096
1097         r = sd_bus_default_system(&bus);
1098         if (r < 0) {
1099                 log_error("Failed to open system bus: %s", strerror(-r));
1100                 return r;
1101         }
1102
1103         if (arg_keep_unit) {
1104                 r = sd_bus_call_method(
1105                                 bus,
1106                                 "org.freedesktop.machine1",
1107                                 "/org/freedesktop/machine1",
1108                                 "org.freedesktop.machine1.Manager",
1109                                 "RegisterMachine",
1110                                 &error,
1111                                 NULL,
1112                                 "sayssus",
1113                                 arg_machine,
1114                                 SD_BUS_MESSAGE_APPEND_ID128(arg_uuid),
1115                                 "nspawn",
1116                                 "container",
1117                                 (uint32_t) pid,
1118                                 strempty(arg_directory));
1119         } else {
1120                 r = sd_bus_call_method(
1121                                 bus,
1122                                 "org.freedesktop.machine1",
1123                                 "/org/freedesktop/machine1",
1124                                 "org.freedesktop.machine1.Manager",
1125                                 "CreateMachine",
1126                                 &error,
1127                                 NULL,
1128                                 "sayssusa(sv)",
1129                                 arg_machine,
1130                                 SD_BUS_MESSAGE_APPEND_ID128(arg_uuid),
1131                                 "nspawn",
1132                                 "container",
1133                                 (uint32_t) pid,
1134                                 strempty(arg_directory),
1135                                 !isempty(arg_slice), "Slice", "s", arg_slice);
1136         }
1137
1138         if (r < 0) {
1139                 log_error("Failed to register machine: %s", bus_error_message(&error, r));
1140                 return r;
1141         }
1142
1143         return 0;
1144 }
1145
1146 static int terminate_machine(pid_t pid) {
1147         _cleanup_bus_error_free_ sd_bus_error error = SD_BUS_ERROR_NULL;
1148         _cleanup_bus_message_unref_ sd_bus_message *reply = NULL;
1149         _cleanup_bus_unref_ sd_bus *bus = NULL;
1150         const char *path;
1151         int r;
1152
1153         if (!arg_register)
1154                 return 0;
1155
1156         r = sd_bus_default_system(&bus);
1157         if (r < 0) {
1158                 log_error("Failed to open system bus: %s", strerror(-r));
1159                 return r;
1160         }
1161
1162         r = sd_bus_call_method(
1163                         bus,
1164                         "org.freedesktop.machine1",
1165                         "/org/freedesktop/machine1",
1166                         "org.freedesktop.machine1.Manager",
1167                         "GetMachineByPID",
1168                         &error,
1169                         &reply,
1170                         "u",
1171                         (uint32_t) pid);
1172         if (r < 0) {
1173                 /* Note that the machine might already have been
1174                  * cleaned up automatically, hence don't consider it a
1175                  * failure if we cannot get the machine object. */
1176                 log_debug("Failed to get machine: %s", bus_error_message(&error, r));
1177                 return 0;
1178         }
1179
1180         r = sd_bus_message_read(reply, "o", &path);
1181         if (r < 0)
1182                 return bus_log_parse_error(r);
1183
1184         r = sd_bus_call_method(
1185                         bus,
1186                         "org.freedesktop.machine1",
1187                         path,
1188                         "org.freedesktop.machine1.Machine",
1189                         "Terminate",
1190                         &error,
1191                         NULL,
1192                         NULL);
1193         if (r < 0) {
1194                 log_debug("Failed to terminate machine: %s", bus_error_message(&error, r));
1195                 return 0;
1196         }
1197
1198         return 0;
1199 }
1200
/* Probes for the kernel audit subsystem by trying to open a
 * NETLINK_AUDIT socket. Returns true if the socket could be created
 * (it is closed again right away), false otherwise. */
static bool audit_enabled(void) {
        int probe_fd = socket(AF_NETLINK, SOCK_RAW, NETLINK_AUDIT);

        if (probe_fd < 0)
                return false;

        close_nointr_nofail(probe_fd);
        return true;
}
1211
1212 int main(int argc, char *argv[]) {
1213         pid_t pid = 0;
1214         int r = EXIT_FAILURE, k;
1215         _cleanup_close_ int master = -1, kdbus_fd = -1, sync_fd = -1;
1216         int n_fd_passed;
1217         const char *console = NULL;
1218         sigset_t mask;
1219         _cleanup_close_pipe_ int kmsg_socket_pair[2] = { -1, -1 };
1220         _cleanup_fdset_free_ FDSet *fds = NULL;
1221         _cleanup_free_ char *kdbus_domain = NULL;
1222
1223         log_parse_environment();
1224         log_open();
1225
1226         k = parse_argv(argc, argv);
1227         if (k < 0)
1228                 goto finish;
1229         else if (k == 0) {
1230                 r = EXIT_SUCCESS;
1231                 goto finish;
1232         }
1233
1234         if (arg_directory) {
1235                 char *p;
1236
1237                 p = path_make_absolute_cwd(arg_directory);
1238                 free(arg_directory);
1239                 arg_directory = p;
1240         } else
1241                 arg_directory = get_current_dir_name();
1242
1243         if (!arg_directory) {
1244                 log_error("Failed to determine path, please use -D.");
1245                 goto finish;
1246         }
1247
1248         path_kill_slashes(arg_directory);
1249
1250         if (!arg_machine) {
1251                 arg_machine = strdup(basename(arg_directory));
1252                 if (!arg_machine) {
1253                         log_oom();
1254                         goto finish;
1255                 }
1256
1257                 hostname_cleanup(arg_machine, false);
1258                 if (isempty(arg_machine)) {
1259                         log_error("Failed to determine machine name automatically, please use -M.");
1260                         goto finish;
1261                 }
1262         }
1263
1264         if (geteuid() != 0) {
1265                 log_error("Need to be root.");
1266                 goto finish;
1267         }
1268
1269         if (sd_booted() <= 0) {
1270                 log_error("Not running on a systemd system.");
1271                 goto finish;
1272         }
1273
1274         if (arg_boot && audit_enabled()) {
1275                 log_warning("The kernel auditing subsystem is known to be incompatible with containers.\n"
1276                             "Please make sure to turn off auditing with 'audit=0' on the kernel command\n"
1277                             "line before using systemd-nspawn. Sleeping for 5s...\n");
1278                 sleep(5);
1279         }
1280
1281         if (path_equal(arg_directory, "/")) {
1282                 log_error("Spawning container on root directory not supported.");
1283                 goto finish;
1284         }
1285
1286         if (arg_boot && path_is_os_tree(arg_directory) <= 0) {
1287                 log_error("Directory %s doesn't look like an OS root directory (/etc/os-release is missing). Refusing.", arg_directory);
1288                 goto finish;
1289         }
1290
1291         log_close();
1292         n_fd_passed = sd_listen_fds(false);
1293         if (n_fd_passed > 0) {
1294                 k = fdset_new_listen_fds(&fds, false);
1295                 if (k < 0) {
1296                         log_error("Failed to collect file descriptors: %s", strerror(-k));
1297                         goto finish;
1298                 }
1299         }
1300         fdset_close_others(fds);
1301         log_open();
1302
1303         master = posix_openpt(O_RDWR|O_NOCTTY|O_CLOEXEC|O_NDELAY);
1304         if (master < 0) {
1305                 log_error("Failed to acquire pseudo tty: %m");
1306                 goto finish;
1307         }
1308
1309         console = ptsname(master);
1310         if (!console) {
1311                 log_error("Failed to determine tty name: %m");
1312                 goto finish;
1313         }
1314
1315         if (!arg_quiet)
1316                 log_info("Spawning container %s on %s. Press ^] three times within 1s to abort execution.", arg_machine, arg_directory);
1317
1318         if (unlockpt(master) < 0) {
1319                 log_error("Failed to unlock tty: %m");
1320                 goto finish;
1321         }
1322
1323
1324         if (access("/dev/kdbus/control", F_OK) >= 0) {
1325
1326                 if (arg_share_system) {
1327                         kdbus_domain = strdup("/dev/kdbus");
1328                         if (!kdbus_domain) {
1329                                 log_oom();
1330                                 goto finish;
1331                         }
1332                 } else {
1333                         const char *ns;
1334
1335                         ns = strappenda("machine-", arg_machine);
1336                         kdbus_fd = bus_kernel_create_domain(ns, &kdbus_domain);
1337                         if (r < 0)
1338                                 log_debug("Failed to create kdbus domain: %s", strerror(-r));
1339                         else
1340                                 log_debug("Successfully created kdbus domain as %s", kdbus_domain);
1341                 }
1342         }
1343
1344         if (socketpair(AF_UNIX, SOCK_DGRAM|SOCK_NONBLOCK|SOCK_CLOEXEC, 0, kmsg_socket_pair) < 0) {
1345                 log_error("Failed to create kmsg socket pair: %m");
1346                 goto finish;
1347         }
1348
1349         sd_notify(0, "READY=1");
1350
1351         assert_se(sigemptyset(&mask) == 0);
1352         sigset_add_many(&mask, SIGCHLD, SIGWINCH, SIGTERM, SIGINT, -1);
1353         assert_se(sigprocmask(SIG_BLOCK, &mask, NULL) == 0);
1354
1355         for (;;) {
1356                 siginfo_t status;
1357
1358                 sync_fd = eventfd(0, EFD_CLOEXEC);
1359                 if (sync_fd < 0) {
1360                         log_error("Failed to create event fd: %m");
1361                         goto finish;
1362                 }
1363
1364                 pid = syscall(__NR_clone,
1365                               SIGCHLD|CLONE_NEWNS|
1366                               (arg_share_system ? 0 : CLONE_NEWIPC|CLONE_NEWPID|CLONE_NEWUTS)|
1367                               (arg_private_network ? CLONE_NEWNET : 0), NULL);
1368                 if (pid < 0) {
1369                         if (errno == EINVAL)
1370                                 log_error("clone() failed, do you have namespace support enabled in your kernel? (You need UTS, IPC, PID and NET namespacing built in): %m");
1371                         else
1372                                 log_error("clone() failed: %m");
1373
1374                         goto finish;
1375                 }
1376
1377                 if (pid == 0) {
1378                         /* child */
1379                         const char *home = NULL;
1380                         uid_t uid = (uid_t) -1;
1381                         gid_t gid = (gid_t) -1;
1382                         unsigned n_env = 2;
1383                         const char *envp[] = {
1384                                 "PATH=" DEFAULT_PATH_SPLIT_USR,
1385                                 "container=systemd-nspawn", /* LXC sets container=lxc, so follow the scheme here */
1386                                 NULL, /* TERM */
1387                                 NULL, /* HOME */
1388                                 NULL, /* USER */
1389                                 NULL, /* LOGNAME */
1390                                 NULL, /* container_uuid */
1391                                 NULL, /* LISTEN_FDS */
1392                                 NULL, /* LISTEN_PID */
1393                                 NULL
1394                         };
1395                         char **env_use;
1396                         eventfd_t x;
1397
1398                         envp[n_env] = strv_find_prefix(environ, "TERM=");
1399                         if (envp[n_env])
1400                                 n_env ++;
1401
1402                         close_nointr_nofail(master);
1403                         master = -1;
1404
1405                         close_nointr(STDIN_FILENO);
1406                         close_nointr(STDOUT_FILENO);
1407                         close_nointr(STDERR_FILENO);
1408
1409                         close_nointr_nofail(kmsg_socket_pair[0]);
1410                         kmsg_socket_pair[0] = -1;
1411
1412                         reset_all_signal_handlers();
1413
1414                         assert_se(sigemptyset(&mask) == 0);
1415                         assert_se(sigprocmask(SIG_SETMASK, &mask, NULL) == 0);
1416
1417                         k = open_terminal(console, O_RDWR);
1418                         if (k != STDIN_FILENO) {
1419                                 if (k >= 0) {
1420                                         close_nointr_nofail(k);
1421                                         k = -EINVAL;
1422                                 }
1423
1424                                 log_error("Failed to open console: %s", strerror(-k));
1425                                 goto child_fail;
1426                         }
1427
1428                         if (dup2(STDIN_FILENO, STDOUT_FILENO) != STDOUT_FILENO ||
1429                             dup2(STDIN_FILENO, STDERR_FILENO) != STDERR_FILENO) {
1430                                 log_error("Failed to duplicate console: %m");
1431                                 goto child_fail;
1432                         }
1433
1434                         if (setsid() < 0) {
1435                                 log_error("setsid() failed: %m");
1436                                 goto child_fail;
1437                         }
1438
1439                         if (prctl(PR_SET_PDEATHSIG, SIGKILL) < 0) {
1440                                 log_error("PR_SET_PDEATHSIG failed: %m");
1441                                 goto child_fail;
1442                         }
1443
1444                         /* Mark everything as slave, so that we still
1445                          * receive mounts from the real root, but don't
1446                          * propagate mounts to the real root. */
1447                         if (mount(NULL, "/", NULL, MS_SLAVE|MS_REC, NULL) < 0) {
1448                                 log_error("MS_SLAVE|MS_REC failed: %m");
1449                                 goto child_fail;
1450                         }
1451
1452                         /* Turn directory into bind mount */
1453                         if (mount(arg_directory, arg_directory, "bind", MS_BIND|MS_REC, NULL) < 0) {
1454                                 log_error("Failed to make bind mount.");
1455                                 goto child_fail;
1456                         }
1457
1458                         if (arg_read_only)
1459                                 if (mount(arg_directory, arg_directory, "bind", MS_BIND|MS_REMOUNT|MS_RDONLY|MS_REC, NULL) < 0) {
1460                                         log_error("Failed to make read-only.");
1461                                         goto child_fail;
1462                                 }
1463
1464                         if (mount_all(arg_directory) < 0)
1465                                 goto child_fail;
1466
1467                         if (copy_devnodes(arg_directory) < 0)
1468                                 goto child_fail;
1469
1470                         if (setup_ptmx(arg_directory) < 0)
1471                                 goto child_fail;
1472
1473                         dev_setup(arg_directory);
1474
1475                         if (setup_dev_console(arg_directory, console) < 0)
1476                                 goto child_fail;
1477
1478                         if (setup_kmsg(arg_directory, kmsg_socket_pair[1]) < 0)
1479                                 goto child_fail;
1480
1481                         close_nointr_nofail(kmsg_socket_pair[1]);
1482                         kmsg_socket_pair[1] = -1;
1483
1484                         if (setup_boot_id(arg_directory) < 0)
1485                                 goto child_fail;
1486
1487                         if (setup_timezone(arg_directory) < 0)
1488                                 goto child_fail;
1489
1490                         if (setup_resolv_conf(arg_directory) < 0)
1491                                 goto child_fail;
1492
1493                         if (setup_journal(arg_directory) < 0)
1494                                 goto child_fail;
1495
1496                         if (mount_binds(arg_directory, arg_bind, 0) < 0)
1497                                 goto child_fail;
1498
1499                         if (mount_binds(arg_directory, arg_bind_ro, MS_RDONLY) < 0)
1500                                 goto child_fail;
1501
1502                         if (setup_kdbus(arg_directory, kdbus_domain) < 0)
1503                                 goto child_fail;
1504
1505                         if (chdir(arg_directory) < 0) {
1506                                 log_error("chdir(%s) failed: %m", arg_directory);
1507                                 goto child_fail;
1508                         }
1509
1510                         if (mount(arg_directory, "/", NULL, MS_MOVE, NULL) < 0) {
1511                                 log_error("mount(MS_MOVE) failed: %m");
1512                                 goto child_fail;
1513                         }
1514
1515                         if (chroot(".") < 0) {
1516                                 log_error("chroot() failed: %m");
1517                                 goto child_fail;
1518                         }
1519
1520                         if (chdir("/") < 0) {
1521                                 log_error("chdir() failed: %m");
1522                                 goto child_fail;
1523                         }
1524
1525                         umask(0022);
1526
1527                         if (arg_private_network)
1528                                 loopback_setup();
1529
1530                         if (drop_capabilities() < 0) {
1531                                 log_error("drop_capabilities() failed: %m");
1532                                 goto child_fail;
1533                         }
1534
1535                         if (arg_user) {
1536
1537                                 /* Note that this resolves user names
1538                                  * inside the container, and hence
1539                                  * accesses the NSS modules from the
1540                                  * container and not the host. This is
1541                                  * a bit weird... */
1542
1543                                 if (get_user_creds((const char**)&arg_user, &uid, &gid, &home, NULL) < 0) {
1544                                         log_error("get_user_creds() failed: %m");
1545                                         goto child_fail;
1546                                 }
1547
1548                                 if (mkdir_parents_label(home, 0775) < 0) {
1549                                         log_error("mkdir_parents_label() failed: %m");
1550                                         goto child_fail;
1551                                 }
1552
1553                                 if (mkdir_safe_label(home, 0775, uid, gid) < 0) {
1554                                         log_error("mkdir_safe_label() failed: %m");
1555                                         goto child_fail;
1556                                 }
1557
1558                                 if (initgroups((const char*)arg_user, gid) < 0) {
1559                                         log_error("initgroups() failed: %m");
1560                                         goto child_fail;
1561                                 }
1562
1563                                 if (setresgid(gid, gid, gid) < 0) {
1564                                         log_error("setregid() failed: %m");
1565                                         goto child_fail;
1566                                 }
1567
1568                                 if (setresuid(uid, uid, uid) < 0) {
1569                                         log_error("setreuid() failed: %m");
1570                                         goto child_fail;
1571                                 }
1572                         } else {
1573                                 /* Reset everything fully to 0, just in case */
1574
1575                                 if (setgroups(0, NULL) < 0) {
1576                                         log_error("setgroups() failed: %m");
1577                                         goto child_fail;
1578                                 }
1579
1580                                 if (setresgid(0, 0, 0) < 0) {
1581                                         log_error("setregid() failed: %m");
1582                                         goto child_fail;
1583                                 }
1584
1585                                 if (setresuid(0, 0, 0) < 0) {
1586                                         log_error("setreuid() failed: %m");
1587                                         goto child_fail;
1588                                 }
1589                         }
1590
1591                         if ((asprintf((char**)(envp + n_env++), "HOME=%s", home ? home: "/root") < 0) ||
1592                             (asprintf((char**)(envp + n_env++), "USER=%s", arg_user ? arg_user : "root") < 0) ||
1593                             (asprintf((char**)(envp + n_env++), "LOGNAME=%s", arg_user ? arg_user : "root") < 0)) {
1594                                 log_oom();
1595                                 goto child_fail;
1596                         }
1597
1598                         if (!sd_id128_equal(arg_uuid, SD_ID128_NULL)) {
1599                                 if (asprintf((char**)(envp + n_env++), "container_uuid=" SD_ID128_FORMAT_STR, SD_ID128_FORMAT_VAL(arg_uuid)) < 0) {
1600                                         log_oom();
1601                                         goto child_fail;
1602                                 }
1603                         }
1604
1605                         if (fdset_size(fds) > 0) {
1606                                 k = fdset_cloexec(fds, false);
1607                                 if (k < 0) {
1608                                         log_error("Failed to unset O_CLOEXEC for file descriptors.");
1609                                         goto child_fail;
1610                                 }
1611
1612                                 if ((asprintf((char **)(envp + n_env++), "LISTEN_FDS=%u", n_fd_passed) < 0) ||
1613                                     (asprintf((char **)(envp + n_env++), "LISTEN_PID=1") < 0)) {
1614                                         log_oom();
1615                                         goto child_fail;
1616                                 }
1617                         }
1618
1619                         setup_hostname();
1620
1621                         eventfd_read(sync_fd, &x);
1622                         close_nointr_nofail(sync_fd);
1623                         sync_fd = -1;
1624
1625                         if (!strv_isempty(arg_setenv)) {
1626                                 char **n;
1627
1628                                 n = strv_env_merge(2, envp, arg_setenv);
1629                                 if (!n) {
1630                                         log_oom();
1631                                         goto child_fail;
1632                                 }
1633
1634                                 env_use = n;
1635                         } else
1636                                 env_use = (char**) envp;
1637
1638 #ifdef HAVE_SELINUX
1639                         if (arg_selinux_context)
1640                                 if (setexeccon(arg_selinux_context) < 0)
1641                                         log_error("setexeccon(\"%s\") failed: %m", arg_selinux_context);
1642 #endif
1643                         if (arg_boot) {
1644                                 char **a;
1645                                 size_t l;
1646
1647                                 /* Automatically search for the init system */
1648
1649                                 l = 1 + argc - optind;
1650                                 a = newa(char*, l + 1);
1651                                 memcpy(a + 1, argv + optind, l * sizeof(char*));
1652
1653                                 a[0] = (char*) "/usr/lib/systemd/systemd";
1654                                 execve(a[0], a, env_use);
1655
1656                                 a[0] = (char*) "/lib/systemd/systemd";
1657                                 execve(a[0], a, env_use);
1658
1659                                 a[0] = (char*) "/sbin/init";
1660                                 execve(a[0], a, env_use);
1661                         } else if (argc > optind)
1662                                 execvpe(argv[optind], argv + optind, env_use);
1663                         else {
1664                                 chdir(home ? home : "/root");
1665                                 execle("/bin/bash", "-bash", NULL, env_use);
1666                         }
1667
1668                         log_error("execv() failed: %m");
1669
1670                 child_fail:
1671                         _exit(EXIT_FAILURE);
1672                 }
1673
1674                 fdset_free(fds);
1675                 fds = NULL;
1676
1677                 r = register_machine(pid);
1678                 if (r < 0)
1679                         goto finish;
1680
1681                 eventfd_write(sync_fd, 1);
1682                 close_nointr_nofail(sync_fd);
1683                 sync_fd = -1;
1684
1685                 k = process_pty(master, &mask, arg_boot ? pid : 0, SIGRTMIN+3);
1686                 if (k < 0) {
1687                         r = EXIT_FAILURE;
1688                         break;
1689                 }
1690
1691                 if (!arg_quiet)
1692                         putc('\n', stdout);
1693
1694                 /* Kill if it is not dead yet anyway */
1695                 terminate_machine(pid);
1696
1697                 /* Redundant, but better safe than sorry */
1698                 kill(pid, SIGKILL);
1699
1700                 k = wait_for_terminate(pid, &status);
1701                 pid = 0;
1702
1703                 if (k < 0) {
1704                         r = EXIT_FAILURE;
1705                         break;
1706                 }
1707
1708                 if (status.si_code == CLD_EXITED) {
1709                         r = status.si_status;
1710                         if (status.si_status != 0) {
1711                                 log_error("Container %s failed with error code %i.", arg_machine, status.si_status);
1712                                 break;
1713                         }
1714
1715                         if (!arg_quiet)
1716                                 log_debug("Container %s exited successfully.", arg_machine);
1717                         break;
1718                 } else if (status.si_code == CLD_KILLED &&
1719                            status.si_status == SIGINT) {
1720
1721                         if (!arg_quiet)
1722                                 log_info("Container %s has been shut down.", arg_machine);
1723                         r = 0;
1724                         break;
1725                 } else if (status.si_code == CLD_KILLED &&
1726                            status.si_status == SIGHUP) {
1727
1728                         if (!arg_quiet)
1729                                 log_info("Container %s is being rebooted.", arg_machine);
1730                         continue;
1731                 } else if (status.si_code == CLD_KILLED ||
1732                            status.si_code == CLD_DUMPED) {
1733
1734                         log_error("Container %s terminated by signal %s.", arg_machine, signal_to_string(status.si_status));
1735                         r = EXIT_FAILURE;
1736                         break;
1737                 } else {
1738                         log_error("Container %s failed due to unknown reason.", arg_machine);
1739                         r = EXIT_FAILURE;
1740                         break;
1741                 }
1742         }
1743
1744 finish:
1745         if (pid > 0)
1746                 kill(pid, SIGKILL);
1747
1748         free(arg_directory);
1749         free(arg_machine);
1750         free(arg_setenv);
1751
1752         return r;
1753 }