chiark / gitweb /
d5add4a45ea44fa20a568005a3cbafa19e89ab4b
[elogind.git] / src / nspawn / nspawn.c
1 /*-*- Mode: C; c-basic-offset: 8; indent-tabs-mode: nil -*-*/
2
3 /***
4   This file is part of systemd.
5
6   Copyright 2010 Lennart Poettering
7
8   systemd is free software; you can redistribute it and/or modify it
9   under the terms of the GNU Lesser General Public License as published by
10   the Free Software Foundation; either version 2.1 of the License, or
11   (at your option) any later version.
12
13   systemd is distributed in the hope that it will be useful, but
14   WITHOUT ANY WARRANTY; without even the implied warranty of
15   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
16   Lesser General Public License for more details.
17
18   You should have received a copy of the GNU Lesser General Public License
19   along with systemd; If not, see <http://www.gnu.org/licenses/>.
20 ***/
21
22 #include <signal.h>
23 #include <sched.h>
24 #include <unistd.h>
25 #include <sys/types.h>
26 #include <sys/syscall.h>
27 #include <sys/mount.h>
28 #include <sys/wait.h>
29 #include <stdlib.h>
30 #include <string.h>
31 #include <stdio.h>
32 #include <errno.h>
33 #include <sys/prctl.h>
34 #include <sys/capability.h>
35 #include <getopt.h>
36 #include <termios.h>
37 #include <sys/signalfd.h>
38 #include <grp.h>
39 #include <linux/fs.h>
40 #include <sys/un.h>
41 #include <sys/socket.h>
42 #include <linux/netlink.h>
43 #include <sys/eventfd.h>
44 #ifdef HAVE_SELINUX
45 #include <selinux/selinux.h>
46 #endif
47
48 #include "sd-daemon.h"
49 #include "sd-bus.h"
50 #include "sd-id128.h"
51 #include "log.h"
52 #include "util.h"
53 #include "mkdir.h"
54 #include "macro.h"
55 #include "audit.h"
56 #include "missing.h"
57 #include "cgroup-util.h"
58 #include "strv.h"
59 #include "path-util.h"
60 #include "loopback-setup.h"
61 #include "dev-setup.h"
62 #include "fdset.h"
63 #include "build.h"
64 #include "fileio.h"
65 #include "bus-util.h"
66 #include "bus-error.h"
67 #include "ptyfwd.h"
68 #include "bus-kernel.h"
69 #include "env-util.h"
70 #include "def.h"
71
/* Journal linking mode, as selected with --link-journal= or -j. */
typedef enum LinkJournal {
        LINK_NO    = 0,  /* --link-journal=no */
        LINK_AUTO  = 1,  /* --link-journal=auto; initial value of arg_link_journal */
        LINK_HOST  = 2,  /* --link-journal=host */
        LINK_GUEST = 3   /* --link-journal=guest; also what -j selects */
} LinkJournal;
78
79 static char *arg_directory = NULL;
80 static char *arg_user = NULL;
81 static sd_id128_t arg_uuid = {};
82 static char *arg_machine = NULL;
83 static char *arg_selinux_context = NULL;
84 static char *arg_selinux_apifs_context = NULL;
85 static const char *arg_slice = NULL;
86 static bool arg_private_network = false;
87 static bool arg_read_only = false;
88 static bool arg_boot = false;
89 static LinkJournal arg_link_journal = LINK_AUTO;
90 static uint64_t arg_retain =
91         (1ULL << CAP_CHOWN) |
92         (1ULL << CAP_DAC_OVERRIDE) |
93         (1ULL << CAP_DAC_READ_SEARCH) |
94         (1ULL << CAP_FOWNER) |
95         (1ULL << CAP_FSETID) |
96         (1ULL << CAP_IPC_OWNER) |
97         (1ULL << CAP_KILL) |
98         (1ULL << CAP_LEASE) |
99         (1ULL << CAP_LINUX_IMMUTABLE) |
100         (1ULL << CAP_NET_BIND_SERVICE) |
101         (1ULL << CAP_NET_BROADCAST) |
102         (1ULL << CAP_NET_RAW) |
103         (1ULL << CAP_SETGID) |
104         (1ULL << CAP_SETFCAP) |
105         (1ULL << CAP_SETPCAP) |
106         (1ULL << CAP_SETUID) |
107         (1ULL << CAP_SYS_ADMIN) |
108         (1ULL << CAP_SYS_CHROOT) |
109         (1ULL << CAP_SYS_NICE) |
110         (1ULL << CAP_SYS_PTRACE) |
111         (1ULL << CAP_SYS_TTY_CONFIG) |
112         (1ULL << CAP_SYS_RESOURCE) |
113         (1ULL << CAP_SYS_BOOT) |
114         (1ULL << CAP_AUDIT_WRITE) |
115         (1ULL << CAP_AUDIT_CONTROL) |
116         (1ULL << CAP_MKNOD);
117 static char **arg_bind = NULL;
118 static char **arg_bind_ro = NULL;
119 static char **arg_setenv = NULL;
120 static bool arg_quiet = false;
121 static bool arg_share_system = false;
122 static bool arg_register = true;
123 static bool arg_keep_unit = false;
124
125 static int help(void) {
126
127         printf("%s [OPTIONS...] [PATH] [ARGUMENTS...]\n\n"
128                "Spawn a minimal namespace container for debugging, testing and building.\n\n"
129                "  -h --help                 Show this help\n"
130                "     --version              Print version string\n"
131                "  -D --directory=NAME       Root directory for the container\n"
132                "  -b --boot                 Boot up full system (i.e. invoke init)\n"
133                "  -u --user=USER            Run the command under specified user or uid\n"
134                "     --uuid=UUID            Set a specific machine UUID for the container\n"
135                "  -M --machine=NAME         Set the machine name for the container\n"
136                "  -S --slice=SLICE          Place the container in the specified slice\n"
137                "  -Z --selinux-context=SECLABEL\n"
138                "                            Set the SELinux security context to be used by\n"
139                "                            processes in the container\n"
140                "  -L --selinux-apifs-context=SECLABEL\n"
141                "                            Set the SELinux security context to be used by\n"
142                "                            API/tmpfs file systems in the container\n"
143                "     --private-network      Disable network in container\n"
144                "     --share-system         Share system namespaces with host\n"
145                "     --read-only            Mount the root directory read-only\n"
146                "     --capability=CAP       In addition to the default, retain specified\n"
147                "                            capability\n"
148                "     --drop-capability=CAP  Drop the specified capability from the default set\n"
149                "     --link-journal=MODE    Link up guest journal, one of no, auto, guest, host\n"
150                "  -j                        Equivalent to --link-journal=host\n"
151                "     --bind=PATH[:PATH]     Bind mount a file or directory from the host into\n"
152                "                            the container\n"
153                "     --bind-ro=PATH[:PATH]  Similar, but creates a read-only bind mount\n"
154                "     --setenv=NAME=VALUE    Pass an environment variable to PID 1\n"
155                "     --register=BOOLEAN     Register container as machine\n"
156                "     --keep-unit            Do not register a scope for the machine, reuse\n"
157                "                            the service unit nspawn is running in\n"
158                "  -q --quiet                Do not show status information\n",
159                program_invocation_short_name);
160
161         return 0;
162 }
163
164 static int parse_argv(int argc, char *argv[]) {
165
166         enum {
167                 ARG_VERSION = 0x100,
168                 ARG_PRIVATE_NETWORK,
169                 ARG_UUID,
170                 ARG_READ_ONLY,
171                 ARG_CAPABILITY,
172                 ARG_DROP_CAPABILITY,
173                 ARG_LINK_JOURNAL,
174                 ARG_BIND,
175                 ARG_BIND_RO,
176                 ARG_SETENV,
177                 ARG_SHARE_SYSTEM,
178                 ARG_REGISTER,
179                 ARG_KEEP_UNIT
180         };
181
182         static const struct option options[] = {
183                 { "help",                  no_argument,       NULL, 'h'                 },
184                 { "version",               no_argument,       NULL, ARG_VERSION         },
185                 { "directory",             required_argument, NULL, 'D'                 },
186                 { "user",                  required_argument, NULL, 'u'                 },
187                 { "private-network",       no_argument,       NULL, ARG_PRIVATE_NETWORK },
188                 { "boot",                  no_argument,       NULL, 'b'                 },
189                 { "uuid",                  required_argument, NULL, ARG_UUID            },
190                 { "read-only",             no_argument,       NULL, ARG_READ_ONLY       },
191                 { "capability",            required_argument, NULL, ARG_CAPABILITY      },
192                 { "drop-capability",       required_argument, NULL, ARG_DROP_CAPABILITY },
193                 { "link-journal",          required_argument, NULL, ARG_LINK_JOURNAL    },
194                 { "bind",                  required_argument, NULL, ARG_BIND            },
195                 { "bind-ro",               required_argument, NULL, ARG_BIND_RO         },
196                 { "machine",               required_argument, NULL, 'M'                 },
197                 { "slice",                 required_argument, NULL, 'S'                 },
198                 { "setenv",                required_argument, NULL, ARG_SETENV          },
199                 { "selinux-context",       required_argument, NULL, 'Z'                 },
200                 { "selinux-apifs-context", required_argument, NULL, 'L'                 },
201                 { "quiet",                 no_argument,       NULL, 'q'                 },
202                 { "share-system",          no_argument,       NULL, ARG_SHARE_SYSTEM    },
203                 { "register",              required_argument, NULL, ARG_REGISTER        },
204                 { "keep-unit",             no_argument,       NULL, ARG_KEEP_UNIT       },
205                 {}
206         };
207
208         int c, r;
209
210         assert(argc >= 0);
211         assert(argv);
212
213         while ((c = getopt_long(argc, argv, "+hD:u:bL:M:jS:Z:q", options, NULL)) >= 0) {
214
215                 switch (c) {
216
217                 case 'h':
218                         return help();
219
220                 case ARG_VERSION:
221                         puts(PACKAGE_STRING);
222                         puts(SYSTEMD_FEATURES);
223                         return 0;
224
225                 case 'D':
226                         free(arg_directory);
227                         arg_directory = canonicalize_file_name(optarg);
228                         if (!arg_directory) {
229                                 log_error("Invalid root directory: %m");
230                                 return -ENOMEM;
231                         }
232
233                         break;
234
235                 case 'u':
236                         free(arg_user);
237                         arg_user = strdup(optarg);
238                         if (!arg_user)
239                                 return log_oom();
240
241                         break;
242
243                 case ARG_PRIVATE_NETWORK:
244                         arg_private_network = true;
245                         break;
246
247                 case 'b':
248                         arg_boot = true;
249                         break;
250
251                 case ARG_UUID:
252                         r = sd_id128_from_string(optarg, &arg_uuid);
253                         if (r < 0) {
254                                 log_error("Invalid UUID: %s", optarg);
255                                 return r;
256                         }
257                         break;
258
259                 case 'S':
260                         arg_slice = strdup(optarg);
261                         if (!arg_slice)
262                                 return log_oom();
263
264                         break;
265
266                 case 'M':
267                         if (isempty(optarg)) {
268                                 free(arg_machine);
269                                 arg_machine = NULL;
270                         } else {
271
272                                 if (!hostname_is_valid(optarg)) {
273                                         log_error("Invalid machine name: %s", optarg);
274                                         return -EINVAL;
275                                 }
276
277                                 free(arg_machine);
278                                 arg_machine = strdup(optarg);
279                                 if (!arg_machine)
280                                         return log_oom();
281
282                                 break;
283                         }
284
285                 case 'Z':
286                         arg_selinux_context = optarg;
287                         break;
288
289                 case 'L':
290                         arg_selinux_apifs_context = optarg;
291                         break;
292
293                 case ARG_READ_ONLY:
294                         arg_read_only = true;
295                         break;
296
297                 case ARG_CAPABILITY:
298                 case ARG_DROP_CAPABILITY: {
299                         char *state, *word;
300                         size_t length;
301
302                         FOREACH_WORD_SEPARATOR(word, length, optarg, ",", state) {
303                                 cap_value_t cap;
304                                 char *t;
305
306                                 t = strndup(word, length);
307                                 if (!t)
308                                         return log_oom();
309
310                                 if (cap_from_name(t, &cap) < 0) {
311                                         log_error("Failed to parse capability %s.", t);
312                                         free(t);
313                                         return -EINVAL;
314                                 }
315
316                                 free(t);
317
318                                 if (c == ARG_CAPABILITY)
319                                         arg_retain |= 1ULL << (uint64_t) cap;
320                                 else
321                                         arg_retain &= ~(1ULL << (uint64_t) cap);
322                         }
323
324                         break;
325                 }
326
327                 case 'j':
328                         arg_link_journal = LINK_GUEST;
329                         break;
330
331                 case ARG_LINK_JOURNAL:
332                         if (streq(optarg, "auto"))
333                                 arg_link_journal = LINK_AUTO;
334                         else if (streq(optarg, "no"))
335                                 arg_link_journal = LINK_NO;
336                         else if (streq(optarg, "guest"))
337                                 arg_link_journal = LINK_GUEST;
338                         else if (streq(optarg, "host"))
339                                 arg_link_journal = LINK_HOST;
340                         else {
341                                 log_error("Failed to parse link journal mode %s", optarg);
342                                 return -EINVAL;
343                         }
344
345                         break;
346
347                 case ARG_BIND:
348                 case ARG_BIND_RO: {
349                         _cleanup_free_ char *a = NULL, *b = NULL;
350                         char *e;
351                         char ***x;
352
353                         x = c == ARG_BIND ? &arg_bind : &arg_bind_ro;
354
355                         e = strchr(optarg, ':');
356                         if (e) {
357                                 a = strndup(optarg, e - optarg);
358                                 b = strdup(e + 1);
359                         } else {
360                                 a = strdup(optarg);
361                                 b = strdup(optarg);
362                         }
363
364                         if (!a || !b)
365                                 return log_oom();
366
367                         if (!path_is_absolute(a) || !path_is_absolute(b)) {
368                                 log_error("Invalid bind mount specification: %s", optarg);
369                                 return -EINVAL;
370                         }
371
372                         r = strv_extend(x, a);
373                         if (r < 0)
374                                 return log_oom();
375
376                         r = strv_extend(x, b);
377                         if (r < 0)
378                                 return log_oom();
379
380                         break;
381                 }
382
383                 case ARG_SETENV: {
384                         char **n;
385
386                         if (!env_assignment_is_valid(optarg)) {
387                                 log_error("Environment variable assignment '%s' is not valid.", optarg);
388                                 return -EINVAL;
389                         }
390
391                         n = strv_env_set(arg_setenv, optarg);
392                         if (!n)
393                                 return log_oom();
394
395                         strv_free(arg_setenv);
396                         arg_setenv = n;
397                         break;
398                 }
399
400                 case 'q':
401                         arg_quiet = true;
402                         break;
403
404                 case ARG_SHARE_SYSTEM:
405                         arg_share_system = true;
406                         break;
407
408                 case ARG_REGISTER:
409                         r = parse_boolean(optarg);
410                         if (r < 0) {
411                                 log_error("Failed to parse --register= argument: %s", optarg);
412                                 return r;
413                         }
414
415                         arg_register = r;
416                         break;
417
418                 case ARG_KEEP_UNIT:
419                         arg_keep_unit = true;
420                         break;
421
422                 case '?':
423                         return -EINVAL;
424
425                 default:
426                         assert_not_reached("Unhandled option");
427                 }
428         }
429
430         if (arg_share_system)
431                 arg_register = false;
432
433         if (arg_boot && arg_share_system) {
434                 log_error("--boot and --share-system may not be combined.");
435                 return -EINVAL;
436         }
437
438         if (arg_keep_unit && cg_pid_get_owner_uid(0, NULL) >= 0) {
439                 log_error("--keep-unit may not be used when invoked from a user session.");
440                 return -EINVAL;
441         }
442
443         return 1;
444 }
445
446 static int mount_all(const char *dest) {
447
448         typedef struct MountPoint {
449                 const char *what;
450                 const char *where;
451                 const char *type;
452                 const char *options;
453                 unsigned long flags;
454                 bool fatal;
455         } MountPoint;
456
457         static const MountPoint mount_table[] = {
458                 { "proc",      "/proc",     "proc",  NULL,       MS_NOSUID|MS_NOEXEC|MS_NODEV, true  },
459                 { "/proc/sys", "/proc/sys", NULL,    NULL,       MS_BIND, true                       },   /* Bind mount first */
460                 { NULL,        "/proc/sys", NULL,    NULL,       MS_BIND|MS_RDONLY|MS_REMOUNT, true  },   /* Then, make it r/o */
461                 { "sysfs",     "/sys",      "sysfs", NULL,       MS_RDONLY|MS_NOSUID|MS_NOEXEC|MS_NODEV, true  },
462                 { "tmpfs",     "/dev",      "tmpfs", "mode=755", MS_NOSUID|MS_STRICTATIME,     true  },
463                 { "devpts",    "/dev/pts",  "devpts","newinstance,ptmxmode=0666,mode=620,gid=" STRINGIFY(TTY_GID), MS_NOSUID|MS_NOEXEC, true },
464                 { "tmpfs",     "/dev/shm",  "tmpfs", "mode=1777", MS_NOSUID|MS_NODEV|MS_STRICTATIME, true  },
465                 { "tmpfs",     "/run",      "tmpfs", "mode=755", MS_NOSUID|MS_NODEV|MS_STRICTATIME, true  },
466 #ifdef HAVE_SELINUX
467                 { "/sys/fs/selinux", "/sys/fs/selinux", NULL, NULL, MS_BIND,                      false },  /* Bind mount first */
468                 { NULL,              "/sys/fs/selinux", NULL, NULL, MS_BIND|MS_RDONLY|MS_REMOUNT, false },  /* Then, make it r/o */
469 #endif
470         };
471
472         unsigned k;
473         int r = 0;
474
475         for (k = 0; k < ELEMENTSOF(mount_table); k++) {
476                 _cleanup_free_ char *where = NULL;
477 #ifdef HAVE_SELINUX
478                 _cleanup_free_ char *options = NULL;
479 #endif
480                 const char *o;
481                 int t;
482
483                 where = strjoin(dest, "/", mount_table[k].where, NULL);
484                 if (!where)
485                         return log_oom();
486
487                 t = path_is_mount_point(where, true);
488                 if (t < 0) {
489                         log_error("Failed to detect whether %s is a mount point: %s", where, strerror(-t));
490
491                         if (r == 0)
492                                 r = t;
493
494                         continue;
495                 }
496
497                 /* Skip this entry if it is not a remount. */
498                 if (mount_table[k].what && t > 0)
499                         continue;
500
501                 mkdir_p(where, 0755);
502
503 #ifdef HAVE_SELINUX
504                 if (arg_selinux_apifs_context &&
505                     (streq_ptr(mount_table[k].what, "tmpfs") || streq_ptr(mount_table[k].what, "devpts"))) {
506                         options = strjoin(mount_table[k].options, ",context=\"", arg_selinux_apifs_context, "\"", NULL);
507                         if (!options)
508                                 return log_oom();
509
510                         o = options;
511                 } else
512 #endif
513                         o = mount_table[k].options;
514
515
516                 if (mount(mount_table[k].what,
517                           where,
518                           mount_table[k].type,
519                           mount_table[k].flags,
520                           o) < 0 &&
521                     mount_table[k].fatal) {
522
523                         log_error("mount(%s) failed: %m", where);
524
525                         if (r == 0)
526                                 r = -errno;
527                 }
528         }
529
530         return r;
531 }
532
533 static int mount_binds(const char *dest, char **l, unsigned long flags) {
534         char **x, **y;
535
536         STRV_FOREACH_PAIR(x, y, l) {
537                 char *where;
538                 struct stat source_st, dest_st;
539                 int r;
540
541                 if (stat(*x, &source_st) < 0) {
542                         log_error("failed to stat %s: %m", *x);
543                         return -errno;
544                 }
545
546                 where = strappenda(dest, *y);
547                 r = stat(where, &dest_st);
548                 if (r == 0) {
549                         if ((source_st.st_mode & S_IFMT) != (dest_st.st_mode & S_IFMT)) {
550                                 log_error("The file types of %s and %s do not match. Refusing bind mount",
551                                                 *x, where);
552                                 return -EINVAL;
553                         }
554                 } else if (errno == ENOENT) {
555                         r = mkdir_parents_label(where, 0755);
556                         if (r < 0) {
557                                 log_error("Failed to bind mount %s: %s", *x, strerror(-r));
558                                 return r;
559                         }
560                 } else {
561                         log_error("Failed to bind mount %s: %s", *x, strerror(errno));
562                         return -errno;
563                 }
564                 /* Create the mount point, but be conservative -- refuse to create block
565                 * and char devices. */
566                 if (S_ISDIR(source_st.st_mode))
567                         mkdir_label(where, 0755);
568                 else if (S_ISFIFO(source_st.st_mode))
569                         mkfifo(where, 0644);
570                 else if (S_ISSOCK(source_st.st_mode))
571                         mknod(where, 0644 | S_IFSOCK, 0);
572                 else if (S_ISREG(source_st.st_mode))
573                         touch(where);
574                 else {
575                         log_error("Refusing to create mountpoint for file: %s", *x);
576                         return -ENOTSUP;
577                 }
578
579                 if (mount(*x, where, "bind", MS_BIND, NULL) < 0) {
580                         log_error("mount(%s) failed: %m", where);
581                         return -errno;
582                 }
583
584                 if (flags && mount(NULL, where, NULL, MS_REMOUNT|MS_BIND|flags, NULL) < 0) {
585                         log_error("mount(%s) failed: %m", where);
586                         return -errno;
587                 }
588         }
589
590         return 0;
591 }
592
593 static int setup_timezone(const char *dest) {
594         _cleanup_free_ char *where = NULL, *p = NULL, *q = NULL, *check = NULL, *what = NULL;
595         char *z, *y;
596         int r;
597
598         assert(dest);
599
600         /* Fix the timezone, if possible */
601         r = readlink_malloc("/etc/localtime", &p);
602         if (r < 0) {
603                 log_warning("/etc/localtime is not a symlink, not updating container timezone.");
604                 return 0;
605         }
606
607         z = path_startswith(p, "../usr/share/zoneinfo/");
608         if (!z)
609                 z = path_startswith(p, "/usr/share/zoneinfo/");
610         if (!z) {
611                 log_warning("/etc/localtime does not point into /usr/share/zoneinfo/, not updating container timezone.");
612                 return 0;
613         }
614
615         where = strappend(dest, "/etc/localtime");
616         if (!where)
617                 return log_oom();
618
619         r = readlink_malloc(where, &q);
620         if (r >= 0) {
621                 y = path_startswith(q, "../usr/share/zoneinfo/");
622                 if (!y)
623                         y = path_startswith(q, "/usr/share/zoneinfo/");
624
625
626                 /* Already pointing to the right place? Then do nothing .. */
627                 if (y && streq(y, z))
628                         return 0;
629         }
630
631         check = strjoin(dest, "/usr/share/zoneinfo/", z, NULL);
632         if (!check)
633                 return log_oom();
634
635         if (access(check, F_OK) < 0) {
636                 log_warning("Timezone %s does not exist in container, not updating container timezone.", z);
637                 return 0;
638         }
639
640         what = strappend("../usr/share/zoneinfo/", z);
641         if (!what)
642                 return log_oom();
643
644         unlink(where);
645         if (symlink(what, where) < 0) {
646                 log_error("Failed to correct timezone of container: %m");
647                 return 0;
648         }
649
650         return 0;
651 }
652
653 static int setup_resolv_conf(const char *dest) {
654         char _cleanup_free_ *where = NULL;
655
656         assert(dest);
657
658         if (arg_private_network)
659                 return 0;
660
661         /* Fix resolv.conf, if possible */
662         where = strappend(dest, "/etc/resolv.conf");
663         if (!where)
664                 return log_oom();
665
666         /* We don't really care for the results of this really. If it
667          * fails, it fails, but meh... */
668         copy_file("/etc/resolv.conf", where, O_TRUNC|O_NOFOLLOW);
669
670         return 0;
671 }
672
673 static int setup_boot_id(const char *dest) {
674         _cleanup_free_ char *from = NULL, *to = NULL;
675         sd_id128_t rnd;
676         char as_uuid[37];
677         int r;
678
679         assert(dest);
680
681         if (arg_share_system)
682                 return 0;
683
684         /* Generate a new randomized boot ID, so that each boot-up of
685          * the container gets a new one */
686
687         from = strappend(dest, "/dev/proc-sys-kernel-random-boot-id");
688         to = strappend(dest, "/proc/sys/kernel/random/boot_id");
689         if (!from || !to)
690                 return log_oom();
691
692         r = sd_id128_randomize(&rnd);
693         if (r < 0) {
694                 log_error("Failed to generate random boot id: %s", strerror(-r));
695                 return r;
696         }
697
698         snprintf(as_uuid, sizeof(as_uuid),
699                  "%02x%02x%02x%02x-%02x%02x-%02x%02x-%02x%02x-%02x%02x%02x%02x%02x%02x",
700                  SD_ID128_FORMAT_VAL(rnd));
701         char_array_0(as_uuid);
702
703         r = write_string_file(from, as_uuid);
704         if (r < 0) {
705                 log_error("Failed to write boot id: %s", strerror(-r));
706                 return r;
707         }
708
709         if (mount(from, to, "bind", MS_BIND, NULL) < 0) {
710                 log_error("Failed to bind mount boot id: %m");
711                 r = -errno;
712         } else if (mount(from, to, "bind", MS_BIND|MS_REMOUNT|MS_RDONLY, NULL))
713                 log_warning("Failed to make boot id read-only: %m");
714
715         unlink(from);
716         return r;
717 }
718
719 static int copy_devnodes(const char *dest) {
720
721         static const char devnodes[] =
722                 "null\0"
723                 "zero\0"
724                 "full\0"
725                 "random\0"
726                 "urandom\0"
727                 "tty\0";
728
729         const char *d;
730         int r = 0;
731         _cleanup_umask_ mode_t u;
732
733         assert(dest);
734
735         u = umask(0000);
736
737         NULSTR_FOREACH(d, devnodes) {
738                 _cleanup_free_ char *from = NULL, *to = NULL;
739                 struct stat st;
740
741                 from = strappend("/dev/", d);
742                 to = strjoin(dest, "/dev/", d, NULL);
743                 if (!from || !to)
744                         return log_oom();
745
746                 if (stat(from, &st) < 0) {
747
748                         if (errno != ENOENT) {
749                                 log_error("Failed to stat %s: %m", from);
750                                 return -errno;
751                         }
752
753                 } else if (!S_ISCHR(st.st_mode) && !S_ISBLK(st.st_mode)) {
754
755                         log_error("%s is not a char or block device, cannot copy", from);
756                         return -EIO;
757
758                 } else if (mknod(to, st.st_mode, st.st_rdev) < 0) {
759
760                         log_error("mknod(%s) failed: %m", dest);
761                         return  -errno;
762                 }
763         }
764
765         return r;
766 }
767
768 static int setup_ptmx(const char *dest) {
769         _cleanup_free_ char *p = NULL;
770
771         p = strappend(dest, "/dev/ptmx");
772         if (!p)
773                 return log_oom();
774
775         if (symlink("pts/ptmx", p) < 0) {
776                 log_error("Failed to create /dev/ptmx symlink: %m");
777                 return -errno;
778         }
779
780         return 0;
781 }
782
783 static int setup_dev_console(const char *dest, const char *console) {
784         struct stat st;
785         _cleanup_free_ char *to = NULL;
786         int r;
787         _cleanup_umask_ mode_t u;
788
789         assert(dest);
790         assert(console);
791
792         u = umask(0000);
793
794         if (stat(console, &st) < 0) {
795                 log_error("Failed to stat %s: %m", console);
796                 return -errno;
797
798         } else if (!S_ISCHR(st.st_mode)) {
799                 log_error("/dev/console is not a char device");
800                 return -EIO;
801         }
802
803         r = chmod_and_chown(console, 0600, 0, 0);
804         if (r < 0) {
805                 log_error("Failed to correct access mode for TTY: %s", strerror(-r));
806                 return r;
807         }
808
809         if (asprintf(&to, "%s/dev/console", dest) < 0)
810                 return log_oom();
811
812         /* We need to bind mount the right tty to /dev/console since
813          * ptys can only exist on pts file systems. To have something
814          * to bind mount things on we create a device node first, that
815          * has the right major/minor (note that the major minor
816          * doesn't actually matter here, since we mount it over
817          * anyway). */
818
819         if (mknod(to, (st.st_mode & ~07777) | 0600, st.st_rdev) < 0) {
820                 log_error("mknod() for /dev/console failed: %m");
821                 return -errno;
822         }
823
824         if (mount(console, to, "bind", MS_BIND, NULL) < 0) {
825                 log_error("Bind mount for /dev/console failed: %m");
826                 return -errno;
827         }
828
829         return 0;
830 }
831
832 static int setup_kmsg(const char *dest, int kmsg_socket) {
833         _cleanup_free_ char *from = NULL, *to = NULL;
834         int r, fd, k;
835         _cleanup_umask_ mode_t u;
836         union {
837                 struct cmsghdr cmsghdr;
838                 uint8_t buf[CMSG_SPACE(sizeof(int))];
839         } control = {};
840         struct msghdr mh = {
841                 .msg_control = &control,
842                 .msg_controllen = sizeof(control),
843         };
844         struct cmsghdr *cmsg;
845
846         assert(dest);
847         assert(kmsg_socket >= 0);
848
849         u = umask(0000);
850
851         /* We create the kmsg FIFO as /dev/kmsg, but immediately
852          * delete it after bind mounting it to /proc/kmsg. While FIFOs
853          * on the reading side behave very similar to /proc/kmsg,
854          * their writing side behaves differently from /dev/kmsg in
855          * that writing blocks when nothing is reading. In order to
856          * avoid any problems with containers deadlocking due to this
857          * we simply make /dev/kmsg unavailable to the container. */
858         if (asprintf(&from, "%s/dev/kmsg", dest) < 0 ||
859             asprintf(&to, "%s/proc/kmsg", dest) < 0)
860                 return log_oom();
861
862         if (mkfifo(from, 0600) < 0) {
863                 log_error("mkfifo() for /dev/kmsg failed: %m");
864                 return -errno;
865         }
866
867         r = chmod_and_chown(from, 0600, 0, 0);
868         if (r < 0) {
869                 log_error("Failed to correct access mode for /dev/kmsg: %s", strerror(-r));
870                 return r;
871         }
872
873         if (mount(from, to, "bind", MS_BIND, NULL) < 0) {
874                 log_error("Bind mount for /proc/kmsg failed: %m");
875                 return -errno;
876         }
877
878         fd = open(from, O_RDWR|O_NDELAY|O_CLOEXEC);
879         if (fd < 0) {
880                 log_error("Failed to open fifo: %m");
881                 return -errno;
882         }
883
884         cmsg = CMSG_FIRSTHDR(&mh);
885         cmsg->cmsg_level = SOL_SOCKET;
886         cmsg->cmsg_type = SCM_RIGHTS;
887         cmsg->cmsg_len = CMSG_LEN(sizeof(int));
888         memcpy(CMSG_DATA(cmsg), &fd, sizeof(int));
889
890         mh.msg_controllen = cmsg->cmsg_len;
891
892         /* Store away the fd in the socket, so that it stays open as
893          * long as we run the child */
894         k = sendmsg(kmsg_socket, &mh, MSG_DONTWAIT|MSG_NOSIGNAL);
895         close_nointr_nofail(fd);
896
897         if (k < 0) {
898                 log_error("Failed to send FIFO fd: %m");
899                 return -errno;
900         }
901
902         /* And now make the FIFO unavailable as /dev/kmsg... */
903         unlink(from);
904         return 0;
905 }
906
907 static int setup_hostname(void) {
908
909         if (arg_share_system)
910                 return 0;
911
912         if (sethostname(arg_machine, strlen(arg_machine)) < 0)
913                 return -errno;
914
915         return 0;
916 }
917
918 static int setup_journal(const char *directory) {
919         sd_id128_t machine_id, this_id;
920         _cleanup_free_ char *p = NULL, *b = NULL, *q = NULL, *d = NULL;
921         char *id;
922         int r;
923
924         p = strappend(directory, "/etc/machine-id");
925         if (!p)
926                 return log_oom();
927
928         r = read_one_line_file(p, &b);
929         if (r == -ENOENT && arg_link_journal == LINK_AUTO)
930                 return 0;
931         else if (r < 0) {
932                 log_error("Failed to read machine ID from %s: %s", p, strerror(-r));
933                 return r;
934         }
935
936         id = strstrip(b);
937         if (isempty(id) && arg_link_journal == LINK_AUTO)
938                 return 0;
939
940         /* Verify validity */
941         r = sd_id128_from_string(id, &machine_id);
942         if (r < 0) {
943                 log_error("Failed to parse machine ID from %s: %s", p, strerror(-r));
944                 return r;
945         }
946
947         r = sd_id128_get_machine(&this_id);
948         if (r < 0) {
949                 log_error("Failed to retrieve machine ID: %s", strerror(-r));
950                 return r;
951         }
952
953         if (sd_id128_equal(machine_id, this_id)) {
954                 log_full(arg_link_journal == LINK_AUTO ? LOG_WARNING : LOG_ERR,
955                          "Host and machine ids are equal (%s): refusing to link journals", id);
956                 if (arg_link_journal == LINK_AUTO)
957                         return 0;
958                 return
959                         -EEXIST;
960         }
961
962         if (arg_link_journal == LINK_NO)
963                 return 0;
964
965         free(p);
966         p = strappend("/var/log/journal/", id);
967         q = strjoin(directory, "/var/log/journal/", id, NULL);
968         if (!p || !q)
969                 return log_oom();
970
971         if (path_is_mount_point(p, false) > 0) {
972                 if (arg_link_journal != LINK_AUTO) {
973                         log_error("%s: already a mount point, refusing to use for journal", p);
974                         return -EEXIST;
975                 }
976
977                 return 0;
978         }
979
980         if (path_is_mount_point(q, false) > 0) {
981                 if (arg_link_journal != LINK_AUTO) {
982                         log_error("%s: already a mount point, refusing to use for journal", q);
983                         return -EEXIST;
984                 }
985
986                 return 0;
987         }
988
989         r = readlink_and_make_absolute(p, &d);
990         if (r >= 0) {
991                 if ((arg_link_journal == LINK_GUEST ||
992                      arg_link_journal == LINK_AUTO) &&
993                     path_equal(d, q)) {
994
995                         r = mkdir_p(q, 0755);
996                         if (r < 0)
997                                 log_warning("failed to create directory %s: %m", q);
998                         return 0;
999                 }
1000
1001                 if (unlink(p) < 0) {
1002                         log_error("Failed to remove symlink %s: %m", p);
1003                         return -errno;
1004                 }
1005         } else if (r == -EINVAL) {
1006
1007                 if (arg_link_journal == LINK_GUEST &&
1008                     rmdir(p) < 0) {
1009
1010                         if (errno == ENOTDIR) {
1011                                 log_error("%s already exists and is neither a symlink nor a directory", p);
1012                                 return r;
1013                         } else {
1014                                 log_error("Failed to remove %s: %m", p);
1015                                 return -errno;
1016                         }
1017                 }
1018         } else if (r != -ENOENT) {
1019                 log_error("readlink(%s) failed: %m", p);
1020                 return r;
1021         }
1022
1023         if (arg_link_journal == LINK_GUEST) {
1024
1025                 if (symlink(q, p) < 0) {
1026                         log_error("Failed to symlink %s to %s: %m", q, p);
1027                         return -errno;
1028                 }
1029
1030                 r = mkdir_p(q, 0755);
1031                 if (r < 0)
1032                         log_warning("failed to create directory %s: %m", q);
1033                 return 0;
1034         }
1035
1036         if (arg_link_journal == LINK_HOST) {
1037                 r = mkdir_p(p, 0755);
1038                 if (r < 0) {
1039                         log_error("Failed to create %s: %m", p);
1040                         return r;
1041                 }
1042
1043         } else if (access(p, F_OK) < 0)
1044                 return 0;
1045
1046         if (dir_is_empty(q) == 0) {
1047                 log_error("%s not empty.", q);
1048                 return -ENOTEMPTY;
1049         }
1050
1051         r = mkdir_p(q, 0755);
1052         if (r < 0) {
1053                 log_error("Failed to create %s: %m", q);
1054                 return r;
1055         }
1056
1057         if (mount(p, q, "bind", MS_BIND, NULL) < 0) {
1058                 log_error("Failed to bind mount journal from host into guest: %m");
1059                 return -errno;
1060         }
1061
1062         return 0;
1063 }
1064
1065 static int setup_kdbus(const char *dest, const char *path) {
1066         const char *p;
1067
1068         if (!path)
1069                 return 0;
1070
1071         p = strappenda(dest, "/dev/kdbus");
1072         if (mkdir(p, 0755) < 0) {
1073                 log_error("Failed to create kdbus path: %m");
1074                 return  -errno;
1075         }
1076
1077         if (mount(path, p, "bind", MS_BIND, NULL) < 0) {
1078                 log_error("Failed to mount kdbus domain path: %m");
1079                 return -errno;
1080         }
1081
1082         return 0;
1083 }
1084
1085 static int drop_capabilities(void) {
1086         return capability_bounding_set_drop(~arg_retain, false);
1087 }
1088
1089 static int register_machine(pid_t pid) {
1090         _cleanup_bus_error_free_ sd_bus_error error = SD_BUS_ERROR_NULL;
1091         _cleanup_bus_unref_ sd_bus *bus = NULL;
1092         int r;
1093
1094         if (!arg_register)
1095                 return 0;
1096
1097         r = sd_bus_default_system(&bus);
1098         if (r < 0) {
1099                 log_error("Failed to open system bus: %s", strerror(-r));
1100                 return r;
1101         }
1102
1103         if (arg_keep_unit) {
1104                 r = sd_bus_call_method(
1105                                 bus,
1106                                 "org.freedesktop.machine1",
1107                                 "/org/freedesktop/machine1",
1108                                 "org.freedesktop.machine1.Manager",
1109                                 "RegisterMachine",
1110                                 &error,
1111                                 NULL,
1112                                 "sayssus",
1113                                 arg_machine,
1114                                 SD_BUS_MESSAGE_APPEND_ID128(arg_uuid),
1115                                 "nspawn",
1116                                 "container",
1117                                 (uint32_t) pid,
1118                                 strempty(arg_directory));
1119         } else {
1120                 r = sd_bus_call_method(
1121                                 bus,
1122                                 "org.freedesktop.machine1",
1123                                 "/org/freedesktop/machine1",
1124                                 "org.freedesktop.machine1.Manager",
1125                                 "CreateMachine",
1126                                 &error,
1127                                 NULL,
1128                                 "sayssusa(sv)",
1129                                 arg_machine,
1130                                 SD_BUS_MESSAGE_APPEND_ID128(arg_uuid),
1131                                 "nspawn",
1132                                 "container",
1133                                 (uint32_t) pid,
1134                                 strempty(arg_directory),
1135                                 !isempty(arg_slice), "Slice", "s", arg_slice);
1136         }
1137
1138         if (r < 0) {
1139                 log_error("Failed to register machine: %s", bus_error_message(&error, r));
1140                 return r;
1141         }
1142
1143         return 0;
1144 }
1145
1146 static int terminate_machine(pid_t pid) {
1147         _cleanup_bus_error_free_ sd_bus_error error = SD_BUS_ERROR_NULL;
1148         _cleanup_bus_message_unref_ sd_bus_message *reply = NULL;
1149         _cleanup_bus_unref_ sd_bus *bus = NULL;
1150         const char *path;
1151         int r;
1152
1153         if (!arg_register)
1154                 return 0;
1155
1156         r = sd_bus_default_system(&bus);
1157         if (r < 0) {
1158                 log_error("Failed to open system bus: %s", strerror(-r));
1159                 return r;
1160         }
1161
1162         r = sd_bus_call_method(
1163                         bus,
1164                         "org.freedesktop.machine1",
1165                         "/org/freedesktop/machine1",
1166                         "org.freedesktop.machine1.Manager",
1167                         "GetMachineByPID",
1168                         &error,
1169                         &reply,
1170                         "u",
1171                         (uint32_t) pid);
1172         if (r < 0) {
1173                 /* Note that the machine might already have been
1174                  * cleaned up automatically, hence don't consider it a
1175                  * failure if we cannot get the machine object. */
1176                 log_debug("Failed to get machine: %s", bus_error_message(&error, r));
1177                 return 0;
1178         }
1179
1180         r = sd_bus_message_read(reply, "o", &path);
1181         if (r < 0)
1182                 return bus_log_parse_error(r);
1183
1184         r = sd_bus_call_method(
1185                         bus,
1186                         "org.freedesktop.machine1",
1187                         path,
1188                         "org.freedesktop.machine1.Machine",
1189                         "Terminate",
1190                         &error,
1191                         NULL,
1192                         NULL);
1193         if (r < 0) {
1194                 log_debug("Failed to terminate machine: %s", bus_error_message(&error, r));
1195                 return 0;
1196         }
1197
1198         return 0;
1199 }
1200
1201 static int reset_audit_loginuid(void) {
1202         _cleanup_free_ char *p = NULL;
1203         int r;
1204
1205         if (arg_share_system)
1206                 return 0;
1207
1208         r = read_one_line_file("/proc/self/loginuid", &p);
1209         if (r == -EEXIST)
1210                 return 0;
1211         if (r < 0) {
1212                 log_error("Failed to read /proc/self/loginuid: %s", strerror(-r));
1213                 return r;
1214         }
1215
1216         /* Already reset? */
1217         if (streq(p, "4294967295"))
1218                 return 0;
1219
1220         r = write_string_file("/proc/self/loginuid", "4294967295");
1221         if (r < 0) {
1222                 log_error("Failed to reset audit login UID. This probably means that your kernel is too\n"
1223                           "old and you have audit enabled. Note that the auditing subsystem is known to\n"
1224                           "be incompatible with containers on old kernels. Please make sure to upgrade\n"
1225                           "your kernel or to off auditing with 'audit=0' on the kernel command line before\n"
1226                           "using systemd-nspawn. Sleeping for 5s... (%s)\n", strerror(-r));
1227
1228                 sleep(5);
1229         }
1230
1231         return 0;
1232 }
1233
1234 int main(int argc, char *argv[]) {
1235         pid_t pid = 0;
1236         int r = EXIT_FAILURE, k;
1237         _cleanup_close_ int master = -1, kdbus_fd = -1, sync_fd = -1;
1238         int n_fd_passed;
1239         const char *console = NULL;
1240         sigset_t mask;
1241         _cleanup_close_pipe_ int kmsg_socket_pair[2] = { -1, -1 };
1242         _cleanup_fdset_free_ FDSet *fds = NULL;
1243         _cleanup_free_ char *kdbus_domain = NULL;
1244
1245         log_parse_environment();
1246         log_open();
1247
1248         k = parse_argv(argc, argv);
1249         if (k < 0)
1250                 goto finish;
1251         else if (k == 0) {
1252                 r = EXIT_SUCCESS;
1253                 goto finish;
1254         }
1255
1256         if (arg_directory) {
1257                 char *p;
1258
1259                 p = path_make_absolute_cwd(arg_directory);
1260                 free(arg_directory);
1261                 arg_directory = p;
1262         } else
1263                 arg_directory = get_current_dir_name();
1264
1265         if (!arg_directory) {
1266                 log_error("Failed to determine path, please use -D.");
1267                 goto finish;
1268         }
1269
1270         path_kill_slashes(arg_directory);
1271
1272         if (!arg_machine) {
1273                 arg_machine = strdup(basename(arg_directory));
1274                 if (!arg_machine) {
1275                         log_oom();
1276                         goto finish;
1277                 }
1278
1279                 hostname_cleanup(arg_machine, false);
1280                 if (isempty(arg_machine)) {
1281                         log_error("Failed to determine machine name automatically, please use -M.");
1282                         goto finish;
1283                 }
1284         }
1285
1286         if (geteuid() != 0) {
1287                 log_error("Need to be root.");
1288                 goto finish;
1289         }
1290
1291         if (sd_booted() <= 0) {
1292                 log_error("Not running on a systemd system.");
1293                 goto finish;
1294         }
1295
1296         if (path_equal(arg_directory, "/")) {
1297                 log_error("Spawning container on root directory not supported.");
1298                 goto finish;
1299         }
1300
1301         if (arg_boot && path_is_os_tree(arg_directory) <= 0) {
1302                 log_error("Directory %s doesn't look like an OS root directory (/etc/os-release is missing). Refusing.", arg_directory);
1303                 goto finish;
1304         }
1305
1306         log_close();
1307         n_fd_passed = sd_listen_fds(false);
1308         if (n_fd_passed > 0) {
1309                 k = fdset_new_listen_fds(&fds, false);
1310                 if (k < 0) {
1311                         log_error("Failed to collect file descriptors: %s", strerror(-k));
1312                         goto finish;
1313                 }
1314         }
1315         fdset_close_others(fds);
1316         log_open();
1317
1318         master = posix_openpt(O_RDWR|O_NOCTTY|O_CLOEXEC|O_NDELAY);
1319         if (master < 0) {
1320                 log_error("Failed to acquire pseudo tty: %m");
1321                 goto finish;
1322         }
1323
1324         console = ptsname(master);
1325         if (!console) {
1326                 log_error("Failed to determine tty name: %m");
1327                 goto finish;
1328         }
1329
1330         if (!arg_quiet)
1331                 log_info("Spawning container %s on %s. Press ^] three times within 1s to abort execution.", arg_machine, arg_directory);
1332
1333         if (unlockpt(master) < 0) {
1334                 log_error("Failed to unlock tty: %m");
1335                 goto finish;
1336         }
1337
1338
1339         if (access("/dev/kdbus/control", F_OK) >= 0) {
1340
1341                 if (arg_share_system) {
1342                         kdbus_domain = strdup("/dev/kdbus");
1343                         if (!kdbus_domain) {
1344                                 log_oom();
1345                                 goto finish;
1346                         }
1347                 } else {
1348                         const char *ns;
1349
1350                         ns = strappenda("machine-", arg_machine);
1351                         kdbus_fd = bus_kernel_create_domain(ns, &kdbus_domain);
1352                         if (r < 0)
1353                                 log_debug("Failed to create kdbus domain: %s", strerror(-r));
1354                         else
1355                                 log_debug("Successfully created kdbus domain as %s", kdbus_domain);
1356                 }
1357         }
1358
1359         if (socketpair(AF_UNIX, SOCK_DGRAM|SOCK_NONBLOCK|SOCK_CLOEXEC, 0, kmsg_socket_pair) < 0) {
1360                 log_error("Failed to create kmsg socket pair: %m");
1361                 goto finish;
1362         }
1363
1364         sd_notify(0, "READY=1");
1365
1366         assert_se(sigemptyset(&mask) == 0);
1367         sigset_add_many(&mask, SIGCHLD, SIGWINCH, SIGTERM, SIGINT, -1);
1368         assert_se(sigprocmask(SIG_BLOCK, &mask, NULL) == 0);
1369
1370         for (;;) {
1371                 siginfo_t status;
1372
1373                 sync_fd = eventfd(0, EFD_CLOEXEC);
1374                 if (sync_fd < 0) {
1375                         log_error("Failed to create event fd: %m");
1376                         goto finish;
1377                 }
1378
1379                 pid = syscall(__NR_clone,
1380                               SIGCHLD|CLONE_NEWNS|
1381                               (arg_share_system ? 0 : CLONE_NEWIPC|CLONE_NEWPID|CLONE_NEWUTS)|
1382                               (arg_private_network ? CLONE_NEWNET : 0), NULL);
1383                 if (pid < 0) {
1384                         if (errno == EINVAL)
1385                                 log_error("clone() failed, do you have namespace support enabled in your kernel? (You need UTS, IPC, PID and NET namespacing built in): %m");
1386                         else
1387                                 log_error("clone() failed: %m");
1388
1389                         goto finish;
1390                 }
1391
1392                 if (pid == 0) {
1393                         /* child */
1394                         const char *home = NULL;
1395                         uid_t uid = (uid_t) -1;
1396                         gid_t gid = (gid_t) -1;
1397                         unsigned n_env = 2;
1398                         const char *envp[] = {
1399                                 "PATH=" DEFAULT_PATH_SPLIT_USR,
1400                                 "container=systemd-nspawn", /* LXC sets container=lxc, so follow the scheme here */
1401                                 NULL, /* TERM */
1402                                 NULL, /* HOME */
1403                                 NULL, /* USER */
1404                                 NULL, /* LOGNAME */
1405                                 NULL, /* container_uuid */
1406                                 NULL, /* LISTEN_FDS */
1407                                 NULL, /* LISTEN_PID */
1408                                 NULL
1409                         };
1410                         char **env_use;
1411                         eventfd_t x;
1412
1413                         envp[n_env] = strv_find_prefix(environ, "TERM=");
1414                         if (envp[n_env])
1415                                 n_env ++;
1416
1417                         close_nointr_nofail(master);
1418                         master = -1;
1419
1420                         close_nointr(STDIN_FILENO);
1421                         close_nointr(STDOUT_FILENO);
1422                         close_nointr(STDERR_FILENO);
1423
1424                         close_nointr_nofail(kmsg_socket_pair[0]);
1425                         kmsg_socket_pair[0] = -1;
1426
1427                         reset_all_signal_handlers();
1428
1429                         assert_se(sigemptyset(&mask) == 0);
1430                         assert_se(sigprocmask(SIG_SETMASK, &mask, NULL) == 0);
1431
1432                         k = open_terminal(console, O_RDWR);
1433                         if (k != STDIN_FILENO) {
1434                                 if (k >= 0) {
1435                                         close_nointr_nofail(k);
1436                                         k = -EINVAL;
1437                                 }
1438
1439                                 log_error("Failed to open console: %s", strerror(-k));
1440                                 goto child_fail;
1441                         }
1442
1443                         if (dup2(STDIN_FILENO, STDOUT_FILENO) != STDOUT_FILENO ||
1444                             dup2(STDIN_FILENO, STDERR_FILENO) != STDERR_FILENO) {
1445                                 log_error("Failed to duplicate console: %m");
1446                                 goto child_fail;
1447                         }
1448
1449                         if (setsid() < 0) {
1450                                 log_error("setsid() failed: %m");
1451                                 goto child_fail;
1452                         }
1453
1454                         if (reset_audit_loginuid() < 0)
1455                                 goto child_fail;
1456
1457                         if (prctl(PR_SET_PDEATHSIG, SIGKILL) < 0) {
1458                                 log_error("PR_SET_PDEATHSIG failed: %m");
1459                                 goto child_fail;
1460                         }
1461
1462                         /* Mark everything as slave, so that we still
1463                          * receive mounts from the real root, but don't
1464                          * propagate mounts to the real root. */
1465                         if (mount(NULL, "/", NULL, MS_SLAVE|MS_REC, NULL) < 0) {
1466                                 log_error("MS_SLAVE|MS_REC failed: %m");
1467                                 goto child_fail;
1468                         }
1469
1470                         /* Turn directory into bind mount */
1471                         if (mount(arg_directory, arg_directory, "bind", MS_BIND|MS_REC, NULL) < 0) {
1472                                 log_error("Failed to make bind mount.");
1473                                 goto child_fail;
1474                         }
1475
1476                         if (arg_read_only)
1477                                 if (mount(arg_directory, arg_directory, "bind", MS_BIND|MS_REMOUNT|MS_RDONLY|MS_REC, NULL) < 0) {
1478                                         log_error("Failed to make read-only.");
1479                                         goto child_fail;
1480                                 }
1481
1482                         if (mount_all(arg_directory) < 0)
1483                                 goto child_fail;
1484
1485                         if (copy_devnodes(arg_directory) < 0)
1486                                 goto child_fail;
1487
1488                         if (setup_ptmx(arg_directory) < 0)
1489                                 goto child_fail;
1490
1491                         dev_setup(arg_directory);
1492
1493                         if (setup_dev_console(arg_directory, console) < 0)
1494                                 goto child_fail;
1495
1496                         if (setup_kmsg(arg_directory, kmsg_socket_pair[1]) < 0)
1497                                 goto child_fail;
1498
1499                         close_nointr_nofail(kmsg_socket_pair[1]);
1500                         kmsg_socket_pair[1] = -1;
1501
1502                         if (setup_boot_id(arg_directory) < 0)
1503                                 goto child_fail;
1504
1505                         if (setup_timezone(arg_directory) < 0)
1506                                 goto child_fail;
1507
1508                         if (setup_resolv_conf(arg_directory) < 0)
1509                                 goto child_fail;
1510
1511                         if (setup_journal(arg_directory) < 0)
1512                                 goto child_fail;
1513
1514                         if (mount_binds(arg_directory, arg_bind, 0) < 0)
1515                                 goto child_fail;
1516
1517                         if (mount_binds(arg_directory, arg_bind_ro, MS_RDONLY) < 0)
1518                                 goto child_fail;
1519
1520                         if (setup_kdbus(arg_directory, kdbus_domain) < 0)
1521                                 goto child_fail;
1522
1523                         if (chdir(arg_directory) < 0) {
1524                                 log_error("chdir(%s) failed: %m", arg_directory);
1525                                 goto child_fail;
1526                         }
1527
1528                         if (mount(arg_directory, "/", NULL, MS_MOVE, NULL) < 0) {
1529                                 log_error("mount(MS_MOVE) failed: %m");
1530                                 goto child_fail;
1531                         }
1532
1533                         if (chroot(".") < 0) {
1534                                 log_error("chroot() failed: %m");
1535                                 goto child_fail;
1536                         }
1537
1538                         if (chdir("/") < 0) {
1539                                 log_error("chdir() failed: %m");
1540                                 goto child_fail;
1541                         }
1542
1543                         umask(0022);
1544
1545                         if (arg_private_network)
1546                                 loopback_setup();
1547
1548                         if (drop_capabilities() < 0) {
1549                                 log_error("drop_capabilities() failed: %m");
1550                                 goto child_fail;
1551                         }
1552
1553                         if (arg_user) {
1554
1555                                 /* Note that this resolves user names
1556                                  * inside the container, and hence
1557                                  * accesses the NSS modules from the
1558                                  * container and not the host. This is
1559                                  * a bit weird... */
1560
1561                                 if (get_user_creds((const char**)&arg_user, &uid, &gid, &home, NULL) < 0) {
1562                                         log_error("get_user_creds() failed: %m");
1563                                         goto child_fail;
1564                                 }
1565
1566                                 if (mkdir_parents_label(home, 0775) < 0) {
1567                                         log_error("mkdir_parents_label() failed: %m");
1568                                         goto child_fail;
1569                                 }
1570
1571                                 if (mkdir_safe_label(home, 0775, uid, gid) < 0) {
1572                                         log_error("mkdir_safe_label() failed: %m");
1573                                         goto child_fail;
1574                                 }
1575
1576                                 if (initgroups((const char*)arg_user, gid) < 0) {
1577                                         log_error("initgroups() failed: %m");
1578                                         goto child_fail;
1579                                 }
1580
1581                                 if (setresgid(gid, gid, gid) < 0) {
1582                                         log_error("setregid() failed: %m");
1583                                         goto child_fail;
1584                                 }
1585
1586                                 if (setresuid(uid, uid, uid) < 0) {
1587                                         log_error("setreuid() failed: %m");
1588                                         goto child_fail;
1589                                 }
1590                         } else {
1591                                 /* Reset everything fully to 0, just in case */
1592
1593                                 if (setgroups(0, NULL) < 0) {
1594                                         log_error("setgroups() failed: %m");
1595                                         goto child_fail;
1596                                 }
1597
1598                                 if (setresgid(0, 0, 0) < 0) {
1599                                         log_error("setregid() failed: %m");
1600                                         goto child_fail;
1601                                 }
1602
1603                                 if (setresuid(0, 0, 0) < 0) {
1604                                         log_error("setreuid() failed: %m");
1605                                         goto child_fail;
1606                                 }
1607                         }
1608
1609                         if ((asprintf((char**)(envp + n_env++), "HOME=%s", home ? home: "/root") < 0) ||
1610                             (asprintf((char**)(envp + n_env++), "USER=%s", arg_user ? arg_user : "root") < 0) ||
1611                             (asprintf((char**)(envp + n_env++), "LOGNAME=%s", arg_user ? arg_user : "root") < 0)) {
1612                                 log_oom();
1613                                 goto child_fail;
1614                         }
1615
1616                         if (!sd_id128_equal(arg_uuid, SD_ID128_NULL)) {
1617                                 if (asprintf((char**)(envp + n_env++), "container_uuid=" SD_ID128_FORMAT_STR, SD_ID128_FORMAT_VAL(arg_uuid)) < 0) {
1618                                         log_oom();
1619                                         goto child_fail;
1620                                 }
1621                         }
1622
1623                         if (fdset_size(fds) > 0) {
1624                                 k = fdset_cloexec(fds, false);
1625                                 if (k < 0) {
1626                                         log_error("Failed to unset O_CLOEXEC for file descriptors.");
1627                                         goto child_fail;
1628                                 }
1629
1630                                 if ((asprintf((char **)(envp + n_env++), "LISTEN_FDS=%u", n_fd_passed) < 0) ||
1631                                     (asprintf((char **)(envp + n_env++), "LISTEN_PID=1") < 0)) {
1632                                         log_oom();
1633                                         goto child_fail;
1634                                 }
1635                         }
1636
1637                         setup_hostname();
1638
1639                         eventfd_read(sync_fd, &x);
1640                         close_nointr_nofail(sync_fd);
1641                         sync_fd = -1;
1642
1643                         if (!strv_isempty(arg_setenv)) {
1644                                 char **n;
1645
1646                                 n = strv_env_merge(2, envp, arg_setenv);
1647                                 if (!n) {
1648                                         log_oom();
1649                                         goto child_fail;
1650                                 }
1651
1652                                 env_use = n;
1653                         } else
1654                                 env_use = (char**) envp;
1655
1656 #ifdef HAVE_SELINUX
1657                         if (arg_selinux_context)
1658                                 if (setexeccon(arg_selinux_context) < 0)
1659                                         log_error("setexeccon(\"%s\") failed: %m", arg_selinux_context);
1660 #endif
1661                         if (arg_boot) {
1662                                 char **a;
1663                                 size_t l;
1664
1665                                 /* Automatically search for the init system */
1666
1667                                 l = 1 + argc - optind;
1668                                 a = newa(char*, l + 1);
1669                                 memcpy(a + 1, argv + optind, l * sizeof(char*));
1670
1671                                 a[0] = (char*) "/usr/lib/systemd/systemd";
1672                                 execve(a[0], a, env_use);
1673
1674                                 a[0] = (char*) "/lib/systemd/systemd";
1675                                 execve(a[0], a, env_use);
1676
1677                                 a[0] = (char*) "/sbin/init";
1678                                 execve(a[0], a, env_use);
1679                         } else if (argc > optind)
1680                                 execvpe(argv[optind], argv + optind, env_use);
1681                         else {
1682                                 chdir(home ? home : "/root");
1683                                 execle("/bin/bash", "-bash", NULL, env_use);
1684                         }
1685
1686                         log_error("execv() failed: %m");
1687
1688                 child_fail:
1689                         _exit(EXIT_FAILURE);
1690                 }
1691
1692                 fdset_free(fds);
1693                 fds = NULL;
1694
1695                 r = register_machine(pid);
1696                 if (r < 0)
1697                         goto finish;
1698
1699                 eventfd_write(sync_fd, 1);
1700                 close_nointr_nofail(sync_fd);
1701                 sync_fd = -1;
1702
1703                 k = process_pty(master, &mask, arg_boot ? pid : 0, SIGRTMIN+3);
1704                 if (k < 0) {
1705                         r = EXIT_FAILURE;
1706                         break;
1707                 }
1708
1709                 if (!arg_quiet)
1710                         putc('\n', stdout);
1711
1712                 /* Kill if it is not dead yet anyway */
1713                 terminate_machine(pid);
1714
1715                 /* Redundant, but better safe than sorry */
1716                 kill(pid, SIGKILL);
1717
1718                 k = wait_for_terminate(pid, &status);
1719                 pid = 0;
1720
1721                 if (k < 0) {
1722                         r = EXIT_FAILURE;
1723                         break;
1724                 }
1725
1726                 if (status.si_code == CLD_EXITED) {
1727                         r = status.si_status;
1728                         if (status.si_status != 0) {
1729                                 log_error("Container %s failed with error code %i.", arg_machine, status.si_status);
1730                                 break;
1731                         }
1732
1733                         if (!arg_quiet)
1734                                 log_debug("Container %s exited successfully.", arg_machine);
1735                         break;
1736                 } else if (status.si_code == CLD_KILLED &&
1737                            status.si_status == SIGINT) {
1738
1739                         if (!arg_quiet)
1740                                 log_info("Container %s has been shut down.", arg_machine);
1741                         r = 0;
1742                         break;
1743                 } else if (status.si_code == CLD_KILLED &&
1744                            status.si_status == SIGHUP) {
1745
1746                         if (!arg_quiet)
1747                                 log_info("Container %s is being rebooted.", arg_machine);
1748                         continue;
1749                 } else if (status.si_code == CLD_KILLED ||
1750                            status.si_code == CLD_DUMPED) {
1751
1752                         log_error("Container %s terminated by signal %s.", arg_machine, signal_to_string(status.si_status));
1753                         r = EXIT_FAILURE;
1754                         break;
1755                 } else {
1756                         log_error("Container %s failed due to unknown reason.", arg_machine);
1757                         r = EXIT_FAILURE;
1758                         break;
1759                 }
1760         }
1761
1762 finish:
1763         if (pid > 0)
1764                 kill(pid, SIGKILL);
1765
1766         free(arg_directory);
1767         free(arg_machine);
1768         free(arg_setenv);
1769
1770         return r;
1771 }