1 /*-*- Mode: C; c-basic-offset: 8; indent-tabs-mode: nil -*-*/
4 This file is part of systemd.
6 Copyright 2010 Lennart Poettering
8 systemd is free software; you can redistribute it and/or modify it
9 under the terms of the GNU Lesser General Public License as published by
10 the Free Software Foundation; either version 2.1 of the License, or
11 (at your option) any later version.
13 systemd is distributed in the hope that it will be useful, but
14 WITHOUT ANY WARRANTY; without even the implied warranty of
15 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
16 Lesser General Public License for more details.
18 You should have received a copy of the GNU Lesser General Public License
19 along with systemd; If not, see <http://www.gnu.org/licenses/>.
25 #include <sys/types.h>
26 #include <sys/syscall.h>
27 #include <sys/mount.h>
33 #include <sys/prctl.h>
34 #include <sys/capability.h>
37 #include <sys/signalfd.h>
41 #include <sys/socket.h>
42 #include <linux/netlink.h>
43 #include <sys/eventfd.h>
45 #include <linux/veth.h>
46 #include <sys/personality.h>
47 #include <linux/loop.h>
50 #include <selinux/selinux.h>
58 #include <blkid/blkid.h>
61 #include "sd-daemon.h"
71 #include "cgroup-util.h"
73 #include "path-util.h"
74 #include "loopback-setup.h"
75 #include "dev-setup.h"
80 #include "bus-error.h"
82 #include "bus-kernel.h"
85 #include "rtnl-util.h"
86 #include "udev-util.h"
87 #include "eventfd-util.h"
88 #include "blkid-util.h"
90 #include "siphash24.h"
92 #include "base-filesystem.h"
95 #include "seccomp-util.h"
98 typedef enum ContainerStatus {
103 typedef enum LinkJournal {
/* File-scope settings for the container. The ones whose assignment is visible
 * in parse_argv() are filled in from the command line there. */
110 static char *arg_directory = NULL;
111 static char *arg_user = NULL;
112 static sd_id128_t arg_uuid = {};
113 static char *arg_machine = NULL;
114 static const char *arg_selinux_context = NULL;
115 static const char *arg_selinux_apifs_context = NULL;
116 static const char *arg_slice = NULL;
117 static bool arg_private_network = false;
118 static bool arg_read_only = false;
119 static bool arg_boot = false;
120 static LinkJournal arg_link_journal = LINK_AUTO;
/* Capabilities retained by default. parse_argv() ORs in the --capability=
 * bits (plus CAP_NET_ADMIN when a private network is requested) and masks out
 * the --drop-capability= bits; drop_capabilities() passes the complement of
 * this mask to capability_bounding_set_drop(). */
121 static uint64_t arg_retain =
122 (1ULL << CAP_CHOWN) |
123 (1ULL << CAP_DAC_OVERRIDE) |
124 (1ULL << CAP_DAC_READ_SEARCH) |
125 (1ULL << CAP_FOWNER) |
126 (1ULL << CAP_FSETID) |
127 (1ULL << CAP_IPC_OWNER) |
129 (1ULL << CAP_LEASE) |
130 (1ULL << CAP_LINUX_IMMUTABLE) |
131 (1ULL << CAP_NET_BIND_SERVICE) |
132 (1ULL << CAP_NET_BROADCAST) |
133 (1ULL << CAP_NET_RAW) |
134 (1ULL << CAP_SETGID) |
135 (1ULL << CAP_SETFCAP) |
136 (1ULL << CAP_SETPCAP) |
137 (1ULL << CAP_SETUID) |
138 (1ULL << CAP_SYS_ADMIN) |
139 (1ULL << CAP_SYS_CHROOT) |
140 (1ULL << CAP_SYS_NICE) |
141 (1ULL << CAP_SYS_PTRACE) |
142 (1ULL << CAP_SYS_TTY_CONFIG) |
143 (1ULL << CAP_SYS_RESOURCE) |
144 (1ULL << CAP_SYS_BOOT) |
145 (1ULL << CAP_AUDIT_WRITE) |
146 (1ULL << CAP_AUDIT_CONTROL) |
/* arg_bind, arg_bind_ro and arg_tmpfs are filled pairwise by parse_argv()
 * (source/destination for binds, path/options for tmpfs); mount_tmpfs()
 * walks arg_tmpfs with STRV_FOREACH_PAIR. */
148 static char **arg_bind = NULL;
149 static char **arg_bind_ro = NULL;
150 static char **arg_tmpfs = NULL;
151 static char **arg_setenv = NULL;
152 static bool arg_quiet = false;
153 static bool arg_share_system = false;
154 static bool arg_register = true;
155 static bool arg_keep_unit = false;
156 static char **arg_network_interfaces = NULL;
157 static char **arg_network_macvlan = NULL;
158 static bool arg_network_veth = false;
159 static const char *arg_network_bridge = NULL;
/* 0xffffffffLU is the same value parse_argv() treats as "unknown or
 * unsupported" when it comes back from personality_from_string(). */
160 static unsigned long arg_personality = 0xffffffffLU;
161 static const char *arg_image = NULL;
/* Print the command-line usage summary to stdout, substituting
 * program_invocation_short_name for the program name.
 *
 * The -j entry is documented as --link-journal=guest, because LINK_GUEST is
 * what parse_argv() assigns outside of the --link-journal= handling. */
163 static int help(void) {
165 printf("%s [OPTIONS...] [PATH] [ARGUMENTS...]\n\n"
166 "Spawn a minimal namespace container for debugging, testing and building.\n\n"
167 " -h --help Show this help\n"
168 " --version Print version string\n"
169 " -q --quiet Do not show status information\n"
170 " -D --directory=PATH Root directory for the container\n"
171 " -i --image=PATH File system device or image for the container\n"
172 " -b --boot Boot up full system (i.e. invoke init)\n"
173 " -u --user=USER Run the command under specified user or uid\n"
174 " -M --machine=NAME Set the machine name for the container\n"
175 " --uuid=UUID Set a specific machine UUID for the container\n"
176 " -S --slice=SLICE Place the container in the specified slice\n"
177 " --private-network Disable network in container\n"
178 " --network-interface=INTERFACE\n"
179 " Assign an existing network interface to the\n"
181 " --network-macvlan=INTERFACE\n"
182 " Create a macvlan network interface based on an\n"
183 " existing network interface to the container\n"
184 " --network-veth Add a virtual ethernet connection between host\n"
186 " --network-bridge=INTERFACE\n"
187 " Add a virtual ethernet connection between host\n"
188 " and container and add it to an existing bridge on\n"
190 " -Z --selinux-context=SECLABEL\n"
191 " Set the SELinux security context to be used by\n"
192 " processes in the container\n"
193 " -L --selinux-apifs-context=SECLABEL\n"
194 " Set the SELinux security context to be used by\n"
195 " API/tmpfs file systems in the container\n"
196 " --capability=CAP In addition to the default, retain specified\n"
198 " --drop-capability=CAP Drop the specified capability from the default set\n"
199 " --link-journal=MODE Link up guest journal, one of no, auto, guest, host\n"
200 " -j Equivalent to --link-journal=guest\n"
201 " --read-only Mount the root directory read-only\n"
202 " --bind=PATH[:PATH] Bind mount a file or directory from the host into\n"
204 " --bind-ro=PATH[:PATH] Similar, but creates a read-only bind mount\n"
205 " --tmpfs=PATH:[OPTIONS] Mount an empty tmpfs to the specified directory\n"
206 " --setenv=NAME=VALUE Pass an environment variable to PID 1\n"
207 " --share-system Share system namespaces with host\n"
208 " --register=BOOLEAN Register container as machine\n"
209 " --keep-unit Do not register a scope for the machine, reuse\n"
210 " the service unit nspawn is running in\n",
211 program_invocation_short_name);
/* Parse the command line with getopt_long() and store the results in the
 * file-scope arg_* settings. Capability additions and removals are collected
 * in the local masks plus/minus and applied to arg_retain at the end. */
216 static int parse_argv(int argc, char *argv[]) {
233 ARG_NETWORK_INTERFACE,
240 static const struct option options[] = {
241 { "help", no_argument, NULL, 'h' },
242 { "version", no_argument, NULL, ARG_VERSION },
243 { "directory", required_argument, NULL, 'D' },
244 { "user", required_argument, NULL, 'u' },
245 { "private-network", no_argument, NULL, ARG_PRIVATE_NETWORK },
246 { "boot", no_argument, NULL, 'b' },
247 { "uuid", required_argument, NULL, ARG_UUID },
248 { "read-only", no_argument, NULL, ARG_READ_ONLY },
249 { "capability", required_argument, NULL, ARG_CAPABILITY },
250 { "drop-capability", required_argument, NULL, ARG_DROP_CAPABILITY },
251 { "link-journal", required_argument, NULL, ARG_LINK_JOURNAL },
252 { "bind", required_argument, NULL, ARG_BIND },
253 { "bind-ro", required_argument, NULL, ARG_BIND_RO },
254 { "tmpfs", required_argument, NULL, ARG_TMPFS },
255 { "machine", required_argument, NULL, 'M' },
256 { "slice", required_argument, NULL, 'S' },
257 { "setenv", required_argument, NULL, ARG_SETENV },
258 { "selinux-context", required_argument, NULL, 'Z' },
259 { "selinux-apifs-context", required_argument, NULL, 'L' },
260 { "quiet", no_argument, NULL, 'q' },
261 { "share-system", no_argument, NULL, ARG_SHARE_SYSTEM },
262 { "register", required_argument, NULL, ARG_REGISTER },
263 { "keep-unit", no_argument, NULL, ARG_KEEP_UNIT },
264 { "network-interface", required_argument, NULL, ARG_NETWORK_INTERFACE },
265 { "network-macvlan", required_argument, NULL, ARG_NETWORK_MACVLAN },
266 { "network-veth", no_argument, NULL, ARG_NETWORK_VETH },
267 { "network-bridge", required_argument, NULL, ARG_NETWORK_BRIDGE },
268 { "personality", required_argument, NULL, ARG_PERSONALITY },
269 { "image", required_argument, NULL, 'i' },
274 uint64_t plus = 0, minus = 0;
279 while ((c = getopt_long(argc, argv, "+hD:u:bL:M:jS:Z:qi:", options, NULL)) >= 0) {
287 puts(PACKAGE_STRING);
288 puts(SYSTEMD_FEATURES);
293 arg_directory = canonicalize_file_name(optarg);
294 if (!arg_directory) {
295 log_error("Invalid root directory: %m");
307 arg_user = strdup(optarg);
313 case ARG_NETWORK_BRIDGE:
314 arg_network_bridge = optarg;
318 case ARG_NETWORK_VETH:
319 arg_network_veth = true;
320 arg_private_network = true;
323 case ARG_NETWORK_INTERFACE:
324 if (strv_extend(&arg_network_interfaces, optarg) < 0)
327 arg_private_network = true;
330 case ARG_NETWORK_MACVLAN:
331 if (strv_extend(&arg_network_macvlan, optarg) < 0)
336 case ARG_PRIVATE_NETWORK:
337 arg_private_network = true;
345 r = sd_id128_from_string(optarg, &arg_uuid);
347 log_error("Invalid UUID: %s", optarg);
357 if (isempty(optarg)) {
362 if (!hostname_is_valid(optarg)) {
363 log_error("Invalid machine name: %s", optarg);
368 arg_machine = strdup(optarg);
376 arg_selinux_context = optarg;
380 arg_selinux_apifs_context = optarg;
384 arg_read_only = true;
388 case ARG_DROP_CAPABILITY: {
392 FOREACH_WORD_SEPARATOR(word, length, optarg, ",", state) {
393 _cleanup_free_ char *t;
396 t = strndup(word, length);
400 if (streq(t, "all")) {
401 if (c == ARG_CAPABILITY)
402 plus = (uint64_t) -1;
404 minus = (uint64_t) -1;
406 if (cap_from_name(t, &cap) < 0) {
407 log_error("Failed to parse capability %s.", t);
411 if (c == ARG_CAPABILITY)
412 plus |= 1ULL << (uint64_t) cap;
414 minus |= 1ULL << (uint64_t) cap;
422 arg_link_journal = LINK_GUEST;
425 case ARG_LINK_JOURNAL:
426 if (streq(optarg, "auto"))
427 arg_link_journal = LINK_AUTO;
428 else if (streq(optarg, "no"))
429 arg_link_journal = LINK_NO;
430 else if (streq(optarg, "guest"))
431 arg_link_journal = LINK_GUEST;
432 else if (streq(optarg, "host"))
433 arg_link_journal = LINK_HOST;
435 log_error("Failed to parse link journal mode %s", optarg);
443 _cleanup_free_ char *a = NULL, *b = NULL;
447 x = c == ARG_BIND ? &arg_bind : &arg_bind_ro;
449 e = strchr(optarg, ':');
451 a = strndup(optarg, e - optarg);
461 if (!path_is_absolute(a) || !path_is_absolute(b)) {
462 log_error("Invalid bind mount specification: %s", optarg);
466 r = strv_extend(x, a);
470 r = strv_extend(x, b);
478 _cleanup_free_ char *a = NULL, *b = NULL;
481 e = strchr(optarg, ':');
483 a = strndup(optarg, e - optarg);
487 b = strdup("mode=0755");
493 if (!path_is_absolute(a)) {
494 log_error("Invalid tmpfs specification: %s", optarg);
498 r = strv_push(&arg_tmpfs, a);
504 r = strv_push(&arg_tmpfs, b);
516 if (!env_assignment_is_valid(optarg)) {
517 log_error("Environment variable assignment '%s' is not valid.", optarg);
521 n = strv_env_set(arg_setenv, optarg);
525 strv_free(arg_setenv);
534 case ARG_SHARE_SYSTEM:
535 arg_share_system = true;
539 r = parse_boolean(optarg);
541 log_error("Failed to parse --register= argument: %s", optarg);
549 arg_keep_unit = true;
552 case ARG_PERSONALITY:
554 arg_personality = personality_from_string(optarg);
555 if (arg_personality == 0xffffffffLU) {
556 log_error("Unknown or unsupported personality '%s'.", optarg);
566 assert_not_reached("Unhandled option");
/* Sharing the system namespaces with the host turns machine registration off. */
570 if (arg_share_system)
571 arg_register = false;
573 if (arg_boot && arg_share_system) {
574 log_error("--boot and --share-system may not be combined.");
578 if (arg_keep_unit && cg_pid_get_owner_uid(0, NULL) >= 0) {
579 log_error("--keep-unit may not be used when invoked from a user session.");
583 if (arg_directory && arg_image) {
584 log_error("--directory= and --image= may not be combined.");
/* Final retain mask: defaults plus --capability= bits (and CAP_NET_ADMIN when
 * a private network is used), minus everything named by --drop-capability=. */
588 arg_retain = (arg_retain | plus | (arg_private_network ? 1ULL << CAP_NET_ADMIN : 0)) & ~minus;
/* Mount the entries of mount_table below the container root `dest`.
 * For each entry the target path is dest + "/" + entry.where. When
 * arg_selinux_apifs_context is set, tmpfs and devpts entries get a
 * context="..." mount option appended to their option string. */
593 static int mount_all(const char *dest) {
595 typedef struct MountPoint {
604 static const MountPoint mount_table[] = {
605 { "proc", "/proc", "proc", NULL, MS_NOSUID|MS_NOEXEC|MS_NODEV, true },
606 { "/proc/sys", "/proc/sys", NULL, NULL, MS_BIND, true }, /* Bind mount first */
607 { NULL, "/proc/sys", NULL, NULL, MS_BIND|MS_RDONLY|MS_REMOUNT, true }, /* Then, make it r/o */
608 { "sysfs", "/sys", "sysfs", NULL, MS_RDONLY|MS_NOSUID|MS_NOEXEC|MS_NODEV, true },
609 { "tmpfs", "/dev", "tmpfs", "mode=755", MS_NOSUID|MS_STRICTATIME, true },
610 { "devpts", "/dev/pts", "devpts","newinstance,ptmxmode=0666,mode=620,gid=" STRINGIFY(TTY_GID), MS_NOSUID|MS_NOEXEC, true },
611 { "tmpfs", "/dev/shm", "tmpfs", "mode=1777", MS_NOSUID|MS_NODEV|MS_STRICTATIME, true },
612 { "tmpfs", "/run", "tmpfs", "mode=755", MS_NOSUID|MS_NODEV|MS_STRICTATIME, true },
614 { "/sys/fs/selinux", "/sys/fs/selinux", NULL, NULL, MS_BIND, false }, /* Bind mount first */
615 { NULL, "/sys/fs/selinux", NULL, NULL, MS_BIND|MS_RDONLY|MS_REMOUNT, false }, /* Then, make it r/o */
622 for (k = 0; k < ELEMENTSOF(mount_table); k++) {
623 _cleanup_free_ char *where = NULL;
625 _cleanup_free_ char *options = NULL;
630 where = strjoin(dest, "/", mount_table[k].where, NULL);
634 t = path_is_mount_point(where, true);
636 log_error("Failed to detect whether %s is a mount point: %s", where, strerror(-t));
644 /* Skip this entry if it is not a remount. */
645 if (mount_table[k].what && t > 0)
648 mkdir_p(where, 0755);
651 if (arg_selinux_apifs_context &&
652 (streq_ptr(mount_table[k].what, "tmpfs") || streq_ptr(mount_table[k].what, "devpts"))) {
653 options = strjoin(mount_table[k].options, ",context=\"", arg_selinux_apifs_context, "\"", NULL);
660 o = mount_table[k].options;
663 if (mount(mount_table[k].what,
666 mount_table[k].flags,
668 mount_table[k].fatal) {
670 log_error("mount(%s) failed: %m", where);
/* Bind mount every (source, destination) pair of `l` into the container.
 * The mount target is `dest` with the pair's destination path appended.
 * If the target exists its file type must match the source; if it does not
 * exist a matching mount point is created first.
 * NOTE(review): the read-only remount presumably only runs when `ro` is
 * set -- confirm against the full function. */
680 static int mount_binds(const char *dest, char **l, bool ro) {
683 STRV_FOREACH_PAIR(x, y, l) {
684 _cleanup_free_ char *where = NULL;
685 struct stat source_st, dest_st;
688 if (stat(*x, &source_st) < 0) {
689 log_error("Failed to stat %s: %m", *x);
693 where = strappend(dest, *y);
697 r = stat(where, &dest_st);
699 if ((source_st.st_mode & S_IFMT) != (dest_st.st_mode & S_IFMT)) {
700 log_error("The file types of %s and %s do not match. Refusing bind mount", *x, where);
703 } else if (errno == ENOENT) {
704 r = mkdir_parents_label(where, 0755);
706 log_error("Failed to bind mount %s: %s", *x, strerror(-r));
710 log_error("Failed to bind mount %s: %m", *x);
714 /* Create the mount point, but be conservative -- refuse to create block
715 * and char devices. */
716 if (S_ISDIR(source_st.st_mode))
717 mkdir_label(where, 0755);
718 else if (S_ISFIFO(source_st.st_mode))
720 else if (S_ISSOCK(source_st.st_mode))
721 mknod(where, 0644 | S_IFSOCK, 0);
722 else if (S_ISREG(source_st.st_mode))
725 log_error("Refusing to create mountpoint for file: %s", *x);
729 if (mount(*x, where, "bind", MS_BIND, NULL) < 0) {
730 log_error("mount(%s) failed: %m", where);
735 r = bind_remount_recursive(where, true);
737 log_error("Read-Only bind mount failed: %s", strerror(-r));
/* Mount a tmpfs for every (path, options) pair in arg_tmpfs. The mount point
 * is `dest` with the path appended; it is created with mkdir_label() and the
 * pair's second element is passed to mount() as the option string. */
746 static int mount_tmpfs(const char *dest) {
749 STRV_FOREACH_PAIR(i, o, arg_tmpfs) {
750 _cleanup_free_ char *where = NULL;
752 where = strappend(dest, *i);
756 mkdir_label(where, 0755);
758 if (mount("tmpfs", where, "tmpfs", MS_NODEV|MS_STRICTATIME, *o) < 0) {
759 log_error("tmpfs mount to %s failed: %m", where);
/* Point the container's /etc/localtime at the same zone as the host's.
 * The host link target is read from /etc/localtime and must lead into
 * /usr/share/zoneinfo/ (relative or absolute form); the zone name `z` is the
 * remainder. Nothing is changed if the container link already names the same
 * zone, or if the zone file is missing below `dest`. */
767 static int setup_timezone(const char *dest) {
768 _cleanup_free_ char *where = NULL, *p = NULL, *q = NULL, *check = NULL, *what = NULL;
774 /* Fix the timezone, if possible */
775 r = readlink_malloc("/etc/localtime", &p);
777 log_warning("/etc/localtime is not a symlink, not updating container timezone.");
781 z = path_startswith(p, "../usr/share/zoneinfo/");
783 z = path_startswith(p, "/usr/share/zoneinfo/");
785 log_warning("/etc/localtime does not point into /usr/share/zoneinfo/, not updating container timezone.");
789 where = strappend(dest, "/etc/localtime");
793 r = readlink_malloc(where, &q);
795 y = path_startswith(q, "../usr/share/zoneinfo/");
797 y = path_startswith(q, "/usr/share/zoneinfo/");
800 /* Already pointing to the right place? Then do nothing .. */
801 if (y && streq(y, z))
805 check = strjoin(dest, "/usr/share/zoneinfo/", z, NULL);
809 if (access(check, F_OK) < 0) {
810 log_warning("Timezone %s does not exist in container, not updating container timezone.", z);
/* The new link is always written in the relative "../usr/share/zoneinfo/" form. */
814 what = strappend("../usr/share/zoneinfo/", z);
819 if (symlink(what, where) < 0) {
820 log_error("Failed to correct timezone of container: %m");
/* Copy the host's /etc/resolv.conf to the same path below `dest`.
 * NOTE(review): the arg_private_network check presumably makes this a no-op
 * for containers with a private network -- confirm against the full function. */
827 static int setup_resolv_conf(const char *dest) {
828 _cleanup_free_ char *where = NULL;
832 if (arg_private_network)
835 /* Fix resolv.conf, if possible */
836 where = strappend(dest, "/etc/resolv.conf");
840 /* The result is deliberately ignored: copying resolv.conf is
841 * best effort, and a failure here is not treated as an error. */
842 copy_file("/etc/resolv.conf", where, O_TRUNC|O_NOFOLLOW, 0644);
/* Format the 128-bit `id` as lowercase hex in the dashed 8-4-4-4-12 UUID
 * layout (36 characters), as given by the format string below.
 * NOTE(review): the result is presumably written into `s` (37 bytes, i.e.
 * 36 characters plus NUL) and `s` returned -- confirm. */
847 static char* id128_format_as_uuid(sd_id128_t id, char s[37]) {
850 "%02x%02x%02x%02x-%02x%02x-%02x%02x-%02x%02x-%02x%02x%02x%02x%02x%02x",
851 SD_ID128_FORMAT_VAL(id));
/* Give the container its own boot ID: a random 128-bit ID is formatted as a
 * UUID, written to <dest>/dev/proc-sys-kernel-random-boot-id and that file is
 * bind mounted over <dest>/proc/sys/kernel/random/boot_id, then remounted
 * read-only (a failure of the remount only produces a warning). */
856 static int setup_boot_id(const char *dest) {
857 _cleanup_free_ char *from = NULL, *to = NULL;
864 if (arg_share_system)
867 /* Generate a new randomized boot ID, so that each boot-up of
868 * the container gets a new one */
870 from = strappend(dest, "/dev/proc-sys-kernel-random-boot-id");
871 to = strappend(dest, "/proc/sys/kernel/random/boot_id");
875 r = sd_id128_randomize(&rnd);
877 log_error("Failed to generate random boot id: %s", strerror(-r));
881 id128_format_as_uuid(rnd, as_uuid);
883 r = write_string_file(from, as_uuid);
885 log_error("Failed to write boot id: %s", strerror(-r));
889 if (mount(from, to, "bind", MS_BIND, NULL) < 0) {
890 log_error("Failed to bind mount boot id: %m");
892 } else if (mount(from, to, "bind", MS_BIND|MS_REMOUNT|MS_RDONLY, NULL))
893 log_warning("Failed to make boot id read-only: %m");
/* Recreate the device nodes listed in `devnodes` below <dest>/dev/.
 * Each host node /dev/<name> is stat()ed; a missing node (ENOENT) is
 * tolerated, anything that is neither a character nor a block device is
 * rejected, and otherwise a node with the same mode and device number is
 * created with mknod() at <dest>/dev/<name>. */
899 static int copy_devnodes(const char *dest) {
901 static const char devnodes[] =
911 _cleanup_umask_ mode_t u;
917 NULSTR_FOREACH(d, devnodes) {
918 _cleanup_free_ char *from = NULL, *to = NULL;
921 from = strappend("/dev/", d);
922 to = strjoin(dest, "/dev/", d, NULL);
926 if (stat(from, &st) < 0) {
928 if (errno != ENOENT) {
929 log_error("Failed to stat %s: %m", from);
933 } else if (!S_ISCHR(st.st_mode) && !S_ISBLK(st.st_mode)) {
935 log_error("%s is not a char or block device, cannot copy", from);
938 } else if (mknod(to, st.st_mode, st.st_rdev) < 0) {
/* Report the node path that mknod() was actually called with, not the
 * container root. */
940 log_error("mknod(%s) failed: %m", to);
/* Create <dest>/dev/ptmx as a symlink to the relative target "pts/ptmx". */
948 static int setup_ptmx(const char *dest) {
949 _cleanup_free_ char *p = NULL;
951 p = strappend(dest, "/dev/ptmx");
955 if (symlink("pts/ptmx", p) < 0) {
956 log_error("Failed to create /dev/ptmx symlink: %m");
/* Make the TTY at path `console` available as <dest>/dev/console.
 * The TTY is first set to mode 0600, owner 0:0. A placeholder device node is
 * then created at <dest>/dev/console (using the device number of the host's
 * /dev/null) and `console` is bind mounted on top of it. */
963 static int setup_dev_console(const char *dest, const char *console) {
964 _cleanup_umask_ mode_t u;
974 if (stat("/dev/null", &st) < 0) {
975 log_error("Failed to stat /dev/null: %m");
979 r = chmod_and_chown(console, 0600, 0, 0);
981 log_error("Failed to correct access mode for TTY: %s", strerror(-r));
985 /* We need to bind mount the right tty to /dev/console since
986 * ptys can only exist on pts file systems. To have something
987 * to bind mount things on we create a device node first, and
988 * use /dev/null for that since the cgroups device policy
989 * allows us to create that freely, while we cannot create
990 * /dev/console. (Note that the major minor doesn't actually
991 * matter here, since we mount it over anyway). */
993 to = strappenda(dest, "/dev/console");
994 if (mknod(to, (st.st_mode & ~07777) | 0600, st.st_rdev) < 0) {
995 log_error("mknod() for /dev/console failed: %m");
999 if (mount(console, to, "bind", MS_BIND, NULL) < 0) {
1000 log_error("Bind mount for /dev/console failed: %m");
/* Provide the container with a /proc/kmsg backed by a FIFO.
 * A FIFO is created at <dest>/dev/kmsg, bind mounted to <dest>/proc/kmsg,
 * opened, and the open file descriptor is sent as SCM_RIGHTS ancillary data
 * over `kmsg_socket`, which must be a valid descriptor (>= 0). */
1007 static int setup_kmsg(const char *dest, int kmsg_socket) {
1008 _cleanup_free_ char *from = NULL, *to = NULL;
1010 _cleanup_umask_ mode_t u;
1012 struct cmsghdr cmsghdr;
1013 uint8_t buf[CMSG_SPACE(sizeof(int))];
1015 struct msghdr mh = {
1016 .msg_control = &control,
1017 .msg_controllen = sizeof(control),
1019 struct cmsghdr *cmsg;
1022 assert(kmsg_socket >= 0);
1026 /* We create the kmsg FIFO as /dev/kmsg, but immediately
1027 * delete it after bind mounting it to /proc/kmsg. While FIFOs
1028 * on the reading side behave very similar to /proc/kmsg,
1029 * their writing side behaves differently from /dev/kmsg in
1030 * that writing blocks when nothing is reading. In order to
1031 * avoid any problems with containers deadlocking due to this
1032 * we simply make /dev/kmsg unavailable to the container. */
1033 if (asprintf(&from, "%s/dev/kmsg", dest) < 0 ||
1034 asprintf(&to, "%s/proc/kmsg", dest) < 0)
1037 if (mkfifo(from, 0600) < 0) {
1038 log_error("mkfifo() for /dev/kmsg failed: %m");
1042 r = chmod_and_chown(from, 0600, 0, 0);
1044 log_error("Failed to correct access mode for /dev/kmsg: %s", strerror(-r));
1048 if (mount(from, to, "bind", MS_BIND, NULL) < 0) {
1049 log_error("Bind mount for /proc/kmsg failed: %m");
1053 fd = open(from, O_RDWR|O_NDELAY|O_CLOEXEC);
1055 log_error("Failed to open fifo: %m");
/* Build a single SCM_RIGHTS control message carrying the FIFO descriptor. */
1059 cmsg = CMSG_FIRSTHDR(&mh);
1060 cmsg->cmsg_level = SOL_SOCKET;
1061 cmsg->cmsg_type = SCM_RIGHTS;
1062 cmsg->cmsg_len = CMSG_LEN(sizeof(int));
1063 memcpy(CMSG_DATA(cmsg), &fd, sizeof(int));
1065 mh.msg_controllen = cmsg->cmsg_len;
1067 /* Store away the fd in the socket, so that it stays open as
1068 * long as we run the child */
1069 k = sendmsg(kmsg_socket, &mh, MSG_DONTWAIT|MSG_NOSIGNAL);
1073 log_error("Failed to send FIFO fd: %m");
1077 /* And now make the FIFO unavailable as /dev/kmsg... */
/* Set the hostname to arg_machine via sethostname().
 * NOTE(review): the arg_share_system check presumably skips this when the
 * system namespaces are shared with the host -- confirm. */
1082 static int setup_hostname(void) {
1084 if (arg_share_system)
1087 if (sethostname(arg_machine, strlen(arg_machine)) < 0)
/* Connect the container's journal directory with the host according to
 * arg_link_journal.
 *
 * The container's machine ID is read from <directory>/etc/machine-id and
 * must differ from the host's. `p` is then the host-side directory
 * /var/log/journal/<id> and `q` the container-side
 * <directory>/var/log/journal/<id>. With LINK_GUEST the host path becomes a
 * symlink to `q`; the final step visible here bind mounts `p` onto `q`.
 * NOTE(review): `id` presumably is the stripped content of the machine-id
 * file read into `b` -- confirm against the full function. */
1093 static int setup_journal(const char *directory) {
1094 sd_id128_t machine_id, this_id;
1095 _cleanup_free_ char *p = NULL, *b = NULL, *q = NULL, *d = NULL;
1099 p = strappend(directory, "/etc/machine-id");
1103 r = read_one_line_file(p, &b);
1104 if (r == -ENOENT && arg_link_journal == LINK_AUTO)
1107 log_error("Failed to read machine ID from %s: %s", p, strerror(-r));
1112 if (isempty(id) && arg_link_journal == LINK_AUTO)
1115 /* Verify validity */
1116 r = sd_id128_from_string(id, &machine_id);
1118 log_error("Failed to parse machine ID from %s: %s", p, strerror(-r));
1122 r = sd_id128_get_machine(&this_id);
1124 log_error("Failed to retrieve machine ID: %s", strerror(-r));
/* In LINK_AUTO mode identical IDs are only reported at warning level. */
1128 if (sd_id128_equal(machine_id, this_id)) {
1129 log_full(arg_link_journal == LINK_AUTO ? LOG_WARNING : LOG_ERR,
1130 "Host and machine ids are equal (%s): refusing to link journals", id);
1131 if (arg_link_journal == LINK_AUTO)
1137 if (arg_link_journal == LINK_NO)
1141 p = strappend("/var/log/journal/", id);
1142 q = strjoin(directory, "/var/log/journal/", id, NULL);
1146 if (path_is_mount_point(p, false) > 0) {
1147 if (arg_link_journal != LINK_AUTO) {
1148 log_error("%s: already a mount point, refusing to use for journal", p);
1155 if (path_is_mount_point(q, false) > 0) {
1156 if (arg_link_journal != LINK_AUTO) {
1157 log_error("%s: already a mount point, refusing to use for journal", q);
1164 r = readlink_and_make_absolute(p, &d);
1166 if ((arg_link_journal == LINK_GUEST ||
1167 arg_link_journal == LINK_AUTO) &&
1170 r = mkdir_p(q, 0755);
1172 log_warning("failed to create directory %s: %m", q);
1176 if (unlink(p) < 0) {
1177 log_error("Failed to remove symlink %s: %m", p);
1180 } else if (r == -EINVAL) {
1182 if (arg_link_journal == LINK_GUEST &&
1185 if (errno == ENOTDIR) {
1186 log_error("%s already exists and is neither a symlink nor a directory", p);
1189 log_error("Failed to remove %s: %m", p);
1193 } else if (r != -ENOENT) {
1194 log_error("readlink(%s) failed: %m", p);
1198 if (arg_link_journal == LINK_GUEST) {
1200 if (symlink(q, p) < 0) {
1201 log_error("Failed to symlink %s to %s: %m", q, p);
1205 r = mkdir_p(q, 0755);
1207 log_warning("failed to create directory %s: %m", q);
1211 if (arg_link_journal == LINK_HOST) {
1212 r = mkdir_p(p, 0755);
1214 log_error("Failed to create %s: %m", p);
1218 } else if (access(p, F_OK) < 0)
1221 if (dir_is_empty(q) == 0)
1222 log_warning("%s is not empty, proceeding anyway.", q);
1224 r = mkdir_p(q, 0755);
1226 log_error("Failed to create %s: %m", q);
1230 if (mount(p, q, "bind", MS_BIND, NULL) < 0) {
1231 log_error("Failed to bind mount journal from host into guest: %m");
/* Create the directory <dest>/dev/kdbus and bind mount `path` onto it. */
1238 static int setup_kdbus(const char *dest, const char *path) {
1244 p = strappenda(dest, "/dev/kdbus");
1245 if (mkdir(p, 0755) < 0) {
1246 log_error("Failed to create kdbus path: %m");
1250 if (mount(path, p, "bind", MS_BIND, NULL) < 0) {
1251 log_error("Failed to mount kdbus domain path: %m");
/* Drop every capability that is not in arg_retain: the complement of the
 * retain mask is handed to capability_bounding_set_drop(), whose result is
 * returned unchanged. */
1258 static int drop_capabilities(void) {
1259 return capability_bounding_set_drop(~arg_retain, false);
/* Register the container with org.freedesktop.machine1.Manager on the
 * system bus.
 *
 * With arg_keep_unit a single sd_bus_call_method() is issued. Otherwise the
 * message is assembled by hand so that an array of (name, value) properties
 * can be appended: Slice (only if arg_slice is non-empty), DevicePolicy
 * "strict" and a DeviceAllow list of device nodes the container may use.
 * Both variants pass arg_uuid and arg_directory (empty string if unset). */
1262 static int register_machine(pid_t pid) {
1263 _cleanup_bus_error_free_ sd_bus_error error = SD_BUS_ERROR_NULL;
1264 _cleanup_bus_unref_ sd_bus *bus = NULL;
1270 r = sd_bus_default_system(&bus);
1272 log_error("Failed to open system bus: %s", strerror(-r));
1276 if (arg_keep_unit) {
1277 r = sd_bus_call_method(
1279 "org.freedesktop.machine1",
1280 "/org/freedesktop/machine1",
1281 "org.freedesktop.machine1.Manager",
1287 SD_BUS_MESSAGE_APPEND_ID128(arg_uuid),
1291 strempty(arg_directory));
1293 _cleanup_bus_message_unref_ sd_bus_message *m = NULL;
1295 r = sd_bus_message_new_method_call(
1298 "org.freedesktop.machine1",
1299 "/org/freedesktop/machine1",
1300 "org.freedesktop.machine1.Manager",
1303 log_error("Failed to create message: %s", strerror(-r));
1307 r = sd_bus_message_append(
1311 SD_BUS_MESSAGE_APPEND_ID128(arg_uuid),
1315 strempty(arg_directory));
1317 log_error("Failed to append message arguments: %s", strerror(-r));
1321 r = sd_bus_message_open_container(m, 'a', "(sv)");
1323 log_error("Failed to open container: %s", strerror(-r));
1327 if (!isempty(arg_slice)) {
1328 r = sd_bus_message_append(m, "(sv)", "Slice", "s", arg_slice);
1330 log_error("Failed to append slice: %s", strerror(-r));
1335 r = sd_bus_message_append(m, "(sv)", "DevicePolicy", "s", "strict");
1337 log_error("Failed to add device policy: %s", strerror(-r));
1341 r = sd_bus_message_append(m, "(sv)", "DeviceAllow", "a(ss)", 10,
1342 /* Allow the container to
1343 * access and create the API
1344 * device nodes, so that
1345 * PrivateDevices= in the
1346 * container can work
1351 "/dev/random", "rwm",
1352 "/dev/urandom", "rwm",
1354 /* Allow the container
1355 * access to ptys. However,
1357 * container to ever create
1358 * these device nodes. */
1359 "/dev/pts/ptmx", "rw",
1361 /* Allow the container
1362 * access to all kdbus
1363 * devices. Again, the
1364 * container cannot create
1365 * these nodes, only use
1366 * them. We use a pretty
1367 * open match here, so that
1368 * the kernel API can still
1371 "char-kdbus/*", "rw");
1373 log_error("Failed to add device whitelist: %s", strerror(-r));
1377 r = sd_bus_message_close_container(m);
1379 log_error("Failed to close container: %s", strerror(-r));
1383 r = sd_bus_call(bus, m, 0, &error, NULL);
1387 log_error("Failed to register machine: %s", bus_error_message(&error, r));
/* Ask machined to terminate the machine.
 * A first call to org.freedesktop.machine1.Manager returns an object path
 * (read from the reply as "o"); a second call is then made on the
 * org.freedesktop.machine1.Machine interface. Failures of either bus call
 * are only logged at debug level. */
1394 static int terminate_machine(pid_t pid) {
1395 _cleanup_bus_error_free_ sd_bus_error error = SD_BUS_ERROR_NULL;
1396 _cleanup_bus_message_unref_ sd_bus_message *reply = NULL;
1397 _cleanup_bus_unref_ sd_bus *bus = NULL;
1404 r = sd_bus_default_system(&bus);
1406 log_error("Failed to open system bus: %s", strerror(-r));
1410 r = sd_bus_call_method(
1412 "org.freedesktop.machine1",
1413 "/org/freedesktop/machine1",
1414 "org.freedesktop.machine1.Manager",
1421 /* Note that the machine might already have been
1422 * cleaned up automatically, hence don't consider it a
1423 * failure if we cannot get the machine object. */
1424 log_debug("Failed to get machine: %s", bus_error_message(&error, r));
1428 r = sd_bus_message_read(reply, "o", &path);
1430 return bus_log_parse_error(r);
1432 r = sd_bus_call_method(
1434 "org.freedesktop.machine1",
1436 "org.freedesktop.machine1.Machine",
1442 log_debug("Failed to terminate machine: %s", bus_error_message(&error, r));
/* Reset the audit login UID of this process to the unset value.
 * /proc/self/loginuid is read first; if it already holds "4294967295"
 * nothing is written. Otherwise that value is written back, and a failure to
 * do so is reported with a hint about the audit subsystem. */
1449 static int reset_audit_loginuid(void) {
1450 _cleanup_free_ char *p = NULL;
1453 if (arg_share_system)
1456 r = read_one_line_file("/proc/self/loginuid", &p);
1460 log_error("Failed to read /proc/self/loginuid: %s", strerror(-r));
1464 /* Already reset? */
1465 if (streq(p, "4294967295"))
1468 r = write_string_file("/proc/self/loginuid", "4294967295");
1470 log_error("Failed to reset audit login UID. This probably means that your kernel is too\n"
1471 "old and you have audit enabled. Note that the auditing subsystem is known to\n"
1472 "be incompatible with containers on old kernels. Please make sure to upgrade\n"
1473 "your kernel or to turn off auditing with 'audit=0' on the kernel command line before\n"
1474 "using systemd-nspawn. Sleeping for 5s... (%s)\n", strerror(-r));
/* Fixed key for the siphash24() call in get_mac(). */
1482 #define HASH_KEY SD_ID128_MAKE(c3,c4,f9,19,b5,57,b2,1c,e6,cf,14,27,03,9c,ee,a2)
/* Derive a MAC address for the container from the host's machine ID and the
 * container name (arg_machine). Both are concatenated into `v`, hashed with
 * siphash24() under HASH_KEY, and the first ETH_ALEN bytes of the hash are
 * copied into `mac`, with the multicast bit cleared and the
 * locally-administered bit set. */
1484 static int get_mac(struct ether_addr *mac) {
1491 l = strlen(arg_machine);
1492 sz = sizeof(sd_id128_t) + l;
1495 /* fetch some persistent data unique to the host */
1496 r = sd_id128_get_machine((sd_id128_t*) v);
1500 /* combine with some data unique (on this host) to this
1501 * container instance */
1502 memcpy(v + sizeof(sd_id128_t), arg_machine, l);
1504 /* Let's hash the host machine ID plus the container name. We
1505 * use a fixed, but originally randomly created hash key here. */
1506 siphash24(result, v, sz, HASH_KEY.bytes);
1508 assert_cc(ETH_ALEN <= sizeof(result));
1509 memcpy(mac->ether_addr_octet, result, ETH_ALEN);
1511 /* see eth_random_addr in the kernel */
1512 mac->ether_addr_octet[0] &= 0xfe; /* clear multicast bit */
1513 mac->ether_addr_octet[0] |= 0x02; /* set local assignment bit (IEEE802) */
/* Create a veth pair for the container via rtnetlink.
 * The host side is named "vb-<machine>" when arg_network_bridge is set and
 * "ve-<machine>" otherwise; that name is written into `iface_name`. The peer
 * is named "host0", gets the MAC address produced by get_mac() and is placed
 * into the network namespace of `pid`.
 *
 * `iface_name` is a buffer of IFNAMSIZ bytes. The machine name is truncated
 * so that the result, including its terminating NUL, always fits. */
1518 static int setup_veth(pid_t pid, char iface_name[IFNAMSIZ]) {
1519 _cleanup_rtnl_message_unref_ sd_rtnl_message *m = NULL;
1520 _cleanup_rtnl_unref_ sd_rtnl *rtnl = NULL;
1521 struct ether_addr mac;
1524 if (!arg_private_network)
1527 if (!arg_network_veth)
1530 /* Use two different interface name prefixes depending whether
1531 * we are in bridge mode or not. */
1532 if (arg_network_bridge)
1533 memcpy(iface_name, "vb-", 3);
1535 memcpy(iface_name, "ve-", 3);
/* strncpy() does not terminate the destination when the source fills it,
 * so copy one byte less than the remaining space and terminate explicitly. */
1536 strncpy(iface_name+3, arg_machine, IFNAMSIZ - 4);
iface_name[IFNAMSIZ - 1] = '\0';
1540 log_error("Failed to generate predictable MAC address for host0");
1544 r = sd_rtnl_open(&rtnl, 0);
1546 log_error("Failed to connect to netlink: %s", strerror(-r));
1550 r = sd_rtnl_message_new_link(rtnl, &m, RTM_NEWLINK, 0);
1552 log_error("Failed to allocate netlink message: %s", strerror(-r));
1556 r = sd_rtnl_message_append_string(m, IFLA_IFNAME, iface_name);
1558 log_error("Failed to add netlink interface name: %s", strerror(-r));
1562 r = sd_rtnl_message_open_container(m, IFLA_LINKINFO);
1564 log_error("Failed to open netlink container: %s", strerror(-r));
1568 r = sd_rtnl_message_open_container_union(m, IFLA_INFO_DATA, "veth");
1570 log_error("Failed to open netlink container: %s", strerror(-r));
1574 r = sd_rtnl_message_open_container(m, VETH_INFO_PEER);
1576 log_error("Failed to open netlink container: %s", strerror(-r));
1580 r = sd_rtnl_message_append_string(m, IFLA_IFNAME, "host0");
1582 log_error("Failed to add netlink interface name: %s", strerror(-r));
1586 r = sd_rtnl_message_append_ether_addr(m, IFLA_ADDRESS, &mac);
1588 log_error("Failed to add netlink MAC address: %s", strerror(-r));
1592 r = sd_rtnl_message_append_u32(m, IFLA_NET_NS_PID, pid);
1594 log_error("Failed to add netlink namespace field: %s", strerror(-r));
1598 r = sd_rtnl_message_close_container(m);
1600 log_error("Failed to close netlink container: %s", strerror(-r));
1604 r = sd_rtnl_message_close_container(m);
1606 log_error("Failed to close netlink container: %s", strerror(-r));
1610 r = sd_rtnl_message_close_container(m);
1612 log_error("Failed to close netlink container: %s", strerror(-r));
1616 r = sd_rtnl_call(rtnl, m, 0, NULL);
1618 log_error("Failed to add new veth interfaces: %s", strerror(-r));
/* Attach the host-side veth interface `veth_name` to the bridge named by
 * arg_network_bridge. The bridge name is resolved to an interface index and
 * an RTM_SETLINK message is sent that sets IFF_UP on the interface and names
 * the bridge index as its IFLA_MASTER.
 * NOTE(review): the three leading checks presumably return early when no
 * private network, veth link or bridge was requested -- confirm. */
1625 static int setup_bridge(const char veth_name[]) {
1626 _cleanup_rtnl_message_unref_ sd_rtnl_message *m = NULL;
1627 _cleanup_rtnl_unref_ sd_rtnl *rtnl = NULL;
1630 if (!arg_private_network)
1633 if (!arg_network_veth)
1636 if (!arg_network_bridge)
1639 bridge = (int) if_nametoindex(arg_network_bridge);
1641 log_error("Failed to resolve interface %s: %m", arg_network_bridge);
1645 r = sd_rtnl_open(&rtnl, 0);
1647 log_error("Failed to connect to netlink: %s", strerror(-r));
1651 r = sd_rtnl_message_new_link(rtnl, &m, RTM_SETLINK, 0);
1653 log_error("Failed to allocate netlink message: %s", strerror(-r));
1657 r = sd_rtnl_message_link_set_flags(m, IFF_UP, IFF_UP);
1659 log_error("Failed to set IFF_UP flag: %s", strerror(-r));
1663 r = sd_rtnl_message_append_string(m, IFLA_IFNAME, veth_name);
1665 log_error("Failed to add netlink interface name field: %s", strerror(-r));
1669 r = sd_rtnl_message_append_u32(m, IFLA_MASTER, bridge);
1671 log_error("Failed to add netlink master field: %s", strerror(-r));
1675 r = sd_rtnl_call(rtnl, m, 0, NULL);
1677 log_error("Failed to add veth interface to bridge: %s", strerror(-r));
/* Resolves the network interface called name to its interface index and
 * verifies, through udev, that the device exists and has been initialized.
 *
 * Callers in this file assign the result to an "ifi" variable, so the
 * return value is presumably the interface index on success and a negative
 * value on failure (the return statements are not visible here). */
1684 static int parse_interface(struct udev *udev, const char *name) {
1685 _cleanup_udev_device_unref_ struct udev_device *d = NULL;
/* Room for the "n" prefix, a decimal int and the terminating NUL. */
1686 char ifi_str[2 + DECIMAL_STR_MAX(int)];
1689 ifi = (int) if_nametoindex(name);
1691 log_error("Failed to resolve interface %s: %m", name);
/* The udev device id handed to udev_device_new_from_device_id() is the
 * letter n followed by the interface index. */
1695 sprintf(ifi_str, "n%i", ifi);
1696 d = udev_device_new_from_device_id(udev, ifi_str);
1698 log_error("Failed to get udev device for interface %s: %m", name);
/* Refuse interfaces that udev has not finished initializing. */
1702 if (udev_device_get_is_initialized(d) <= 0) {
1703 log_error("Network interface %s is not initialized yet.", name);
/* Moves every interface listed in arg_network_interfaces into the network
 * namespace of the process pid.
 *
 * The function checks arg_private_network and whether the interface list
 * is empty before doing any work (presumably returning early in both
 * cases; the bodies of those checks are not visible here). */
1710 static int move_network_interfaces(pid_t pid) {
1711 _cleanup_udev_unref_ struct udev *udev = NULL;
1712 _cleanup_rtnl_unref_ sd_rtnl *rtnl = NULL;
1716 if (!arg_private_network)
1719 if (strv_isempty(arg_network_interfaces))
1722 r = sd_rtnl_open(&rtnl, 0);
1724 log_error("Failed to connect to netlink: %s", strerror(-r));
1730 log_error("Failed to connect to udev.");
1734 STRV_FOREACH(i, arg_network_interfaces) {
/* One netlink message per interface, released at the end of each
 * iteration by the cleanup attribute. */
1735 _cleanup_rtnl_message_unref_ sd_rtnl_message *m = NULL;
/* Resolve and validate the interface through udev. */
1738 ifi = parse_interface(udev, *i);
/* Address the existing link by index and attach IFLA_NET_NS_PID so
 * that it is reassigned to the namespace of pid. */
1742 r = sd_rtnl_message_new_link(rtnl, &m, RTM_NEWLINK, ifi);
1744 log_error("Failed to allocate netlink message: %s", strerror(-r));
1748 r = sd_rtnl_message_append_u32(m, IFLA_NET_NS_PID, pid);
1750 log_error("Failed to append namespace PID to netlink message: %s", strerror(-r));
1754 r = sd_rtnl_call(rtnl, m, 0, NULL);
1756 log_error("Failed to move interface %s to namespace: %s", *i, strerror(-r));
/* Creates one macvlan interface for every host interface listed in
 * arg_network_macvlan and places it in the network namespace of pid.
 *
 * Each new link is named "mv-" followed by the host interface name,
 * shortened to IFNAMSIZ-1 characters, and uses MACVLAN_MODE_BRIDGE.
 * The function checks arg_private_network and whether the list is empty
 * before doing any work (presumably returning early in both cases; the
 * bodies of those checks are not visible here). */
1764 static int setup_macvlan(pid_t pid) {
1765 _cleanup_udev_unref_ struct udev *udev = NULL;
1766 _cleanup_rtnl_unref_ sd_rtnl *rtnl = NULL;
1770 if (!arg_private_network)
1773 if (strv_isempty(arg_network_macvlan))
1776 r = sd_rtnl_open(&rtnl, 0);
1778 log_error("Failed to connect to netlink: %s", strerror(-r));
1784 log_error("Failed to connect to udev.");
1788 STRV_FOREACH(i, arg_network_macvlan) {
/* Per-iteration resources, released automatically by the cleanup
 * attributes. */
1789 _cleanup_rtnl_message_unref_ sd_rtnl_message *m = NULL;
1790 _cleanup_free_ char *n = NULL;
/* Index of the host interface the macvlan is stacked on. */
1793 ifi = parse_interface(udev, *i);
/* Index 0: a new link is requested rather than an existing one. */
1797 r = sd_rtnl_message_new_link(rtnl, &m, RTM_NEWLINK, 0);
1799 log_error("Failed to allocate netlink message: %s", strerror(-r));
1803 r = sd_rtnl_message_append_u32(m, IFLA_LINK, ifi);
1805 log_error("Failed to add netlink interface index: %s", strerror(-r));
/* Derive the name of the new interface and cut it down to the
 * maximum interface name length. */
1809 n = strappend("mv-", *i);
1813 strshorten(n, IFNAMSIZ-1);
1815 r = sd_rtnl_message_append_string(m, IFLA_IFNAME, n);
1817 log_error("Failed to add netlink interface name: %s", strerror(-r));
1821 r = sd_rtnl_message_append_u32(m, IFLA_NET_NS_PID, pid);
1823 log_error("Failed to add netlink namespace field: %s", strerror(-r));
/* Nested attributes: IFLA_LINKINFO containing IFLA_INFO_DATA of kind
 * "macvlan", which carries the mode. Two containers are opened and
 * two are closed below. */
1827 r = sd_rtnl_message_open_container(m, IFLA_LINKINFO);
1829 log_error("Failed to open netlink container: %s", strerror(-r));
1833 r = sd_rtnl_message_open_container_union(m, IFLA_INFO_DATA, "macvlan");
1835 log_error("Failed to open netlink container: %s", strerror(-r));
1839 r = sd_rtnl_message_append_u32(m, IFLA_MACVLAN_MODE, MACVLAN_MODE_BRIDGE);
1841 log_error("Failed to append macvlan mode: %s", strerror(-r));
1845 r = sd_rtnl_message_close_container(m);
1847 log_error("Failed to close netlink container: %s", strerror(-r));
1851 r = sd_rtnl_message_close_container(m);
1853 log_error("Failed to close netlink container: %s", strerror(-r));
1857 r = sd_rtnl_call(rtnl, m, 0, NULL);
1859 log_error("Failed to add new macvlan interfaces: %s", strerror(-r));
/* Installs a seccomp filter whose default action is SCMP_ACT_ALLOW and
 * which adds one rule answering with EAFNOSUPPORT when argument 0 equals
 * AF_NETLINK and argument 2 equals NETLINK_AUDIT. The explanatory text
 * inside the function states that the affected call is socket().
 *
 * The filter attribute SCMP_FLTATR_CTL_NNP is set to 0 before loading, so
 * that loading the filter does not turn on NO_NEW_PRIVS. The filter
 * context is released with seccomp_release() once it has been loaded. */
1867 static int audit_still_doesnt_work_in_containers(void) {
1870 scmp_filter_ctx seccomp;
1874 Audit is broken in containers, much of the userspace audit
1875 hookup will fail if running inside a container. We don't
1876 care and just turn off creation of audit sockets.
1878 This will make socket(AF_NETLINK, *, NETLINK_AUDIT) fail
1879 with EAFNOSUPPORT which audit userspace uses as indication
1880 that audit is disabled in the kernel.
1883 seccomp = seccomp_init(SCMP_ACT_ALLOW);
1887 r = seccomp_add_secondary_archs(seccomp);
1889 log_error("Failed to add secondary archs to seccomp filter: %s", strerror(-r));
1893 r = seccomp_rule_add(
1895 SCMP_ACT_ERRNO(EAFNOSUPPORT),
1898 SCMP_A0(SCMP_CMP_EQ, AF_NETLINK),
1899 SCMP_A2(SCMP_CMP_EQ, NETLINK_AUDIT));
1901 log_error("Failed to add audit seccomp rule: %s", strerror(-r));
1905 r = seccomp_attr_set(seccomp, SCMP_FLTATR_CTL_NNP, 0);
1907 log_error("Failed to unset NO_NEW_PRIVS: %s", strerror(-r));
1911 r = seccomp_load(seccomp);
1913 log_error("Failed to install seccomp audit filter: %s", strerror(-r));
1916 seccomp_release(seccomp);
/* Opens the disk image arg_image and makes it available as a block device.
 *
 * If arg_image already is a block device its path is duplicated. If it is
 * a regular file, a free loop device is requested from /dev/loop-control,
 * bound to the image with LOOP_SET_FD and configured with
 * LO_FLAGS_AUTOCLEAR|LO_FLAGS_PARTSCAN (plus LO_FLAGS_READ_ONLY, see
 * below). Anything else is rejected.
 *
 * device_path receives the path of the block device. loop_nr is
 * presumably set to the allocated loop device number (the assignment is
 * not visible here). The caller stores the return value in image_fd, so
 * the function presumably returns an open file descriptor on success and
 * a negative value on failure. */
1924 static int setup_image(char **device_path, int *loop_nr) {
1925 struct loop_info64 info = {
1926 .lo_flags = LO_FLAGS_AUTOCLEAR|LO_FLAGS_PARTSCAN
1928 _cleanup_close_ int fd = -1, control = -1, loop = -1;
1929 _cleanup_free_ char* loopdev = NULL;
1933 assert(device_path);
/* Honour arg_read_only already when opening the image. */
1936 fd = open(arg_image, O_CLOEXEC|(arg_read_only ? O_RDONLY : O_RDWR)|O_NONBLOCK|O_NOCTTY);
1938 log_error("Failed to open %s: %m", arg_image);
1942 if (fstat(fd, &st) < 0) {
1943 log_error("Failed to stat %s: %m", arg_image);
/* Block devices are used directly, no loop device is needed. */
1947 if (S_ISBLK(st.st_mode)) {
1950 p = strdup(arg_image);
1964 if (!S_ISREG(st.st_mode)) {
/* No system call failed at this point, so errno is meaningless here
 * and must not be printed with %m. */
1965 log_error("%s is not a regular file or block device.", arg_image);
/* Regular file: allocate a loop device and bind the image to it. */
1969 control = open("/dev/loop-control", O_RDWR|O_CLOEXEC|O_NOCTTY|O_NONBLOCK);
1971 log_error("Failed to open /dev/loop-control: %m");
1975 nr = ioctl(control, LOOP_CTL_GET_FREE);
1977 log_error("Failed to allocate loop device: %m");
1981 if (asprintf(&loopdev, "/dev/loop%i", nr) < 0)
1984 loop = open(loopdev, O_CLOEXEC|(arg_read_only ? O_RDONLY : O_RDWR)|O_NONBLOCK|O_NOCTTY);
1986 log_error("Failed to open loop device %s: %m", loopdev);
1990 if (ioctl(loop, LOOP_SET_FD, fd) < 0) {
1991 log_error("Failed to set loopback file descriptor on %s: %m", loopdev);
/* NOTE(review): presumably guarded by arg_read_only; the condition is
 * not visible here. */
1996 info.lo_flags |= LO_FLAGS_READ_ONLY;
1998 if (ioctl(loop, LOOP_SET_STATUS64, &info) < 0) {
1999 log_error("Failed to set loopback settings on %s: %m", loopdev);
/* Hand the loop device path over to the caller. */
2003 *device_path = loopdev;
/* Inspects the partition table of the image and picks the partitions to
 * mount as root, /home and /srv.
 *
 * The image is probed with libblkid and must carry a GUID Partition Table
 * (PTTYPE "gpt"). The partition block devices are enumerated through udev
 * as children of the image device. Partitions flagged GPT_FLAG_NO_AUTO are
 * checked for explicitly (presumably to skip them). The remaining ones are
 * classified by GPT type id into GPT_HOME, GPT_SRV, GPT_ROOT_NATIVE and
 * GPT_ROOT_SECONDARY (the latter two only when those macros are defined).
 * For every class the partition number is compared against the one already
 * recorded ("nr >= ..._nr"), so a candidate is only replaced by one with a
 * lower partition number.
 *
 * Outputs: *root_device, *home_device and *srv_device receive device node
 * paths, the matching *_rw flags are false when GPT_FLAG_READ_ONLY is set
 * on the chosen partition. A native root partition is preferred over a
 * secondary one. It is an error if neither kind of root partition is
 * found. When built without blkid support the function only logs that
 * --image= is unsupported. */
2014 static int dissect_image(
2016 char **root_device, bool *root_device_rw,
2017 char **home_device, bool *home_device_rw,
2018 char **srv_device, bool *srv_device_rw,
2022 int home_nr = -1, root_nr = -1, secondary_root_nr = -1, srv_nr = -1;
2023 _cleanup_free_ char *home = NULL, *root = NULL, *secondary_root = NULL, *srv = NULL;
2024 _cleanup_udev_enumerate_unref_ struct udev_enumerate *e = NULL;
2025 _cleanup_udev_device_unref_ struct udev_device *d = NULL;
2026 _cleanup_blkid_free_probe_ blkid_probe b = NULL;
2027 _cleanup_udev_unref_ struct udev *udev = NULL;
2028 struct udev_list_entry *first, *item;
2029 bool home_rw = true, root_rw = true, secondary_root_rw = true, srv_rw = true;
2030 const char *pttype = NULL;
2036 assert(root_device);
2037 assert(home_device);
/* Step 1: probe the image for a partition table. */
2041 b = blkid_new_probe();
2046 r = blkid_probe_set_device(b, fd, 0, 0);
2051 log_error("Failed to set device on blkid probe: %m");
2055 blkid_probe_enable_partitions(b, 1);
2056 blkid_probe_set_partitions_flags(b, BLKID_PARTS_ENTRY_DETAILS);
/* -2 and 1 are treated alike: no partition table could be identified.
 * Any other non-zero result is reported as a probing failure. */
2059 r = blkid_do_safeprobe(b);
2060 if (r == -2 || r == 1) {
2061 log_error("Failed to identify any partition table on %s.\n"
2062 "Note that the disk image needs to follow http://www.freedesktop.org/wiki/Specifications/DiscoverablePartitionsSpec/ to be supported by systemd-nspawn.", arg_image);
2064 } else if (r != 0) {
2067 log_error("Failed to probe: %m");
/* Only GPT is supported. pttype stays NULL if the lookup fails, which
 * streq_ptr() is used to cope with. */
2071 blkid_probe_lookup_value(b, "PTTYPE", &pttype, NULL);
2072 if (!streq_ptr(pttype, "gpt")) {
2073 log_error("Image %s does not carry a GUID Partition Table.\n"
2074 "Note that the disk image needs to follow http://www.freedesktop.org/wiki/Specifications/DiscoverablePartitionsSpec/ to be supported by systemd-nspawn.", arg_image);
2079 pl = blkid_probe_get_partitions(b);
2084 log_error("Failed to list partitions of %s", arg_image);
/* Step 2: enumerate the partition devices below the image device. */
2092 if (fstat(fd, &st) < 0) {
2093 log_error("Failed to stat block device: %m");
2097 d = udev_device_new_from_devnum(udev, 'b', st.st_rdev);
2101 e = udev_enumerate_new(udev);
2105 r = udev_enumerate_add_match_parent(e, d);
2109 r = udev_enumerate_scan_devices(e);
2111 log_error("Failed to scan for partition devices of %s: %s", arg_image, strerror(-r));
/* Step 3: classify every enumerated device. */
2115 first = udev_enumerate_get_list_entry(e);
2116 udev_list_entry_foreach(item, first) {
2117 _cleanup_udev_device_unref_ struct udev_device *q;
2118 const char *stype, *node;
2119 unsigned long long flags;
2126 q = udev_device_new_from_syspath(udev, udev_list_entry_get_name(item));
2131 log_error("Failed to get partition device of %s: %m", arg_image);
2135 qn = udev_device_get_devnum(q);
/* The enumeration also yields the whole-disk device itself; this
 * comparison detects it (presumably to skip it). */
2139 if (st.st_rdev == qn)
2142 node = udev_device_get_devnode(q);
/* Map the device number back to the blkid partition entry to get at
 * its flags, number and type. */
2146 pp = blkid_partlist_devno_to_partition(pl, qn);
2150 flags = blkid_partition_get_flags(pp);
2151 if (flags & GPT_FLAG_NO_AUTO)
2154 nr = blkid_partition_get_partno(pp);
2158 stype = blkid_partition_get_type_string(pp);
2162 if (sd_id128_from_string(stype, &type_id) < 0)
2165 if (sd_id128_equal(type_id, GPT_HOME)) {
2167 if (home && nr >= home_nr)
2171 home_rw = !(flags & GPT_FLAG_READ_ONLY);
2174 home = strdup(node);
2177 } else if (sd_id128_equal(type_id, GPT_SRV)) {
2179 if (srv && nr >= srv_nr)
2183 srv_rw = !(flags & GPT_FLAG_READ_ONLY);
2190 #ifdef GPT_ROOT_NATIVE
2191 else if (sd_id128_equal(type_id, GPT_ROOT_NATIVE)) {
2193 if (root && nr >= root_nr)
2197 root_rw = !(flags & GPT_FLAG_READ_ONLY);
2200 root = strdup(node);
2205 #ifdef GPT_ROOT_SECONDARY
2206 else if (sd_id128_equal(type_id, GPT_ROOT_SECONDARY)) {
2208 if (secondary_root && nr >= secondary_root_nr)
2211 secondary_root_nr = nr;
2212 secondary_root_rw = !(flags & GPT_FLAG_READ_ONLY);
2215 free(secondary_root);
2216 secondary_root = strdup(node);
2217 if (!secondary_root)
/* Step 4: hand the results to the caller. A root partition of either
 * kind is mandatory. */
2223 if (!root && !secondary_root) {
2224 log_error("Failed to identify root partition in disk image %s.\n"
2225 "Note that the disk image needs to follow http://www.freedesktop.org/wiki/Specifications/DiscoverablePartitionsSpec/ to be supported by systemd-nspawn.", arg_image);
2230 *root_device = root;
2233 *root_device_rw = root_rw;
/* Fall back to the secondary root partition. The local pointer is
 * reset so that the cleanup attribute does not free the string that
 * now belongs to the caller. */
2235 } else if (secondary_root) {
2236 *root_device = secondary_root;
2237 secondary_root = NULL;
2239 *root_device_rw = secondary_root_rw;
2244 *home_device = home;
2247 *home_device_rw = home_rw;
2254 *srv_device_rw = srv_rw;
/* Fallback branch for builds without blkid support. */
2259 log_error("--image= is not supported, compiled without blkid support.");
/* Mounts the block device what on the path built from where and directory.
 *
 * The file system type is detected with libblkid (superblock probing
 * restricted to the TYPE value). Devices of type crypto_LUKS are refused.
 * The mount always uses MS_NODEV and adds MS_RDONLY unless rw is true.
 * When built without blkid support the function only logs that --image=
 * is unsupported. Callers log failures through strerror(-r), so the
 * function reports negative errno-style codes. */
2264 static int mount_device(const char *what, const char *where, const char *directory, bool rw) {
2266 _cleanup_blkid_free_probe_ blkid_probe b = NULL;
2267 const char *fstype, *p;
/* Target path: where with directory appended. */
2277 p = strappenda(where, directory);
2282 b = blkid_new_probe_from_filename(what);
2286 log_error("Failed to allocate prober for %s: %m", what);
2290 blkid_probe_enable_superblocks(b, 1);
2291 blkid_probe_set_superblocks_flags(b, BLKID_SUBLKS_TYPE);
/* blkid_do_safeprobe() returns 0 on success, 1 if nothing was detected,
 * -2 if the result is ambivalent and -1 on error. "Cannot determine"
 * covers the 1 and -2 cases, the same way dissect_image() checks the
 * result. A real error (-1) is handled by the branch below. */
2294 r = blkid_do_safeprobe(b);
2295 if (r == -2 || r == 1) {
2296 log_error("Cannot determine file system type of %s", what);
2298 } else if (r != 0) {
2301 log_error("Failed to probe %s: %m", what);
2306 if (blkid_probe_lookup_value(b, "TYPE", &fstype, NULL) < 0) {
2309 log_error("Failed to determine file system type of %s", what);
/* Encrypted volumes would have to be unlocked first, which is not
 * implemented. */
2313 if (streq(fstype, "crypto_LUKS")) {
2314 log_error("nspawn currently does not support LUKS disk images.");
2318 if (mount(what, p, fstype, MS_NODEV|(rw ? 0 : MS_RDONLY), NULL) < 0) {
2319 log_error("Failed to mount %s: %m", what);
/* Fallback branch for builds without blkid support. */
2325 log_error("--image= is not supported, compiled without blkid support.");
/* Mounts the dissected partitions below arg_directory: the root device on
 * arg_directory itself, the home device on "/home" and the srv device on
 * "/srv", each with its own read-write flag.
 *
 * NOTE(review): each mount is presumably skipped when the corresponding
 * device is NULL; the guarding conditions are not visible here. */
2330 static int mount_devices(
2332 const char *root_device, bool root_device_rw,
2333 const char *home_device, bool home_device_rw,
2334 const char *srv_device, bool srv_device_rw) {
/* Root first, so that the other two end up inside it. */
2340 r = mount_device(root_device, arg_directory, NULL, root_device_rw);
2342 log_error("Failed to mount root directory: %s", strerror(-r));
2348 r = mount_device(home_device, arg_directory, "/home", home_device_rw);
2350 log_error("Failed to mount home directory: %s", strerror(-r));
2356 r = mount_device(srv_device, arg_directory, "/srv", srv_device_rw);
2358 log_error("Failed to mount server data directory: %s", strerror(-r));
/* Best-effort teardown of a loop device: none of the ioctl() results are
 * checked.
 *
 * If image_fd points to an open descriptor, the backing file is detached
 * with LOOP_CLR_FD and the descriptor is closed; *image_fd is overwritten
 * with the result of safe_close(). Afterwards loop device number nr is
 * removed through /dev/loop-control with LOOP_CTL_REMOVE. */
2366 static void loop_remove(int nr, int *image_fd) {
2367 _cleanup_close_ int control = -1;
2372 if (image_fd && *image_fd >= 0) {
2373 ioctl(*image_fd, LOOP_CLR_FD);
2374 *image_fd = safe_close(*image_fd);
2377 control = open("/dev/loop-control", O_RDWR|O_CLOEXEC|O_NOCTTY|O_NONBLOCK);
2381 ioctl(control, LOOP_CTL_REMOVE, nr);
/* Runs "getent <database> <key>" in a child process whose standard output
 * is connected to a pipe.
 *
 * Callers in this file pass the return value to fdopen(..., "r"), so the
 * function presumably returns the read end of the pipe and stores the
 * child PID in *rpid (neither statement is visible here). */
2384 static int spawn_getent(const char *database, const char *key, pid_t *rpid) {
2392 if (pipe2(pipe_fds, O_CLOEXEC) < 0) {
2393 log_error("Failed to allocate pipe: %m");
2399 log_error("Failed to fork getent child: %m");
2401 } else if (pid == 0) {
/* Child. getent is started with an empty environment. */
2403 char *empty_env = NULL;
/* Write end of the pipe becomes stdout. */
2405 if (dup3(pipe_fds[1], STDOUT_FILENO, 0) < 0)
2406 _exit(EXIT_FAILURE);
/* Close the original pipe descriptors unless they already occupy
 * one of the standard descriptor slots 0..2. */
2408 if (pipe_fds[0] > 2)
2409 safe_close(pipe_fds[0]);
2410 if (pipe_fds[1] > 2)
2411 safe_close(pipe_fds[1]);
/* stdin and stderr are redirected to /dev/null. */
2413 nullfd = open("/dev/null", O_RDWR);
2415 _exit(EXIT_FAILURE);
2417 if (dup3(nullfd, STDIN_FILENO, 0) < 0)
2418 _exit(EXIT_FAILURE);
2420 if (dup3(nullfd, STDERR_FILENO, 0) < 0)
2421 _exit(EXIT_FAILURE);
2426 reset_all_signal_handlers();
2427 close_all_fds(NULL, 0);
/* Try both usual locations of the binary; the second execle() is
 * only reached if the first one failed. */
2429 execle("/usr/bin/getent", "getent", database, key, NULL, &empty_env);
2430 execle("/bin/getent", "getent", database, key, NULL, &empty_env);
2431 _exit(EXIT_FAILURE);
/* Parent: the write end is not needed here. */
2434 pipe_fds[1] = safe_close(pipe_fds[1]);
/* Switches the calling process to the user named by arg_user.
 *
 * If arg_user is unset, "root" or "0", the supplementary groups are
 * cleared and all three GIDs and UIDs are reset to 0. Otherwise the
 * credentials are resolved by running "getent passwd" and
 * "getent initgroups" via spawn_getent() and parsing their output. The
 * home directory of the user is created, ownership of stdin, stdout and
 * stderr is handed to the user, and finally the supplementary groups,
 * GIDs and UIDs are set, in that order.
 *
 * The caller passes the address of its home variable as _home and uses
 * that value afterwards, so the home directory is presumably returned
 * through *_home (the assignment is not visible here). */
2441 static int change_uid_gid(char **_home) {
2442 char line[LINE_MAX], *w, *x, *state, *u, *g, *h;
2443 _cleanup_free_ uid_t *uids = NULL;
2444 _cleanup_free_ char *home = NULL;
2445 _cleanup_fclose_ FILE *f = NULL;
2446 _cleanup_close_ int fd = -1;
2447 unsigned n_uids = 0;
2456 if (!arg_user || streq(arg_user, "root") || streq(arg_user, "0")) {
2457 /* Reset everything fully to 0, just in case */
2459 if (setgroups(0, NULL) < 0) {
2460 log_error("setgroups() failed: %m");
/* The messages below name the calls that are actually made
 * (setresgid/setresuid). */
2464 if (setresgid(0, 0, 0) < 0) {
2465 log_error("setresgid() failed: %m");
2469 if (setresuid(0, 0, 0) < 0) {
2470 log_error("setresuid() failed: %m");
2478 /* First, get user credentials */
2479 fd = spawn_getent("passwd", arg_user, &pid);
2483 f = fdopen(fd, "r");
2488 if (!fgets(line, sizeof(line), f)) {
2491 log_error("Failed to resolve user %s.", arg_user);
2495 log_error("Failed to read from getent: %m");
2501 wait_for_terminate_and_warn("getent passwd", pid);
/* Split the colon-separated passwd line into its fields. */
2503 x = strchr(line, ':');
2505 log_error("/etc/passwd entry has invalid user field.");
2509 u = strchr(x+1, ':');
2511 log_error("/etc/passwd entry has invalid password field.");
2518 log_error("/etc/passwd entry has invalid UID field.");
2526 log_error("/etc/passwd entry has invalid GID field.");
2531 h = strchr(x+1, ':');
2533 log_error("/etc/passwd entry has invalid GECOS field.");
2540 log_error("/etc/passwd entry has invalid home directory field.");
2546 r = parse_uid(u, &uid);
2548 log_error("Failed to parse UID of user.");
2552 r = parse_gid(g, &gid);
2554 log_error("Failed to parse GID of user.");
2562 /* Second, get group memberships */
2563 fd = spawn_getent("initgroups", arg_user, &pid);
2568 f = fdopen(fd, "r");
2573 if (!fgets(line, sizeof(line), f)) {
2575 log_error("Failed to resolve user %s.", arg_user);
2579 log_error("Failed to read from getent: %m");
2585 wait_for_terminate_and_warn("getent initgroups", pid);
2587 /* Skip over the username and subsequent separator whitespace */
2589 x += strcspn(x, WHITESPACE);
2590 x += strspn(x, WHITESPACE);
/* Every remaining word is a group id; collect them in uids. */
2592 FOREACH_WORD(w, l, x, state) {
2598 if (!GREEDY_REALLOC(uids, sz, n_uids+1))
2601 r = parse_uid(c, &uids[n_uids++]);
2603 log_error("Failed to parse group data from getent.");
/* Make sure the home directory exists. An already existing directory
 * is not treated as an error. */
2608 r = mkdir_parents(home, 0775);
2610 log_error("Failed to make home root directory: %s", strerror(-r));
2614 r = mkdir_safe(home, 0755, uid, gid);
2615 if (r < 0 && r != -EEXIST) {
2616 log_error("Failed to make home directory: %s", strerror(-r));
/* Best effort: the results of these fchown() calls are ignored. */
2620 fchown(STDIN_FILENO, uid, gid);
2621 fchown(STDOUT_FILENO, uid, gid);
2622 fchown(STDERR_FILENO, uid, gid);
/* Groups first, then GID, then UID. */
2624 if (setgroups(n_uids, uids) < 0) {
2625 log_error("Failed to set auxiliary groups: %m");
2629 if (setresgid(gid, gid, gid) < 0) {
2630 log_error("setresgid() failed: %m");
2634 if (setresuid(uid, uid, uid) < 0) {
2635 log_error("setresuid() failed: %m");
2649 * < 0 : wait_for_terminate() failed to get the state of the
2650 * container, the container was terminated by a signal, or
2651 * failed for an unknown reason. No change is made to the
2652 * container argument.
2653 * > 0 : The program executed in the container terminated with an
2654 * error. The exit code of the program executed in the
2655 * container is returned. No change is made to the container
2657 * 0 : The container is being rebooted, has been shut down or exited
2658 * successfully. The container argument has been set to either
2659 * CONTAINER_TERMINATED or CONTAINER_REBOOTED.
2661 * That is, success is indicated by a return value of zero, and an
2662 * error is indicated by a non-zero value.
/* Waits for the container process pid to terminate, logs how it ended and
 * reports the outcome through the return value and *container.
 *
 * The outcome is taken from the si_code and si_status fields of the
 * siginfo filled in by wait_for_terminate(). Besides a regular exit, two
 * signals get a dedicated log message and status: SIGINT is reported as
 * "has been shut down" (CONTAINER_TERMINATED) and SIGHUP as "is being
 * rebooted" (CONTAINER_REBOOTED). Other signals and unknown si_code values
 * are logged as errors. */
2664 static int wait_for_container(pid_t pid, ContainerStatus *container) {
2668 r = wait_for_terminate(pid, &status);
2670 log_warning("Failed to wait for container: %s", strerror(-r));
2674 switch (status.si_code) {
/* Regular exit: si_status is the exit code of the container. */
2676 r = status.si_status;
2679 log_debug("Container %s exited successfully.",
2682 *container = CONTAINER_TERMINATED;
2684 log_error("Container %s failed with error code %i.",
2685 arg_machine, status.si_status);
/* Termination by signal: here si_status is the signal number. */
2690 if (status.si_status == SIGINT) {
2692 log_info("Container %s has been shut down.",
2695 *container = CONTAINER_TERMINATED;
2698 } else if (status.si_status == SIGHUP) {
2700 log_info("Container %s is being rebooted.",
2703 *container = CONTAINER_REBOOTED;
2707 /* CLD_KILLED fallthrough */
2710 log_error("Container %s terminated by signal %s.",
2711 arg_machine, signal_to_string(status.si_status));
2716 log_error("Container %s failed due to unknown reason.",
/* Signal handler that deliberately does nothing. main() installs it for
 * SIGCHLD so that the signal interrupts blocking calls in the parent
 * instead of being discarded. */
2725 static void nop_handler(int sig) {}
2727 int main(int argc, char *argv[]) {
2729 _cleanup_free_ char *kdbus_domain = NULL, *device_path = NULL, *root_device = NULL, *home_device = NULL, *srv_device = NULL;
2730 bool root_device_rw = true, home_device_rw = true, srv_device_rw = true;
2731 _cleanup_close_ int master = -1, kdbus_fd = -1, image_fd = -1;
2732 _cleanup_close_pair_ int kmsg_socket_pair[2] = { -1, -1 };
2733 _cleanup_fdset_free_ FDSet *fds = NULL;
2734 int r = EXIT_FAILURE, k, n_fd_passed, loop_nr = -1;
2735 const char *console = NULL;
2736 char veth_name[IFNAMSIZ];
2737 bool secondary = false;
2738 sigset_t mask, mask_chld;
2741 log_parse_environment();
2744 k = parse_argv(argc, argv);
2753 if (arg_directory) {
2756 p = path_make_absolute_cwd(arg_directory);
2757 free(arg_directory);
2760 arg_directory = get_current_dir_name();
2762 if (!arg_directory) {
2763 log_error("Failed to determine path, please use -D.");
2766 path_kill_slashes(arg_directory);
2770 arg_machine = strdup(basename(arg_image ? arg_image : arg_directory));
2776 hostname_cleanup(arg_machine, false);
2777 if (isempty(arg_machine)) {
2778 log_error("Failed to determine machine name automatically, please use -M.");
2783 if (geteuid() != 0) {
2784 log_error("Need to be root.");
2788 if (sd_booted() <= 0) {
2789 log_error("Not running on a systemd system.");
2794 n_fd_passed = sd_listen_fds(false);
2795 if (n_fd_passed > 0) {
2796 k = fdset_new_listen_fds(&fds, false);
2798 log_error("Failed to collect file descriptors: %s", strerror(-k));
2802 fdset_close_others(fds);
2805 if (arg_directory) {
2806 if (path_equal(arg_directory, "/")) {
2807 log_error("Spawning container on root directory not supported.");
2812 if (path_is_os_tree(arg_directory) <= 0) {
2813 log_error("Directory %s doesn't look like an OS root directory (os-release file is missing). Refusing.", arg_directory);
2819 p = strappenda(arg_directory,
2820 argc > optind && path_is_absolute(argv[optind]) ? argv[optind] : "/usr/bin/");
2821 if (access(p, F_OK) < 0) {
2822 log_error("Directory %s lacks the binary to execute or doesn't look like a binary tree. Refusing.", arg_directory);
2828 char template[] = "/tmp/nspawn-root-XXXXXX";
2830 if (!mkdtemp(template)) {
2831 log_error("Failed to create temporary directory: %m");
2836 arg_directory = strdup(template);
2837 if (!arg_directory) {
2842 image_fd = setup_image(&device_path, &loop_nr);
2848 r = dissect_image(image_fd, &root_device, &root_device_rw, &home_device, &home_device_rw, &srv_device, &srv_device_rw, &secondary);
2853 master = posix_openpt(O_RDWR|O_NOCTTY|O_CLOEXEC|O_NDELAY);
2855 log_error("Failed to acquire pseudo tty: %m");
2859 console = ptsname(master);
2861 log_error("Failed to determine tty name: %m");
2866 log_info("Spawning container %s on %s.\nPress ^] three times within 1s to kill container.",
2867 arg_machine, arg_image ? arg_image : arg_directory);
2869 if (unlockpt(master) < 0) {
2870 log_error("Failed to unlock tty: %m");
2874 if (access("/dev/kdbus/control", F_OK) >= 0) {
2876 if (arg_share_system) {
2877 kdbus_domain = strdup("/dev/kdbus");
2878 if (!kdbus_domain) {
2885 ns = strappenda("machine-", arg_machine);
2886 kdbus_fd = bus_kernel_create_domain(ns, &kdbus_domain);
2888 log_debug("Failed to create kdbus domain: %s", strerror(-r));
2890 log_debug("Successfully created kdbus domain as %s", kdbus_domain);
2894 if (socketpair(AF_UNIX, SOCK_DGRAM|SOCK_NONBLOCK|SOCK_CLOEXEC, 0, kmsg_socket_pair) < 0) {
2895 log_error("Failed to create kmsg socket pair: %m");
2899 sd_notify(0, "READY=1");
2901 assert_se(sigemptyset(&mask) == 0);
2902 assert_se(sigemptyset(&mask_chld) == 0);
2903 sigaddset(&mask_chld, SIGCHLD);
2904 sigset_add_many(&mask, SIGCHLD, SIGWINCH, SIGTERM, SIGINT, -1);
2905 assert_se(sigprocmask(SIG_BLOCK, &mask, NULL) == 0);
2908 ContainerStatus container_status;
2909 int eventfds[2] = { -1, -1 };
2910 struct sigaction sa = {
2911 .sa_handler = nop_handler,
2912 .sa_flags = SA_NOCLDSTOP,
2915 /* Child can be killed before execv(), so handle SIGCHLD
2916 * in order to interrupt parent's blocking calls and
2917 * give it a chance to call wait() and terminate. */
2918 r = sigprocmask(SIG_UNBLOCK, &mask_chld, NULL);
2920 log_error("Failed to change the signal mask: %m");
2924 r = sigaction(SIGCHLD, &sa, NULL);
2926 log_error("Failed to install SIGCHLD handler: %m");
2930 pid = clone_with_eventfd(SIGCHLD|CLONE_NEWNS|
2931 (arg_share_system ? 0 : CLONE_NEWIPC|CLONE_NEWPID|CLONE_NEWUTS)|
2932 (arg_private_network ? CLONE_NEWNET : 0), eventfds);
2934 if (errno == EINVAL)
2935 log_error("clone() failed, do you have namespace support enabled in your kernel? (You need UTS, IPC, PID and NET namespacing built in): %m");
2937 log_error("clone() failed: %m");
2945 _cleanup_free_ char *home = NULL;
2947 const char *envp[] = {
2948 "PATH=" DEFAULT_PATH_SPLIT_USR,
2949 "container=systemd-nspawn", /* LXC sets container=lxc, so follow the scheme here */
2954 NULL, /* container_uuid */
2955 NULL, /* LISTEN_FDS */
2956 NULL, /* LISTEN_PID */
2961 envp[n_env] = strv_find_prefix(environ, "TERM=");
2965 master = safe_close(master);
2967 close_nointr(STDIN_FILENO);
2968 close_nointr(STDOUT_FILENO);
2969 close_nointr(STDERR_FILENO);
2971 kmsg_socket_pair[0] = safe_close(kmsg_socket_pair[0]);
2973 reset_all_signal_handlers();
2975 assert_se(sigemptyset(&mask) == 0);
2976 assert_se(sigprocmask(SIG_SETMASK, &mask, NULL) == 0);
2978 k = open_terminal(console, O_RDWR);
2979 if (k != STDIN_FILENO) {
2985 log_error("Failed to open console: %s", strerror(-k));
2989 if (dup2(STDIN_FILENO, STDOUT_FILENO) != STDOUT_FILENO ||
2990 dup2(STDIN_FILENO, STDERR_FILENO) != STDERR_FILENO) {
2991 log_error("Failed to duplicate console: %m");
2996 log_error("setsid() failed: %m");
3000 if (reset_audit_loginuid() < 0)
3003 if (prctl(PR_SET_PDEATHSIG, SIGKILL) < 0) {
3004 log_error("PR_SET_PDEATHSIG failed: %m");
3008 /* Mark everything as slave, so that we still
3009 * receive mounts from the real root, but don't
3010 * propagate mounts to the real root. */
3011 if (mount(NULL, "/", NULL, MS_SLAVE|MS_REC, NULL) < 0) {
3012 log_error("MS_SLAVE|MS_REC failed: %m");
3016 if (mount_devices(arg_directory,
3017 root_device, root_device_rw,
3018 home_device, home_device_rw,
3019 srv_device, srv_device_rw) < 0)
3022 r = base_filesystem_create(arg_directory);
3024 log_error("Failed to create the base filesystem: %s", strerror(-r));
3028 /* Turn directory into bind mount */
3029 if (mount(arg_directory, arg_directory, "bind", MS_BIND|MS_REC, NULL) < 0) {
3030 log_error("Failed to make bind mount: %m");
3034 if (arg_read_only) {
3035 k = bind_remount_recursive(arg_directory, true);
3037 log_error("Failed to make tree read-only: %s", strerror(-k));
3042 if (mount_all(arg_directory) < 0)
3045 if (copy_devnodes(arg_directory) < 0)
3048 if (setup_ptmx(arg_directory) < 0)
3051 dev_setup(arg_directory);
3053 if (audit_still_doesnt_work_in_containers() < 0)
3056 if (setup_dev_console(arg_directory, console) < 0)
3059 if (setup_kmsg(arg_directory, kmsg_socket_pair[1]) < 0)
3062 kmsg_socket_pair[1] = safe_close(kmsg_socket_pair[1]);
3064 if (setup_boot_id(arg_directory) < 0)
3067 if (setup_timezone(arg_directory) < 0)
3070 if (setup_resolv_conf(arg_directory) < 0)
3073 if (setup_journal(arg_directory) < 0)
3076 if (mount_binds(arg_directory, arg_bind, false) < 0)
3079 if (mount_binds(arg_directory, arg_bind_ro, true) < 0)
3082 if (mount_tmpfs(arg_directory) < 0)
3085 if (setup_kdbus(arg_directory, kdbus_domain) < 0)
3088 /* Tell the parent that we are ready, and that
3089 * it can cgroupify us so that we lack access
3090 * to certain devices and resources. */
3091 r = eventfd_send_state(eventfds[1],
3092 EVENTFD_CHILD_SUCCEEDED);
3093 eventfds[1] = safe_close(eventfds[1]);
3097 if (chdir(arg_directory) < 0) {
3098 log_error("chdir(%s) failed: %m", arg_directory);
3102 if (mount(arg_directory, "/", NULL, MS_MOVE, NULL) < 0) {
3103 log_error("mount(MS_MOVE) failed: %m");
3107 if (chroot(".") < 0) {
3108 log_error("chroot() failed: %m");
3112 if (chdir("/") < 0) {
3113 log_error("chdir() failed: %m");
3119 if (arg_private_network)
3122 if (drop_capabilities() < 0) {
3123 log_error("drop_capabilities() failed: %m");
3127 r = change_uid_gid(&home);
3131 if ((asprintf((char**)(envp + n_env++), "HOME=%s", home ? home: "/root") < 0) ||
3132 (asprintf((char**)(envp + n_env++), "USER=%s", arg_user ? arg_user : "root") < 0) ||
3133 (asprintf((char**)(envp + n_env++), "LOGNAME=%s", arg_user ? arg_user : "root") < 0)) {
3138 if (!sd_id128_equal(arg_uuid, SD_ID128_NULL)) {
3141 if (asprintf((char**)(envp + n_env++), "container_uuid=%s", id128_format_as_uuid(arg_uuid, as_uuid)) < 0) {
3147 if (fdset_size(fds) > 0) {
3148 k = fdset_cloexec(fds, false);
3150 log_error("Failed to unset O_CLOEXEC for file descriptors.");
3154 if ((asprintf((char **)(envp + n_env++), "LISTEN_FDS=%u", n_fd_passed) < 0) ||
3155 (asprintf((char **)(envp + n_env++), "LISTEN_PID=1") < 0)) {
3163 if (arg_personality != 0xffffffffLU) {
3164 if (personality(arg_personality) < 0) {
3165 log_error("personality() failed: %m");
3168 } else if (secondary) {
3169 if (personality(PER_LINUX32) < 0) {
3170 log_error("personality() failed: %m");
3176 if (arg_selinux_context)
3177 if (setexeccon((security_context_t) arg_selinux_context) < 0) {
3178 log_error("setexeccon(\"%s\") failed: %m", arg_selinux_context);
3183 if (!strv_isempty(arg_setenv)) {
3186 n = strv_env_merge(2, envp, arg_setenv);
3194 env_use = (char**) envp;
3196 /* Wait until the parent is ready with the setup, too... */
3197 r = eventfd_parent_succeeded(eventfds[0]);
3198 eventfds[0] = safe_close(eventfds[0]);
3206 /* Automatically search for the init system */
3208 l = 1 + argc - optind;
3209 a = newa(char*, l + 1);
3210 memcpy(a + 1, argv + optind, l * sizeof(char*));
3212 a[0] = (char*) "/usr/lib/systemd/systemd";
3213 execve(a[0], a, env_use);
3215 a[0] = (char*) "/lib/systemd/systemd";
3216 execve(a[0], a, env_use);
3218 a[0] = (char*) "/sbin/init";
3219 execve(a[0], a, env_use);
3220 } else if (argc > optind)
3221 execvpe(argv[optind], argv + optind, env_use);
3223 chdir(home ? home : "/root");
3224 execle("/bin/bash", "-bash", NULL, env_use);
3225 execle("/bin/sh", "-sh", NULL, env_use);
3228 log_error("execv() failed: %m");
3231 /* Tell the parent that the setup failed, so he
3232 * can clean up resources and terminate. */
3233 if (eventfds[1] != -1)
3234 eventfd_send_state(eventfds[1],
3235 EVENTFD_CHILD_FAILED);
3236 _exit(EXIT_FAILURE);
3242 /* Wait for the child event:
3243 * If EVENTFD_CHILD_FAILED, the child will terminate soon.
3244 * If EVENTFD_CHILD_SUCCEEDED, the child is reporting that
3245 * it is ready with all it needs to do with privileges.
3246 * After we got the notification we can make the process
3247 * join its cgroup which might limit what it can do */
3248 r = eventfd_child_succeeded(eventfds[1]);
3249 eventfds[1] = safe_close(eventfds[1]);
3251 goto check_container_status;
3253 r = register_machine(pid);
3257 r = move_network_interfaces(pid);
3261 r = setup_veth(pid, veth_name);
3265 r = setup_bridge(veth_name);
3269 r = setup_macvlan(pid);
3273 /* Block SIGCHLD here, before notifying child.
3274 * process_pty() will handle it with the other signals. */
3275 r = sigprocmask(SIG_BLOCK, &mask_chld, NULL);
3279 /* Reset signal to default */
3280 r = default_signals(SIGCHLD, -1);
3284 /* Notify the child that the parent is ready with all
3285 * its setup, and that the child can now hand over
3286 * control to the code to run inside the container. */
3287 r = eventfd_send_state(eventfds[0],
3288 EVENTFD_PARENT_SUCCEEDED);
3289 eventfds[0] = safe_close(eventfds[0]);
3293 k = process_pty(master, &mask, arg_boot ? pid : 0, SIGRTMIN+3);
3302 /* Kill if it is not dead yet anyway */
3303 terminate_machine(pid);
3305 check_container_status:
3306 /* Redundant, but better safe than sorry */
3309 r = wait_for_container(pid, &container_status);
3313 /* We failed to wait for the container, or the
3314 * container exited abnormally */
3317 } else if (r > 0 || container_status == CONTAINER_TERMINATED)
3318 /* The container exited with a non-zero
3319 * status, or with zero status and no reboot
3323 /* CONTAINER_REBOOTED, loop again */
3327 loop_remove(loop_nr, &image_fd);
3332 free(arg_directory);
3335 strv_free(arg_setenv);
3336 strv_free(arg_network_interfaces);
3337 strv_free(arg_network_macvlan);
3338 strv_free(arg_bind);
3339 strv_free(arg_bind_ro);
3340 strv_free(arg_tmpfs);