1 /*-*- Mode: C; c-basic-offset: 8; indent-tabs-mode: nil -*-*/
4 This file is part of systemd.
6 Copyright 2010 Lennart Poettering
8 systemd is free software; you can redistribute it and/or modify it
9 under the terms of the GNU Lesser General Public License as published by
10 the Free Software Foundation; either version 2.1 of the License, or
11 (at your option) any later version.
13 systemd is distributed in the hope that it will be useful, but
14 WITHOUT ANY WARRANTY; without even the implied warranty of
15 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
16 Lesser General Public License for more details.
18 You should have received a copy of the GNU Lesser General Public License
19 along with systemd; If not, see <http://www.gnu.org/licenses/>.
25 #include <sys/types.h>
26 #include <sys/syscall.h>
27 #include <sys/mount.h>
33 #include <sys/prctl.h>
34 #include <sys/capability.h>
37 #include <sys/signalfd.h>
41 #include <sys/socket.h>
42 #include <linux/netlink.h>
43 #include <sys/eventfd.h>
45 #include <linux/veth.h>
46 #include <sys/personality.h>
47 #include <linux/loop.h>
50 #include <selinux/selinux.h>
58 #include <blkid/blkid.h>
61 #include "sd-daemon.h"
71 #include "cgroup-util.h"
73 #include "path-util.h"
74 #include "loopback-setup.h"
75 #include "dev-setup.h"
80 #include "bus-error.h"
82 #include "bus-kernel.h"
85 #include "rtnl-util.h"
86 #include "udev-util.h"
87 #include "eventfd-util.h"
88 #include "blkid-util.h"
90 #include "siphash24.h"
94 #include "seccomp-util.h"
/* Enum declarations. NOTE(review): only the opening line of each enum is
 * visible here; LINK_AUTO, LINK_NO, LINK_GUEST and LINK_HOST are used as
 * LinkJournal values further down in this file. */
97 typedef enum ContainerStatus {
102 typedef enum LinkJournal {
/* Settings filled in by parse_argv(). The initializers are the defaults that
 * apply when the corresponding command line option is not given. */
109 static char *arg_directory = NULL;
110 static char *arg_user = NULL;
111 static sd_id128_t arg_uuid = {};
112 static char *arg_machine = NULL;
113 static const char *arg_selinux_context = NULL;
114 static const char *arg_selinux_apifs_context = NULL;
115 static const char *arg_slice = NULL;
116 static bool arg_private_network = false;
117 static bool arg_read_only = false;
118 static bool arg_boot = false;
119 static LinkJournal arg_link_journal = LINK_AUTO;
/* Default set of capabilities to retain, one bit per capability number.
 * parse_argv() ORs in the --capability= set (plus CAP_NET_ADMIN when a
 * private network is requested) and masks out the --drop-capability= set;
 * drop_capabilities() passes the complement to capability_bounding_set_drop().
 * NOTE(review): the end of this initializer is not visible here. */
120 static uint64_t arg_retain =
121 (1ULL << CAP_CHOWN) |
122 (1ULL << CAP_DAC_OVERRIDE) |
123 (1ULL << CAP_DAC_READ_SEARCH) |
124 (1ULL << CAP_FOWNER) |
125 (1ULL << CAP_FSETID) |
126 (1ULL << CAP_IPC_OWNER) |
128 (1ULL << CAP_LEASE) |
129 (1ULL << CAP_LINUX_IMMUTABLE) |
130 (1ULL << CAP_NET_BIND_SERVICE) |
131 (1ULL << CAP_NET_BROADCAST) |
132 (1ULL << CAP_NET_RAW) |
133 (1ULL << CAP_SETGID) |
134 (1ULL << CAP_SETFCAP) |
135 (1ULL << CAP_SETPCAP) |
136 (1ULL << CAP_SETUID) |
137 (1ULL << CAP_SYS_ADMIN) |
138 (1ULL << CAP_SYS_CHROOT) |
139 (1ULL << CAP_SYS_NICE) |
140 (1ULL << CAP_SYS_PTRACE) |
141 (1ULL << CAP_SYS_TTY_CONFIG) |
142 (1ULL << CAP_SYS_RESOURCE) |
143 (1ULL << CAP_SYS_BOOT) |
144 (1ULL << CAP_AUDIT_WRITE) |
145 (1ULL << CAP_AUDIT_CONTROL) |
/* arg_bind, arg_bind_ro and arg_tmpfs are string vectors holding pairs of
 * entries; they are walked with STRV_FOREACH_PAIR() by mount_binds() and
 * mount_tmpfs(). */
147 static char **arg_bind = NULL;
148 static char **arg_bind_ro = NULL;
149 static char **arg_tmpfs = NULL;
150 static char **arg_setenv = NULL;
151 static bool arg_quiet = false;
152 static bool arg_share_system = false;
/* Registration is on by default; parse_argv() turns it off when
 * --share-system is used. */
153 static bool arg_register = true;
154 static bool arg_keep_unit = false;
155 static char **arg_network_interfaces = NULL;
156 static char **arg_network_macvlan = NULL;
157 static bool arg_network_veth = false;
158 static const char *arg_network_bridge = NULL;
/* 0xffffffffLU is the "not set" default; parse_argv() also treats this value
 * as the failure result of personality_from_string(). */
159 static unsigned long arg_personality = 0xffffffffLU;
160 static const char *arg_image = NULL;
/* Prints the usage text to stdout via printf(), substituting
 * program_invocation_short_name for the leading %s.
 *
 * NOTE(review): the return statement is not visible here.
 * NOTE(review): the "-j" line below says "--link-journal=host", while
 * parse_argv() contains an assignment of LINK_GUEST that presumably belongs
 * to the -j case -- confirm which of the two is intended. */
162 static int help(void) {
164 printf("%s [OPTIONS...] [PATH] [ARGUMENTS...]\n\n"
165 "Spawn a minimal namespace container for debugging, testing and building.\n\n"
166 " -h --help Show this help\n"
167 " --version Print version string\n"
168 " -q --quiet Do not show status information\n"
169 " -D --directory=PATH Root directory for the container\n"
170 " -i --image=PATH File system device or image for the container\n"
171 " -b --boot Boot up full system (i.e. invoke init)\n"
172 " -u --user=USER Run the command under specified user or uid\n"
173 " -M --machine=NAME Set the machine name for the container\n"
174 " --uuid=UUID Set a specific machine UUID for the container\n"
175 " -S --slice=SLICE Place the container in the specified slice\n"
176 " --private-network Disable network in container\n"
177 " --network-interface=INTERFACE\n"
178 " Assign an existing network interface to the\n"
180 " --network-macvlan=INTERFACE\n"
181 " Create a macvlan network interface based on an\n"
182 " existing network interface to the container\n"
183 " --network-veth Add a virtual ethernet connection between host\n"
185 " --network-bridge=INTERFACE\n"
186 " Add a virtual ethernet connection between host\n"
187 " and container and add it to an existing bridge on\n"
189 " -Z --selinux-context=SECLABEL\n"
190 " Set the SELinux security context to be used by\n"
191 " processes in the container\n"
192 " -L --selinux-apifs-context=SECLABEL\n"
193 " Set the SELinux security context to be used by\n"
194 " API/tmpfs file systems in the container\n"
195 " --capability=CAP In addition to the default, retain specified\n"
197 " --drop-capability=CAP Drop the specified capability from the default set\n"
198 " --link-journal=MODE Link up guest journal, one of no, auto, guest, host\n"
199 " -j Equivalent to --link-journal=host\n"
200 " --read-only Mount the root directory read-only\n"
201 " --bind=PATH[:PATH] Bind mount a file or directory from the host into\n"
203 " --bind-ro=PATH[:PATH] Similar, but creates a read-only bind mount\n"
204 " --tmpfs=PATH:[OPTIONS] Mount an empty tmpfs to the specified directory\n"
205 " --setenv=NAME=VALUE Pass an environment variable to PID 1\n"
206 " --share-system Share system namespaces with host\n"
207 " --register=BOOLEAN Register container as machine\n"
208 " --keep-unit Do not register a scope for the machine, reuse\n"
209 " the service unit nspawn is running in\n",
210 program_invocation_short_name);
/* Parses the command line with getopt_long() and stores the result in the
 * file-level arg_* variables. Invalid values are reported with log_error().
 *
 * NOTE(review): most case labels, break statements and return statements of
 * this function are not visible here; comments below only describe what the
 * visible statements do. */
215 static int parse_argv(int argc, char *argv[]) {
232 ARG_NETWORK_INTERFACE,
/* Long option table; short options reuse their character as the value,
 * long-only options use ARG_* constants. */
239 static const struct option options[] = {
240 { "help", no_argument, NULL, 'h' },
241 { "version", no_argument, NULL, ARG_VERSION },
242 { "directory", required_argument, NULL, 'D' },
243 { "user", required_argument, NULL, 'u' },
244 { "private-network", no_argument, NULL, ARG_PRIVATE_NETWORK },
245 { "boot", no_argument, NULL, 'b' },
246 { "uuid", required_argument, NULL, ARG_UUID },
247 { "read-only", no_argument, NULL, ARG_READ_ONLY },
248 { "capability", required_argument, NULL, ARG_CAPABILITY },
249 { "drop-capability", required_argument, NULL, ARG_DROP_CAPABILITY },
250 { "link-journal", required_argument, NULL, ARG_LINK_JOURNAL },
251 { "bind", required_argument, NULL, ARG_BIND },
252 { "bind-ro", required_argument, NULL, ARG_BIND_RO },
253 { "tmpfs", required_argument, NULL, ARG_TMPFS },
254 { "machine", required_argument, NULL, 'M' },
255 { "slice", required_argument, NULL, 'S' },
256 { "setenv", required_argument, NULL, ARG_SETENV },
257 { "selinux-context", required_argument, NULL, 'Z' },
258 { "selinux-apifs-context", required_argument, NULL, 'L' },
259 { "quiet", no_argument, NULL, 'q' },
260 { "share-system", no_argument, NULL, ARG_SHARE_SYSTEM },
261 { "register", required_argument, NULL, ARG_REGISTER },
262 { "keep-unit", no_argument, NULL, ARG_KEEP_UNIT },
263 { "network-interface", required_argument, NULL, ARG_NETWORK_INTERFACE },
264 { "network-macvlan", required_argument, NULL, ARG_NETWORK_MACVLAN },
265 { "network-veth", no_argument, NULL, ARG_NETWORK_VETH },
266 { "network-bridge", required_argument, NULL, ARG_NETWORK_BRIDGE },
267 { "personality", required_argument, NULL, ARG_PERSONALITY },
268 { "image", required_argument, NULL, 'i' },
/* Bit masks collected from --capability= (plus) and --drop-capability=
 * (minus); they are folded into arg_retain at the end of the function. */
273 uint64_t plus = 0, minus = 0;
/* The leading '+' in the option string makes getopt_long() stop at the first
 * non-option argument (see getopt(3)). */
278 while ((c = getopt_long(argc, argv, "+hD:u:bL:M:jS:Z:qi:", options, NULL)) >= 0) {
286 puts(PACKAGE_STRING);
287 puts(SYSTEMD_FEATURES);
/* Root directory: keep the canonicalized form of the path given. */
292 arg_directory = canonicalize_file_name(optarg);
293 if (!arg_directory) {
294 log_error("Invalid root directory: %m");
306 arg_user = strdup(optarg);
312 case ARG_NETWORK_BRIDGE:
313 arg_network_bridge = optarg;
/* --network-veth also switches on the private network. */
317 case ARG_NETWORK_VETH:
318 arg_network_veth = true;
319 arg_private_network = true;
/* Each --network-interface= is appended to arg_network_interfaces and also
 * switches on the private network. */
322 case ARG_NETWORK_INTERFACE:
323 if (strv_extend(&arg_network_interfaces, optarg) < 0)
326 arg_private_network = true;
329 case ARG_NETWORK_MACVLAN:
330 if (strv_extend(&arg_network_macvlan, optarg) < 0)
335 case ARG_PRIVATE_NETWORK:
336 arg_private_network = true;
344 r = sd_id128_from_string(optarg, &arg_uuid);
346 log_error("Invalid UUID: %s", optarg);
/* Machine name: a non-empty name has to pass hostname_is_valid() before a
 * copy is stored in arg_machine. */
356 if (isempty(optarg)) {
361 if (!hostname_is_valid(optarg)) {
362 log_error("Invalid machine name: %s", optarg);
367 arg_machine = strdup(optarg);
375 arg_selinux_context = optarg;
379 arg_selinux_apifs_context = optarg;
383 arg_read_only = true;
/* Capability lists are comma-separated. The word "all" sets every bit of the
 * respective mask; any other word is resolved with cap_from_name(). The
 * current option (c) decides whether the bit goes into plus or minus. */
387 case ARG_DROP_CAPABILITY: {
391 FOREACH_WORD_SEPARATOR(word, length, optarg, ",", state) {
392 _cleanup_free_ char *t;
395 t = strndup(word, length);
399 if (streq(t, "all")) {
400 if (c == ARG_CAPABILITY)
401 plus = (uint64_t) -1;
403 minus = (uint64_t) -1;
405 if (cap_from_name(t, &cap) < 0) {
406 log_error("Failed to parse capability %s.", t);
410 if (c == ARG_CAPABILITY)
411 plus |= 1ULL << (uint64_t) cap;
413 minus |= 1ULL << (uint64_t) cap;
/* NOTE(review): presumably the -j case (its label is not visible here).
 * help() describes -j as equivalent to --link-journal=host, but this
 * assignment selects LINK_GUEST -- confirm which is intended. */
421 arg_link_journal = LINK_GUEST;
424 case ARG_LINK_JOURNAL:
425 if (streq(optarg, "auto"))
426 arg_link_journal = LINK_AUTO;
427 else if (streq(optarg, "no"))
428 arg_link_journal = LINK_NO;
429 else if (streq(optarg, "guest"))
430 arg_link_journal = LINK_GUEST;
431 else if (streq(optarg, "host"))
432 arg_link_journal = LINK_HOST;
434 log_error("Failed to parse link journal mode %s", optarg);
/* --bind= / --bind-ro=: the argument is split at the first ':'; both
 * resulting paths must be absolute and are appended as a pair to arg_bind
 * or arg_bind_ro. NOTE(review): how b is filled in is not visible here. */
442 _cleanup_free_ char *a = NULL, *b = NULL;
446 x = c == ARG_BIND ? &arg_bind : &arg_bind_ro;
448 e = strchr(optarg, ':');
450 a = strndup(optarg, e - optarg);
460 if (!path_is_absolute(a) || !path_is_absolute(b)) {
461 log_error("Invalid bind mount specification: %s", optarg);
465 r = strv_extend(x, a);
469 r = strv_extend(x, b);
/* --tmpfs=: path and mount options are pushed as a pair onto arg_tmpfs.
 * "mode=0755" is presumably the option string used when none is given
 * (the surrounding condition is not visible) -- confirm. */
477 _cleanup_free_ char *a = NULL, *b = NULL;
480 e = strchr(optarg, ':');
482 a = strndup(optarg, e - optarg);
486 b = strdup("mode=0755");
492 if (!path_is_absolute(a)) {
493 log_error("Invalid tmpfs specification: %s", optarg);
497 r = strv_push(&arg_tmpfs, a);
503 r = strv_push(&arg_tmpfs, b);
/* --setenv=: validate the assignment, then build an updated environment
 * list with strv_env_set() and release the previous one. */
515 if (!env_assignment_is_valid(optarg)) {
516 log_error("Environment variable assignment '%s' is not valid.", optarg);
520 n = strv_env_set(arg_setenv, optarg);
524 strv_free(arg_setenv);
533 case ARG_SHARE_SYSTEM:
534 arg_share_system = true;
538 r = parse_boolean(optarg);
540 log_error("Failed to parse --register= argument: %s", optarg);
548 arg_keep_unit = true;
551 case ARG_PERSONALITY:
553 arg_personality = personality_from_string(optarg);
554 if (arg_personality == 0xffffffffLU) {
555 log_error("Unknown or unsupported personality '%s'.", optarg);
565 assert_not_reached("Unhandled option");
/* Cross-option checks performed after the option loop. */
/* Sharing the system namespaces turns machine registration off. */
569 if (arg_share_system)
570 arg_register = false;
572 if (arg_boot && arg_share_system) {
573 log_error("--boot and --share-system may not be combined.");
/* --keep-unit is refused when cg_pid_get_owner_uid() succeeds for the
 * calling process (pid 0), which the message describes as being invoked
 * from a user session. */
577 if (arg_keep_unit && cg_pid_get_owner_uid(0, NULL) >= 0) {
578 log_error("--keep-unit may not be used when invoked from a user session.");
582 if (arg_directory && arg_image) {
583 log_error("--directory= and --image= may not be combined.");
/* Final capability mask: defaults plus requested additions (and
 * CAP_NET_ADMIN with a private network), minus the dropped ones. */
587 arg_retain = (arg_retain | plus | (arg_private_network ? 1ULL << CAP_NET_ADMIN : 0)) & ~minus;
/* Mounts the entries of a fixed table (proc, sysfs, tmpfs, devpts, ...)
 * below the directory dest.
 *
 * NOTE(review): the MountPoint field declarations and the return statements
 * are not visible here. The code below reads the fields what, where,
 * options, flags and fatal; the third initializer looks like the file
 * system type. */
592 static int mount_all(const char *dest) {
594 typedef struct MountPoint {
603 static const MountPoint mount_table[] = {
604 { "proc", "/proc", "proc", NULL, MS_NOSUID|MS_NOEXEC|MS_NODEV, true },
605 { "/proc/sys", "/proc/sys", NULL, NULL, MS_BIND, true }, /* Bind mount first */
606 { NULL, "/proc/sys", NULL, NULL, MS_BIND|MS_RDONLY|MS_REMOUNT, true }, /* Then, make it r/o */
607 { "sysfs", "/sys", "sysfs", NULL, MS_RDONLY|MS_NOSUID|MS_NOEXEC|MS_NODEV, true },
608 { "tmpfs", "/dev", "tmpfs", "mode=755", MS_NOSUID|MS_STRICTATIME, true },
609 { "devpts", "/dev/pts", "devpts","newinstance,ptmxmode=0666,mode=620,gid=" STRINGIFY(TTY_GID), MS_NOSUID|MS_NOEXEC, true },
610 { "tmpfs", "/dev/shm", "tmpfs", "mode=1777", MS_NOSUID|MS_NODEV|MS_STRICTATIME, true },
611 { "tmpfs", "/run", "tmpfs", "mode=755", MS_NOSUID|MS_NODEV|MS_STRICTATIME, true },
613 { "/sys/fs/selinux", "/sys/fs/selinux", NULL, NULL, MS_BIND, false }, /* Bind mount first */
614 { NULL, "/sys/fs/selinux", NULL, NULL, MS_BIND|MS_RDONLY|MS_REMOUNT, false }, /* Then, make it r/o */
621 for (k = 0; k < ELEMENTSOF(mount_table); k++) {
622 _cleanup_free_ char *where = NULL;
624 _cleanup_free_ char *options = NULL;
/* Target path inside the container tree. */
629 where = strjoin(dest, "/", mount_table[k].where, NULL);
633 t = path_is_mount_point(where, true);
635 log_error("Failed to detect whether %s is a mount point: %s", where, strerror(-t));
/* Entries with a NULL "what" are the remount steps of the table; every
 * other entry is skipped when its target already is a mount point. */
643 /* Skip this entry if it is not a remount. */
644 if (mount_table[k].what && t > 0)
647 mkdir_p(where, 0755);
/* With --selinux-apifs-context=, tmpfs and devpts mounts get a context=
 * option appended to the options from the table. */
650 if (arg_selinux_apifs_context &&
651 (streq_ptr(mount_table[k].what, "tmpfs") || streq_ptr(mount_table[k].what, "devpts"))) {
652 options = strjoin(mount_table[k].options, ",context=\"", arg_selinux_apifs_context, "\"", NULL);
659 o = mount_table[k].options;
/* The fatal flag is part of the condition guarding the error message
 * below, i.e. failures of non-fatal entries are not reported here. */
662 if (mount(mount_table[k].what,
665 mount_table[k].flags,
667 mount_table[k].fatal) {
669 log_error("mount(%s) failed: %m", where);
/* Bind mounts every (source, destination) pair of the string vector l into
 * the container tree rooted at dest.
 *
 * dest: container root directory, prefixed to each destination path.
 * l:    vector of pairs, walked with STRV_FOREACH_PAIR().
 * ro:   NOTE(review): presumably selects the read-only remount at the end
 *       of the loop body; the condition itself is not visible here.
 *
 * NOTE(review): return statements are not visible here. */
679 static int mount_binds(const char *dest, char **l, bool ro) {
682 STRV_FOREACH_PAIR(x, y, l) {
683 _cleanup_free_ char *where = NULL;
684 struct stat source_st, dest_st;
687 if (stat(*x, &source_st) < 0) {
688 log_error("Failed to stat %s: %m", *x);
692 where = strappend(dest, *y);
/* If the destination exists, its file type must equal the type of the
 * source; if it does not exist (ENOENT), its parent directories and the
 * mount point itself are created below. */
696 r = stat(where, &dest_st);
698 if ((source_st.st_mode & S_IFMT) != (dest_st.st_mode & S_IFMT)) {
699 log_error("The file types of %s and %s do not match. Refusing bind mount", *x, where);
702 } else if (errno == ENOENT) {
703 r = mkdir_parents_label(where, 0755);
705 log_error("Failed to bind mount %s: %s", *x, strerror(-r));
709 log_error("Failed to bind mount %s: %m", *x);
713 /* Create the mount point, but be conservative -- refuse to create block
714 * and char devices. */
715 if (S_ISDIR(source_st.st_mode))
716 mkdir_label(where, 0755);
717 else if (S_ISFIFO(source_st.st_mode))
719 else if (S_ISSOCK(source_st.st_mode))
720 mknod(where, 0644 | S_IFSOCK, 0);
721 else if (S_ISREG(source_st.st_mode))
724 log_error("Refusing to create mountpoint for file: %s", *x);
728 if (mount(*x, where, "bind", MS_BIND, NULL) < 0) {
729 log_error("mount(%s) failed: %m", where);
734 r = bind_remount_recursive(where, true);
736 log_error("Read-Only bind mount failed: %s", strerror(-r));
/* Mounts a tmpfs for every (path, options) pair stored in arg_tmpfs.
 * The path is taken relative to the container root dest; the options string
 * is handed to mount() as file system data.
 *
 * NOTE(review): return statements are not visible here. */
745 static int mount_tmpfs(const char *dest) {
748 STRV_FOREACH_PAIR(i, o, arg_tmpfs) {
749 _cleanup_free_ char *where = NULL;
751 where = strappend(dest, *i);
/* Best effort: the result of mkdir_label() is not checked. */
755 mkdir_label(where, 0755);
757 if (mount("tmpfs", where, "tmpfs", MS_NODEV|MS_STRICTATIME, *o) < 0) {
758 log_error("tmpfs mount to %s failed: %m", where);
/* Points <dest>/etc/localtime at the same zone as the host's /etc/localtime.
 *
 * The host link is read and must lead into /usr/share/zoneinfo/ (relative or
 * absolute form); otherwise a warning is logged. If the container link
 * already names the same zone nothing needs to be done. The zone file must
 * exist inside the container before a relative symlink is created.
 *
 * NOTE(review): return statements and the statements between the visible
 * lines are not shown here. */
766 static int setup_timezone(const char *dest) {
767 _cleanup_free_ char *where = NULL, *p = NULL, *q = NULL, *check = NULL, *what = NULL;
773 /* Fix the timezone, if possible */
774 r = readlink_malloc("/etc/localtime", &p);
776 log_warning("/etc/localtime is not a symlink, not updating container timezone.");
/* z: zone name of the host, i.e. the link target with the zoneinfo prefix
 * removed. */
780 z = path_startswith(p, "../usr/share/zoneinfo/");
782 z = path_startswith(p, "/usr/share/zoneinfo/");
784 log_warning("/etc/localtime does not point into /usr/share/zoneinfo/, not updating container timezone.");
788 where = strappend(dest, "/etc/localtime");
/* y: zone name currently configured in the container, derived the same
 * way from the container's symlink. */
792 r = readlink_malloc(where, &q);
794 y = path_startswith(q, "../usr/share/zoneinfo/");
796 y = path_startswith(q, "/usr/share/zoneinfo/");
799 /* Already pointing to the right place? Then do nothing .. */
800 if (y && streq(y, z))
804 check = strjoin(dest, "/usr/share/zoneinfo/", z, NULL);
808 if (access(check, F_OK) < 0) {
809 log_warning("Timezone %s does not exist in container, not updating container timezone.", z);
813 what = strappend("../usr/share/zoneinfo/", z);
818 if (symlink(what, where) < 0) {
819 log_error("Failed to correct timezone of container: %m");
/* Copies the host's /etc/resolv.conf to <dest>/etc/resolv.conf.
 * The arg_private_network check comes first; NOTE(review): its branch body
 * (presumably an early return) and the return statements are not visible
 * here. */
826 static int setup_resolv_conf(const char *dest) {
827 char _cleanup_free_ *where = NULL;
831 if (arg_private_network)
834 /* Fix resolv.conf, if possible */
835 where = strappend(dest, "/etc/resolv.conf");
839 /* The result of the copy is deliberately ignored: this is a
840 * best-effort step and a failure is not treated as an error. */
841 copy_file("/etc/resolv.conf", where, O_TRUNC|O_NOFOLLOW, 0644);
/* Formats the 16 bytes of id as lower-case hex digits in 8-4-4-4-12 grouping.
 * s has room for 37 bytes, i.e. the 36 characters of that layout plus a
 * terminating NUL.
 * NOTE(review): the formatting call itself and the return statement are not
 * visible here; presumably the result is written to s and s is returned. */
846 static char* id128_format_as_uuid(sd_id128_t id, char s[37]) {
849 "%02x%02x%02x%02x-%02x%02x-%02x%02x-%02x%02x-%02x%02x%02x%02x%02x%02x",
850 SD_ID128_FORMAT_VAL(id));
/* Gives the container its own boot ID: a random 128 bit ID is written to a
 * file below <dest>/dev and that file is bind mounted over
 * <dest>/proc/sys/kernel/random/boot_id.
 *
 * The arg_share_system check comes first; NOTE(review): its branch body and
 * the return statements are not visible here. */
855 static int setup_boot_id(const char *dest) {
856 _cleanup_free_ char *from = NULL, *to = NULL;
863 if (arg_share_system)
866 /* Generate a new randomized boot ID, so that each boot-up of
867 * the container gets a new one */
869 from = strappend(dest, "/dev/proc-sys-kernel-random-boot-id");
870 to = strappend(dest, "/proc/sys/kernel/random/boot_id");
874 r = sd_id128_randomize(&rnd);
876 log_error("Failed to generate random boot id: %s", strerror(-r));
880 id128_format_as_uuid(rnd, as_uuid);
882 r = write_string_file(from, as_uuid);
884 log_error("Failed to write boot id: %s", strerror(-r));
/* A failing bind mount is logged as an error; a failing read-only remount
 * afterwards is only logged as a warning. */
888 if (mount(from, to, "bind", MS_BIND, NULL) < 0) {
889 log_error("Failed to bind mount boot id: %m");
891 } else if (mount(from, to, "bind", MS_BIND|MS_REMOUNT|MS_RDONLY, NULL))
892 log_warning("Failed to make boot id read-only: %m");
/* Recreates a fixed list of host device nodes below <dest>/dev with the
 * same mode and device number as on the host.
 *
 * NOTE(review): the contents of the devnodes list and the return statements
 * are not visible here. */
898 static int copy_devnodes(const char *dest) {
900 static const char devnodes[] =
910 _cleanup_umask_ mode_t u;
916 NULSTR_FOREACH(d, devnodes) {
917 _cleanup_free_ char *from = NULL, *to = NULL;
920 from = strappend("/dev/", d);
921 to = strjoin(dest, "/dev/", d, NULL);
/* Only stat() errors other than ENOENT are reported. */
925 if (stat(from, &st) < 0) {
927 if (errno != ENOENT) {
928 log_error("Failed to stat %s: %m", from);
932 } else if (!S_ISCHR(st.st_mode) && !S_ISBLK(st.st_mode)) {
934 log_error("%s is not a char or block device, cannot copy", from);
937 } else if (mknod(to, st.st_mode, st.st_rdev) < 0) {
/* NOTE(review): the node being created is "to", but the message prints
 * "dest" (the container root) -- likely meant to print "to". */
939 log_error("mknod(%s) failed: %m", dest);
/* Creates <dest>/dev/ptmx as a symlink to "pts/ptmx".
 * NOTE(review): return statements are not visible here. */
947 static int setup_ptmx(const char *dest) {
948 _cleanup_free_ char *p = NULL;
950 p = strappend(dest, "/dev/ptmx");
954 if (symlink("pts/ptmx", p) < 0) {
955 log_error("Failed to create /dev/ptmx symlink: %m");
/* Makes the TTY named by console available as <dest>/dev/console.
 *
 * dest:    container root directory.
 * console: path of the TTY; its access mode and ownership are set to
 *          0600, uid 0, gid 0 before it is bind mounted.
 *
 * NOTE(review): return statements are not visible here. */
962 static int setup_dev_console(const char *dest, const char *console) {
963 _cleanup_umask_ mode_t u;
/* /dev/null of the host supplies the file type and device number for the
 * placeholder node created below. */
973 if (stat("/dev/null", &st) < 0) {
974 log_error("Failed to stat /dev/null: %m");
978 r = chmod_and_chown(console, 0600, 0, 0);
980 log_error("Failed to correct access mode for TTY: %s", strerror(-r));
984 /* We need to bind mount the right tty to /dev/console since
985 * ptys can only exist on pts file systems. To have something
986 * to bind mount things on we create a device node first, and
987 * use /dev/null for that since the cgroups device policy
988 * allows us to create that freely, while we cannot create
989 * /dev/console. (Note that the major minor doesn't actually
990 * matter here, since we mount it over anyway). */
992 to = strappenda(dest, "/dev/console");
993 if (mknod(to, (st.st_mode & ~07777) | 0600, st.st_rdev) < 0) {
994 log_error("mknod() for /dev/console failed: %m");
998 if (mount(console, to, "bind", MS_BIND, NULL) < 0) {
999 log_error("Bind mount for /dev/console failed: %m");
/* Provides /proc/kmsg inside the container as a FIFO and hands an open file
 * descriptor of that FIFO to the peer of kmsg_socket.
 *
 * dest:        container root directory.
 * kmsg_socket: socket the FIFO descriptor is sent over; must be >= 0.
 *
 * NOTE(review): return statements and the final removal of the FIFO path
 * announced by the last comment are not visible here. */
1006 static int setup_kmsg(const char *dest, int kmsg_socket) {
1007 _cleanup_free_ char *from = NULL, *to = NULL;
1009 _cleanup_umask_ mode_t u;
/* Control message buffer with room for exactly one file descriptor. */
1011 struct cmsghdr cmsghdr;
1012 uint8_t buf[CMSG_SPACE(sizeof(int))];
1014 struct msghdr mh = {
1015 .msg_control = &control,
1016 .msg_controllen = sizeof(control),
1018 struct cmsghdr *cmsg;
1021 assert(kmsg_socket >= 0);
1025 /* We create the kmsg FIFO as /dev/kmsg, but immediately
1026 * delete it after bind mounting it to /proc/kmsg. While FIFOs
1027 * on the reading side behave very similar to /proc/kmsg,
1028 * their writing side behaves differently from /dev/kmsg in
1029 * that writing blocks when nothing is reading. In order to
1030 * avoid any problems with containers deadlocking due to this
1031 * we simply make /dev/kmsg unavailable to the container. */
1032 if (asprintf(&from, "%s/dev/kmsg", dest) < 0 ||
1033 asprintf(&to, "%s/proc/kmsg", dest) < 0)
1036 if (mkfifo(from, 0600) < 0) {
1037 log_error("mkfifo() for /dev/kmsg failed: %m");
1041 r = chmod_and_chown(from, 0600, 0, 0);
1043 log_error("Failed to correct access mode for /dev/kmsg: %s", strerror(-r));
1047 if (mount(from, to, "bind", MS_BIND, NULL) < 0) {
1048 log_error("Bind mount for /proc/kmsg failed: %m");
1052 fd = open(from, O_RDWR|O_NDELAY|O_CLOEXEC);
1054 log_error("Failed to open fifo: %m");
/* Pack fd into an SCM_RIGHTS control message. */
1058 cmsg = CMSG_FIRSTHDR(&mh);
1059 cmsg->cmsg_level = SOL_SOCKET;
1060 cmsg->cmsg_type = SCM_RIGHTS;
1061 cmsg->cmsg_len = CMSG_LEN(sizeof(int));
1062 memcpy(CMSG_DATA(cmsg), &fd, sizeof(int));
1064 mh.msg_controllen = cmsg->cmsg_len;
1066 /* Store away the fd in the socket, so that it stays open as
1067 * long as we run the child */
1068 k = sendmsg(kmsg_socket, &mh, MSG_DONTWAIT|MSG_NOSIGNAL);
1072 log_error("Failed to send FIFO fd: %m");
1076 /* And now make the FIFO unavailable as /dev/kmsg... */
/* Sets the host name of the calling process' UTS namespace to arg_machine.
 * The arg_share_system check comes first; NOTE(review): its branch body, the
 * error handling of sethostname() and the return statements are not visible
 * here. arg_machine is passed to strlen() and therefore must not be NULL. */
1081 static int setup_hostname(void) {
1083 if (arg_share_system)
1086 if (sethostname(arg_machine, strlen(arg_machine)) < 0)
/* Links the journal directory of the container with the host according to
 * arg_link_journal.
 *
 * directory: container root directory.
 *
 * Two paths are involved: p = /var/log/journal/<machine-id> on the host and
 * q = the same path below the container root. Depending on the mode either
 * p becomes a symlink to q (LINK_GUEST) or p is bind mounted onto q.
 *
 * NOTE(review): many conditions and all return statements of this function
 * are not visible here; the comments below only describe visible
 * statements. */
1092 static int setup_journal(const char *directory) {
1093 sd_id128_t machine_id, this_id;
1094 _cleanup_free_ char *p = NULL, *b = NULL, *q = NULL, *d = NULL;
/* Read the machine ID of the container from its /etc/machine-id. */
1098 p = strappend(directory, "/etc/machine-id");
1102 r = read_one_line_file(p, &b);
1103 if (r == -ENOENT && arg_link_journal == LINK_AUTO)
1106 log_error("Failed to read machine ID from %s: %s", p, strerror(-r));
1111 if (isempty(id) && arg_link_journal == LINK_AUTO)
1114 /* Verify validity */
1115 r = sd_id128_from_string(id, &machine_id);
1117 log_error("Failed to parse machine ID from %s: %s", p, strerror(-r));
1121 r = sd_id128_get_machine(&this_id);
1123 log_error("Failed to retrieve machine ID: %s", strerror(-r));
/* A container that carries the host's own machine ID is not linked; in
 * LINK_AUTO mode this is reported as a warning, otherwise as an error. */
1127 if (sd_id128_equal(machine_id, this_id)) {
1128 log_full(arg_link_journal == LINK_AUTO ? LOG_WARNING : LOG_ERR,
1129 "Host and machine ids are equal (%s): refusing to link journals", id);
1130 if (arg_link_journal == LINK_AUTO)
1136 if (arg_link_journal == LINK_NO)
1140 p = strappend("/var/log/journal/", id);
1141 q = strjoin(directory, "/var/log/journal/", id, NULL);
/* Neither path may already be a mount point; this is only logged as an
 * error when the mode is not LINK_AUTO. */
1145 if (path_is_mount_point(p, false) > 0) {
1146 if (arg_link_journal != LINK_AUTO) {
1147 log_error("%s: already a mount point, refusing to use for journal", p);
1154 if (path_is_mount_point(q, false) > 0) {
1155 if (arg_link_journal != LINK_AUTO) {
1156 log_error("%s: already a mount point, refusing to use for journal", q);
/* Inspect what currently exists at p: a symlink (target returned in d),
 * something that is not a symlink (-EINVAL), or nothing (-ENOENT). */
1163 r = readlink_and_make_absolute(p, &d);
1165 if ((arg_link_journal == LINK_GUEST ||
1166 arg_link_journal == LINK_AUTO) &&
1169 r = mkdir_p(q, 0755);
1171 log_warning("failed to create directory %s: %m", q);
1175 if (unlink(p) < 0) {
1176 log_error("Failed to remove symlink %s: %m", p);
1179 } else if (r == -EINVAL) {
1181 if (arg_link_journal == LINK_GUEST &&
1184 if (errno == ENOTDIR) {
1185 log_error("%s already exists and is neither a symlink nor a directory", p);
1188 log_error("Failed to remove %s: %m", p);
1192 } else if (r != -ENOENT) {
1193 log_error("readlink(%s) failed: %m", p);
/* LINK_GUEST: the host path becomes a symlink into the container. */
1197 if (arg_link_journal == LINK_GUEST) {
1199 if (symlink(q, p) < 0) {
1200 log_error("Failed to symlink %s to %s: %m", q, p);
1204 r = mkdir_p(q, 0755);
1206 log_warning("failed to create directory %s: %m", q);
/* LINK_HOST: make sure the host directory exists; in the other remaining
 * case its existence is only probed with access(). */
1210 if (arg_link_journal == LINK_HOST) {
1211 r = mkdir_p(p, 0755);
1213 log_error("Failed to create %s: %m", p);
1217 } else if (access(p, F_OK) < 0)
1220 if (dir_is_empty(q) == 0)
1221 log_warning("%s is not empty, proceeding anyway.", q);
1223 r = mkdir_p(q, 0755);
1225 log_error("Failed to create %s: %m", q);
/* Bind mount the host journal directory onto the container's. */
1229 if (mount(p, q, "bind", MS_BIND, NULL) < 0) {
1230 log_error("Failed to bind mount journal from host into guest: %m");
/* Creates the directory <dest>/dev/kdbus (mode 0755) and bind mounts path
 * onto it.
 *
 * dest: container root directory.
 * path: source of the bind mount; the error message calls it the kdbus
 *       domain path.
 *
 * NOTE(review): return statements are not visible here. */
1237 static int setup_kdbus(const char *dest, const char *path) {
1243 p = strappenda(dest, "/dev/kdbus");
1244 if (mkdir(p, 0755) < 0) {
1245 log_error("Failed to create kdbus path: %m");
1249 if (mount(path, p, "bind", MS_BIND, NULL) < 0) {
1250 log_error("Failed to mount kdbus domain path: %m");
/* Hands the complement of arg_retain to capability_bounding_set_drop() and
 * returns its result, i.e. presumably every capability not listed in
 * arg_retain is removed from the bounding set.
 * NOTE(review): the meaning of the second argument (false) is defined by
 * the helper, which is not visible here. */
1257 static int drop_capabilities(void) {
1258 return capability_bounding_set_drop(~arg_retain, false);
/* Announces the container to org.freedesktop.machine1 on the system bus.
 *
 * pid: NOTE(review): presumably the PID of the container's leader process;
 *      its use is not visible here.
 *
 * With arg_keep_unit a single sd_bus_call_method() is issued. Otherwise a
 * method call message is built by hand, extended with an array of (sv)
 * properties (Slice when arg_slice is set, DevicePolicy, DeviceAllow) and
 * sent with sd_bus_call().
 *
 * NOTE(review): the method names, several arguments and the return
 * statements are not visible here. */
1261 static int register_machine(pid_t pid) {
1262 _cleanup_bus_error_free_ sd_bus_error error = SD_BUS_ERROR_NULL;
1263 _cleanup_bus_unref_ sd_bus *bus = NULL;
1269 r = sd_bus_default_system(&bus);
1271 log_error("Failed to open system bus: %s", strerror(-r));
1275 if (arg_keep_unit) {
1276 r = sd_bus_call_method(
1278 "org.freedesktop.machine1",
1279 "/org/freedesktop/machine1",
1280 "org.freedesktop.machine1.Manager",
1286 SD_BUS_MESSAGE_APPEND_ID128(arg_uuid),
1290 strempty(arg_directory));
1292 _cleanup_bus_message_unref_ sd_bus_message *m = NULL;
1294 r = sd_bus_message_new_method_call(
1297 "org.freedesktop.machine1",
1298 "/org/freedesktop/machine1",
1299 "org.freedesktop.machine1.Manager",
1302 log_error("Failed to create message: %s", strerror(-r));
1306 r = sd_bus_message_append(
1310 SD_BUS_MESSAGE_APPEND_ID128(arg_uuid),
1314 strempty(arg_directory));
1316 log_error("Failed to append message arguments: %s", strerror(-r));
/* Property list: an array of (name, variant) structs. */
1320 r = sd_bus_message_open_container(m, 'a', "(sv)");
1322 log_error("Failed to open container: %s", strerror(-r));
1326 if (!isempty(arg_slice)) {
1327 r = sd_bus_message_append(m, "(sv)", "Slice", "s", arg_slice);
1329 log_error("Failed to append slice: %s", strerror(-r));
1334 r = sd_bus_message_append(m, "(sv)", "DevicePolicy", "s", "strict");
1336 log_error("Failed to add device policy: %s", strerror(-r));
/* The count argument announces ten (path, permissions) pairs. */
1340 r = sd_bus_message_append(m, "(sv)", "DeviceAllow", "a(ss)", 10,
1341 /* Allow the container to
1342 * access and create the API
1343 * device nodes, so that
1344 * PrivateDevices= in the
1345 * container can work
1350 "/dev/random", "rwm",
1351 "/dev/urandom", "rwm",
1353 /* Allow the container
1354 * access to ptys. However,
1356 * container to ever create
1357 * these device nodes. */
1358 "/dev/pts/ptmx", "rw",
1360 /* Allow the container
1361 * access to all kdbus
1362 * devices. Again, the
1363 * container cannot create
1364 * these nodes, only use
1365 * them. We use a pretty
1366 * open match here, so that
1367 * the kernel API can still
1370 "char-kdbus/*", "rw");
1372 log_error("Failed to add device whitelist: %s", strerror(-r));
1376 r = sd_bus_message_close_container(m);
1378 log_error("Failed to close container: %s", strerror(-r));
1382 r = sd_bus_call(bus, m, 0, &error, NULL);
1386 log_error("Failed to register machine: %s", bus_error_message(&error, r));
// Asks org.freedesktop.machine1 on the system bus to terminate the machine.
// First a Manager method is called whose reply carries the object path of
// the machine; then a method of the Machine interface is called on that path.
// Failures of both calls are only logged at debug level.
//
// pid: NOTE(review): presumably identifies the machine to look up; its use
//      is not visible here.
//
// NOTE(review): the method names, several arguments and the return
// statements are not visible here.
1393 static int terminate_machine(pid_t pid) {
1394 _cleanup_bus_error_free_ sd_bus_error error = SD_BUS_ERROR_NULL;
1395 _cleanup_bus_message_unref_ sd_bus_message *reply = NULL;
1396 _cleanup_bus_unref_ sd_bus *bus = NULL;
1403 r = sd_bus_default_system(&bus);
1405 log_error("Failed to open system bus: %s", strerror(-r));
1409 r = sd_bus_call_method(
1411 "org.freedesktop.machine1",
1412 "/org/freedesktop/machine1",
1413 "org.freedesktop.machine1.Manager",
1420 /* Note that the machine might already have been
1421 * cleaned up automatically, hence don't consider it a
1422 * failure if we cannot get the machine object. */
1423 log_debug("Failed to get machine: %s", bus_error_message(&error, r));
// The reply holds a single object path ("o").
1427 r = sd_bus_message_read(reply, "o", &path);
1429 return bus_log_parse_error(r);
1431 r = sd_bus_call_method(
1433 "org.freedesktop.machine1",
1435 "org.freedesktop.machine1.Machine",
1441 log_debug("Failed to terminate machine: %s", bus_error_message(&error, r));
/* Resets the audit login UID of the calling process by writing 4294967295
 * to /proc/self/loginuid, unless the file already contains that value.
 *
 * The arg_share_system check comes first; NOTE(review): its branch body,
 * the handling of read errors and the return statements are not visible
 * here. */
1448 static int reset_audit_loginuid(void) {
1449 _cleanup_free_ char *p = NULL;
1452 if (arg_share_system)
1455 r = read_one_line_file("/proc/self/loginuid", &p);
1459 log_error("Failed to read /proc/self/loginuid: %s", strerror(-r));
1463 /* Already reset? */
1464 if (streq(p, "4294967295"))
1467 r = write_string_file("/proc/self/loginuid", "4294967295");
/* NOTE(review): the message below reads "or to off auditing"; it
 * presumably should say "or to turn off auditing". */
1469 log_error("Failed to reset audit login UID. This probably means that your kernel is too\n"
1470 "old and you have audit enabled. Note that the auditing subsystem is known to\n"
1471 "be incompatible with containers on old kernels. Please make sure to upgrade\n"
1472 "your kernel or to off auditing with 'audit=0' on the kernel command line before\n"
1473 "using systemd-nspawn. Sleeping for 5s... (%s)\n", strerror(-r));
/* Fixed key for the siphash24() call in get_mac(). */
1481 #define HASH_KEY SD_ID128_MAKE(c3,c4,f9,19,b5,57,b2,1c,e6,cf,14,27,03,9c,ee,a2)

/* Derives a stable MAC address for the container: the host machine ID
 * followed by the bytes of arg_machine is hashed with siphash24(), and the
 * first ETH_ALEN bytes of the hash become the address.
 *
 * mac: receives the generated address.
 *
 * arg_machine is passed to strlen() and therefore must not be NULL.
 * NOTE(review): the allocation of v, the declaration of result and the
 * return statements are not visible here. */
1483 static int get_mac(struct ether_addr *mac) {
1490 l = strlen(arg_machine);
1491 sz = sizeof(sd_id128_t) + l;
1494 /* fetch some persistent data unique to the host */
1495 r = sd_id128_get_machine((sd_id128_t*) v);
1499 /* combine with some data unique (on this host) to this
1500 * container instance */
1501 memcpy(v + sizeof(sd_id128_t), arg_machine, l);
1503 /* Let's hash the host machine ID plus the container name. We
1504 * use a fixed, but originally randomly created hash key here. */
1505 siphash24(result, v, sz, HASH_KEY.bytes);
/* Compile-time check that the hash result is large enough to supply
 * ETH_ALEN bytes. */
1507 assert_cc(ETH_ALEN <= sizeof(result));
1508 memcpy(mac->ether_addr_octet, result, ETH_ALEN);
1510 /* see eth_random_addr in the kernel */
1511 mac->ether_addr_octet[0] &= 0xfe; /* clear multicast bit */
1512 mac->ether_addr_octet[0] |= 0x02; /* set local assignment bit (IEEE802) */
/* Creates a veth pair via netlink: the host side is named "ve-" or "vb-"
 * plus arg_machine, the peer is named "host0", gets a MAC address and is
 * assigned to the network namespace of pid.
 *
 * pid:        process whose network namespace receives the peer interface.
 * iface_name: buffer of IFNAMSIZ bytes that receives the host-side name.
 *
 * The arg_private_network and arg_network_veth checks come first;
 * NOTE(review): their branch bodies (presumably early returns), the call
 * that fills in mac and the return statements are not visible here. */
1517 static int setup_veth(pid_t pid, char iface_name[IFNAMSIZ]) {
1518 _cleanup_rtnl_message_unref_ sd_rtnl_message *m = NULL;
1519 _cleanup_rtnl_unref_ sd_rtnl *rtnl = NULL;
1520 struct ether_addr mac;
1523 if (!arg_private_network)
1526 if (!arg_network_veth)
1529 /* Use two different interface name prefixes depending whether
1530 * we are in bridge mode or not. */
1531 if (arg_network_bridge)
1532 memcpy(iface_name, "vb-", 3);
1534 memcpy(iface_name, "ve-", 3);
/* NOTE(review): strncpy() leaves the buffer without a terminating NUL when
 * arg_machine has IFNAMSIZ - 3 or more characters; no explicit termination
 * is visible here -- confirm the name is terminated or the length bounded. */
1535 strncpy(iface_name+3, arg_machine, IFNAMSIZ - 3);
1539 log_error("Failed to generate predictable MAC address for host0");
1543 r = sd_rtnl_open(&rtnl, 0);
1545 log_error("Failed to connect to netlink: %s", strerror(-r));
1549 r = sd_rtnl_message_new_link(rtnl, &m, RTM_NEWLINK, 0);
1551 log_error("Failed to allocate netlink message: %s", strerror(-r));
1555 r = sd_rtnl_message_append_string(m, IFLA_IFNAME, iface_name);
1557 log_error("Failed to add netlink interface name: %s", strerror(-r));
/* Three nested containers are opened (IFLA_LINKINFO, IFLA_INFO_DATA for
 * "veth", VETH_INFO_PEER); the peer attributes go into the innermost one
 * and three matching close calls follow. */
1561 r = sd_rtnl_message_open_container(m, IFLA_LINKINFO);
1563 log_error("Failed to open netlink container: %s", strerror(-r));
1567 r = sd_rtnl_message_open_container_union(m, IFLA_INFO_DATA, "veth");
1569 log_error("Failed to open netlink container: %s", strerror(-r));
1573 r = sd_rtnl_message_open_container(m, VETH_INFO_PEER);
1575 log_error("Failed to open netlink container: %s", strerror(-r));
1579 r = sd_rtnl_message_append_string(m, IFLA_IFNAME, "host0");
1581 log_error("Failed to add netlink interface name: %s", strerror(-r));
1585 r = sd_rtnl_message_append_ether_addr(m, IFLA_ADDRESS, &mac);
1587 log_error("Failed to add netlink MAC address: %s", strerror(-r));
1591 r = sd_rtnl_message_append_u32(m, IFLA_NET_NS_PID, pid);
1593 log_error("Failed to add netlink namespace field: %s", strerror(-r));
1597 r = sd_rtnl_message_close_container(m);
1599 log_error("Failed to close netlink container: %s", strerror(-r));
1603 r = sd_rtnl_message_close_container(m);
1605 log_error("Failed to close netlink container: %s", strerror(-r));
1609 r = sd_rtnl_message_close_container(m);
1611 log_error("Failed to close netlink container: %s", strerror(-r));
1615 r = sd_rtnl_call(rtnl, m, 0, NULL);
1617 log_error("Failed to add new veth interfaces: %s", strerror(-r));
/* Sends an RTM_SETLINK request for the interface veth_name that sets its
 * IFF_UP flag and makes the bridge arg_network_bridge its master.
 *
 * veth_name: name of the host-side veth interface.
 *
 * The arg_private_network, arg_network_veth and arg_network_bridge checks
 * come first; NOTE(review): their branch bodies (presumably early returns)
 * and the return statements are not visible here. */
1624 static int setup_bridge(const char veth_name[]) {
1625 _cleanup_rtnl_message_unref_ sd_rtnl_message *m = NULL;
1626 _cleanup_rtnl_unref_ sd_rtnl *rtnl = NULL;
1629 if (!arg_private_network)
1632 if (!arg_network_veth)
1635 if (!arg_network_bridge)
/* Resolve the bridge name to its interface index. */
1638 bridge = (int) if_nametoindex(arg_network_bridge);
1640 log_error("Failed to resolve interface %s: %m", arg_network_bridge);
1644 r = sd_rtnl_open(&rtnl, 0);
1646 log_error("Failed to connect to netlink: %s", strerror(-r));
1650 r = sd_rtnl_message_new_link(rtnl, &m, RTM_SETLINK, 0);
1652 log_error("Failed to allocate netlink message: %s", strerror(-r));
1656 r = sd_rtnl_message_link_set_flags(m, IFF_UP, IFF_UP);
1658 log_error("Failed to set IFF_UP flag: %s", strerror(-r));
1662 r = sd_rtnl_message_append_string(m, IFLA_IFNAME, veth_name);
1664 log_error("Failed to add netlink interface name field: %s", strerror(-r));
1668 r = sd_rtnl_message_append_u32(m, IFLA_MASTER, bridge);
1670 log_error("Failed to add netlink master field: %s", strerror(-r));
1674 r = sd_rtnl_call(rtnl, m, 0, NULL);
1676 log_error("Failed to add veth interface to bridge: %s", strerror(-r));
/* Resolves the network interface `name` to its interface index and checks it
 * against udev: the index is formatted as the device id "n<index>", the
 * matching udev device is looked up, and an error is logged unless
 * udev_device_get_is_initialized() reports a positive value.
 * Callers in this file store the result in a variable named `ifi` and pass it
 * on as an interface index. */
1683 static int parse_interface(struct udev *udev, const char *name) {
1684 _cleanup_udev_device_unref_ struct udev_device *d = NULL;
/* Sized for the "n" prefix, the decimal digits of an int and the terminator. */
1685 char ifi_str[2 + DECIMAL_STR_MAX(int)];
1688 ifi = (int) if_nametoindex(name);
1690 log_error("Failed to resolve interface %s: %m", name);
1694 sprintf(ifi_str, "n%i", ifi);
1695 d = udev_device_new_from_device_id(udev, ifi_str);
1697 log_error("Failed to get udev device for interface %s: %m", name);
1701 if (udev_device_get_is_initialized(d) <= 0) {
1702 log_error("Network interface %s is not initialized yet.", name);
/* Moves every interface listed in arg_network_interfaces into the network
 * namespace of process `pid`. For each name the interface index is obtained
 * through parse_interface() and an RTM_NEWLINK request for that index with
 * IFLA_NET_NS_PID set to `pid` is sent. arg_private_network and an empty
 * interface list are tested before any connection is opened. */
1709 static int move_network_interfaces(pid_t pid) {
1710 _cleanup_udev_unref_ struct udev *udev = NULL;
1711 _cleanup_rtnl_unref_ sd_rtnl *rtnl = NULL;
1715 if (!arg_private_network)
1718 if (strv_isempty(arg_network_interfaces))
1721 r = sd_rtnl_open(&rtnl, 0);
1723 log_error("Failed to connect to netlink: %s", strerror(-r));
1729 log_error("Failed to connect to udev.");
1733 STRV_FOREACH(i, arg_network_interfaces) {
/* One message per interface; the cleanup attribute is scoped to the loop body. */
1734 _cleanup_rtnl_message_unref_ sd_rtnl_message *m = NULL;
1737 ifi = parse_interface(udev, *i);
1741 r = sd_rtnl_message_new_link(rtnl, &m, RTM_NEWLINK, ifi);
1743 log_error("Failed to allocate netlink message: %s", strerror(-r));
1747 r = sd_rtnl_message_append_u32(m, IFLA_NET_NS_PID, pid);
1749 log_error("Failed to append namespace PID to netlink message: %s", strerror(-r));
1753 r = sd_rtnl_call(rtnl, m, 0, NULL);
1755 log_error("Failed to move interface %s to namespace: %s", *i, strerror(-r));
/* Creates one macvlan link per entry of arg_network_macvlan and places it in
 * the network namespace of process `pid`. Each request is an RTM_NEWLINK
 * message that names the parent link (IFLA_LINK), the new interface name
 * ("mv-" + parent name, shortened to fit IFNAMSIZ-1 characters), the target
 * namespace (IFLA_NET_NS_PID) and, inside IFLA_LINKINFO / IFLA_INFO_DATA
 * "macvlan", the mode MACVLAN_MODE_BRIDGE. */
1763 static int setup_macvlan(pid_t pid) {
1764 _cleanup_udev_unref_ struct udev *udev = NULL;
1765 _cleanup_rtnl_unref_ sd_rtnl *rtnl = NULL;
1769 if (!arg_private_network)
1772 if (strv_isempty(arg_network_macvlan))
1775 r = sd_rtnl_open(&rtnl, 0);
1777 log_error("Failed to connect to netlink: %s", strerror(-r));
1783 log_error("Failed to connect to udev.");
1787 STRV_FOREACH(i, arg_network_macvlan) {
1788 _cleanup_rtnl_message_unref_ sd_rtnl_message *m = NULL;
1789 _cleanup_free_ char *n = NULL;
1792 ifi = parse_interface(udev, *i);
/* Index 0: a new link is being created; the parent is passed via IFLA_LINK. */
1796 r = sd_rtnl_message_new_link(rtnl, &m, RTM_NEWLINK, 0);
1798 log_error("Failed to allocate netlink message: %s", strerror(-r));
1802 r = sd_rtnl_message_append_u32(m, IFLA_LINK, ifi);
1804 log_error("Failed to add netlink interface index: %s", strerror(-r));
1808 n = strappend("mv-", *i);
/* Keep the generated name within the interface name length limit. */
1812 strshorten(n, IFNAMSIZ-1);
1814 r = sd_rtnl_message_append_string(m, IFLA_IFNAME, n);
1816 log_error("Failed to add netlink interface name: %s", strerror(-r));
1820 r = sd_rtnl_message_append_u32(m, IFLA_NET_NS_PID, pid);
1822 log_error("Failed to add netlink namespace field: %s", strerror(-r));
/* Two nested containers are opened here and closed again below, in reverse order. */
1826 r = sd_rtnl_message_open_container(m, IFLA_LINKINFO);
1828 log_error("Failed to open netlink container: %s", strerror(-r));
1832 r = sd_rtnl_message_open_container_union(m, IFLA_INFO_DATA, "macvlan");
1834 log_error("Failed to open netlink container: %s", strerror(-r));
1838 r = sd_rtnl_message_append_u32(m, IFLA_MACVLAN_MODE, MACVLAN_MODE_BRIDGE);
1840 log_error("Failed to append macvlan mode: %s", strerror(-r));
1844 r = sd_rtnl_message_close_container(m);
1846 log_error("Failed to close netlink container: %s", strerror(-r));
1850 r = sd_rtnl_message_close_container(m);
1852 log_error("Failed to close netlink container: %s", strerror(-r));
1856 r = sd_rtnl_call(rtnl, m, 0, NULL);
1858 log_error("Failed to add new macvlan interfaces: %s", strerror(-r));
/* Installs a seccomp filter whose default action is SCMP_ACT_ALLOW and which
 * adds one rule returning EAFNOSUPPORT when argument 0 equals AF_NETLINK and
 * argument 2 equals NETLINK_AUDIT. The filter context is released again after
 * it has been loaded. */
1866 static int audit_still_doesnt_work_in_containers(void) {
1869 scmp_filter_ctx seccomp;
1873 Audit is broken in containers, much of the userspace audit
1874 hookup will fail if running inside a container. We don't
1875 care and just turn off creation of audit sockets.
1877 This will make socket(AF_NETLINK, *, NETLINK_AUDIT) fail
1878 with EAFNOSUPPORT which audit userspace uses as indication
1879 that audit is disabled in the kernel.
1882 seccomp = seccomp_init(SCMP_ACT_ALLOW);
1886 r = seccomp_add_secondary_archs(seccomp);
1888 log_error("Failed to add secondary archs to seccomp filter: %s", strerror(-r));
1892 r = seccomp_rule_add(
1894 SCMP_ACT_ERRNO(EAFNOSUPPORT),
1897 SCMP_A0(SCMP_CMP_EQ, AF_NETLINK),
1898 SCMP_A2(SCMP_CMP_EQ, NETLINK_AUDIT));
1900 log_error("Failed to add audit seccomp rule: %s", strerror(-r));
/* Set the filter's NNP attribute to 0 so that loading it does not turn on NO_NEW_PRIVS. */
1904 r = seccomp_attr_set(seccomp, SCMP_FLTATR_CTL_NNP, 0);
1906 log_error("Failed to unset NO_NEW_PRIVS: %s", strerror(-r));
1910 r = seccomp_load(seccomp);
1912 log_error("Failed to install seccomp audit filter: %s", strerror(-r));
1915 seccomp_release(seccomp);
/* Opens arg_image (read-only when arg_read_only is set) and prepares a block
 * device for it. A block device is used as-is: its path is duplicated with
 * strdup(). A regular file is attached to a free loop device obtained from
 * /dev/loop-control; the loop device is configured with LO_FLAGS_AUTOCLEAR and
 * LO_FLAGS_PARTSCAN (plus LO_FLAGS_READ_ONLY, which is OR-ed in below), and
 * its path is handed back through *device_path. Anything else is rejected.
 * main() stores the return value as image_fd.
 * NOTE(review): loop_nr is presumably set to the allocated loop number;
 * confirm against the full function. */
1923 static int setup_image(char **device_path, int *loop_nr) {
1924 struct loop_info64 info = {
1925 .lo_flags = LO_FLAGS_AUTOCLEAR|LO_FLAGS_PARTSCAN
1927 _cleanup_close_ int fd = -1, control = -1, loop = -1;
1928 _cleanup_free_ char* loopdev = NULL;
1932 assert(device_path);
1935 fd = open(arg_image, O_CLOEXEC|(arg_read_only ? O_RDONLY : O_RDWR)|O_NONBLOCK|O_NOCTTY);
1937 log_error("Failed to open %s: %m", arg_image);
1941 if (fstat(fd, &st) < 0) {
1942 log_error("Failed to stat %s: %m", arg_image);
1946 if (S_ISBLK(st.st_mode)) {
1949 p = strdup(arg_image);
1963 if (!S_ISREG(st.st_mode)) {
/* No call has failed at this point (fstat() succeeded), so errno carries no
 * meaning here and must not be formatted with %m. */
1964 log_error("%s is not a regular file or block device.", arg_image);
1968 control = open("/dev/loop-control", O_RDWR|O_CLOEXEC|O_NOCTTY|O_NONBLOCK);
1970 log_error("Failed to open /dev/loop-control: %m");
/* Ask the loop control device for the number of a free loop device. */
1974 nr = ioctl(control, LOOP_CTL_GET_FREE);
1976 log_error("Failed to allocate loop device: %m");
1980 if (asprintf(&loopdev, "/dev/loop%i", nr) < 0)
1983 loop = open(loopdev, O_CLOEXEC|(arg_read_only ? O_RDONLY : O_RDWR)|O_NONBLOCK|O_NOCTTY);
1985 log_error("Failed to open loop device %s: %m", loopdev);
/* Bind the image file to the loop device, then apply the flags from `info`. */
1989 if (ioctl(loop, LOOP_SET_FD, fd) < 0) {
1990 log_error("Failed to set loopback file descriptor on %s: %m", loopdev);
1995 info.lo_flags |= LO_FLAGS_READ_ONLY;
1997 if (ioctl(loop, LOOP_SET_STATUS64, &info) < 0) {
1998 log_error("Failed to set loopback settings on %s: %m", loopdev);
2002 *device_path = loopdev;
/* Finds the root, home and srv partitions of the image behind `fd`.
 * The image is probed with libblkid and must carry a partition table whose
 * PTTYPE is "gpt". The child devices of the block device are then enumerated
 * through udev, matched to blkid partitions by device number, and classified
 * by GPT partition type id: GPT_HOME, GPT_SRV and, where those macros are
 * defined, GPT_ROOT_NATIVE and GPT_ROOT_SECONDARY. Device node paths are
 * handed back through *root_device, *home_device and *srv_device, and the
 * matching *_rw flags are derived from GPT_FLAG_READ_ONLY. A native root
 * partition is preferred; the secondary root is only used when no native one
 * was found. */
2013 static int dissect_image(
2015 char **root_device, bool *root_device_rw,
2016 char **home_device, bool *home_device_rw,
2017 char **srv_device, bool *srv_device_rw,
/* Partition numbers of the candidates found so far; -1 means none yet. */
2021 int home_nr = -1, root_nr = -1, secondary_root_nr = -1, srv_nr = -1;
2022 _cleanup_free_ char *home = NULL, *root = NULL, *secondary_root = NULL, *srv = NULL;
2023 _cleanup_udev_enumerate_unref_ struct udev_enumerate *e = NULL;
2024 _cleanup_udev_device_unref_ struct udev_device *d = NULL;
2025 _cleanup_blkid_free_probe_ blkid_probe b = NULL;
2026 _cleanup_udev_unref_ struct udev *udev = NULL;
2027 struct udev_list_entry *first, *item;
2028 bool home_rw = true, root_rw = true, secondary_root_rw = true, srv_rw = true;
2029 const char *pttype = NULL;
2035 assert(root_device);
2036 assert(home_device);
2040 b = blkid_new_probe();
2045 r = blkid_probe_set_device(b, fd, 0, 0);
2050 log_error("Failed to set device on blkid probe: %m");
2054 blkid_probe_enable_partitions(b, 1);
2055 blkid_probe_set_partitions_flags(b, BLKID_PARTS_ENTRY_DETAILS);
/* -2 and 1 are reported as "no partition table identified"; any other
 * non-zero result is reported as a probe failure. */
2058 r = blkid_do_safeprobe(b);
2059 if (r == -2 || r == 1) {
2060 log_error("Failed to identify any partition table on %s.\n"
2061 "Note that the disk image needs to follow http://www.freedesktop.org/wiki/Specifications/DiscoverablePartitionsSpec/ to be supported by systemd-nspawn.", arg_image);
2063 } else if (r != 0) {
2066 log_error("Failed to probe: %m");
2070 blkid_probe_lookup_value(b, "PTTYPE", &pttype, NULL);
2071 if (!streq_ptr(pttype, "gpt")) {
2072 log_error("Image %s does not carry a GUID Partition Table.\n"
2073 "Note that the disk image needs to follow http://www.freedesktop.org/wiki/Specifications/DiscoverablePartitionsSpec/ to be supported by systemd-nspawn.", arg_image);
2078 pl = blkid_probe_get_partitions(b);
2083 log_error("Failed to list partitions of %s", arg_image);
2091 if (fstat(fd, &st) < 0) {
2092 log_error("Failed to stat block device: %m");
/* Look up the whole-disk device by device number and enumerate its children. */
2096 d = udev_device_new_from_devnum(udev, 'b', st.st_rdev);
2100 e = udev_enumerate_new(udev);
2104 r = udev_enumerate_add_match_parent(e, d);
2108 r = udev_enumerate_scan_devices(e);
2110 log_error("Failed to scan for partition devices of %s: %s", arg_image, strerror(-r));
2114 first = udev_enumerate_get_list_entry(e);
2115 udev_list_entry_foreach(item, first) {
2116 _cleanup_udev_device_unref_ struct udev_device *q;
2117 const char *stype, *node;
2118 unsigned long long flags;
2125 q = udev_device_new_from_syspath(udev, udev_list_entry_get_name(item));
2130 log_error("Failed to get partition device of %s: %m", arg_image);
2134 qn = udev_device_get_devnum(q);
/* The enumeration also yields the whole-disk device itself; it is singled out here. */
2138 if (st.st_rdev == qn)
2141 node = udev_device_get_devnode(q);
2145 pp = blkid_partlist_devno_to_partition(pl, qn);
2149 flags = blkid_partition_get_flags(pp);
2150 if (flags & GPT_FLAG_NO_AUTO)
2153 nr = blkid_partition_get_partno(pp);
2157 stype = blkid_partition_get_type_string(pp);
2161 if (sd_id128_from_string(stype, &type_id) < 0)
/* For each partition type the candidate's partition number is compared with
 * the one already recorded.
 * NOTE(review): presumably the lowest-numbered partition of each type wins;
 * confirm against the full function. */
2164 if (sd_id128_equal(type_id, GPT_HOME)) {
2166 if (home && nr >= home_nr)
2170 home_rw = !(flags & GPT_FLAG_READ_ONLY);
2173 home = strdup(node);
2176 } else if (sd_id128_equal(type_id, GPT_SRV)) {
2178 if (srv && nr >= srv_nr)
2182 srv_rw = !(flags & GPT_FLAG_READ_ONLY);
2189 #ifdef GPT_ROOT_NATIVE
2190 else if (sd_id128_equal(type_id, GPT_ROOT_NATIVE)) {
2192 if (root && nr >= root_nr)
2196 root_rw = !(flags & GPT_FLAG_READ_ONLY);
2199 root = strdup(node);
2204 #ifdef GPT_ROOT_SECONDARY
2205 else if (sd_id128_equal(type_id, GPT_ROOT_SECONDARY)) {
2207 if (secondary_root && nr >= secondary_root_nr)
2210 secondary_root_nr = nr;
2211 secondary_root_rw = !(flags & GPT_FLAG_READ_ONLY);
2214 free(secondary_root);
2215 secondary_root = strdup(node);
2216 if (!secondary_root)
2222 if (!root && !secondary_root) {
2223 log_error("Failed to identify root partition in disk image %s.\n"
2224 "Note that the disk image needs to follow http://www.freedesktop.org/wiki/Specifications/DiscoverablePartitionsSpec/ to be supported by systemd-nspawn.", arg_image);
/* Hand the results to the caller; the secondary root is cleared locally after
 * being handed over so that its cleanup handler does not free it. */
2229 *root_device = root;
2232 *root_device_rw = root_rw;
2234 } else if (secondary_root) {
2235 *root_device = secondary_root;
2236 secondary_root = NULL;
2238 *root_device_rw = secondary_root_rw;
2243 *home_device = home;
2246 *home_device_rw = home_rw;
2253 *srv_device_rw = srv_rw;
/* NOTE(review): presumably the alternative build path without libblkid; confirm. */
2258 log_error("--image= is not supported, compiled without blkid support.");
/* Mounts the block device `what` on the path formed by appending `directory`
 * to `where`. The file system type is detected with a libblkid superblock
 * probe (the "TYPE" value); "crypto_LUKS" is rejected. The mount uses
 * MS_NODEV and adds MS_RDONLY when `rw` is false. */
2263 static int mount_device(const char *what, const char *where, const char *directory, bool rw) {
2265 _cleanup_blkid_free_probe_ blkid_probe b = NULL;
2266 const char *fstype, *p;
2276 p = strappenda(where, directory);
2281 b = blkid_new_probe_from_filename(what);
2285 log_error("Failed to allocate prober for %s: %m", what);
2289 blkid_probe_enable_superblocks(b, 1);
2290 blkid_probe_set_superblocks_flags(b, BLKID_SUBLKS_TYPE);
/* blkid_do_safeprobe() returns 0 on success, 1 when nothing was detected,
 * -2 on an ambivalent result and -1 on error. Only 1 and -2 mean "type cannot
 * be determined"; -1 belongs to the probe-failure branch below. This matches
 * the classification used by dissect_image(). */
2293 r = blkid_do_safeprobe(b);
2294 if (r == -2 || r == 1) {
2295 log_error("Cannot determine file system type of %s", what);
2297 } else if (r != 0) {
2300 log_error("Failed to probe %s: %m", what);
2305 if (blkid_probe_lookup_value(b, "TYPE", &fstype, NULL) < 0) {
2308 log_error("Failed to determine file system type of %s", what);
2312 if (streq(fstype, "crypto_LUKS")) {
2313 log_error("nspawn currently does not support LUKS disk images.");
2317 if (mount(what, p, fstype, MS_NODEV|(rw ? 0 : MS_RDONLY), NULL) < 0) {
2318 log_error("Failed to mount %s: %m", what);
/* NOTE(review): presumably the alternative build path without libblkid; confirm. */
2324 log_error("--image= is not supported, compiled without blkid support.");
/* Mounts the dissected partitions below arg_directory through mount_device():
 * the root device on arg_directory itself, the home device on "/home" and the
 * srv device on "/srv", each with its own read-write flag. Every failure is
 * logged with a message naming the affected directory. */
2329 static int mount_devices(
2331 const char *root_device, bool root_device_rw,
2332 const char *home_device, bool home_device_rw,
2333 const char *srv_device, bool srv_device_rw) {
2339 r = mount_device(root_device, arg_directory, NULL, root_device_rw);
2341 log_error("Failed to mount root directory: %s", strerror(-r));
2347 r = mount_device(home_device, arg_directory, "/home", home_device_rw);
2349 log_error("Failed to mount home directory: %s", strerror(-r));
2355 r = mount_device(srv_device, arg_directory, "/srv", srv_device_rw);
2357 log_error("Failed to mount server data directory: %s", strerror(-r));
/* Tears down the loop device with number `nr`. If *image_fd holds an open
 * descriptor, LOOP_CLR_FD is issued on it and it is closed, with the result of
 * safe_close() stored back into *image_fd. The loop device is then removed
 * through /dev/loop-control with LOOP_CTL_REMOVE. The ioctl() results are not
 * checked. */
2365 static void loop_remove(int nr, int *image_fd) {
2366 _cleanup_close_ int control = -1;
2371 if (image_fd && *image_fd >= 0) {
2372 ioctl(*image_fd, LOOP_CLR_FD);
2373 *image_fd = safe_close(*image_fd);
2376 control = open("/dev/loop-control", O_RDWR|O_CLOEXEC|O_NOCTTY|O_NONBLOCK);
2380 ioctl(control, LOOP_CTL_REMOVE, nr);
/* Runs "getent <database> <key>" in a forked child with its standard output
 * connected to a pipe. In the child, the write end of the pipe becomes stdout,
 * /dev/null becomes stdin and stderr, signal handlers are reset, all other
 * descriptors are closed and getent is executed with an empty environment,
 * trying /usr/bin/getent first and /bin/getent second. The parent closes the
 * write end. Callers in this file treat the return value as a readable file
 * descriptor and wait on the pid they passed in via `rpid`. */
2383 static int spawn_getent(const char *database, const char *key, pid_t *rpid) {
2391 if (pipe2(pipe_fds, O_CLOEXEC) < 0) {
2392 log_error("Failed to allocate pipe: %m");
2398 log_error("Failed to fork getent child: %m");
2400 } else if (pid == 0) {
2402 char *empty_env = NULL;
2404 if (dup3(pipe_fds[1], STDOUT_FILENO, 0) < 0)
2405 _exit(EXIT_FAILURE);
/* Only close the pipe ends if they are not one of the standard descriptors. */
2407 if (pipe_fds[0] > 2)
2408 safe_close(pipe_fds[0]);
2409 if (pipe_fds[1] > 2)
2410 safe_close(pipe_fds[1]);
2412 nullfd = open("/dev/null", O_RDWR);
2414 _exit(EXIT_FAILURE);
2416 if (dup3(nullfd, STDIN_FILENO, 0) < 0)
2417 _exit(EXIT_FAILURE);
2419 if (dup3(nullfd, STDERR_FILENO, 0) < 0)
2420 _exit(EXIT_FAILURE);
2425 reset_all_signal_handlers();
2426 close_all_fds(NULL, 0);
/* The second execle() and the _exit() are only reached if the preceding exec failed. */
2428 execle("/usr/bin/getent", "getent", database, key, NULL, &empty_env);
2429 execle("/bin/getent", "getent", database, key, NULL, &empty_env);
2430 _exit(EXIT_FAILURE);
2433 pipe_fds[1] = safe_close(pipe_fds[1]);
/* Switches the process to the user named by arg_user.
 * If arg_user is unset, "root" or "0", the supplementary groups are cleared
 * and the real, effective and saved GID and UID are all reset to 0.
 * Otherwise the user is resolved with "getent passwd" and its group list with
 * "getent initgroups" (both via spawn_getent()); the passwd line is split on
 * ':' into UID, GID and home directory fields; the home directory is created
 * and the three standard descriptors are chown'ed to the user; finally the
 * groups, GID and UID are applied with setgroups(), setresgid() and
 * setresuid().
 * NOTE(review): `_home` is presumably set to the resolved home directory;
 * main() passes &home and later uses it for HOME= and chdir(). Confirm. */
2440 static int change_uid_gid(char **_home) {
2441 char line[LINE_MAX], *w, *x, *state, *u, *g, *h;
2442 _cleanup_free_ uid_t *uids = NULL;
2443 _cleanup_free_ char *home = NULL;
2444 _cleanup_fclose_ FILE *f = NULL;
2445 _cleanup_close_ int fd = -1;
2446 unsigned n_uids = 0;
2455 if (!arg_user || streq(arg_user, "root") || streq(arg_user, "0")) {
2456 /* Reset everything fully to 0, just in case */
2458 if (setgroups(0, NULL) < 0) {
2459 log_error("setgroups() failed: %m");
2463 if (setresgid(0, 0, 0) < 0) {
2464 log_error("setresgid() failed: %m");
2468 if (setresuid(0, 0, 0) < 0) {
2469 log_error("setresuid() failed: %m");
2477 /* First, get user credentials */
2478 fd = spawn_getent("passwd", arg_user, &pid);
2482 f = fdopen(fd, "r");
2487 if (!fgets(line, sizeof(line), f)) {
2490 log_error("Failed to resolve user %s.", arg_user);
2494 log_error("Failed to read from getent: %m");
2500 wait_for_terminate_and_warn("getent passwd", pid);
/* Walk the ':'-separated passwd fields one separator at a time. */
2502 x = strchr(line, ':');
2504 log_error("/etc/passwd entry has invalid user field.");
2508 u = strchr(x+1, ':');
2510 log_error("/etc/passwd entry has invalid password field.");
2517 log_error("/etc/passwd entry has invalid UID field.");
2525 log_error("/etc/passwd entry has invalid GID field.");
2530 h = strchr(x+1, ':');
2532 log_error("/etc/passwd entry has invalid GECOS field.");
2539 log_error("/etc/passwd entry has invalid home directory field.");
2545 r = parse_uid(u, &uid);
2547 log_error("Failed to parse UID of user.");
2551 r = parse_gid(g, &gid);
2553 log_error("Failed to parse GID of user.");
2561 /* Second, get group memberships */
2562 fd = spawn_getent("initgroups", arg_user, &pid);
2567 f = fdopen(fd, "r");
2572 if (!fgets(line, sizeof(line), f)) {
2574 log_error("Failed to resolve user %s.", arg_user);
2578 log_error("Failed to read from getent: %m");
2584 wait_for_terminate_and_warn("getent initgroups", pid);
2586 /* Skip over the username and subsequent separator whitespace */
2588 x += strcspn(x, WHITESPACE);
2589 x += strspn(x, WHITESPACE);
/* Each remaining word is parsed as a numeric id and appended to `uids`. */
2591 FOREACH_WORD(w, l, x, state) {
2597 if (!GREEDY_REALLOC(uids, sz, n_uids+1))
2600 r = parse_uid(c, &uids[n_uids++]);
2602 log_error("Failed to parse group data from getent.");
2607 r = mkdir_parents(home, 0775);
2609 log_error("Failed to make home root directory: %s", strerror(-r));
/* An already existing home directory is not treated as an error. */
2613 r = mkdir_safe(home, 0755, uid, gid);
2614 if (r < 0 && r != -EEXIST) {
2615 log_error("Failed to make home directory: %s", strerror(-r));
/* Best effort: the results of these fchown() calls are not checked. */
2619 fchown(STDIN_FILENO, uid, gid);
2620 fchown(STDOUT_FILENO, uid, gid);
2621 fchown(STDERR_FILENO, uid, gid);
/* Groups first, then GID, then UID. */
2623 if (setgroups(n_uids, uids) < 0) {
2624 log_error("Failed to set auxiliary groups: %m");
2628 if (setresgid(gid, gid, gid) < 0) {
2629 log_error("setresgid() failed: %m");
2633 if (setresuid(uid, uid, uid) < 0) {
2634 log_error("setresuid() failed: %m");
2647 * Return 0 in case the container is being rebooted, has been shut
2648 * down or exited successfully. On failures a negative value is
2651 * The status of the container "CONTAINER_TERMINATED" or
2652 * "CONTAINER_REBOOTED" will be saved in the container argument
/* Waits for process `pid` to terminate and classifies the outcome by
 * status.si_code and status.si_status. A successful exit is logged at debug
 * level and recorded as CONTAINER_TERMINATED; termination by SIGINT is logged
 * as a shutdown (CONTAINER_TERMINATED) and by SIGHUP as a reboot
 * (CONTAINER_REBOOTED). A failing exit code, any other signal and an unknown
 * reason are each logged as errors. The classification is written to
 * *container. */
2654 static int wait_for_container(pid_t pid, ContainerStatus *container) {
2658 r = wait_for_terminate(pid, &status);
2662 switch (status.si_code) {
2664 r = status.si_status;
2667 log_debug("Container %s exited successfully.",
2670 *container = CONTAINER_TERMINATED;
2672 log_error("Container %s failed with error code %i.",
2673 arg_machine, status.si_status);
2679 if (status.si_status == SIGINT) {
2681 log_info("Container %s has been shut down.",
2684 *container = CONTAINER_TERMINATED;
2687 } else if (status.si_status == SIGHUP) {
2689 log_info("Container %s is being rebooted.",
2692 *container = CONTAINER_REBOOTED;
2696 /* CLD_KILLED fallthrough */
2699 log_error("Container %s terminated by signal %s.",
2700 arg_machine, signal_to_string(status.si_status));
2705 log_error("Container %s failed due to unknown reason.",
/* Signal handler that does nothing. main() installs it for SIGCHLD so that the
 * signal interrupts the parent's blocking calls instead of being ignored. */
2714 static void nop_handler(int sig) {}
/* Entry point of the container spawner.
 * Visible stages, in order:
 *  - parse the command line, derive arg_directory and arg_machine, and refuse
 *    to run unless root on a systemd-booted system;
 *  - collect passed-in file descriptors;
 *  - either validate arg_directory as an OS tree or, for an image, create a
 *    temporary root directory, set up the image and dissect its partitions;
 *  - allocate a pseudo tty, optionally create a kdbus domain and create the
 *    kmsg socket pair;
 *  - clone a child into new namespaces; the child builds the container file
 *    system, reports readiness over an eventfd, switches root, drops
 *    privileges, builds the environment and executes the payload;
 *  - the parent registers the machine, sets up networking, notifies the child,
 *    forwards the pty and finally waits for the container, looping again when
 *    the container requested a reboot;
 *  - release the loop device and the argument strings. */
2716 int main(int argc, char *argv[]) {
2718 _cleanup_free_ char *kdbus_domain = NULL, *device_path = NULL, *root_device = NULL, *home_device = NULL, *srv_device = NULL;
2719 bool root_device_rw = true, home_device_rw = true, srv_device_rw = true;
2720 _cleanup_close_ int master = -1, kdbus_fd = -1, image_fd = -1;
2721 _cleanup_close_pair_ int kmsg_socket_pair[2] = { -1, -1 };
2722 _cleanup_fdset_free_ FDSet *fds = NULL;
2723 int r = EXIT_FAILURE, k, n_fd_passed, loop_nr = -1;
2724 const char *console = NULL;
2725 char veth_name[IFNAMSIZ];
2726 bool secondary = false;
2727 sigset_t mask, mask_chld;
2730 log_parse_environment();
2733 k = parse_argv(argc, argv);
2742 if (arg_directory) {
2745 p = path_make_absolute_cwd(arg_directory);
2746 free(arg_directory);
2749 arg_directory = get_current_dir_name();
2751 if (!arg_directory) {
2752 log_error("Failed to determine path, please use -D.");
2755 path_kill_slashes(arg_directory);
/* Default machine name: the last path component of the image or directory. */
2759 arg_machine = strdup(basename(arg_image ? arg_image : arg_directory));
2765 hostname_cleanup(arg_machine, false);
2766 if (isempty(arg_machine)) {
2767 log_error("Failed to determine machine name automatically, please use -M.");
2772 if (geteuid() != 0) {
2773 log_error("Need to be root.");
2777 if (sd_booted() <= 0) {
2778 log_error("Not running on a systemd system.");
2783 n_fd_passed = sd_listen_fds(false);
2784 if (n_fd_passed > 0) {
2785 k = fdset_new_listen_fds(&fds, false);
2787 log_error("Failed to collect file descriptors: %s", strerror(-k));
2791 fdset_close_others(fds);
2794 if (arg_directory) {
2795 if (path_equal(arg_directory, "/")) {
2796 log_error("Spawning container on root directory not supported.");
2801 if (path_is_os_tree(arg_directory) <= 0) {
2802 log_error("Directory %s doesn't look like an OS root directory (os-release file is missing). Refusing.", arg_directory);
2808 p = strappenda(arg_directory,
2809 argc > optind && path_is_absolute(argv[optind]) ? argv[optind] : "/usr/bin/");
2810 if (access(p, F_OK) < 0) {
2811 log_error("Directory %s lacks the binary to execute or doesn't look like a binary tree. Refusing.", arg_directory);
2817 char template[] = "/tmp/nspawn-root-XXXXXX";
2819 if (!mkdtemp(template)) {
2820 log_error("Failed to create temporary directory: %m");
2825 arg_directory = strdup(template);
2826 if (!arg_directory) {
2831 image_fd = setup_image(&device_path, &loop_nr);
2837 r = dissect_image(image_fd, &root_device, &root_device_rw, &home_device, &home_device_rw, &srv_device, &srv_device_rw, &secondary);
2842 master = posix_openpt(O_RDWR|O_NOCTTY|O_CLOEXEC|O_NDELAY);
2844 log_error("Failed to acquire pseudo tty: %m");
2848 console = ptsname(master);
2850 log_error("Failed to determine tty name: %m");
2855 log_info("Spawning container %s on %s.\nPress ^] three times within 1s to kill container.",
2856 arg_machine, arg_image ? arg_image : arg_directory);
2858 if (unlockpt(master) < 0) {
2859 log_error("Failed to unlock tty: %m");
2863 if (access("/dev/kdbus/control", F_OK) >= 0) {
2865 if (arg_share_system) {
2866 kdbus_domain = strdup("/dev/kdbus");
2867 if (!kdbus_domain) {
2874 ns = strappenda("machine-", arg_machine);
2875 kdbus_fd = bus_kernel_create_domain(ns, &kdbus_domain);
/* The result of bus_kernel_create_domain() lives in kdbus_fd, not in r, so
 * that is the value whose error code has to be reported.
 * NOTE(review): the condition guarding this message must test kdbus_fd as well. */
2877 log_debug("Failed to create kdbus domain: %s", strerror(-kdbus_fd));
2879 log_debug("Successfully created kdbus domain as %s", kdbus_domain);
2883 if (socketpair(AF_UNIX, SOCK_DGRAM|SOCK_NONBLOCK|SOCK_CLOEXEC, 0, kmsg_socket_pair) < 0) {
2884 log_error("Failed to create kmsg socket pair: %m");
2888 sd_notify(0, "READY=1");
/* `mask` holds the signals blocked in the parent; `mask_chld` holds SIGCHLD
 * alone so that it can be unblocked and re-blocked on its own. */
2890 assert_se(sigemptyset(&mask) == 0);
2891 assert_se(sigemptyset(&mask_chld) == 0);
2892 sigaddset(&mask_chld, SIGCHLD);
2893 sigset_add_many(&mask, SIGCHLD, SIGWINCH, SIGTERM, SIGINT, -1);
2894 assert_se(sigprocmask(SIG_BLOCK, &mask, NULL) == 0);
2897 ContainerStatus container_status;
2898 int eventfds[2] = { -1, -1 };
2899 struct sigaction sa = {
2900 .sa_handler = nop_handler,
2901 .sa_flags = SA_NOCLDSTOP,
2904 /* Child can be killed before execv(), so handle SIGCHLD
2905 * in order to interrupt parent's blocking calls and
2906 * give it a chance to call wait() and terminate. */
2907 r = sigprocmask(SIG_UNBLOCK, &mask_chld, NULL);
2909 log_error("Failed to change the signal mask: %m");
2913 r = sigaction(SIGCHLD, &sa, NULL);
2915 log_error("Failed to install SIGCHLD handler: %m");
2919 pid = clone_with_eventfd(SIGCHLD|CLONE_NEWNS|
2920 (arg_share_system ? 0 : CLONE_NEWIPC|CLONE_NEWPID|CLONE_NEWUTS)|
2921 (arg_private_network ? CLONE_NEWNET : 0), eventfds);
2923 if (errno == EINVAL)
2924 log_error("clone() failed, do you have namespace support enabled in your kernel? (You need UTS, IPC, PID and NET namespacing built in): %m");
2926 log_error("clone() failed: %m");
2934 _cleanup_free_ char *home = NULL;
2936 const char *envp[] = {
2937 "PATH=" DEFAULT_PATH_SPLIT_USR,
2938 "container=systemd-nspawn", /* LXC sets container=lxc, so follow the scheme here */
2943 NULL, /* container_uuid */
2944 NULL, /* LISTEN_FDS */
2945 NULL, /* LISTEN_PID */
2950 envp[n_env] = strv_find_prefix(environ, "TERM=");
2954 master = safe_close(master);
/* Drop the inherited standard descriptors; the console is opened in their place below. */
2956 close_nointr(STDIN_FILENO);
2957 close_nointr(STDOUT_FILENO);
2958 close_nointr(STDERR_FILENO);
2960 kmsg_socket_pair[0] = safe_close(kmsg_socket_pair[0]);
2962 reset_all_signal_handlers();
2964 assert_se(sigemptyset(&mask) == 0);
2965 assert_se(sigprocmask(SIG_SETMASK, &mask, NULL) == 0);
/* With descriptor 0 closed above, the console is expected to land on STDIN_FILENO. */
2967 k = open_terminal(console, O_RDWR);
2968 if (k != STDIN_FILENO) {
2974 log_error("Failed to open console: %s", strerror(-k));
2978 if (dup2(STDIN_FILENO, STDOUT_FILENO) != STDOUT_FILENO ||
2979 dup2(STDIN_FILENO, STDERR_FILENO) != STDERR_FILENO) {
2980 log_error("Failed to duplicate console: %m");
2985 log_error("setsid() failed: %m");
2989 if (reset_audit_loginuid() < 0)
2992 if (prctl(PR_SET_PDEATHSIG, SIGKILL) < 0) {
2993 log_error("PR_SET_PDEATHSIG failed: %m");
2997 /* Mark everything as slave, so that we still
2998 * receive mounts from the real root, but don't
2999 * propagate mounts to the real root. */
3000 if (mount(NULL, "/", NULL, MS_SLAVE|MS_REC, NULL) < 0) {
3001 log_error("MS_SLAVE|MS_REC failed: %m");
3005 if (mount_devices(arg_directory,
3006 root_device, root_device_rw,
3007 home_device, home_device_rw,
3008 srv_device, srv_device_rw) < 0)
3011 /* Turn directory into bind mount */
3012 if (mount(arg_directory, arg_directory, "bind", MS_BIND|MS_REC, NULL) < 0) {
3013 log_error("Failed to make bind mount: %m");
3017 if (arg_read_only) {
3018 k = bind_remount_recursive(arg_directory, true);
3020 log_error("Failed to make tree read-only: %s", strerror(-k));
3025 if (mount_all(arg_directory) < 0)
3028 if (copy_devnodes(arg_directory) < 0)
3031 if (setup_ptmx(arg_directory) < 0)
3034 dev_setup(arg_directory);
3036 if (audit_still_doesnt_work_in_containers() < 0)
3039 if (setup_dev_console(arg_directory, console) < 0)
3042 if (setup_kmsg(arg_directory, kmsg_socket_pair[1]) < 0)
3045 kmsg_socket_pair[1] = safe_close(kmsg_socket_pair[1]);
3047 if (setup_boot_id(arg_directory) < 0)
3050 if (setup_timezone(arg_directory) < 0)
3053 if (setup_resolv_conf(arg_directory) < 0)
3056 if (setup_journal(arg_directory) < 0)
3059 if (mount_binds(arg_directory, arg_bind, false) < 0)
3062 if (mount_binds(arg_directory, arg_bind_ro, true) < 0)
3065 if (mount_tmpfs(arg_directory) < 0)
3068 if (setup_kdbus(arg_directory, kdbus_domain) < 0)
3071 /* Tell the parent that we are ready, and that
3072 * it can cgroupify us so that we lack access
3073 * to certain devices and resources. */
3074 r = eventfd_send_state(eventfds[1],
3075 EVENTFD_CHILD_SUCCEEDED);
3076 eventfds[1] = safe_close(eventfds[1]);
/* Switch root: move the prepared tree onto "/", chroot into it and reset the cwd. */
3080 if (chdir(arg_directory) < 0) {
3081 log_error("chdir(%s) failed: %m", arg_directory);
3085 if (mount(arg_directory, "/", NULL, MS_MOVE, NULL) < 0) {
3086 log_error("mount(MS_MOVE) failed: %m");
3090 if (chroot(".") < 0) {
3091 log_error("chroot() failed: %m");
3095 if (chdir("/") < 0) {
3096 log_error("chdir() failed: %m");
3102 if (arg_private_network)
3105 if (drop_capabilities() < 0) {
3106 log_error("drop_capabilities() failed: %m");
3110 r = change_uid_gid(&home);
/* Fill the free envp slots; n_env advances past each slot as it is used. */
3114 if ((asprintf((char**)(envp + n_env++), "HOME=%s", home ? home: "/root") < 0) ||
3115 (asprintf((char**)(envp + n_env++), "USER=%s", arg_user ? arg_user : "root") < 0) ||
3116 (asprintf((char**)(envp + n_env++), "LOGNAME=%s", arg_user ? arg_user : "root") < 0)) {
3121 if (!sd_id128_equal(arg_uuid, SD_ID128_NULL)) {
3124 if (asprintf((char**)(envp + n_env++), "container_uuid=%s", id128_format_as_uuid(arg_uuid, as_uuid)) < 0) {
/* Passed-in descriptors must survive exec, so their close-on-exec flag is cleared. */
3130 if (fdset_size(fds) > 0) {
3131 k = fdset_cloexec(fds, false);
3133 log_error("Failed to unset O_CLOEXEC for file descriptors.");
3137 if ((asprintf((char **)(envp + n_env++), "LISTEN_FDS=%u", n_fd_passed) < 0) ||
3138 (asprintf((char **)(envp + n_env++), "LISTEN_PID=1") < 0)) {
/* An explicitly requested personality wins; otherwise PER_LINUX32 is used when
 * dissect_image() set `secondary`. */
3146 if (arg_personality != 0xffffffffLU) {
3147 if (personality(arg_personality) < 0) {
3148 log_error("personality() failed: %m");
3151 } else if (secondary) {
3152 if (personality(PER_LINUX32) < 0) {
3153 log_error("personality() failed: %m");
3159 if (arg_selinux_context)
3160 if (setexeccon((security_context_t) arg_selinux_context) < 0) {
3161 log_error("setexeccon(\"%s\") failed: %m", arg_selinux_context);
3166 if (!strv_isempty(arg_setenv)) {
3169 n = strv_env_merge(2, envp, arg_setenv);
3177 env_use = (char**) envp;
3179 /* Wait until the parent is ready with the setup, too... */
3180 r = eventfd_parent_succeeded(eventfds[0]);
3181 eventfds[0] = safe_close(eventfds[0]);
3189 /* Automatically search for the init system */
3191 l = 1 + argc - optind;
3192 a = newa(char*, l + 1);
3193 memcpy(a + 1, argv + optind, l * sizeof(char*));
/* Each execve() only returns on failure, so the candidates are tried in order. */
3195 a[0] = (char*) "/usr/lib/systemd/systemd";
3196 execve(a[0], a, env_use);
3198 a[0] = (char*) "/lib/systemd/systemd";
3199 execve(a[0], a, env_use);
3201 a[0] = (char*) "/sbin/init";
3202 execve(a[0], a, env_use);
3203 } else if (argc > optind)
3204 execvpe(argv[optind], argv + optind, env_use);
3206 chdir(home ? home : "/root");
3207 execle("/bin/bash", "-bash", NULL, env_use);
3208 execle("/bin/sh", "-sh", NULL, env_use);
3211 log_error("execv() failed: %m");
3214 /* Tell the parent that the setup failed, so he
3215 * can clean up resources and terminate. */
3216 if (eventfds[1] != -1)
3217 eventfd_send_state(eventfds[1],
3218 EVENTFD_CHILD_FAILED);
3219 _exit(EXIT_FAILURE);
3225 /* Wait for the child event:
3226 * If EVENTFD_CHILD_FAILED, the child will terminate soon.
3227 * If EVENTFD_CHILD_SUCCEEDED, the child is reporting that
3228 * it is ready with all it needs to do with privileges.
3229 * After we got the notification we can make the process
3230 * join its cgroup which might limit what it can do */
3231 r = eventfd_child_succeeded(eventfds[1]);
3232 eventfds[1] = safe_close(eventfds[1]);
3234 goto check_container_status;
3236 r = register_machine(pid);
3240 r = move_network_interfaces(pid);
3244 r = setup_veth(pid, veth_name);
3248 r = setup_bridge(veth_name);
3252 r = setup_macvlan(pid);
3256 /* Block SIGCHLD here, before notifying child.
3257 * process_pty() will handle it with the other signals. */
3258 r = sigprocmask(SIG_BLOCK, &mask_chld, NULL);
3262 /* Reset signal to default */
3263 r = default_signals(SIGCHLD, -1);
3267 /* Notify the child that the parent is ready with all
3268 * its setup, and that the child can now hand over
3269 * control to the code to run inside the container. */
3270 r = eventfd_send_state(eventfds[0],
3271 EVENTFD_PARENT_SUCCEEDED);
3272 eventfds[0] = safe_close(eventfds[0]);
3276 k = process_pty(master, &mask, arg_boot ? pid : 0, SIGRTMIN+3);
3285 /* Kill if it is not dead yet anyway */
3286 terminate_machine(pid);
3288 check_container_status:
3289 /* Redundant, but better safe than sorry */
3292 r = wait_for_container(pid, &container_status);
3298 } else if (container_status == CONTAINER_TERMINATED)
3301 /* CONTAINER_REBOOTED, loop again */
3305 loop_remove(loop_nr, &image_fd);
3310 free(arg_directory);
3313 strv_free(arg_setenv);
3314 strv_free(arg_network_interfaces);
3315 strv_free(arg_network_macvlan);
3316 strv_free(arg_bind);
3317 strv_free(arg_bind_ro);
3318 strv_free(arg_tmpfs);