1 /*-*- Mode: C; c-basic-offset: 8; indent-tabs-mode: nil -*-*/
4 This file is part of systemd.
6 Copyright 2010 Lennart Poettering
8 systemd is free software; you can redistribute it and/or modify it
9 under the terms of the GNU Lesser General Public License as published by
10 the Free Software Foundation; either version 2.1 of the License, or
11 (at your option) any later version.
13 systemd is distributed in the hope that it will be useful, but
14 WITHOUT ANY WARRANTY; without even the implied warranty of
15 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
16 Lesser General Public License for more details.
18 You should have received a copy of the GNU Lesser General Public License
19 along with systemd; If not, see <http://www.gnu.org/licenses/>.
25 #include <sys/types.h>
26 #include <sys/syscall.h>
27 #include <sys/mount.h>
33 #include <sys/prctl.h>
34 #include <sys/capability.h>
37 #include <sys/signalfd.h>
41 #include <sys/socket.h>
42 #include <linux/netlink.h>
43 #include <sys/eventfd.h>
45 #include <linux/veth.h>
46 #include <sys/personality.h>
47 #include <linux/loop.h>
50 #include <selinux/selinux.h>
58 #include <blkid/blkid.h>
61 #include "sd-daemon.h"
71 #include "cgroup-util.h"
73 #include "path-util.h"
74 #include "loopback-setup.h"
75 #include "dev-setup.h"
80 #include "bus-error.h"
82 #include "bus-kernel.h"
85 #include "rtnl-util.h"
86 #include "udev-util.h"
87 #include "eventfd-util.h"
88 #include "blkid-util.h"
90 #include "siphash24.h"
92 #include "base-filesystem.h"
95 #include "seccomp-util.h"
98 typedef enum ContainerStatus {
103 typedef enum LinkJournal {
/* Settings selected on the command line. The initializers below are the
 * defaults that stay in effect unless parse_argv() assigns something else. */
110 static char *arg_directory = NULL;
111 static char *arg_user = NULL;
112 static sd_id128_t arg_uuid = {};
113 static char *arg_machine = NULL;
114 static const char *arg_selinux_context = NULL;
115 static const char *arg_selinux_apifs_context = NULL;
116 static const char *arg_slice = NULL;
117 static bool arg_private_network = false;
118 static bool arg_read_only = false;
119 static bool arg_boot = false;
120 static LinkJournal arg_link_journal = LINK_AUTO;
/* Bit mask, one bit per CAP_* number, of the capabilities to keep.
 * parse_argv() ORs in the --capability= entries, adds CAP_NET_ADMIN when
 * arg_private_network is set, and masks out the --drop-capability= entries.
 * drop_capabilities() hands the complement of this mask to
 * capability_bounding_set_drop(). */
121 static uint64_t arg_retain =
122 (1ULL << CAP_CHOWN) |
123 (1ULL << CAP_DAC_OVERRIDE) |
124 (1ULL << CAP_DAC_READ_SEARCH) |
125 (1ULL << CAP_FOWNER) |
126 (1ULL << CAP_FSETID) |
127 (1ULL << CAP_IPC_OWNER) |
129 (1ULL << CAP_LEASE) |
130 (1ULL << CAP_LINUX_IMMUTABLE) |
131 (1ULL << CAP_NET_BIND_SERVICE) |
132 (1ULL << CAP_NET_BROADCAST) |
133 (1ULL << CAP_NET_RAW) |
134 (1ULL << CAP_SETGID) |
135 (1ULL << CAP_SETFCAP) |
136 (1ULL << CAP_SETPCAP) |
137 (1ULL << CAP_SETUID) |
138 (1ULL << CAP_SYS_ADMIN) |
139 (1ULL << CAP_SYS_CHROOT) |
140 (1ULL << CAP_SYS_NICE) |
141 (1ULL << CAP_SYS_PTRACE) |
142 (1ULL << CAP_SYS_TTY_CONFIG) |
143 (1ULL << CAP_SYS_RESOURCE) |
144 (1ULL << CAP_SYS_BOOT) |
145 (1ULL << CAP_AUDIT_WRITE) |
146 (1ULL << CAP_AUDIT_CONTROL) |
/* arg_bind and arg_bind_ro hold consecutive (source, destination) pairs and
 * arg_tmpfs holds consecutive (path, mount options) pairs; parse_argv()
 * appends two entries per option and the mount helpers walk them pairwise. */
148 static char **arg_bind = NULL;
149 static char **arg_bind_ro = NULL;
150 static char **arg_tmpfs = NULL;
151 static char **arg_setenv = NULL;
152 static bool arg_quiet = false;
153 static bool arg_share_system = false;
154 static bool arg_register = true;
155 static bool arg_keep_unit = false;
156 static char **arg_network_interfaces = NULL;
157 static char **arg_network_macvlan = NULL;
158 static bool arg_network_veth = false;
159 static const char *arg_network_bridge = NULL;
/* 0xffffffffLU is also the value parse_argv() rejects as "unknown or
 * unsupported" when personality_from_string() returns it. */
160 static unsigned long arg_personality = 0xffffffffLU;
161 static const char *arg_image = NULL;
/* Prints the usage summary to stdout. The single %s in the format string is
 * filled with program_invocation_short_name; every other line is literal
 * option documentation. */
163 static int help(void) {
165 printf("%s [OPTIONS...] [PATH] [ARGUMENTS...]\n\n"
166 "Spawn a minimal namespace container for debugging, testing and building.\n\n"
167 " -h --help Show this help\n"
168 " --version Print version string\n"
169 " -q --quiet Do not show status information\n"
170 " -D --directory=PATH Root directory for the container\n"
171 " -i --image=PATH File system device or image for the container\n"
172 " -b --boot Boot up full system (i.e. invoke init)\n"
173 " -u --user=USER Run the command under specified user or uid\n"
174 " -M --machine=NAME Set the machine name for the container\n"
175 " --uuid=UUID Set a specific machine UUID for the container\n"
176 " -S --slice=SLICE Place the container in the specified slice\n"
177 " --private-network Disable network in container\n"
178 " --network-interface=INTERFACE\n"
179 " Assign an existing network interface to the\n"
181 " --network-macvlan=INTERFACE\n"
182 " Create a macvlan network interface based on an\n"
183 " existing network interface to the container\n"
184 " --network-veth Add a virtual ethernet connection between host\n"
186 " --network-bridge=INTERFACE\n"
187 " Add a virtual ethernet connection between host\n"
188 " and container and add it to an existing bridge on\n"
190 " -Z --selinux-context=SECLABEL\n"
191 " Set the SELinux security context to be used by\n"
192 " processes in the container\n"
193 " -L --selinux-apifs-context=SECLABEL\n"
194 " Set the SELinux security context to be used by\n"
195 " API/tmpfs file systems in the container\n"
196 " --capability=CAP In addition to the default, retain specified\n"
198 " --drop-capability=CAP Drop the specified capability from the default set\n"
199 " --link-journal=MODE Link up guest journal, one of no, auto, guest, host\n"
200 " -j Equivalent to --link-journal=host\n"
201 " --read-only Mount the root directory read-only\n"
202 " --bind=PATH[:PATH] Bind mount a file or directory from the host into\n"
204 " --bind-ro=PATH[:PATH] Similar, but creates a read-only bind mount\n"
205 " --tmpfs=PATH:[OPTIONS] Mount an empty tmpfs to the specified directory\n"
206 " --setenv=NAME=VALUE Pass an environment variable to PID 1\n"
207 " --share-system Share system namespaces with host\n"
208 " --register=BOOLEAN Register container as machine\n"
209 " --keep-unit Do not register a scope for the machine, reuse\n"
210 " the service unit nspawn is running in\n",
211 program_invocation_short_name);
/* Parses the command line into the file-scope arg_* settings.
 *
 * Options are read with getopt_long() against the options[] table; the
 * short-option string starts with '+', which in glibc stops option parsing
 * at the first non-option argument. --capability= and --drop-capability=
 * accumulate into the local masks "plus" and "minus", which are folded into
 * arg_retain once the loop is done. After the loop the function rejects
 * option combinations that cannot work together.
 *
 * NOTE(review): the return statements are not visible in this excerpt;
 * presumably a negative errno-style value is returned on error -- confirm. */
216 static int parse_argv(int argc, char *argv[]) {
233 ARG_NETWORK_INTERFACE,
240 static const struct option options[] = {
241 { "help", no_argument, NULL, 'h' },
242 { "version", no_argument, NULL, ARG_VERSION },
243 { "directory", required_argument, NULL, 'D' },
244 { "user", required_argument, NULL, 'u' },
245 { "private-network", no_argument, NULL, ARG_PRIVATE_NETWORK },
246 { "boot", no_argument, NULL, 'b' },
247 { "uuid", required_argument, NULL, ARG_UUID },
248 { "read-only", no_argument, NULL, ARG_READ_ONLY },
249 { "capability", required_argument, NULL, ARG_CAPABILITY },
250 { "drop-capability", required_argument, NULL, ARG_DROP_CAPABILITY },
251 { "link-journal", required_argument, NULL, ARG_LINK_JOURNAL },
252 { "bind", required_argument, NULL, ARG_BIND },
253 { "bind-ro", required_argument, NULL, ARG_BIND_RO },
254 { "tmpfs", required_argument, NULL, ARG_TMPFS },
255 { "machine", required_argument, NULL, 'M' },
256 { "slice", required_argument, NULL, 'S' },
257 { "setenv", required_argument, NULL, ARG_SETENV },
258 { "selinux-context", required_argument, NULL, 'Z' },
259 { "selinux-apifs-context", required_argument, NULL, 'L' },
260 { "quiet", no_argument, NULL, 'q' },
261 { "share-system", no_argument, NULL, ARG_SHARE_SYSTEM },
262 { "register", required_argument, NULL, ARG_REGISTER },
263 { "keep-unit", no_argument, NULL, ARG_KEEP_UNIT },
264 { "network-interface", required_argument, NULL, ARG_NETWORK_INTERFACE },
265 { "network-macvlan", required_argument, NULL, ARG_NETWORK_MACVLAN },
266 { "network-veth", no_argument, NULL, ARG_NETWORK_VETH },
267 { "network-bridge", required_argument, NULL, ARG_NETWORK_BRIDGE },
268 { "personality", required_argument, NULL, ARG_PERSONALITY },
269 { "image", required_argument, NULL, 'i' },
/* Capabilities to add to / remove from the default arg_retain mask. */
274 uint64_t plus = 0, minus = 0;
279 while ((c = getopt_long(argc, argv, "+hD:u:bL:M:jS:Z:qi:", options, NULL)) >= 0) {
287 puts(PACKAGE_STRING);
288 puts(SYSTEMD_FEATURES);
/* The root directory is stored in canonical form, so it must already exist. */
293 arg_directory = canonicalize_file_name(optarg);
294 if (!arg_directory) {
295 log_error("Invalid root directory: %m");
307 arg_user = strdup(optarg);
313 case ARG_NETWORK_BRIDGE:
314 arg_network_bridge = optarg;
/* Requesting a veth link also turns on private networking. */
318 case ARG_NETWORK_VETH:
319 arg_network_veth = true;
320 arg_private_network = true;
323 case ARG_NETWORK_INTERFACE:
324 if (strv_extend(&arg_network_interfaces, optarg) < 0)
327 arg_private_network = true;
330 case ARG_NETWORK_MACVLAN:
331 if (strv_extend(&arg_network_macvlan, optarg) < 0)
336 case ARG_PRIVATE_NETWORK:
337 arg_private_network = true;
345 r = sd_id128_from_string(optarg, &arg_uuid);
347 log_error("Invalid UUID: %s", optarg);
357 if (isempty(optarg)) {
362 if (!hostname_is_valid(optarg)) {
363 log_error("Invalid machine name: %s", optarg);
368 arg_machine = strdup(optarg);
376 arg_selinux_context = optarg;
380 arg_selinux_apifs_context = optarg;
384 arg_read_only = true;
388 case ARG_DROP_CAPABILITY: {
/* The argument is a comma-separated list; each word is either "all" or a
 * capability name understood by cap_from_name(). */
392 FOREACH_WORD_SEPARATOR(word, length, optarg, ",", state) {
393 _cleanup_free_ char *t;
396 t = strndup(word, length);
400 if (streq(t, "all")) {
401 if (c == ARG_CAPABILITY)
402 plus = (uint64_t) -1;
404 minus = (uint64_t) -1;
406 if (cap_from_name(t, &cap) < 0) {
407 log_error("Failed to parse capability %s.", t);
411 if (c == ARG_CAPABILITY)
412 plus |= 1ULL << (uint64_t) cap;
414 minus |= 1ULL << (uint64_t) cap;
/* NOTE(review): the case label for this assignment is not visible in this
 * excerpt; presumably it is the -j handler. help() documents -j as
 * "Equivalent to --link-journal=host", yet LINK_GUEST is selected here --
 * confirm which of the two is intended. */
422 arg_link_journal = LINK_GUEST;
425 case ARG_LINK_JOURNAL:
426 if (streq(optarg, "auto"))
427 arg_link_journal = LINK_AUTO;
428 else if (streq(optarg, "no"))
429 arg_link_journal = LINK_NO;
430 else if (streq(optarg, "guest"))
431 arg_link_journal = LINK_GUEST;
432 else if (streq(optarg, "host"))
433 arg_link_journal = LINK_HOST;
435 log_error("Failed to parse link journal mode %s", optarg);
/* --bind= / --bind-ro=: "a" is the part before ':' (the source) and "b" the
 * destination; both must be absolute and are appended to the list as a pair. */
443 _cleanup_free_ char *a = NULL, *b = NULL;
447 x = c == ARG_BIND ? &arg_bind : &arg_bind_ro;
449 e = strchr(optarg, ':');
451 a = strndup(optarg, e - optarg);
461 if (!path_is_absolute(a) || !path_is_absolute(b)) {
462 log_error("Invalid bind mount specification: %s", optarg);
466 r = strv_extend(x, a);
470 r = strv_extend(x, b);
/* --tmpfs=: "a" is the mount path and "b" the option string, for which
 * "mode=0755" is the visible fallback value. */
478 _cleanup_free_ char *a = NULL, *b = NULL;
481 e = strchr(optarg, ':');
483 a = strndup(optarg, e - optarg);
487 b = strdup("mode=0755");
493 if (!path_is_absolute(a)) {
494 log_error("Invalid tmpfs specification: %s", optarg);
498 r = strv_push(&arg_tmpfs, a);
504 r = strv_push(&arg_tmpfs, b);
516 if (!env_assignment_is_valid(optarg)) {
517 log_error("Environment variable assignment '%s' is not valid.", optarg);
521 n = strv_env_set(arg_setenv, optarg);
525 strv_free(arg_setenv);
534 case ARG_SHARE_SYSTEM:
535 arg_share_system = true;
539 r = parse_boolean(optarg);
541 log_error("Failed to parse --register= argument: %s", optarg);
549 arg_keep_unit = true;
552 case ARG_PERSONALITY:
554 arg_personality = personality_from_string(optarg);
555 if (arg_personality == 0xffffffffLU) {
556 log_error("Unknown or unsupported personality '%s'.", optarg);
566 assert_not_reached("Unhandled option");
/* Sharing the system namespaces with the host switches registration off. */
570 if (arg_share_system)
571 arg_register = false;
573 if (arg_boot && arg_share_system) {
574 log_error("--boot and --share-system may not be combined.");
578 if (arg_keep_unit && cg_pid_get_owner_uid(0, NULL) >= 0) {
579 log_error("--keep-unit may not be used when invoked from a user session.");
583 if (arg_directory && arg_image) {
584 log_error("--directory= and --image= may not be combined.");
/* Final retained set: defaults plus additions (and CAP_NET_ADMIN when the
 * network is private), with every dropped capability masked out last. */
588 arg_retain = (arg_retain | plus | (arg_private_network ? 1ULL << CAP_NET_ADMIN : 0)) & ~minus;
/* Mounts the API file systems listed in mount_table underneath dest.
 *
 * For every table row the target path is dest joined with the row's "where".
 * Rows that name a source ("what") are handled differently when the target
 * is already a mount point (presumably skipped; the statement is not visible
 * here). The target directory is created with mkdir_p(). When
 * arg_selinux_apifs_context is set, tmpfs and devpts rows get a
 * context="..." mount option appended. The trailing true/false column is
 * the row's "fatal" flag, consulted together with the mount() result. */
593 static int mount_all(const char *dest) {
595 typedef struct MountPoint {
604 static const MountPoint mount_table[] = {
605 { "proc", "/proc", "proc", NULL, MS_NOSUID|MS_NOEXEC|MS_NODEV, true },
606 { "/proc/sys", "/proc/sys", NULL, NULL, MS_BIND, true }, /* Bind mount first */
607 { NULL, "/proc/sys", NULL, NULL, MS_BIND|MS_RDONLY|MS_REMOUNT, true }, /* Then, make it r/o */
608 { "sysfs", "/sys", "sysfs", NULL, MS_RDONLY|MS_NOSUID|MS_NOEXEC|MS_NODEV, true },
609 { "tmpfs", "/dev", "tmpfs", "mode=755", MS_NOSUID|MS_STRICTATIME, true },
610 { "devpts", "/dev/pts", "devpts","newinstance,ptmxmode=0666,mode=620,gid=" STRINGIFY(TTY_GID), MS_NOSUID|MS_NOEXEC, true },
611 { "tmpfs", "/dev/shm", "tmpfs", "mode=1777", MS_NOSUID|MS_NODEV|MS_STRICTATIME, true },
612 { "tmpfs", "/run", "tmpfs", "mode=755", MS_NOSUID|MS_NODEV|MS_STRICTATIME, true },
614 { "/sys/fs/selinux", "/sys/fs/selinux", NULL, NULL, MS_BIND, false }, /* Bind mount first */
615 { NULL, "/sys/fs/selinux", NULL, NULL, MS_BIND|MS_RDONLY|MS_REMOUNT, false }, /* Then, make it r/o */
622 for (k = 0; k < ELEMENTSOF(mount_table); k++) {
623 _cleanup_free_ char *where = NULL;
625 _cleanup_free_ char *options = NULL;
630 where = strjoin(dest, "/", mount_table[k].where, NULL);
634 t = path_is_mount_point(where, true);
636 log_error("Failed to detect whether %s is a mount point: %s", where, strerror(-t));
644 /* Skip this entry if it is not a remount. */
645 if (mount_table[k].what && t > 0)
648 mkdir_p(where, 0755);
651 if (arg_selinux_apifs_context &&
652 (streq_ptr(mount_table[k].what, "tmpfs") || streq_ptr(mount_table[k].what, "devpts"))) {
653 options = strjoin(mount_table[k].options, ",context=\"", arg_selinux_apifs_context, "\"", NULL);
660 o = mount_table[k].options;
663 if (mount(mount_table[k].what,
666 mount_table[k].flags,
668 mount_table[k].fatal) {
670 log_error("mount(%s) failed: %m", where);
/* Bind mounts every (source, destination) pair of l into the tree at dest.
 *
 * *x is the source path and dest + *y the mount point. If the mount point
 * already exists it must have the same file type as the source. If it does
 * not exist (ENOENT), its parent directories are created and a placeholder
 * matching the source type is made: directory, FIFO, socket or regular file.
 * Any other source type is refused. After the bind mount the tree is
 * remounted read-only with bind_remount_recursive(), presumably only when
 * ro is set (the guarding condition is not visible in this excerpt). */
680 static int mount_binds(const char *dest, char **l, bool ro) {
683 STRV_FOREACH_PAIR(x, y, l) {
684 _cleanup_free_ char *where = NULL;
685 struct stat source_st, dest_st;
688 if (stat(*x, &source_st) < 0) {
689 log_error("Failed to stat %s: %m", *x);
693 where = strappend(dest, *y);
697 r = stat(where, &dest_st);
699 if ((source_st.st_mode & S_IFMT) != (dest_st.st_mode & S_IFMT)) {
700 log_error("The file types of %s and %s do not match. Refusing bind mount", *x, where);
703 } else if (errno == ENOENT) {
704 r = mkdir_parents_label(where, 0755);
706 log_error("Failed to bind mount %s: %s", *x, strerror(-r));
710 log_error("Failed to bind mount %s: %m", *x);
714 /* Create the mount point, but be conservative -- refuse to create block
715 * and char devices. */
716 if (S_ISDIR(source_st.st_mode))
717 mkdir_label(where, 0755);
718 else if (S_ISFIFO(source_st.st_mode))
720 else if (S_ISSOCK(source_st.st_mode))
721 mknod(where, 0644 | S_IFSOCK, 0);
722 else if (S_ISREG(source_st.st_mode))
725 log_error("Refusing to create mountpoint for file: %s", *x);
729 if (mount(*x, where, "bind", MS_BIND, NULL) < 0) {
730 log_error("mount(%s) failed: %m", where);
735 r = bind_remount_recursive(where, true);
737 log_error("Read-Only bind mount failed: %s", strerror(-r));
/* Mounts a tmpfs for every (path, options) pair stored in arg_tmpfs.
 * The mount point is dest + *i; it is created with mkdir_label() (result not
 * checked) and mounted with MS_NODEV|MS_STRICTATIME, passing *o as the
 * file-system option string. */
746 static int mount_tmpfs(const char *dest) {
749 STRV_FOREACH_PAIR(i, o, arg_tmpfs) {
750 _cleanup_free_ char *where = NULL;
752 where = strappend(dest, *i);
756 mkdir_label(where, 0755);
758 if (mount("tmpfs", where, "tmpfs", MS_NODEV|MS_STRICTATIME, *o) < 0) {
759 log_error("tmpfs mount to %s failed: %m", where);
/* Makes the container's /etc/localtime point at the host's time zone.
 *
 * The host's /etc/localtime must be a symlink into /usr/share/zoneinfo/
 * (relative "../usr/share/zoneinfo/" or absolute form); otherwise a warning
 * is logged and nothing is changed. z is the zone name taken from the host
 * link and y the one taken from the container's existing link. If they
 * already match nothing is done. The zone file must exist inside the
 * container before the relative symlink "../usr/share/zoneinfo/<zone>" is
 * created at dest/etc/localtime. */
767 static int setup_timezone(const char *dest) {
768 _cleanup_free_ char *where = NULL, *p = NULL, *q = NULL, *check = NULL, *what = NULL;
774 /* Fix the timezone, if possible */
775 r = readlink_malloc("/etc/localtime", &p);
777 log_warning("/etc/localtime is not a symlink, not updating container timezone.");
781 z = path_startswith(p, "../usr/share/zoneinfo/");
783 z = path_startswith(p, "/usr/share/zoneinfo/");
785 log_warning("/etc/localtime does not point into /usr/share/zoneinfo/, not updating container timezone.");
789 where = strappend(dest, "/etc/localtime");
793 r = readlink_malloc(where, &q);
795 y = path_startswith(q, "../usr/share/zoneinfo/");
797 y = path_startswith(q, "/usr/share/zoneinfo/");
800 /* Already pointing to the right place? Then do nothing .. */
801 if (y && streq(y, z))
805 check = strjoin(dest, "/usr/share/zoneinfo/", z, NULL);
809 if (access(check, F_OK) < 0) {
810 log_warning("Timezone %s does not exist in container, not updating container timezone.", z);
814 what = strappend("../usr/share/zoneinfo/", z);
819 if (symlink(what, where) < 0) {
820 log_error("Failed to correct timezone of container: %m");
/* Copies the host's /etc/resolv.conf to dest/etc/resolv.conf.
 * arg_private_network is checked first; presumably the copy is skipped when
 * it is set (the statement under that check is not visible here). */
827 static int setup_resolv_conf(const char *dest) {
828 _cleanup_free_ char *where = NULL;
832 if (arg_private_network)
835 /* Fix resolv.conf, if possible */
836 where = strappend(dest, "/etc/resolv.conf");
840 /* Best effort only: the result of the copy is deliberately
841 * ignored, so a failure here is silently tolerated. */
842 copy_file("/etc/resolv.conf", where, O_TRUNC|O_NOFOLLOW, 0644);
/* Renders id in the hyphenated 8-4-4-4-12 hexadecimal layout given by the
 * format string below. The parameter declaration sizes s at 37 bytes, i.e.
 * the 36 characters of that layout plus the terminating NUL.
 * NOTE(review): the formatting call itself and the return statement are not
 * visible in this excerpt; presumably s is returned -- confirm. */
847 static char* id128_format_as_uuid(sd_id128_t id, char s[37]) {
850 "%02x%02x%02x%02x-%02x%02x-%02x%02x-%02x%02x-%02x%02x%02x%02x%02x%02x",
851 SD_ID128_FORMAT_VAL(id));
/* Gives the container its own boot ID.
 *
 * A random 128-bit ID is generated, formatted as a UUID string and written
 * to the scratch file dest/dev/proc-sys-kernel-random-boot-id. That file is
 * then bind mounted over dest/proc/sys/kernel/random/boot_id and remounted
 * read-only. A failed read-only remount only produces a warning.
 * arg_share_system is checked first; presumably the function does nothing
 * when it is set (the statement under the check is not visible here). */
856 static int setup_boot_id(const char *dest) {
857 _cleanup_free_ char *from = NULL, *to = NULL;
864 if (arg_share_system)
867 /* Generate a new randomized boot ID, so that each boot-up of
868 * the container gets a new one */
870 from = strappend(dest, "/dev/proc-sys-kernel-random-boot-id");
871 to = strappend(dest, "/proc/sys/kernel/random/boot_id");
875 r = sd_id128_randomize(&rnd);
877 log_error("Failed to generate random boot id: %s", strerror(-r));
881 id128_format_as_uuid(rnd, as_uuid);
883 r = write_string_file(from, as_uuid);
885 log_error("Failed to write boot id: %s", strerror(-r));
889 if (mount(from, to, "bind", MS_BIND, NULL) < 0) {
890 log_error("Failed to bind mount boot id: %m");
892 } else if (mount(from, to, "bind", MS_BIND|MS_REMOUNT|MS_RDONLY, NULL))
893 log_warning("Failed to make boot id read-only: %m");
/* Recreates a fixed list of host device nodes inside the container.
 *
 * For each name d in the NUL-separated devnodes list, /dev/<d> is stat()ed
 * on the host. A stat() failure is logged unless errno is ENOENT. A path
 * that is neither a character nor a block device is reported as an error.
 * Otherwise a node with the same mode and device number is created at
 * dest/dev/<d> with mknod(). */
899 static int copy_devnodes(const char *dest) {
901 static const char devnodes[] =
911 _cleanup_umask_ mode_t u;
917 NULSTR_FOREACH(d, devnodes) {
918 _cleanup_free_ char *from = NULL, *to = NULL;
921 from = strappend("/dev/", d);
922 to = strjoin(dest, "/dev/", d, NULL);
926 if (stat(from, &st) < 0) {
928 if (errno != ENOENT) {
929 log_error("Failed to stat %s: %m", from);
933 } else if (!S_ISCHR(st.st_mode) && !S_ISBLK(st.st_mode)) {
935 log_error("%s is not a char or block device, cannot copy", from);
938 } else if (mknod(to, st.st_mode, st.st_rdev) < 0) {
/* NOTE(review): the message prints dest (the container root) although the
 * path handed to mknod() is "to" -- probably meant to log "to"; confirm. */
940 log_error("mknod(%s) failed: %m", dest);
/* Creates dest/dev/ptmx as a relative symlink to "pts/ptmx", i.e. to the
 * ptmx node inside the /dev/pts directory next to it. */
948 static int setup_ptmx(const char *dest) {
949 _cleanup_free_ char *p = NULL;
951 p = strappend(dest, "/dev/ptmx");
955 if (symlink("pts/ptmx", p) < 0) {
956 log_error("Failed to create /dev/ptmx symlink: %m");
/* Makes the TTY named by console appear as dest/dev/console.
 *
 * The TTY is first set to mode 0600 with owner and group 0. A placeholder
 * node is then created at dest/dev/console, reusing the file type and device
 * number of the host's /dev/null with mode 0600, and console is bind mounted
 * on top of it. */
963 static int setup_dev_console(const char *dest, const char *console) {
964 _cleanup_umask_ mode_t u;
974 if (stat("/dev/null", &st) < 0) {
975 log_error("Failed to stat /dev/null: %m");
979 r = chmod_and_chown(console, 0600, 0, 0);
981 log_error("Failed to correct access mode for TTY: %s", strerror(-r));
985 /* We need to bind mount the right tty to /dev/console since
986 * ptys can only exist on pts file systems. To have something
987 * to bind mount things on we create a device node first, and
988 * use /dev/null for that since the cgroups device policy
989 * allows us to create that freely, while we cannot create
990 * /dev/console. (Note that the major minor doesn't actually
991 * matter here, since we mount it over anyway). */
993 to = strappenda(dest, "/dev/console");
994 if (mknod(to, (st.st_mode & ~07777) | 0600, st.st_rdev) < 0) {
995 log_error("mknod() for /dev/console failed: %m");
999 if (mount(console, to, "bind", MS_BIND, NULL) < 0) {
1000 log_error("Bind mount for /dev/console failed: %m");
/* Provides the container with a /proc/kmsg backed by a FIFO.
 *
 * A FIFO is created at dest/dev/kmsg (mode 0600, owner and group 0) and bind
 * mounted onto dest/proc/kmsg. The FIFO is then opened read/write and
 * non-blocking, and the resulting descriptor is sent over kmsg_socket as an
 * SCM_RIGHTS control message. kmsg_socket must be a valid descriptor
 * (asserted). */
1007 static int setup_kmsg(const char *dest, int kmsg_socket) {
1008 _cleanup_free_ char *from = NULL, *to = NULL;
1010 _cleanup_umask_ mode_t u;
1012 struct cmsghdr cmsghdr;
1013 uint8_t buf[CMSG_SPACE(sizeof(int))];
1015 struct msghdr mh = {
1016 .msg_control = &control,
1017 .msg_controllen = sizeof(control),
1019 struct cmsghdr *cmsg;
1022 assert(kmsg_socket >= 0);
1026 /* We create the kmsg FIFO as /dev/kmsg, but immediately
1027 * delete it after bind mounting it to /proc/kmsg. While FIFOs
1028 * on the reading side behave very similar to /proc/kmsg,
1029 * their writing side behaves differently from /dev/kmsg in
1030 * that writing blocks when nothing is reading. In order to
1031 * avoid any problems with containers deadlocking due to this
1032 * we simply make /dev/kmsg unavailable to the container. */
1033 if (asprintf(&from, "%s/dev/kmsg", dest) < 0 ||
1034 asprintf(&to, "%s/proc/kmsg", dest) < 0)
1037 if (mkfifo(from, 0600) < 0) {
1038 log_error("mkfifo() for /dev/kmsg failed: %m");
1042 r = chmod_and_chown(from, 0600, 0, 0);
1044 log_error("Failed to correct access mode for /dev/kmsg: %s", strerror(-r));
1048 if (mount(from, to, "bind", MS_BIND, NULL) < 0) {
1049 log_error("Bind mount for /proc/kmsg failed: %m");
1053 fd = open(from, O_RDWR|O_NDELAY|O_CLOEXEC);
1055 log_error("Failed to open fifo: %m");
/* Pack the FIFO descriptor into a single SCM_RIGHTS control message. */
1059 cmsg = CMSG_FIRSTHDR(&mh);
1060 cmsg->cmsg_level = SOL_SOCKET;
1061 cmsg->cmsg_type = SCM_RIGHTS;
1062 cmsg->cmsg_len = CMSG_LEN(sizeof(int));
1063 memcpy(CMSG_DATA(cmsg), &fd, sizeof(int));
1065 mh.msg_controllen = cmsg->cmsg_len;
1067 /* Store away the fd in the socket, so that it stays open as
1068 * long as we run the child */
1069 k = sendmsg(kmsg_socket, &mh, MSG_DONTWAIT|MSG_NOSIGNAL);
1073 log_error("Failed to send FIFO fd: %m");
1077 /* And now make the FIFO unavailable as /dev/kmsg... */
/* Sets the host name to arg_machine via sethostname().
 * arg_share_system is checked first; presumably nothing is changed when it
 * is set (the statement under the check is not visible here).
 * arg_machine must be non-NULL by the time this runs, since strlen() is
 * applied to it. */
1082 static int setup_hostname(void) {
1084 if (arg_share_system)
1087 if (sethostname(arg_machine, strlen(arg_machine)) < 0)
/* Connects the container's journal directory with the host's, according to
 * arg_link_journal.
 *
 * The container's machine ID is read from directory/etc/machine-id and must
 * parse as a 128-bit ID. If it equals the host's machine ID the journals are
 * not linked (warning in LINK_AUTO mode, error otherwise). From here on
 *     p = /var/log/journal/<id>             (on the host)
 *     q = directory/var/log/journal/<id>    (inside the container tree)
 * If either path is already a mount point, that is an error unless the mode
 * is LINK_AUTO. In LINK_GUEST mode p is made a symlink to q. The final
 * visible step bind mounts p onto q.
 *
 * NOTE(review): many guards and all return statements are missing from this
 * excerpt, so the exact conditions under which each step runs cannot be
 * confirmed from here. */
1093 static int setup_journal(const char *directory) {
1094 sd_id128_t machine_id, this_id;
1095 _cleanup_free_ char *p = NULL, *b = NULL, *q = NULL, *d = NULL;
1099 p = strappend(directory, "/etc/machine-id");
1103 r = read_one_line_file(p, &b);
1104 if (r == -ENOENT && arg_link_journal == LINK_AUTO)
1107 log_error("Failed to read machine ID from %s: %s", p, strerror(-r));
1112 if (isempty(id) && arg_link_journal == LINK_AUTO)
1115 /* Verify validity */
1116 r = sd_id128_from_string(id, &machine_id);
1118 log_error("Failed to parse machine ID from %s: %s", p, strerror(-r));
1122 r = sd_id128_get_machine(&this_id);
1124 log_error("Failed to retrieve machine ID: %s", strerror(-r));
1128 if (sd_id128_equal(machine_id, this_id)) {
1129 log_full(arg_link_journal == LINK_AUTO ? LOG_WARNING : LOG_ERR,
1130 "Host and machine ids are equal (%s): refusing to link journals", id);
1131 if (arg_link_journal == LINK_AUTO)
1137 if (arg_link_journal == LINK_NO)
1141 p = strappend("/var/log/journal/", id);
1142 q = strjoin(directory, "/var/log/journal/", id, NULL);
1146 if (path_is_mount_point(p, false) > 0) {
1147 if (arg_link_journal != LINK_AUTO) {
1148 log_error("%s: already a mount point, refusing to use for journal", p);
1155 if (path_is_mount_point(q, false) > 0) {
1156 if (arg_link_journal != LINK_AUTO) {
1157 log_error("%s: already a mount point, refusing to use for journal", q);
1164 r = readlink_and_make_absolute(p, &d);
1166 if ((arg_link_journal == LINK_GUEST ||
1167 arg_link_journal == LINK_AUTO) &&
1170 r = mkdir_p(q, 0755);
/* NOTE(review): the mkdir_p() result is kept in r, and other messages in
 * this file print such results with strerror(-r); %m prints errno instead,
 * which may not describe this failure -- confirm. The same pattern recurs
 * after the later mkdir_p() calls in this function. */
1172 log_warning("failed to create directory %s: %m", q);
1176 if (unlink(p) < 0) {
1177 log_error("Failed to remove symlink %s: %m", p);
1180 } else if (r == -EINVAL) {
1182 if (arg_link_journal == LINK_GUEST &&
1185 if (errno == ENOTDIR) {
1186 log_error("%s already exists and is neither a symlink nor a directory", p);
1189 log_error("Failed to remove %s: %m", p);
1193 } else if (r != -ENOENT) {
1194 log_error("readlink(%s) failed: %m", p);
1198 if (arg_link_journal == LINK_GUEST) {
1200 if (symlink(q, p) < 0) {
1201 log_error("Failed to symlink %s to %s: %m", q, p);
1205 r = mkdir_p(q, 0755);
1207 log_warning("failed to create directory %s: %m", q);
1211 if (arg_link_journal == LINK_HOST) {
1212 r = mkdir_p(p, 0755);
1214 log_error("Failed to create %s: %m", p);
1218 } else if (access(p, F_OK) < 0)
1221 if (dir_is_empty(q) == 0)
1222 log_warning("%s is not empty, proceeding anyway.", q);
1224 r = mkdir_p(q, 0755);
1226 log_error("Failed to create %s: %m", q);
1230 if (mount(p, q, "bind", MS_BIND, NULL) < 0) {
1231 log_error("Failed to bind mount journal from host into guest: %m");
/* Creates the directory dest/dev/kdbus (mode 0755) and bind mounts path, the
 * kdbus domain path named in the error message, onto it. */
1238 static int setup_kdbus(const char *dest, const char *path) {
1244 p = strappenda(dest, "/dev/kdbus");
1245 if (mkdir(p, 0755) < 0) {
1246 log_error("Failed to create kdbus path: %m");
1250 if (mount(path, p, "bind", MS_BIND, NULL) < 0) {
1251 log_error("Failed to mount kdbus domain path: %m");
/* Passes the complement of arg_retain -- every capability that is not to be
 * kept -- to capability_bounding_set_drop() and returns its result. */
1258 static int drop_capabilities(void) {
1259 return capability_bounding_set_drop(~arg_retain, false);
/* Registers the container with org.freedesktop.machine1 on the system bus.
 *
 * With arg_keep_unit set, a single sd_bus_call_method() call is made on the
 * machine1 Manager object. Otherwise a method-call message is assembled by
 * hand, which allows an array of (sv) unit properties to follow the fixed
 * arguments: "Slice" (only when arg_slice is non-empty), "DevicePolicy" set
 * to "strict", and a "DeviceAllow" list. The message is then sent with
 * sd_bus_call().
 *
 * NOTE(review): the method names and several arguments are missing from this
 * excerpt, and two of the comments inside the DeviceAllow call have lost
 * their closing lines here; the body is therefore left exactly as found. */
1262 static int register_machine(pid_t pid) {
1263 _cleanup_bus_error_free_ sd_bus_error error = SD_BUS_ERROR_NULL;
1264 _cleanup_bus_unref_ sd_bus *bus = NULL;
1270 r = sd_bus_default_system(&bus);
1272 log_error("Failed to open system bus: %s", strerror(-r));
1276 if (arg_keep_unit) {
1277 r = sd_bus_call_method(
1279 "org.freedesktop.machine1",
1280 "/org/freedesktop/machine1",
1281 "org.freedesktop.machine1.Manager",
1287 SD_BUS_MESSAGE_APPEND_ID128(arg_uuid),
1291 strempty(arg_directory));
1293 _cleanup_bus_message_unref_ sd_bus_message *m = NULL;
1295 r = sd_bus_message_new_method_call(
1298 "org.freedesktop.machine1",
1299 "/org/freedesktop/machine1",
1300 "org.freedesktop.machine1.Manager",
1303 log_error("Failed to create message: %s", strerror(-r));
1307 r = sd_bus_message_append(
1311 SD_BUS_MESSAGE_APPEND_ID128(arg_uuid),
1315 strempty(arg_directory));
1317 log_error("Failed to append message arguments: %s", strerror(-r));
1321 r = sd_bus_message_open_container(m, 'a', "(sv)");
1323 log_error("Failed to open container: %s", strerror(-r));
1327 if (!isempty(arg_slice)) {
1328 r = sd_bus_message_append(m, "(sv)", "Slice", "s", arg_slice);
1330 log_error("Failed to append slice: %s", strerror(-r));
1335 r = sd_bus_message_append(m, "(sv)", "DevicePolicy", "s", "strict");
1337 log_error("Failed to add device policy: %s", strerror(-r));
1341 r = sd_bus_message_append(m, "(sv)", "DeviceAllow", "a(ss)", 10,
1342 /* Allow the container to
1343 * access and create the API
1344 * device nodes, so that
1345 * PrivateDevices= in the
1346 * container can work
1351 "/dev/random", "rwm",
1352 "/dev/urandom", "rwm",
1354 /* Allow the container
1355 * access to ptys. However,
1357 * container to ever create
1358 * these device nodes. */
1359 "/dev/pts/ptmx", "rw",
1361 /* Allow the container
1362 * access to all kdbus
1363 * devices. Again, the
1364 * container cannot create
1365 * these nodes, only use
1366 * them. We use a pretty
1367 * open match here, so that
1368 * the kernel API can still
1371 "char-kdbus/*", "rw");
1373 log_error("Failed to add device whitelist: %s", strerror(-r));
1377 r = sd_bus_message_close_container(m);
1379 log_error("Failed to close container: %s", strerror(-r));
1383 r = sd_bus_call(bus, m, 0, &error, NULL);
1387 log_error("Failed to register machine: %s", bus_error_message(&error, r));
/* Asks org.freedesktop.machine1 to terminate the machine.
 *
 * A first call on the machine1 Manager object yields the machine's object
 * path (read from the reply as type "o"); a second call is then made on the
 * org.freedesktop.machine1.Machine interface. Failure of either call is only
 * logged at debug level. NOTE(review): the method names and call arguments
 * are not visible in this excerpt. */
1394 static int terminate_machine(pid_t pid) {
1395 _cleanup_bus_error_free_ sd_bus_error error = SD_BUS_ERROR_NULL;
1396 _cleanup_bus_message_unref_ sd_bus_message *reply = NULL;
1397 _cleanup_bus_unref_ sd_bus *bus = NULL;
1404 r = sd_bus_default_system(&bus);
1406 log_error("Failed to open system bus: %s", strerror(-r));
1410 r = sd_bus_call_method(
1412 "org.freedesktop.machine1",
1413 "/org/freedesktop/machine1",
1414 "org.freedesktop.machine1.Manager",
1421 /* Note that the machine might already have been
1422 * cleaned up automatically, hence don't consider it a
1423 * failure if we cannot get the machine object. */
1424 log_debug("Failed to get machine: %s", bus_error_message(&error, r));
1428 r = sd_bus_message_read(reply, "o", &path);
1430 return bus_log_parse_error(r);
1432 r = sd_bus_call_method(
1434 "org.freedesktop.machine1",
1436 "org.freedesktop.machine1.Machine",
1442 log_debug("Failed to terminate machine: %s", bus_error_message(&error, r));
/* Resets the audit login UID of this process to 4294967295.
 *
 * /proc/self/loginuid is read first; if it already holds that value nothing
 * is written. Otherwise the value is written back, and a failed write
 * produces the long explanatory message below. arg_share_system is checked
 * first; presumably nothing is done when it is set (the statement under the
 * check is not visible here). */
1449 static int reset_audit_loginuid(void) {
1450 _cleanup_free_ char *p = NULL;
1453 if (arg_share_system)
1456 r = read_one_line_file("/proc/self/loginuid", &p);
1460 log_error("Failed to read /proc/self/loginuid: %s", strerror(-r));
1464 /* Already reset? */
1465 if (streq(p, "4294967295"))
1468 r = write_string_file("/proc/self/loginuid", "4294967295");
/* NOTE(review): the message below reads "or to off auditing"; it probably
 * was meant to say "or to turn off auditing". It is a runtime string and is
 * therefore left unchanged here. */
1470 log_error("Failed to reset audit login UID. This probably means that your kernel is too\n"
1471 "old and you have audit enabled. Note that the auditing subsystem is known to\n"
1472 "be incompatible with containers on old kernels. Please make sure to upgrade\n"
1473 "your kernel or to off auditing with 'audit=0' on the kernel command line before\n"
1474 "using systemd-nspawn. Sleeping for 5s... (%s)\n", strerror(-r));
/* Fixed 128-bit key for the siphash24() call in get_mac(). */
1482 #define HASH_KEY SD_ID128_MAKE(c3,c4,f9,19,b5,57,b2,1c,e6,cf,14,27,03,9c,ee,a2)
/* Derives a MAC address for the container and stores it in *mac.
 *
 * The hash input v is the host's machine ID followed by the bytes of
 * arg_machine (without a terminating NUL). It is hashed with siphash24()
 * under HASH_KEY, and the first ETH_ALEN bytes of the result become the
 * address. The multicast bit is then cleared and the locally-assigned bit
 * set. The same host and machine name therefore always produce the same
 * address. arg_machine must be non-NULL, since strlen() is applied to it. */
1484 static int get_mac(struct ether_addr *mac) {
1491 l = strlen(arg_machine);
1492 sz = sizeof(sd_id128_t) + l;
1495 /* fetch some persistent data unique to the host */
1496 r = sd_id128_get_machine((sd_id128_t*) v);
1500 /* combine with some data unique (on this host) to this
1501 * container instance */
1502 memcpy(v + sizeof(sd_id128_t), arg_machine, l);
1504 /* Let's hash the host machine ID plus the container name. We
1505 * use a fixed, but originally randomly created hash key here. */
1506 siphash24(result, v, sz, HASH_KEY.bytes);
1508 assert_cc(ETH_ALEN <= sizeof(result));
1509 memcpy(mac->ether_addr_octet, result, ETH_ALEN);
1511 /* see eth_random_addr in the kernel */
1512 mac->ether_addr_octet[0] &= 0xfe; /* clear multicast bit */
1513 mac->ether_addr_octet[0] |= 0x02; /* set local assignment bit (IEEE802) */
/* Creates a veth pair linking the host with the container.
 *
 * The host-side name is written to iface_name: prefix "vb-" when
 * arg_network_bridge is set, "ve-" otherwise, followed by arg_machine. One
 * RTM_NEWLINK request is built containing the host-side name and, nested in
 * IFLA_LINKINFO / IFLA_INFO_DATA "veth" / VETH_INFO_PEER, the peer's name
 * "host0", its MAC address and the PID whose network namespace the peer is
 * placed in. arg_private_network and arg_network_veth are checked first;
 * presumably the function does nothing unless both are set (the statements
 * under those checks are not visible here). */
1518 static int setup_veth(pid_t pid, char iface_name[IFNAMSIZ]) {
1519 _cleanup_rtnl_message_unref_ sd_rtnl_message *m = NULL;
1520 _cleanup_rtnl_unref_ sd_rtnl *rtnl = NULL;
1521 struct ether_addr mac;
1524 if (!arg_private_network)
1527 if (!arg_network_veth)
1530 /* Use two different interface name prefixes depending whether
1531 * we are in bridge mode or not. */
1532 if (arg_network_bridge)
1533 memcpy(iface_name, "vb-", 3);
1535 memcpy(iface_name, "ve-", 3);
/* NOTE(review): strncpy() writes no terminating NUL when arg_machine is
 * IFNAMSIZ - 3 characters or longer, so iface_name can end up unterminated
 * unless the caller's buffer guarantees otherwise -- confirm. */
1536 strncpy(iface_name+3, arg_machine, IFNAMSIZ - 3);
1540 log_error("Failed to generate predictable MAC address for host0");
1544 r = sd_rtnl_open(&rtnl, 0);
1546 log_error("Failed to connect to netlink: %s", strerror(-r));
1550 r = sd_rtnl_message_new_link(rtnl, &m, RTM_NEWLINK, 0);
1552 log_error("Failed to allocate netlink message: %s", strerror(-r));
1556 r = sd_rtnl_message_append_string(m, IFLA_IFNAME, iface_name);
1558 log_error("Failed to add netlink interface name: %s", strerror(-r));
1562 r = sd_rtnl_message_open_container(m, IFLA_LINKINFO);
1564 log_error("Failed to open netlink container: %s", strerror(-r));
1568 r = sd_rtnl_message_open_container_union(m, IFLA_INFO_DATA, "veth");
1570 log_error("Failed to open netlink container: %s", strerror(-r));
1574 r = sd_rtnl_message_open_container(m, VETH_INFO_PEER);
1576 log_error("Failed to open netlink container: %s", strerror(-r));
1580 r = sd_rtnl_message_append_string(m, IFLA_IFNAME, "host0");
1582 log_error("Failed to add netlink interface name: %s", strerror(-r));
1586 r = sd_rtnl_message_append_ether_addr(m, IFLA_ADDRESS, &mac);
1588 log_error("Failed to add netlink MAC address: %s", strerror(-r));
1592 r = sd_rtnl_message_append_u32(m, IFLA_NET_NS_PID, pid);
1594 log_error("Failed to add netlink namespace field: %s", strerror(-r));
/* Three closes, one for each container opened above. */
1598 r = sd_rtnl_message_close_container(m);
1600 log_error("Failed to close netlink container: %s", strerror(-r));
1604 r = sd_rtnl_message_close_container(m);
1606 log_error("Failed to close netlink container: %s", strerror(-r));
1610 r = sd_rtnl_message_close_container(m);
1612 log_error("Failed to close netlink container: %s", strerror(-r));
1616 r = sd_rtnl_call(rtnl, m, 0, NULL);
1618 log_error("Failed to add new veth interfaces: %s", strerror(-r));
/* Attaches the host-side veth interface veth_name to the bridge named by
 * arg_network_bridge and brings it up.
 *
 * The bridge name is resolved to an interface index with if_nametoindex().
 * A single RTM_SETLINK request then sets IFF_UP, selects the interface by
 * IFLA_IFNAME and sets IFLA_MASTER to the bridge index.
 * arg_private_network, arg_network_veth and arg_network_bridge are checked
 * first; presumably the function does nothing unless all three are set (the
 * statements under those checks are not visible here). */
1625 static int setup_bridge(const char veth_name[]) {
1626 _cleanup_rtnl_message_unref_ sd_rtnl_message *m = NULL;
1627 _cleanup_rtnl_unref_ sd_rtnl *rtnl = NULL;
1630 if (!arg_private_network)
1633 if (!arg_network_veth)
1636 if (!arg_network_bridge)
1639 bridge = (int) if_nametoindex(arg_network_bridge);
1641 log_error("Failed to resolve interface %s: %m", arg_network_bridge);
1645 r = sd_rtnl_open(&rtnl, 0);
1647 log_error("Failed to connect to netlink: %s", strerror(-r));
1651 r = sd_rtnl_message_new_link(rtnl, &m, RTM_SETLINK, 0);
1653 log_error("Failed to allocate netlink message: %s", strerror(-r));
1657 r = sd_rtnl_message_link_set_flags(m, IFF_UP, IFF_UP);
1659 log_error("Failed to set IFF_UP flag: %s", strerror(-r));
1663 r = sd_rtnl_message_append_string(m, IFLA_IFNAME, veth_name);
1665 log_error("Failed to add netlink interface name field: %s", strerror(-r));
1669 r = sd_rtnl_message_append_u32(m, IFLA_MASTER, bridge);
1671 log_error("Failed to add netlink master field: %s", strerror(-r));
1675 r = sd_rtnl_call(rtnl, m, 0, NULL);
1677 log_error("Failed to add veth interface to bridge: %s", strerror(-r));
/* Look up a network interface by name and check it with udev.
 *
 * The name is resolved with if_nametoindex(); the resulting index is formatted
 * as the udev device id "n<index>" and the matching udev device is fetched.
 * An interface that udev does not report as initialized is rejected with an
 * error message.
 *
 * The return statements are not part of this listing. Callers in this file
 * store the result in a variable they then use as an interface index, so the
 * function presumably returns the index on success and a negative value on
 * failure -- confirm against the full file. */
1684 static int parse_interface(struct udev *udev, const char *name) {
1685 _cleanup_udev_device_unref_ struct udev_device *d = NULL;
/* Buffer for the "n%i" device id built below. */
1686 char ifi_str[2 + DECIMAL_STR_MAX(int)];
1689 ifi = (int) if_nametoindex(name);
1691 log_error("Failed to resolve interface %s: %m", name);
1695 sprintf(ifi_str, "n%i", ifi);
1696 d = udev_device_new_from_device_id(udev, ifi_str);
1698 log_error("Failed to get udev device for interface %s: %m", name);
/* "<= 0" treats both "not initialized" and an error from udev as a reason
 * to refuse the interface. */
1702 if (udev_device_get_is_initialized(d) <= 0) {
1703 log_error("Network interface %s is not initialized yet.", name);
/* Hand the interfaces listed in arg_network_interfaces over to the process
 * identified by pid.
 *
 * For every list entry the interface is resolved via parse_interface() and a
 * link message for that interface index carrying IFLA_NET_NS_PID = pid is
 * sent over rtnl; the error text describes this as moving the interface to
 * the namespace.
 *
 * NOTE(review): guard bodies, error checks, the udev allocation and the
 * return statements are on lines omitted from this listing. */
1710 static int move_network_interfaces(pid_t pid) {
1711 _cleanup_udev_unref_ struct udev *udev = NULL;
1712 _cleanup_rtnl_unref_ sd_rtnl *rtnl = NULL;
/* Guards: private networking and a non-empty interface list are tested
 * first (the action taken is not shown; presumably an early return). */
1716 if (!arg_private_network)
1719 if (strv_isempty(arg_network_interfaces))
1722 r = sd_rtnl_open(&rtnl, 0);
1724 log_error("Failed to connect to netlink: %s", strerror(-r));
1730 log_error("Failed to connect to udev.");
1734 STRV_FOREACH(i, arg_network_interfaces) {
/* One message per interface; the cleanup attribute is declared per
 * iteration. */
1735 _cleanup_rtnl_message_unref_ sd_rtnl_message *m = NULL;
1738 ifi = parse_interface(udev, *i);
1742 r = sd_rtnl_message_new_link(rtnl, &m, RTM_NEWLINK, ifi);
1744 log_error("Failed to allocate netlink message: %s", strerror(-r));
1748 r = sd_rtnl_message_append_u32(m, IFLA_NET_NS_PID, pid);
1750 log_error("Failed to append namespace PID to netlink message: %s", strerror(-r));
1754 r = sd_rtnl_call(rtnl, m, 0, NULL);
1756 log_error("Failed to move interface %s to namespace: %s", *i, strerror(-r));
/* Create one macvlan link per entry of arg_network_macvlan and place it in
 * the namespace of the process identified by pid.
 *
 * Each RTM_NEWLINK request carries:
 *   - IFLA_LINK        : index of the listed interface (from parse_interface())
 *   - IFLA_IFNAME      : "mv-" + listed name, shortened to IFNAMSIZ-1 characters
 *   - IFLA_NET_NS_PID  : pid
 *   - IFLA_LINKINFO / IFLA_INFO_DATA ("macvlan") with
 *     IFLA_MACVLAN_MODE = MACVLAN_MODE_BRIDGE
 *
 * NOTE(review): guard bodies, error checks, the udev allocation and the
 * return statements are on lines omitted from this listing. */
1764 static int setup_macvlan(pid_t pid) {
1765 _cleanup_udev_unref_ struct udev *udev = NULL;
1766 _cleanup_rtnl_unref_ sd_rtnl *rtnl = NULL;
1770 if (!arg_private_network)
1773 if (strv_isempty(arg_network_macvlan))
1776 r = sd_rtnl_open(&rtnl, 0);
1778 log_error("Failed to connect to netlink: %s", strerror(-r));
1784 log_error("Failed to connect to udev.");
1788 STRV_FOREACH(i, arg_network_macvlan) {
1789 _cleanup_rtnl_message_unref_ sd_rtnl_message *m = NULL;
1790 _cleanup_free_ char *n = NULL;
1793 ifi = parse_interface(udev, *i);
/* A new link is being created, so the message itself is built with
 * interface index 0; the listed interface goes into IFLA_LINK instead. */
1797 r = sd_rtnl_message_new_link(rtnl, &m, RTM_NEWLINK, 0);
1799 log_error("Failed to allocate netlink message: %s", strerror(-r));
1803 r = sd_rtnl_message_append_u32(m, IFLA_LINK, ifi);
1805 log_error("Failed to add netlink interface index: %s", strerror(-r));
/* Derive the new link's name from the listed one and cut it to fit an
 * interface name buffer (IFNAMSIZ-1 characters). */
1809 n = strappend("mv-", *i);
1813 strshorten(n, IFNAMSIZ-1);
1815 r = sd_rtnl_message_append_string(m, IFLA_IFNAME, n);
1817 log_error("Failed to add netlink interface name: %s", strerror(-r));
1821 r = sd_rtnl_message_append_u32(m, IFLA_NET_NS_PID, pid);
1823 log_error("Failed to add netlink namespace field: %s", strerror(-r));
/* Two nested containers are opened here and closed again below, in
 * matching number. */
1827 r = sd_rtnl_message_open_container(m, IFLA_LINKINFO);
1829 log_error("Failed to open netlink container: %s", strerror(-r));
1833 r = sd_rtnl_message_open_container_union(m, IFLA_INFO_DATA, "macvlan");
1835 log_error("Failed to open netlink container: %s", strerror(-r));
1839 r = sd_rtnl_message_append_u32(m, IFLA_MACVLAN_MODE, MACVLAN_MODE_BRIDGE);
1841 log_error("Failed to append macvlan mode: %s", strerror(-r));
1845 r = sd_rtnl_message_close_container(m);
1847 log_error("Failed to close netlink container: %s", strerror(-r));
1851 r = sd_rtnl_message_close_container(m);
1853 log_error("Failed to close netlink container: %s", strerror(-r));
1857 r = sd_rtnl_call(rtnl, m, 0, NULL);
1859 log_error("Failed to add new macvlan interfaces: %s", strerror(-r));
/* Install a seccomp filter.
 *
 * The filter is created with SCMP_ACT_ALLOW as default action. On top of
 * that, every syscall in the blacklist table is made to fail with EPERM, and
 * one extra rule fails with EAFNOSUPPORT when argument 0 equals AF_NETLINK
 * and argument 2 equals NETLINK_AUDIT (the existing comment in the body
 * explains the motivation). The filter's SCMP_FLTATR_CTL_NNP attribute is
 * set to 0 before the filter is loaded, and the context is released at the
 * end.
 *
 * NOTE(review): this listing omits short lines, including some blacklist
 * entries, the syscall argument of the audit rule, the delimiters of the
 * block comment, error checks and return statements. */
1867 static int setup_seccomp(void) {
1870 static const int blacklist[] = {
1871 SCMP_SYS(kexec_load),
1872 SCMP_SYS(open_by_handle_at),
1873 SCMP_SYS(init_module),
1874 SCMP_SYS(finit_module),
1875 SCMP_SYS(delete_module),
1882 scmp_filter_ctx seccomp;
/* Default action: allow; only the rules added below deny anything. */
1886 seccomp = seccomp_init(SCMP_ACT_ALLOW);
1890 r = seccomp_add_secondary_archs(seccomp);
1892 log_error("Failed to add secondary archs to seccomp filter: %s", strerror(-r));
/* One EPERM rule per table entry. The "unknown syscall" case is skipped
 * with continue; the condition selecting it is on a line not shown. */
1896 for (i = 0; i < ELEMENTSOF(blacklist); i++) {
1897 r = seccomp_rule_add(seccomp, SCMP_ACT_ERRNO(EPERM), blacklist[i], 0);
1899 continue; /* unknown syscall */
1901 log_error("Failed to block syscall: %s", strerror(-r));
1907 Audit is broken in containers, much of the userspace audit
1908 hookup will fail if running inside a container. We don't
1909 care and just turn off creation of audit sockets.
1911 This will make socket(AF_NETLINK, *, NETLINK_AUDIT) fail
1912 with EAFNOSUPPORT which audit userspace uses as indication
1913 that audit is disabled in the kernel.
1916 r = seccomp_rule_add(
1918 SCMP_ACT_ERRNO(EAFNOSUPPORT),
1921 SCMP_A0(SCMP_CMP_EQ, AF_NETLINK),
1922 SCMP_A2(SCMP_CMP_EQ, NETLINK_AUDIT));
1924 log_error("Failed to add audit seccomp rule: %s", strerror(-r));
/* Per the error text below, this is meant to keep loading the filter from
 * setting NO_NEW_PRIVS. */
1928 r = seccomp_attr_set(seccomp, SCMP_FLTATR_CTL_NNP, 0);
1930 log_error("Failed to unset NO_NEW_PRIVS: %s", strerror(-r));
1934 r = seccomp_load(seccomp);
1936 log_error("Failed to install seccomp audit filter: %s", strerror(-r));
1939 seccomp_release(seccomp);
/* Open the disk image named by arg_image and make it available as a block
 * device.
 *
 * The image is opened read-only or read-write depending on arg_read_only.
 * If it already is a block device its path is duplicated. Otherwise it must
 * be a regular file: a free loop device number is requested from
 * /dev/loop-control, /dev/loop<nr> is opened, the image fd is attached with
 * LOOP_SET_FD and the loop settings (LO_FLAGS_AUTOCLEAR|LO_FLAGS_PARTSCAN,
 * optionally LO_FLAGS_READ_ONLY) are applied with LOOP_SET_STATUS64. The
 * device path is handed back through *device_path.
 *
 * NOTE(review): the return statements, the handling of *loop_nr and the
 * ownership hand-over of the descriptors are on lines omitted from this
 * listing. The caller stores the result in a variable named image_fd, so a
 * file descriptor is presumably returned on success -- confirm. */
1947 static int setup_image(char **device_path, int *loop_nr) {
1948 struct loop_info64 info = {
1949 .lo_flags = LO_FLAGS_AUTOCLEAR|LO_FLAGS_PARTSCAN
1951 _cleanup_close_ int fd = -1, control = -1, loop = -1;
1952 _cleanup_free_ char* loopdev = NULL;
1956 assert(device_path);
1959 fd = open(arg_image, O_CLOEXEC|(arg_read_only ? O_RDONLY : O_RDWR)|O_NONBLOCK|O_NOCTTY);
1961 log_error("Failed to open %s: %m", arg_image);
1965 if (fstat(fd, &st) < 0) {
1966 log_error("Failed to stat %s: %m", arg_image);
/* Case 1: the image is a block device already; only its path is copied. */
1970 if (S_ISBLK(st.st_mode)) {
1973 p = strdup(arg_image);
/* Case 2: anything that is neither a block device nor a regular file is
 * refused.
 * NOTE(review): the message below ends in %m, but the visible check is a
 * mode test, not a failing call that sets errno -- the appended error text
 * looks unintended unless an omitted line sets errno. */
1987 if (!S_ISREG(st.st_mode)) {
1988 log_error("%s is not a regular file or block device: %m", arg_image);
/* Regular file: allocate a loop device number through the control node. */
1992 control = open("/dev/loop-control", O_RDWR|O_CLOEXEC|O_NOCTTY|O_NONBLOCK);
1994 log_error("Failed to open /dev/loop-control: %m");
1998 nr = ioctl(control, LOOP_CTL_GET_FREE);
2000 log_error("Failed to allocate loop device: %m");
2004 if (asprintf(&loopdev, "/dev/loop%i", nr) < 0)
2007 loop = open(loopdev, O_CLOEXEC|(arg_read_only ? O_RDONLY : O_RDWR)|O_NONBLOCK|O_NOCTTY);
2009 log_error("Failed to open loop device %s: %m", loopdev);
/* Bind the image file to the loop device. */
2013 if (ioctl(loop, LOOP_SET_FD, fd) < 0) {
2014 log_error("Failed to set loopback file descriptor on %s: %m", loopdev);
/* The condition guarding this flag is not shown; presumably arg_read_only. */
2019 info.lo_flags |= LO_FLAGS_READ_ONLY;
2021 if (ioctl(loop, LOOP_SET_STATUS64, &info) < 0) {
2022 log_error("Failed to set loopback settings on %s: %m", loopdev);
2026 *device_path = loopdev;
/* Find the partitions of a disk image that should be mounted.
 *
 * The image (descriptor fd) is probed with libblkid and must carry a GPT
 * partition table. The partition devices that udev lists beneath the image's
 * block device are then walked, and device nodes are picked by GPT partition
 * type: GPT_HOME, GPT_SRV and -- where the macros are defined --
 * GPT_ROOT_NATIVE and GPT_ROOT_SECONDARY. For each pick the writable flag is
 * derived from the absence of GPT_FLAG_READ_ONLY.
 *
 * Results are handed back through the *_device / *_device_rw out-parameters.
 * At least one root partition (native or secondary) is required.
 *
 * NOTE(review): this listing omits short lines, among them some parameters
 * of the signature, local declarations, the bodies of several conditions,
 * error checks, return statements and some preprocessor lines. Comments
 * below that refer to omitted lines are marked as assumptions. */
2037 static int dissect_image(
2039 char **root_device, bool *root_device_rw,
2040 char **home_device, bool *home_device_rw,
2041 char **srv_device, bool *srv_device_rw,
2045 int home_nr = -1, root_nr = -1, secondary_root_nr = -1, srv_nr = -1;
2046 _cleanup_free_ char *home = NULL, *root = NULL, *secondary_root = NULL, *srv = NULL;
2047 _cleanup_udev_enumerate_unref_ struct udev_enumerate *e = NULL;
2048 _cleanup_udev_device_unref_ struct udev_device *d = NULL;
2049 _cleanup_blkid_free_probe_ blkid_probe b = NULL;
2050 _cleanup_udev_unref_ struct udev *udev = NULL;
2051 struct udev_list_entry *first, *item;
2052 bool home_rw = true, root_rw = true, secondary_root_rw = true, srv_rw = true;
2053 const char *pttype = NULL;
2059 assert(root_device);
2060 assert(home_device);
/* Step 1: partition-table probe of the image itself. */
2064 b = blkid_new_probe();
2069 r = blkid_probe_set_device(b, fd, 0, 0);
2074 log_error("Failed to set device on blkid probe: %m");
2078 blkid_probe_enable_partitions(b, 1);
2079 blkid_probe_set_partitions_flags(b, BLKID_PARTS_ENTRY_DETAILS);
/* -2 and 1 get the "no partition table identified" message; any other
 * non-zero result is reported as a probe failure. */
2082 r = blkid_do_safeprobe(b);
2083 if (r == -2 || r == 1) {
2084 log_error("Failed to identify any partition table on %s.\n"
2085 "Note that the disk image needs to follow http://www.freedesktop.org/wiki/Specifications/DiscoverablePartitionsSpec/ to be supported by systemd-nspawn.", arg_image);
2087 } else if (r != 0) {
2090 log_error("Failed to probe: %m");
/* Only GPT is accepted; streq_ptr() also copes with pttype still being
 * NULL when the lookup found nothing. */
2094 blkid_probe_lookup_value(b, "PTTYPE", &pttype, NULL);
2095 if (!streq_ptr(pttype, "gpt")) {
2096 log_error("Image %s does not carry a GUID Partition Table.\n"
2097 "Note that the disk image needs to follow http://www.freedesktop.org/wiki/Specifications/DiscoverablePartitionsSpec/ to be supported by systemd-nspawn.", arg_image);
2102 pl = blkid_probe_get_partitions(b);
2107 log_error("Failed to list partitions of %s", arg_image);
/* Step 2: enumerate the udev children of the image's block device. */
2115 if (fstat(fd, &st) < 0) {
2116 log_error("Failed to stat block device: %m");
2120 d = udev_device_new_from_devnum(udev, 'b', st.st_rdev);
2124 e = udev_enumerate_new(udev);
2128 r = udev_enumerate_add_match_parent(e, d);
2132 r = udev_enumerate_scan_devices(e);
2134 log_error("Failed to scan for partition devices of %s: %s", arg_image, strerror(-r));
2138 first = udev_enumerate_get_list_entry(e);
2139 udev_list_entry_foreach(item, first) {
2140 _cleanup_udev_device_unref_ struct udev_device *q;
2141 const char *stype, *node;
2142 unsigned long long flags;
2149 q = udev_device_new_from_syspath(udev, udev_list_entry_get_name(item));
2154 log_error("Failed to get partition device of %s: %m", arg_image);
2158 qn = udev_device_get_devnum(q);
/* The enumeration can include the whole-disk device itself; it is tested
 * for here (action not shown, presumably skipped). */
2162 if (st.st_rdev == qn)
2165 node = udev_device_get_devnode(q);
/* Map the udev device back to its blkid partition entry. */
2169 pp = blkid_partlist_devno_to_partition(pl, qn);
/* Partitions flagged GPT_FLAG_NO_AUTO are tested for (action not shown,
 * presumably skipped). */
2173 flags = blkid_partition_get_flags(pp);
2174 if (flags & GPT_FLAG_NO_AUTO)
2177 nr = blkid_partition_get_partno(pp);
2181 stype = blkid_partition_get_type_string(pp);
2185 if (sd_id128_from_string(stype, &type_id) < 0)
/* Per type: a candidate whose partition number is not lower than the one
 * already recorded is tested for; the branch body is not shown. Presumably
 * it skips the candidate, so the lowest-numbered partition of each type
 * wins -- confirm. */
2188 if (sd_id128_equal(type_id, GPT_HOME)) {
2190 if (home && nr >= home_nr)
2194 home_rw = !(flags & GPT_FLAG_READ_ONLY);
2197 home = strdup(node);
2200 } else if (sd_id128_equal(type_id, GPT_SRV)) {
2202 if (srv && nr >= srv_nr)
2206 srv_rw = !(flags & GPT_FLAG_READ_ONLY);
2213 #ifdef GPT_ROOT_NATIVE
2214 else if (sd_id128_equal(type_id, GPT_ROOT_NATIVE)) {
2216 if (root && nr >= root_nr)
2220 root_rw = !(flags & GPT_FLAG_READ_ONLY);
2223 root = strdup(node);
2228 #ifdef GPT_ROOT_SECONDARY
2229 else if (sd_id128_equal(type_id, GPT_ROOT_SECONDARY)) {
2231 if (secondary_root && nr >= secondary_root_nr)
2234 secondary_root_nr = nr;
2235 secondary_root_rw = !(flags & GPT_FLAG_READ_ONLY);
2238 free(secondary_root);
2239 secondary_root = strdup(node);
2240 if (!secondary_root)
/* Step 3: publish the results. A root partition is mandatory. */
2246 if (!root && !secondary_root) {
2247 log_error("Failed to identify root partition in disk image %s.\n"
2248 "Note that the disk image needs to follow http://www.freedesktop.org/wiki/Specifications/DiscoverablePartitionsSpec/ to be supported by systemd-nspawn.", arg_image);
/* The secondary root is only used in the else-branch, i.e. when the branch
 * handing out the native root is not taken (its condition is not shown). */
2253 *root_device = root;
2256 *root_device_rw = root_rw;
2258 } else if (secondary_root) {
2259 *root_device = secondary_root;
/* Ownership moved to the caller; clearing the local keeps the cleanup
 * attribute from freeing it. */
2260 secondary_root = NULL;
2262 *root_device_rw = secondary_root_rw;
2267 *home_device = home;
2270 *home_device_rw = home_rw;
2277 *srv_device_rw = srv_rw;
/* Message for builds without blkid support; the preprocessor lines that
 * select this branch are not part of this listing. */
2282 log_error("--image= is not supported, compiled without blkid support.");
/* Mount one partition device.
 *
 * The file system type of `what` is detected with a libblkid superblock
 * probe; "crypto_LUKS" is refused. The device is then mounted with MS_NODEV,
 * plus MS_RDONLY when rw is false. The mount point p is built from `where`
 * and `directory` (the condition choosing between the concatenation and an
 * alternative is on a line not shown).
 *
 * NOTE(review): local declarations, some conditions, the return statements
 * and the preprocessor lines are omitted from this listing. */
2287 static int mount_device(const char *what, const char *where, const char *directory, bool rw) {
2289 _cleanup_blkid_free_probe_ blkid_probe b = NULL;
2290 const char *fstype, *p;
2300 p = strappenda(where, directory);
2305 b = blkid_new_probe_from_filename(what);
2309 log_error("Failed to allocate prober for %s: %m", what);
/* Only the TYPE value of the superblock is requested. */
2313 blkid_probe_enable_superblocks(b, 1);
2314 blkid_probe_set_superblocks_flags(b, BLKID_SUBLKS_TYPE);
/* NOTE(review): dissect_image() in this file checks the same call with
 * "r == -2 || r == 1", while this function checks "r == -1 || r == 1".
 * libblkid documents -2 as the ambiguous-result code and -1 as the error
 * code, so the -1 here may be a typo for -2 -- confirm before changing. */
2317 r = blkid_do_safeprobe(b);
2318 if (r == -1 || r == 1) {
2319 log_error("Cannot determine file system type of %s", what);
2321 } else if (r != 0) {
2324 log_error("Failed to probe %s: %m", what);
2329 if (blkid_probe_lookup_value(b, "TYPE", &fstype, NULL) < 0) {
2332 log_error("Failed to determine file system type of %s", what);
2336 if (streq(fstype, "crypto_LUKS")) {
2337 log_error("nspawn currently does not support LUKS disk images.");
2341 if (mount(what, p, fstype, MS_NODEV|(rw ? 0 : MS_RDONLY), NULL) < 0) {
2342 log_error("Failed to mount %s: %m", what);
/* Message for builds without blkid support; the preprocessor lines that
 * select this branch are not part of this listing. */
2348 log_error("--image= is not supported, compiled without blkid support.");
/* Mount the root, home and server-data partition devices via mount_device().
 *
 * The root device is mounted with no sub-directory, the home device on
 * "/home" and the srv device on "/srv". Note that the visible calls pass
 * arg_directory as the base; a leading parameter of the signature is on a
 * line not shown in this listing.
 *
 * NOTE(review): the conditions guarding each call (presumably "device is
 * set"), the error checks and the return statements are omitted here. */
2353 static int mount_devices(
2355 const char *root_device, bool root_device_rw,
2356 const char *home_device, bool home_device_rw,
2357 const char *srv_device, bool srv_device_rw) {
2363 r = mount_device(root_device, arg_directory, NULL, root_device_rw);
2365 log_error("Failed to mount root directory: %s", strerror(-r));
2371 r = mount_device(home_device, arg_directory, "/home", home_device_rw);
2373 log_error("Failed to mount home directory: %s", strerror(-r));
2379 r = mount_device(srv_device, arg_directory, "/srv", srv_device_rw);
2381 log_error("Failed to mount server data directory: %s", strerror(-r));
/* Tear down a loop device: detach the backing file through *image_fd (if it
 * holds an open descriptor), close that descriptor, then ask
 * /dev/loop-control to remove loop device number nr.
 *
 * The results of both ioctl() calls are not assigned or checked on the
 * visible lines. Guards on nr and on the open() result are presumably on
 * lines omitted from this listing -- confirm. */
2389 static void loop_remove(int nr, int *image_fd) {
2390 _cleanup_close_ int control = -1;
2395 if (image_fd && *image_fd >= 0) {
2396 ioctl(*image_fd, LOOP_CLR_FD);
/* The result of safe_close() is stored back into the caller's variable. */
2397 *image_fd = safe_close(*image_fd);
2400 control = open("/dev/loop-control", O_RDWR|O_CLOEXEC|O_NOCTTY|O_NONBLOCK);
2404 ioctl(control, LOOP_CTL_REMOVE, nr);
/* Run "getent <database> <key>" in a child process with its standard output
 * connected to a pipe.
 *
 * Child: stdout becomes the pipe's write end, stdin and stderr are pointed at
 * /dev/null, signal handlers are reset, all other descriptors are closed and
 * getent is executed with an empty environment, trying /usr/bin/getent first
 * and /bin/getent second.
 * Parent: closes the write end of the pipe.
 *
 * NOTE(review): the fork call, the handling of *rpid and the return
 * statement are on lines omitted from this listing. Callers in this file
 * fdopen() the result and later wait on the pid variable they passed in, so
 * the function presumably returns the pipe's read end and stores the child's
 * pid in *rpid -- confirm. */
2407 static int spawn_getent(const char *database, const char *key, pid_t *rpid) {
2415 if (pipe2(pipe_fds, O_CLOEXEC) < 0) {
2416 log_error("Failed to allocate pipe: %m");
2422 log_error("Failed to fork getent child: %m");
2424 } else if (pid == 0) {
2426 char *empty_env = NULL;
/* flags = 0: the duplicate does not carry O_CLOEXEC and so survives exec. */
2428 if (dup3(pipe_fds[1], STDOUT_FILENO, 0) < 0)
2429 _exit(EXIT_FAILURE);
/* Only close the pipe ends when they are not one of the standard
 * descriptors themselves. */
2431 if (pipe_fds[0] > 2)
2432 safe_close(pipe_fds[0]);
2433 if (pipe_fds[1] > 2)
2434 safe_close(pipe_fds[1]);
2436 nullfd = open("/dev/null", O_RDWR);
2438 _exit(EXIT_FAILURE);
2440 if (dup3(nullfd, STDIN_FILENO, 0) < 0)
2441 _exit(EXIT_FAILURE);
2443 if (dup3(nullfd, STDERR_FILENO, 0) < 0)
2444 _exit(EXIT_FAILURE);
2449 reset_all_signal_handlers();
2450 close_all_fds(NULL, 0);
/* The second exec is only reached when the first one failed; if both fail
 * the child exits with failure. */
2452 execle("/usr/bin/getent", "getent", database, key, NULL, &empty_env);
2453 execle("/bin/getent", "getent", database, key, NULL, &empty_env);
2454 _exit(EXIT_FAILURE);
2457 pipe_fds[1] = safe_close(pipe_fds[1]);
/* Switch the calling process to the user named by arg_user.
 *
 * Without a user, or for "root"/"0", the supplementary groups are cleared and
 * all group and user ids are set to 0. Otherwise:
 *   1. "getent passwd <user>" is run via spawn_getent() and the single output
 *      line is split at ':' into its fields (UID, GID, home directory, ...).
 *   2. "getent initgroups <user>" is run and the words after the user name
 *      are parsed into the supplementary group list.
 *   3. The home directory is created, the standard descriptors are chown'ed
 *      to the user, and setgroups()/setresgid()/setresuid() are applied, in
 *      that order.
 *
 * NOTE(review): this listing omits short lines, including the field
 * splitting details, error checks, the hand-over through *_home and all
 * return statements. */
2464 static int change_uid_gid(char **_home) {
2465 char line[LINE_MAX], *w, *x, *state, *u, *g, *h;
/* Despite its name and element type, this array is filled with the parsed
 * group ids and later passed to setgroups(). */
2466 _cleanup_free_ uid_t *uids = NULL;
2467 _cleanup_free_ char *home = NULL;
2468 _cleanup_fclose_ FILE *f = NULL;
2469 _cleanup_close_ int fd = -1;
2470 unsigned n_uids = 0;
2479 if (!arg_user || streq(arg_user, "root") || streq(arg_user, "0")) {
2480 /* Reset everything fully to 0, just in case */
2482 if (setgroups(0, NULL) < 0) {
2483 log_error("setgroups() failed: %m");
/* NOTE(review): the two messages below say setregid()/setreuid(), but the
 * calls being reported are setresgid()/setresuid(). */
2487 if (setresgid(0, 0, 0) < 0) {
2488 log_error("setregid() failed: %m");
2492 if (setresuid(0, 0, 0) < 0) {
2493 log_error("setreuid() failed: %m");
2501 /* First, get user credentials */
2502 fd = spawn_getent("passwd", arg_user, &pid);
2506 f = fdopen(fd, "r");
/* Only one line of getent output is read. */
2511 if (!fgets(line, sizeof(line), f)) {
2514 log_error("Failed to resolve user %s.", arg_user);
2518 log_error("Failed to read from getent: %m");
2524 wait_for_terminate_and_warn("getent passwd", pid);
/* Walk the ':'-separated passwd fields; each error message names the field
 * that could not be delimited. */
2526 x = strchr(line, ':');
2528 log_error("/etc/passwd entry has invalid user field.");
2532 u = strchr(x+1, ':');
2534 log_error("/etc/passwd entry has invalid password field.");
2541 log_error("/etc/passwd entry has invalid UID field.");
2549 log_error("/etc/passwd entry has invalid GID field.");
2554 h = strchr(x+1, ':');
2556 log_error("/etc/passwd entry has invalid GECOS field.");
2563 log_error("/etc/passwd entry has invalid home directory field.");
2569 r = parse_uid(u, &uid);
2571 log_error("Failed to parse UID of user.");
2575 r = parse_gid(g, &gid);
2577 log_error("Failed to parse GID of user.");
2585 /* Second, get group memberships */
2586 fd = spawn_getent("initgroups", arg_user, &pid);
2591 f = fdopen(fd, "r");
2596 if (!fgets(line, sizeof(line), f)) {
2598 log_error("Failed to resolve user %s.", arg_user);
2602 log_error("Failed to read from getent: %m");
2608 wait_for_terminate_and_warn("getent initgroups", pid);
2610 /* Skip over the username and subsequent separator whitespace */
2612 x += strcspn(x, WHITESPACE);
2613 x += strspn(x, WHITESPACE);
/* Each remaining word is one group id; the array grows by one slot per
 * word. */
2615 FOREACH_WORD(w, l, x, state) {
2621 if (!GREEDY_REALLOC(uids, sz, n_uids+1))
2624 r = parse_uid(c, &uids[n_uids++]);
2626 log_error("Failed to parse group data from getent.");
/* Create the parents of the home directory first, then the directory itself
 * owned by the target user; an already existing directory is tolerated. */
2631 r = mkdir_parents(home, 0775);
2633 log_error("Failed to make home root directory: %s", strerror(-r));
2637 r = mkdir_safe(home, 0755, uid, gid);
2638 if (r < 0 && r != -EEXIST) {
2639 log_error("Failed to make home directory: %s", strerror(-r));
/* Results of these three calls are not checked on the visible lines. */
2643 fchown(STDIN_FILENO, uid, gid);
2644 fchown(STDOUT_FILENO, uid, gid);
2645 fchown(STDERR_FILENO, uid, gid);
/* Order matters: groups and gid are changed while the process still has the
 * privilege to do so; the uid is dropped last. */
2647 if (setgroups(n_uids, uids) < 0) {
2648 log_error("Failed to set auxiliary groups: %m");
/* NOTE(review): same message/call mismatch as in the root branch above. */
2652 if (setresgid(gid, gid, gid) < 0) {
2653 log_error("setregid() failed: %m");
2657 if (setresuid(uid, uid, uid) < 0) {
2658 log_error("setreuid() failed: %m");
2672 * < 0 : wait_for_terminate() failed to get the state of the
2673 * container, the container was terminated by a signal, or
2674 * failed for an unknown reason. No change is made to the
2675 * container argument.
2676 * > 0 : The program executed in the container terminated with an
2677 * error. The exit code of the program executed in the
2678 * container is returned. No change is made to the container
2680 * 0 : The container is being rebooted, has been shut down or exited
2681 * successfully. The container argument has been set to either
2682 * CONTAINER_TERMINATED or CONTAINER_REBOOTED.
2684 * That is, success is indicated by a return value of zero, and an
2685 * error is indicated by a non-zero value.
/* Waits for the container process and classifies how it ended, based on
 * si_code/si_status of the wait result. The return-value contract is given
 * in the comment preceding this function.
 *
 * NOTE(review): the case labels, break/return statements and some log
 * arguments are on lines omitted from this listing; the grouping remarks
 * below follow the visible messages. */
2687 static int wait_for_container(pid_t pid, ContainerStatus *container) {
2691 r = wait_for_terminate(pid, &status);
2693 log_warning("Failed to wait for container: %s", strerror(-r));
2697 switch (status.si_code) {
/* Exit path: the exit status becomes the return value. */
2699 r = status.si_status;
2702 log_debug("Container %s exited successfully.",
2705 *container = CONTAINER_TERMINATED;
2707 log_error("Container %s failed with error code %i.",
2708 arg_machine, status.si_status);
/* Signal path: SIGINT is reported as a shutdown and SIGHUP as a reboot
 * request; other signals reach the "terminated by signal" message below. */
2713 if (status.si_status == SIGINT) {
2715 log_info("Container %s has been shut down.",
2718 *container = CONTAINER_TERMINATED;
2721 } else if (status.si_status == SIGHUP) {
2723 log_info("Container %s is being rebooted.",
2726 *container = CONTAINER_REBOOTED;
2730 /* CLD_KILLED fallthrough */
2733 log_error("Container %s terminated by signal %s.",
2734 arg_machine, signal_to_string(status.si_status));
2739 log_error("Container %s failed due to unknown reason.",
/* Signal handler that does nothing. main() installs it for SIGCHLD so that
 * the signal interrupts blocking calls in the parent instead of being
 * ignored. */
2748 static void nop_handler(int sig) {}
2750 int main(int argc, char *argv[]) {
2752 _cleanup_free_ char *kdbus_domain = NULL, *device_path = NULL, *root_device = NULL, *home_device = NULL, *srv_device = NULL;
2753 bool root_device_rw = true, home_device_rw = true, srv_device_rw = true;
2754 _cleanup_close_ int master = -1, kdbus_fd = -1, image_fd = -1;
2755 _cleanup_close_pair_ int kmsg_socket_pair[2] = { -1, -1 };
2756 _cleanup_fdset_free_ FDSet *fds = NULL;
2757 int r = EXIT_FAILURE, k, n_fd_passed, loop_nr = -1;
2758 const char *console = NULL;
2759 char veth_name[IFNAMSIZ];
2760 bool secondary = false;
2761 sigset_t mask, mask_chld;
2764 log_parse_environment();
2767 k = parse_argv(argc, argv);
2776 if (arg_directory) {
2779 p = path_make_absolute_cwd(arg_directory);
2780 free(arg_directory);
2783 arg_directory = get_current_dir_name();
2785 if (!arg_directory) {
2786 log_error("Failed to determine path, please use -D.");
2789 path_kill_slashes(arg_directory);
2793 arg_machine = strdup(basename(arg_image ? arg_image : arg_directory));
2799 hostname_cleanup(arg_machine, false);
2800 if (isempty(arg_machine)) {
2801 log_error("Failed to determine machine name automatically, please use -M.");
2806 if (geteuid() != 0) {
2807 log_error("Need to be root.");
2811 if (sd_booted() <= 0) {
2812 log_error("Not running on a systemd system.");
2817 n_fd_passed = sd_listen_fds(false);
2818 if (n_fd_passed > 0) {
2819 k = fdset_new_listen_fds(&fds, false);
2821 log_error("Failed to collect file descriptors: %s", strerror(-k));
2825 fdset_close_others(fds);
2828 if (arg_directory) {
2829 if (path_equal(arg_directory, "/")) {
2830 log_error("Spawning container on root directory not supported.");
2835 if (path_is_os_tree(arg_directory) <= 0) {
2836 log_error("Directory %s doesn't look like an OS root directory (os-release file is missing). Refusing.", arg_directory);
2842 p = strappenda(arg_directory,
2843 argc > optind && path_is_absolute(argv[optind]) ? argv[optind] : "/usr/bin/");
2844 if (access(p, F_OK) < 0) {
2845 log_error("Directory %s lacks the binary to execute or doesn't look like a binary tree. Refusing.", arg_directory);
2851 char template[] = "/tmp/nspawn-root-XXXXXX";
2853 if (!mkdtemp(template)) {
2854 log_error("Failed to create temporary directory: %m");
2859 arg_directory = strdup(template);
2860 if (!arg_directory) {
2865 image_fd = setup_image(&device_path, &loop_nr);
2871 r = dissect_image(image_fd, &root_device, &root_device_rw, &home_device, &home_device_rw, &srv_device, &srv_device_rw, &secondary);
2876 master = posix_openpt(O_RDWR|O_NOCTTY|O_CLOEXEC|O_NDELAY);
2878 log_error("Failed to acquire pseudo tty: %m");
2882 console = ptsname(master);
2884 log_error("Failed to determine tty name: %m");
2889 log_info("Spawning container %s on %s.\nPress ^] three times within 1s to kill container.",
2890 arg_machine, arg_image ? arg_image : arg_directory);
2892 if (unlockpt(master) < 0) {
2893 log_error("Failed to unlock tty: %m");
2897 if (access("/dev/kdbus/control", F_OK) >= 0) {
2899 if (arg_share_system) {
2900 kdbus_domain = strdup("/dev/kdbus");
2901 if (!kdbus_domain) {
2908 ns = strappenda("machine-", arg_machine);
2909 kdbus_fd = bus_kernel_create_domain(ns, &kdbus_domain);
2911 log_debug("Failed to create kdbus domain: %s", strerror(-r));
2913 log_debug("Successfully created kdbus domain as %s", kdbus_domain);
2917 if (socketpair(AF_UNIX, SOCK_DGRAM|SOCK_NONBLOCK|SOCK_CLOEXEC, 0, kmsg_socket_pair) < 0) {
2918 log_error("Failed to create kmsg socket pair: %m");
2922 sd_notify(0, "READY=1");
2924 assert_se(sigemptyset(&mask) == 0);
2925 assert_se(sigemptyset(&mask_chld) == 0);
2926 sigaddset(&mask_chld, SIGCHLD);
2927 sigset_add_many(&mask, SIGCHLD, SIGWINCH, SIGTERM, SIGINT, -1);
2928 assert_se(sigprocmask(SIG_BLOCK, &mask, NULL) == 0);
2931 ContainerStatus container_status;
2932 int eventfds[2] = { -1, -1 };
2933 struct sigaction sa = {
2934 .sa_handler = nop_handler,
2935 .sa_flags = SA_NOCLDSTOP,
2938 /* Child can be killed before execv(), so handle SIGCHLD
2939 * in order to interrupt parent's blocking calls and
2940 * give it a chance to call wait() and terminate. */
2941 r = sigprocmask(SIG_UNBLOCK, &mask_chld, NULL);
2943 log_error("Failed to change the signal mask: %m");
2947 r = sigaction(SIGCHLD, &sa, NULL);
2949 log_error("Failed to install SIGCHLD handler: %m");
2953 pid = clone_with_eventfd(SIGCHLD|CLONE_NEWNS|
2954 (arg_share_system ? 0 : CLONE_NEWIPC|CLONE_NEWPID|CLONE_NEWUTS)|
2955 (arg_private_network ? CLONE_NEWNET : 0), eventfds);
2957 if (errno == EINVAL)
2958 log_error("clone() failed, do you have namespace support enabled in your kernel? (You need UTS, IPC, PID and NET namespacing built in): %m");
2960 log_error("clone() failed: %m");
2968 _cleanup_free_ char *home = NULL;
2970 const char *envp[] = {
2971 "PATH=" DEFAULT_PATH_SPLIT_USR,
2972 "container=systemd-nspawn", /* LXC sets container=lxc, so follow the scheme here */
2977 NULL, /* container_uuid */
2978 NULL, /* LISTEN_FDS */
2979 NULL, /* LISTEN_PID */
2984 envp[n_env] = strv_find_prefix(environ, "TERM=");
2988 master = safe_close(master);
2990 close_nointr(STDIN_FILENO);
2991 close_nointr(STDOUT_FILENO);
2992 close_nointr(STDERR_FILENO);
2994 kmsg_socket_pair[0] = safe_close(kmsg_socket_pair[0]);
2996 reset_all_signal_handlers();
2998 assert_se(sigemptyset(&mask) == 0);
2999 assert_se(sigprocmask(SIG_SETMASK, &mask, NULL) == 0);
3001 k = open_terminal(console, O_RDWR);
3002 if (k != STDIN_FILENO) {
3008 log_error("Failed to open console: %s", strerror(-k));
3012 if (dup2(STDIN_FILENO, STDOUT_FILENO) != STDOUT_FILENO ||
3013 dup2(STDIN_FILENO, STDERR_FILENO) != STDERR_FILENO) {
3014 log_error("Failed to duplicate console: %m");
3019 log_error("setsid() failed: %m");
3023 if (reset_audit_loginuid() < 0)
3026 if (prctl(PR_SET_PDEATHSIG, SIGKILL) < 0) {
3027 log_error("PR_SET_PDEATHSIG failed: %m");
3031 /* Mark everything as slave, so that we still
3032 * receive mounts from the real root, but don't
3033 * propagate mounts to the real root. */
3034 if (mount(NULL, "/", NULL, MS_SLAVE|MS_REC, NULL) < 0) {
3035 log_error("MS_SLAVE|MS_REC failed: %m");
3039 if (mount_devices(arg_directory,
3040 root_device, root_device_rw,
3041 home_device, home_device_rw,
3042 srv_device, srv_device_rw) < 0)
3045 r = base_filesystem_create(arg_directory);
3047 log_error("Failed to create the base filesystem: %s", strerror(-r));
3051 /* Turn directory into bind mount */
3052 if (mount(arg_directory, arg_directory, "bind", MS_BIND|MS_REC, NULL) < 0) {
3053 log_error("Failed to make bind mount: %m");
3057 if (arg_read_only) {
3058 k = bind_remount_recursive(arg_directory, true);
3060 log_error("Failed to make tree read-only: %s", strerror(-k));
3065 if (mount_all(arg_directory) < 0)
3068 if (copy_devnodes(arg_directory) < 0)
3071 if (setup_ptmx(arg_directory) < 0)
3074 dev_setup(arg_directory);
3076 if (setup_seccomp() < 0)
3079 if (setup_dev_console(arg_directory, console) < 0)
3082 if (setup_kmsg(arg_directory, kmsg_socket_pair[1]) < 0)
3085 kmsg_socket_pair[1] = safe_close(kmsg_socket_pair[1]);
3087 if (setup_boot_id(arg_directory) < 0)
3090 if (setup_timezone(arg_directory) < 0)
3093 if (setup_resolv_conf(arg_directory) < 0)
3096 if (setup_journal(arg_directory) < 0)
3099 if (mount_binds(arg_directory, arg_bind, false) < 0)
3102 if (mount_binds(arg_directory, arg_bind_ro, true) < 0)
3105 if (mount_tmpfs(arg_directory) < 0)
3108 if (setup_kdbus(arg_directory, kdbus_domain) < 0)
3111 /* Tell the parent that we are ready, and that
3112 * it can cgroupify us so that we lack access
3113 * to certain devices and resources. */
3114 r = eventfd_send_state(eventfds[1],
3115 EVENTFD_CHILD_SUCCEEDED);
3116 eventfds[1] = safe_close(eventfds[1]);
3120 if (chdir(arg_directory) < 0) {
3121 log_error("chdir(%s) failed: %m", arg_directory);
3125 if (mount(arg_directory, "/", NULL, MS_MOVE, NULL) < 0) {
3126 log_error("mount(MS_MOVE) failed: %m");
3130 if (chroot(".") < 0) {
3131 log_error("chroot() failed: %m");
3135 if (chdir("/") < 0) {
3136 log_error("chdir() failed: %m");
3142 if (arg_private_network)
3145 if (drop_capabilities() < 0) {
3146 log_error("drop_capabilities() failed: %m");
3150 r = change_uid_gid(&home);
3154 if ((asprintf((char**)(envp + n_env++), "HOME=%s", home ? home: "/root") < 0) ||
3155 (asprintf((char**)(envp + n_env++), "USER=%s", arg_user ? arg_user : "root") < 0) ||
3156 (asprintf((char**)(envp + n_env++), "LOGNAME=%s", arg_user ? arg_user : "root") < 0)) {
3161 if (!sd_id128_equal(arg_uuid, SD_ID128_NULL)) {
3164 if (asprintf((char**)(envp + n_env++), "container_uuid=%s", id128_format_as_uuid(arg_uuid, as_uuid)) < 0) {
3170 if (fdset_size(fds) > 0) {
3171 k = fdset_cloexec(fds, false);
3173 log_error("Failed to unset O_CLOEXEC for file descriptors.");
3177 if ((asprintf((char **)(envp + n_env++), "LISTEN_FDS=%u", n_fd_passed) < 0) ||
3178 (asprintf((char **)(envp + n_env++), "LISTEN_PID=1") < 0)) {
3186 if (arg_personality != 0xffffffffLU) {
3187 if (personality(arg_personality) < 0) {
3188 log_error("personality() failed: %m");
3191 } else if (secondary) {
3192 if (personality(PER_LINUX32) < 0) {
3193 log_error("personality() failed: %m");
3199 if (arg_selinux_context)
3200 if (setexeccon((security_context_t) arg_selinux_context) < 0) {
3201 log_error("setexeccon(\"%s\") failed: %m", arg_selinux_context);
3206 if (!strv_isempty(arg_setenv)) {
3209 n = strv_env_merge(2, envp, arg_setenv);
3217 env_use = (char**) envp;
3219 /* Wait until the parent is ready with the setup, too... */
3220 r = eventfd_parent_succeeded(eventfds[0]);
3221 eventfds[0] = safe_close(eventfds[0]);
3229 /* Automatically search for the init system */
3231 l = 1 + argc - optind;
3232 a = newa(char*, l + 1);
3233 memcpy(a + 1, argv + optind, l * sizeof(char*));
3235 a[0] = (char*) "/usr/lib/systemd/systemd";
3236 execve(a[0], a, env_use);
3238 a[0] = (char*) "/lib/systemd/systemd";
3239 execve(a[0], a, env_use);
3241 a[0] = (char*) "/sbin/init";
3242 execve(a[0], a, env_use);
3243 } else if (argc > optind)
3244 execvpe(argv[optind], argv + optind, env_use);
3246 chdir(home ? home : "/root");
3247 execle("/bin/bash", "-bash", NULL, env_use);
3248 execle("/bin/sh", "-sh", NULL, env_use);
3251 log_error("execv() failed: %m");
3254 /* Tell the parent that the setup failed, so he
3255 * can clean up resources and terminate. */
3256 if (eventfds[1] != -1)
3257 eventfd_send_state(eventfds[1],
3258 EVENTFD_CHILD_FAILED);
3259 _exit(EXIT_FAILURE);
3265 /* Wait for the child event:
3266 * If EVENTFD_CHILD_FAILED, the child will terminate soon.
3267 * If EVENTFD_CHILD_SUCCEEDED, the child is reporting that
3268 * it is ready with all it needs to do with privileges.
3269 * After we got the notification we can make the process
3270 * join its cgroup which might limit what it can do */
3271 r = eventfd_child_succeeded(eventfds[1]);
3272 eventfds[1] = safe_close(eventfds[1]);
3275 r = register_machine(pid);
3279 r = move_network_interfaces(pid);
3283 r = setup_veth(pid, veth_name);
3287 r = setup_bridge(veth_name);
3291 r = setup_macvlan(pid);
3295 /* Block SIGCHLD here, before notifying child.
3296 * process_pty() will handle it with the other signals. */
3297 r = sigprocmask(SIG_BLOCK, &mask_chld, NULL);
3301 /* Reset signal to default */
3302 r = default_signals(SIGCHLD, -1);
3306 /* Notify the child that the parent is ready with all
3307 * its setup, and that the child can now hand over
3308 * control to the code to run inside the container. */
3309 r = eventfd_send_state(eventfds[0], EVENTFD_PARENT_SUCCEEDED);
3310 eventfds[0] = safe_close(eventfds[0]);
3314 k = process_pty(master, &mask, arg_boot ? pid : 0, SIGRTMIN+3);
3323 /* Kill if it is not dead yet anyway */
3324 terminate_machine(pid);
3327 /* Normally redundant, but better safe than sorry */
3330 r = wait_for_container(pid, &container_status);
3334 /* We failed to wait for the container, or the
3335 * container exited abnormally */
3338 } else if (r > 0 || container_status == CONTAINER_TERMINATED)
3339 /* The container exited with a non-zero
3340 * status, or with zero status and no reboot
3344 /* CONTAINER_REBOOTED, loop again */
3348 loop_remove(loop_nr, &image_fd);
3353 free(arg_directory);
3356 strv_free(arg_setenv);
3357 strv_free(arg_network_interfaces);
3358 strv_free(arg_network_macvlan);
3359 strv_free(arg_bind);
3360 strv_free(arg_bind_ro);
3361 strv_free(arg_tmpfs);