1 /*-*- Mode: C; c-basic-offset: 8; indent-tabs-mode: nil -*-*/
4 This file is part of systemd.
6 Copyright 2010 Lennart Poettering
8 systemd is free software; you can redistribute it and/or modify it
9 under the terms of the GNU Lesser General Public License as published by
10 the Free Software Foundation; either version 2.1 of the License, or
11 (at your option) any later version.
13 systemd is distributed in the hope that it will be useful, but
14 WITHOUT ANY WARRANTY; without even the implied warranty of
15 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
16 Lesser General Public License for more details.
18 You should have received a copy of the GNU Lesser General Public License
19 along with systemd; If not, see <http://www.gnu.org/licenses/>.
25 #include <sys/types.h>
26 #include <sys/syscall.h>
27 #include <sys/mount.h>
33 #include <sys/prctl.h>
34 #include <sys/capability.h>
37 #include <sys/signalfd.h>
41 #include <sys/socket.h>
42 #include <linux/netlink.h>
43 #include <sys/eventfd.h>
45 #include <linux/veth.h>
46 #include <sys/personality.h>
47 #include <linux/loop.h>
50 #include <selinux/selinux.h>
58 #include <blkid/blkid.h>
61 #include "sd-daemon.h"
71 #include "cgroup-util.h"
73 #include "path-util.h"
74 #include "loopback-setup.h"
75 #include "dev-setup.h"
80 #include "bus-error.h"
82 #include "bus-kernel.h"
85 #include "rtnl-util.h"
86 #include "udev-util.h"
87 #include "blkid-util.h"
91 #include "seccomp-util.h"
/* Command-line state: parse_argv() stores the parsed options in the
 * file-scope arg_* variables below; the initializers are the values in
 * effect when the corresponding option is not given.
 * NOTE(review): this listing has gaps (the embedded original line numbers
 * skip), so the LinkJournal enumerators and the end of the arg_retain
 * initializer are not visible here. */
94 typedef enum LinkJournal {
101 static char *arg_directory = NULL;
102 static char *arg_user = NULL;
103 static sd_id128_t arg_uuid = {};
104 static char *arg_machine = NULL;
105 static const char *arg_selinux_context = NULL;
106 static const char *arg_selinux_apifs_context = NULL;
107 static const char *arg_slice = NULL;
108 static bool arg_private_network = false;
109 static bool arg_read_only = false;
110 static bool arg_boot = false;
111 static LinkJournal arg_link_journal = LINK_AUTO;
/* Bitmask with one bit per CAP_ number. parse_argv() ORs in the
 * --capability= bits (plus CAP_NET_ADMIN when a private network is
 * requested) and then clears the --drop-capability= bits;
 * drop_capabilities() hands the complement of this mask to
 * capability_bounding_set_drop(). */
112 static uint64_t arg_retain =
113 (1ULL << CAP_CHOWN) |
114 (1ULL << CAP_DAC_OVERRIDE) |
115 (1ULL << CAP_DAC_READ_SEARCH) |
116 (1ULL << CAP_FOWNER) |
117 (1ULL << CAP_FSETID) |
118 (1ULL << CAP_IPC_OWNER) |
120 (1ULL << CAP_LEASE) |
121 (1ULL << CAP_LINUX_IMMUTABLE) |
122 (1ULL << CAP_NET_BIND_SERVICE) |
123 (1ULL << CAP_NET_BROADCAST) |
124 (1ULL << CAP_NET_RAW) |
125 (1ULL << CAP_SETGID) |
126 (1ULL << CAP_SETFCAP) |
127 (1ULL << CAP_SETPCAP) |
128 (1ULL << CAP_SETUID) |
129 (1ULL << CAP_SYS_ADMIN) |
130 (1ULL << CAP_SYS_CHROOT) |
131 (1ULL << CAP_SYS_NICE) |
132 (1ULL << CAP_SYS_PTRACE) |
133 (1ULL << CAP_SYS_TTY_CONFIG) |
134 (1ULL << CAP_SYS_RESOURCE) |
135 (1ULL << CAP_SYS_BOOT) |
136 (1ULL << CAP_AUDIT_WRITE) |
137 (1ULL << CAP_AUDIT_CONTROL) |
/* Pairs of paths: parse_argv() appends two entries (source, destination)
 * per --bind= / --bind-ro= option. */
139 static char **arg_bind = NULL;
140 static char **arg_bind_ro = NULL;
141 static char **arg_setenv = NULL;
142 static bool arg_quiet = false;
143 static bool arg_share_system = false;
144 static bool arg_register = true;
145 static bool arg_keep_unit = false;
146 static char **arg_network_interfaces = NULL;
147 static char **arg_network_macvlan = NULL;
148 static bool arg_network_veth = false;
149 static const char *arg_network_bridge = NULL;
/* 0xffffffffLU is also the value parse_argv() rejects as an unknown or
 * unsupported personality when it comes back from
 * personality_from_string(). */
150 static unsigned long arg_personality = 0xffffffffLU;
151 static const char *arg_image = NULL;
/* Prints the usage line and the option summary with a single printf(),
 * substituting program_invocation_short_name for the leading %s.
 *
 * NOTE(review): the text for -j says "Equivalent to --link-journal=host",
 * but the assignment visible in parse_argv() just before the
 * ARG_LINK_JOURNAL case sets LINK_GUEST (its case label is not visible in
 * this listing, presumably 'j'). Confirm which one is intended.
 * NOTE(review): some continuation lines of the help text are missing from
 * this listing (the embedded line numbers skip, e.g. 170, 175, 179). */
153 static int help(void) {
155 printf("%s [OPTIONS...] [PATH] [ARGUMENTS...]\n\n"
156 "Spawn a minimal namespace container for debugging, testing and building.\n\n"
157 " -h --help Show this help\n"
158 " --version Print version string\n"
159 " -q --quiet Do not show status information\n"
160 " -D --directory=PATH Root directory for the container\n"
161 " -i --image=PATH File system device or image for the container\n"
162 " -b --boot Boot up full system (i.e. invoke init)\n"
163 " -u --user=USER Run the command under specified user or uid\n"
164 " -M --machine=NAME Set the machine name for the container\n"
165 " --uuid=UUID Set a specific machine UUID for the container\n"
166 " -S --slice=SLICE Place the container in the specified slice\n"
167 " --private-network Disable network in container\n"
168 " --network-interface=INTERFACE\n"
169 " Assign an existing network interface to the\n"
171 " --network-macvlan=INTERFACE\n"
172 " Create a macvlan network interface based on an\n"
173 " existing network interface to the container\n"
174 " --network-veth Add a virtual ethernet connection between host\n"
176 " --network-bridge=INTERFACE\n"
177 " Add a virtual ethernet connection between host\n"
178 " and container and add it to an existing bridge on\n"
180 " -Z --selinux-context=SECLABEL\n"
181 " Set the SELinux security context to be used by\n"
182 " processes in the container\n"
183 " -L --selinux-apifs-context=SECLABEL\n"
184 " Set the SELinux security context to be used by\n"
185 " API/tmpfs file systems in the container\n"
186 " --capability=CAP In addition to the default, retain specified\n"
188 " --drop-capability=CAP Drop the specified capability from the default set\n"
189 " --link-journal=MODE Link up guest journal, one of no, auto, guest, host\n"
190 " -j Equivalent to --link-journal=host\n"
191 " --read-only Mount the root directory read-only\n"
192 " --bind=PATH[:PATH] Bind mount a file or directory from the host into\n"
194 " --bind-ro=PATH[:PATH] Similar, but creates a read-only bind mount\n"
195 " --setenv=NAME=VALUE Pass an environment variable to PID 1\n"
196 " --share-system Share system namespaces with host\n"
197 " --register=BOOLEAN Register container as machine\n"
198 " --keep-unit Do not register a scope for the machine, reuse\n"
199 " the service unit nspawn is running in\n",
200 program_invocation_short_name);
/* Parses the command line with getopt_long() and stores the result in the
 * file-scope arg_* variables.
 *
 * Visible behavior:
 *  - string options are either copied (arg_directory via
 *    canonicalize_file_name(), arg_user and arg_machine via strdup()) or
 *    kept as a pointer to optarg (arg_network_bridge, arg_selinux_context,
 *    arg_selinux_apifs_context);
 *  - --network-veth and --network-interface= also set arg_private_network;
 *  - --capability= / --drop-capability= take a comma-separated list and
 *    accumulate bits in plus / minus, with "all" setting every bit;
 *  - --bind= / --bind-ro= append an (a, b) pair of absolute paths to
 *    arg_bind or arg_bind_ro;
 *  - after the loop, conflicting options are reported with log_error() and
 *    the final arg_retain mask is computed.
 *
 * NOTE(review): case labels, break/return statements and most error paths
 * are missing from this listing, so return values are not documented here. */
205 static int parse_argv(int argc, char *argv[]) {
221 ARG_NETWORK_INTERFACE,
228 static const struct option options[] = {
229 { "help", no_argument, NULL, 'h' },
230 { "version", no_argument, NULL, ARG_VERSION },
231 { "directory", required_argument, NULL, 'D' },
232 { "user", required_argument, NULL, 'u' },
233 { "private-network", no_argument, NULL, ARG_PRIVATE_NETWORK },
234 { "boot", no_argument, NULL, 'b' },
235 { "uuid", required_argument, NULL, ARG_UUID },
236 { "read-only", no_argument, NULL, ARG_READ_ONLY },
237 { "capability", required_argument, NULL, ARG_CAPABILITY },
238 { "drop-capability", required_argument, NULL, ARG_DROP_CAPABILITY },
239 { "link-journal", required_argument, NULL, ARG_LINK_JOURNAL },
240 { "bind", required_argument, NULL, ARG_BIND },
241 { "bind-ro", required_argument, NULL, ARG_BIND_RO },
242 { "machine", required_argument, NULL, 'M' },
243 { "slice", required_argument, NULL, 'S' },
244 { "setenv", required_argument, NULL, ARG_SETENV },
245 { "selinux-context", required_argument, NULL, 'Z' },
246 { "selinux-apifs-context", required_argument, NULL, 'L' },
247 { "quiet", no_argument, NULL, 'q' },
248 { "share-system", no_argument, NULL, ARG_SHARE_SYSTEM },
249 { "register", required_argument, NULL, ARG_REGISTER },
250 { "keep-unit", no_argument, NULL, ARG_KEEP_UNIT },
251 { "network-interface", required_argument, NULL, ARG_NETWORK_INTERFACE },
252 { "network-macvlan", required_argument, NULL, ARG_NETWORK_MACVLAN },
253 { "network-veth", no_argument, NULL, ARG_NETWORK_VETH },
254 { "network-bridge", required_argument, NULL, ARG_NETWORK_BRIDGE },
255 { "personality", required_argument, NULL, ARG_PERSONALITY },
256 { "image", required_argument, NULL, 'i' },
/* Capability bits requested via --capability= (plus) and
 * --drop-capability= (minus); applied to arg_retain after the loop. */
261 uint64_t plus = 0, minus = 0;
266 while ((c = getopt_long(argc, argv, "+hD:u:bL:M:jS:Z:qi:", options, NULL)) >= 0) {
274 puts(PACKAGE_STRING);
275 puts(SYSTEMD_FEATURES);
280 arg_directory = canonicalize_file_name(optarg);
281 if (!arg_directory) {
282 log_error("Invalid root directory: %m");
294 arg_user = strdup(optarg);
300 case ARG_NETWORK_BRIDGE:
301 arg_network_bridge = optarg;
305 case ARG_NETWORK_VETH:
306 arg_network_veth = true;
307 arg_private_network = true;
310 case ARG_NETWORK_INTERFACE:
311 if (strv_extend(&arg_network_interfaces, optarg) < 0)
314 arg_private_network = true;
317 case ARG_NETWORK_MACVLAN:
318 if (strv_extend(&arg_network_macvlan, optarg) < 0)
323 case ARG_PRIVATE_NETWORK:
324 arg_private_network = true;
332 r = sd_id128_from_string(optarg, &arg_uuid);
334 log_error("Invalid UUID: %s", optarg);
344 if (isempty(optarg)) {
349 if (!hostname_is_valid(optarg)) {
350 log_error("Invalid machine name: %s", optarg);
355 arg_machine = strdup(optarg);
363 arg_selinux_context = optarg;
367 arg_selinux_apifs_context = optarg;
371 arg_read_only = true;
375 case ARG_DROP_CAPABILITY: {
379 FOREACH_WORD_SEPARATOR(word, length, optarg, ",", state) {
380 _cleanup_free_ char *t;
383 t = strndup(word, length);
387 if (streq(t, "all")) {
388 if (c == ARG_CAPABILITY)
389 plus = (uint64_t) -1;
391 minus = (uint64_t) -1;
393 if (cap_from_name(t, &cap) < 0) {
394 log_error("Failed to parse capability %s.", t);
398 if (c == ARG_CAPABILITY)
399 plus |= 1ULL << (uint64_t) cap;
401 minus |= 1ULL << (uint64_t) cap;
/* NOTE(review): the case label for the next assignment is not visible in
 * this listing (presumably 'j'). help() documents -j as
 * --link-journal=host, while this sets LINK_GUEST; confirm. */
409 arg_link_journal = LINK_GUEST;
412 case ARG_LINK_JOURNAL:
413 if (streq(optarg, "auto"))
414 arg_link_journal = LINK_AUTO;
415 else if (streq(optarg, "no"))
416 arg_link_journal = LINK_NO;
417 else if (streq(optarg, "guest"))
418 arg_link_journal = LINK_GUEST;
419 else if (streq(optarg, "host"))
420 arg_link_journal = LINK_HOST;
422 log_error("Failed to parse link journal mode %s", optarg);
430 _cleanup_free_ char *a = NULL, *b = NULL;
434 x = c == ARG_BIND ? &arg_bind : &arg_bind_ro;
436 e = strchr(optarg, ':');
438 a = strndup(optarg, e - optarg);
448 if (!path_is_absolute(a) || !path_is_absolute(b)) {
449 log_error("Invalid bind mount specification: %s", optarg);
453 r = strv_extend(x, a);
457 r = strv_extend(x, b);
467 if (!env_assignment_is_valid(optarg)) {
468 log_error("Environment variable assignment '%s' is not valid.", optarg);
472 n = strv_env_set(arg_setenv, optarg);
476 strv_free(arg_setenv);
485 case ARG_SHARE_SYSTEM:
486 arg_share_system = true;
490 r = parse_boolean(optarg);
492 log_error("Failed to parse --register= argument: %s", optarg);
500 arg_keep_unit = true;
503 case ARG_PERSONALITY:
505 arg_personality = personality_from_string(optarg);
506 if (arg_personality == 0xffffffffLU) {
507 log_error("Unknown or unsupported personality '%s'.", optarg);
517 assert_not_reached("Unhandled option");
/* --share-system turns machine registration off. */
521 if (arg_share_system)
522 arg_register = false;
524 if (arg_boot && arg_share_system) {
525 log_error("--boot and --share-system may not be combined.");
529 if (arg_keep_unit && cg_pid_get_owner_uid(0, NULL) >= 0) {
530 log_error("--keep-unit may not be used when invoked from a user session.");
534 if (arg_directory && arg_image) {
535 log_error("--directory= and --image= may not be combined.");
/* Final capability mask: defaults, plus --capability=, plus CAP_NET_ADMIN
 * when a private network was requested; --drop-capability= is applied last
 * and therefore wins over all of them. */
539 arg_retain = (arg_retain | plus | (arg_private_network ? 1ULL << CAP_NET_ADMIN : 0)) & ~minus;
/* Mounts the entries of mount_table below the container root dest.
 *
 * For every entry the loop builds where = dest + "/" + entry.where, asks
 * path_is_mount_point() about it, skips entries that have a source
 * (what != NULL) and are already mounted, creates the directory with
 * mkdir_p(), and calls mount(). When arg_selinux_apifs_context is set, a
 * context="..." option is appended for entries whose source is "tmpfs" or
 * "devpts". The condition around the mount() call also tests entry.fatal;
 * the complete condition and the return paths are not visible in this
 * listing. */
544 static int mount_all(const char *dest) {
546 typedef struct MountPoint {
/* Columns, going by how the fields are used in the loop: what, where,
 * (third column, presumably the file system type), options, flags, fatal. */
555 static const MountPoint mount_table[] = {
556 { "proc", "/proc", "proc", NULL, MS_NOSUID|MS_NOEXEC|MS_NODEV, true },
557 { "/proc/sys", "/proc/sys", NULL, NULL, MS_BIND, true }, /* Bind mount first */
558 { NULL, "/proc/sys", NULL, NULL, MS_BIND|MS_RDONLY|MS_REMOUNT, true }, /* Then, make it r/o */
559 { "sysfs", "/sys", "sysfs", NULL, MS_RDONLY|MS_NOSUID|MS_NOEXEC|MS_NODEV, true },
560 { "tmpfs", "/dev", "tmpfs", "mode=755", MS_NOSUID|MS_STRICTATIME, true },
561 { "devpts", "/dev/pts", "devpts","newinstance,ptmxmode=0666,mode=620,gid=" STRINGIFY(TTY_GID), MS_NOSUID|MS_NOEXEC, true },
562 { "tmpfs", "/dev/shm", "tmpfs", "mode=1777", MS_NOSUID|MS_NODEV|MS_STRICTATIME, true },
563 { "tmpfs", "/run", "tmpfs", "mode=755", MS_NOSUID|MS_NODEV|MS_STRICTATIME, true },
565 { "/sys/fs/selinux", "/sys/fs/selinux", NULL, NULL, MS_BIND, false }, /* Bind mount first */
566 { NULL, "/sys/fs/selinux", NULL, NULL, MS_BIND|MS_RDONLY|MS_REMOUNT, false }, /* Then, make it r/o */
573 for (k = 0; k < ELEMENTSOF(mount_table); k++) {
574 _cleanup_free_ char *where = NULL;
576 _cleanup_free_ char *options = NULL;
581 where = strjoin(dest, "/", mount_table[k].where, NULL);
585 t = path_is_mount_point(where, true);
587 log_error("Failed to detect whether %s is a mount point: %s", where, strerror(-t));
595 /* Skip this entry if it is not a remount. */
596 if (mount_table[k].what && t > 0)
/* The result of mkdir_p() is not checked here. */
599 mkdir_p(where, 0755);
602 if (arg_selinux_apifs_context &&
603 (streq_ptr(mount_table[k].what, "tmpfs") || streq_ptr(mount_table[k].what, "devpts"))) {
604 options = strjoin(mount_table[k].options, ",context=\"", arg_selinux_apifs_context, "\"", NULL);
611 o = mount_table[k].options;
614 if (mount(mount_table[k].what,
617 mount_table[k].flags,
619 mount_table[k].fatal) {
621 log_error("mount(%s) failed: %m", where);
/* Bind mounts host paths into the container.
 *
 * l is walked as (source, destination) pairs (*x, *y); the target is
 * where = dest + *y. If the target exists its file type must match the
 * source's; if it does not exist (ENOENT), parent directories are created
 * and a mount point of the matching kind is made (directory, FIFO, socket
 * or regular file; other source types are refused with an error). The
 * source is then bind mounted onto where and, when flags is non-zero, a
 * second mount() with MS_REMOUNT|MS_BIND|flags applies them.
 *
 * NOTE(review): return statements and some branches are missing from this
 * listing. */
631 static int mount_binds(const char *dest, char **l, unsigned long flags) {
634 STRV_FOREACH_PAIR(x, y, l) {
636 struct stat source_st, dest_st;
639 if (stat(*x, &source_st) < 0) {
640 log_error("Failed to stat %s: %m", *x);
644 where = strappenda(dest, *y);
645 r = stat(where, &dest_st);
647 if ((source_st.st_mode & S_IFMT) != (dest_st.st_mode & S_IFMT)) {
648 log_error("The file types of %s and %s do not match. Refusing bind mount",
652 } else if (errno == ENOENT) {
653 r = mkdir_parents_label(where, 0755);
655 log_error("Failed to bind mount %s: %s", *x, strerror(-r));
659 log_error("Failed to bind mount %s: %s", *x, strerror(errno));
662 /* Create the mount point, but be conservative -- refuse to create block
663 * and char devices. */
664 if (S_ISDIR(source_st.st_mode))
665 mkdir_label(where, 0755);
666 else if (S_ISFIFO(source_st.st_mode))
668 else if (S_ISSOCK(source_st.st_mode))
669 mknod(where, 0644 | S_IFSOCK, 0);
670 else if (S_ISREG(source_st.st_mode))
673 log_error("Refusing to create mountpoint for file: %s", *x);
677 if (mount(*x, where, "bind", MS_BIND, NULL) < 0) {
678 log_error("mount(%s) failed: %m", where);
/* Extra flags cannot be set by the initial bind mount above, hence the
 * separate remount; it is skipped when flags is 0. */
682 if (flags && mount(NULL, where, NULL, MS_REMOUNT|MS_BIND|flags, NULL) < 0) {
683 log_error("mount(%s) failed: %m", where);
/* Makes the container's /etc/localtime name the same time zone as the
 * host's.
 *
 * The host's /etc/localtime must be a symlink into /usr/share/zoneinfo/
 * (relative "../usr/share/zoneinfo/" or absolute form); z is the zone name
 * after that prefix. The container's link (dest + "/etc/localtime") is
 * parsed the same way into y; if both name the same zone nothing more is
 * needed. Otherwise the zone file must exist inside the container before a
 * relative symlink "../usr/share/zoneinfo/<z>" is created at where. The
 * problems detected before the symlink() call are reported with
 * log_warning(); a failing symlink() is reported with log_error().
 *
 * NOTE(review): the lines between building what and the symlink() call are
 * not visible in this listing (presumably the old link is removed first). */
691 static int setup_timezone(const char *dest) {
692 _cleanup_free_ char *where = NULL, *p = NULL, *q = NULL, *check = NULL, *what = NULL;
698 /* Fix the timezone, if possible */
699 r = readlink_malloc("/etc/localtime", &p);
701 log_warning("/etc/localtime is not a symlink, not updating container timezone.");
705 z = path_startswith(p, "../usr/share/zoneinfo/");
707 z = path_startswith(p, "/usr/share/zoneinfo/");
709 log_warning("/etc/localtime does not point into /usr/share/zoneinfo/, not updating container timezone.");
713 where = strappend(dest, "/etc/localtime");
717 r = readlink_malloc(where, &q);
719 y = path_startswith(q, "../usr/share/zoneinfo/");
721 y = path_startswith(q, "/usr/share/zoneinfo/");
724 /* Already pointing to the right place? Then do nothing .. */
725 if (y && streq(y, z))
729 check = strjoin(dest, "/usr/share/zoneinfo/", z, NULL);
733 if (access(check, F_OK) < 0) {
734 log_warning("Timezone %s does not exist in container, not updating container timezone.", z);
738 what = strappend("../usr/share/zoneinfo/", z);
743 if (symlink(what, where) < 0) {
744 log_error("Failed to correct timezone of container: %m");
/* Copies the host's /etc/resolv.conf to dest + "/etc/resolv.conf".
 *
 * The copy is best effort: the result of copy_file() is deliberately
 * ignored. arg_private_network is tested first; the statement it guards is
 * not visible in this listing (presumably an early return, since a private
 * network has no use for the host's resolver configuration). */
751 static int setup_resolv_conf(const char *dest) {
752 char _cleanup_free_ *where = NULL;
756 if (arg_private_network)
759 /* Fix resolv.conf, if possible */
760 where = strappend(dest, "/etc/resolv.conf");
764 /* We don't really care for the results of this really. If it
765 * fails, it fails, but meh... */
766 copy_file("/etc/resolv.conf", where, O_TRUNC|O_NOFOLLOW);
/* Gives the container its own boot ID.
 *
 * A random 128-bit ID is generated with sd_id128_randomize(), formatted as
 * a dash-separated UUID string and written to the helper file
 * dest + "/dev/proc-sys-kernel-random-boot-id". That file is then bind
 * mounted over dest + "/proc/sys/kernel/random/boot_id" and remounted
 * read-only; a failing remount only produces a warning.
 *
 * arg_share_system is tested first; the statement it guards is not visible
 * in this listing (presumably an early return). */
771 static int setup_boot_id(const char *dest) {
772 _cleanup_free_ char *from = NULL, *to = NULL;
779 if (arg_share_system)
782 /* Generate a new randomized boot ID, so that each boot-up of
783 * the container gets a new one */
785 from = strappend(dest, "/dev/proc-sys-kernel-random-boot-id");
786 to = strappend(dest, "/proc/sys/kernel/random/boot_id");
790 r = sd_id128_randomize(&rnd);
792 log_error("Failed to generate random boot id: %s", strerror(-r));
796 snprintf(as_uuid, sizeof(as_uuid),
797 "%02x%02x%02x%02x-%02x%02x-%02x%02x-%02x%02x-%02x%02x%02x%02x%02x%02x",
798 SD_ID128_FORMAT_VAL(rnd));
799 char_array_0(as_uuid);
801 r = write_string_file(from, as_uuid);
803 log_error("Failed to write boot id: %s", strerror(-r));
807 if (mount(from, to, "bind", MS_BIND, NULL) < 0) {
808 log_error("Failed to bind mount boot id: %m");
810 } else if (mount(from, to, "bind", MS_BIND|MS_REMOUNT|MS_RDONLY, NULL))
811 log_warning("Failed to make boot id read-only: %m");
/* Re-creates selected host device nodes below dest + "/dev/".
 *
 * For every name d in the NUL-separated devnodes list (its entries are not
 * visible in this listing) the host node "/dev/" + d is stat()ed. A stat()
 * failure other than ENOENT is logged, anything that is neither a character
 * nor a block device is rejected, and otherwise mknod() creates
 * dest + "/dev/" + d with the same mode and device number. */
817 static int copy_devnodes(const char *dest) {
819 static const char devnodes[] =
829 _cleanup_umask_ mode_t u;
835 NULSTR_FOREACH(d, devnodes) {
836 _cleanup_free_ char *from = NULL, *to = NULL;
839 from = strappend("/dev/", d);
840 to = strjoin(dest, "/dev/", d, NULL);
844 if (stat(from, &st) < 0) {
846 if (errno != ENOENT) {
847 log_error("Failed to stat %s: %m", from);
851 } else if (!S_ISCHR(st.st_mode) && !S_ISBLK(st.st_mode)) {
853 log_error("%s is not a char or block device, cannot copy", from);
856 } else if (mknod(to, st.st_mode, st.st_rdev) < 0) {
/* NOTE(review): the message prints dest (the container root), not to, which
 * is the path that was actually passed to mknod(); looks like a slip. */
858 log_error("mknod(%s) failed: %m", dest);
/* Creates dest + "/dev/ptmx" as a relative symlink to "pts/ptmx" and logs
 * an error if symlink() fails. The return paths are not visible in this
 * listing. */
866 static int setup_ptmx(const char *dest) {
867 _cleanup_free_ char *p = NULL;
869 p = strappend(dest, "/dev/ptmx");
873 if (symlink("pts/ptmx", p) < 0) {
874 log_error("Failed to create /dev/ptmx symlink: %m");
/* Makes the TTY named by console available as dest + "/dev/console".
 *
 * Steps visible below: console is stat()ed and must be a character device;
 * its mode and ownership are set with chmod_and_chown(console, 0600, 0, 0);
 * a device node with the same type bits and device number (permission bits
 * replaced by 0600) is created at dest + "/dev/console"; finally console is
 * bind mounted over that node.
 *
 * NOTE(review): in this listing the block comment that starts at "We need
 * to bind mount" has no visible terminator (its last line is missing), so
 * no comments are added below that point. */
881 static int setup_dev_console(const char *dest, const char *console) {
883 _cleanup_free_ char *to = NULL;
885 _cleanup_umask_ mode_t u;
892 if (stat(console, &st) < 0) {
893 log_error("Failed to stat %s: %m", console);
896 } else if (!S_ISCHR(st.st_mode)) {
897 log_error("/dev/console is not a char device");
901 r = chmod_and_chown(console, 0600, 0, 0);
903 log_error("Failed to correct access mode for TTY: %s", strerror(-r));
907 if (asprintf(&to, "%s/dev/console", dest) < 0)
910 /* We need to bind mount the right tty to /dev/console since
911 * ptys can only exist on pts file systems. To have something
912 * to bind mount things on we create a device node first, that
913 * has the right major/minor (note that the major minor
914 * doesn't actually matter here, since we mount it over
917 if (mknod(to, (st.st_mode & ~07777) | 0600, st.st_rdev) < 0) {
918 log_error("mknod() for /dev/console failed: %m");
922 if (mount(console, to, "bind", MS_BIND, NULL) < 0) {
923 log_error("Bind mount for /dev/console failed: %m");
/* Provides a FIFO in place of the kernel log for the container.
 *
 * A FIFO is created at dest + "/dev/kmsg" (mode 0600, ownership set with
 * chmod_and_chown(from, 0600, 0, 0)) and bind mounted onto
 * dest + "/proc/kmsg". The FIFO is then opened and the descriptor is sent
 * over kmsg_socket as an SCM_RIGHTS control message; the local copy is
 * closed right after sendmsg().
 *
 * kmsg_socket must be a valid descriptor (asserted).
 *
 * NOTE(review): the statement following the final "make the FIFO
 * unavailable" comment is not visible in this listing. */
930 static int setup_kmsg(const char *dest, int kmsg_socket) {
931 _cleanup_free_ char *from = NULL, *to = NULL;
933 _cleanup_umask_ mode_t u;
935 struct cmsghdr cmsghdr;
936 uint8_t buf[CMSG_SPACE(sizeof(int))];
939 .msg_control = &control,
940 .msg_controllen = sizeof(control),
942 struct cmsghdr *cmsg;
945 assert(kmsg_socket >= 0);
949 /* We create the kmsg FIFO as /dev/kmsg, but immediately
950 * delete it after bind mounting it to /proc/kmsg. While FIFOs
951 * on the reading side behave very similar to /proc/kmsg,
952 * their writing side behaves differently from /dev/kmsg in
953 * that writing blocks when nothing is reading. In order to
954 * avoid any problems with containers deadlocking due to this
955 * we simply make /dev/kmsg unavailable to the container. */
956 if (asprintf(&from, "%s/dev/kmsg", dest) < 0 ||
957 asprintf(&to, "%s/proc/kmsg", dest) < 0)
960 if (mkfifo(from, 0600) < 0) {
961 log_error("mkfifo() for /dev/kmsg failed: %m");
965 r = chmod_and_chown(from, 0600, 0, 0);
967 log_error("Failed to correct access mode for /dev/kmsg: %s", strerror(-r));
971 if (mount(from, to, "bind", MS_BIND, NULL) < 0) {
972 log_error("Bind mount for /proc/kmsg failed: %m");
976 fd = open(from, O_RDWR|O_NDELAY|O_CLOEXEC);
978 log_error("Failed to open fifo: %m");
/* Build a single SCM_RIGHTS control message that carries fd. */
982 cmsg = CMSG_FIRSTHDR(&mh);
983 cmsg->cmsg_level = SOL_SOCKET;
984 cmsg->cmsg_type = SCM_RIGHTS;
985 cmsg->cmsg_len = CMSG_LEN(sizeof(int));
986 memcpy(CMSG_DATA(cmsg), &fd, sizeof(int));
988 mh.msg_controllen = cmsg->cmsg_len;
990 /* Store away the fd in the socket, so that it stays open as
991 * long as we run the child */
992 k = sendmsg(kmsg_socket, &mh, MSG_DONTWAIT|MSG_NOSIGNAL);
993 close_nointr_nofail(fd);
996 log_error("Failed to send FIFO fd: %m");
1000 /* And now make the FIFO unavailable as /dev/kmsg... */
/* Sets the host name to arg_machine with sethostname().
 * arg_share_system is tested first; the statement it guards and the return
 * paths are not visible in this listing (presumably an early return).
 * arg_machine is passed to strlen(), so it must not be NULL here. */
1005 static int setup_hostname(void) {
1007 if (arg_share_system)
1010 if (sethostname(arg_machine, strlen(arg_machine)) < 0)
/* Connects the container's journal directory with the host's, as selected
 * by arg_link_journal.
 *
 * Visible steps:
 *  - read the container's machine ID from directory + "/etc/machine-id" and
 *    validate it with sd_id128_from_string();
 *  - compare it with the host's ID from sd_id128_get_machine(); equal IDs
 *    are logged (warning in LINK_AUTO mode, error otherwise) and the
 *    journals are not linked;
 *  - from here on p is "/var/log/journal/<id>" on the host and q is the
 *    same path below directory;
 *  - if p or q already is a mount point this is logged as an error unless
 *    the mode is LINK_AUTO;
 *  - LINK_GUEST creates a symlink at p that points to q;
 *  - LINK_HOST creates p, and the last step bind mounts p onto q after
 *    checking that q is empty.
 *
 * NOTE(review): many lines (returns, parts of conditions) are missing from
 * this listing, so the exact control flow between these steps is not
 * documented here.
 * NOTE(review): several messages below use %m directly after mkdir_p() or
 * readlink_and_make_absolute(), whose result is held in r; confirm that
 * errno is meaningful at those points. */
1016 static int setup_journal(const char *directory) {
1017 sd_id128_t machine_id, this_id;
1018 _cleanup_free_ char *p = NULL, *b = NULL, *q = NULL, *d = NULL;
1022 p = strappend(directory, "/etc/machine-id");
1026 r = read_one_line_file(p, &b);
1027 if (r == -ENOENT && arg_link_journal == LINK_AUTO)
1030 log_error("Failed to read machine ID from %s: %s", p, strerror(-r));
1035 if (isempty(id) && arg_link_journal == LINK_AUTO)
1038 /* Verify validity */
1039 r = sd_id128_from_string(id, &machine_id);
1041 log_error("Failed to parse machine ID from %s: %s", p, strerror(-r));
1045 r = sd_id128_get_machine(&this_id);
1047 log_error("Failed to retrieve machine ID: %s", strerror(-r));
1051 if (sd_id128_equal(machine_id, this_id)) {
1052 log_full(arg_link_journal == LINK_AUTO ? LOG_WARNING : LOG_ERR,
1053 "Host and machine ids are equal (%s): refusing to link journals", id);
1054 if (arg_link_journal == LINK_AUTO)
1060 if (arg_link_journal == LINK_NO)
/* p is reused: from here on it is the host-side journal directory. */
1064 p = strappend("/var/log/journal/", id);
1065 q = strjoin(directory, "/var/log/journal/", id, NULL);
1069 if (path_is_mount_point(p, false) > 0) {
1070 if (arg_link_journal != LINK_AUTO) {
1071 log_error("%s: already a mount point, refusing to use for journal", p);
1078 if (path_is_mount_point(q, false) > 0) {
1079 if (arg_link_journal != LINK_AUTO) {
1080 log_error("%s: already a mount point, refusing to use for journal", q);
/* Inspect what currently exists at p: a symlink, something that is not a
 * symlink (-EINVAL), or nothing (-ENOENT). */
1087 r = readlink_and_make_absolute(p, &d);
1089 if ((arg_link_journal == LINK_GUEST ||
1090 arg_link_journal == LINK_AUTO) &&
1093 r = mkdir_p(q, 0755);
1095 log_warning("failed to create directory %s: %m", q);
1099 if (unlink(p) < 0) {
1100 log_error("Failed to remove symlink %s: %m", p);
1103 } else if (r == -EINVAL) {
1105 if (arg_link_journal == LINK_GUEST &&
1108 if (errno == ENOTDIR) {
1109 log_error("%s already exists and is neither a symlink nor a directory", p);
1112 log_error("Failed to remove %s: %m", p);
1116 } else if (r != -ENOENT) {
1117 log_error("readlink(%s) failed: %m", p);
1121 if (arg_link_journal == LINK_GUEST) {
1123 if (symlink(q, p) < 0) {
1124 log_error("Failed to symlink %s to %s: %m", q, p);
1128 r = mkdir_p(q, 0755);
1130 log_warning("failed to create directory %s: %m", q);
1134 if (arg_link_journal == LINK_HOST) {
1135 r = mkdir_p(p, 0755);
1137 log_error("Failed to create %s: %m", p);
1141 } else if (access(p, F_OK) < 0)
1144 if (dir_is_empty(q) == 0) {
1145 log_error("%s not empty.", q);
1149 r = mkdir_p(q, 0755);
1151 log_error("Failed to create %s: %m", q);
1155 if (mount(p, q, "bind", MS_BIND, NULL) < 0) {
1156 log_error("Failed to bind mount journal from host into guest: %m");
/* Creates the directory dest + "/dev/kdbus" (mode 0755) and bind mounts
 * path onto it. Both failures are logged with log_error(); the return paths
 * and any checks before the mkdir() are not visible in this listing. */
1163 static int setup_kdbus(const char *dest, const char *path) {
1169 p = strappenda(dest, "/dev/kdbus");
1170 if (mkdir(p, 0755) < 0) {
1171 log_error("Failed to create kdbus path: %m");
1175 if (mount(path, p, "bind", MS_BIND, NULL) < 0) {
1176 log_error("Failed to mount kdbus domain path: %m");
/* Passes the complement of arg_retain (i.e. every capability bit that is
 * not to be retained) to capability_bounding_set_drop() and returns its
 * result unchanged. */
1183 static int drop_capabilities(void) {
1184 return capability_bounding_set_drop(~arg_retain, false);
/* Announces the container to org.freedesktop.machine1 over the system bus.
 *
 * With arg_keep_unit a single sd_bus_call_method() on the Manager interface
 * is used. Otherwise the method call is assembled by hand so that an array
 * of (name, value) properties can be attached: "Slice" (only when arg_slice
 * is non-empty), "DevicePolicy" = "strict", and a "DeviceAllow" list with 8
 * entries. Both variants pass arg_uuid and arg_directory (empty string when
 * NULL).
 *
 * NOTE(review): the method names, the remaining arguments and the return
 * paths are not visible in this listing. The block comment inside the
 * DeviceAllow argument list also lacks its terminating line here, so no
 * comments are added in that region. */
1187 static int register_machine(pid_t pid) {
1188 _cleanup_bus_error_free_ sd_bus_error error = SD_BUS_ERROR_NULL;
1189 _cleanup_bus_unref_ sd_bus *bus = NULL;
1195 r = sd_bus_default_system(&bus);
1197 log_error("Failed to open system bus: %s", strerror(-r));
1201 if (arg_keep_unit) {
1202 r = sd_bus_call_method(
1204 "org.freedesktop.machine1",
1205 "/org/freedesktop/machine1",
1206 "org.freedesktop.machine1.Manager",
1212 SD_BUS_MESSAGE_APPEND_ID128(arg_uuid),
1216 strempty(arg_directory));
1218 _cleanup_bus_message_unref_ sd_bus_message *m = NULL;
1220 r = sd_bus_message_new_method_call(
1223 "org.freedesktop.machine1",
1224 "/org/freedesktop/machine1",
1225 "org.freedesktop.machine1.Manager",
1228 log_error("Failed to create message: %s", strerror(-r));
1232 r = sd_bus_message_append(
1236 SD_BUS_MESSAGE_APPEND_ID128(arg_uuid),
1240 strempty(arg_directory));
1242 log_error("Failed to append message arguments: %s", strerror(-r));
/* Property array: each element is a (string, variant) pair. */
1246 r = sd_bus_message_open_container(m, 'a', "(sv)");
1248 log_error("Failed to open container: %s", strerror(-r));
1252 if (!isempty(arg_slice)) {
1253 r = sd_bus_message_append(m, "(sv)", "Slice", "s", arg_slice);
1255 log_error("Failed to append slice: %s", strerror(-r));
1260 r = sd_bus_message_append(m, "(sv)", "DevicePolicy", "s", "strict");
1262 log_error("Failed to add device policy: %s", strerror(-r));
1266 r = sd_bus_message_append(m, "(sv)", "DeviceAllow", "a(ss)", 8,
1267 /* Allow the container to
1268 * access and create the API
1269 * device nodes, so that
1270 * PrivateDevices= in the
1271 * container can work
1276 "/dev/random", "rwm",
1277 "/dev/urandom", "rwm",
1279 /* Allow the container
1280 * access to ptys. However,
1282 * container to ever create
1283 * these device nodes. */
1284 "/dev/pts/ptmx", "rw",
1287 log_error("Failed to add device whitelist: %s", strerror(-r));
1291 r = sd_bus_message_close_container(m);
1293 log_error("Failed to close container: %s", strerror(-r));
1297 r = sd_bus_call(bus, m, 0, &error, NULL);
1301 log_error("Failed to register machine: %s", bus_error_message(&error, r));
/* Asks org.freedesktop.machine1 to terminate the machine.
 *
 * Two bus calls are visible: one on the Manager interface whose reply
 * carries an object path (read with signature "o"), and a second one on the
 * Machine interface. A failure of either call is only logged at debug
 * level; as the existing comment explains, the machine may already have
 * been cleaned up.
 *
 * NOTE(review): the method names, the object passed to the second call and
 * the return paths are not visible in this listing. */
1308 static int terminate_machine(pid_t pid) {
1309 _cleanup_bus_error_free_ sd_bus_error error = SD_BUS_ERROR_NULL;
1310 _cleanup_bus_message_unref_ sd_bus_message *reply = NULL;
1311 _cleanup_bus_unref_ sd_bus *bus = NULL;
1318 r = sd_bus_default_system(&bus);
1320 log_error("Failed to open system bus: %s", strerror(-r));
1324 r = sd_bus_call_method(
1326 "org.freedesktop.machine1",
1327 "/org/freedesktop/machine1",
1328 "org.freedesktop.machine1.Manager",
1335 /* Note that the machine might already have been
1336 * cleaned up automatically, hence don't consider it a
1337 * failure if we cannot get the machine object. */
1338 log_debug("Failed to get machine: %s", bus_error_message(&error, r));
1342 r = sd_bus_message_read(reply, "o", &path);
1344 return bus_log_parse_error(r);
1346 r = sd_bus_call_method(
1348 "org.freedesktop.machine1",
1350 "org.freedesktop.machine1.Machine",
1356 log_debug("Failed to terminate machine: %s", bus_error_message(&error, r));
/* Resets the audit login UID of the calling process.
 *
 * /proc/self/loginuid is read; if it already holds "4294967295" (the
 * decimal form of (uint32_t) -1) nothing is written. Otherwise that value
 * is written back, and a failure produces the long explanatory error
 * message below.
 *
 * arg_share_system is tested first; the statement it guards is not visible
 * in this listing (presumably an early return).
 *
 * NOTE(review): the message reads "or to off auditing"; probably "turn off"
 * was meant. It also announces a 5s sleep; the sleep call itself is not
 * visible in this listing. */
1363 static int reset_audit_loginuid(void) {
1364 _cleanup_free_ char *p = NULL;
1367 if (arg_share_system)
1370 r = read_one_line_file("/proc/self/loginuid", &p);
1374 log_error("Failed to read /proc/self/loginuid: %s", strerror(-r));
1378 /* Already reset? */
1379 if (streq(p, "4294967295"))
1382 r = write_string_file("/proc/self/loginuid", "4294967295");
1384 log_error("Failed to reset audit login UID. This probably means that your kernel is too\n"
1385 "old and you have audit enabled. Note that the auditing subsystem is known to\n"
1386 "be incompatible with containers on old kernels. Please make sure to upgrade\n"
1387 "your kernel or to off auditing with 'audit=0' on the kernel command line before\n"
1388 "using systemd-nspawn. Sleeping for 5s... (%s)\n", strerror(-r));
/* Creates a veth pair through rtnetlink.
 *
 * The host-side name is written into iface_name: the prefix "vb-" when
 * arg_network_bridge is set, "ve-" otherwise, followed by arg_machine. The
 * peer is named "host0" and gets IFLA_NET_NS_PID = pid, i.e. it is created
 * in the network namespace of that process.
 *
 * The function first tests arg_private_network and arg_network_veth; the
 * statements those tests guard and all return paths are not visible in this
 * listing.
 *
 * Message layout: IFLA_IFNAME, then nested IFLA_LINKINFO containing
 * IFLA_INFO_KIND "veth" and IFLA_INFO_DATA, which contains VETH_INFO_PEER
 * with the peer's IFLA_IFNAME and IFLA_NET_NS_PID. */
1396 static int setup_veth(pid_t pid, char iface_name[IFNAMSIZ]) {
1397 _cleanup_rtnl_message_unref_ sd_rtnl_message *m = NULL;
1398 _cleanup_rtnl_unref_ sd_rtnl *rtnl = NULL;
1401 if (!arg_private_network)
1404 if (!arg_network_veth)
1407 /* Use two different interface name prefixes depending whether
1408 * we are in bridge mode or not. */
1409 if (arg_network_bridge)
1410 memcpy(iface_name, "vb-", 3);
1412 memcpy(iface_name, "ve-", 3);
/* NOTE(review): strncpy() may fill the buffer up to its last byte
 * (offset 3 plus IFNAMSIZ - 3 bytes) and writes no terminating NUL when
 * arg_machine has IFNAMSIZ - 3 or more characters. iface_name is used as a
 * C string right below; copying at most IFNAMSIZ - 4 bytes and terminating
 * explicitly would be safe. Confirm and fix. */
1414 strncpy(iface_name+3, arg_machine, IFNAMSIZ - 3);
1416 r = sd_rtnl_open(&rtnl, 0);
1418 log_error("Failed to connect to netlink: %s", strerror(-r));
1422 r = sd_rtnl_message_new_link(rtnl, &m, RTM_NEWLINK, 0);
1424 log_error("Failed to allocate netlink message: %s", strerror(-r));
1428 r = sd_rtnl_message_append_string(m, IFLA_IFNAME, iface_name);
1430 log_error("Failed to add netlink interface name: %s", strerror(-r));
1434 r = sd_rtnl_message_open_container(m, IFLA_LINKINFO);
1436 log_error("Failed to open netlink container: %s", strerror(-r));
1440 r = sd_rtnl_message_append_string(m, IFLA_INFO_KIND, "veth");
1442 log_error("Failed to append netlink kind: %s", strerror(-r));
1446 r = sd_rtnl_message_open_container(m, IFLA_INFO_DATA);
1448 log_error("Failed to open netlink container: %s", strerror(-r));
1452 r = sd_rtnl_message_open_container(m, VETH_INFO_PEER);
1454 log_error("Failed to open netlink container: %s", strerror(-r));
1458 r = sd_rtnl_message_append_string(m, IFLA_IFNAME, "host0");
1460 log_error("Failed to add netlink interface name: %s", strerror(-r));
1464 r = sd_rtnl_message_append_u32(m, IFLA_NET_NS_PID, pid);
1466 log_error("Failed to add netlink namespace field: %s", strerror(-r));
/* Close the three containers opened above, innermost first. */
1470 r = sd_rtnl_message_close_container(m);
1472 log_error("Failed to close netlink container: %s", strerror(-r));
1476 r = sd_rtnl_message_close_container(m);
1478 log_error("Failed to close netlink container: %s", strerror(-r));
1482 r = sd_rtnl_message_close_container(m);
1484 log_error("Failed to close netlink container: %s", strerror(-r));
1488 r = sd_rtnl_call(rtnl, m, 0, NULL);
1490 log_error("Failed to add new veth interfaces: %s", strerror(-r));
/* Attaches the host-side veth interface veth_name to the bridge named by
 * arg_network_bridge.
 *
 * The bridge name is resolved with if_nametoindex(), then an RTM_SETLINK
 * message carrying IFLA_IFNAME = veth_name and IFLA_MASTER = bridge index
 * is sent over rtnetlink.
 *
 * The function first tests arg_private_network, arg_network_veth and
 * arg_network_bridge; the statements those tests guard and all return
 * paths are not visible in this listing. */
1497 static int setup_bridge(const char veth_name[]) {
1498 _cleanup_rtnl_message_unref_ sd_rtnl_message *m = NULL;
1499 _cleanup_rtnl_unref_ sd_rtnl *rtnl = NULL;
1502 if (!arg_private_network)
1505 if (!arg_network_veth)
1508 if (!arg_network_bridge)
1511 bridge = (int) if_nametoindex(arg_network_bridge);
1513 log_error("Failed to resolve interface %s: %m", arg_network_bridge);
1517 r = sd_rtnl_open(&rtnl, 0);
1519 log_error("Failed to connect to netlink: %s", strerror(-r));
1523 r = sd_rtnl_message_new_link(rtnl, &m, RTM_SETLINK, 0);
1525 log_error("Failed to allocate netlink message: %s", strerror(-r));
1529 r = sd_rtnl_message_append_string(m, IFLA_IFNAME, veth_name);
1531 log_error("Failed to add netlink interface name field: %s", strerror(-r));
1535 r = sd_rtnl_message_append_u32(m, IFLA_MASTER, bridge);
1537 log_error("Failed to add netlink master field: %s", strerror(-r));
1541 r = sd_rtnl_call(rtnl, m, 0, NULL);
1543 log_error("Failed to add veth interface to bridge: %s", strerror(-r));
/* Resolves the network interface name to its index and checks it with udev.
 *
 * The index comes from if_nametoindex(); the udev device is looked up by
 * the device ID "n<index>", and an interface whose udev device is not
 * reported as initialized is rejected with an error message.
 *
 * NOTE(review): the return statements are not visible in this listing;
 * callers assign the result to a variable named ifi, so presumably the
 * interface index is returned on success. Confirm. */
1550 static int parse_interface(struct udev *udev, const char *name) {
1551 _cleanup_udev_device_unref_ struct udev_device *d = NULL;
/* Sized for 'n', the decimal digits of an int, and the terminating NUL. */
1552 char ifi_str[2 + DECIMAL_STR_MAX(int)];
1555 ifi = (int) if_nametoindex(name);
1557 log_error("Failed to resolve interface %s: %m", name);
1561 sprintf(ifi_str, "n%i", ifi);
1562 d = udev_device_new_from_device_id(udev, ifi_str);
1564 log_error("Failed to get udev device for interface %s: %m", name);
1568 if (udev_device_get_is_initialized(d) <= 0) {
1569 log_error("Network interface %s is not initialized yet.", name);
/* Moves every interface listed in arg_network_interfaces into the network
 * namespace of pid.
 *
 * For each name, parse_interface() supplies the interface index, and an
 * RTM_NEWLINK message for that index with IFLA_NET_NS_PID = pid is sent
 * over rtnetlink.
 *
 * The function first tests arg_private_network and whether the list is
 * empty; the statements those tests guard, the udev setup and all return
 * paths are not visible in this listing. */
1576 static int move_network_interfaces(pid_t pid) {
1577 _cleanup_udev_unref_ struct udev *udev = NULL;
1578 _cleanup_rtnl_unref_ sd_rtnl *rtnl = NULL;
1582 if (!arg_private_network)
1585 if (strv_isempty(arg_network_interfaces))
1588 r = sd_rtnl_open(&rtnl, 0);
1590 log_error("Failed to connect to netlink: %s", strerror(-r));
1596 log_error("Failed to connect to udev.");
1600 STRV_FOREACH(i, arg_network_interfaces) {
1601 _cleanup_rtnl_message_unref_ sd_rtnl_message *m = NULL;
1604 ifi = parse_interface(udev, *i);
1608 r = sd_rtnl_message_new_link(rtnl, &m, RTM_NEWLINK, ifi);
1610 log_error("Failed to allocate netlink message: %s", strerror(-r));
1614 r = sd_rtnl_message_append_u32(m, IFLA_NET_NS_PID, pid);
1616 log_error("Failed to append namespace PID to netlink message: %s", strerror(-r));
1620 r = sd_rtnl_call(rtnl, m, 0, NULL);
1622 log_error("Failed to move interface %s to namespace: %s", *i, strerror(-r));
/* Creates one macvlan interface per entry of arg_network_macvlan, placed in
 * the network namespace of pid.
 *
 * For each host interface name, parse_interface() supplies the index used
 * as IFLA_LINK. The new interface is named "mv-" + name, cut to at most
 * IFNAMSIZ-1 characters by strshorten(), and is created with
 * IFLA_INFO_KIND "macvlan" and IFLA_MACVLAN_MODE = MACVLAN_MODE_BRIDGE.
 *
 * The function first tests arg_private_network and whether the list is
 * empty; the statements those tests guard, the udev setup and all return
 * paths are not visible in this listing. */
1630 static int setup_macvlan(pid_t pid) {
1631 _cleanup_udev_unref_ struct udev *udev = NULL;
1632 _cleanup_rtnl_unref_ sd_rtnl *rtnl = NULL;
1636 if (!arg_private_network)
1639 if (strv_isempty(arg_network_macvlan))
1642 r = sd_rtnl_open(&rtnl, 0);
1644 log_error("Failed to connect to netlink: %s", strerror(-r));
1650 log_error("Failed to connect to udev.");
1654 STRV_FOREACH(i, arg_network_macvlan) {
1655 _cleanup_rtnl_message_unref_ sd_rtnl_message *m = NULL;
1656 _cleanup_free_ char *n = NULL;
1659 ifi = parse_interface(udev, *i);
1663 r = sd_rtnl_message_new_link(rtnl, &m, RTM_NEWLINK, 0);
1665 log_error("Failed to allocate netlink message: %s", strerror(-r));
1669 r = sd_rtnl_message_append_u32(m, IFLA_LINK, ifi);
1671 log_error("Failed to add netlink interface index: %s", strerror(-r));
1675 n = strappend("mv-", *i);
/* Keep the generated name within the interface name length limit. */
1679 strshorten(n, IFNAMSIZ-1);
1681 r = sd_rtnl_message_append_string(m, IFLA_IFNAME, n);
1683 log_error("Failed to add netlink interface name: %s", strerror(-r));
1687 r = sd_rtnl_message_append_u32(m, IFLA_NET_NS_PID, pid);
1689 log_error("Failed to add netlink namespace field: %s", strerror(-r));
1693 r = sd_rtnl_message_open_container(m, IFLA_LINKINFO);
1695 log_error("Failed to open netlink container: %s", strerror(-r));
1699 r = sd_rtnl_message_append_string(m, IFLA_INFO_KIND, "macvlan");
1701 log_error("Failed to append netlink kind: %s", strerror(-r));
1705 r = sd_rtnl_message_open_container(m, IFLA_INFO_DATA);
1707 log_error("Failed to open netlink container: %s", strerror(-r));
1711 r = sd_rtnl_message_append_u32(m, IFLA_MACVLAN_MODE, MACVLAN_MODE_BRIDGE);
1713 log_error("Failed to append macvlan mode: %s", strerror(-r));
1717 r = sd_rtnl_message_close_container(m);
1719 log_error("Failed to close netlink container: %s", strerror(-r));
1723 r = sd_rtnl_message_close_container(m);
1725 log_error("Failed to close netlink container: %s", strerror(-r));
1729 r = sd_rtnl_call(rtnl, m, 0, NULL);
1731 log_error("Failed to add new macvlan interfaces: %s", strerror(-r));
/* Installs a seccomp filter whose default action is SCMP_ACT_ALLOW and which
 * carries a single rule: fail with EAFNOSUPPORT when argument 0 equals
 * AF_NETLINK and argument 2 equals NETLINK_AUDIT. The explanatory text in the
 * body names socket() as the affected call.
 * NOTE(review): the delimiters of the in-body comment, the syscall argument
 * of seccomp_rule_add() and the error-path control flow are not visible in
 * this view. */
1739 static int audit_still_doesnt_work_in_containers(void) {
1742 scmp_filter_ctx seccomp;
1746 Audit is broken in containers, much of the userspace audit
1747 hookup will fail if running inside a container. We don't
1748 care and just turn off creation of audit sockets.
1750 This will make socket(AF_NETLINK, *, NETLINK_AUDIT) fail
1751 with EAFNOSUPPORT which audit userspace uses as indication
1752 that audit is disabled in the kernel.
/* Everything not matched by an explicit rule stays allowed. */
1755 seccomp = seccomp_init(SCMP_ACT_ALLOW);
1759 r = seccomp_add_secondary_archs(seccomp);
1761 log_error("Failed to add secondary archs to seccomp filter: %s", strerror(-r));
1765 r = seccomp_rule_add(
1767 SCMP_ACT_ERRNO(EAFNOSUPPORT),
1770 SCMP_A0(SCMP_CMP_EQ, AF_NETLINK),
1771 SCMP_A2(SCMP_CMP_EQ, NETLINK_AUDIT));
1773 log_error("Failed to add audit seccomp rule: %s", strerror(-r));
/* Turn the filter's NO_NEW_PRIVS control attribute off before loading. */
1777 r = seccomp_attr_set(seccomp, SCMP_FLTATR_CTL_NNP, 0);
1779 log_error("Failed to unset NO_NEW_PRIVS: %s", strerror(-r));
1783 r = seccomp_load(seccomp);
1785 log_error("Failed to install seccomp audit filter: %s", strerror(-r));
/* The filter context is released once it is no longer needed here. */
1788 seccomp_release(seccomp);
/* Opens arg_image and makes it available as a block device. A block device
 * is handled on its own path (only the strdup() of its name is visible).
 * Anything else must be a regular file: a free loop device number is taken
 * from /dev/loop-control, /dev/loop<nr> is opened, the image fd is attached
 * with LOOP_SET_FD and the flags in "info" are applied with
 * LOOP_SET_STATUS64. The loop device path is handed out via *device_path.
 * NOTE(review): the return statements, the use of loop_nr and the condition
 * guarding LO_FLAGS_READ_ONLY are not visible in this view. */
1796 static int setup_image(char **device_path, int *loop_nr) {
/* Auto-clear the loop device on last close and scan it for partitions. */
1797 struct loop_info64 info = {
1798 .lo_flags = LO_FLAGS_AUTOCLEAR|LO_FLAGS_PARTSCAN
1800 _cleanup_close_ int fd = -1, control = -1, loop = -1;
1801 _cleanup_free_ char* loopdev = NULL;
1805 assert(device_path);
/* Access mode follows arg_read_only. */
1808 fd = open(arg_image, O_CLOEXEC|(arg_read_only ? O_RDONLY : O_RDWR)|O_NONBLOCK|O_NOCTTY);
1810 log_error("Failed to open %s: %m", arg_image);
1814 if (fstat(fd, &st) < 0) {
1815 log_error("Failed to stat %s: %m", arg_image);
/* Block devices need no loop device. */
1819 if (S_ISBLK(st.st_mode)) {
1822 p = strdup(arg_image);
/* NOTE(review): no call has just failed when this message is printed (the
 * fstat() above succeeded), so the %m text looks stale. Confirm and consider
 * dropping %m. */
1836 if (!S_ISREG(st.st_mode)) {
1837 log_error("%s is not a regular file or block device: %m", arg_image);
1841 control = open("/dev/loop-control", O_RDWR|O_CLOEXEC|O_NOCTTY|O_NONBLOCK);
1843 log_error("Failed to open /dev/loop-control: %m");
/* Ask the loop control device for an unused loop device number. */
1847 nr = ioctl(control, LOOP_CTL_GET_FREE);
1849 log_error("Failed to allocate loop device: %m");
1853 if (asprintf(&loopdev, "/dev/loop%i", nr) < 0)
1856 loop = open(loopdev, O_CLOEXEC|(arg_read_only ? O_RDONLY : O_RDWR)|O_NONBLOCK|O_NOCTTY);
1858 log_error("Failed to open loop device %s: %m", loopdev);
/* Bind the image file to the loop device. */
1862 if (ioctl(loop, LOOP_SET_FD, fd) < 0) {
1863 log_error("Failed to set loopback file descriptor on %s: %m", loopdev);
/* Presumably only done when arg_read_only is set; the guard is not shown. */
1868 info.lo_flags |= LO_FLAGS_READ_ONLY;
1870 if (ioctl(loop, LOOP_SET_STATUS64, &info) < 0) {
1871 log_error("Failed to set loopback settings on %s: %m", loopdev);
1875 *device_path = loopdev;
/* Identifies the partitions of the image behind "fd". libblkid is used to
 * probe for a partition table, which must be of type "gpt". udev is then used
 * to enumerate the block devices that are children of the image's device.
 * Each partition is classified by its GPT type UUID (GPT_HOME, GPT_SRV and,
 * where defined, GPT_ROOT_NATIVE and GPT_ROOT_SECONDARY) and its device node
 * is remembered. A root or secondary root partition is required. The results
 * are handed out through *root_device and *home_device; the secondary root
 * is used when no native root was found.
 * NOTE(review): the parameter list, the handling of srv, the error-path
 * returns and the preprocessor guard around the final "compiled without
 * blkid" message are not visible in this view. */
1886 static int dissect_image(
/* Partition numbers of the current candidates; -1 while none is chosen. */
1894 int home_nr = -1, root_nr = -1, secondary_root_nr = -1, srv_nr = -1;
1895 _cleanup_free_ char *home = NULL, *root = NULL, *secondary_root = NULL, *srv = NULL;
1896 _cleanup_udev_enumerate_unref_ struct udev_enumerate *e = NULL;
1897 _cleanup_udev_device_unref_ struct udev_device *d = NULL;
1898 _cleanup_blkid_free_probe_ blkid_probe b = NULL;
1899 _cleanup_udev_unref_ struct udev *udev = NULL;
1900 struct udev_list_entry *first, *item;
1901 const char *pttype = NULL;
1907 assert(root_device);
1908 assert(home_device);
1912 b = blkid_new_probe();
1917 r = blkid_probe_set_device(b, fd, 0, 0);
1922 log_error("Failed to set device on blkid probe: %m");
/* Probe for partition tables and request per-entry details. */
1926 blkid_probe_enable_partitions(b, 1);
1927 blkid_probe_set_partitions_flags(b, BLKID_PARTS_ENTRY_DETAILS);
/* -2 and 1 are reported as "no partition table identified"; per the libblkid
 * documentation these presumably mean "ambivalent result" and "nothing
 * detected". Any other non-zero value is treated as a probing failure. */
1930 r = blkid_do_safeprobe(b);
1931 if (r == -2 || r == 1) {
1932 log_error("Failed to identify any partition table on %s.\n"
1933 "Note that the disk image needs to follow http://www.freedesktop.org/wiki/Specifications/DiscoverablePartitionsSpec/ to be supported by systemd-nspawn.", arg_image);
1935 } else if (r != 0) {
1938 log_error("Failed to probe: %m");
/* streq_ptr() is used so that a lookup leaving pttype NULL is rejected too. */
1942 blkid_probe_lookup_value(b, "PTTYPE", &pttype, NULL);
1943 if (!streq_ptr(pttype, "gpt")) {
1944 log_error("Image %s does not carry a GUID Partition Table.\n"
1945 "Note that the disk image needs to follow http://www.freedesktop.org/wiki/Specifications/DiscoverablePartitionsSpec/ to be supported by systemd-nspawn.", arg_image);
1950 pl = blkid_probe_get_partitions(b);
1955 log_error("Failed to list partitions of %s", arg_image);
/* st.st_rdev identifies the whole-disk device used for the udev lookup. */
1963 if (fstat(fd, &st) < 0) {
1964 log_error("Failed to stat block device: %m");
1968 d = udev_device_new_from_devnum(udev, 'b', st.st_rdev);
1972 e = udev_enumerate_new(udev);
/* Restrict the enumeration to devices that have "d" as parent. */
1976 r = udev_enumerate_add_match_parent(e, d);
1980 r = udev_enumerate_scan_devices(e);
1982 log_error("Failed to scan for partition devices of %s: %s", arg_image, strerror(-r));
1986 first = udev_enumerate_get_list_entry(e);
1987 udev_list_entry_foreach(item, first) {
1988 _cleanup_udev_device_unref_ struct udev_device *q;
1989 const char *stype, *node;
1996 q = udev_device_new_from_syspath(udev, udev_list_entry_get_name(item));
2001 log_error("Failed to get partition device of %s: %m", arg_image);
2005 qn = udev_device_get_devnum(q);
/* The enumeration may contain the whole-disk device itself. */
2009 if (st.st_rdev == qn)
2012 node = udev_device_get_devnode(q);
/* Map the udev device back to its blkid partition entry. */
2016 pp = blkid_partlist_devno_to_partition(pl, qn);
2020 nr = blkid_partition_get_partno(pp);
2024 stype = blkid_partition_get_type_string(pp);
/* The type string is parsed as a 128-bit ID for the comparisons below. */
2028 if (sd_id128_from_string(stype, &type_id) < 0)
2031 if (sd_id128_equal(type_id, GPT_HOME)) {
/* An existing candidate with a lower or equal partition number is kept
 * (the statement under this check is not shown; presumably it skips this
 * partition). The same pattern repeats for the other types. */
2033 if (home && nr >= home_nr)
2038 home = strdup(node);
2041 } else if (sd_id128_equal(type_id, GPT_SRV)) {
2043 if (srv && nr >= srv_nr)
2052 #ifdef GPT_ROOT_NATIVE
2053 else if (sd_id128_equal(type_id, GPT_ROOT_NATIVE)) {
2055 if (root && nr >= root_nr)
2060 root = strdup(node);
2065 #ifdef GPT_ROOT_SECONDARY
2066 else if (sd_id128_equal(type_id, GPT_ROOT_SECONDARY)) {
2068 if (secondary_root && nr >= secondary_root_nr)
2071 secondary_root_nr = nr;
2072 free(secondary_root);
2073 secondary_root = strdup(node);
2074 if (!secondary_root)
2080 if (!root && !secondary_root) {
2081 log_error("Failed to identify root partition in disk image %s.\n"
2082 "Note that the disk image needs to follow http://www.freedesktop.org/wiki/Specifications/DiscoverablePartitionsSpec/ to be supported by systemd-nspawn.", arg_image);
/* Ownership of the chosen strings passes to the caller. Clearing the local
 * pointer (visible for secondary_root) keeps the cleanup attribute from
 * freeing what was handed out. */
2087 *root_device = root;
2090 } else if (secondary_root) {
2091 *root_device = secondary_root;
2092 secondary_root = NULL;
2097 *home_device = home;
2108 log_error("--image= is not supported, compiled without blkid support.");
/* Mounts block device "what" onto the path built from "where" and
 * "directory". The file system type is detected with a libblkid superblock
 * probe, "crypto_LUKS" is refused, and the detected type is passed to
 * mount().
 * NOTE(review): the guards around the path construction (callers in this
 * file pass NULL as "directory"), the error-path returns and the preprocessor
 * guard around the final "compiled without blkid" message are not visible in
 * this view. */
2113 static int mount_device(const char *what, const char *where, const char *directory) {
2115 _cleanup_blkid_free_probe_ blkid_probe b = NULL;
2116 const char *fstype, *p;
/* Mount point: "where" with "directory" appended. */
2123 p = strappenda(where, directory);
2128 b = blkid_new_probe_from_filename(what);
2132 log_error("Failed to allocate prober for %s: %m", what);
/* Only the file system type is requested from the superblock probe. */
2136 blkid_probe_enable_superblocks(b, 1);
2137 blkid_probe_set_superblocks_flags(b, BLKID_SUBLKS_TYPE);
/* NOTE(review): dissect_image() in this file tests the same call's result
 * against -2 and 1, whereas -1 is tested here. Per the libblkid
 * documentation -2 is the "ambivalent result" code and -1 a plain error, so
 * -1 looks like a typo for -2. Confirm before changing. */
2140 r = blkid_do_safeprobe(b);
2141 if (r == -1 || r == 1) {
2142 log_error("Cannot determine file system type of %s", what);
2144 } else if (r != 0) {
2147 log_error("Failed to probe %s: %m", what);
2152 if (blkid_probe_lookup_value(b, "TYPE", &fstype, NULL) < 0) {
2155 log_error("Failed to determine file system type of %s", what);
2159 if (streq(fstype, "crypto_LUKS")) {
2160 log_error("nspawn currently does not support LUKS disk images.");
/* NOTE(review): MS_NODEV is applied only together with MS_RDONLY; a
 * read-write mount gets no flags at all. If device nodes inside the image
 * are never meant to be honoured, MS_NODEV presumably belongs in both
 * cases. Confirm the intent. */
2164 if (mount(what, p, fstype, arg_read_only ? MS_NODEV|MS_RDONLY : 0, NULL) < 0) {
2165 log_error("Failed to mount %s: %m", what);
2171 log_error("--image= is not supported, compiled without blkid support.");
/* Mounts the dissected partitions: the root device at the top of the tree,
 * the home device at "/home" and the srv device at "/srv". A failure of any
 * mount_device() call is logged with its own message.
 * NOTE(review): in the lines visible here the "where" parameter is never
 * used; every mount_device() call is given the global arg_directory
 * instead. The conditions guarding each call and the return statements are
 * not visible in this view. */
2176 static int mount_devices(const char *where, const char *root_device, const char *home_device, const char *srv_device) {
/* NULL as the sub-directory argument for the root file system. */
2182 r = mount_device(root_device, arg_directory, NULL);
2184 log_error("Failed to mount root directory: %s", strerror(-r));
2190 r = mount_device(home_device, arg_directory, "/home");
2192 log_error("Failed to mount home directory: %s", strerror(-r));
2198 r = mount_device(srv_device, arg_directory, "/srv");
2200 log_error("Failed to mount server data directory: %s", strerror(-r));
/* Tears down the loop device with number "nr". If *image_fd refers to an
 * open descriptor, its backing file is detached with LOOP_CLR_FD and the
 * descriptor is closed. /dev/loop-control is then asked to remove the
 * device. The ioctl() results are not checked in the lines visible here.
 * NOTE(review): any early-exit guards (for example on "nr" or on a failed
 * open of the control device) and the reset of *image_fd are not visible in
 * this view. */
2208 static void loop_remove(int nr, int *image_fd) {
2209 _cleanup_close_ int control = -1;
/* image_fd may be NULL, and the descriptor it points to may already be gone. */
2214 if (image_fd && *image_fd >= 0) {
2215 ioctl(*image_fd, LOOP_CLR_FD);
2216 close_nointr_nofail(*image_fd);
2220 control = open("/dev/loop-control", O_RDWR|O_CLOEXEC|O_NOCTTY|O_NONBLOCK);
2224 ioctl(control, LOOP_CTL_REMOVE, nr);
2227 int main(int argc, char *argv[]) {
2229 _cleanup_free_ char *kdbus_domain = NULL, *device_path = NULL, *root_device = NULL, *home_device = NULL, *srv_device = NULL;
2230 _cleanup_close_ int master = -1, kdbus_fd = -1, sync_fd = -1, image_fd = -1;
2231 _cleanup_close_pipe_ int kmsg_socket_pair[2] = { -1, -1 };
2232 _cleanup_fdset_free_ FDSet *fds = NULL;
2233 int r = EXIT_FAILURE, k, n_fd_passed, loop_nr = -1;
2234 const char *console = NULL;
2235 char veth_name[IFNAMSIZ];
2236 bool secondary = false;
2240 log_parse_environment();
2243 k = parse_argv(argc, argv);
2252 if (arg_directory) {
2255 p = path_make_absolute_cwd(arg_directory);
2256 free(arg_directory);
2259 arg_directory = get_current_dir_name();
2261 if (!arg_directory) {
2262 log_error("Failed to determine path, please use -D.");
2265 path_kill_slashes(arg_directory);
2269 arg_machine = strdup(basename(arg_image ? arg_image : arg_directory));
2275 hostname_cleanup(arg_machine, false);
2276 if (isempty(arg_machine)) {
2277 log_error("Failed to determine machine name automatically, please use -M.");
2282 if (geteuid() != 0) {
2283 log_error("Need to be root.");
2287 if (sd_booted() <= 0) {
2288 log_error("Not running on a systemd system.");
2293 n_fd_passed = sd_listen_fds(false);
2294 if (n_fd_passed > 0) {
2295 k = fdset_new_listen_fds(&fds, false);
2297 log_error("Failed to collect file descriptors: %s", strerror(-k));
2301 fdset_close_others(fds);
2304 if (arg_directory) {
2305 if (path_equal(arg_directory, "/")) {
2306 log_error("Spawning container on root directory not supported.");
2311 if (path_is_os_tree(arg_directory) <= 0) {
2312 log_error("Directory %s doesn't look like an OS root directory (/etc/os-release is missing). Refusing.", arg_directory);
2318 p = strappenda(arg_directory,
2319 argc > optind && path_is_absolute(argv[optind]) ? argv[optind] : "/usr/bin/");
2320 if (access(p, F_OK) < 0) {
2321 log_error("Directory %s lacks the binary to execute or doesn't look like a binary tree. Refusing.", arg_directory);
2327 char template[] = "/tmp/nspawn-root-XXXXXX";
2329 if (!mkdtemp(template)) {
2330 log_error("Failed to create temporary directory: %m");
2335 arg_directory = strdup(template);
2336 if (!arg_directory) {
2341 image_fd = setup_image(&device_path, &loop_nr);
2347 r = dissect_image(image_fd, &root_device, &home_device, &srv_device, &secondary);
2352 master = posix_openpt(O_RDWR|O_NOCTTY|O_CLOEXEC|O_NDELAY);
2354 log_error("Failed to acquire pseudo tty: %m");
2358 console = ptsname(master);
2360 log_error("Failed to determine tty name: %m");
2365 log_info("Spawning container %s on %s. Press ^] three times within 1s to abort execution.", arg_machine, arg_image ? arg_image : arg_directory);
2367 if (unlockpt(master) < 0) {
2368 log_error("Failed to unlock tty: %m");
2372 if (access("/dev/kdbus/control", F_OK) >= 0) {
2374 if (arg_share_system) {
2375 kdbus_domain = strdup("/dev/kdbus");
2376 if (!kdbus_domain) {
2383 ns = strappenda("machine-", arg_machine);
2384 kdbus_fd = bus_kernel_create_domain(ns, &kdbus_domain);
2386 log_debug("Failed to create kdbus domain: %s", strerror(-r));
2388 log_debug("Successfully created kdbus domain as %s", kdbus_domain);
2392 if (socketpair(AF_UNIX, SOCK_DGRAM|SOCK_NONBLOCK|SOCK_CLOEXEC, 0, kmsg_socket_pair) < 0) {
2393 log_error("Failed to create kmsg socket pair: %m");
2397 sd_notify(0, "READY=1");
2399 assert_se(sigemptyset(&mask) == 0);
2400 sigset_add_many(&mask, SIGCHLD, SIGWINCH, SIGTERM, SIGINT, -1);
2401 assert_se(sigprocmask(SIG_BLOCK, &mask, NULL) == 0);
2406 sync_fd = eventfd(0, EFD_CLOEXEC);
2408 log_error("Failed to create event fd: %m");
2412 pid = syscall(__NR_clone,
2413 SIGCHLD|CLONE_NEWNS|
2414 (arg_share_system ? 0 : CLONE_NEWIPC|CLONE_NEWPID|CLONE_NEWUTS)|
2415 (arg_private_network ? CLONE_NEWNET : 0), NULL);
2417 if (errno == EINVAL)
2418 log_error("clone() failed, do you have namespace support enabled in your kernel? (You need UTS, IPC, PID and NET namespacing built in): %m");
2420 log_error("clone() failed: %m");
2427 const char *home = NULL;
2428 uid_t uid = (uid_t) -1;
2429 gid_t gid = (gid_t) -1;
2431 const char *envp[] = {
2432 "PATH=" DEFAULT_PATH_SPLIT_USR,
2433 "container=systemd-nspawn", /* LXC sets container=lxc, so follow the scheme here */
2438 NULL, /* container_uuid */
2439 NULL, /* LISTEN_FDS */
2440 NULL, /* LISTEN_PID */
2446 envp[n_env] = strv_find_prefix(environ, "TERM=");
2450 close_nointr_nofail(master);
2453 close_nointr(STDIN_FILENO);
2454 close_nointr(STDOUT_FILENO);
2455 close_nointr(STDERR_FILENO);
2457 close_nointr_nofail(kmsg_socket_pair[0]);
2458 kmsg_socket_pair[0] = -1;
2460 reset_all_signal_handlers();
2462 assert_se(sigemptyset(&mask) == 0);
2463 assert_se(sigprocmask(SIG_SETMASK, &mask, NULL) == 0);
2465 k = open_terminal(console, O_RDWR);
2466 if (k != STDIN_FILENO) {
2468 close_nointr_nofail(k);
2472 log_error("Failed to open console: %s", strerror(-k));
2476 if (dup2(STDIN_FILENO, STDOUT_FILENO) != STDOUT_FILENO ||
2477 dup2(STDIN_FILENO, STDERR_FILENO) != STDERR_FILENO) {
2478 log_error("Failed to duplicate console: %m");
2483 log_error("setsid() failed: %m");
2487 if (reset_audit_loginuid() < 0)
2490 if (prctl(PR_SET_PDEATHSIG, SIGKILL) < 0) {
2491 log_error("PR_SET_PDEATHSIG failed: %m");
2495 /* Mark everything as slave, so that we still
2496 * receive mounts from the real root, but don't
2497 * propagate mounts to the real root. */
2498 if (mount(NULL, "/", NULL, MS_SLAVE|MS_REC, NULL) < 0) {
2499 log_error("MS_SLAVE|MS_REC failed: %m");
2503 if (mount_devices(arg_directory, root_device, home_device, srv_device) < 0)
2506 /* Turn directory into bind mount */
2507 if (mount(arg_directory, arg_directory, "bind", MS_BIND|MS_REC, NULL) < 0) {
2508 log_error("Failed to make bind mount.");
2513 if (mount(arg_directory, arg_directory, "bind", MS_BIND|MS_REMOUNT|MS_RDONLY|MS_REC, NULL) < 0) {
2514 log_error("Failed to make read-only.");
2518 if (mount_all(arg_directory) < 0)
2521 if (copy_devnodes(arg_directory) < 0)
2524 if (setup_ptmx(arg_directory) < 0)
2527 dev_setup(arg_directory);
2529 if (audit_still_doesnt_work_in_containers() < 0)
2532 if (setup_dev_console(arg_directory, console) < 0)
2535 if (setup_kmsg(arg_directory, kmsg_socket_pair[1]) < 0)
2538 close_nointr_nofail(kmsg_socket_pair[1]);
2539 kmsg_socket_pair[1] = -1;
2541 if (setup_boot_id(arg_directory) < 0)
2544 if (setup_timezone(arg_directory) < 0)
2547 if (setup_resolv_conf(arg_directory) < 0)
2550 if (setup_journal(arg_directory) < 0)
2553 if (mount_binds(arg_directory, arg_bind, 0) < 0)
2556 if (mount_binds(arg_directory, arg_bind_ro, MS_RDONLY) < 0)
2559 if (setup_kdbus(arg_directory, kdbus_domain) < 0)
2562 if (chdir(arg_directory) < 0) {
2563 log_error("chdir(%s) failed: %m", arg_directory);
2567 if (mount(arg_directory, "/", NULL, MS_MOVE, NULL) < 0) {
2568 log_error("mount(MS_MOVE) failed: %m");
2572 if (chroot(".") < 0) {
2573 log_error("chroot() failed: %m");
2577 if (chdir("/") < 0) {
2578 log_error("chdir() failed: %m");
2584 if (arg_private_network)
2587 if (drop_capabilities() < 0) {
2588 log_error("drop_capabilities() failed: %m");
2594 /* Note that this resolves user names
2595 * inside the container, and hence
2596 * accesses the NSS modules from the
2597 * container and not the host. This is
2600 if (get_user_creds((const char**)&arg_user, &uid, &gid, &home, NULL) < 0) {
2601 log_error("get_user_creds() failed: %m");
2605 if (mkdir_parents_label(home, 0775) < 0) {
2606 log_error("mkdir_parents_label() failed: %m");
2610 if (mkdir_safe_label(home, 0775, uid, gid) < 0) {
2611 log_error("mkdir_safe_label() failed: %m");
2615 if (initgroups((const char*)arg_user, gid) < 0) {
2616 log_error("initgroups() failed: %m");
2620 if (setresgid(gid, gid, gid) < 0) {
2621 log_error("setregid() failed: %m");
2625 if (setresuid(uid, uid, uid) < 0) {
2626 log_error("setreuid() failed: %m");
2630 /* Reset everything fully to 0, just in case */
2632 if (setgroups(0, NULL) < 0) {
2633 log_error("setgroups() failed: %m");
2637 if (setresgid(0, 0, 0) < 0) {
2638 log_error("setregid() failed: %m");
2642 if (setresuid(0, 0, 0) < 0) {
2643 log_error("setreuid() failed: %m");
2648 if ((asprintf((char**)(envp + n_env++), "HOME=%s", home ? home: "/root") < 0) ||
2649 (asprintf((char**)(envp + n_env++), "USER=%s", arg_user ? arg_user : "root") < 0) ||
2650 (asprintf((char**)(envp + n_env++), "LOGNAME=%s", arg_user ? arg_user : "root") < 0)) {
2655 if (!sd_id128_equal(arg_uuid, SD_ID128_NULL)) {
2656 if (asprintf((char**)(envp + n_env++), "container_uuid=" SD_ID128_FORMAT_STR, SD_ID128_FORMAT_VAL(arg_uuid)) < 0) {
2662 if (fdset_size(fds) > 0) {
2663 k = fdset_cloexec(fds, false);
2665 log_error("Failed to unset O_CLOEXEC for file descriptors.");
2669 if ((asprintf((char **)(envp + n_env++), "LISTEN_FDS=%u", n_fd_passed) < 0) ||
2670 (asprintf((char **)(envp + n_env++), "LISTEN_PID=1") < 0)) {
2678 if (arg_personality != 0xffffffffLU) {
2679 if (personality(arg_personality) < 0) {
2680 log_error("personality() failed: %m");
2683 } else if (secondary) {
2684 if (personality(PER_LINUX32) < 0) {
2685 log_error("personality() failed: %m");
2690 eventfd_read(sync_fd, &x);
2691 close_nointr_nofail(sync_fd);
2694 if (!strv_isempty(arg_setenv)) {
2697 n = strv_env_merge(2, envp, arg_setenv);
2705 env_use = (char**) envp;
2708 if (arg_selinux_context)
2709 if (setexeccon((security_context_t) arg_selinux_context) < 0)
2710 log_error("setexeccon(\"%s\") failed: %m", arg_selinux_context);
2716 /* Automatically search for the init system */
2718 l = 1 + argc - optind;
2719 a = newa(char*, l + 1);
2720 memcpy(a + 1, argv + optind, l * sizeof(char*));
2722 a[0] = (char*) "/usr/lib/systemd/systemd";
2723 execve(a[0], a, env_use);
2725 a[0] = (char*) "/lib/systemd/systemd";
2726 execve(a[0], a, env_use);
2728 a[0] = (char*) "/sbin/init";
2729 execve(a[0], a, env_use);
2730 } else if (argc > optind)
2731 execvpe(argv[optind], argv + optind, env_use);
2733 chdir(home ? home : "/root");
2734 execle("/bin/bash", "-bash", NULL, env_use);
2735 execle("/bin/sh", "-sh", NULL, env_use);
2738 log_error("execv() failed: %m");
2741 _exit(EXIT_FAILURE);
2747 r = register_machine(pid);
2751 r = move_network_interfaces(pid);
2755 r = setup_veth(pid, veth_name);
2759 r = setup_bridge(veth_name);
2763 r = setup_macvlan(pid);
2767 eventfd_write(sync_fd, 1);
2768 close_nointr_nofail(sync_fd);
2771 k = process_pty(master, &mask, arg_boot ? pid : 0, SIGRTMIN+3);
2780 /* Kill if it is not dead yet anyway */
2781 terminate_machine(pid);
2783 /* Redundant, but better safe than sorry */
2786 k = wait_for_terminate(pid, &status);
2794 if (status.si_code == CLD_EXITED) {
2795 r = status.si_status;
2796 if (status.si_status != 0) {
2797 log_error("Container %s failed with error code %i.", arg_machine, status.si_status);
2802 log_debug("Container %s exited successfully.", arg_machine);
2804 } else if (status.si_code == CLD_KILLED &&
2805 status.si_status == SIGINT) {
2808 log_info("Container %s has been shut down.", arg_machine);
2811 } else if (status.si_code == CLD_KILLED &&
2812 status.si_status == SIGHUP) {
2815 log_info("Container %s is being rebooted.", arg_machine);
2817 } else if (status.si_code == CLD_KILLED ||
2818 status.si_code == CLD_DUMPED) {
2820 log_error("Container %s terminated by signal %s.", arg_machine, signal_to_string(status.si_status));
2824 log_error("Container %s failed due to unknown reason.", arg_machine);
2831 loop_remove(loop_nr, &image_fd);
2836 free(arg_directory);
2839 strv_free(arg_setenv);
2840 strv_free(arg_network_interfaces);
2841 strv_free(arg_network_macvlan);
2842 strv_free(arg_bind);
2843 strv_free(arg_bind_ro);