1 /*-*- Mode: C; c-basic-offset: 8; indent-tabs-mode: nil -*-*/
4 This file is part of systemd.
6 Copyright 2010 Lennart Poettering
8 systemd is free software; you can redistribute it and/or modify it
9 under the terms of the GNU Lesser General Public License as published by
10 the Free Software Foundation; either version 2.1 of the License, or
11 (at your option) any later version.
13 systemd is distributed in the hope that it will be useful, but
14 WITHOUT ANY WARRANTY; without even the implied warranty of
15 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
16 Lesser General Public License for more details.
18 You should have received a copy of the GNU Lesser General Public License
19 along with systemd; If not, see <http://www.gnu.org/licenses/>.
25 #include <sys/types.h>
26 #include <sys/syscall.h>
27 #include <sys/mount.h>
33 #include <sys/prctl.h>
34 #include <sys/capability.h>
37 #include <sys/signalfd.h>
41 #include <sys/socket.h>
42 #include <linux/netlink.h>
43 #include <sys/eventfd.h>
45 #include <linux/veth.h>
46 #include <sys/personality.h>
47 #include <linux/loop.h>
50 #include <selinux/selinux.h>
58 #include <blkid/blkid.h>
61 #include "sd-daemon.h"
71 #include "cgroup-util.h"
73 #include "path-util.h"
74 #include "loopback-setup.h"
75 #include "dev-setup.h"
80 #include "bus-error.h"
82 #include "bus-kernel.h"
85 #include "rtnl-util.h"
86 #include "udev-util.h"
87 #include "blkid-util.h"
91 #include "seccomp-util.h"
94 typedef enum LinkJournal {
/* Global configuration. The option parser below assigns these from the
 * command line; the setup helpers further down read them. */
101 static char *arg_directory = NULL;
102 static char *arg_user = NULL;
103 static sd_id128_t arg_uuid = {};
104 static char *arg_machine = NULL;
105 static const char *arg_selinux_context = NULL;
106 static const char *arg_selinux_apifs_context = NULL;
107 static const char *arg_slice = NULL;
108 static bool arg_private_network = false;
109 static bool arg_read_only = false;
110 static bool arg_boot = false;
111 static LinkJournal arg_link_journal = LINK_AUTO;
/* Bit mask with one bit per CAP_* number. The option parser adjusts it with
 * --capability=/--drop-capability=, and drop_capabilities() passes its
 * complement to capability_bounding_set_drop(). */
112 static uint64_t arg_retain =
113 (1ULL << CAP_CHOWN) |
114 (1ULL << CAP_DAC_OVERRIDE) |
115 (1ULL << CAP_DAC_READ_SEARCH) |
116 (1ULL << CAP_FOWNER) |
117 (1ULL << CAP_FSETID) |
118 (1ULL << CAP_IPC_OWNER) |
120 (1ULL << CAP_LEASE) |
121 (1ULL << CAP_LINUX_IMMUTABLE) |
122 (1ULL << CAP_NET_BIND_SERVICE) |
123 (1ULL << CAP_NET_BROADCAST) |
124 (1ULL << CAP_NET_RAW) |
125 (1ULL << CAP_SETGID) |
126 (1ULL << CAP_SETFCAP) |
127 (1ULL << CAP_SETPCAP) |
128 (1ULL << CAP_SETUID) |
129 (1ULL << CAP_SYS_ADMIN) |
130 (1ULL << CAP_SYS_CHROOT) |
131 (1ULL << CAP_SYS_NICE) |
132 (1ULL << CAP_SYS_PTRACE) |
133 (1ULL << CAP_SYS_TTY_CONFIG) |
134 (1ULL << CAP_SYS_RESOURCE) |
135 (1ULL << CAP_SYS_BOOT) |
136 (1ULL << CAP_AUDIT_WRITE) |
137 (1ULL << CAP_AUDIT_CONTROL) |
139 static char **arg_bind = NULL;
140 static char **arg_bind_ro = NULL;
141 static char **arg_setenv = NULL;
142 static bool arg_quiet = false;
143 static bool arg_share_system = false;
144 static bool arg_register = true;
145 static bool arg_keep_unit = false;
146 static char **arg_network_interfaces = NULL;
147 static char **arg_network_macvlan = NULL;
148 static bool arg_network_veth = false;
149 static const char *arg_network_bridge = NULL;
/* 0xffffffffLU is also the value the --personality= parser treats as
 * "unknown or unsupported" when personality_from_string() returns it. */
150 static unsigned long arg_personality = 0xffffffffLU;
151 static const char *arg_image = NULL;
/* Prints the usage text with printf(); the leading %s is filled in with
 * program_invocation_short_name.
 *
 * NOTE(review): the text for -j says it equals --link-journal=host, but the
 * only visible assignment outside the --link-journal= parser in parse_argv()
 * sets LINK_GUEST. Confirm which one is intended and align text and code. */
153 static int help(void) {
155 printf("%s [OPTIONS...] [PATH] [ARGUMENTS...]\n\n"
156 "Spawn a minimal namespace container for debugging, testing and building.\n\n"
157 " -h --help Show this help\n"
158 " --version Print version string\n"
159 " -q --quiet Do not show status information\n"
160 " -D --directory=PATH Root directory for the container\n"
161 " -i --image=PATH File system device or image for the container\n"
162 " -b --boot Boot up full system (i.e. invoke init)\n"
163 " -u --user=USER Run the command under specified user or uid\n"
164 " -M --machine=NAME Set the machine name for the container\n"
165 " --uuid=UUID Set a specific machine UUID for the container\n"
166 " -S --slice=SLICE Place the container in the specified slice\n"
167 " --private-network Disable network in container\n"
168 " --network-interface=INTERFACE\n"
169 " Assign an existing network interface to the\n"
171 " --network-macvlan=INTERFACE\n"
172 " Create a macvlan network interface based on an\n"
173 " existing network interface to the container\n"
174 " --network-veth Add a virtual ethernet connection between host\n"
176 " --network-bridge=INTERFACE\n"
177 " Add a virtual ethernet connection between host\n"
178 " and container and add it to an existing bridge on\n"
180 " -Z --selinux-context=SECLABEL\n"
181 " Set the SELinux security context to be used by\n"
182 " processes in the container\n"
183 " -L --selinux-apifs-context=SECLABEL\n"
184 " Set the SELinux security context to be used by\n"
185 " API/tmpfs file systems in the container\n"
186 " --capability=CAP In addition to the default, retain specified\n"
188 " --drop-capability=CAP Drop the specified capability from the default set\n"
189 " --link-journal=MODE Link up guest journal, one of no, auto, guest, host\n"
190 " -j Equivalent to --link-journal=host\n"
191 " --read-only Mount the root directory read-only\n"
192 " --bind=PATH[:PATH] Bind mount a file or directory from the host into\n"
194 " --bind-ro=PATH[:PATH] Similar, but creates a read-only bind mount\n"
195 " --setenv=NAME=VALUE Pass an environment variable to PID 1\n"
196 " --share-system Share system namespaces with host\n"
197 " --register=BOOLEAN Register container as machine\n"
198 " --keep-unit Do not register a scope for the machine, reuse\n"
199 " the service unit nspawn is running in\n",
200 program_invocation_short_name);
/* Parses the command line with getopt_long() and stores the results in the
 * arg_* globals. Invalid values are reported through log_error(). After the
 * option loop it applies cross-option rules and computes the final
 * capability mask in arg_retain. */
205 static int parse_argv(int argc, char *argv[]) {
221 ARG_NETWORK_INTERFACE,
228 static const struct option options[] = {
229 { "help", no_argument, NULL, 'h' },
230 { "version", no_argument, NULL, ARG_VERSION },
231 { "directory", required_argument, NULL, 'D' },
232 { "user", required_argument, NULL, 'u' },
233 { "private-network", no_argument, NULL, ARG_PRIVATE_NETWORK },
234 { "boot", no_argument, NULL, 'b' },
235 { "uuid", required_argument, NULL, ARG_UUID },
236 { "read-only", no_argument, NULL, ARG_READ_ONLY },
237 { "capability", required_argument, NULL, ARG_CAPABILITY },
238 { "drop-capability", required_argument, NULL, ARG_DROP_CAPABILITY },
239 { "link-journal", required_argument, NULL, ARG_LINK_JOURNAL },
240 { "bind", required_argument, NULL, ARG_BIND },
241 { "bind-ro", required_argument, NULL, ARG_BIND_RO },
242 { "machine", required_argument, NULL, 'M' },
243 { "slice", required_argument, NULL, 'S' },
244 { "setenv", required_argument, NULL, ARG_SETENV },
245 { "selinux-context", required_argument, NULL, 'Z' },
246 { "selinux-apifs-context", required_argument, NULL, 'L' },
247 { "quiet", no_argument, NULL, 'q' },
248 { "share-system", no_argument, NULL, ARG_SHARE_SYSTEM },
249 { "register", required_argument, NULL, ARG_REGISTER },
250 { "keep-unit", no_argument, NULL, ARG_KEEP_UNIT },
251 { "network-interface", required_argument, NULL, ARG_NETWORK_INTERFACE },
252 { "network-macvlan", required_argument, NULL, ARG_NETWORK_MACVLAN },
253 { "network-veth", no_argument, NULL, ARG_NETWORK_VETH },
254 { "network-bridge", required_argument, NULL, ARG_NETWORK_BRIDGE },
255 { "personality", required_argument, NULL, ARG_PERSONALITY },
256 { "image", required_argument, NULL, 'i' },
/* Capability bits collected from --capability= (plus) and
 * --drop-capability= (minus); merged into arg_retain at the end. */
261 uint64_t plus = 0, minus = 0;
266 while ((c = getopt_long(argc, argv, "+hD:u:bL:M:jS:Z:qi:", options, NULL)) >= 0) {
274 puts(PACKAGE_STRING);
275 puts(SYSTEMD_FEATURES);
280 arg_directory = canonicalize_file_name(optarg);
281 if (!arg_directory) {
282 log_error("Invalid root directory: %m");
294 arg_user = strdup(optarg);
300 case ARG_NETWORK_BRIDGE:
301 arg_network_bridge = optarg;
/* The veth and interface options below also switch on private networking. */
305 case ARG_NETWORK_VETH:
306 arg_network_veth = true;
307 arg_private_network = true;
310 case ARG_NETWORK_INTERFACE:
311 if (strv_extend(&arg_network_interfaces, optarg) < 0)
314 arg_private_network = true;
317 case ARG_NETWORK_MACVLAN:
318 if (strv_extend(&arg_network_macvlan, optarg) < 0)
323 case ARG_PRIVATE_NETWORK:
324 arg_private_network = true;
332 r = sd_id128_from_string(optarg, &arg_uuid);
334 log_error("Invalid UUID: %s", optarg);
344 if (isempty(optarg)) {
349 if (!hostname_is_valid(optarg)) {
350 log_error("Invalid machine name: %s", optarg);
355 arg_machine = strdup(optarg);
363 arg_selinux_context = optarg;
367 arg_selinux_apifs_context = optarg;
371 arg_read_only = true;
/* Comma-separated capability list; "all" selects every bit. The same code
 * serves both options and picks plus or minus depending on c. */
375 case ARG_DROP_CAPABILITY: {
379 FOREACH_WORD_SEPARATOR(word, length, optarg, ",", state) {
380 _cleanup_free_ char *t;
383 t = strndup(word, length);
387 if (streq(t, "all")) {
388 if (c == ARG_CAPABILITY)
389 plus = (uint64_t) -1;
391 minus = (uint64_t) -1;
393 if (cap_from_name(t, &cap) < 0) {
394 log_error("Failed to parse capability %s.", t);
398 if (c == ARG_CAPABILITY)
399 plus |= 1ULL << (uint64_t) cap;
401 minus |= 1ULL << (uint64_t) cap;
/* NOTE(review): the case label for this assignment is not visible here;
 * presumably it is the -j short option. Confirm against help(). */
409 arg_link_journal = LINK_GUEST;
412 case ARG_LINK_JOURNAL:
413 if (streq(optarg, "auto"))
414 arg_link_journal = LINK_AUTO;
415 else if (streq(optarg, "no"))
416 arg_link_journal = LINK_NO;
417 else if (streq(optarg, "guest"))
418 arg_link_journal = LINK_GUEST;
419 else if (streq(optarg, "host"))
420 arg_link_journal = LINK_HOST;
422 log_error("Failed to parse link journal mode %s", optarg);
/* --bind= and --bind-ro= share this code. Both paths must be absolute and
 * are appended to the chosen list as a source/destination pair. */
430 _cleanup_free_ char *a = NULL, *b = NULL;
434 x = c == ARG_BIND ? &arg_bind : &arg_bind_ro;
436 e = strchr(optarg, ':');
438 a = strndup(optarg, e - optarg);
448 if (!path_is_absolute(a) || !path_is_absolute(b)) {
449 log_error("Invalid bind mount specification: %s", optarg);
453 r = strv_extend(x, a);
457 r = strv_extend(x, b);
467 if (!env_assignment_is_valid(optarg)) {
468 log_error("Environment variable assignment '%s' is not valid.", optarg);
472 n = strv_env_set(arg_setenv, optarg);
476 strv_free(arg_setenv);
485 case ARG_SHARE_SYSTEM:
486 arg_share_system = true;
490 r = parse_boolean(optarg);
492 log_error("Failed to parse --register= argument: %s", optarg);
500 arg_keep_unit = true;
503 case ARG_PERSONALITY:
505 arg_personality = personality_from_string(optarg);
506 if (arg_personality == 0xffffffffLU) {
507 log_error("Unknown or unsupported personality '%s'.", optarg);
517 assert_not_reached("Unhandled option");
/* Cross-option rules: --share-system turns registration off. */
521 if (arg_share_system)
522 arg_register = false;
524 if (arg_boot && arg_share_system) {
525 log_error("--boot and --share-system may not be combined.");
529 if (arg_keep_unit && cg_pid_get_owner_uid(0, NULL) >= 0) {
530 log_error("--keep-unit may not be used when invoked from a user session.");
534 if (arg_directory && arg_image) {
535 log_error("--directory= and --image= may not be combined.");
/* Final mask: defaults plus requested additions, plus CAP_NET_ADMIN when a
 * private network is used, with the requested removals cleared last. */
539 arg_retain = (arg_retain | plus | (arg_private_network ? 1ULL << CAP_NET_ADMIN : 0)) & ~minus;
/* Mounts the entries of mount_table below dest, in table order.
 *
 * Entries with a NULL source are remounts of the preceding entry (used to
 * make a bind mount read-only). Entries that have a source are skipped when
 * their target is already a mount point. The last table field is consulted
 * together with the mount() result (see the "fatal" member use below). */
544 static int mount_all(const char *dest) {
546 typedef struct MountPoint {
555 static const MountPoint mount_table[] = {
556 { "proc", "/proc", "proc", NULL, MS_NOSUID|MS_NOEXEC|MS_NODEV, true },
557 { "/proc/sys", "/proc/sys", NULL, NULL, MS_BIND, true }, /* Bind mount first */
558 { NULL, "/proc/sys", NULL, NULL, MS_BIND|MS_RDONLY|MS_REMOUNT, true }, /* Then, make it r/o */
559 { "sysfs", "/sys", "sysfs", NULL, MS_RDONLY|MS_NOSUID|MS_NOEXEC|MS_NODEV, true },
560 { "tmpfs", "/dev", "tmpfs", "mode=755", MS_NOSUID|MS_STRICTATIME, true },
561 { "devpts", "/dev/pts", "devpts","newinstance,ptmxmode=0666,mode=620,gid=" STRINGIFY(TTY_GID), MS_NOSUID|MS_NOEXEC, true },
562 { "tmpfs", "/dev/shm", "tmpfs", "mode=1777", MS_NOSUID|MS_NODEV|MS_STRICTATIME, true },
563 { "tmpfs", "/run", "tmpfs", "mode=755", MS_NOSUID|MS_NODEV|MS_STRICTATIME, true },
565 { "/sys/fs/selinux", "/sys/fs/selinux", NULL, NULL, MS_BIND, false }, /* Bind mount first */
566 { NULL, "/sys/fs/selinux", NULL, NULL, MS_BIND|MS_RDONLY|MS_REMOUNT, false }, /* Then, make it r/o */
573 for (k = 0; k < ELEMENTSOF(mount_table); k++) {
574 _cleanup_free_ char *where = NULL;
576 _cleanup_free_ char *options = NULL;
/* Target path inside the container tree. */
581 where = strjoin(dest, "/", mount_table[k].where, NULL);
585 t = path_is_mount_point(where, true);
587 log_error("Failed to detect whether %s is a mount point: %s", where, strerror(-t));
595 /* Skip this entry if it is not a remount. */
596 if (mount_table[k].what && t > 0)
599 mkdir_p(where, 0755);
/* For tmpfs and devpts, append the SELinux API file system context to the
 * mount options when one was configured. */
602 if (arg_selinux_apifs_context &&
603 (streq_ptr(mount_table[k].what, "tmpfs") || streq_ptr(mount_table[k].what, "devpts"))) {
604 options = strjoin(mount_table[k].options, ",context=\"", arg_selinux_apifs_context, "\"", NULL);
611 o = mount_table[k].options;
614 if (mount(mount_table[k].what,
617 mount_table[k].flags,
619 mount_table[k].fatal) {
621 log_error("mount(%s) failed: %m", where);
/* Bind mounts host paths into the container tree.
 *
 * l is walked as (source, destination) pairs. Each destination is prefixed
 * with dest. When the destination exists its file type must match the
 * source; when it does not exist, parent directories and a matching mount
 * point are created first. If flags is non-zero the new bind mount is
 * remounted with MS_REMOUNT|MS_BIND|flags. */
631 static int mount_binds(const char *dest, char **l, unsigned long flags) {
634 STRV_FOREACH_PAIR(x, y, l) {
636 struct stat source_st, dest_st;
639 if (stat(*x, &source_st) < 0) {
640 log_error("Failed to stat %s: %m", *x);
644 where = strappenda(dest, *y);
645 r = stat(where, &dest_st);
647 if ((source_st.st_mode & S_IFMT) != (dest_st.st_mode & S_IFMT)) {
648 log_error("The file types of %s and %s do not match. Refusing bind mount",
652 } else if (errno == ENOENT) {
653 r = mkdir_parents_label(where, 0755);
655 log_error("Failed to bind mount %s: %s", *x, strerror(-r));
659 log_error("Failed to bind mount %s: %s", *x, strerror(errno));
662 /* Create the mount point, but be conservative -- refuse to create block
663 * and char devices. */
664 if (S_ISDIR(source_st.st_mode))
665 mkdir_label(where, 0755);
666 else if (S_ISFIFO(source_st.st_mode))
668 else if (S_ISSOCK(source_st.st_mode))
669 mknod(where, 0644 | S_IFSOCK, 0);
670 else if (S_ISREG(source_st.st_mode))
673 log_error("Refusing to create mountpoint for file: %s", *x);
677 if (mount(*x, where, "bind", MS_BIND, NULL) < 0) {
678 log_error("mount(%s) failed: %m", where);
/* Second pass applies the extra flags (if any) to the bind mount. */
682 if (flags && mount(NULL, where, NULL, MS_REMOUNT|MS_BIND|flags, NULL) < 0) {
683 log_error("mount(%s) failed: %m", where);
/* Makes dest/etc/localtime point to the same zone as the host's
 * /etc/localtime symlink.
 *
 * The host link must point into /usr/share/zoneinfo/ (relative or absolute
 * form); otherwise a warning is logged. Nothing is changed when the
 * container link already names the same zone, and a warning is logged when
 * the zone file is absent below dest. */
691 static int setup_timezone(const char *dest) {
692 _cleanup_free_ char *where = NULL, *p = NULL, *q = NULL, *check = NULL, *what = NULL;
698 /* Fix the timezone, if possible */
699 r = readlink_malloc("/etc/localtime", &p);
701 log_warning("/etc/localtime is not a symlink, not updating container timezone.");
/* z: zone name of the host, relative to the zoneinfo directory. */
705 z = path_startswith(p, "../usr/share/zoneinfo/");
707 z = path_startswith(p, "/usr/share/zoneinfo/");
709 log_warning("/etc/localtime does not point into /usr/share/zoneinfo/, not updating container timezone.");
713 where = strappend(dest, "/etc/localtime");
/* y: zone name the container link currently points to, if any. */
717 r = readlink_malloc(where, &q);
719 y = path_startswith(q, "../usr/share/zoneinfo/");
721 y = path_startswith(q, "/usr/share/zoneinfo/");
724 /* Already pointing to the right place? Then do nothing .. */
725 if (y && streq(y, z))
729 check = strjoin(dest, "/usr/share/zoneinfo/", z, NULL);
733 if (access(check, F_OK) < 0) {
734 log_warning("Timezone %s does not exist in container, not updating container timezone.", z);
/* The new link is written in the relative "../usr/share/zoneinfo/" form. */
738 what = strappend("../usr/share/zoneinfo/", z);
743 if (symlink(what, where) < 0) {
744 log_error("Failed to correct timezone of container: %m");
/* Copies the host's /etc/resolv.conf to dest/etc/resolv.conf. The
 * arg_private_network flag is checked first; presumably the copy is skipped
 * in that case (the branch body is not visible here). */
751 static int setup_resolv_conf(const char *dest) {
752 char _cleanup_free_ *where = NULL;
756 if (arg_private_network)
759 /* Fix resolv.conf, if possible */
760 where = strappend(dest, "/etc/resolv.conf");
764 /* We deliberately ignore the result of this copy. If it
765 * fails, it fails; the copy is best-effort only. */
766 copy_file("/etc/resolv.conf", where, O_TRUNC|O_NOFOLLOW);
/* Gives the container its own boot ID.
 *
 * A random 128-bit ID is formatted as a UUID string, written to
 * dest/dev/proc-sys-kernel-random-boot-id and bind mounted over
 * dest/proc/sys/kernel/random/boot_id. A failed read-only remount only
 * produces a warning. arg_share_system is checked first; presumably the
 * function does nothing in that mode (branch body not visible here). */
771 static int setup_boot_id(const char *dest) {
772 _cleanup_free_ char *from = NULL, *to = NULL;
779 if (arg_share_system)
782 /* Generate a new randomized boot ID, so that each boot-up of
783 * the container gets a new one */
785 from = strappend(dest, "/dev/proc-sys-kernel-random-boot-id");
786 to = strappend(dest, "/proc/sys/kernel/random/boot_id");
790 r = sd_id128_randomize(&rnd);
792 log_error("Failed to generate random boot id: %s", strerror(-r));
/* Format the 16 bytes in 8-4-4-4-12 UUID notation. */
796 snprintf(as_uuid, sizeof(as_uuid),
797 "%02x%02x%02x%02x-%02x%02x-%02x%02x-%02x%02x-%02x%02x%02x%02x%02x%02x",
798 SD_ID128_FORMAT_VAL(rnd));
799 char_array_0(as_uuid);
801 r = write_string_file(from, as_uuid);
803 log_error("Failed to write boot id: %s", strerror(-r));
807 if (mount(from, to, "bind", MS_BIND, NULL) < 0) {
808 log_error("Failed to bind mount boot id: %m");
810 } else if (mount(from, to, "bind", MS_BIND|MS_REMOUNT|MS_RDONLY, NULL))
811 log_warning("Failed to make boot id read-only: %m");
/* Recreates a fixed list of host device nodes below dest/dev.
 *
 * For every name in devnodes the host node /dev/<name> is stat()ed. A
 * missing host node (ENOENT) is not logged as an error; a node that is
 * neither a character nor a block device is rejected; otherwise an
 * equivalent node is created with mknod() using the host's mode and device
 * number. */
817 static int copy_devnodes(const char *dest) {
819 static const char devnodes[] =
829 _cleanup_umask_ mode_t u;
835 NULSTR_FOREACH(d, devnodes) {
836 _cleanup_free_ char *from = NULL, *to = NULL;
839 from = strappend("/dev/", d);
840 to = strjoin(dest, "/dev/", d, NULL);
844 if (stat(from, &st) < 0) {
846 if (errno != ENOENT) {
847 log_error("Failed to stat %s: %m", from);
851 } else if (!S_ISCHR(st.st_mode) && !S_ISBLK(st.st_mode)) {
853 log_error("%s is not a char or block device, cannot copy", from);
856 } else if (mknod(to, st.st_mode, st.st_rdev) < 0) {
/* Report the node path that mknod() was asked to create, not the
 * container root. */
858 log_error("mknod(%s) failed: %m", to);
/* Creates dest/dev/ptmx as a symlink to "pts/ptmx" and logs an error if
 * symlink() fails. */
866 static int setup_ptmx(const char *dest) {
867 _cleanup_free_ char *p = NULL;
869 p = strappend(dest, "/dev/ptmx");
873 if (symlink("pts/ptmx", p) < 0) {
874 log_error("Failed to create /dev/ptmx symlink: %m");
/* Makes the TTY named by console available as dest/dev/console.
 *
 * The TTY is set to mode 0600, owner 0:0. A placeholder device node that
 * copies /dev/null's type and device number is created at dest/dev/console
 * and the TTY is bind mounted over it. */
881 static int setup_dev_console(const char *dest, const char *console) {
882 _cleanup_umask_ mode_t u;
892 if (stat("/dev/null", &st) < 0) {
893 log_error("Failed to stat /dev/null: %m");
897 r = chmod_and_chown(console, 0600, 0, 0);
899 log_error("Failed to correct access mode for TTY: %s", strerror(-r));
903 /* We need to bind mount the right tty to /dev/console since
904 * ptys can only exist on pts file systems. To have something
905 * to bind mount things on we create a device node first, and
906 * use /dev/null for that since the cgroups device policy
907 * allows us to create that freely, while we cannot create
908 * /dev/console. (Note that the major minor doesn't actually
909 * matter here, since we mount it over anyway). */
911 to = strappenda(dest, "/dev/console");
/* Keep /dev/null's file type bits, but force permission bits to 0600. */
912 if (mknod(to, (st.st_mode & ~07777) | 0600, st.st_rdev) < 0) {
913 log_error("mknod() for /dev/console failed: %m");
917 if (mount(console, to, "bind", MS_BIND, NULL) < 0) {
918 log_error("Bind mount for /dev/console failed: %m");
/* Provides a FIFO-backed /proc/kmsg inside the container.
 *
 * A FIFO is created at dest/dev/kmsg, bind mounted onto dest/proc/kmsg and
 * opened. The resulting file descriptor is sent over kmsg_socket as an
 * SCM_RIGHTS control message so that it stays open while the child runs.
 * kmsg_socket must be a valid descriptor (asserted). */
925 static int setup_kmsg(const char *dest, int kmsg_socket) {
926 _cleanup_free_ char *from = NULL, *to = NULL;
928 _cleanup_umask_ mode_t u;
/* Control-message buffer sized for exactly one file descriptor. */
930 struct cmsghdr cmsghdr;
931 uint8_t buf[CMSG_SPACE(sizeof(int))];
934 .msg_control = &control,
935 .msg_controllen = sizeof(control),
937 struct cmsghdr *cmsg;
940 assert(kmsg_socket >= 0);
944 /* We create the kmsg FIFO as /dev/kmsg, but immediately
945 * delete it after bind mounting it to /proc/kmsg. While FIFOs
946 * on the reading side behave very similar to /proc/kmsg,
947 * their writing side behaves differently from /dev/kmsg in
948 * that writing blocks when nothing is reading. In order to
949 * avoid any problems with containers deadlocking due to this
950 * we simply make /dev/kmsg unavailable to the container. */
951 if (asprintf(&from, "%s/dev/kmsg", dest) < 0 ||
952 asprintf(&to, "%s/proc/kmsg", dest) < 0)
955 if (mkfifo(from, 0600) < 0) {
956 log_error("mkfifo() for /dev/kmsg failed: %m");
960 r = chmod_and_chown(from, 0600, 0, 0);
962 log_error("Failed to correct access mode for /dev/kmsg: %s", strerror(-r));
966 if (mount(from, to, "bind", MS_BIND, NULL) < 0) {
967 log_error("Bind mount for /proc/kmsg failed: %m");
971 fd = open(from, O_RDWR|O_NDELAY|O_CLOEXEC);
973 log_error("Failed to open fifo: %m");
/* Fill in the SCM_RIGHTS header and copy the fd into its payload. */
977 cmsg = CMSG_FIRSTHDR(&mh);
978 cmsg->cmsg_level = SOL_SOCKET;
979 cmsg->cmsg_type = SCM_RIGHTS;
980 cmsg->cmsg_len = CMSG_LEN(sizeof(int));
981 memcpy(CMSG_DATA(cmsg), &fd, sizeof(int));
983 mh.msg_controllen = cmsg->cmsg_len;
985 /* Store away the fd in the socket, so that it stays open as
986 * long as we run the child */
987 k = sendmsg(kmsg_socket, &mh, MSG_DONTWAIT|MSG_NOSIGNAL);
/* The local copy of the fd is closed right after the send attempt. */
988 close_nointr_nofail(fd);
991 log_error("Failed to send FIFO fd: %m");
995 /* And now make the FIFO unavailable as /dev/kmsg... */
/* Sets the hostname to arg_machine via sethostname(). arg_share_system is
 * checked first; presumably the hostname is left alone in that mode (the
 * branch body is not visible here). */
1000 static int setup_hostname(void) {
1002 if (arg_share_system)
1005 if (sethostname(arg_machine, strlen(arg_machine)) < 0)
/* Connects the container's journal directory with the host according to
 * arg_link_journal.
 *
 * The container's machine ID is read from directory/etc/machine-id and
 * compared with the host's; equal IDs are refused (warning in LINK_AUTO
 * mode, error otherwise). The host path /var/log/journal/<id> and the
 * container path directory/var/log/journal/<id> are then either linked
 * with a symlink (LINK_GUEST) or joined by bind mounting the host directory
 * into the container. Existing mount points at either path are refused
 * outside LINK_AUTO mode. */
1011 static int setup_journal(const char *directory) {
1012 sd_id128_t machine_id, this_id;
1013 _cleanup_free_ char *p = NULL, *b = NULL, *q = NULL, *d = NULL;
1017 p = strappend(directory, "/etc/machine-id");
1021 r = read_one_line_file(p, &b);
1022 if (r == -ENOENT && arg_link_journal == LINK_AUTO)
1025 log_error("Failed to read machine ID from %s: %s", p, strerror(-r));
1030 if (isempty(id) && arg_link_journal == LINK_AUTO)
1033 /* Verify validity */
1034 r = sd_id128_from_string(id, &machine_id);
1036 log_error("Failed to parse machine ID from %s: %s", p, strerror(-r));
1040 r = sd_id128_get_machine(&this_id);
1042 log_error("Failed to retrieve machine ID: %s", strerror(-r));
1046 if (sd_id128_equal(machine_id, this_id)) {
1047 log_full(arg_link_journal == LINK_AUTO ? LOG_WARNING : LOG_ERR,
1048 "Host and machine ids are equal (%s): refusing to link journals", id);
1049 if (arg_link_journal == LINK_AUTO)
1055 if (arg_link_journal == LINK_NO)
/* From here on p names the host-side journal directory and q the
 * container-side one. */
1059 p = strappend("/var/log/journal/", id);
1060 q = strjoin(directory, "/var/log/journal/", id, NULL);
1064 if (path_is_mount_point(p, false) > 0) {
1065 if (arg_link_journal != LINK_AUTO) {
1066 log_error("%s: already a mount point, refusing to use for journal", p);
1073 if (path_is_mount_point(q, false) > 0) {
1074 if (arg_link_journal != LINK_AUTO) {
1075 log_error("%s: already a mount point, refusing to use for journal", q);
/* Inspect what currently exists at the host path: a symlink, something
 * that is not a symlink (-EINVAL), or nothing (-ENOENT). */
1082 r = readlink_and_make_absolute(p, &d);
1084 if ((arg_link_journal == LINK_GUEST ||
1085 arg_link_journal == LINK_AUTO) &&
1088 r = mkdir_p(q, 0755);
1090 log_warning("failed to create directory %s: %m", q);
1094 if (unlink(p) < 0) {
1095 log_error("Failed to remove symlink %s: %m", p);
1098 } else if (r == -EINVAL) {
1100 if (arg_link_journal == LINK_GUEST &&
1103 if (errno == ENOTDIR) {
1104 log_error("%s already exists and is neither a symlink nor a directory", p);
1107 log_error("Failed to remove %s: %m", p);
1111 } else if (r != -ENOENT) {
1112 log_error("readlink(%s) failed: %m", p);
/* LINK_GUEST: the host path becomes a symlink to the container directory. */
1116 if (arg_link_journal == LINK_GUEST) {
1118 if (symlink(q, p) < 0) {
1119 log_error("Failed to symlink %s to %s: %m", q, p);
1123 r = mkdir_p(q, 0755);
1125 log_warning("failed to create directory %s: %m", q);
1129 if (arg_link_journal == LINK_HOST) {
1130 r = mkdir_p(p, 0755);
1132 log_error("Failed to create %s: %m", p);
1136 } else if (access(p, F_OK) < 0)
1139 if (dir_is_empty(q) == 0) {
1140 log_error("%s not empty.", q);
1144 r = mkdir_p(q, 0755);
1146 log_error("Failed to create %s: %m", q);
/* Otherwise the host directory is bind mounted into the container. */
1150 if (mount(p, q, "bind", MS_BIND, NULL) < 0) {
1151 log_error("Failed to bind mount journal from host into guest: %m");
/* Creates the directory dest/dev/kdbus (mode 0755) and bind mounts path
 * onto it. Failures of mkdir() and mount() are logged as errors. */
1158 static int setup_kdbus(const char *dest, const char *path) {
1164 p = strappenda(dest, "/dev/kdbus");
1165 if (mkdir(p, 0755) < 0) {
1166 log_error("Failed to create kdbus path: %m");
1170 if (mount(path, p, "bind", MS_BIND, NULL) < 0) {
1171 log_error("Failed to mount kdbus domain path: %m");
/* Drops every capability that is not set in arg_retain by passing the
 * complement of the mask to capability_bounding_set_drop(), and returns
 * that call's result. */
1178 static int drop_capabilities(void) {
1179 return capability_bounding_set_drop(~arg_retain, false);
/* Registers the container with org.freedesktop.machine1 over the system
 * bus.
 *
 * With arg_keep_unit set a single sd_bus_call_method() is issued. Otherwise
 * a method call message is built by hand so that an a(sv) property array
 * can be appended: an optional "Slice", "DevicePolicy" = "strict" and a
 * "DeviceAllow" list. Every failing step is logged with log_error().
 * NOTE(review): the method names and several arguments are on lines not
 * visible here; confirm against the full file. */
1182 static int register_machine(pid_t pid) {
1183 _cleanup_bus_error_free_ sd_bus_error error = SD_BUS_ERROR_NULL;
1184 _cleanup_bus_unref_ sd_bus *bus = NULL;
1190 r = sd_bus_default_system(&bus);
1192 log_error("Failed to open system bus: %s", strerror(-r));
1196 if (arg_keep_unit) {
1197 r = sd_bus_call_method(
1199 "org.freedesktop.machine1",
1200 "/org/freedesktop/machine1",
1201 "org.freedesktop.machine1.Manager",
1207 SD_BUS_MESSAGE_APPEND_ID128(arg_uuid),
1211 strempty(arg_directory));
1213 _cleanup_bus_message_unref_ sd_bus_message *m = NULL;
1215 r = sd_bus_message_new_method_call(
1218 "org.freedesktop.machine1",
1219 "/org/freedesktop/machine1",
1220 "org.freedesktop.machine1.Manager",
1223 log_error("Failed to create message: %s", strerror(-r));
1227 r = sd_bus_message_append(
1231 SD_BUS_MESSAGE_APPEND_ID128(arg_uuid),
1235 strempty(arg_directory));
1237 log_error("Failed to append message arguments: %s", strerror(-r));
1241 r = sd_bus_message_open_container(m, 'a', "(sv)");
1243 log_error("Failed to open container: %s", strerror(-r));
1247 if (!isempty(arg_slice)) {
1248 r = sd_bus_message_append(m, "(sv)", "Slice", "s", arg_slice);
1250 log_error("Failed to append slice: %s", strerror(-r));
1255 r = sd_bus_message_append(m, "(sv)", "DevicePolicy", "s", "strict");
1257 log_error("Failed to add device policy: %s", strerror(-r));
1261 r = sd_bus_message_append(m, "(sv)", "DeviceAllow", "a(ss)", 10,
1262 /* Allow the container to
1263 * access and create the API
1264 * device nodes, so that
1265 * PrivateDevices= in the
1266 * container can work
1271 "/dev/random", "rwm",
1272 "/dev/urandom", "rwm",
1274 /* Allow the container
1275 * access to ptys. However,
1277 * container to ever create
1278 * these device nodes. */
1279 "/dev/pts/ptmx", "rw",
1281 /* Allow the container
1282 * access to all kdbus
1283 * devices. Again, the
1284 * container cannot create
1285 * these nodes, only use
1286 * them. We use a pretty
1287 * open match here, so that
1288 * the kernel API can still
1291 "char-kdbus/*", "rw");
1293 log_error("Failed to add device whitelist: %s", strerror(-r));
1297 r = sd_bus_message_close_container(m);
1299 log_error("Failed to close container: %s", strerror(-r));
1303 r = sd_bus_call(bus, m, 0, &error, NULL);
1307 log_error("Failed to register machine: %s", bus_error_message(&error, r));
/* Asks org.freedesktop.machine1 to terminate the machine.
 *
 * A first call on the Manager interface yields an object path, which is
 * read from the reply. A second call on the Machine interface is then
 * issued. Failures of either bus call are logged at debug level only.
 * NOTE(review): the method names are on lines not visible here. */
1314 static int terminate_machine(pid_t pid) {
1315 _cleanup_bus_error_free_ sd_bus_error error = SD_BUS_ERROR_NULL;
1316 _cleanup_bus_message_unref_ sd_bus_message *reply = NULL;
1317 _cleanup_bus_unref_ sd_bus *bus = NULL;
1324 r = sd_bus_default_system(&bus);
1326 log_error("Failed to open system bus: %s", strerror(-r));
1330 r = sd_bus_call_method(
1332 "org.freedesktop.machine1",
1333 "/org/freedesktop/machine1",
1334 "org.freedesktop.machine1.Manager",
1341 /* Note that the machine might already have been
1342 * cleaned up automatically, hence don't consider it a
1343 * failure if we cannot get the machine object. */
1344 log_debug("Failed to get machine: %s", bus_error_message(&error, r));
/* The reply carries the machine's object path ("o"). */
1348 r = sd_bus_message_read(reply, "o", &path);
1350 return bus_log_parse_error(r);
1352 r = sd_bus_call_method(
1354 "org.freedesktop.machine1",
1356 "org.freedesktop.machine1.Machine",
1362 log_debug("Failed to terminate machine: %s", bus_error_message(&error, r));
/* Resets the audit login UID of the current process.
 *
 * /proc/self/loginuid is read; if it does not already contain "4294967295"
 * that value is written to it. A failed write produces a long explanatory
 * error message. arg_share_system is checked first; presumably nothing is
 * done in that mode (branch body not visible here).
 *
 * NOTE(review): the runtime message below reads "or to off auditing",
 * which looks like a missing word ("turn off"); left untouched here. */
1369 static int reset_audit_loginuid(void) {
1370 _cleanup_free_ char *p = NULL;
1373 if (arg_share_system)
1376 r = read_one_line_file("/proc/self/loginuid", &p);
1380 log_error("Failed to read /proc/self/loginuid: %s", strerror(-r));
1384 /* Already reset? */
1385 if (streq(p, "4294967295"))
1388 r = write_string_file("/proc/self/loginuid", "4294967295");
1390 log_error("Failed to reset audit login UID. This probably means that your kernel is too\n"
1391 "old and you have audit enabled. Note that the auditing subsystem is known to\n"
1392 "be incompatible with containers on old kernels. Please make sure to upgrade\n"
1393 "your kernel or to off auditing with 'audit=0' on the kernel command line before\n"
1394 "using systemd-nspawn. Sleeping for 5s... (%s)\n", strerror(-r));
/* Creates a veth pair for the container via rtnetlink.
 *
 * The host-side interface name is written to iface_name: the prefix "vb-"
 * (bridge mode) or "ve-" followed by the machine name, truncated so that it
 * fits IFNAMSIZ including the terminating NUL. The peer is named "host0"
 * and assigned to the network namespace of pid. Only applies when both
 * arg_private_network and arg_network_veth are set. Every failing netlink
 * step is logged with log_error(). */
1402 static int setup_veth(pid_t pid, char iface_name[IFNAMSIZ]) {
1403 _cleanup_rtnl_message_unref_ sd_rtnl_message *m = NULL;
1404 _cleanup_rtnl_unref_ sd_rtnl *rtnl = NULL;
1407 if (!arg_private_network)
1410 if (!arg_network_veth)
1413 /* Use two different interface name prefixes depending whether
1414 * we are in bridge mode or not. */
1415 if (arg_network_bridge)
1416 memcpy(iface_name, "vb-", 3);
1418 memcpy(iface_name, "ve-", 3);
1420 strncpy(iface_name+3, arg_machine, IFNAMSIZ - 3);
/* strncpy() does not terminate the buffer when the machine name fills it
 * completely, so force a NUL into the last byte. */
iface_name[IFNAMSIZ - 1] = 0;
1422 r = sd_rtnl_open(&rtnl, 0);
1424 log_error("Failed to connect to netlink: %s", strerror(-r));
1428 r = sd_rtnl_message_new_link(rtnl, &m, RTM_NEWLINK, 0);
1430 log_error("Failed to allocate netlink message: %s", strerror(-r));
1434 r = sd_rtnl_message_append_string(m, IFLA_IFNAME, iface_name);
1436 log_error("Failed to add netlink interface name: %s", strerror(-r));
/* Nested attributes: LINKINFO { KIND="veth", INFO_DATA { PEER { ... } } }. */
1440 r = sd_rtnl_message_open_container(m, IFLA_LINKINFO);
1442 log_error("Failed to open netlink container: %s", strerror(-r));
1446 r = sd_rtnl_message_append_string(m, IFLA_INFO_KIND, "veth");
1448 log_error("Failed to append netlink kind: %s", strerror(-r));
1452 r = sd_rtnl_message_open_container(m, IFLA_INFO_DATA);
1454 log_error("Failed to open netlink container: %s", strerror(-r));
1458 r = sd_rtnl_message_open_container(m, VETH_INFO_PEER);
1460 log_error("Failed to open netlink container: %s", strerror(-r));
1464 r = sd_rtnl_message_append_string(m, IFLA_IFNAME, "host0");
1466 log_error("Failed to add netlink interface name: %s", strerror(-r));
1470 r = sd_rtnl_message_append_u32(m, IFLA_NET_NS_PID, pid);
1472 log_error("Failed to add netlink namespace field: %s", strerror(-r));
/* Close the three containers opened above, innermost first. */
1476 r = sd_rtnl_message_close_container(m);
1478 log_error("Failed to close netlink container: %s", strerror(-r));
1482 r = sd_rtnl_message_close_container(m);
1484 log_error("Failed to close netlink container: %s", strerror(-r));
1488 r = sd_rtnl_message_close_container(m);
1490 log_error("Failed to close netlink container: %s", strerror(-r));
1494 r = sd_rtnl_call(rtnl, m, 0, NULL);
1496 log_error("Failed to add new veth interfaces: %s", strerror(-r));
/* Attaches the host-side veth interface to the configured bridge.
 *
 * Only applies when arg_private_network, arg_network_veth and
 * arg_network_bridge are all set. The bridge name is resolved to an
 * interface index, then an RTM_SETLINK message naming veth_name with
 * IFLA_MASTER set to that index is sent. Failures are logged as errors. */
1503 static int setup_bridge(const char veth_name[]) {
1504 _cleanup_rtnl_message_unref_ sd_rtnl_message *m = NULL;
1505 _cleanup_rtnl_unref_ sd_rtnl *rtnl = NULL;
1508 if (!arg_private_network)
1511 if (!arg_network_veth)
1514 if (!arg_network_bridge)
1517 bridge = (int) if_nametoindex(arg_network_bridge);
1519 log_error("Failed to resolve interface %s: %m", arg_network_bridge);
1523 r = sd_rtnl_open(&rtnl, 0);
1525 log_error("Failed to connect to netlink: %s", strerror(-r));
1529 r = sd_rtnl_message_new_link(rtnl, &m, RTM_SETLINK, 0);
1531 log_error("Failed to allocate netlink message: %s", strerror(-r));
1535 r = sd_rtnl_message_append_string(m, IFLA_IFNAME, veth_name);
1537 log_error("Failed to add netlink interface name field: %s", strerror(-r));
1541 r = sd_rtnl_message_append_u32(m, IFLA_MASTER, bridge);
1543 log_error("Failed to add netlink master field: %s", strerror(-r));
1547 r = sd_rtnl_call(rtnl, m, 0, NULL);
1549 log_error("Failed to add veth interface to bridge: %s", strerror(-r));
/* Resolves a network interface name and checks it with udev.
 *
 * The name is turned into an interface index, the matching udev device is
 * looked up through the device ID "n<index>", and the device must report
 * itself as initialized. Callers in this file assign the result to an
 * interface index variable; the return statements are not visible here. */
1556 static int parse_interface(struct udev *udev, const char *name) {
1557 _cleanup_udev_device_unref_ struct udev_device *d = NULL;
/* Room for the "n" prefix plus a decimal int and terminator. */
1558 char ifi_str[2 + DECIMAL_STR_MAX(int)];
1561 ifi = (int) if_nametoindex(name);
1563 log_error("Failed to resolve interface %s: %m", name);
1567 sprintf(ifi_str, "n%i", ifi);
1568 d = udev_device_new_from_device_id(udev, ifi_str);
1570 log_error("Failed to get udev device for interface %s: %m", name);
1574 if (udev_device_get_is_initialized(d) <= 0) {
1575 log_error("Network interface %s is not initialized yet.", name);
/* Moves every interface listed in arg_network_interfaces into the network
 * namespace of pid.
 *
 * Only applies with arg_private_network set and a non-empty list. For each
 * name the interface index is obtained through parse_interface() and an
 * RTM_NEWLINK message for that index carrying IFLA_NET_NS_PID = pid is
 * sent. Failures are logged as errors. */
1582 static int move_network_interfaces(pid_t pid) {
1583 _cleanup_udev_unref_ struct udev *udev = NULL;
1584 _cleanup_rtnl_unref_ sd_rtnl *rtnl = NULL;
1588 if (!arg_private_network)
1591 if (strv_isempty(arg_network_interfaces))
1594 r = sd_rtnl_open(&rtnl, 0);
1596 log_error("Failed to connect to netlink: %s", strerror(-r));
1602 log_error("Failed to connect to udev.");
1606 STRV_FOREACH(i, arg_network_interfaces) {
1607 _cleanup_rtnl_message_unref_ sd_rtnl_message *m = NULL;
1610 ifi = parse_interface(udev, *i);
1614 r = sd_rtnl_message_new_link(rtnl, &m, RTM_NEWLINK, ifi);
1616 log_error("Failed to allocate netlink message: %s", strerror(-r));
1620 r = sd_rtnl_message_append_u32(m, IFLA_NET_NS_PID, pid);
1622 log_error("Failed to append namespace PID to netlink message: %s", strerror(-r));
1626 r = sd_rtnl_call(rtnl, m, 0, NULL);
1628 log_error("Failed to move interface %s to namespace: %s", *i, strerror(-r));
/* Creates one macvlan interface per entry of arg_network_macvlan and
 * places it in the network namespace of pid.
 *
 * Only applies with arg_private_network set and a non-empty list. Each new
 * link is named "mv-<parent>" (shortened to IFNAMSIZ-1 characters), refers
 * to the parent interface index via IFLA_LINK and uses
 * MACVLAN_MODE_BRIDGE. Failures are logged as errors. */
1636 static int setup_macvlan(pid_t pid) {
1637 _cleanup_udev_unref_ struct udev *udev = NULL;
1638 _cleanup_rtnl_unref_ sd_rtnl *rtnl = NULL;
1642 if (!arg_private_network)
1645 if (strv_isempty(arg_network_macvlan))
1648 r = sd_rtnl_open(&rtnl, 0);
1650 log_error("Failed to connect to netlink: %s", strerror(-r));
1656 log_error("Failed to connect to udev.");
1660 STRV_FOREACH(i, arg_network_macvlan) {
1661 _cleanup_rtnl_message_unref_ sd_rtnl_message *m = NULL;
1662 _cleanup_free_ char *n = NULL;
/* ifi: index of the parent (existing) interface. */
1665 ifi = parse_interface(udev, *i);
1669 r = sd_rtnl_message_new_link(rtnl, &m, RTM_NEWLINK, 0);
1671 log_error("Failed to allocate netlink message: %s", strerror(-r));
1675 r = sd_rtnl_message_append_u32(m, IFLA_LINK, ifi);
1677 log_error("Failed to add netlink interface index: %s", strerror(-r));
/* Derive the new interface name from the parent's name. */
1681 n = strappend("mv-", *i);
1685 strshorten(n, IFNAMSIZ-1);
1687 r = sd_rtnl_message_append_string(m, IFLA_IFNAME, n);
1689 log_error("Failed to add netlink interface name: %s", strerror(-r));
1693 r = sd_rtnl_message_append_u32(m, IFLA_NET_NS_PID, pid);
1695 log_error("Failed to add netlink namespace field: %s", strerror(-r));
/* Nested attributes: LINKINFO { KIND="macvlan", INFO_DATA { MODE } }. */
1699 r = sd_rtnl_message_open_container(m, IFLA_LINKINFO);
1701 log_error("Failed to open netlink container: %s", strerror(-r));
1705 r = sd_rtnl_message_append_string(m, IFLA_INFO_KIND, "macvlan");
1707 log_error("Failed to append netlink kind: %s", strerror(-r));
1711 r = sd_rtnl_message_open_container(m, IFLA_INFO_DATA);
1713 log_error("Failed to open netlink container: %s", strerror(-r));
1717 r = sd_rtnl_message_append_u32(m, IFLA_MACVLAN_MODE, MACVLAN_MODE_BRIDGE);
1719 log_error("Failed to append macvlan mode: %s", strerror(-r));
1723 r = sd_rtnl_message_close_container(m);
1725 log_error("Failed to close netlink container: %s", strerror(-r));
1729 r = sd_rtnl_message_close_container(m);
1731 log_error("Failed to close netlink container: %s", strerror(-r));
1735 r = sd_rtnl_call(rtnl, m, 0, NULL);
1737 log_error("Failed to add new macvlan interfaces: %s", strerror(-r));
/*
 * Install a seccomp filter that makes creation of audit netlink sockets
 * fail inside the container.
 *
 * Visible steps: a filter context is created with default action
 * SCMP_ACT_ALLOW, secondary architectures are added, one rule is added
 * that answers with errno EAFNOSUPPORT when argument 0 equals AF_NETLINK
 * and argument 2 equals NETLINK_AUDIT, the SCMP_FLTATR_CTL_NNP attribute
 * is set to 0 so loading the filter does not turn on NO_NEW_PRIVS, the
 * filter is loaded, and the context is released.
 *
 * The syscall the rule is attached to is not visible in this view; the
 * explanatory text below names socket().
 *
 * NOTE(review): the free text after the seccomp declaration is the body
 * of the original explanatory comment; its delimiters are not visible in
 * this view.
 */
1745 static int audit_still_doesnt_work_in_containers(void) {
1748 scmp_filter_ctx seccomp;
1752 Audit is broken in containers, much of the userspace audit
1753 hookup will fail if running inside a container. We don't
1754 care and just turn off creation of audit sockets.
1756 This will make socket(AF_NETLINK, *, NETLINK_AUDIT) fail
1757 with EAFNOSUPPORT which audit userspace uses as indication
1758 that audit is disabled in the kernel.
1761 seccomp = seccomp_init(SCMP_ACT_ALLOW);
1765 r = seccomp_add_secondary_archs(seccomp);
1767 log_error("Failed to add secondary archs to seccomp filter: %s", strerror(-r));
1771 r = seccomp_rule_add(
1773 SCMP_ACT_ERRNO(EAFNOSUPPORT),
1776 SCMP_A0(SCMP_CMP_EQ, AF_NETLINK),
1777 SCMP_A2(SCMP_CMP_EQ, NETLINK_AUDIT));
1779 log_error("Failed to add audit seccomp rule: %s", strerror(-r));
1783 r = seccomp_attr_set(seccomp, SCMP_FLTATR_CTL_NNP, 0);
1785 log_error("Failed to unset NO_NEW_PRIVS: %s", strerror(-r));
1789 r = seccomp_load(seccomp);
1791 log_error("Failed to install seccomp audit filter: %s", strerror(-r));
1794 seccomp_release(seccomp);
/*
 * Open the image named by arg_image and make it available as a block
 * device.
 *
 * device_path: out parameter, receives the path of the device to use.
 * loop_nr:     out parameter; its assignment is not visible in this view,
 *              presumably the allocated loop device number - TODO confirm.
 *
 * If arg_image already is a block device a copy of its path is made.
 * Otherwise it must be a regular file: a free loop device is requested
 * from /dev/loop-control, opened, bound to the image with LOOP_SET_FD and
 * configured with LO_FLAGS_AUTOCLEAR|LO_FLAGS_PARTSCAN (plus
 * LO_FLAGS_READ_ONLY on the line that ORs it in; its condition is not
 * visible here).
 *
 * The return statements are not visible. main() stores the result in
 * image_fd, so presumably an open descriptor is returned - TODO confirm.
 */
1802 static int setup_image(char **device_path, int *loop_nr) {
1803 struct loop_info64 info = {
1804 .lo_flags = LO_FLAGS_AUTOCLEAR|LO_FLAGS_PARTSCAN
1806 _cleanup_close_ int fd = -1, control = -1, loop = -1;
1807 _cleanup_free_ char* loopdev = NULL;
1811 assert(device_path);
/* Access mode follows arg_read_only. */
1814 fd = open(arg_image, O_CLOEXEC|(arg_read_only ? O_RDONLY : O_RDWR)|O_NONBLOCK|O_NOCTTY);
1816 log_error("Failed to open %s: %m", arg_image);
1820 if (fstat(fd, &st) < 0) {
1821 log_error("Failed to stat %s: %m", arg_image);
/* Block devices are used directly, no loop device is needed. */
1825 if (S_ISBLK(st.st_mode)) {
1828 p = strdup(arg_image);
1842 if (!S_ISREG(st.st_mode)) {
/* NOTE(review): %m expands to the errno text, but this branch is reached
 * through a mode test, not a failing call, so the appended text is
 * unrelated to the problem being reported. Consider dropping %m. */
1843 log_error("%s is not a regular file or block device: %m", arg_image);
1847 control = open("/dev/loop-control", O_RDWR|O_CLOEXEC|O_NOCTTY|O_NONBLOCK);
1849 log_error("Failed to open /dev/loop-control: %m");
/* The ioctl result is used as the number of the free loop device. */
1853 nr = ioctl(control, LOOP_CTL_GET_FREE);
1855 log_error("Failed to allocate loop device: %m");
1859 if (asprintf(&loopdev, "/dev/loop%i", nr) < 0)
1862 loop = open(loopdev, O_CLOEXEC|(arg_read_only ? O_RDONLY : O_RDWR)|O_NONBLOCK|O_NOCTTY);
1864 log_error("Failed to open loop device %s: %m", loopdev);
/* Attach the image file as backing store of the loop device. */
1868 if (ioctl(loop, LOOP_SET_FD, fd) < 0) {
1869 log_error("Failed to set loopback file descriptor on %s: %m", loopdev);
1874 info.lo_flags |= LO_FLAGS_READ_ONLY;
1876 if (ioctl(loop, LOOP_SET_STATUS64, &info) < 0) {
1877 log_error("Failed to set loopback settings on %s: %m", loopdev);
/* Hand the path string to the caller. loopdev carries a cleanup
 * attribute, so it presumably gets reset on a line not visible here. */
1881 *device_path = loopdev;
/*
 * Inspect the partition table of the image behind fd and pick the
 * partitions to use for the root, /home and /srv file systems.
 *
 * root_device, home_device, srv_device: out parameters that receive
 *     device node paths (copies made with strdup) of the chosen
 *     partitions.
 * root_device_rw, home_device_rw, srv_device_rw: out parameters, false
 *     when the chosen partition carries GPT_FLAG_READ_ONLY.
 *
 * The first and the last parameter lines of the signature are not visible
 * in this view. The body uses fd, and main() passes image_fd first and
 * the address of a bool named secondary last.
 *
 * Only GPT images are accepted. Partitions are matched by GPT type id
 * (GPT_HOME, GPT_SRV, GPT_ROOT_NATIVE and GPT_ROOT_SECONDARY when
 * defined). Partitions flagged GPT_FLAG_NO_AUTO are tested for and
 * presumably skipped - TODO confirm, the guarded statement is not visible.
 *
 * Without blkid support the function only logs that --image= is
 * unsupported (last visible line). The return statements are not visible.
 */
1892 static int dissect_image(
1894 char **root_device, bool *root_device_rw,
1895 char **home_device, bool *home_device_rw,
1896 char **srv_device, bool *srv_device_rw,
/* Partition number of the current pick per role, -1 while none. */
1900 int home_nr = -1, root_nr = -1, secondary_root_nr = -1, srv_nr = -1;
1901 _cleanup_free_ char *home = NULL, *root = NULL, *secondary_root = NULL, *srv = NULL;
1902 _cleanup_udev_enumerate_unref_ struct udev_enumerate *e = NULL;
1903 _cleanup_udev_device_unref_ struct udev_device *d = NULL;
1904 _cleanup_blkid_free_probe_ blkid_probe b = NULL;
1905 _cleanup_udev_unref_ struct udev *udev = NULL;
1906 struct udev_list_entry *first, *item;
1907 bool home_rw = true, root_rw = true, secondary_root_rw = true, srv_rw = true;
1908 const char *pttype = NULL;
1914 assert(root_device);
1915 assert(home_device);
/* Step 1: probe the partition table with libblkid. */
1919 b = blkid_new_probe();
1924 r = blkid_probe_set_device(b, fd, 0, 0);
1929 log_error("Failed to set device on blkid probe: %m");
1933 blkid_probe_enable_partitions(b, 1);
1934 blkid_probe_set_partitions_flags(b, BLKID_PARTS_ENTRY_DETAILS);
1937 r = blkid_do_safeprobe(b);
/* Per libblkid, -2 means an ambivalent result and 1 means nothing was
 * detected; both are reported as no usable partition table. Any other
 * non-zero value is treated as a probing failure. */
1938 if (r == -2 || r == 1) {
1939 log_error("Failed to identify any partition table on %s.\n"
1940 "Note that the disk image needs to follow http://www.freedesktop.org/wiki/Specifications/DiscoverablePartitionsSpec/ to be supported by systemd-nspawn.", arg_image);
1942 } else if (r != 0) {
1945 log_error("Failed to probe: %m");
/* The lookup result is not checked here; pttype stays NULL when the
 * value is missing and streq_ptr is handed that NULL. */
1949 blkid_probe_lookup_value(b, "PTTYPE", &pttype, NULL);
1950 if (!streq_ptr(pttype, "gpt")) {
1951 log_error("Image %s does not carry a GUID Partition Table.\n"
1952 "Note that the disk image needs to follow http://www.freedesktop.org/wiki/Specifications/DiscoverablePartitionsSpec/ to be supported by systemd-nspawn.", arg_image);
1957 pl = blkid_probe_get_partitions(b);
1962 log_error("Failed to list partitions of %s", arg_image);
/* Step 2: enumerate the child devices of the image block device through
 * udev to learn the device node of each partition. */
1970 if (fstat(fd, &st) < 0) {
1971 log_error("Failed to stat block device: %m");
1975 d = udev_device_new_from_devnum(udev, 'b', st.st_rdev);
1979 e = udev_enumerate_new(udev);
1983 r = udev_enumerate_add_match_parent(e, d);
1987 r = udev_enumerate_scan_devices(e);
1989 log_error("Failed to scan for partition devices of %s: %s", arg_image, strerror(-r));
1993 first = udev_enumerate_get_list_entry(e);
1994 udev_list_entry_foreach(item, first) {
1995 _cleanup_udev_device_unref_ struct udev_device *q;
1996 const char *stype, *node;
1997 unsigned long long flags;
2004 q = udev_device_new_from_syspath(udev, udev_list_entry_get_name(item));
2009 log_error("Failed to get partition device of %s: %m", arg_image);
2013 qn = udev_device_get_devnum(q);
/* Same device number as the image itself: this entry is the whole disk,
 * not a partition. */
2017 if (st.st_rdev == qn)
2020 node = udev_device_get_devnode(q);
/* Map the udev device back to its blkid partition entry. */
2024 pp = blkid_partlist_devno_to_partition(pl, qn);
2028 flags = blkid_partition_get_flags(pp);
2029 if (flags & GPT_FLAG_NO_AUTO)
2032 nr = blkid_partition_get_partno(pp);
2036 stype = blkid_partition_get_type_string(pp);
2040 if (sd_id128_from_string(stype, &type_id) < 0)
/* Step 3: classify by GPT type id. Each branch tests whether a pick
 * already exists with a partition number that is not higher than this
 * one, so presumably the lowest-numbered partition of each type wins -
 * TODO confirm, the guarded statements are not visible. */
2043 if (sd_id128_equal(type_id, GPT_HOME)) {
2045 if (home && nr >= home_nr)
2049 home_rw = !(flags & GPT_FLAG_READ_ONLY);
2052 home = strdup(node);
2055 } else if (sd_id128_equal(type_id, GPT_SRV)) {
2057 if (srv && nr >= srv_nr)
2061 srv_rw = !(flags & GPT_FLAG_READ_ONLY);
2068 #ifdef GPT_ROOT_NATIVE
2069 else if (sd_id128_equal(type_id, GPT_ROOT_NATIVE)) {
2071 if (root && nr >= root_nr)
2075 root_rw = !(flags & GPT_FLAG_READ_ONLY);
2078 root = strdup(node);
2083 #ifdef GPT_ROOT_SECONDARY
2084 else if (sd_id128_equal(type_id, GPT_ROOT_SECONDARY)) {
2086 if (secondary_root && nr >= secondary_root_nr)
2089 secondary_root_nr = nr;
2090 secondary_root_rw = !(flags & GPT_FLAG_READ_ONLY);
/* Drop a previously remembered node before storing the new copy. */
2093 free(secondary_root);
2094 secondary_root = strdup(node);
2095 if (!secondary_root)
/* Step 4: report results. A root partition is mandatory. */
2101 if (!root && !secondary_root) {
2102 log_error("Failed to identify root partition in disk image %s.\n"
2103 "Note that the disk image needs to follow http://www.freedesktop.org/wiki/Specifications/DiscoverablePartitionsSpec/ to be supported by systemd-nspawn.", arg_image);
2108 *root_device = root;
2111 *root_device_rw = root_rw;
/* Fallback branch, taken when the secondary root is the pick. */
2113 } else if (secondary_root) {
2114 *root_device = secondary_root;
/* Ownership moved to the caller; clear the local so the cleanup
 * attribute does not free the string that was just handed out. */
2115 secondary_root = NULL;
2117 *root_device_rw = secondary_root_rw;
2122 *home_device = home;
2125 *home_device_rw = home_rw;
2132 *srv_device_rw = srv_rw;
2137 log_error("--image= is not supported, compiled without blkid support.");
/*
 * Detect the file system type of the block device what with libblkid and
 * mount it.
 *
 * what:      device node to mount.
 * where:     base directory of the mount target.
 * directory: path appended to where to form the target. mount_devices()
 *            passes NULL for the root file system; how NULL is handled is
 *            not visible in this view.
 * rw:        false adds MS_RDONLY to the mount flags.
 *
 * The mount always uses MS_NODEV. Devices whose type is crypto_LUKS are
 * rejected. Without blkid support the function only logs that --image=
 * is unsupported (last visible line). The return statements are not
 * visible in this view.
 */
2142 static int mount_device(const char *what, const char *where, const char *directory, bool rw) {
2144 _cleanup_blkid_free_probe_ blkid_probe b = NULL;
2145 const char *fstype, *p;
/* strappenda presumably builds the string on the stack, so p needs no
 * free - TODO confirm. */
2155 p = strappenda(where, directory);
2160 b = blkid_new_probe_from_filename(what);
2164 log_error("Failed to allocate prober for %s: %m", what);
/* Only superblock probing, limited to the TYPE value. */
2168 blkid_probe_enable_superblocks(b, 1);
2169 blkid_probe_set_superblocks_flags(b, BLKID_SUBLKS_TYPE);
2172 r = blkid_do_safeprobe(b);
/* NOTE(review): libblkid documents 1 as nothing detected, -2 as an
 * ambivalent result and -1 as an error. dissect_image in this file tests
 * for -2 together with 1, while this test uses -1. As written a real
 * probing error is reported as an undeterminable file system type and an
 * ambivalent result falls into the errno-based branch. Probably -2 was
 * intended - confirm and align. */
2173 if (r == -1 || r == 1) {
2174 log_error("Cannot determine file system type of %s", what);
2176 } else if (r != 0) {
2179 log_error("Failed to probe %s: %m", what);
2184 if (blkid_probe_lookup_value(b, "TYPE", &fstype, NULL) < 0) {
2187 log_error("Failed to determine file system type of %s", what);
/* Encrypted containers cannot be mounted directly. */
2191 if (streq(fstype, "crypto_LUKS")) {
2192 log_error("nspawn currently does not support LUKS disk images.");
2196 if (mount(what, p, fstype, MS_NODEV|(rw ? 0 : MS_RDONLY), NULL) < 0) {
2197 log_error("Failed to mount %s: %m", what);
2203 log_error("--image= is not supported, compiled without blkid support.");
/*
 * Mount the partitions chosen for the container: the root device on
 * arg_directory itself, the home device on /home and the srv device on
 * /srv below it.
 *
 * root_device, home_device, srv_device: device nodes to mount.
 * *_rw: whether the respective device may be mounted writable.
 *
 * The first parameter line of the signature is not visible in this view;
 * main() passes arg_directory in that position, while the calls below use
 * the global arg_directory directly. Whether each mount is skipped for a
 * NULL device is not visible either. Failures are logged with
 * strerror(-r); the return statements are not visible.
 */
2208 static int mount_devices(
2210 const char *root_device, bool root_device_rw,
2211 const char *home_device, bool home_device_rw,
2212 const char *srv_device, bool srv_device_rw) {
/* NULL as directory: the root file system goes onto arg_directory. */
2218 r = mount_device(root_device, arg_directory, NULL, root_device_rw);
2220 log_error("Failed to mount root directory: %s", strerror(-r));
2226 r = mount_device(home_device, arg_directory, "/home", home_device_rw);
2228 log_error("Failed to mount home directory: %s", strerror(-r));
2234 r = mount_device(srv_device, arg_directory, "/srv", srv_device_rw);
2236 log_error("Failed to mount server data directory: %s", strerror(-r));
/*
 * Tear down the loop device that was set up for the image.
 *
 * nr:       loop device number handed to LOOP_CTL_REMOVE.
 * image_fd: optional pointer to the open loop device descriptor. When it
 *           points to a valid descriptor the backing file is detached
 *           with LOOP_CLR_FD and the descriptor is closed.
 *
 * Neither ioctl result is checked, so the teardown is best effort. Any
 * early exit for an unset nr or a failed open is not visible in this
 * view.
 */
2244 static void loop_remove(int nr, int *image_fd) {
2245 _cleanup_close_ int control = -1;
2250 if (image_fd && *image_fd >= 0) {
2251 ioctl(*image_fd, LOOP_CLR_FD);
2252 close_nointr_nofail(*image_fd);
/* Ask the loop control device to remove the now unused loop device. */
2256 control = open("/dev/loop-control", O_RDWR|O_CLOEXEC|O_NOCTTY|O_NONBLOCK);
2260 ioctl(control, LOOP_CTL_REMOVE, nr);
/*
 * Fork a child that runs getent for one database and key and connect its
 * standard output to a pipe.
 *
 * database: getent database name, passed as first argument.
 * key:      key to look up, passed as second argument.
 * rpid:     out parameter; its assignment is not visible in this view,
 *           callers later wait on the pid they passed in here.
 *
 * The return statements are not visible. Callers hand the result to
 * fdopen for reading, so presumably the read end of the pipe is returned
 * on success - TODO confirm.
 */
2263 static int spawn_getent(const char *database, const char *key, pid_t *rpid) {
2271 if (pipe2(pipe_fds, O_CLOEXEC) < 0) {
2272 log_error("Failed to allocate pipe: %m");
2278 log_error("Failed to fork getent child: %m");
2280 } else if (pid == 0) {
/* Child: run getent with an empty environment. */
2282 char *empty_env = NULL;
/* Standard output becomes the write end of the pipe. */
2284 if (dup3(pipe_fds[1], STDOUT_FILENO, 0) < 0)
2285 _exit(EXIT_FAILURE);
/* Close the original pipe descriptors unless they already are one of the
 * standard descriptors. */
2287 if (pipe_fds[0] > 2)
2288 close_nointr_nofail(pipe_fds[0]);
2289 if (pipe_fds[1] > 2)
2290 close_nointr_nofail(pipe_fds[1]);
/* Standard input and standard error are pointed at /dev/null. */
2292 nullfd = open("/dev/null", O_RDWR);
2294 _exit(EXIT_FAILURE);
2296 if (dup3(nullfd, STDIN_FILENO, 0) < 0)
2297 _exit(EXIT_FAILURE);
2299 if (dup3(nullfd, STDERR_FILENO, 0) < 0)
2300 _exit(EXIT_FAILURE);
2303 close_nointr_nofail(nullfd);
2305 reset_all_signal_handlers();
2306 close_all_fds(NULL, 0);
/* Try both common install locations. The second exec only runs when the
 * first one failed, since a successful exec does not return. */
2308 execle("/usr/bin/getent", "getent", database, key, NULL, &empty_env);
2309 execle("/bin/getent", "getent", database, key, NULL, &empty_env);
2310 _exit(EXIT_FAILURE);
/* Parent: the write end is only needed by the child. */
2313 close_nointr_nofail(pipe_fds[1]);
/*
 * Switch the calling process to the user named by arg_user.
 *
 * _home: out parameter; its assignment is not visible in this view.
 *        main() passes the address of a variable named home and later
 *        uses it for the HOME= environment entry.
 *
 * When no user is configured, or it is root or 0, all group and user ids
 * are reset to 0. Otherwise the passwd entry and the group list of the
 * user are obtained by running getent through spawn_getent, the home
 * directory is created, the three standard descriptors are chowned to
 * the user, and supplementary groups, gid and uid are set in that order.
 *
 * The return statements are not visible in this view.
 */
2321 static int change_uid_gid(char **_home) {
2323 _cleanup_strv_free_ char **passwd = NULL;
2324 char line[LINE_MAX], *w, *x, *state, *u, *g, *h;
/* Despite its name this array collects the ids that are later handed to
 * setgroups, that is supplementary group ids. */
2325 _cleanup_free_ uid_t *uids = NULL;
2326 _cleanup_free_ char *home = NULL;
2327 _cleanup_fclose_ FILE *f = NULL;
2328 _cleanup_close_ int fd = -1;
2329 unsigned n_uids = 0;
2338 if (!arg_user || streq(arg_user, "root") || streq(arg_user, "0")) {
2339 /* Reset everything fully to 0, just in case */
2341 if (setgroups(0, NULL) < 0) {
2342 log_error("setgroups() failed: %m");
/* NOTE(review): the two messages below name setregid and setreuid, but
 * the calls that failed are setresgid and setresuid. */
2346 if (setresgid(0, 0, 0) < 0) {
2347 log_error("setregid() failed: %m");
2351 if (setresuid(0, 0, 0) < 0) {
2352 log_error("setreuid() failed: %m");
2360 /* First, get user credentials */
2361 fd = spawn_getent("passwd", arg_user, &pid);
2365 f = fdopen(fd, "r");
/* Only the first output line is read. */
2370 if (!fgets(line, sizeof(line), f)) {
2373 log_error("Failed to resolve user %s.", arg_user);
2377 log_error("Failed to read from getent: %m");
2383 wait_for_terminate_and_warn("getent passwd", pid);
/* Walk the colon separated passwd fields: name, password, UID, GID,
 * GECOS, home directory. Each error message names the field whose
 * terminating colon was not found. */
2385 x = strchr(line, ':');
2387 log_error("/etc/passwd entry has invalid user field.");
2391 u = strchr(x+1, ':');
2393 log_error("/etc/passwd entry has invalid password field.");
2400 log_error("/etc/passwd entry has invalid UID field.");
2408 log_error("/etc/passwd entry has invalid GID field.");
2413 h = strchr(x+1, ':');
2415 log_error("/etc/passwd entry has invalid GECOS field.");
2422 log_error("/etc/passwd entry has invalid home directory field.");
2428 r = parse_uid(u, &uid);
2430 log_error("Failed to parse UID of user.");
2434 r = parse_gid(g, &gid);
2436 log_error("Failed to parse GID of user.");
2444 /* Second, get group memberships */
2445 fd = spawn_getent("initgroups", arg_user, &pid);
2450 f = fdopen(fd, "r");
2455 if (!fgets(line, sizeof(line), f)) {
2457 log_error("Failed to resolve user %s.", arg_user);
2461 log_error("Failed to read from getent: %m");
2467 wait_for_terminate_and_warn("getent initgroups", pid);
2469 /* Skip over the username and subsequent separator whitespace */
2471 x += strcspn(x, WHITESPACE);
2472 x += strspn(x, WHITESPACE);
/* Every remaining word is parsed as one numeric id. */
2474 FOREACH_WORD(w, l, x, state) {
/* Grow the array by one slot before storing the next id. */
2480 if (!GREEDY_REALLOC(uids, sz, n_uids+1))
2483 r = parse_uid(c, &uids[n_uids++]);
2485 log_error("Failed to parse group data from getent.");
/* Parent directories are created with mode 0775, the home directory
 * itself with mode 0755 and owned by the target user. */
2490 r = mkdir_parents(home, 0775);
2492 log_error("Failed to make home root directory: %s", strerror(-r));
2496 r = mkdir_safe(home, 0755, uid, gid);
2498 log_error("Failed to make home directory: %s", strerror(-r));
/* Give the standard descriptors to the target user. The results are not
 * checked, so this is best effort. */
2502 fchown(STDIN_FILENO, uid, gid);
2503 fchown(STDOUT_FILENO, uid, gid);
2504 fchown(STDERR_FILENO, uid, gid);
/* Order matters: groups and gid are changed while still privileged, the
 * uid is dropped last. */
2506 if (setgroups(n_uids, uids) < 0) {
2507 log_error("Failed to set auxiliary groups: %m");
/* NOTE(review): same mismatch as above, the messages name setregid and
 * setreuid while the calls are setresgid and setresuid. */
2511 if (setresgid(gid, gid, gid) < 0) {
2512 log_error("setregid() failed: %m");
2516 if (setresuid(uid, uid, uid) < 0) {
2517 log_error("setreuid() failed: %m");
2529 int main(int argc, char *argv[]) {
2531 _cleanup_free_ char *kdbus_domain = NULL, *device_path = NULL, *root_device = NULL, *home_device = NULL, *srv_device = NULL;
2532 bool root_device_rw = true, home_device_rw = true, srv_device_rw = true;
2533 _cleanup_close_ int master = -1, kdbus_fd = -1, image_fd = -1;
2534 _cleanup_close_pipe_ int kmsg_socket_pair[2] = { -1, -1 };
2535 _cleanup_fdset_free_ FDSet *fds = NULL;
2536 int r = EXIT_FAILURE, k, n_fd_passed, loop_nr = -1;
2537 const char *console = NULL;
2538 char veth_name[IFNAMSIZ];
2539 bool secondary = false;
2543 log_parse_environment();
2546 k = parse_argv(argc, argv);
2555 if (arg_directory) {
2558 p = path_make_absolute_cwd(arg_directory);
2559 free(arg_directory);
2562 arg_directory = get_current_dir_name();
2564 if (!arg_directory) {
2565 log_error("Failed to determine path, please use -D.");
2568 path_kill_slashes(arg_directory);
2572 arg_machine = strdup(basename(arg_image ? arg_image : arg_directory));
2578 hostname_cleanup(arg_machine, false);
2579 if (isempty(arg_machine)) {
2580 log_error("Failed to determine machine name automatically, please use -M.");
2585 if (geteuid() != 0) {
2586 log_error("Need to be root.");
2590 if (sd_booted() <= 0) {
2591 log_error("Not running on a systemd system.");
2596 n_fd_passed = sd_listen_fds(false);
2597 if (n_fd_passed > 0) {
2598 k = fdset_new_listen_fds(&fds, false);
2600 log_error("Failed to collect file descriptors: %s", strerror(-k));
2604 fdset_close_others(fds);
2607 if (arg_directory) {
2608 if (path_equal(arg_directory, "/")) {
2609 log_error("Spawning container on root directory not supported.");
2614 if (path_is_os_tree(arg_directory) <= 0) {
2615 log_error("Directory %s doesn't look like an OS root directory (/etc/os-release is missing). Refusing.", arg_directory);
2621 p = strappenda(arg_directory,
2622 argc > optind && path_is_absolute(argv[optind]) ? argv[optind] : "/usr/bin/");
2623 if (access(p, F_OK) < 0) {
2624 log_error("Directory %s lacks the binary to execute or doesn't look like a binary tree. Refusing.", arg_directory);
2630 char template[] = "/tmp/nspawn-root-XXXXXX";
2632 if (!mkdtemp(template)) {
2633 log_error("Failed to create temporary directory: %m");
2638 arg_directory = strdup(template);
2639 if (!arg_directory) {
2644 image_fd = setup_image(&device_path, &loop_nr);
2650 r = dissect_image(image_fd, &root_device, &root_device_rw, &home_device, &home_device_rw, &srv_device, &srv_device_rw, &secondary);
2655 master = posix_openpt(O_RDWR|O_NOCTTY|O_CLOEXEC|O_NDELAY);
2657 log_error("Failed to acquire pseudo tty: %m");
2661 console = ptsname(master);
2663 log_error("Failed to determine tty name: %m");
2668 log_info("Spawning container %s on %s. Press ^] three times within 1s to abort execution.", arg_machine, arg_image ? arg_image : arg_directory);
2670 if (unlockpt(master) < 0) {
2671 log_error("Failed to unlock tty: %m");
2675 if (access("/dev/kdbus/control", F_OK) >= 0) {
2677 if (arg_share_system) {
2678 kdbus_domain = strdup("/dev/kdbus");
2679 if (!kdbus_domain) {
2686 ns = strappenda("machine-", arg_machine);
2687 kdbus_fd = bus_kernel_create_domain(ns, &kdbus_domain);
2689 log_debug("Failed to create kdbus domain: %s", strerror(-r));
2691 log_debug("Successfully created kdbus domain as %s", kdbus_domain);
2695 if (socketpair(AF_UNIX, SOCK_DGRAM|SOCK_NONBLOCK|SOCK_CLOEXEC, 0, kmsg_socket_pair) < 0) {
2696 log_error("Failed to create kmsg socket pair: %m");
2700 sd_notify(0, "READY=1");
2702 assert_se(sigemptyset(&mask) == 0);
2703 sigset_add_many(&mask, SIGCHLD, SIGWINCH, SIGTERM, SIGINT, -1);
2704 assert_se(sigprocmask(SIG_BLOCK, &mask, NULL) == 0);
2707 int parent_ready_fd = -1, child_ready_fd = -1;
2711 parent_ready_fd = eventfd(0, EFD_CLOEXEC);
2712 if (parent_ready_fd < 0) {
2713 log_error("Failed to create event fd: %m");
2717 child_ready_fd = eventfd(0, EFD_CLOEXEC);
2718 if (child_ready_fd < 0) {
2719 log_error("Failed to create event fd: %m");
2723 pid = syscall(__NR_clone,
2724 SIGCHLD|CLONE_NEWNS|
2725 (arg_share_system ? 0 : CLONE_NEWIPC|CLONE_NEWPID|CLONE_NEWUTS)|
2726 (arg_private_network ? CLONE_NEWNET : 0), NULL);
2728 if (errno == EINVAL)
2729 log_error("clone() failed, do you have namespace support enabled in your kernel? (You need UTS, IPC, PID and NET namespacing built in): %m");
2731 log_error("clone() failed: %m");
2738 _cleanup_free_ char *home = NULL;
2740 const char *envp[] = {
2741 "PATH=" DEFAULT_PATH_SPLIT_USR,
2742 "container=systemd-nspawn", /* LXC sets container=lxc, so follow the scheme here */
2747 NULL, /* container_uuid */
2748 NULL, /* LISTEN_FDS */
2749 NULL, /* LISTEN_PID */
2754 envp[n_env] = strv_find_prefix(environ, "TERM=");
2758 close_nointr_nofail(master);
2761 close_nointr(STDIN_FILENO);
2762 close_nointr(STDOUT_FILENO);
2763 close_nointr(STDERR_FILENO);
2765 close_nointr_nofail(kmsg_socket_pair[0]);
2766 kmsg_socket_pair[0] = -1;
2768 reset_all_signal_handlers();
2770 assert_se(sigemptyset(&mask) == 0);
2771 assert_se(sigprocmask(SIG_SETMASK, &mask, NULL) == 0);
2773 k = open_terminal(console, O_RDWR);
2774 if (k != STDIN_FILENO) {
2776 close_nointr_nofail(k);
2780 log_error("Failed to open console: %s", strerror(-k));
2784 if (dup2(STDIN_FILENO, STDOUT_FILENO) != STDOUT_FILENO ||
2785 dup2(STDIN_FILENO, STDERR_FILENO) != STDERR_FILENO) {
2786 log_error("Failed to duplicate console: %m");
2791 log_error("setsid() failed: %m");
2795 if (reset_audit_loginuid() < 0)
2798 if (prctl(PR_SET_PDEATHSIG, SIGKILL) < 0) {
2799 log_error("PR_SET_PDEATHSIG failed: %m");
2803 /* Mark everything as slave, so that we still
2804 * receive mounts from the real root, but don't
2805 * propagate mounts to the real root. */
2806 if (mount(NULL, "/", NULL, MS_SLAVE|MS_REC, NULL) < 0) {
2807 log_error("MS_SLAVE|MS_REC failed: %m");
2811 if (mount_devices(arg_directory,
2812 root_device, root_device_rw,
2813 home_device, home_device_rw,
2814 srv_device, srv_device_rw) < 0)
2817 /* Turn directory into bind mount */
2818 if (mount(arg_directory, arg_directory, "bind", MS_BIND|MS_REC, NULL) < 0) {
2819 log_error("Failed to make bind mount.");
2824 if (mount(arg_directory, arg_directory, "bind", MS_BIND|MS_REMOUNT|MS_RDONLY|MS_REC, NULL) < 0) {
2825 log_error("Failed to make read-only.");
2829 if (mount_all(arg_directory) < 0)
2832 if (copy_devnodes(arg_directory) < 0)
2835 if (setup_ptmx(arg_directory) < 0)
2838 dev_setup(arg_directory);
2840 if (audit_still_doesnt_work_in_containers() < 0)
2843 if (setup_dev_console(arg_directory, console) < 0)
2846 if (setup_kmsg(arg_directory, kmsg_socket_pair[1]) < 0)
2849 close_nointr_nofail(kmsg_socket_pair[1]);
2850 kmsg_socket_pair[1] = -1;
2852 if (setup_boot_id(arg_directory) < 0)
2855 if (setup_timezone(arg_directory) < 0)
2858 if (setup_resolv_conf(arg_directory) < 0)
2861 if (setup_journal(arg_directory) < 0)
2864 if (mount_binds(arg_directory, arg_bind, 0) < 0)
2867 if (mount_binds(arg_directory, arg_bind_ro, MS_RDONLY) < 0)
2870 if (setup_kdbus(arg_directory, kdbus_domain) < 0)
2873 /* Tell the parent that we are ready, and that
2874 * it can cgroupify us so that we lack access
2875 * to certain devices and resources. */
2876 eventfd_write(child_ready_fd, 1);
2877 close_nointr_nofail(child_ready_fd);
2878 child_ready_fd = -1;
2880 if (chdir(arg_directory) < 0) {
2881 log_error("chdir(%s) failed: %m", arg_directory);
2885 if (mount(arg_directory, "/", NULL, MS_MOVE, NULL) < 0) {
2886 log_error("mount(MS_MOVE) failed: %m");
2890 if (chroot(".") < 0) {
2891 log_error("chroot() failed: %m");
2895 if (chdir("/") < 0) {
2896 log_error("chdir() failed: %m");
2902 if (arg_private_network)
2905 if (drop_capabilities() < 0) {
2906 log_error("drop_capabilities() failed: %m");
2910 r = change_uid_gid(&home);
2914 if ((asprintf((char**)(envp + n_env++), "HOME=%s", home ? home: "/root") < 0) ||
2915 (asprintf((char**)(envp + n_env++), "USER=%s", arg_user ? arg_user : "root") < 0) ||
2916 (asprintf((char**)(envp + n_env++), "LOGNAME=%s", arg_user ? arg_user : "root") < 0)) {
2921 if (!sd_id128_equal(arg_uuid, SD_ID128_NULL)) {
2922 if (asprintf((char**)(envp + n_env++), "container_uuid=" SD_ID128_FORMAT_STR, SD_ID128_FORMAT_VAL(arg_uuid)) < 0) {
2928 if (fdset_size(fds) > 0) {
2929 k = fdset_cloexec(fds, false);
2931 log_error("Failed to unset O_CLOEXEC for file descriptors.");
2935 if ((asprintf((char **)(envp + n_env++), "LISTEN_FDS=%u", n_fd_passed) < 0) ||
2936 (asprintf((char **)(envp + n_env++), "LISTEN_PID=1") < 0)) {
2944 if (arg_personality != 0xffffffffLU) {
2945 if (personality(arg_personality) < 0) {
2946 log_error("personality() failed: %m");
2949 } else if (secondary) {
2950 if (personality(PER_LINUX32) < 0) {
2951 log_error("personality() failed: %m");
2957 if (arg_selinux_context)
2958 if (setexeccon((security_context_t) arg_selinux_context) < 0) {
2959 log_error("setexeccon(\"%s\") failed: %m", arg_selinux_context);
2964 if (!strv_isempty(arg_setenv)) {
2967 n = strv_env_merge(2, envp, arg_setenv);
2975 env_use = (char**) envp;
2977 /* Wait until the parent is ready with the setup, too... */
2978 eventfd_read(parent_ready_fd, &x);
2979 close_nointr_nofail(parent_ready_fd);
2980 parent_ready_fd = -1;
2986 /* Automatically search for the init system */
2988 l = 1 + argc - optind;
2989 a = newa(char*, l + 1);
2990 memcpy(a + 1, argv + optind, l * sizeof(char*));
2992 a[0] = (char*) "/usr/lib/systemd/systemd";
2993 execve(a[0], a, env_use);
2995 a[0] = (char*) "/lib/systemd/systemd";
2996 execve(a[0], a, env_use);
2998 a[0] = (char*) "/sbin/init";
2999 execve(a[0], a, env_use);
3000 } else if (argc > optind)
3001 execvpe(argv[optind], argv + optind, env_use);
3003 chdir(home ? home : "/root");
3004 execle("/bin/bash", "-bash", NULL, env_use);
3005 execle("/bin/sh", "-sh", NULL, env_use);
3008 log_error("execv() failed: %m");
3011 _exit(EXIT_FAILURE);
3017 /* Wait until the child reported that it is ready with
3018 * all it needs to do with privileges. After we got
3019 * the notification we can make the process join its
3020 * cgroup which might limit what it can do */
3021 eventfd_read(child_ready_fd, &x);
3023 r = register_machine(pid);
3027 r = move_network_interfaces(pid);
3031 r = setup_veth(pid, veth_name);
3035 r = setup_bridge(veth_name);
3039 r = setup_macvlan(pid);
3043 /* Notify the child that the parent is ready with all
3044 * its setup, and that the child can now hand over
3045 * control to the code to run inside the container. */
3046 eventfd_write(parent_ready_fd, 1);
3048 k = process_pty(master, &mask, arg_boot ? pid : 0, SIGRTMIN+3);
3057 /* Kill if it is not dead yet anyway */
3058 terminate_machine(pid);
3060 /* Redundant, but better safe than sorry */
3063 k = wait_for_terminate(pid, &status);
3071 if (status.si_code == CLD_EXITED) {
3072 r = status.si_status;
3073 if (status.si_status != 0) {
3074 log_error("Container %s failed with error code %i.", arg_machine, status.si_status);
3079 log_debug("Container %s exited successfully.", arg_machine);
3081 } else if (status.si_code == CLD_KILLED &&
3082 status.si_status == SIGINT) {
3085 log_info("Container %s has been shut down.", arg_machine);
3088 } else if (status.si_code == CLD_KILLED &&
3089 status.si_status == SIGHUP) {
3092 log_info("Container %s is being rebooted.", arg_machine);
3094 } else if (status.si_code == CLD_KILLED ||
3095 status.si_code == CLD_DUMPED) {
3097 log_error("Container %s terminated by signal %s.", arg_machine, signal_to_string(status.si_status));
3101 log_error("Container %s failed due to unknown reason.", arg_machine);
3108 loop_remove(loop_nr, &image_fd);
3113 free(arg_directory);
3116 strv_free(arg_setenv);
3117 strv_free(arg_network_interfaces);
3118 strv_free(arg_network_macvlan);
3119 strv_free(arg_bind);
3120 strv_free(arg_bind_ro);