1 /*-*- Mode: C; c-basic-offset: 8; indent-tabs-mode: nil -*-*/
4 This file is part of systemd.
6 Copyright 2010 Lennart Poettering
8 systemd is free software; you can redistribute it and/or modify it
9 under the terms of the GNU Lesser General Public License as published by
10 the Free Software Foundation; either version 2.1 of the License, or
11 (at your option) any later version.
13 systemd is distributed in the hope that it will be useful, but
14 WITHOUT ANY WARRANTY; without even the implied warranty of
15 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
16 Lesser General Public License for more details.
18 You should have received a copy of the GNU Lesser General Public License
19 along with systemd; If not, see <http://www.gnu.org/licenses/>.
25 #include <sys/types.h>
26 #include <sys/syscall.h>
27 #include <sys/mount.h>
33 #include <sys/prctl.h>
34 #include <sys/capability.h>
37 #include <sys/signalfd.h>
41 #include <sys/socket.h>
42 #include <linux/netlink.h>
43 #include <sys/eventfd.h>
45 #include <linux/veth.h>
46 #include <sys/personality.h>
47 #include <linux/loop.h>
50 #include <selinux/selinux.h>
58 #include <blkid/blkid.h>
61 #include "sd-daemon.h"
71 #include "cgroup-util.h"
73 #include "path-util.h"
74 #include "loopback-setup.h"
75 #include "dev-setup.h"
80 #include "bus-error.h"
82 #include "bus-kernel.h"
85 #include "rtnl-util.h"
86 #include "udev-util.h"
87 #include "blkid-util.h"
89 #include "siphash24.h"
92 #include "seccomp-util.h"
95 typedef enum LinkJournal {
/* Command-line configuration. The initializers are the defaults; most of
 * these are assigned by parse_argv() further down and read by the setup_*()
 * helpers. */
102 static char *arg_directory = NULL;
103 static char *arg_user = NULL;
104 static sd_id128_t arg_uuid = {};
105 static char *arg_machine = NULL;
106 static const char *arg_selinux_context = NULL;
107 static const char *arg_selinux_apifs_context = NULL;
108 static const char *arg_slice = NULL;
109 static bool arg_private_network = false;
110 static bool arg_read_only = false;
111 static bool arg_boot = false;
112 static LinkJournal arg_link_journal = LINK_AUTO;
/* Bit mask of capabilities (one bit per CAP_* number). drop_capabilities()
 * hands the complement of this mask to capability_bounding_set_drop(), and
 * parse_argv() adjusts it for --capability=, --drop-capability= and private
 * networking. */
113 static uint64_t arg_retain =
114 (1ULL << CAP_CHOWN) |
115 (1ULL << CAP_DAC_OVERRIDE) |
116 (1ULL << CAP_DAC_READ_SEARCH) |
117 (1ULL << CAP_FOWNER) |
118 (1ULL << CAP_FSETID) |
119 (1ULL << CAP_IPC_OWNER) |
121 (1ULL << CAP_LEASE) |
122 (1ULL << CAP_LINUX_IMMUTABLE) |
123 (1ULL << CAP_NET_BIND_SERVICE) |
124 (1ULL << CAP_NET_BROADCAST) |
125 (1ULL << CAP_NET_RAW) |
126 (1ULL << CAP_SETGID) |
127 (1ULL << CAP_SETFCAP) |
128 (1ULL << CAP_SETPCAP) |
129 (1ULL << CAP_SETUID) |
130 (1ULL << CAP_SYS_ADMIN) |
131 (1ULL << CAP_SYS_CHROOT) |
132 (1ULL << CAP_SYS_NICE) |
133 (1ULL << CAP_SYS_PTRACE) |
134 (1ULL << CAP_SYS_TTY_CONFIG) |
135 (1ULL << CAP_SYS_RESOURCE) |
136 (1ULL << CAP_SYS_BOOT) |
137 (1ULL << CAP_AUDIT_WRITE) |
138 (1ULL << CAP_AUDIT_CONTROL) |
140 static char **arg_bind = NULL;
141 static char **arg_bind_ro = NULL;
142 static char **arg_setenv = NULL;
143 static bool arg_quiet = false;
144 static bool arg_share_system = false;
145 static bool arg_register = true;
146 static bool arg_keep_unit = false;
147 static char **arg_network_interfaces = NULL;
148 static char **arg_network_macvlan = NULL;
149 static bool arg_network_veth = false;
150 static const char *arg_network_bridge = NULL;
151 static unsigned long arg_personality = 0xffffffffLU;
152 static const char *arg_image = NULL;
/* Prints the usage text to stdout, using program_invocation_short_name as
 * the program name. The text has to be kept in sync with the option handling
 * in parse_argv(): the -j switch sets LINK_GUEST there, so it is described as
 * the guest mode here. */
154 static int help(void) {
156 printf("%s [OPTIONS...] [PATH] [ARGUMENTS...]\n\n"
157 "Spawn a minimal namespace container for debugging, testing and building.\n\n"
158 " -h --help Show this help\n"
159 " --version Print version string\n"
160 " -q --quiet Do not show status information\n"
161 " -D --directory=PATH Root directory for the container\n"
162 " -i --image=PATH File system device or image for the container\n"
163 " -b --boot Boot up full system (i.e. invoke init)\n"
164 " -u --user=USER Run the command under specified user or uid\n"
165 " -M --machine=NAME Set the machine name for the container\n"
166 " --uuid=UUID Set a specific machine UUID for the container\n"
167 " -S --slice=SLICE Place the container in the specified slice\n"
168 " --private-network Disable network in container\n"
169 " --network-interface=INTERFACE\n"
170 " Assign an existing network interface to the\n"
172 " --network-macvlan=INTERFACE\n"
173 " Create a macvlan network interface based on an\n"
174 " existing network interface to the container\n"
175 " --network-veth Add a virtual ethernet connection between host\n"
177 " --network-bridge=INTERFACE\n"
178 " Add a virtual ethernet connection between host\n"
179 " and container and add it to an existing bridge on\n"
181 " -Z --selinux-context=SECLABEL\n"
182 " Set the SELinux security context to be used by\n"
183 " processes in the container\n"
184 " -L --selinux-apifs-context=SECLABEL\n"
185 " Set the SELinux security context to be used by\n"
186 " API/tmpfs file systems in the container\n"
187 " --capability=CAP In addition to the default, retain specified\n"
189 " --drop-capability=CAP Drop the specified capability from the default set\n"
190 " --link-journal=MODE Link up guest journal, one of no, auto, guest, host\n"
191 " -j Equivalent to --link-journal=guest\n"
192 " --read-only Mount the root directory read-only\n"
193 " --bind=PATH[:PATH] Bind mount a file or directory from the host into\n"
195 " --bind-ro=PATH[:PATH] Similar, but creates a read-only bind mount\n"
196 " --setenv=NAME=VALUE Pass an environment variable to PID 1\n"
197 " --share-system Share system namespaces with host\n"
198 " --register=BOOLEAN Register container as machine\n"
199 " --keep-unit Do not register a scope for the machine, reuse\n"
200 " the service unit nspawn is running in\n",
201 program_invocation_short_name);
/* Parses the command line with getopt_long() and stores the result in the
 * arg_* globals.
 *
 * --capability= and --drop-capability= are collected in the local plus and
 * minus masks and folded into arg_retain after the loop, together with
 * CAP_NET_ADMIN when a private network was requested. The network options
 * --network-veth and --network-interface= also switch on
 * arg_private_network.
 *
 * Combinations rejected after the loop: --boot with --share-system,
 * --keep-unit when invoked from a user session, and --directory= together
 * with --image=. --share-system turns registration off.
 *
 * NOTE(review): the return statements are not visible in this excerpt;
 * presumably negative on error -- confirm against the full file. */
206 static int parse_argv(int argc, char *argv[]) {
222 ARG_NETWORK_INTERFACE,
229 static const struct option options[] = {
230 { "help", no_argument, NULL, 'h' },
231 { "version", no_argument, NULL, ARG_VERSION },
232 { "directory", required_argument, NULL, 'D' },
233 { "user", required_argument, NULL, 'u' },
234 { "private-network", no_argument, NULL, ARG_PRIVATE_NETWORK },
235 { "boot", no_argument, NULL, 'b' },
236 { "uuid", required_argument, NULL, ARG_UUID },
237 { "read-only", no_argument, NULL, ARG_READ_ONLY },
238 { "capability", required_argument, NULL, ARG_CAPABILITY },
239 { "drop-capability", required_argument, NULL, ARG_DROP_CAPABILITY },
240 { "link-journal", required_argument, NULL, ARG_LINK_JOURNAL },
241 { "bind", required_argument, NULL, ARG_BIND },
242 { "bind-ro", required_argument, NULL, ARG_BIND_RO },
243 { "machine", required_argument, NULL, 'M' },
244 { "slice", required_argument, NULL, 'S' },
245 { "setenv", required_argument, NULL, ARG_SETENV },
246 { "selinux-context", required_argument, NULL, 'Z' },
247 { "selinux-apifs-context", required_argument, NULL, 'L' },
248 { "quiet", no_argument, NULL, 'q' },
249 { "share-system", no_argument, NULL, ARG_SHARE_SYSTEM },
250 { "register", required_argument, NULL, ARG_REGISTER },
251 { "keep-unit", no_argument, NULL, ARG_KEEP_UNIT },
252 { "network-interface", required_argument, NULL, ARG_NETWORK_INTERFACE },
253 { "network-macvlan", required_argument, NULL, ARG_NETWORK_MACVLAN },
254 { "network-veth", no_argument, NULL, ARG_NETWORK_VETH },
255 { "network-bridge", required_argument, NULL, ARG_NETWORK_BRIDGE },
256 { "personality", required_argument, NULL, ARG_PERSONALITY },
257 { "image", required_argument, NULL, 'i' },
262 uint64_t plus = 0, minus = 0;
267 while ((c = getopt_long(argc, argv, "+hD:u:bL:M:jS:Z:qi:", options, NULL)) >= 0) {
275 puts(PACKAGE_STRING);
276 puts(SYSTEMD_FEATURES);
281 arg_directory = canonicalize_file_name(optarg);
282 if (!arg_directory) {
283 log_error("Invalid root directory: %m");
295 arg_user = strdup(optarg);
301 case ARG_NETWORK_BRIDGE:
302 arg_network_bridge = optarg;
306 case ARG_NETWORK_VETH:
307 arg_network_veth = true;
308 arg_private_network = true;
311 case ARG_NETWORK_INTERFACE:
312 if (strv_extend(&arg_network_interfaces, optarg) < 0)
315 arg_private_network = true;
318 case ARG_NETWORK_MACVLAN:
319 if (strv_extend(&arg_network_macvlan, optarg) < 0)
324 case ARG_PRIVATE_NETWORK:
325 arg_private_network = true;
333 r = sd_id128_from_string(optarg, &arg_uuid);
335 log_error("Invalid UUID: %s", optarg);
345 if (isempty(optarg)) {
350 if (!hostname_is_valid(optarg)) {
351 log_error("Invalid machine name: %s", optarg);
356 arg_machine = strdup(optarg);
364 arg_selinux_context = optarg;
368 arg_selinux_apifs_context = optarg;
372 arg_read_only = true;
376 case ARG_DROP_CAPABILITY: {
380 FOREACH_WORD_SEPARATOR(word, length, optarg, ",", state) {
381 _cleanup_free_ char *t;
384 t = strndup(word, length);
388 if (streq(t, "all")) {
389 if (c == ARG_CAPABILITY)
390 plus = (uint64_t) -1;
392 minus = (uint64_t) -1;
394 if (cap_from_name(t, &cap) < 0) {
395 log_error("Failed to parse capability %s.", t);
399 if (c == ARG_CAPABILITY)
400 plus |= 1ULL << (uint64_t) cap;
402 minus |= 1ULL << (uint64_t) cap;
410 arg_link_journal = LINK_GUEST;
413 case ARG_LINK_JOURNAL:
414 if (streq(optarg, "auto"))
415 arg_link_journal = LINK_AUTO;
416 else if (streq(optarg, "no"))
417 arg_link_journal = LINK_NO;
418 else if (streq(optarg, "guest"))
419 arg_link_journal = LINK_GUEST;
420 else if (streq(optarg, "host"))
421 arg_link_journal = LINK_HOST;
423 log_error("Failed to parse link journal mode %s", optarg);
431 _cleanup_free_ char *a = NULL, *b = NULL;
435 x = c == ARG_BIND ? &arg_bind : &arg_bind_ro;
437 e = strchr(optarg, ':');
439 a = strndup(optarg, e - optarg);
449 if (!path_is_absolute(a) || !path_is_absolute(b)) {
450 log_error("Invalid bind mount specification: %s", optarg);
454 r = strv_extend(x, a);
458 r = strv_extend(x, b);
468 if (!env_assignment_is_valid(optarg)) {
469 log_error("Environment variable assignment '%s' is not valid.", optarg);
473 n = strv_env_set(arg_setenv, optarg);
477 strv_free(arg_setenv);
486 case ARG_SHARE_SYSTEM:
487 arg_share_system = true;
491 r = parse_boolean(optarg);
493 log_error("Failed to parse --register= argument: %s", optarg);
501 arg_keep_unit = true;
504 case ARG_PERSONALITY:
506 arg_personality = personality_from_string(optarg);
507 if (arg_personality == 0xffffffffLU) {
508 log_error("Unknown or unsupported personality '%s'.", optarg);
518 assert_not_reached("Unhandled option");
522 if (arg_share_system)
523 arg_register = false;
525 if (arg_boot && arg_share_system) {
526 log_error("--boot and --share-system may not be combined.");
530 if (arg_keep_unit && cg_pid_get_owner_uid(0, NULL) >= 0) {
531 log_error("--keep-unit may not be used when invoked from a user session.");
535 if (arg_directory && arg_image) {
536 log_error("--directory= and --image= may not be combined.");
540 arg_retain = (arg_retain | plus | (arg_private_network ? 1ULL << CAP_NET_ADMIN : 0)) & ~minus;
/* Mounts the API file systems listed in mount_table underneath dest.
 *
 * Each table row gives source, target (relative to dest), file system type,
 * mount options, mount flags and a fatal marker. A row whose target is
 * already a mount point is skipped unless it is a remount row (source is
 * NULL). When arg_selinux_apifs_context is set, a context= option carrying
 * it is appended for tmpfs and devpts rows. The fatal marker is evaluated
 * together with the mount() result before an error is logged. */
545 static int mount_all(const char *dest) {
547 typedef struct MountPoint {
556 static const MountPoint mount_table[] = {
557 { "proc", "/proc", "proc", NULL, MS_NOSUID|MS_NOEXEC|MS_NODEV, true },
558 { "/proc/sys", "/proc/sys", NULL, NULL, MS_BIND, true }, /* Bind mount first */
559 { NULL, "/proc/sys", NULL, NULL, MS_BIND|MS_RDONLY|MS_REMOUNT, true }, /* Then, make it r/o */
560 { "sysfs", "/sys", "sysfs", NULL, MS_RDONLY|MS_NOSUID|MS_NOEXEC|MS_NODEV, true },
561 { "tmpfs", "/dev", "tmpfs", "mode=755", MS_NOSUID|MS_STRICTATIME, true },
562 { "devpts", "/dev/pts", "devpts","newinstance,ptmxmode=0666,mode=620,gid=" STRINGIFY(TTY_GID), MS_NOSUID|MS_NOEXEC, true },
563 { "tmpfs", "/dev/shm", "tmpfs", "mode=1777", MS_NOSUID|MS_NODEV|MS_STRICTATIME, true },
564 { "tmpfs", "/run", "tmpfs", "mode=755", MS_NOSUID|MS_NODEV|MS_STRICTATIME, true },
566 { "/sys/fs/selinux", "/sys/fs/selinux", NULL, NULL, MS_BIND, false }, /* Bind mount first */
567 { NULL, "/sys/fs/selinux", NULL, NULL, MS_BIND|MS_RDONLY|MS_REMOUNT, false }, /* Then, make it r/o */
574 for (k = 0; k < ELEMENTSOF(mount_table); k++) {
575 _cleanup_free_ char *where = NULL;
577 _cleanup_free_ char *options = NULL;
582 where = strjoin(dest, "/", mount_table[k].where, NULL);
586 t = path_is_mount_point(where, true);
588 log_error("Failed to detect whether %s is a mount point: %s", where, strerror(-t));
596 /* Skip this entry if it is not a remount. */
597 if (mount_table[k].what && t > 0)
600 mkdir_p(where, 0755);
603 if (arg_selinux_apifs_context &&
604 (streq_ptr(mount_table[k].what, "tmpfs") || streq_ptr(mount_table[k].what, "devpts"))) {
605 options = strjoin(mount_table[k].options, ",context=\"", arg_selinux_apifs_context, "\"", NULL);
612 o = mount_table[k].options;
615 if (mount(mount_table[k].what,
618 mount_table[k].flags,
620 mount_table[k].fatal) {
622 log_error("mount(%s) failed: %m", where);
/* Bind mounts every (source, destination) pair of the string list l into the
 * tree rooted at dest.
 *
 * If the destination already exists, its file type must match the source.
 * If it does not exist (ENOENT), the parent directories and a mount point of
 * the matching type are created; the visible branches cover directories,
 * FIFOs, sockets and regular files, anything else is refused. When flags is
 * non-zero the new mount is remounted with MS_REMOUNT|MS_BIND|flags. */
632 static int mount_binds(const char *dest, char **l, unsigned long flags) {
635 STRV_FOREACH_PAIR(x, y, l) {
637 struct stat source_st, dest_st;
640 if (stat(*x, &source_st) < 0) {
641 log_error("Failed to stat %s: %m", *x);
645 where = strappenda(dest, *y);
646 r = stat(where, &dest_st);
648 if ((source_st.st_mode & S_IFMT) != (dest_st.st_mode & S_IFMT)) {
649 log_error("The file types of %s and %s do not match. Refusing bind mount",
653 } else if (errno == ENOENT) {
654 r = mkdir_parents_label(where, 0755);
656 log_error("Failed to bind mount %s: %s", *x, strerror(-r));
660 log_error("Failed to bind mount %s: %s", *x, strerror(errno));
663 /* Create the mount point, but be conservative -- refuse to create block
664 * and char devices. */
665 if (S_ISDIR(source_st.st_mode))
666 mkdir_label(where, 0755);
667 else if (S_ISFIFO(source_st.st_mode))
669 else if (S_ISSOCK(source_st.st_mode))
670 mknod(where, 0644 | S_IFSOCK, 0);
671 else if (S_ISREG(source_st.st_mode))
674 log_error("Refusing to create mountpoint for file: %s", *x);
678 if (mount(*x, where, "bind", MS_BIND, NULL) < 0) {
679 log_error("mount(%s) failed: %m", where);
683 if (flags && mount(NULL, where, NULL, MS_REMOUNT|MS_BIND|flags, NULL) < 0) {
684 log_error("mount(%s) failed: %m", where);
/* Makes etc/localtime below dest a symlink to the same zone the host
 * /etc/localtime points to.
 *
 * Only warnings are logged, and nothing is changed, when the host file is not
 * a symlink, does not point into /usr/share/zoneinfo/, or names a zone that
 * does not exist inside the container. Nothing is done either when the
 * container link already names the same zone. The new link target is the
 * relative path ../usr/share/zoneinfo/ followed by the zone name. */
692 static int setup_timezone(const char *dest) {
693 _cleanup_free_ char *where = NULL, *p = NULL, *q = NULL, *check = NULL, *what = NULL;
699 /* Fix the timezone, if possible */
700 r = readlink_malloc("/etc/localtime", &p);
702 log_warning("/etc/localtime is not a symlink, not updating container timezone.");
706 z = path_startswith(p, "../usr/share/zoneinfo/");
708 z = path_startswith(p, "/usr/share/zoneinfo/");
710 log_warning("/etc/localtime does not point into /usr/share/zoneinfo/, not updating container timezone.");
714 where = strappend(dest, "/etc/localtime");
718 r = readlink_malloc(where, &q);
720 y = path_startswith(q, "../usr/share/zoneinfo/");
722 y = path_startswith(q, "/usr/share/zoneinfo/");
725 /* Already pointing to the right place? Then do nothing .. */
726 if (y && streq(y, z))
730 check = strjoin(dest, "/usr/share/zoneinfo/", z, NULL);
734 if (access(check, F_OK) < 0) {
735 log_warning("Timezone %s does not exist in container, not updating container timezone.", z);
739 what = strappend("../usr/share/zoneinfo/", z);
744 if (symlink(what, where) < 0) {
745 log_error("Failed to correct timezone of container: %m");
/* Copies the host /etc/resolv.conf to etc/resolv.conf below dest. The
 * arg_private_network flag is checked first (presumably to skip the copy,
 * the branch body is not visible here). The result of copy_file() is
 * intentionally ignored. */
752 static int setup_resolv_conf(const char *dest) {
753 char _cleanup_free_ *where = NULL;
757 if (arg_private_network)
760 /* Fix resolv.conf, if possible */
761 where = strappend(dest, "/etc/resolv.conf");
765 /* We don't really care for the results of this really. If it
766 * fails, it fails, but meh... */
767 copy_file("/etc/resolv.conf", where, O_TRUNC|O_NOFOLLOW);
/* Gives the container a boot ID of its own.
 *
 * A random 128 bit ID is generated, formatted in UUID notation, written to
 * dev/proc-sys-kernel-random-boot-id below dest and bind mounted over
 * proc/sys/kernel/random/boot_id below dest. The bind mount is then remounted
 * read-only; a failure of that second step is only a warning. The
 * arg_share_system flag is checked up front. */
772 static int setup_boot_id(const char *dest) {
773 _cleanup_free_ char *from = NULL, *to = NULL;
780 if (arg_share_system)
783 /* Generate a new randomized boot ID, so that each boot-up of
784 * the container gets a new one */
786 from = strappend(dest, "/dev/proc-sys-kernel-random-boot-id");
787 to = strappend(dest, "/proc/sys/kernel/random/boot_id");
791 r = sd_id128_randomize(&rnd);
793 log_error("Failed to generate random boot id: %s", strerror(-r));
797 snprintf(as_uuid, sizeof(as_uuid),
798 "%02x%02x%02x%02x-%02x%02x-%02x%02x-%02x%02x-%02x%02x%02x%02x%02x%02x",
799 SD_ID128_FORMAT_VAL(rnd));
800 char_array_0(as_uuid);
802 r = write_string_file(from, as_uuid);
804 log_error("Failed to write boot id: %s", strerror(-r));
808 if (mount(from, to, "bind", MS_BIND, NULL) < 0) {
809 log_error("Failed to bind mount boot id: %m");
811 } else if (mount(from, to, "bind", MS_BIND|MS_REMOUNT|MS_RDONLY, NULL))
812 log_warning("Failed to make boot id read-only: %m");
/* Creates, in dev/ below dest, a device node for every name in the devnodes
 * list, reusing the mode and device number of the host node of the same name
 * in /dev.
 *
 * A host node that does not exist (ENOENT) is not reported as an error; a
 * host entry that is neither a character nor a block device is rejected.
 * When mknod() fails, the message names the node path that could not be
 * created (to), not the container root. */
818 static int copy_devnodes(const char *dest) {
820 static const char devnodes[] =
830 _cleanup_umask_ mode_t u;
836 NULSTR_FOREACH(d, devnodes) {
837 _cleanup_free_ char *from = NULL, *to = NULL;
840 from = strappend("/dev/", d);
841 to = strjoin(dest, "/dev/", d, NULL);
845 if (stat(from, &st) < 0) {
847 if (errno != ENOENT) {
848 log_error("Failed to stat %s: %m", from);
852 } else if (!S_ISCHR(st.st_mode) && !S_ISBLK(st.st_mode)) {
854 log_error("%s is not a char or block device, cannot copy", from);
857 } else if (mknod(to, st.st_mode, st.st_rdev) < 0) {
859 log_error("mknod(%s) failed: %m", to);
/* Creates dev/ptmx below dest as a symlink to pts/ptmx and logs an error if
 * symlink() fails. */
867 static int setup_ptmx(const char *dest) {
868 _cleanup_free_ char *p = NULL;
870 p = strappend(dest, "/dev/ptmx");
874 if (symlink("pts/ptmx", p) < 0) {
875 log_error("Failed to create /dev/ptmx symlink: %m");
/* Makes the TTY named by console available as dev/console below dest.
 *
 * The TTY gets mode 0600 and owner 0:0, a placeholder device node carrying
 * the device number of the host /dev/null is created at dev/console, and the
 * TTY is bind mounted on top of it. */
882 static int setup_dev_console(const char *dest, const char *console) {
883 _cleanup_umask_ mode_t u;
893 if (stat("/dev/null", &st) < 0) {
894 log_error("Failed to stat /dev/null: %m");
898 r = chmod_and_chown(console, 0600, 0, 0);
900 log_error("Failed to correct access mode for TTY: %s", strerror(-r));
904 /* We need to bind mount the right tty to /dev/console since
905 * ptys can only exist on pts file systems. To have something
906 * to bind mount things on we create a device node first, and
907 * use /dev/null for that since the cgroups device policy
908 * allows us to create that freely, while we cannot create
909 * /dev/console. (Note that the major minor doesn't actually
910 * matter here, since we mount it over anyway). */
912 to = strappenda(dest, "/dev/console");
913 if (mknod(to, (st.st_mode & ~07777) | 0600, st.st_rdev) < 0) {
914 log_error("mknod() for /dev/console failed: %m");
918 if (mount(console, to, "bind", MS_BIND, NULL) < 0) {
919 log_error("Bind mount for /dev/console failed: %m");
/* Provides a stand-in for the kernel log inside the container.
 *
 * A FIFO is created at dev/kmsg below dest (mode 0600, owner 0:0) and bind
 * mounted to proc/kmsg below dest. The FIFO is then opened and the file
 * descriptor is sent over kmsg_socket as SCM_RIGHTS ancillary data, so that
 * it stays open for as long as the child runs; the local copy of the
 * descriptor is closed right after sending. kmsg_socket must be a valid
 * descriptor (asserted). */
926 static int setup_kmsg(const char *dest, int kmsg_socket) {
927 _cleanup_free_ char *from = NULL, *to = NULL;
929 _cleanup_umask_ mode_t u;
931 struct cmsghdr cmsghdr;
932 uint8_t buf[CMSG_SPACE(sizeof(int))];
935 .msg_control = &control,
936 .msg_controllen = sizeof(control),
938 struct cmsghdr *cmsg;
941 assert(kmsg_socket >= 0);
945 /* We create the kmsg FIFO as /dev/kmsg, but immediately
946 * delete it after bind mounting it to /proc/kmsg. While FIFOs
947 * on the reading side behave very similar to /proc/kmsg,
948 * their writing side behaves differently from /dev/kmsg in
949 * that writing blocks when nothing is reading. In order to
950 * avoid any problems with containers deadlocking due to this
951 * we simply make /dev/kmsg unavailable to the container. */
952 if (asprintf(&from, "%s/dev/kmsg", dest) < 0 ||
953 asprintf(&to, "%s/proc/kmsg", dest) < 0)
956 if (mkfifo(from, 0600) < 0) {
957 log_error("mkfifo() for /dev/kmsg failed: %m");
961 r = chmod_and_chown(from, 0600, 0, 0);
963 log_error("Failed to correct access mode for /dev/kmsg: %s", strerror(-r));
967 if (mount(from, to, "bind", MS_BIND, NULL) < 0) {
968 log_error("Bind mount for /proc/kmsg failed: %m");
972 fd = open(from, O_RDWR|O_NDELAY|O_CLOEXEC);
974 log_error("Failed to open fifo: %m");
978 cmsg = CMSG_FIRSTHDR(&mh);
979 cmsg->cmsg_level = SOL_SOCKET;
980 cmsg->cmsg_type = SCM_RIGHTS;
981 cmsg->cmsg_len = CMSG_LEN(sizeof(int));
982 memcpy(CMSG_DATA(cmsg), &fd, sizeof(int));
984 mh.msg_controllen = cmsg->cmsg_len;
986 /* Store away the fd in the socket, so that it stays open as
987 * long as we run the child */
988 k = sendmsg(kmsg_socket, &mh, MSG_DONTWAIT|MSG_NOSIGNAL);
989 close_nointr_nofail(fd);
992 log_error("Failed to send FIFO fd: %m");
996 /* And now make the FIFO unavailable as /dev/kmsg... */
/* Sets the hostname to arg_machine with sethostname(). The arg_share_system
 * flag is checked first (presumably to leave the hostname alone when system
 * namespaces are shared -- the branch body is not visible here). */
1001 static int setup_hostname(void) {
1003 if (arg_share_system)
1006 if (sethostname(arg_machine, strlen(arg_machine)) < 0)
/* Connects the journal directory of the container with the host, as selected
 * by arg_link_journal.
 *
 * The machine ID is read from etc/machine-id below directory. Two paths are
 * derived from it: p, the host side /var/log/journal/ID, and q, the same
 * path below directory. Visible behaviour:
 *  - equal host and container machine IDs are refused (warning only in
 *    LINK_AUTO mode);
 *  - p or q already being a mount point is an error unless the mode is
 *    LINK_AUTO;
 *  - LINK_GUEST creates a symlink at p pointing to q;
 *  - LINK_HOST creates p; otherwise p is probed with access();
 *  - q must be empty, is created, and p is bind mounted onto q.
 *
 * NOTE(review): several messages use %m right after mkdir_p() or
 * readlink_and_make_absolute(), whose error is returned in r -- confirm that
 * errno is still meaningful at those points. */
1012 static int setup_journal(const char *directory) {
1013 sd_id128_t machine_id, this_id;
1014 _cleanup_free_ char *p = NULL, *b = NULL, *q = NULL, *d = NULL;
1018 p = strappend(directory, "/etc/machine-id");
1022 r = read_one_line_file(p, &b);
1023 if (r == -ENOENT && arg_link_journal == LINK_AUTO)
1026 log_error("Failed to read machine ID from %s: %s", p, strerror(-r));
1031 if (isempty(id) && arg_link_journal == LINK_AUTO)
1034 /* Verify validity */
1035 r = sd_id128_from_string(id, &machine_id);
1037 log_error("Failed to parse machine ID from %s: %s", p, strerror(-r));
1041 r = sd_id128_get_machine(&this_id);
1043 log_error("Failed to retrieve machine ID: %s", strerror(-r));
1047 if (sd_id128_equal(machine_id, this_id)) {
1048 log_full(arg_link_journal == LINK_AUTO ? LOG_WARNING : LOG_ERR,
1049 "Host and machine ids are equal (%s): refusing to link journals", id);
1050 if (arg_link_journal == LINK_AUTO)
1056 if (arg_link_journal == LINK_NO)
1060 p = strappend("/var/log/journal/", id);
1061 q = strjoin(directory, "/var/log/journal/", id, NULL);
1065 if (path_is_mount_point(p, false) > 0) {
1066 if (arg_link_journal != LINK_AUTO) {
1067 log_error("%s: already a mount point, refusing to use for journal", p);
1074 if (path_is_mount_point(q, false) > 0) {
1075 if (arg_link_journal != LINK_AUTO) {
1076 log_error("%s: already a mount point, refusing to use for journal", q);
1083 r = readlink_and_make_absolute(p, &d);
1085 if ((arg_link_journal == LINK_GUEST ||
1086 arg_link_journal == LINK_AUTO) &&
1089 r = mkdir_p(q, 0755);
1091 log_warning("failed to create directory %s: %m", q);
1095 if (unlink(p) < 0) {
1096 log_error("Failed to remove symlink %s: %m", p);
1099 } else if (r == -EINVAL) {
1101 if (arg_link_journal == LINK_GUEST &&
1104 if (errno == ENOTDIR) {
1105 log_error("%s already exists and is neither a symlink nor a directory", p);
1108 log_error("Failed to remove %s: %m", p);
1112 } else if (r != -ENOENT) {
1113 log_error("readlink(%s) failed: %m", p);
1117 if (arg_link_journal == LINK_GUEST) {
1119 if (symlink(q, p) < 0) {
1120 log_error("Failed to symlink %s to %s: %m", q, p);
1124 r = mkdir_p(q, 0755);
1126 log_warning("failed to create directory %s: %m", q);
1130 if (arg_link_journal == LINK_HOST) {
1131 r = mkdir_p(p, 0755);
1133 log_error("Failed to create %s: %m", p);
1137 } else if (access(p, F_OK) < 0)
1140 if (dir_is_empty(q) == 0) {
1141 log_error("%s not empty.", q);
1145 r = mkdir_p(q, 0755);
1147 log_error("Failed to create %s: %m", q);
1151 if (mount(p, q, "bind", MS_BIND, NULL) < 0) {
1152 log_error("Failed to bind mount journal from host into guest: %m");
/* Creates the directory dev/kdbus below dest (mode 0755) and bind mounts
 * path onto it. Both failures are logged as errors. */
1159 static int setup_kdbus(const char *dest, const char *path) {
1165 p = strappenda(dest, "/dev/kdbus");
1166 if (mkdir(p, 0755) < 0) {
1167 log_error("Failed to create kdbus path: %m");
1171 if (mount(path, p, "bind", MS_BIND, NULL) < 0) {
1172 log_error("Failed to mount kdbus domain path: %m");
/* Passes the complement of arg_retain to capability_bounding_set_drop() and
 * returns its result, i.e. every capability not listed in arg_retain is
 * handed over for dropping. */
1179 static int drop_capabilities(void) {
1180 return capability_bounding_set_drop(~arg_retain, false);
/* Registers the container with org.freedesktop.machine1.Manager on the
 * system bus.
 *
 * With arg_keep_unit a single sd_bus_call_method() is issued. Otherwise the
 * message is assembled by hand so that an array of (sv) properties can be
 * appended: Slice (only when arg_slice is non-empty), DevicePolicy set to
 * strict, and a DeviceAllow list of ten entries. Both variants carry
 * arg_uuid and arg_directory among their arguments. Every failing step is
 * logged as an error. */
1183 static int register_machine(pid_t pid) {
1184 _cleanup_bus_error_free_ sd_bus_error error = SD_BUS_ERROR_NULL;
1185 _cleanup_bus_unref_ sd_bus *bus = NULL;
1191 r = sd_bus_default_system(&bus);
1193 log_error("Failed to open system bus: %s", strerror(-r));
1197 if (arg_keep_unit) {
1198 r = sd_bus_call_method(
1200 "org.freedesktop.machine1",
1201 "/org/freedesktop/machine1",
1202 "org.freedesktop.machine1.Manager",
1208 SD_BUS_MESSAGE_APPEND_ID128(arg_uuid),
1212 strempty(arg_directory));
1214 _cleanup_bus_message_unref_ sd_bus_message *m = NULL;
1216 r = sd_bus_message_new_method_call(
1219 "org.freedesktop.machine1",
1220 "/org/freedesktop/machine1",
1221 "org.freedesktop.machine1.Manager",
1224 log_error("Failed to create message: %s", strerror(-r));
1228 r = sd_bus_message_append(
1232 SD_BUS_MESSAGE_APPEND_ID128(arg_uuid),
1236 strempty(arg_directory));
1238 log_error("Failed to append message arguments: %s", strerror(-r));
1242 r = sd_bus_message_open_container(m, 'a', "(sv)");
1244 log_error("Failed to open container: %s", strerror(-r));
1248 if (!isempty(arg_slice)) {
1249 r = sd_bus_message_append(m, "(sv)", "Slice", "s", arg_slice);
1251 log_error("Failed to append slice: %s", strerror(-r));
1256 r = sd_bus_message_append(m, "(sv)", "DevicePolicy", "s", "strict");
1258 log_error("Failed to add device policy: %s", strerror(-r));
1262 r = sd_bus_message_append(m, "(sv)", "DeviceAllow", "a(ss)", 10,
1263 /* Allow the container to
1264 * access and create the API
1265 * device nodes, so that
1266 * PrivateDevices= in the
1267 * container can work
1272 "/dev/random", "rwm",
1273 "/dev/urandom", "rwm",
1275 /* Allow the container
1276 * access to ptys. However,
1278 * container to ever create
1279 * these device nodes. */
1280 "/dev/pts/ptmx", "rw",
1282 /* Allow the container
1283 * access to all kdbus
1284 * devices. Again, the
1285 * container cannot create
1286 * these nodes, only use
1287 * them. We use a pretty
1288 * open match here, so that
1289 * the kernel API can still
1292 "char-kdbus/*", "rw");
1294 log_error("Failed to add device whitelist: %s", strerror(-r));
1298 r = sd_bus_message_close_container(m);
1300 log_error("Failed to close container: %s", strerror(-r));
1304 r = sd_bus_call(bus, m, 0, &error, NULL);
1308 log_error("Failed to register machine: %s", bus_error_message(&error, r));
/* Asks org.freedesktop.machine1.Manager on the system bus for the machine
 * object, reads its object path from the reply, and calls a method on the
 * org.freedesktop.machine1.Machine interface of that object.
 *
 * Failures of either bus call are only logged at debug level, because the
 * machine may already have been cleaned up. Failing to open the bus is
 * logged as an error. */
1315 static int terminate_machine(pid_t pid) {
1316 _cleanup_bus_error_free_ sd_bus_error error = SD_BUS_ERROR_NULL;
1317 _cleanup_bus_message_unref_ sd_bus_message *reply = NULL;
1318 _cleanup_bus_unref_ sd_bus *bus = NULL;
1325 r = sd_bus_default_system(&bus);
1327 log_error("Failed to open system bus: %s", strerror(-r));
1331 r = sd_bus_call_method(
1333 "org.freedesktop.machine1",
1334 "/org/freedesktop/machine1",
1335 "org.freedesktop.machine1.Manager",
1342 /* Note that the machine might already have been
1343 * cleaned up automatically, hence don't consider it a
1344 * failure if we cannot get the machine object. */
1345 log_debug("Failed to get machine: %s", bus_error_message(&error, r));
1349 r = sd_bus_message_read(reply, "o", &path);
1351 return bus_log_parse_error(r);
1353 r = sd_bus_call_method(
1355 "org.freedesktop.machine1",
1357 "org.freedesktop.machine1.Machine",
1363 log_debug("Failed to terminate machine: %s", bus_error_message(&error, r));
/* Resets the audit login UID of the calling process.
 *
 * /proc/self/loginuid is read; if it does not already hold 4294967295 that
 * value is written to it. When the write fails, a long explanatory error is
 * logged that tells the user to upgrade the kernel or to turn auditing off.
 * The arg_share_system flag is checked up front. */
1370 static int reset_audit_loginuid(void) {
1371 _cleanup_free_ char *p = NULL;
1374 if (arg_share_system)
1377 r = read_one_line_file("/proc/self/loginuid", &p);
1381 log_error("Failed to read /proc/self/loginuid: %s", strerror(-r));
1385 /* Already reset? */
1386 if (streq(p, "4294967295"))
1389 r = write_string_file("/proc/self/loginuid", "4294967295");
1391 log_error("Failed to reset audit login UID. This probably means that your kernel is too\n"
1392 "old and you have audit enabled. Note that the auditing subsystem is known to\n"
1393 "be incompatible with containers on old kernels. Please make sure to upgrade\n"
1394 "your kernel or to turn off auditing with 'audit=0' on the kernel command line before\n"
1395 "using systemd-nspawn. Sleeping for 5s... (%s)\n", strerror(-r));
1403 #define HASH_KEY SD_ID128_MAKE(c3,c4,f9,19,b5,57,b2,1c,e6,cf,14,27,03,9c,ee,a2)
/* Derives a MAC address for the container and stores it in *mac.
 *
 * The input buffer v holds the host machine ID followed by the bytes of
 * arg_machine. It is hashed with siphash24() under the fixed HASH_KEY and
 * the first ETH_ALEN bytes of the result become the address. Finally the
 * multicast bit is cleared and the locally-assigned bit is set in the first
 * octet. */
1405 static int get_mac(struct ether_addr *mac) {
1412 l = strlen(arg_machine);
1413 sz = sizeof(sd_id128_t) + l;
1416 /* fetch some persistent data unique to the host */
1417 r = sd_id128_get_machine((sd_id128_t*) v);
1421 /* combine with some data unique (on this host) to this
1422 * container instance */
1423 memcpy(v + sizeof(sd_id128_t), arg_machine, l);
1425 /* Let's hash the host machine ID plus the container name. We
1426 * use a fixed, but originally randomly created hash key here. */
1427 siphash24(result, v, sz, HASH_KEY.bytes);
1429 assert_cc(ETH_ALEN <= sizeof(result));
1430 memcpy(mac->ether_addr_octet, result, ETH_ALEN);
1432 /* see eth_random_addr in the kernel */
1433 mac->ether_addr_octet[0] &= 0xfe; /* clear multicast bit */
1434 mac->ether_addr_octet[0] |= 0x02; /* set local assignment bit (IEEE802) */
/* Creates a veth pair through rtnetlink.
 *
 * The host side name is written to iface_name: the prefix vb- (when
 * arg_network_bridge is set) or ve-, followed by arg_machine, truncated to
 * fit IFNAMSIZ and always NUL terminated. The peer is named host0, gets the
 * address held in mac and is placed in the network namespace of pid
 * (IFLA_NET_NS_PID). The flags arg_private_network and arg_network_veth are
 * checked up front. Every failing netlink step is logged as an error. */
1439 static int setup_veth(pid_t pid, char iface_name[IFNAMSIZ]) {
1440 _cleanup_rtnl_message_unref_ sd_rtnl_message *m = NULL;
1441 _cleanup_rtnl_unref_ sd_rtnl *rtnl = NULL;
1442 struct ether_addr mac;
1445 if (!arg_private_network)
1448 if (!arg_network_veth)
1451 /* Use two different interface name prefixes depending whether
1452 * we are in bridge mode or not. */
1453 if (arg_network_bridge)
1454 memcpy(iface_name, "vb-", 3);
1456 memcpy(iface_name, "ve-", 3);
1457 strncpy(iface_name+3, arg_machine, IFNAMSIZ - 3);
/* strncpy() does not terminate the string when arg_machine fills the
 * remaining space, so terminate explicitly. */
iface_name[IFNAMSIZ - 1] = 0;
1461 log_error("Failed to generate predictable MAC address for host0");
1465 r = sd_rtnl_open(&rtnl, 0);
1467 log_error("Failed to connect to netlink: %s", strerror(-r));
1471 r = sd_rtnl_message_new_link(rtnl, &m, RTM_NEWLINK, 0);
1473 log_error("Failed to allocate netlink message: %s", strerror(-r));
1477 r = sd_rtnl_message_append_string(m, IFLA_IFNAME, iface_name);
1479 log_error("Failed to add netlink interface name: %s", strerror(-r));
1483 r = sd_rtnl_message_open_container(m, IFLA_LINKINFO);
1485 log_error("Failed to open netlink container: %s", strerror(-r));
1489 r = sd_rtnl_message_append_string(m, IFLA_INFO_KIND, "veth");
1491 log_error("Failed to append netlink kind: %s", strerror(-r));
1495 r = sd_rtnl_message_open_container(m, IFLA_INFO_DATA);
1497 log_error("Failed to open netlink container: %s", strerror(-r));
1501 r = sd_rtnl_message_open_container(m, VETH_INFO_PEER);
1503 log_error("Failed to open netlink container: %s", strerror(-r));
1507 r = sd_rtnl_message_append_string(m, IFLA_IFNAME, "host0");
1509 log_error("Failed to add netlink interface name: %s", strerror(-r));
1513 r = sd_rtnl_message_append_ether_addr(m, IFLA_ADDRESS, &mac);
1515 log_error("Failed to add netlink MAC address: %s", strerror(-r));
1519 r = sd_rtnl_message_append_u32(m, IFLA_NET_NS_PID, pid);
1521 log_error("Failed to add netlink namespace field: %s", strerror(-r));
1525 r = sd_rtnl_message_close_container(m);
1527 log_error("Failed to close netlink container: %s", strerror(-r));
1531 r = sd_rtnl_message_close_container(m);
1533 log_error("Failed to close netlink container: %s", strerror(-r));
1537 r = sd_rtnl_message_close_container(m);
1539 log_error("Failed to close netlink container: %s", strerror(-r));
1543 r = sd_rtnl_call(rtnl, m, 0, NULL);
1545 log_error("Failed to add new veth interfaces: %s", strerror(-r));
/* Attaches the interface veth_name to the bridge arg_network_bridge.
 *
 * The bridge name is resolved with if_nametoindex(); an RTM_SETLINK message
 * is then sent that sets IFF_UP on veth_name and sets IFLA_MASTER to the
 * bridge index. The flags arg_private_network and arg_network_veth and the
 * bridge name are checked up front. Every failing step is logged as an
 * error. */
1552 static int setup_bridge(const char veth_name[]) {
1553 _cleanup_rtnl_message_unref_ sd_rtnl_message *m = NULL;
1554 _cleanup_rtnl_unref_ sd_rtnl *rtnl = NULL;
1557 if (!arg_private_network)
1560 if (!arg_network_veth)
1563 if (!arg_network_bridge)
1566 bridge = (int) if_nametoindex(arg_network_bridge);
1568 log_error("Failed to resolve interface %s: %m", arg_network_bridge);
1572 r = sd_rtnl_open(&rtnl, 0);
1574 log_error("Failed to connect to netlink: %s", strerror(-r));
1578 r = sd_rtnl_message_new_link(rtnl, &m, RTM_SETLINK, 0);
1580 log_error("Failed to allocate netlink message: %s", strerror(-r));
1584 r = sd_rtnl_message_link_set_flags(m, IFF_UP, IFF_UP);
1586 log_error("Failed to set IFF_UP flag: %s", strerror(-r));
1590 r = sd_rtnl_message_append_string(m, IFLA_IFNAME, veth_name);
1592 log_error("Failed to add netlink interface name field: %s", strerror(-r));
1596 r = sd_rtnl_message_append_u32(m, IFLA_MASTER, bridge);
1598 log_error("Failed to add netlink master field: %s", strerror(-r));
1602 r = sd_rtnl_call(rtnl, m, 0, NULL);
1604 log_error("Failed to add veth interface to bridge: %s", strerror(-r));
/* Resolves the network interface name to its index and checks through udev
 * that the device has been initialized.
 *
 * The udev device is looked up by the device ID formed from the letter n
 * followed by the interface index.
 * NOTE(review): callers store the result in a variable named ifi, so the
 * function presumably returns the interface index on success and a negative
 * value on error -- the return statements are not visible here. */
1611 static int parse_interface(struct udev *udev, const char *name) {
1612 _cleanup_udev_device_unref_ struct udev_device *d = NULL;
1613 char ifi_str[2 + DECIMAL_STR_MAX(int)];
1616 ifi = (int) if_nametoindex(name);
1618 log_error("Failed to resolve interface %s: %m", name);
1622 sprintf(ifi_str, "n%i", ifi);
1623 d = udev_device_new_from_device_id(udev, ifi_str);
1625 log_error("Failed to get udev device for interface %s: %m", name);
1629 if (udev_device_get_is_initialized(d) <= 0) {
1630 log_error("Network interface %s is not initialized yet.", name);
/* Move the interfaces listed in arg_network_interfaces into the network
 * namespace of process pid.
 *
 * For every list entry, parse_interface() provides the interface index and an
 * RTM_NEWLINK request for that index with IFLA_NET_NS_PID = pid is sent over
 * a netlink connection that is opened once for the whole loop.
 *
 * pid: process whose network namespace receives the interfaces. */
1637 static int move_network_interfaces(pid_t pid) {
1638 _cleanup_udev_unref_ struct udev *udev = NULL;
1639 _cleanup_rtnl_unref_ sd_rtnl *rtnl = NULL;
/* Checks for private networking and for a non-empty interface list.
 * NOTE(review): presumably both checks return early -- confirm. */
1643 if (!arg_private_network)
1646 if (strv_isempty(arg_network_interfaces))
1649 r = sd_rtnl_open(&rtnl, 0);
1651 log_error("Failed to connect to netlink: %s", strerror(-r));
1657 log_error("Failed to connect to udev.");
1661 STRV_FOREACH(i, arg_network_interfaces) {
/* One message per interface; the cleanup attribute releases it at the end of each iteration. */
1662 _cleanup_rtnl_message_unref_ sd_rtnl_message *m = NULL;
1665 ifi = parse_interface(udev, *i);
/* The request addresses the existing link by its index. */
1669 r = sd_rtnl_message_new_link(rtnl, &m, RTM_NEWLINK, ifi);
1671 log_error("Failed to allocate netlink message: %s", strerror(-r));
/* IFLA_NET_NS_PID names the target namespace through a process that lives in it. */
1675 r = sd_rtnl_message_append_u32(m, IFLA_NET_NS_PID, pid);
1677 log_error("Failed to append namespace PID to netlink message: %s", strerror(-r));
1681 r = sd_rtnl_call(rtnl, m, 0, NULL);
1683 log_error("Failed to move interface %s to namespace: %s", *i, strerror(-r));
/* Create one macvlan interface per entry of arg_network_macvlan, placed in
 * the network namespace of process pid.
 *
 * Each entry results in one RTM_NEWLINK request with these attributes:
 *   IFLA_LINK        index of the underlying interface (from parse_interface())
 *   IFLA_IFNAME      "mv-" + underlying name, passed through strshorten(n, IFNAMSIZ-1)
 *   IFLA_NET_NS_PID  pid
 *   IFLA_LINKINFO    { IFLA_INFO_KIND = "macvlan",
 *                      IFLA_INFO_DATA { IFLA_MACVLAN_MODE = MACVLAN_MODE_BRIDGE } }
 *
 * pid: process whose network namespace receives the new interfaces. */
1691 static int setup_macvlan(pid_t pid) {
1692 _cleanup_udev_unref_ struct udev *udev = NULL;
1693 _cleanup_rtnl_unref_ sd_rtnl *rtnl = NULL;
/* Checks for private networking and for a non-empty macvlan list.
 * NOTE(review): presumably both checks return early -- confirm. */
1697 if (!arg_private_network)
1700 if (strv_isempty(arg_network_macvlan))
1703 r = sd_rtnl_open(&rtnl, 0);
1705 log_error("Failed to connect to netlink: %s", strerror(-r));
1711 log_error("Failed to connect to udev.");
1715 STRV_FOREACH(i, arg_network_macvlan) {
/* Per-iteration message and name buffer, both released by their cleanup attributes. */
1716 _cleanup_rtnl_message_unref_ sd_rtnl_message *m = NULL;
1717 _cleanup_free_ char *n = NULL;
1720 ifi = parse_interface(udev, *i);
/* Index 0: a new link is being created, the parent is given via IFLA_LINK below. */
1724 r = sd_rtnl_message_new_link(rtnl, &m, RTM_NEWLINK, 0);
1726 log_error("Failed to allocate netlink message: %s", strerror(-r));
1730 r = sd_rtnl_message_append_u32(m, IFLA_LINK, ifi);
1732 log_error("Failed to add netlink interface index: %s", strerror(-r));
/* Derive the new interface's name from the underlying one and limit its length. */
1736 n = strappend("mv-", *i);
1740 strshorten(n, IFNAMSIZ-1);
1742 r = sd_rtnl_message_append_string(m, IFLA_IFNAME, n);
1744 log_error("Failed to add netlink interface name: %s", strerror(-r));
1748 r = sd_rtnl_message_append_u32(m, IFLA_NET_NS_PID, pid);
1750 log_error("Failed to add netlink namespace field: %s", strerror(-r));
/* Outer container: link type information. */
1754 r = sd_rtnl_message_open_container(m, IFLA_LINKINFO);
1756 log_error("Failed to open netlink container: %s", strerror(-r));
1760 r = sd_rtnl_message_append_string(m, IFLA_INFO_KIND, "macvlan");
1762 log_error("Failed to append netlink kind: %s", strerror(-r));
/* Inner container: macvlan-specific settings. */
1766 r = sd_rtnl_message_open_container(m, IFLA_INFO_DATA);
1768 log_error("Failed to open netlink container: %s", strerror(-r));
1772 r = sd_rtnl_message_append_u32(m, IFLA_MACVLAN_MODE, MACVLAN_MODE_BRIDGE);
1774 log_error("Failed to append macvlan mode: %s", strerror(-r));
/* Two closes, matching the two containers opened above (inner first). */
1778 r = sd_rtnl_message_close_container(m);
1780 log_error("Failed to close netlink container: %s", strerror(-r));
1784 r = sd_rtnl_message_close_container(m);
1786 log_error("Failed to close netlink container: %s", strerror(-r));
1790 r = sd_rtnl_call(rtnl, m, 0, NULL);
1792 log_error("Failed to add new macvlan interfaces: %s", strerror(-r));
/* Install a seccomp filter that keeps the container payload from creating
 * audit netlink sockets.
 *
 * The filter's default action is SCMP_ACT_ALLOW. One rule is added with
 * action SCMP_ACT_ERRNO(EAFNOSUPPORT) that matches when argument 0 equals
 * AF_NETLINK and argument 2 equals NETLINK_AUDIT. SCMP_FLTATR_CTL_NNP is set
 * to 0 before the filter is loaded, and the filter context is released with
 * seccomp_release() at the end. */
1800 static int audit_still_doesnt_work_in_containers(void) {
1803 scmp_filter_ctx seccomp;
1807 Audit is broken in containers, much of the userspace audit
1808 hookup will fail if running inside a container. We don't
1809 care and just turn off creation of audit sockets.
1811 This will make socket(AF_NETLINK, *, NETLINK_AUDIT) fail
1812 with EAFNOSUPPORT which audit userspace uses as indication
1813 that audit is disabled in the kernel.
1816 seccomp = seccomp_init(SCMP_ACT_ALLOW);
/* seccomp_add_secondary_archs() is a helper defined outside this block; per
 * the message below it adds secondary architectures to the filter. */
1820 r = seccomp_add_secondary_archs(seccomp);
1822 log_error("Failed to add secondary archs to seccomp filter: %s", strerror(-r));
1826 r = seccomp_rule_add(
1828 SCMP_ACT_ERRNO(EAFNOSUPPORT),
1831 SCMP_A0(SCMP_CMP_EQ, AF_NETLINK),
1832 SCMP_A2(SCMP_CMP_EQ, NETLINK_AUDIT));
1834 log_error("Failed to add audit seccomp rule: %s", strerror(-r));
/* Clear the NNP attribute so that loading this filter does not set
 * NO_NEW_PRIVS (see the error message below). */
1838 r = seccomp_attr_set(seccomp, SCMP_FLTATR_CTL_NNP, 0);
1840 log_error("Failed to unset NO_NEW_PRIVS: %s", strerror(-r));
1844 r = seccomp_load(seccomp);
1846 log_error("Failed to install seccomp audit filter: %s", strerror(-r));
/* The in-memory filter description is freed once it has been handed to seccomp_load(). */
1849 seccomp_release(seccomp);
/* Open the disk image named by arg_image and expose it as a block device.
 *
 * - If arg_image is a block device, its path is duplicated and used as is.
 * - If it is a regular file, a free loop device number is requested from
 *   /dev/loop-control (LOOP_CTL_GET_FREE), /dev/loop<nr> is opened, the image
 *   is attached with LOOP_SET_FD and configured with LOOP_SET_STATUS64 using
 *   LO_FLAGS_AUTOCLEAR|LO_FLAGS_PARTSCAN (LO_FLAGS_READ_ONLY may be added).
 * - Any other file type is rejected with an error message.
 *
 * The image and the loop device are opened read-only when arg_read_only is
 * set, read-write otherwise.
 *
 * device_path: receives the path of the device node (must not be NULL).
 * loop_nr:     NOTE(review): presumably receives the allocated loop number --
 *              confirm against the full function.
 *
 * main() stores the return value as the image file descriptor. */
1857 static int setup_image(char **device_path, int *loop_nr) {
1858 struct loop_info64 info = {
1859 .lo_flags = LO_FLAGS_AUTOCLEAR|LO_FLAGS_PARTSCAN
/* All three descriptors are closed automatically on scope exit unless ownership is handed over. */
1861 _cleanup_close_ int fd = -1, control = -1, loop = -1;
1862 _cleanup_free_ char* loopdev = NULL;
1866 assert(device_path);
1869 fd = open(arg_image, O_CLOEXEC|(arg_read_only ? O_RDONLY : O_RDWR)|O_NONBLOCK|O_NOCTTY);
1871 log_error("Failed to open %s: %m", arg_image);
1875 if (fstat(fd, &st) < 0) {
1876 log_error("Failed to stat %s: %m", arg_image);
/* Block devices need no loop device. */
1880 if (S_ISBLK(st.st_mode)) {
1883 p = strdup(arg_image);
1897 if (!S_ISREG(st.st_mode)) {
/* No system call failed on this path, so errno is stale: the message must not use %m. */
1898 log_error("%s is not a regular file or block device.", arg_image);
1902 control = open("/dev/loop-control", O_RDWR|O_CLOEXEC|O_NOCTTY|O_NONBLOCK);
1904 log_error("Failed to open /dev/loop-control: %m");
/* LOOP_CTL_GET_FREE yields the number of an unused loop device. */
1908 nr = ioctl(control, LOOP_CTL_GET_FREE);
1910 log_error("Failed to allocate loop device: %m");
1914 if (asprintf(&loopdev, "/dev/loop%i", nr) < 0)
1917 loop = open(loopdev, O_CLOEXEC|(arg_read_only ? O_RDONLY : O_RDWR)|O_NONBLOCK|O_NOCTTY);
1919 log_error("Failed to open loop device %s: %m", loopdev);
/* Bind the image file to the loop device. */
1923 if (ioctl(loop, LOOP_SET_FD, fd) < 0) {
1924 log_error("Failed to set loopback file descriptor on %s: %m", loopdev);
1929 info.lo_flags |= LO_FLAGS_READ_ONLY;
1931 if (ioctl(loop, LOOP_SET_STATUS64, &info) < 0) {
1932 log_error("Failed to set loopback settings on %s: %m", loopdev);
/* Hand the loop device path to the caller. */
1936 *device_path = loopdev;
/* Locate the root, /home and /srv partitions inside a GPT disk image.
 *
 * Steps visible in this function:
 *  1. Probe the device with libblkid (partition probing enabled, with
 *     BLKID_PARTS_ENTRY_DETAILS) and require PTTYPE to be "gpt".
 *  2. Look up the image's block device in udev and enumerate the devices
 *     that have it as parent.
 *  3. For each enumerated device, map its device number to a libblkid
 *     partition, test GPT_FLAG_NO_AUTO, and classify it by GPT type UUID:
 *     GPT_HOME, GPT_SRV, and -- when defined at build time --
 *     GPT_ROOT_NATIVE and GPT_ROOT_SECONDARY. For each class the partition
 *     number is compared with the one recorded so far (nr >= *_nr), and the
 *     read-write state is derived from GPT_FLAG_READ_ONLY.
 *  4. Report the result: *root_device is the native root when one was
 *     found, otherwise the secondary root; *home_device and the *_rw flags
 *     are filled in as well.
 *
 * The final log_error() is what a build without blkid support reports. */
1947 static int dissect_image(
1949 char **root_device, bool *root_device_rw,
1950 char **home_device, bool *home_device_rw,
1951 char **srv_device, bool *srv_device_rw,
/* Partition number recorded for each class so far; -1 means none yet. */
1955 int home_nr = -1, root_nr = -1, secondary_root_nr = -1, srv_nr = -1;
/* Device node paths selected so far; freed automatically unless handed to the caller. */
1956 _cleanup_free_ char *home = NULL, *root = NULL, *secondary_root = NULL, *srv = NULL;
1957 _cleanup_udev_enumerate_unref_ struct udev_enumerate *e = NULL;
1958 _cleanup_udev_device_unref_ struct udev_device *d = NULL;
1959 _cleanup_blkid_free_probe_ blkid_probe b = NULL;
1960 _cleanup_udev_unref_ struct udev *udev = NULL;
1961 struct udev_list_entry *first, *item;
/* Partitions count as writable unless GPT_FLAG_READ_ONLY says otherwise. */
1962 bool home_rw = true, root_rw = true, secondary_root_rw = true, srv_rw = true;
1963 const char *pttype = NULL;
1969 assert(root_device);
1970 assert(home_device);
1974 b = blkid_new_probe();
1979 r = blkid_probe_set_device(b, fd, 0, 0);
1984 log_error("Failed to set device on blkid probe: %m");
1988 blkid_probe_enable_partitions(b, 1);
1989 blkid_probe_set_partitions_flags(b, BLKID_PARTS_ENTRY_DETAILS);
/* -2 and 1 are reported as "no partition table identified"; any other
 * non-zero result is reported as a probe failure with errno. */
1992 r = blkid_do_safeprobe(b);
1993 if (r == -2 || r == 1) {
1994 log_error("Failed to identify any partition table on %s.\n"
1995 "Note that the disk image needs to follow http://www.freedesktop.org/wiki/Specifications/DiscoverablePartitionsSpec/ to be supported by systemd-nspawn.", arg_image);
1997 } else if (r != 0) {
2000 log_error("Failed to probe: %m");
/* The lookup result is not checked: pttype stays NULL on failure and
 * streq_ptr() then compares NULL against "gpt". */
2004 blkid_probe_lookup_value(b, "PTTYPE", &pttype, NULL);
2005 if (!streq_ptr(pttype, "gpt")) {
2006 log_error("Image %s does not carry a GUID Partition Table.\n"
2007 "Note that the disk image needs to follow http://www.freedesktop.org/wiki/Specifications/DiscoverablePartitionsSpec/ to be supported by systemd-nspawn.", arg_image);
2012 pl = blkid_probe_get_partitions(b);
2017 log_error("Failed to list partitions of %s", arg_image);
/* st.st_rdev identifies the image's own block device for the udev lookup below. */
2025 if (fstat(fd, &st) < 0) {
2026 log_error("Failed to stat block device: %m");
2030 d = udev_device_new_from_devnum(udev, 'b', st.st_rdev);
2034 e = udev_enumerate_new(udev);
/* Restrict the enumeration to devices below the image's block device. */
2038 r = udev_enumerate_add_match_parent(e, d);
2042 r = udev_enumerate_scan_devices(e);
2044 log_error("Failed to scan for partition devices of %s: %s", arg_image, strerror(-r));
2048 first = udev_enumerate_get_list_entry(e);
2049 udev_list_entry_foreach(item, first) {
2050 _cleanup_udev_device_unref_ struct udev_device *q;
2051 const char *stype, *node;
2052 unsigned long long flags;
2059 q = udev_device_new_from_syspath(udev, udev_list_entry_get_name(item));
2064 log_error("Failed to get partition device of %s: %m", arg_image);
2068 qn = udev_device_get_devnum(q);
/* Test for the enumerated entry being the whole-disk device itself. */
2072 if (st.st_rdev == qn)
2075 node = udev_device_get_devnode(q);
/* Map the udev device to the matching libblkid partition entry. */
2079 pp = blkid_partlist_devno_to_partition(pl, qn);
2083 flags = blkid_partition_get_flags(pp);
2084 if (flags & GPT_FLAG_NO_AUTO)
2087 nr = blkid_partition_get_partno(pp);
2091 stype = blkid_partition_get_type_string(pp);
/* The partition type string is parsed as a 128-bit id for the comparisons below. */
2095 if (sd_id128_from_string(stype, &type_id) < 0)
2098 if (sd_id128_equal(type_id, GPT_HOME)) {
2100 if (home && nr >= home_nr)
2104 home_rw = !(flags & GPT_FLAG_READ_ONLY);
2107 home = strdup(node);
2110 } else if (sd_id128_equal(type_id, GPT_SRV)) {
2112 if (srv && nr >= srv_nr)
2116 srv_rw = !(flags & GPT_FLAG_READ_ONLY);
2123 #ifdef GPT_ROOT_NATIVE
2124 else if (sd_id128_equal(type_id, GPT_ROOT_NATIVE)) {
2126 if (root && nr >= root_nr)
2130 root_rw = !(flags & GPT_FLAG_READ_ONLY);
2133 root = strdup(node);
2138 #ifdef GPT_ROOT_SECONDARY
2139 else if (sd_id128_equal(type_id, GPT_ROOT_SECONDARY)) {
2141 if (secondary_root && nr >= secondary_root_nr)
2144 secondary_root_nr = nr;
2145 secondary_root_rw = !(flags & GPT_FLAG_READ_ONLY);
/* Replace the previously recorded secondary root path. */
2148 free(secondary_root);
2149 secondary_root = strdup(node);
2150 if (!secondary_root)
/* At least one kind of root partition is required. */
2156 if (!root && !secondary_root) {
2157 log_error("Failed to identify root partition in disk image %s.\n"
2158 "Note that the disk image needs to follow http://www.freedesktop.org/wiki/Specifications/DiscoverablePartitionsSpec/ to be supported by systemd-nspawn.", arg_image);
2163 *root_device = root;
2166 *root_device_rw = root_rw;
/* Fall back to the secondary root; resetting the local pointer to NULL keeps
 * the cleanup handler from freeing the string now owned by the caller. */
2168 } else if (secondary_root) {
2169 *root_device = secondary_root;
2170 secondary_root = NULL;
2172 *root_device_rw = secondary_root_rw;
2177 *home_device = home;
2180 *home_device_rw = home_rw;
2187 *srv_device_rw = srv_rw;
2192 log_error("--image= is not supported, compiled without blkid support.");
/* Mount one partition of the disk image.
 *
 * The file system type of the device is detected with libblkid (superblock
 * probing, BLKID_SUBLKS_TYPE) and the device is then mounted on the path
 * built from where and directory with MS_NODEV, plus MS_RDONLY when rw is
 * false. Devices of type "crypto_LUKS" are rejected as unsupported.
 *
 * what:      device node to mount.
 * where:     base directory of the container tree.
 * directory: path below where that serves as mount point.
 * rw:        mount read-write if true, read-only otherwise.
 *
 * The final log_error() is what a build without blkid support reports. */
2197 static int mount_device(const char *what, const char *where, const char *directory, bool rw) {
2199 _cleanup_blkid_free_probe_ blkid_probe b = NULL;
2200 const char *fstype, *p;
/* Mount point: where with directory appended. */
2210 p = strappenda(where, directory);
2215 b = blkid_new_probe_from_filename(what);
2219 log_error("Failed to allocate prober for %s: %m", what);
2223 blkid_probe_enable_superblocks(b, 1);
2224 blkid_probe_set_superblocks_flags(b, BLKID_SUBLKS_TYPE);
/* blkid_do_safeprobe() returns 0 on success, 1 if nothing was detected, -2 if
 * the result is ambivalent and -1 on error. Only "nothing detected" and
 * "ambivalent" mean the type cannot be determined; -1 is a real probe
 * failure and belongs to the errno-reporting branch. This matches the check
 * dissect_image() performs on the same call. */
2227 r = blkid_do_safeprobe(b);
2228 if (r == -2 || r == 1) {
2229 log_error("Cannot determine file system type of %s", what);
2231 } else if (r != 0) {
2234 log_error("Failed to probe %s: %m", what);
2239 if (blkid_probe_lookup_value(b, "TYPE", &fstype, NULL) < 0) {
2242 log_error("Failed to determine file system type of %s", what);
/* Encrypted containers cannot be mounted directly. */
2246 if (streq(fstype, "crypto_LUKS")) {
2247 log_error("nspawn currently does not support LUKS disk images.");
2251 if (mount(what, p, fstype, MS_NODEV|(rw ? 0 : MS_RDONLY), NULL) < 0) {
2252 log_error("Failed to mount %s: %m", what);
2258 log_error("--image= is not supported, compiled without blkid support.");
/* Mount the partitions found by dissect_image() into the container tree.
 *
 * Calls mount_device() up to three times: the root device on arg_directory
 * itself, the home device on "/home" below it and the server data device on
 * "/srv" below it, each with its own read-write flag. A failing mount is
 * reported through log_error(). */
2263 static int mount_devices(
2265 const char *root_device, bool root_device_rw,
2266 const char *home_device, bool home_device_rw,
2267 const char *srv_device, bool srv_device_rw) {
/* Root file system: no sub-directory is given. */
2273 r = mount_device(root_device, arg_directory, NULL, root_device_rw);
2275 log_error("Failed to mount root directory: %s", strerror(-r));
2281 r = mount_device(home_device, arg_directory, "/home", home_device_rw);
2283 log_error("Failed to mount home directory: %s", strerror(-r));
2289 r = mount_device(srv_device, arg_directory, "/srv", srv_device_rw);
2291 log_error("Failed to mount server data directory: %s", strerror(-r));
/* Tear down the loop device that was set up for the image.
 *
 * If image_fd points to a valid descriptor, LOOP_CLR_FD is issued on it and
 * the descriptor is closed. Afterwards LOOP_CTL_REMOVE for loop number nr is
 * issued on /dev/loop-control. The ioctl() results are not checked.
 *
 * nr:       loop device number.
 * image_fd: pointer to the descriptor of the loop device; may be NULL. */
2299 static void loop_remove(int nr, int *image_fd) {
2300 _cleanup_close_ int control = -1;
2305 if (image_fd && *image_fd >= 0) {
/* Detach the backing file from the loop device before closing it. */
2306 ioctl(*image_fd, LOOP_CLR_FD);
2307 close_nointr_nofail(*image_fd);
2311 control = open("/dev/loop-control", O_RDWR|O_CLOEXEC|O_NOCTTY|O_NONBLOCK);
/* Ask the loop control device to remove the loop device with this number. */
2315 ioctl(control, LOOP_CTL_REMOVE, nr);
/* Run "getent <database> <key>" in a child process and capture its output.
 *
 * A pipe is created with O_CLOEXEC. In the child, the pipe's write end
 * becomes stdout, stdin and stderr are pointed at /dev/null, signal handlers
 * are reset, all remaining descriptors are closed and getent is executed
 * with an empty environment (/usr/bin/getent first, then /bin/getent). The
 * child exits with EXIT_FAILURE if any step fails. The parent closes the
 * write end of the pipe.
 *
 * database: getent database name (this file uses "passwd" and "initgroups").
 * key:      key to look up.
 * rpid:     NOTE(review): presumably receives the child's pid -- confirm.
 *
 * change_uid_gid() uses the return value as the descriptor to read from. */
2318 static int spawn_getent(const char *database, const char *key, pid_t *rpid) {
2326 if (pipe2(pipe_fds, O_CLOEXEC) < 0) {
2327 log_error("Failed to allocate pipe: %m");
2333 log_error("Failed to fork getent child: %m");
2335 } else if (pid == 0) {
/* NULL-terminated empty environment block passed to execle(). */
2337 char *empty_env = NULL;
/* dup3() with flags 0 leaves the new stdout without O_CLOEXEC, so it survives exec. */
2339 if (dup3(pipe_fds[1], STDOUT_FILENO, 0) < 0)
2340 _exit(EXIT_FAILURE);
/* Close the original pipe descriptors, except when they occupy a standard descriptor number (0-2). */
2342 if (pipe_fds[0] > 2)
2343 close_nointr_nofail(pipe_fds[0]);
2344 if (pipe_fds[1] > 2)
2345 close_nointr_nofail(pipe_fds[1]);
2347 nullfd = open("/dev/null", O_RDWR);
2349 _exit(EXIT_FAILURE);
2351 if (dup3(nullfd, STDIN_FILENO, 0) < 0)
2352 _exit(EXIT_FAILURE);
2354 if (dup3(nullfd, STDERR_FILENO, 0) < 0)
2355 _exit(EXIT_FAILURE);
2358 close_nointr_nofail(nullfd);
2360 reset_all_signal_handlers();
2361 close_all_fds(NULL, 0);
/* The second execle() only runs if the first one failed; if both fail the child exits. */
2363 execle("/usr/bin/getent", "getent", database, key, NULL, &empty_env);
2364 execle("/bin/getent", "getent", database, key, NULL, &empty_env);
2365 _exit(EXIT_FAILURE);
/* Parent: the write end belongs to the child only. */
2368 close_nointr_nofail(pipe_fds[1]);
/* Switch the calling process to the user selected with arg_user.
 *
 * - No user, "root" or "0": supplementary groups are cleared and all group
 *   and user ids are set to 0.
 * - Any other user: the passwd record is obtained from "getent passwd" (via
 *   spawn_getent()) and split at its ':' separators; uid and gid are parsed
 *   from it. Supplementary groups are read from "getent initgroups". The
 *   home directory is created (parents with mode 0775, the directory itself
 *   with mode 0755 and owned by uid/gid; an existing directory is accepted),
 *   stdin/stdout/stderr are chowned to the user, and finally setgroups(),
 *   setresgid() and setresuid() are applied in that order.
 *
 * _home: NOTE(review): presumably receives the user's home directory path --
 *        confirm against the full function.
 *
 * The error messages name the system calls that are actually invoked
 * (setresgid()/setresuid()). */
2376 static int change_uid_gid(char **_home) {
2377 char line[LINE_MAX], *w, *x, *state, *u, *g, *h;
/* Holds the supplementary group ids handed to setgroups() further down. */
2378 _cleanup_free_ uid_t *uids = NULL;
2379 _cleanup_free_ char *home = NULL;
2380 _cleanup_fclose_ FILE *f = NULL;
2381 _cleanup_close_ int fd = -1;
2382 unsigned n_uids = 0;
2391 if (!arg_user || streq(arg_user, "root") || streq(arg_user, "0")) {
2392 /* Reset everything fully to 0, just in case */
2394 if (setgroups(0, NULL) < 0) {
2395 log_error("setgroups() failed: %m");
2399 if (setresgid(0, 0, 0) < 0) {
2400 log_error("setresgid() failed: %m");
2404 if (setresuid(0, 0, 0) < 0) {
2405 log_error("setresuid() failed: %m");
2413 /* First, get user credentials */
2414 fd = spawn_getent("passwd", arg_user, &pid);
2418 f = fdopen(fd, "r");
/* Only the first line of getent's output is used. */
2423 if (!fgets(line, sizeof(line), f)) {
2426 log_error("Failed to resolve user %s.", arg_user);
2430 log_error("Failed to read from getent: %m");
2436 wait_for_terminate_and_warn("getent passwd", pid);
/* Walk the colon-separated passwd fields: name, password, UID, GID, GECOS, home. */
2438 x = strchr(line, ':');
2440 log_error("/etc/passwd entry has invalid user field.");
2444 u = strchr(x+1, ':');
2446 log_error("/etc/passwd entry has invalid password field.");
2453 log_error("/etc/passwd entry has invalid UID field.");
2461 log_error("/etc/passwd entry has invalid GID field.");
2466 h = strchr(x+1, ':');
2468 log_error("/etc/passwd entry has invalid GECOS field.");
2475 log_error("/etc/passwd entry has invalid home directory field.");
2481 r = parse_uid(u, &uid);
2483 log_error("Failed to parse UID of user.");
2487 r = parse_gid(g, &gid);
2489 log_error("Failed to parse GID of user.");
2497 /* Second, get group memberships */
2498 fd = spawn_getent("initgroups", arg_user, &pid);
2503 f = fdopen(fd, "r");
2508 if (!fgets(line, sizeof(line), f)) {
2510 log_error("Failed to resolve user %s.", arg_user);
2514 log_error("Failed to read from getent: %m");
2520 wait_for_terminate_and_warn("getent initgroups", pid);
2522 /* Skip over the username and subsequent separator whitespace */
2524 x += strcspn(x, WHITESPACE);
2525 x += strspn(x, WHITESPACE);
/* Each remaining word is one group id; the array grows by one slot per word. */
2527 FOREACH_WORD(w, l, x, state) {
2533 if (!GREEDY_REALLOC(uids, sz, n_uids+1))
2536 r = parse_uid(c, &uids[n_uids++]);
2538 log_error("Failed to parse group data from getent.");
2543 r = mkdir_parents(home, 0775);
2545 log_error("Failed to make home root directory: %s", strerror(-r));
/* An already existing home directory is not an error. */
2549 r = mkdir_safe(home, 0755, uid, gid);
2550 if (r < 0 && r != -EEXIST) {
2551 log_error("Failed to make home directory: %s", strerror(-r));
/* Give the user ownership of the standard descriptors; results are not checked. */
2555 fchown(STDIN_FILENO, uid, gid);
2556 fchown(STDOUT_FILENO, uid, gid);
2557 fchown(STDERR_FILENO, uid, gid);
/* Groups first, then gid, then uid. */
2559 if (setgroups(n_uids, uids) < 0) {
2560 log_error("Failed to set auxiliary groups: %m");
2564 if (setresgid(gid, gid, gid) < 0) {
2565 log_error("setresgid() failed: %m");
2569 if (setresuid(uid, uid, uid) < 0) {
2570 log_error("setresuid() failed: %m");
2582 int main(int argc, char *argv[]) {
2584 _cleanup_free_ char *kdbus_domain = NULL, *device_path = NULL, *root_device = NULL, *home_device = NULL, *srv_device = NULL;
2585 bool root_device_rw = true, home_device_rw = true, srv_device_rw = true;
2586 _cleanup_close_ int master = -1, kdbus_fd = -1, image_fd = -1;
2587 _cleanup_close_pipe_ int kmsg_socket_pair[2] = { -1, -1 };
2588 _cleanup_fdset_free_ FDSet *fds = NULL;
2589 int r = EXIT_FAILURE, k, n_fd_passed, loop_nr = -1;
2590 const char *console = NULL;
2591 char veth_name[IFNAMSIZ];
2592 bool secondary = false;
2596 log_parse_environment();
2599 k = parse_argv(argc, argv);
2608 if (arg_directory) {
2611 p = path_make_absolute_cwd(arg_directory);
2612 free(arg_directory);
2615 arg_directory = get_current_dir_name();
2617 if (!arg_directory) {
2618 log_error("Failed to determine path, please use -D.");
2621 path_kill_slashes(arg_directory);
2625 arg_machine = strdup(basename(arg_image ? arg_image : arg_directory));
2631 hostname_cleanup(arg_machine, false);
2632 if (isempty(arg_machine)) {
2633 log_error("Failed to determine machine name automatically, please use -M.");
2638 if (geteuid() != 0) {
2639 log_error("Need to be root.");
2643 if (sd_booted() <= 0) {
2644 log_error("Not running on a systemd system.");
2649 n_fd_passed = sd_listen_fds(false);
2650 if (n_fd_passed > 0) {
2651 k = fdset_new_listen_fds(&fds, false);
2653 log_error("Failed to collect file descriptors: %s", strerror(-k));
2657 fdset_close_others(fds);
2660 if (arg_directory) {
2661 if (path_equal(arg_directory, "/")) {
2662 log_error("Spawning container on root directory not supported.");
2667 if (path_is_os_tree(arg_directory) <= 0) {
2668 log_error("Directory %s doesn't look like an OS root directory (/etc/os-release is missing). Refusing.", arg_directory);
2674 p = strappenda(arg_directory,
2675 argc > optind && path_is_absolute(argv[optind]) ? argv[optind] : "/usr/bin/");
2676 if (access(p, F_OK) < 0) {
2677 log_error("Directory %s lacks the binary to execute or doesn't look like a binary tree. Refusing.", arg_directory);
2683 char template[] = "/tmp/nspawn-root-XXXXXX";
2685 if (!mkdtemp(template)) {
2686 log_error("Failed to create temporary directory: %m");
2691 arg_directory = strdup(template);
2692 if (!arg_directory) {
2697 image_fd = setup_image(&device_path, &loop_nr);
2703 r = dissect_image(image_fd, &root_device, &root_device_rw, &home_device, &home_device_rw, &srv_device, &srv_device_rw, &secondary);
2708 master = posix_openpt(O_RDWR|O_NOCTTY|O_CLOEXEC|O_NDELAY);
2710 log_error("Failed to acquire pseudo tty: %m");
2714 console = ptsname(master);
2716 log_error("Failed to determine tty name: %m");
2721 log_info("Spawning container %s on %s. Press ^] three times within 1s to abort execution.", arg_machine, arg_image ? arg_image : arg_directory);
2723 if (unlockpt(master) < 0) {
2724 log_error("Failed to unlock tty: %m");
2728 if (access("/dev/kdbus/control", F_OK) >= 0) {
2730 if (arg_share_system) {
2731 kdbus_domain = strdup("/dev/kdbus");
2732 if (!kdbus_domain) {
2739 ns = strappenda("machine-", arg_machine);
2740 kdbus_fd = bus_kernel_create_domain(ns, &kdbus_domain);
2742 log_debug("Failed to create kdbus domain: %s", strerror(-r));
2744 log_debug("Successfully created kdbus domain as %s", kdbus_domain);
2748 if (socketpair(AF_UNIX, SOCK_DGRAM|SOCK_NONBLOCK|SOCK_CLOEXEC, 0, kmsg_socket_pair) < 0) {
2749 log_error("Failed to create kmsg socket pair: %m");
2753 sd_notify(0, "READY=1");
2755 assert_se(sigemptyset(&mask) == 0);
2756 sigset_add_many(&mask, SIGCHLD, SIGWINCH, SIGTERM, SIGINT, -1);
2757 assert_se(sigprocmask(SIG_BLOCK, &mask, NULL) == 0);
2760 int parent_ready_fd = -1, child_ready_fd = -1;
2764 parent_ready_fd = eventfd(0, EFD_CLOEXEC);
2765 if (parent_ready_fd < 0) {
2766 log_error("Failed to create event fd: %m");
2770 child_ready_fd = eventfd(0, EFD_CLOEXEC);
2771 if (child_ready_fd < 0) {
2772 log_error("Failed to create event fd: %m");
2776 pid = syscall(__NR_clone,
2777 SIGCHLD|CLONE_NEWNS|
2778 (arg_share_system ? 0 : CLONE_NEWIPC|CLONE_NEWPID|CLONE_NEWUTS)|
2779 (arg_private_network ? CLONE_NEWNET : 0), NULL);
2781 if (errno == EINVAL)
2782 log_error("clone() failed, do you have namespace support enabled in your kernel? (You need UTS, IPC, PID and NET namespacing built in): %m");
2784 log_error("clone() failed: %m");
2791 _cleanup_free_ char *home = NULL;
2793 const char *envp[] = {
2794 "PATH=" DEFAULT_PATH_SPLIT_USR,
2795 "container=systemd-nspawn", /* LXC sets container=lxc, so follow the scheme here */
2800 NULL, /* container_uuid */
2801 NULL, /* LISTEN_FDS */
2802 NULL, /* LISTEN_PID */
2807 envp[n_env] = strv_find_prefix(environ, "TERM=");
2811 close_nointr_nofail(master);
2814 close_nointr(STDIN_FILENO);
2815 close_nointr(STDOUT_FILENO);
2816 close_nointr(STDERR_FILENO);
2818 close_nointr_nofail(kmsg_socket_pair[0]);
2819 kmsg_socket_pair[0] = -1;
2821 reset_all_signal_handlers();
2823 assert_se(sigemptyset(&mask) == 0);
2824 assert_se(sigprocmask(SIG_SETMASK, &mask, NULL) == 0);
2826 k = open_terminal(console, O_RDWR);
2827 if (k != STDIN_FILENO) {
2829 close_nointr_nofail(k);
2833 log_error("Failed to open console: %s", strerror(-k));
2837 if (dup2(STDIN_FILENO, STDOUT_FILENO) != STDOUT_FILENO ||
2838 dup2(STDIN_FILENO, STDERR_FILENO) != STDERR_FILENO) {
2839 log_error("Failed to duplicate console: %m");
2844 log_error("setsid() failed: %m");
2848 if (reset_audit_loginuid() < 0)
2851 if (prctl(PR_SET_PDEATHSIG, SIGKILL) < 0) {
2852 log_error("PR_SET_PDEATHSIG failed: %m");
2856 /* Mark everything as slave, so that we still
2857 * receive mounts from the real root, but don't
2858 * propagate mounts to the real root. */
2859 if (mount(NULL, "/", NULL, MS_SLAVE|MS_REC, NULL) < 0) {
2860 log_error("MS_SLAVE|MS_REC failed: %m");
2864 if (mount_devices(arg_directory,
2865 root_device, root_device_rw,
2866 home_device, home_device_rw,
2867 srv_device, srv_device_rw) < 0)
2870 /* Turn directory into bind mount */
2871 if (mount(arg_directory, arg_directory, "bind", MS_BIND|MS_REC, NULL) < 0) {
2872 log_error("Failed to make bind mount.");
2877 if (mount(arg_directory, arg_directory, "bind", MS_BIND|MS_REMOUNT|MS_RDONLY|MS_REC, NULL) < 0) {
2878 log_error("Failed to make read-only.");
2882 if (mount_all(arg_directory) < 0)
2885 if (copy_devnodes(arg_directory) < 0)
2888 if (setup_ptmx(arg_directory) < 0)
2891 dev_setup(arg_directory);
2893 if (audit_still_doesnt_work_in_containers() < 0)
2896 if (setup_dev_console(arg_directory, console) < 0)
2899 if (setup_kmsg(arg_directory, kmsg_socket_pair[1]) < 0)
2902 close_nointr_nofail(kmsg_socket_pair[1]);
2903 kmsg_socket_pair[1] = -1;
2905 if (setup_boot_id(arg_directory) < 0)
2908 if (setup_timezone(arg_directory) < 0)
2911 if (setup_resolv_conf(arg_directory) < 0)
2914 if (setup_journal(arg_directory) < 0)
2917 if (mount_binds(arg_directory, arg_bind, 0) < 0)
2920 if (mount_binds(arg_directory, arg_bind_ro, MS_RDONLY) < 0)
2923 if (setup_kdbus(arg_directory, kdbus_domain) < 0)
2926 /* Tell the parent that we are ready, and that
2927 * it can cgroupify us so that we lack access
2928 * to certain devices and resources. */
2929 eventfd_write(child_ready_fd, 1);
2930 close_nointr_nofail(child_ready_fd);
2931 child_ready_fd = -1;
2933 if (chdir(arg_directory) < 0) {
2934 log_error("chdir(%s) failed: %m", arg_directory);
2938 if (mount(arg_directory, "/", NULL, MS_MOVE, NULL) < 0) {
2939 log_error("mount(MS_MOVE) failed: %m");
2943 if (chroot(".") < 0) {
2944 log_error("chroot() failed: %m");
2948 if (chdir("/") < 0) {
2949 log_error("chdir() failed: %m");
2955 if (arg_private_network)
2958 if (drop_capabilities() < 0) {
2959 log_error("drop_capabilities() failed: %m");
2963 r = change_uid_gid(&home);
2967 if ((asprintf((char**)(envp + n_env++), "HOME=%s", home ? home: "/root") < 0) ||
2968 (asprintf((char**)(envp + n_env++), "USER=%s", arg_user ? arg_user : "root") < 0) ||
2969 (asprintf((char**)(envp + n_env++), "LOGNAME=%s", arg_user ? arg_user : "root") < 0)) {
2974 if (!sd_id128_equal(arg_uuid, SD_ID128_NULL)) {
2975 if (asprintf((char**)(envp + n_env++), "container_uuid=" SD_ID128_FORMAT_STR, SD_ID128_FORMAT_VAL(arg_uuid)) < 0) {
2981 if (fdset_size(fds) > 0) {
2982 k = fdset_cloexec(fds, false);
2984 log_error("Failed to unset O_CLOEXEC for file descriptors.");
2988 if ((asprintf((char **)(envp + n_env++), "LISTEN_FDS=%u", n_fd_passed) < 0) ||
2989 (asprintf((char **)(envp + n_env++), "LISTEN_PID=1") < 0)) {
2997 if (arg_personality != 0xffffffffLU) {
2998 if (personality(arg_personality) < 0) {
2999 log_error("personality() failed: %m");
3002 } else if (secondary) {
3003 if (personality(PER_LINUX32) < 0) {
3004 log_error("personality() failed: %m");
3010 if (arg_selinux_context)
3011 if (setexeccon((security_context_t) arg_selinux_context) < 0) {
3012 log_error("setexeccon(\"%s\") failed: %m", arg_selinux_context);
3017 if (!strv_isempty(arg_setenv)) {
3020 n = strv_env_merge(2, envp, arg_setenv);
3028 env_use = (char**) envp;
3030 /* Wait until the parent is ready with the setup, too... */
3031 eventfd_read(parent_ready_fd, &x);
3032 close_nointr_nofail(parent_ready_fd);
3033 parent_ready_fd = -1;
3039 /* Automatically search for the init system */
3041 l = 1 + argc - optind;
3042 a = newa(char*, l + 1);
3043 memcpy(a + 1, argv + optind, l * sizeof(char*));
3045 a[0] = (char*) "/usr/lib/systemd/systemd";
3046 execve(a[0], a, env_use);
3048 a[0] = (char*) "/lib/systemd/systemd";
3049 execve(a[0], a, env_use);
3051 a[0] = (char*) "/sbin/init";
3052 execve(a[0], a, env_use);
3053 } else if (argc > optind)
3054 execvpe(argv[optind], argv + optind, env_use);
3056 chdir(home ? home : "/root");
3057 execle("/bin/bash", "-bash", NULL, env_use);
3058 execle("/bin/sh", "-sh", NULL, env_use);
3061 log_error("execv() failed: %m");
3064 _exit(EXIT_FAILURE);
3070 /* Wait until the child reported that it is ready with
3071 * all it needs to do with privileges. After we got
3072 * the notification we can make the process join its
3073 * cgroup which might limit what it can do */
3074 eventfd_read(child_ready_fd, &x);
3076 r = register_machine(pid);
3080 r = move_network_interfaces(pid);
3084 r = setup_veth(pid, veth_name);
3088 r = setup_bridge(veth_name);
3092 r = setup_macvlan(pid);
3096 /* Notify the child that the parent is ready with all
3097 * its setup, and that the child can now hand over
3098 * control to the code to run inside the container. */
3099 eventfd_write(parent_ready_fd, 1);
3101 k = process_pty(master, &mask, arg_boot ? pid : 0, SIGRTMIN+3);
3110 /* Kill if it is not dead yet anyway */
3111 terminate_machine(pid);
3113 /* Redundant, but better safe than sorry */
3116 k = wait_for_terminate(pid, &status);
3124 if (status.si_code == CLD_EXITED) {
3125 r = status.si_status;
3126 if (status.si_status != 0) {
3127 log_error("Container %s failed with error code %i.", arg_machine, status.si_status);
3132 log_debug("Container %s exited successfully.", arg_machine);
3134 } else if (status.si_code == CLD_KILLED &&
3135 status.si_status == SIGINT) {
3138 log_info("Container %s has been shut down.", arg_machine);
3141 } else if (status.si_code == CLD_KILLED &&
3142 status.si_status == SIGHUP) {
3145 log_info("Container %s is being rebooted.", arg_machine);
3147 } else if (status.si_code == CLD_KILLED ||
3148 status.si_code == CLD_DUMPED) {
3150 log_error("Container %s terminated by signal %s.", arg_machine, signal_to_string(status.si_status));
3154 log_error("Container %s failed due to unknown reason.", arg_machine);
3161 loop_remove(loop_nr, &image_fd);
3166 free(arg_directory);
3169 strv_free(arg_setenv);
3170 strv_free(arg_network_interfaces);
3171 strv_free(arg_network_macvlan);
3172 strv_free(arg_bind);
3173 strv_free(arg_bind_ro);