1 /*-*- Mode: C; c-basic-offset: 8; indent-tabs-mode: nil -*-*/
4 This file is part of systemd.
6 Copyright 2010 Lennart Poettering
8 systemd is free software; you can redistribute it and/or modify it
9 under the terms of the GNU Lesser General Public License as published by
10 the Free Software Foundation; either version 2.1 of the License, or
11 (at your option) any later version.
13 systemd is distributed in the hope that it will be useful, but
14 WITHOUT ANY WARRANTY; without even the implied warranty of
15 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
16 Lesser General Public License for more details.
18 You should have received a copy of the GNU Lesser General Public License
19 along with systemd; If not, see <http://www.gnu.org/licenses/>.
25 #include <sys/types.h>
26 #include <sys/syscall.h>
27 #include <sys/mount.h>
33 #include <sys/prctl.h>
34 #include <sys/capability.h>
37 #include <sys/signalfd.h>
41 #include <sys/socket.h>
42 #include <linux/netlink.h>
43 #include <sys/eventfd.h>
45 #include <linux/veth.h>
46 #include <sys/personality.h>
47 #include <linux/loop.h>
50 #include <selinux/selinux.h>
58 #include <blkid/blkid.h>
61 #include "sd-daemon.h"
71 #include "cgroup-util.h"
73 #include "path-util.h"
74 #include "loopback-setup.h"
75 #include "dev-setup.h"
80 #include "bus-error.h"
82 #include "bus-kernel.h"
85 #include "rtnl-util.h"
86 #include "udev-util.h"
87 #include "blkid-util.h"
89 #include "siphash24.h"
92 #include "seccomp-util.h"
/* Journal linking mode type; arg_link_journal below holds the selected
 * mode and starts out as LINK_AUTO. */
95 typedef enum LinkJournal {
/* Option state written by parse_argv(). The initializers are the values
 * used when the corresponding option is not given. */
102 static char *arg_directory = NULL;
103 static char *arg_user = NULL;
104 static sd_id128_t arg_uuid = {};
105 static char *arg_machine = NULL;
106 static const char *arg_selinux_context = NULL;
107 static const char *arg_selinux_apifs_context = NULL;
108 static const char *arg_slice = NULL;
109 static bool arg_private_network = false;
110 static bool arg_read_only = false;
111 static bool arg_boot = false;
112 static LinkJournal arg_link_journal = LINK_AUTO;
/* Bit mask of capabilities, one bit per CAP_* number. parse_argv() merges
 * the --capability= and --drop-capability= requests into it, and
 * drop_capabilities() passes its complement to
 * capability_bounding_set_drop(). */
113 static uint64_t arg_retain =
114 (1ULL << CAP_CHOWN) |
115 (1ULL << CAP_DAC_OVERRIDE) |
116 (1ULL << CAP_DAC_READ_SEARCH) |
117 (1ULL << CAP_FOWNER) |
118 (1ULL << CAP_FSETID) |
119 (1ULL << CAP_IPC_OWNER) |
121 (1ULL << CAP_LEASE) |
122 (1ULL << CAP_LINUX_IMMUTABLE) |
123 (1ULL << CAP_NET_BIND_SERVICE) |
124 (1ULL << CAP_NET_BROADCAST) |
125 (1ULL << CAP_NET_RAW) |
126 (1ULL << CAP_SETGID) |
127 (1ULL << CAP_SETFCAP) |
128 (1ULL << CAP_SETPCAP) |
129 (1ULL << CAP_SETUID) |
130 (1ULL << CAP_SYS_ADMIN) |
131 (1ULL << CAP_SYS_CHROOT) |
132 (1ULL << CAP_SYS_NICE) |
133 (1ULL << CAP_SYS_PTRACE) |
134 (1ULL << CAP_SYS_TTY_CONFIG) |
135 (1ULL << CAP_SYS_RESOURCE) |
136 (1ULL << CAP_SYS_BOOT) |
137 (1ULL << CAP_AUDIT_WRITE) |
138 (1ULL << CAP_AUDIT_CONTROL) |
140 static char **arg_bind = NULL;
141 static char **arg_bind_ro = NULL;
142 static char **arg_setenv = NULL;
143 static bool arg_quiet = false;
144 static bool arg_share_system = false;
145 static bool arg_register = true;
146 static bool arg_keep_unit = false;
147 static char **arg_network_interfaces = NULL;
148 static char **arg_network_macvlan = NULL;
149 static bool arg_network_veth = false;
150 static const char *arg_network_bridge = NULL;
/* 0xffffffffLU is the same value parse_argv() compares against after
 * personality_from_string() to detect an unknown personality. */
151 static unsigned long arg_personality = 0xffffffffLU;
152 static const char *arg_image = NULL;
/* Print the command line usage summary with printf(), substituting
 * program_invocation_short_name for the leading %s.
 *
 * NOTE(review): the -j line states it is equivalent to --link-journal=host.
 * Compare this with the option handling in parse_argv() and confirm that
 * the text and the code agree. */
154 static int help(void) {
156 printf("%s [OPTIONS...] [PATH] [ARGUMENTS...]\n\n"
157 "Spawn a minimal namespace container for debugging, testing and building.\n\n"
158 " -h --help Show this help\n"
159 " --version Print version string\n"
160 " -q --quiet Do not show status information\n"
161 " -D --directory=PATH Root directory for the container\n"
162 " -i --image=PATH File system device or image for the container\n"
163 " -b --boot Boot up full system (i.e. invoke init)\n"
164 " -u --user=USER Run the command under specified user or uid\n"
165 " -M --machine=NAME Set the machine name for the container\n"
166 " --uuid=UUID Set a specific machine UUID for the container\n"
167 " -S --slice=SLICE Place the container in the specified slice\n"
168 " --private-network Disable network in container\n"
169 " --network-interface=INTERFACE\n"
170 " Assign an existing network interface to the\n"
172 " --network-macvlan=INTERFACE\n"
173 " Create a macvlan network interface based on an\n"
174 " existing network interface to the container\n"
175 " --network-veth Add a virtual ethernet connection between host\n"
177 " --network-bridge=INTERFACE\n"
178 " Add a virtual ethernet connection between host\n"
179 " and container and add it to an existing bridge on\n"
181 " -Z --selinux-context=SECLABEL\n"
182 " Set the SELinux security context to be used by\n"
183 " processes in the container\n"
184 " -L --selinux-apifs-context=SECLABEL\n"
185 " Set the SELinux security context to be used by\n"
186 " API/tmpfs file systems in the container\n"
187 " --capability=CAP In addition to the default, retain specified\n"
189 " --drop-capability=CAP Drop the specified capability from the default set\n"
190 " --link-journal=MODE Link up guest journal, one of no, auto, guest, host\n"
191 " -j Equivalent to --link-journal=host\n"
192 " --read-only Mount the root directory read-only\n"
193 " --bind=PATH[:PATH] Bind mount a file or directory from the host into\n"
195 " --bind-ro=PATH[:PATH] Similar, but creates a read-only bind mount\n"
196 " --setenv=NAME=VALUE Pass an environment variable to PID 1\n"
197 " --share-system Share system namespaces with host\n"
198 " --register=BOOLEAN Register container as machine\n"
199 " --keep-unit Do not register a scope for the machine, reuse\n"
200 " the service unit nspawn is running in\n",
201 program_invocation_short_name);
/* Parse the command line with getopt_long() and store the results in the
 * file-scope arg_* variables. Invalid option values are reported with
 * log_error(). After the option loop, conflicting combinations are
 * reported and the final capability mask is computed into arg_retain. */
206 static int parse_argv(int argc, char *argv[]) {
222 ARG_NETWORK_INTERFACE,
/* Long option table; options that also have a short form map to the
 * matching character, the rest map to ARG_* constants. */
229 static const struct option options[] = {
230 { "help", no_argument, NULL, 'h' },
231 { "version", no_argument, NULL, ARG_VERSION },
232 { "directory", required_argument, NULL, 'D' },
233 { "user", required_argument, NULL, 'u' },
234 { "private-network", no_argument, NULL, ARG_PRIVATE_NETWORK },
235 { "boot", no_argument, NULL, 'b' },
236 { "uuid", required_argument, NULL, ARG_UUID },
237 { "read-only", no_argument, NULL, ARG_READ_ONLY },
238 { "capability", required_argument, NULL, ARG_CAPABILITY },
239 { "drop-capability", required_argument, NULL, ARG_DROP_CAPABILITY },
240 { "link-journal", required_argument, NULL, ARG_LINK_JOURNAL },
241 { "bind", required_argument, NULL, ARG_BIND },
242 { "bind-ro", required_argument, NULL, ARG_BIND_RO },
243 { "machine", required_argument, NULL, 'M' },
244 { "slice", required_argument, NULL, 'S' },
245 { "setenv", required_argument, NULL, ARG_SETENV },
246 { "selinux-context", required_argument, NULL, 'Z' },
247 { "selinux-apifs-context", required_argument, NULL, 'L' },
248 { "quiet", no_argument, NULL, 'q' },
249 { "share-system", no_argument, NULL, ARG_SHARE_SYSTEM },
250 { "register", required_argument, NULL, ARG_REGISTER },
251 { "keep-unit", no_argument, NULL, ARG_KEEP_UNIT },
252 { "network-interface", required_argument, NULL, ARG_NETWORK_INTERFACE },
253 { "network-macvlan", required_argument, NULL, ARG_NETWORK_MACVLAN },
254 { "network-veth", no_argument, NULL, ARG_NETWORK_VETH },
255 { "network-bridge", required_argument, NULL, ARG_NETWORK_BRIDGE },
256 { "personality", required_argument, NULL, ARG_PERSONALITY },
257 { "image", required_argument, NULL, 'i' },
/* Capability bits requested through --capability= (plus) and
 * --drop-capability= (minus); merged into arg_retain at the end. */
262 uint64_t plus = 0, minus = 0;
/* With GNU getopt a leading + in the option string stops parsing at the
 * first non-option argument. */
267 while ((c = getopt_long(argc, argv, "+hD:u:bL:M:jS:Z:qi:", options, NULL)) >= 0) {
275 puts(PACKAGE_STRING);
276 puts(SYSTEMD_FEATURES);
281 arg_directory = canonicalize_file_name(optarg);
282 if (!arg_directory) {
283 log_error("Invalid root directory: %m");
295 arg_user = strdup(optarg);
301 case ARG_NETWORK_BRIDGE:
302 arg_network_bridge = optarg;
/* --network-veth also switches on the private network. */
306 case ARG_NETWORK_VETH:
307 arg_network_veth = true;
308 arg_private_network = true;
/* Each --network-interface= value is appended to the list, and the
 * private network is switched on as well. */
311 case ARG_NETWORK_INTERFACE:
312 if (strv_extend(&arg_network_interfaces, optarg) < 0)
315 arg_private_network = true;
318 case ARG_NETWORK_MACVLAN:
319 if (strv_extend(&arg_network_macvlan, optarg) < 0)
324 case ARG_PRIVATE_NETWORK:
325 arg_private_network = true;
333 r = sd_id128_from_string(optarg, &arg_uuid);
335 log_error("Invalid UUID: %s", optarg);
345 if (isempty(optarg)) {
350 if (!hostname_is_valid(optarg)) {
351 log_error("Invalid machine name: %s", optarg);
356 arg_machine = strdup(optarg);
364 arg_selinux_context = optarg;
368 arg_selinux_apifs_context = optarg;
372 arg_read_only = true;
/* Shared handling for --capability= and --drop-capability=: the value is
 * a comma separated list. The word all selects every bit; any other word
 * is resolved with cap_from_name(). Which mask is updated depends on the
 * option being processed. */
376 case ARG_DROP_CAPABILITY: {
380 FOREACH_WORD_SEPARATOR(word, length, optarg, ",", state) {
381 _cleanup_free_ char *t;
384 t = strndup(word, length);
388 if (streq(t, "all")) {
389 if (c == ARG_CAPABILITY)
390 plus = (uint64_t) -1;
392 minus = (uint64_t) -1;
394 if (cap_from_name(t, &cap) < 0) {
395 log_error("Failed to parse capability %s.", t);
399 if (c == ARG_CAPABILITY)
400 plus |= 1ULL << (uint64_t) cap;
402 minus |= 1ULL << (uint64_t) cap;
/* NOTE(review): the case label for this assignment is not visible in this
 * listing. If it is the -j handler, it selects LINK_GUEST while help()
 * documents -j as equivalent to --link-journal=host -- confirm which one
 * is intended. */
410 arg_link_journal = LINK_GUEST;
413 case ARG_LINK_JOURNAL:
414 if (streq(optarg, "auto"))
415 arg_link_journal = LINK_AUTO;
416 else if (streq(optarg, "no"))
417 arg_link_journal = LINK_NO;
418 else if (streq(optarg, "guest"))
419 arg_link_journal = LINK_GUEST;
420 else if (streq(optarg, "host"))
421 arg_link_journal = LINK_HOST;
423 log_error("Failed to parse link journal mode %s", optarg);
/* --bind= and --bind-ro= share this code: x selects the target list, the
 * value is searched for a colon, both resulting paths must be absolute,
 * and a and b are appended to the list one after the other. */
431 _cleanup_free_ char *a = NULL, *b = NULL;
435 x = c == ARG_BIND ? &arg_bind : &arg_bind_ro;
437 e = strchr(optarg, ':');
439 a = strndup(optarg, e - optarg);
449 if (!path_is_absolute(a) || !path_is_absolute(b)) {
450 log_error("Invalid bind mount specification: %s", optarg);
454 r = strv_extend(x, a);
458 r = strv_extend(x, b);
468 if (!env_assignment_is_valid(optarg)) {
469 log_error("Environment variable assignment '%s' is not valid.", optarg);
473 n = strv_env_set(arg_setenv, optarg);
477 strv_free(arg_setenv);
486 case ARG_SHARE_SYSTEM:
487 arg_share_system = true;
491 r = parse_boolean(optarg);
493 log_error("Failed to parse --register= argument: %s", optarg);
501 arg_keep_unit = true;
504 case ARG_PERSONALITY:
506 arg_personality = personality_from_string(optarg);
507 if (arg_personality == 0xffffffffLU) {
508 log_error("Unknown or unsupported personality '%s'.", optarg);
518 assert_not_reached("Unhandled option");
/* Sharing the system namespaces turns machine registration off. */
522 if (arg_share_system)
523 arg_register = false;
525 if (arg_boot && arg_share_system) {
526 log_error("--boot and --share-system may not be combined.");
530 if (arg_keep_unit && cg_pid_get_owner_uid(0, NULL) >= 0) {
531 log_error("--keep-unit may not be used when invoked from a user session.");
535 if (arg_directory && arg_image) {
536 log_error("--directory= and --image= may not be combined.");
/* Final mask: the defaults plus the requested additions, plus
 * CAP_NET_ADMIN when a private network is in use, with the requested
 * drops removed last. */
540 arg_retain = (arg_retain | plus | (arg_private_network ? 1ULL << CAP_NET_ADMIN : 0)) & ~minus;
/* Walk mount_table and mount each entry at its where path below dest.
 *
 * For every entry the target path is built from dest and where, checked
 * with path_is_mount_point() and created with mkdir_p(). When
 * arg_selinux_apifs_context is set, tmpfs and devpts entries get a
 * context= option appended to their mount options. The last column of
 * the table is read as the fatal field next to the mount() call. */
545 static int mount_all(const char *dest) {
547 typedef struct MountPoint {
/* Columns as used below: what, where, a file system type string, options,
 * flags, fatal. */
556 static const MountPoint mount_table[] = {
557 { "proc", "/proc", "proc", NULL, MS_NOSUID|MS_NOEXEC|MS_NODEV, true },
558 { "/proc/sys", "/proc/sys", NULL, NULL, MS_BIND, true }, /* Bind mount first */
559 { NULL, "/proc/sys", NULL, NULL, MS_BIND|MS_RDONLY|MS_REMOUNT, true }, /* Then, make it r/o */
560 { "sysfs", "/sys", "sysfs", NULL, MS_RDONLY|MS_NOSUID|MS_NOEXEC|MS_NODEV, true },
561 { "tmpfs", "/dev", "tmpfs", "mode=755", MS_NOSUID|MS_STRICTATIME, true },
562 { "devpts", "/dev/pts", "devpts","newinstance,ptmxmode=0666,mode=620,gid=" STRINGIFY(TTY_GID), MS_NOSUID|MS_NOEXEC, true },
563 { "tmpfs", "/dev/shm", "tmpfs", "mode=1777", MS_NOSUID|MS_NODEV|MS_STRICTATIME, true },
564 { "tmpfs", "/run", "tmpfs", "mode=755", MS_NOSUID|MS_NODEV|MS_STRICTATIME, true },
566 { "/sys/fs/selinux", "/sys/fs/selinux", NULL, NULL, MS_BIND, false }, /* Bind mount first */
567 { NULL, "/sys/fs/selinux", NULL, NULL, MS_BIND|MS_RDONLY|MS_REMOUNT, false }, /* Then, make it r/o */
574 for (k = 0; k < ELEMENTSOF(mount_table); k++) {
575 _cleanup_free_ char *where = NULL;
577 _cleanup_free_ char *options = NULL;
582 where = strjoin(dest, "/", mount_table[k].where, NULL);
586 t = path_is_mount_point(where, true);
588 log_error("Failed to detect whether %s is a mount point: %s", where, strerror(-t));
596 /* Skip this entry if it is not a remount. */
597 if (mount_table[k].what && t > 0)
600 mkdir_p(where, 0755);
/* Only tmpfs and devpts sources receive the SELinux context option. */
603 if (arg_selinux_apifs_context &&
604 (streq_ptr(mount_table[k].what, "tmpfs") || streq_ptr(mount_table[k].what, "devpts"))) {
605 options = strjoin(mount_table[k].options, ",context=\"", arg_selinux_apifs_context, "\"", NULL);
612 o = mount_table[k].options;
615 if (mount(mount_table[k].what,
618 mount_table[k].flags,
620 mount_table[k].fatal) {
622 log_error("mount(%s) failed: %m", where);
/* Bind mount host paths into the container tree.
 *
 * l is walked as a list of pairs: x is the source path on the host and y
 * the path below dest. If the destination exists its file type must match
 * the source; if it does not exist the parent directories and a mount
 * point of a suitable type are created first. When flags is nonzero the
 * new mount is remounted with MS_REMOUNT|MS_BIND|flags. */
632 static int mount_binds(const char *dest, char **l, unsigned long flags) {
635 STRV_FOREACH_PAIR(x, y, l) {
637 struct stat source_st, dest_st;
640 if (stat(*x, &source_st) < 0) {
641 log_error("Failed to stat %s: %m", *x);
645 where = strappenda(dest, *y);
646 r = stat(where, &dest_st);
/* Only the S_IFMT bits are compared, i.e. the file type. */
648 if ((source_st.st_mode & S_IFMT) != (dest_st.st_mode & S_IFMT)) {
649 log_error("The file types of %s and %s do not match. Refusing bind mount",
653 } else if (errno == ENOENT) {
654 r = mkdir_parents_label(where, 0755);
656 log_error("Failed to bind mount %s: %s", *x, strerror(-r));
660 log_error("Failed to bind mount %s: %m", *x);
663 /* Create the mount point, but be conservative -- refuse to create block
664 * and char devices. */
665 if (S_ISDIR(source_st.st_mode))
666 mkdir_label(where, 0755);
667 else if (S_ISFIFO(source_st.st_mode))
669 else if (S_ISSOCK(source_st.st_mode))
670 mknod(where, 0644 | S_IFSOCK, 0);
671 else if (S_ISREG(source_st.st_mode))
674 log_error("Refusing to create mountpoint for file: %s", *x);
678 if (mount(*x, where, "bind", MS_BIND, NULL) < 0) {
679 log_error("mount(%s) failed: %m", where);
/* A second mount() call applies the extra flags to the bind mount. */
683 if (flags && mount(NULL, where, NULL, MS_REMOUNT|MS_BIND|flags, NULL) < 0) {
684 log_error("mount(%s) failed: %m", where);
/* Point /etc/localtime below dest at the same zone as the host.
 *
 * The host symlink is read into p and the zone name z is what follows the
 * zoneinfo prefix. The same is done for the existing link in the
 * container (q, y). If the zone exists below dest, a relative symlink
 * what is created at where. Conditions that prevent the update are
 * reported with log_warning(). */
692 static int setup_timezone(const char *dest) {
693 _cleanup_free_ char *where = NULL, *p = NULL, *q = NULL, *check = NULL, *what = NULL;
699 /* Fix the timezone, if possible */
700 r = readlink_malloc("/etc/localtime", &p);
702 log_warning("/etc/localtime is not a symlink, not updating container timezone.");
/* Both the relative and the absolute form of the link target are tried. */
706 z = path_startswith(p, "../usr/share/zoneinfo/");
708 z = path_startswith(p, "/usr/share/zoneinfo/");
710 log_warning("/etc/localtime does not point into /usr/share/zoneinfo/, not updating container timezone.");
714 where = strappend(dest, "/etc/localtime");
718 r = readlink_malloc(where, &q);
720 y = path_startswith(q, "../usr/share/zoneinfo/");
722 y = path_startswith(q, "/usr/share/zoneinfo/");
725 /* Already pointing to the right place? Then do nothing .. */
726 if (y && streq(y, z))
730 check = strjoin(dest, "/usr/share/zoneinfo/", z, NULL);
734 if (access(check, F_OK) < 0) {
735 log_warning("Timezone %s does not exist in container, not updating container timezone.", z);
739 what = strappend("../usr/share/zoneinfo/", z);
744 if (symlink(what, where) < 0) {
745 log_error("Failed to correct timezone of container: %m");
/* Copy the host /etc/resolv.conf to the same path below dest. The
 * arg_private_network setting is checked before anything else, and the
 * result of the copy is deliberately not checked. */
752 static int setup_resolv_conf(const char *dest) {
753 char _cleanup_free_ *where = NULL;
757 if (arg_private_network)
760 /* Fix resolv.conf, if possible */
761 where = strappend(dest, "/etc/resolv.conf");
765 /* We don't really care for the results of this really. If it
766 * fails, it fails, but meh... */
767 copy_file("/etc/resolv.conf", where, O_TRUNC|O_NOFOLLOW);
/* Format id as lowercase hex in the 8-4-4-4-12 UUID layout. The s
 * parameter is declared with 37 elements, which matches 36 characters
 * plus a terminator. */
772 static char* id128_format_as_uuid(sd_id128_t id, char s[37]) {
775 "%02x%02x%02x%02x-%02x%02x-%02x%02x-%02x%02x-%02x%02x%02x%02x%02x%02x",
776 SD_ID128_FORMAT_VAL(id));
/* Give the container its own boot ID.
 *
 * A random ID is generated, written in UUID format to a file below
 * dest/dev and bind mounted over proc/sys/kernel/random/boot_id below
 * dest. A second mount() call then tries to make the bind mount read-only;
 * failure of that step only produces a warning. */
781 static int setup_boot_id(const char *dest) {
782 _cleanup_free_ char *from = NULL, *to = NULL;
789 if (arg_share_system)
792 /* Generate a new randomized boot ID, so that each boot-up of
793 * the container gets a new one */
795 from = strappend(dest, "/dev/proc-sys-kernel-random-boot-id");
796 to = strappend(dest, "/proc/sys/kernel/random/boot_id");
800 r = sd_id128_randomize(&rnd);
802 log_error("Failed to generate random boot id: %s", strerror(-r));
806 id128_format_as_uuid(rnd, as_uuid);
808 r = write_string_file(from, as_uuid);
810 log_error("Failed to write boot id: %s", strerror(-r));
814 if (mount(from, to, "bind", MS_BIND, NULL) < 0) {
815 log_error("Failed to bind mount boot id: %m");
817 } else if (mount(from, to, "bind", MS_BIND|MS_REMOUNT|MS_RDONLY, NULL))
818 log_warning("Failed to make boot id read-only: %m");
/* Recreate a fixed list of host device nodes below dest/dev.
 *
 * For each name in devnodes the host node is inspected with stat(). A
 * missing node (ENOENT) is not reported as an error. Anything that is
 * neither a character nor a block device is reported, and otherwise a
 * node with the same mode and device number is created with mknod(). */
824 static int copy_devnodes(const char *dest) {
826 static const char devnodes[] =
836 _cleanup_umask_ mode_t u;
842 NULSTR_FOREACH(d, devnodes) {
843 _cleanup_free_ char *from = NULL, *to = NULL;
846 from = strappend("/dev/", d);
847 to = strjoin(dest, "/dev/", d, NULL);
851 if (stat(from, &st) < 0) {
853 if (errno != ENOENT) {
854 log_error("Failed to stat %s: %m", from);
858 } else if (!S_ISCHR(st.st_mode) && !S_ISBLK(st.st_mode)) {
860 log_error("%s is not a char or block device, cannot copy", from);
863 } else if (mknod(to, st.st_mode, st.st_rdev) < 0) {
/* NOTE(review): the message prints dest, but the path handed to mknod()
 * is to -- confirm whether to was meant here. */
865 log_error("mknod(%s) failed: %m", dest);
/* Create dest/dev/ptmx as a symlink to the relative target pts/ptmx and
 * log an error if symlink() fails. */
873 static int setup_ptmx(const char *dest) {
874 _cleanup_free_ char *p = NULL;
876 p = strappend(dest, "/dev/ptmx");
880 if (symlink("pts/ptmx", p) < 0) {
881 log_error("Failed to create /dev/ptmx symlink: %m");
/* Make the TTY at console available as dev/console below dest.
 *
 * The TTY is set to mode 0600 with owner and group 0, a placeholder
 * device node is created from the type and device number of /dev/null,
 * and console is then bind mounted over that node. */
888 static int setup_dev_console(const char *dest, const char *console) {
889 _cleanup_umask_ mode_t u;
899 if (stat("/dev/null", &st) < 0) {
900 log_error("Failed to stat /dev/null: %m");
904 r = chmod_and_chown(console, 0600, 0, 0);
906 log_error("Failed to correct access mode for TTY: %s", strerror(-r));
910 /* We need to bind mount the right tty to /dev/console since
911 * ptys can only exist on pts file systems. To have something
912 * to bind mount things on we create a device node first, and
913 * use /dev/null for that since the cgroups device policy
914 * allows us to create that freely, while we cannot create
915 * /dev/console. (Note that the major minor doesn't actually
916 * matter here, since we mount it over anyway). */
918 to = strappenda(dest, "/dev/console");
919 if (mknod(to, (st.st_mode & ~07777) | 0600, st.st_rdev) < 0) {
920 log_error("mknod() for /dev/console failed: %m");
924 if (mount(console, to, "bind", MS_BIND, NULL) < 0) {
925 log_error("Bind mount for /dev/console failed: %m");
/* Provide proc/kmsg below dest, backed by a FIFO.
 *
 * A FIFO is created at dest/dev/kmsg, bind mounted onto dest/proc/kmsg
 * and opened. The resulting file descriptor is sent over kmsg_socket as
 * an SCM_RIGHTS control message. kmsg_socket must be a valid descriptor
 * (asserted). */
932 static int setup_kmsg(const char *dest, int kmsg_socket) {
933 _cleanup_free_ char *from = NULL, *to = NULL;
935 _cleanup_umask_ mode_t u;
/* Control message buffer sized for exactly one file descriptor. */
937 struct cmsghdr cmsghdr;
938 uint8_t buf[CMSG_SPACE(sizeof(int))];
941 .msg_control = &control,
942 .msg_controllen = sizeof(control),
944 struct cmsghdr *cmsg;
947 assert(kmsg_socket >= 0);
951 /* We create the kmsg FIFO as /dev/kmsg, but immediately
952 * delete it after bind mounting it to /proc/kmsg. While FIFOs
953 * on the reading side behave very similar to /proc/kmsg,
954 * their writing side behaves differently from /dev/kmsg in
955 * that writing blocks when nothing is reading. In order to
956 * avoid any problems with containers deadlocking due to this
957 * we simply make /dev/kmsg unavailable to the container. */
958 if (asprintf(&from, "%s/dev/kmsg", dest) < 0 ||
959 asprintf(&to, "%s/proc/kmsg", dest) < 0)
962 if (mkfifo(from, 0600) < 0) {
963 log_error("mkfifo() for /dev/kmsg failed: %m");
967 r = chmod_and_chown(from, 0600, 0, 0);
969 log_error("Failed to correct access mode for /dev/kmsg: %s", strerror(-r));
973 if (mount(from, to, "bind", MS_BIND, NULL) < 0) {
974 log_error("Bind mount for /proc/kmsg failed: %m");
978 fd = open(from, O_RDWR|O_NDELAY|O_CLOEXEC);
980 log_error("Failed to open fifo: %m");
/* Fill in a single SCM_RIGHTS control message carrying fd. */
984 cmsg = CMSG_FIRSTHDR(&mh);
985 cmsg->cmsg_level = SOL_SOCKET;
986 cmsg->cmsg_type = SCM_RIGHTS;
987 cmsg->cmsg_len = CMSG_LEN(sizeof(int));
988 memcpy(CMSG_DATA(cmsg), &fd, sizeof(int));
990 mh.msg_controllen = cmsg->cmsg_len;
992 /* Store away the fd in the socket, so that it stays open as
993 * long as we run the child */
994 k = sendmsg(kmsg_socket, &mh, MSG_DONTWAIT|MSG_NOSIGNAL);
998 log_error("Failed to send FIFO fd: %m");
1002 /* And now make the FIFO unavailable as /dev/kmsg... */
/* Set the hostname to arg_machine with sethostname(). The
 * arg_share_system setting is checked first. */
1007 static int setup_hostname(void) {
1009 if (arg_share_system)
1012 if (sethostname(arg_machine, strlen(arg_machine)) < 0)
/* Connect the journal directory of the container with the host according
 * to arg_link_journal.
 *
 * The machine ID is read from etc/machine-id below directory and compared
 * with the ID of the host; equal IDs are reported and the journals are
 * not linked. p is the per-machine journal directory on the host and q
 * the same path below directory. In LINK_GUEST mode p becomes a symlink
 * to q; the final step visible here bind mounts p onto q. Several checks
 * behave differently in LINK_AUTO mode than in the explicit modes.
 *
 * NOTE(review): a few messages below use %m although the failing call
 * stored its error code in r (mkdir_p, readlink_and_make_absolute) --
 * confirm that errno still holds the matching value at those points. */
1018 static int setup_journal(const char *directory) {
1019 sd_id128_t machine_id, this_id;
1020 _cleanup_free_ char *p = NULL, *b = NULL, *q = NULL, *d = NULL;
1024 p = strappend(directory, "/etc/machine-id");
1028 r = read_one_line_file(p, &b);
1029 if (r == -ENOENT && arg_link_journal == LINK_AUTO)
1032 log_error("Failed to read machine ID from %s: %s", p, strerror(-r));
1037 if (isempty(id) && arg_link_journal == LINK_AUTO)
1040 /* Verify validity */
1041 r = sd_id128_from_string(id, &machine_id);
1043 log_error("Failed to parse machine ID from %s: %s", p, strerror(-r));
1047 r = sd_id128_get_machine(&this_id);
1049 log_error("Failed to retrieve machine ID: %s", strerror(-r));
/* Same ID on both sides: logged as a warning in LINK_AUTO mode and as an
 * error otherwise. */
1053 if (sd_id128_equal(machine_id, this_id)) {
1054 log_full(arg_link_journal == LINK_AUTO ? LOG_WARNING : LOG_ERR,
1055 "Host and machine ids are equal (%s): refusing to link journals", id);
1056 if (arg_link_journal == LINK_AUTO)
1062 if (arg_link_journal == LINK_NO)
/* From here on p is the host side path and q the guest side path. */
1066 p = strappend("/var/log/journal/", id);
1067 q = strjoin(directory, "/var/log/journal/", id, NULL);
1071 if (path_is_mount_point(p, false) > 0) {
1072 if (arg_link_journal != LINK_AUTO) {
1073 log_error("%s: already a mount point, refusing to use for journal", p);
1080 if (path_is_mount_point(q, false) > 0) {
1081 if (arg_link_journal != LINK_AUTO) {
1082 log_error("%s: already a mount point, refusing to use for journal", q);
/* Inspect what currently exists at p: a symlink, something that is not a
 * symlink (-EINVAL), or nothing (-ENOENT). */
1089 r = readlink_and_make_absolute(p, &d);
1091 if ((arg_link_journal == LINK_GUEST ||
1092 arg_link_journal == LINK_AUTO) &&
1095 r = mkdir_p(q, 0755);
1097 log_warning("failed to create directory %s: %m", q);
1101 if (unlink(p) < 0) {
1102 log_error("Failed to remove symlink %s: %m", p);
1105 } else if (r == -EINVAL) {
1107 if (arg_link_journal == LINK_GUEST &&
1110 if (errno == ENOTDIR) {
1111 log_error("%s already exists and is neither a symlink nor a directory", p);
1114 log_error("Failed to remove %s: %m", p);
1118 } else if (r != -ENOENT) {
1119 log_error("readlink(%s) failed: %m", p);
1123 if (arg_link_journal == LINK_GUEST) {
1125 if (symlink(q, p) < 0) {
1126 log_error("Failed to symlink %s to %s: %m", q, p);
1130 r = mkdir_p(q, 0755);
1132 log_warning("failed to create directory %s: %m", q);
1136 if (arg_link_journal == LINK_HOST) {
1137 r = mkdir_p(p, 0755);
1139 log_error("Failed to create %s: %m", p);
1143 } else if (access(p, F_OK) < 0)
1146 if (dir_is_empty(q) == 0)
1147 log_warning("%s is not empty, proceeding anyway.", q);
1149 r = mkdir_p(q, 0755);
1151 log_error("Failed to create %s: %m", q);
1155 if (mount(p, q, "bind", MS_BIND, NULL) < 0) {
1156 log_error("Failed to bind mount journal from host into guest: %m");
/* Create the directory dest/dev/kdbus and bind mount path onto it. Both
 * failures are reported with log_error(). */
1163 static int setup_kdbus(const char *dest, const char *path) {
1169 p = strappenda(dest, "/dev/kdbus");
1170 if (mkdir(p, 0755) < 0) {
1171 log_error("Failed to create kdbus path: %m");
1175 if (mount(path, p, "bind", MS_BIND, NULL) < 0) {
1176 log_error("Failed to mount kdbus domain path: %m");
/* Pass the complement of arg_retain to capability_bounding_set_drop() and
 * return its result unchanged. */
1183 static int drop_capabilities(void) {
1184 return capability_bounding_set_drop(~arg_retain, false);
/* Announce the container to org.freedesktop.machine1 over the system bus.
 *
 * With arg_keep_unit set, a single sd_bus_call_method() call is issued.
 * Otherwise a method call message is assembled by hand so that an array
 * of (sv) properties can be appended: Slice (only when arg_slice is not
 * empty), DevicePolicy set to strict, and a DeviceAllow list. Every
 * failing step is reported with log_error(). */
1187 static int register_machine(pid_t pid) {
1188 _cleanup_bus_error_free_ sd_bus_error error = SD_BUS_ERROR_NULL;
1189 _cleanup_bus_unref_ sd_bus *bus = NULL;
1195 r = sd_bus_default_system(&bus);
1197 log_error("Failed to open system bus: %s", strerror(-r));
1201 if (arg_keep_unit) {
1202 r = sd_bus_call_method(
1204 "org.freedesktop.machine1",
1205 "/org/freedesktop/machine1",
1206 "org.freedesktop.machine1.Manager",
1212 SD_BUS_MESSAGE_APPEND_ID128(arg_uuid),
1216 strempty(arg_directory));
1218 _cleanup_bus_message_unref_ sd_bus_message *m = NULL;
1220 r = sd_bus_message_new_method_call(
1223 "org.freedesktop.machine1",
1224 "/org/freedesktop/machine1",
1225 "org.freedesktop.machine1.Manager",
1228 log_error("Failed to create message: %s", strerror(-r));
1232 r = sd_bus_message_append(
1236 SD_BUS_MESSAGE_APPEND_ID128(arg_uuid),
1240 strempty(arg_directory));
1242 log_error("Failed to append message arguments: %s", strerror(-r));
/* The property list is an array of (sv) entries; it is closed again after
 * the DeviceAllow entry. */
1246 r = sd_bus_message_open_container(m, 'a', "(sv)");
1248 log_error("Failed to open container: %s", strerror(-r));
1252 if (!isempty(arg_slice)) {
1253 r = sd_bus_message_append(m, "(sv)", "Slice", "s", arg_slice);
1255 log_error("Failed to append slice: %s", strerror(-r));
1260 r = sd_bus_message_append(m, "(sv)", "DevicePolicy", "s", "strict");
1262 log_error("Failed to add device policy: %s", strerror(-r));
1266 r = sd_bus_message_append(m, "(sv)", "DeviceAllow", "a(ss)", 10,
1267 /* Allow the container to
1268 * access and create the API
1269 * device nodes, so that
1270 * PrivateDevices= in the
1271 * container can work
1276 "/dev/random", "rwm",
1277 "/dev/urandom", "rwm",
1279 /* Allow the container
1280 * access to ptys. However,
1282 * container to ever create
1283 * these device nodes. */
1284 "/dev/pts/ptmx", "rw",
1286 /* Allow the container
1287 * access to all kdbus
1288 * devices. Again, the
1289 * container cannot create
1290 * these nodes, only use
1291 * them. We use a pretty
1292 * open match here, so that
1293 * the kernel API can still
1296 "char-kdbus/*", "rw");
1298 log_error("Failed to add device whitelist: %s", strerror(-r));
1302 r = sd_bus_message_close_container(m);
1304 log_error("Failed to close container: %s", strerror(-r));
1308 r = sd_bus_call(bus, m, 0, &error, NULL);
1312 log_error("Failed to register machine: %s", bus_error_message(&error, r));
/* Ask org.freedesktop.machine1 to terminate the machine.
 *
 * A first bus call on the Manager interface yields an object path, which
 * is read from the reply. A second call is then made on the Machine
 * interface. Failures of either call are only logged at debug level. */
1319 static int terminate_machine(pid_t pid) {
1320 _cleanup_bus_error_free_ sd_bus_error error = SD_BUS_ERROR_NULL;
1321 _cleanup_bus_message_unref_ sd_bus_message *reply = NULL;
1322 _cleanup_bus_unref_ sd_bus *bus = NULL;
1329 r = sd_bus_default_system(&bus);
1331 log_error("Failed to open system bus: %s", strerror(-r));
1335 r = sd_bus_call_method(
1337 "org.freedesktop.machine1",
1338 "/org/freedesktop/machine1",
1339 "org.freedesktop.machine1.Manager",
1346 /* Note that the machine might already have been
1347 * cleaned up automatically, hence don't consider it a
1348 * failure if we cannot get the machine object. */
1349 log_debug("Failed to get machine: %s", bus_error_message(&error, r));
1353 r = sd_bus_message_read(reply, "o", &path);
1355 return bus_log_parse_error(r);
1357 r = sd_bus_call_method(
1359 "org.freedesktop.machine1",
1361 "org.freedesktop.machine1.Machine",
1367 log_debug("Failed to terminate machine: %s", bus_error_message(&error, r));
/* Reset the audit login UID of the calling process.
 *
 * /proc/self/loginuid is read first; if it already holds 4294967295
 * nothing needs to be written. Otherwise that value is written to the
 * file, and a failure of the write produces the long explanatory
 * message below. The arg_share_system setting is checked first. */
1374 static int reset_audit_loginuid(void) {
1375 _cleanup_free_ char *p = NULL;
1378 if (arg_share_system)
1381 r = read_one_line_file("/proc/self/loginuid", &p);
1385 log_error("Failed to read /proc/self/loginuid: %s", strerror(-r));
1389 /* Already reset? */
1390 if (streq(p, "4294967295"))
1393 r = write_string_file("/proc/self/loginuid", "4294967295");
/* NOTE(review): the message below reads -or to off auditing-; a word such
 * as turn appears to be missing. It is a runtime string and is left
 * unchanged here. */
1395 log_error("Failed to reset audit login UID. This probably means that your kernel is too\n"
1396 "old and you have audit enabled. Note that the auditing subsystem is known to\n"
1397 "be incompatible with containers on old kernels. Please make sure to upgrade\n"
1398 "your kernel or to off auditing with 'audit=0' on the kernel command line before\n"
1399 "using systemd-nspawn. Sleeping for 5s... (%s)\n", strerror(-r));
/* Fixed key handed to siphash24() in get_mac(). */
1407 #define HASH_KEY SD_ID128_MAKE(c3,c4,f9,19,b5,57,b2,1c,e6,cf,14,27,03,9c,ee,a2)
/* Derive a MAC address for the container and store it in mac.
 *
 * The buffer v holds the host machine ID followed by the bytes of
 * arg_machine (without terminator). It is hashed with siphash24() under
 * HASH_KEY, the first ETH_ALEN bytes of the result are copied into mac,
 * and the first octet is adjusted as commented below. */
1409 static int get_mac(struct ether_addr *mac) {
1416 l = strlen(arg_machine);
1417 sz = sizeof(sd_id128_t) + l;
1420 /* fetch some persistent data unique to the host */
1421 r = sd_id128_get_machine((sd_id128_t*) v);
1425 /* combine with some data unique (on this host) to this
1426 * container instance */
1427 memcpy(v + sizeof(sd_id128_t), arg_machine, l);
1429 /* Let's hash the host machine ID plus the container name. We
1430 * use a fixed, but originally randomly created hash key here. */
1431 siphash24(result, v, sz, HASH_KEY.bytes);
/* Compile time check that the hash output is large enough for a MAC. */
1433 assert_cc(ETH_ALEN <= sizeof(result));
1434 memcpy(mac->ether_addr_octet, result, ETH_ALEN);
1436 /* see eth_random_addr in the kernel */
1437 mac->ether_addr_octet[0] &= 0xfe; /* clear multicast bit */
1438 mac->ether_addr_octet[0] |= 0x02; /* set local assignment bit (IEEE802) */
/* Create a veth pair through rtnetlink.
 *
 * The host side name is written to iface_name: the prefix vb- when
 * arg_network_bridge is set, ve- otherwise, followed by arg_machine. The
 * peer is named host0, gets the address held in mac and is assigned to
 * the network namespace of pid via IFLA_NET_NS_PID. The
 * arg_private_network and arg_network_veth settings are checked first.
 * Each failing netlink step is reported with log_error().
 *
 * NOTE(review): strncpy() does not write a terminator when arg_machine
 * has IFNAMSIZ - 3 or more characters. Whether iface_name is terminated
 * elsewhere is not visible here -- confirm. */
1443 static int setup_veth(pid_t pid, char iface_name[IFNAMSIZ]) {
1444 _cleanup_rtnl_message_unref_ sd_rtnl_message *m = NULL;
1445 _cleanup_rtnl_unref_ sd_rtnl *rtnl = NULL;
1446 struct ether_addr mac;
1449 if (!arg_private_network)
1452 if (!arg_network_veth)
1455 /* Use two different interface name prefixes depending whether
1456 * we are in bridge mode or not. */
1457 if (arg_network_bridge)
1458 memcpy(iface_name, "vb-", 3);
1460 memcpy(iface_name, "ve-", 3);
1461 strncpy(iface_name+3, arg_machine, IFNAMSIZ - 3);
1465 log_error("Failed to generate predictable MAC address for host0");
1469 r = sd_rtnl_open(&rtnl, 0);
1471 log_error("Failed to connect to netlink: %s", strerror(-r));
1475 r = sd_rtnl_message_new_link(rtnl, &m, RTM_NEWLINK, 0);
1477 log_error("Failed to allocate netlink message: %s", strerror(-r));
1481 r = sd_rtnl_message_append_string(m, IFLA_IFNAME, iface_name);
1483 log_error("Failed to add netlink interface name: %s", strerror(-r));
/* Three nested containers are opened here and closed again further down:
 * IFLA_LINKINFO, IFLA_INFO_DATA for veth, and VETH_INFO_PEER. */
1487 r = sd_rtnl_message_open_container(m, IFLA_LINKINFO);
1489 log_error("Failed to open netlink container: %s", strerror(-r));
1493 r = sd_rtnl_message_open_container_union(m, IFLA_INFO_DATA, "veth");
1495 log_error("Failed to open netlink container: %s", strerror(-r));
1499 r = sd_rtnl_message_open_container(m, VETH_INFO_PEER);
1501 log_error("Failed to open netlink container: %s", strerror(-r));
1505 r = sd_rtnl_message_append_string(m, IFLA_IFNAME, "host0");
1507 log_error("Failed to add netlink interface name: %s", strerror(-r));
1511 r = sd_rtnl_message_append_ether_addr(m, IFLA_ADDRESS, &mac);
1513 log_error("Failed to add netlink MAC address: %s", strerror(-r));
1517 r = sd_rtnl_message_append_u32(m, IFLA_NET_NS_PID, pid);
1519 log_error("Failed to add netlink namespace field: %s", strerror(-r));
1523 r = sd_rtnl_message_close_container(m);
1525 log_error("Failed to close netlink container: %s", strerror(-r));
1529 r = sd_rtnl_message_close_container(m);
1531 log_error("Failed to close netlink container: %s", strerror(-r));
1535 r = sd_rtnl_message_close_container(m);
1537 log_error("Failed to close netlink container: %s", strerror(-r));
1541 r = sd_rtnl_call(rtnl, m, 0, NULL);
1543 log_error("Failed to add new veth interfaces: %s", strerror(-r));
/* Attach the interface veth_name to the bridge named by
 * arg_network_bridge.
 *
 * The bridge name is resolved to an index with if_nametoindex(). An
 * RTM_SETLINK message then sets IFF_UP, names the interface and sets
 * IFLA_MASTER to the bridge index. The arg_private_network,
 * arg_network_veth and arg_network_bridge settings are checked first. */
1550 static int setup_bridge(const char veth_name[]) {
1551 _cleanup_rtnl_message_unref_ sd_rtnl_message *m = NULL;
1552 _cleanup_rtnl_unref_ sd_rtnl *rtnl = NULL;
1555 if (!arg_private_network)
1558 if (!arg_network_veth)
1561 if (!arg_network_bridge)
1564 bridge = (int) if_nametoindex(arg_network_bridge);
1566 log_error("Failed to resolve interface %s: %m", arg_network_bridge);
1570 r = sd_rtnl_open(&rtnl, 0);
1572 log_error("Failed to connect to netlink: %s", strerror(-r));
1576 r = sd_rtnl_message_new_link(rtnl, &m, RTM_SETLINK, 0);
1578 log_error("Failed to allocate netlink message: %s", strerror(-r));
1582 r = sd_rtnl_message_link_set_flags(m, IFF_UP, IFF_UP);
1584 log_error("Failed to set IFF_UP flag: %s", strerror(-r));
1588 r = sd_rtnl_message_append_string(m, IFLA_IFNAME, veth_name);
1590 log_error("Failed to add netlink interface name field: %s", strerror(-r));
1594 r = sd_rtnl_message_append_u32(m, IFLA_MASTER, bridge);
1596 log_error("Failed to add netlink master field: %s", strerror(-r));
1600 r = sd_rtnl_call(rtnl, m, 0, NULL);
1602 log_error("Failed to add veth interface to bridge: %s", strerror(-r));
/* Resolve the network interface name and check its udev state.
 *
 * The name is turned into an index with if_nametoindex(), the udev device
 * is looked up under the device id n followed by that index, and an
 * interface that udev does not report as initialized is rejected with an
 * error message. Callers in this file store the result in an interface
 * index variable. */
1609 static int parse_interface(struct udev *udev, const char *name) {
1610 _cleanup_udev_device_unref_ struct udev_device *d = NULL;
/* Sized for the n prefix, the decimal digits of an int and a terminator. */
1611 char ifi_str[2 + DECIMAL_STR_MAX(int)];
1614 ifi = (int) if_nametoindex(name);
1616 log_error("Failed to resolve interface %s: %m", name);
1620 sprintf(ifi_str, "n%i", ifi);
1621 d = udev_device_new_from_device_id(udev, ifi_str);
1623 log_error("Failed to get udev device for interface %s: %m", name);
1627 if (udev_device_get_is_initialized(d) <= 0) {
1628 log_error("Network interface %s is not initialized yet.", name);
/* Moves every interface listed in arg_network_interfaces into the network
 * namespace of the process pid.
 *
 * pid: process whose namespace receives the interfaces (IFLA_NET_NS_PID).
 *
 * For each name the index is obtained with parse_interface() and an
 * RTM_NEWLINK request for that index carrying IFLA_NET_NS_PID is sent.
 * The checks on arg_private_network and on an empty interface list gate
 * the work. NOTE(review): the guarded statements and error returns are not
 * visible in this excerpt. */
1635 static int move_network_interfaces(pid_t pid) {
1636 _cleanup_udev_unref_ struct udev *udev = NULL;
1637 _cleanup_rtnl_unref_ sd_rtnl *rtnl = NULL;
1641 if (!arg_private_network)
1644 if (strv_isempty(arg_network_interfaces))
1647 r = sd_rtnl_open(&rtnl, 0);
1649 log_error("Failed to connect to netlink: %s", strerror(-r));
1655 log_error("Failed to connect to udev.");
1659 STRV_FOREACH(i, arg_network_interfaces) {
/* A fresh message per interface; released automatically at scope exit. */
1660 _cleanup_rtnl_message_unref_ sd_rtnl_message *m = NULL;
1663 ifi = parse_interface(udev, *i);
1667 r = sd_rtnl_message_new_link(rtnl, &m, RTM_NEWLINK, ifi);
1669 log_error("Failed to allocate netlink message: %s", strerror(-r));
1673 r = sd_rtnl_message_append_u32(m, IFLA_NET_NS_PID, pid);
1675 log_error("Failed to append namespace PID to netlink message: %s", strerror(-r));
1679 r = sd_rtnl_call(rtnl, m, 0, NULL);
1681 log_error("Failed to move interface %s to namespace: %s", *i, strerror(-r));
/* Creates one macvlan link per entry of arg_network_macvlan and places it
 * in the network namespace of the process pid.
 *
 * pid: process whose namespace receives the new links (IFLA_NET_NS_PID).
 *
 * For each parent interface name an RTM_NEWLINK request is built with:
 *   IFLA_LINK         index of the parent, from parse_interface()
 *   IFLA_IFNAME       mv- followed by the parent name, cut to IFNAMSIZ-1
 *   IFLA_NET_NS_PID   pid
 *   IFLA_LINKINFO / IFLA_INFO_DATA (macvlan) with
 *   IFLA_MACVLAN_MODE set to MACVLAN_MODE_BRIDGE
 * NOTE(review): the statements guarded by the option checks and the error
 * returns are not visible in this excerpt. */
1689 static int setup_macvlan(pid_t pid) {
1690 _cleanup_udev_unref_ struct udev *udev = NULL;
1691 _cleanup_rtnl_unref_ sd_rtnl *rtnl = NULL;
1695 if (!arg_private_network)
1698 if (strv_isempty(arg_network_macvlan))
1701 r = sd_rtnl_open(&rtnl, 0);
1703 log_error("Failed to connect to netlink: %s", strerror(-r));
1709 log_error("Failed to connect to udev.");
1713 STRV_FOREACH(i, arg_network_macvlan) {
1714 _cleanup_rtnl_message_unref_ sd_rtnl_message *m = NULL;
1715 _cleanup_free_ char *n = NULL;
1718 ifi = parse_interface(udev, *i);
/* Index 0: the link does not exist yet, the kernel assigns one. */
1722 r = sd_rtnl_message_new_link(rtnl, &m, RTM_NEWLINK, 0);
1724 log_error("Failed to allocate netlink message: %s", strerror(-r));
1728 r = sd_rtnl_message_append_u32(m, IFLA_LINK, ifi);
1730 log_error("Failed to add netlink interface index: %s", strerror(-r));
/* Derive the new link name from the parent name and keep it within the
 * interface name length limit. */
1734 n = strappend("mv-", *i);
1738 strshorten(n, IFNAMSIZ-1);
1740 r = sd_rtnl_message_append_string(m, IFLA_IFNAME, n);
1742 log_error("Failed to add netlink interface name: %s", strerror(-r));
1746 r = sd_rtnl_message_append_u32(m, IFLA_NET_NS_PID, pid);
1748 log_error("Failed to add netlink namespace field: %s", strerror(-r));
/* Two nested containers are opened here and closed again below. */
1752 r = sd_rtnl_message_open_container(m, IFLA_LINKINFO);
1754 log_error("Failed to open netlink container: %s", strerror(-r));
1758 r = sd_rtnl_message_open_container_union(m, IFLA_INFO_DATA, "macvlan");
1760 log_error("Failed to open netlink container: %s", strerror(-r));
1764 r = sd_rtnl_message_append_u32(m, IFLA_MACVLAN_MODE, MACVLAN_MODE_BRIDGE);
1766 log_error("Failed to append macvlan mode: %s", strerror(-r));
1770 r = sd_rtnl_message_close_container(m);
1772 log_error("Failed to close netlink container: %s", strerror(-r));
1776 r = sd_rtnl_message_close_container(m);
1778 log_error("Failed to close netlink container: %s", strerror(-r));
1782 r = sd_rtnl_call(rtnl, m, 0, NULL);
1784 log_error("Failed to add new macvlan interfaces: %s", strerror(-r));
/* Installs a seccomp filter that hides the audit netlink protocol.
 *
 * Steps visible below: create a filter whose default action is
 * SCMP_ACT_ALLOW, add the secondary architectures, add one rule that
 * answers with errno EAFNOSUPPORT when argument 0 equals AF_NETLINK and
 * argument 2 equals NETLINK_AUDIT, set the SCMP_FLTATR_CTL_NNP attribute
 * to 0, load the filter, and release the filter context.
 * NOTE(review): the syscall named in the seccomp_rule_add() call is not
 * visible in this excerpt; the prose inside the function refers to
 * socket(). The error checks and return statements are not visible either. */
1792 static int audit_still_doesnt_work_in_containers(void) {
1795 scmp_filter_ctx seccomp;
1799 Audit is broken in containers, much of the userspace audit
1800 hookup will fail if running inside a container. We don't
1801 care and just turn off creation of audit sockets.
1803 This will make socket(AF_NETLINK, *, NETLINK_AUDIT) fail
1804 with EAFNOSUPPORT which audit userspace uses as indication
1805 that audit is disabled in the kernel.
1808 seccomp = seccomp_init(SCMP_ACT_ALLOW);
1812 r = seccomp_add_secondary_archs(seccomp);
1814 log_error("Failed to add secondary archs to seccomp filter: %s", strerror(-r));
1818 r = seccomp_rule_add(
1820 SCMP_ACT_ERRNO(EAFNOSUPPORT),
1823 SCMP_A0(SCMP_CMP_EQ, AF_NETLINK),
1824 SCMP_A2(SCMP_CMP_EQ, NETLINK_AUDIT));
1826 log_error("Failed to add audit seccomp rule: %s", strerror(-r));
1830 r = seccomp_attr_set(seccomp, SCMP_FLTATR_CTL_NNP, 0);
1832 log_error("Failed to unset NO_NEW_PRIVS: %s", strerror(-r));
1836 r = seccomp_load(seccomp);
1838 log_error("Failed to install seccomp audit filter: %s", strerror(-r));
1841 seccomp_release(seccomp);
/* Opens the disk image arg_image and, for a regular file, binds it to a
 * freshly allocated loop device.
 *
 * device_path: receives the path of the device node to use (the loop
 *              device path in the regular file case).
 * loop_nr:     output for the loop device number (its assignment is not
 *              visible in this excerpt).
 *
 * A block device image takes the S_ISBLK branch, which duplicates the image
 * path. A regular file gets a loop device: a free number is requested from
 * /dev/loop-control, the node /dev/loop<nr> is opened, the image descriptor
 * is attached with LOOP_SET_FD, and LOOP_SET_STATUS64 applies the flags
 * LO_FLAGS_AUTOCLEAR and LO_FLAGS_PARTSCAN. Image and loop device are opened
 * read-only when arg_read_only is set.
 * NOTE(review): the return statements are not visible; the caller stores
 * the result in image_fd, so a file descriptor is presumably returned. */
1849 static int setup_image(char **device_path, int *loop_nr) {
1850 struct loop_info64 info = {
1851 .lo_flags = LO_FLAGS_AUTOCLEAR|LO_FLAGS_PARTSCAN
1853 _cleanup_close_ int fd = -1, control = -1, loop = -1;
1854 _cleanup_free_ char* loopdev = NULL;
1858 assert(device_path);
1861 fd = open(arg_image, O_CLOEXEC|(arg_read_only ? O_RDONLY : O_RDWR)|O_NONBLOCK|O_NOCTTY);
1863 log_error("Failed to open %s: %m", arg_image);
1867 if (fstat(fd, &st) < 0) {
1868 log_error("Failed to stat %s: %m", arg_image);
/* Block device images are used directly, no loop device involved. */
1872 if (S_ISBLK(st.st_mode)) {
1875 p = strdup(arg_image);
/* NOTE(review): no failing call precedes this message (fstat() above
 * succeeded), so the errno text expanded by %m is likely unrelated. */
1889 if (!S_ISREG(st.st_mode)) {
1890 log_error("%s is not a regular file or block device: %m", arg_image);
/* Ask the loop control device for an unused loop device number. */
1894 control = open("/dev/loop-control", O_RDWR|O_CLOEXEC|O_NOCTTY|O_NONBLOCK);
1896 log_error("Failed to open /dev/loop-control: %m");
1900 nr = ioctl(control, LOOP_CTL_GET_FREE);
1902 log_error("Failed to allocate loop device: %m");
1906 if (asprintf(&loopdev, "/dev/loop%i", nr) < 0)
1909 loop = open(loopdev, O_CLOEXEC|(arg_read_only ? O_RDONLY : O_RDWR)|O_NONBLOCK|O_NOCTTY);
1911 log_error("Failed to open loop device %s: %m", loopdev);
/* Attach the image file as backing store of the loop device. */
1915 if (ioctl(loop, LOOP_SET_FD, fd) < 0) {
1916 log_error("Failed to set loopback file descriptor on %s: %m", loopdev);
/* NOTE(review): the condition guarding this flag is not visible;
 * presumably arg_read_only -- confirm. */
1921 info.lo_flags |= LO_FLAGS_READ_ONLY;
1923 if (ioctl(loop, LOOP_SET_STATUS64, &info) < 0) {
1924 log_error("Failed to set loopback settings on %s: %m", loopdev);
1928 *device_path = loopdev;
/* Inspects the partition table of the image and selects the partitions to
 * mount for the root, /home and /srv file systems.
 *
 * root_device, home_device, srv_device:
 *         receive newly allocated device node paths of the chosen
 *         partitions.
 * root_device_rw, home_device_rw, srv_device_rw:
 *         receive false when the chosen partition carries
 *         GPT_FLAG_READ_ONLY, true otherwise.
 * NOTE(review): the first and last parameters are not visible in this
 * excerpt; the caller passes an image file descriptor first and the address
 * of a flag named secondary last.
 *
 * Procedure: libblkid probes the device and must report the partition table
 * type gpt. The partition devices are then enumerated through udev as
 * children of the block device, and each one is matched against the blkid
 * partition list by device number. Partitions flagged GPT_FLAG_NO_AUTO are
 * tested for and presumably skipped. The type GUID selects the role:
 * GPT_HOME, GPT_SRV, GPT_ROOT_NATIVE or GPT_ROOT_SECONDARY. When a role
 * already has a candidate, the new partition is compared by partition
 * number (nr >= current); presumably the lowest number wins -- confirm.
 * A native root is preferred over a secondary root for the result. */
1939 static int dissect_image(
1941 char **root_device, bool *root_device_rw,
1942 char **home_device, bool *home_device_rw,
1943 char **srv_device, bool *srv_device_rw,
1947 int home_nr = -1, root_nr = -1, secondary_root_nr = -1, srv_nr = -1;
1948 _cleanup_free_ char *home = NULL, *root = NULL, *secondary_root = NULL, *srv = NULL;
1949 _cleanup_udev_enumerate_unref_ struct udev_enumerate *e = NULL;
1950 _cleanup_udev_device_unref_ struct udev_device *d = NULL;
1951 _cleanup_blkid_free_probe_ blkid_probe b = NULL;
1952 _cleanup_udev_unref_ struct udev *udev = NULL;
1953 struct udev_list_entry *first, *item;
1954 bool home_rw = true, root_rw = true, secondary_root_rw = true, srv_rw = true;
1955 const char *pttype = NULL;
1961 assert(root_device);
1962 assert(home_device);
1966 b = blkid_new_probe();
1971 r = blkid_probe_set_device(b, fd, 0, 0);
1976 log_error("Failed to set device on blkid probe: %m");
/* Only partition table probing with per-entry details is requested. */
1980 blkid_probe_enable_partitions(b, 1);
1981 blkid_probe_set_partitions_flags(b, BLKID_PARTS_ENTRY_DETAILS);
/* Results -2 and 1 are reported as no identifiable partition table;
 * any other nonzero result is reported as a probe failure. */
1984 r = blkid_do_safeprobe(b);
1985 if (r == -2 || r == 1) {
1986 log_error("Failed to identify any partition table on %s.\n"
1987 "Note that the disk image needs to follow http://www.freedesktop.org/wiki/Specifications/DiscoverablePartitionsSpec/ to be supported by systemd-nspawn.", arg_image);
1989 } else if (r != 0) {
1992 log_error("Failed to probe: %m");
/* pttype stays NULL if the lookup finds nothing; streq_ptr() is used for
 * the comparison below. */
1996 blkid_probe_lookup_value(b, "PTTYPE", &pttype, NULL);
1997 if (!streq_ptr(pttype, "gpt")) {
1998 log_error("Image %s does not carry a GUID Partition Table.\n"
1999 "Note that the disk image needs to follow http://www.freedesktop.org/wiki/Specifications/DiscoverablePartitionsSpec/ to be supported by systemd-nspawn.", arg_image);
2004 pl = blkid_probe_get_partitions(b);
2009 log_error("Failed to list partitions of %s", arg_image);
/* Find the udev device of the whole disk, then enumerate its children. */
2017 if (fstat(fd, &st) < 0) {
2018 log_error("Failed to stat block device: %m");
2022 d = udev_device_new_from_devnum(udev, 'b', st.st_rdev);
2026 e = udev_enumerate_new(udev);
2030 r = udev_enumerate_add_match_parent(e, d);
2034 r = udev_enumerate_scan_devices(e);
2036 log_error("Failed to scan for partition devices of %s: %s", arg_image, strerror(-r));
2040 first = udev_enumerate_get_list_entry(e);
2041 udev_list_entry_foreach(item, first) {
2042 _cleanup_udev_device_unref_ struct udev_device *q;
2043 const char *stype, *node;
2044 unsigned long long flags;
2051 q = udev_device_new_from_syspath(udev, udev_list_entry_get_name(item));
2056 log_error("Failed to get partition device of %s: %m", arg_image);
2060 qn = udev_device_get_devnum(q);
/* The enumeration also yields the whole-disk device itself. */
2064 if (st.st_rdev == qn)
2067 node = udev_device_get_devnode(q);
2071 pp = blkid_partlist_devno_to_partition(pl, qn);
2075 flags = blkid_partition_get_flags(pp);
2076 if (flags & GPT_FLAG_NO_AUTO)
2079 nr = blkid_partition_get_partno(pp);
2083 stype = blkid_partition_get_type_string(pp);
/* The type string is parsed as a 128 bit id and compared against the
 * known partition type GUIDs. */
2087 if (sd_id128_from_string(stype, &type_id) < 0)
2090 if (sd_id128_equal(type_id, GPT_HOME)) {
2092 if (home && nr >= home_nr)
2096 home_rw = !(flags & GPT_FLAG_READ_ONLY);
2099 home = strdup(node);
2102 } else if (sd_id128_equal(type_id, GPT_SRV)) {
2104 if (srv && nr >= srv_nr)
2108 srv_rw = !(flags & GPT_FLAG_READ_ONLY);
2115 #ifdef GPT_ROOT_NATIVE
2116 else if (sd_id128_equal(type_id, GPT_ROOT_NATIVE)) {
2118 if (root && nr >= root_nr)
2122 root_rw = !(flags & GPT_FLAG_READ_ONLY);
2125 root = strdup(node);
2130 #ifdef GPT_ROOT_SECONDARY
2131 else if (sd_id128_equal(type_id, GPT_ROOT_SECONDARY)) {
2133 if (secondary_root && nr >= secondary_root_nr)
2136 secondary_root_nr = nr;
2137 secondary_root_rw = !(flags & GPT_FLAG_READ_ONLY);
2140 free(secondary_root);
2141 secondary_root = strdup(node);
2142 if (!secondary_root)
/* At least one kind of root partition is required. */
2148 if (!root && !secondary_root) {
2149 log_error("Failed to identify root partition in disk image %s.\n"
2150 "Note that the disk image needs to follow http://www.freedesktop.org/wiki/Specifications/DiscoverablePartitionsSpec/ to be supported by systemd-nspawn.", arg_image);
/* Hand the selected strings over to the caller. The secondary root pointer
 * is reset to NULL after the hand-over so the automatic cleanup does not
 * free it. */
2155 *root_device = root;
2158 *root_device_rw = root_rw;
2160 } else if (secondary_root) {
2161 *root_device = secondary_root;
2162 secondary_root = NULL;
2164 *root_device_rw = secondary_root_rw;
2169 *home_device = home;
2172 *home_device_rw = home_rw;
2179 *srv_device_rw = srv_rw;
/* NOTE(review): presumably the alternative body used when built without
 * blkid support; the preprocessor conditional is not visible -- confirm. */
2184 log_error("--image= is not supported, compiled without blkid support.");
/* Detects the file system type of the device what with libblkid and mounts
 * it below where.
 *
 * what:      device node to mount.
 * where:     base directory of the mount point.
 * directory: path appended to where to form the mount point.
 * rw:        when false, MS_RDONLY is added to the mount flags.
 *
 * The mount always uses MS_NODEV. Devices whose detected type is
 * crypto_LUKS are rejected with an error message.
 * NOTE(review): the caller in this file passes NULL as directory for the
 * root file system; the handling of that case is not visible in this
 * excerpt. The return statements are not visible either. */
2189 static int mount_device(const char *what, const char *where, const char *directory, bool rw) {
2191 _cleanup_blkid_free_probe_ blkid_probe b = NULL;
2192 const char *fstype, *p;
2202 p = strappenda(where, directory);
2207 b = blkid_new_probe_from_filename(what);
2211 log_error("Failed to allocate prober for %s: %m", what);
/* Only the superblock TYPE value is requested from the probe. */
2215 blkid_probe_enable_superblocks(b, 1);
2216 blkid_probe_set_superblocks_flags(b, BLKID_SUBLKS_TYPE);
/* NOTE(review): this check tests for -1 and 1, while dissect_image in this
 * file tests for -2 and 1 after the same call. The libblkid documentation
 * describes -2 as an ambivalent result and -1 as an error -- confirm which
 * value is intended here. */
2219 r = blkid_do_safeprobe(b);
2220 if (r == -1 || r == 1) {
2221 log_error("Cannot determine file system type of %s", what);
2223 } else if (r != 0) {
2226 log_error("Failed to probe %s: %m", what);
2231 if (blkid_probe_lookup_value(b, "TYPE", &fstype, NULL) < 0) {
2234 log_error("Failed to determine file system type of %s", what);
2238 if (streq(fstype, "crypto_LUKS")) {
2239 log_error("nspawn currently does not support LUKS disk images.");
2243 if (mount(what, p, fstype, MS_NODEV|(rw ? 0 : MS_RDONLY), NULL) < 0) {
2244 log_error("Failed to mount %s: %m", what);
/* NOTE(review): presumably the alternative body used when built without
 * blkid support; the preprocessor conditional is not visible -- confirm. */
2250 log_error("--image= is not supported, compiled without blkid support.");
/* Mounts the partitions selected from a disk image into the container tree.
 *
 * root_device / root_device_rw: device and writability for the root.
 * home_device / home_device_rw: device and writability for /home.
 * srv_device  / srv_device_rw:  device and writability for /srv.
 *
 * Every visible mount_device() call uses the global arg_directory as base
 * directory. NOTE(review): the first parameter of this function is not
 * visible in this excerpt (the caller passes arg_directory), nor are the
 * conditions that decide whether each device is mounted at all. */
2255 static int mount_devices(
2257 const char *root_device, bool root_device_rw,
2258 const char *home_device, bool home_device_rw,
2259 const char *srv_device, bool srv_device_rw) {
2265 r = mount_device(root_device, arg_directory, NULL, root_device_rw);
2267 log_error("Failed to mount root directory: %s", strerror(-r));
2273 r = mount_device(home_device, arg_directory, "/home", home_device_rw);
2275 log_error("Failed to mount home directory: %s", strerror(-r));
2281 r = mount_device(srv_device, arg_directory, "/srv", srv_device_rw);
2283 log_error("Failed to mount server data directory: %s", strerror(-r));
/* Tears down the loop device set up for the disk image.
 *
 * nr:       loop device number handed to LOOP_CTL_REMOVE.
 * image_fd: optional pointer to the open loop descriptor; when it holds a
 *           valid descriptor, LOOP_CLR_FD is issued on it and it is closed,
 *           with the result of safe_close() stored back.
 *
 * The results of both ioctl() calls are ignored in the visible code.
 * NOTE(review): the early exits (for example for a negative nr or a failed
 * open of the control device) are not visible in this excerpt. */
2291 static void loop_remove(int nr, int *image_fd) {
2292 _cleanup_close_ int control = -1;
2297 if (image_fd && *image_fd >= 0) {
/* Detach the backing file before closing the descriptor. */
2298 ioctl(*image_fd, LOOP_CLR_FD);
2299 *image_fd = safe_close(*image_fd);
2302 control = open("/dev/loop-control", O_RDWR|O_CLOEXEC|O_NOCTTY|O_NONBLOCK);
2306 ioctl(control, LOOP_CTL_REMOVE, nr);
/* Forks a child that runs getent with the arguments database and key, with
 * its standard output connected to a pipe.
 *
 * database: first argument for getent.
 * key:      second argument for getent.
 * rpid:     output for the child PID (its assignment is not visible in this
 *           excerpt).
 *
 * In the child, standard input and standard error are redirected to
 * /dev/null, signal handlers are reset, all other descriptors are closed
 * and getent is executed with an empty environment, trying /usr/bin/getent
 * first and /bin/getent second. The parent closes the write end of the
 * pipe. NOTE(review): the return statement is not visible; callers pass the
 * result to fdopen() for reading, so the read end is presumably returned. */
2309 static int spawn_getent(const char *database, const char *key, pid_t *rpid) {
2317 if (pipe2(pipe_fds, O_CLOEXEC) < 0) {
2318 log_error("Failed to allocate pipe: %m");
2324 log_error("Failed to fork getent child: %m");
2326 } else if (pid == 0) {
/* A single NULL entry: the environment passed to execle() is empty. */
2328 char *empty_env = NULL;
/* dup3() with flags 0 leaves the duplicate without O_CLOEXEC. */
2330 if (dup3(pipe_fds[1], STDOUT_FILENO, 0) < 0)
2331 _exit(EXIT_FAILURE);
/* Close the pipe ends unless they already occupy a standard descriptor. */
2333 if (pipe_fds[0] > 2)
2334 safe_close(pipe_fds[0]);
2335 if (pipe_fds[1] > 2)
2336 safe_close(pipe_fds[1]);
2338 nullfd = open("/dev/null", O_RDWR);
2340 _exit(EXIT_FAILURE);
2342 if (dup3(nullfd, STDIN_FILENO, 0) < 0)
2343 _exit(EXIT_FAILURE);
2345 if (dup3(nullfd, STDERR_FILENO, 0) < 0)
2346 _exit(EXIT_FAILURE);
2351 reset_all_signal_handlers();
2352 close_all_fds(NULL, 0);
/* The second execle() is only reached when the first one failed. */
2354 execle("/usr/bin/getent", "getent", database, key, NULL, &empty_env);
2355 execle("/bin/getent", "getent", database, key, NULL, &empty_env);
2356 _exit(EXIT_FAILURE);
/* Parent: the write end belongs to the child only. */
2359 pipe_fds[1] = safe_close(pipe_fds[1]);
/* Switches the calling process to the user selected with arg_user.
 *
 * _home: output parameter (its assignment is not visible in this excerpt;
 *        the caller uses it as the home directory of the user).
 *
 * Without arg_user, or when it is root or 0, supplementary groups are
 * cleared and all user and group ids are set to 0. Otherwise the passwd
 * entry of the user is obtained through spawn_getent() and split at the
 * colon separators into UID, GID and home directory, the supplementary
 * groups are read from the getent initgroups output, the home directory is
 * created, the standard descriptors are chowned to the user, and finally
 * setgroups(), setresgid() and setresuid() are applied.
 * NOTE(review): error returns and several parsing statements are not
 * visible in this excerpt. */
2366 static int change_uid_gid(char **_home) {
2367 char line[LINE_MAX], *w, *x, *state, *u, *g, *h;
2368 _cleanup_free_ uid_t *uids = NULL;
2369 _cleanup_free_ char *home = NULL;
2370 _cleanup_fclose_ FILE *f = NULL;
2371 _cleanup_close_ int fd = -1;
2372 unsigned n_uids = 0;
2381 if (!arg_user || streq(arg_user, "root") || streq(arg_user, "0")) {
2382 /* Reset everything fully to 0, just in case */
2384 if (setgroups(0, NULL) < 0) {
2385 log_error("setgroups() failed: %m");
/* NOTE(review): the two messages below name setregid() and setreuid(),
 * while the calls made are setresgid() and setresuid(). */
2389 if (setresgid(0, 0, 0) < 0) {
2390 log_error("setregid() failed: %m");
2394 if (setresuid(0, 0, 0) < 0) {
2395 log_error("setreuid() failed: %m");
2403 /* First, get user credentials */
2404 fd = spawn_getent("passwd", arg_user, &pid);
2408 f = fdopen(fd, "r");
2413 if (!fgets(line, sizeof(line), f)) {
2416 log_error("Failed to resolve user %s.", arg_user);
2420 log_error("Failed to read from getent: %m");
2426 wait_for_terminate_and_warn("getent passwd", pid);
/* Walk the colon separated passwd fields: name, password, UID, GID,
 * GECOS, home directory. */
2428 x = strchr(line, ':');
2430 log_error("/etc/passwd entry has invalid user field.");
2434 u = strchr(x+1, ':');
2436 log_error("/etc/passwd entry has invalid password field.");
2443 log_error("/etc/passwd entry has invalid UID field.");
2451 log_error("/etc/passwd entry has invalid GID field.");
2456 h = strchr(x+1, ':');
2458 log_error("/etc/passwd entry has invalid GECOS field.");
2465 log_error("/etc/passwd entry has invalid home directory field.");
2471 r = parse_uid(u, &uid);
2473 log_error("Failed to parse UID of user.");
2477 r = parse_gid(g, &gid);
2479 log_error("Failed to parse GID of user.");
2487 /* Second, get group memberships */
2488 fd = spawn_getent("initgroups", arg_user, &pid);
2493 f = fdopen(fd, "r");
2498 if (!fgets(line, sizeof(line), f)) {
2500 log_error("Failed to resolve user %s.", arg_user);
2504 log_error("Failed to read from getent: %m");
2510 wait_for_terminate_and_warn("getent initgroups", pid);
2512 /* Skip over the username and subsequent separator whitespace */
2514 x += strcspn(x, WHITESPACE);
2515 x += strspn(x, WHITESPACE);
/* Each remaining word is parsed as a numeric id and appended to uids,
 * which grows on demand. */
2517 FOREACH_WORD(w, l, x, state) {
2523 if (!GREEDY_REALLOC(uids, sz, n_uids+1))
2526 r = parse_uid(c, &uids[n_uids++]);
2528 log_error("Failed to parse group data from getent.");
2533 r = mkdir_parents(home, 0775);
2535 log_error("Failed to make home root directory: %s", strerror(-r));
/* An already existing home directory is not treated as an error. */
2539 r = mkdir_safe(home, 0755, uid, gid);
2540 if (r < 0 && r != -EEXIST) {
2541 log_error("Failed to make home directory: %s", strerror(-r));
/* The results of these three fchown() calls are ignored. */
2545 fchown(STDIN_FILENO, uid, gid);
2546 fchown(STDOUT_FILENO, uid, gid);
2547 fchown(STDERR_FILENO, uid, gid);
/* Order: supplementary groups first, then group ids, user ids last. */
2549 if (setgroups(n_uids, uids) < 0) {
2550 log_error("Failed to set auxiliary groups: %m");
/* NOTE(review): as above, the messages name setregid() and setreuid()
 * although setresgid() and setresuid() are called. */
2554 if (setresgid(gid, gid, gid) < 0) {
2555 log_error("setregid() failed: %m");
2559 if (setresuid(uid, uid, uid) < 0) {
2560 log_error("setreuid() failed: %m");
/* Entry point of the container manager.
 *
 * Visible phases, in order:
 *   1. Parse arguments and determine arg_directory and arg_machine.
 *   2. Check preconditions (effective uid 0, systemd booted) and collect
 *      passed listen descriptors.
 *   3. Validate the directory tree or, otherwise, create a temporary mount
 *      point and set up and dissect the disk image.
 *   4. Allocate the pseudo tty, optionally a kdbus domain, the kmsg socket
 *      pair and two eventfds used for parent and child synchronization.
 *   5. clone() a child in new namespaces. The child prepares the container
 *      file system, signals readiness, changes root, drops privileges,
 *      waits for the parent and executes the payload.
 *   6. The parent registers the machine, configures networking, releases
 *      the child, forwards the pty and evaluates the exit status.
 *   7. Release the loop device and free the argument storage.
 * NOTE(review): many control flow lines (conditions, labels, jumps, returns)
 * are not visible in this excerpt, so the exact branch structure must be
 * confirmed against the complete file. */
2572 int main(int argc, char *argv[]) {
2574 _cleanup_free_ char *kdbus_domain = NULL, *device_path = NULL, *root_device = NULL, *home_device = NULL, *srv_device = NULL;
2575 bool root_device_rw = true, home_device_rw = true, srv_device_rw = true;
2576 _cleanup_close_ int master = -1, kdbus_fd = -1, image_fd = -1;
2577 _cleanup_close_pair_ int kmsg_socket_pair[2] = { -1, -1 };
2578 _cleanup_fdset_free_ FDSet *fds = NULL;
2579 int r = EXIT_FAILURE, k, n_fd_passed, loop_nr = -1;
2580 const char *console = NULL;
2581 char veth_name[IFNAMSIZ];
2582 bool secondary = false;
2586 log_parse_environment();
2589 k = parse_argv(argc, argv);
/* Make the container directory absolute; the current directory is used in
 * one of the branches (conditions not fully visible). */
2598 if (arg_directory) {
2601 p = path_make_absolute_cwd(arg_directory);
2602 free(arg_directory);
2605 arg_directory = get_current_dir_name();
2607 if (!arg_directory) {
2608 log_error("Failed to determine path, please use -D.");
2611 path_kill_slashes(arg_directory);
/* Derive the machine name from the last path component of the image or
 * of the directory. */
2615 arg_machine = strdup(basename(arg_image ? arg_image : arg_directory));
2621 hostname_cleanup(arg_machine, false);
2622 if (isempty(arg_machine)) {
2623 log_error("Failed to determine machine name automatically, please use -M.");
2628 if (geteuid() != 0) {
2629 log_error("Need to be root.");
2633 if (sd_booted() <= 0) {
2634 log_error("Not running on a systemd system.");
/* Collect descriptors passed in by socket activation and close all
 * others. */
2639 n_fd_passed = sd_listen_fds(false);
2640 if (n_fd_passed > 0) {
2641 k = fdset_new_listen_fds(&fds, false);
2643 log_error("Failed to collect file descriptors: %s", strerror(-k));
2647 fdset_close_others(fds);
2650 if (arg_directory) {
2651 if (path_equal(arg_directory, "/")) {
2652 log_error("Spawning container on root directory not supported.");
2657 if (path_is_os_tree(arg_directory) <= 0) {
2658 log_error("Directory %s doesn't look like an OS root directory (/etc/os-release is missing). Refusing.", arg_directory);
/* Check that the requested absolute binary, or else /usr/bin/, exists
 * inside the tree. */
2664 p = strappenda(arg_directory,
2665 argc > optind && path_is_absolute(argv[optind]) ? argv[optind] : "/usr/bin/");
2666 if (access(p, F_OK) < 0) {
2667 log_error("Directory %s lacks the binary to execute or doesn't look like a binary tree. Refusing.", arg_directory);
/* Image case: a temporary directory becomes the mount point. */
2673 char template[] = "/tmp/nspawn-root-XXXXXX";
2675 if (!mkdtemp(template)) {
2676 log_error("Failed to create temporary directory: %m");
2681 arg_directory = strdup(template);
2682 if (!arg_directory) {
2687 image_fd = setup_image(&device_path, &loop_nr);
2693 r = dissect_image(image_fd, &root_device, &root_device_rw, &home_device, &home_device_rw, &srv_device, &srv_device_rw, &secondary);
2698 master = posix_openpt(O_RDWR|O_NOCTTY|O_CLOEXEC|O_NDELAY);
2700 log_error("Failed to acquire pseudo tty: %m");
2704 console = ptsname(master);
2706 log_error("Failed to determine tty name: %m");
2711 log_info("Spawning container %s on %s. Press ^] three times within 1s to abort execution.", arg_machine, arg_image ? arg_image : arg_directory);
2713 if (unlockpt(master) < 0) {
2714 log_error("Failed to unlock tty: %m");
2718 if (access("/dev/kdbus/control", F_OK) >= 0) {
2720 if (arg_share_system) {
2721 kdbus_domain = strdup("/dev/kdbus");
2722 if (!kdbus_domain) {
2729 ns = strappenda("machine-", arg_machine);
/* NOTE(review): the result of the call is stored in kdbus_fd, but the
 * failure message below formats r. The guarding condition is not visible
 * in this excerpt -- confirm that the intended variable is used. */
2730 kdbus_fd = bus_kernel_create_domain(ns, &kdbus_domain);
2732 log_debug("Failed to create kdbus domain: %s", strerror(-r));
2734 log_debug("Successfully created kdbus domain as %s", kdbus_domain);
2738 if (socketpair(AF_UNIX, SOCK_DGRAM|SOCK_NONBLOCK|SOCK_CLOEXEC, 0, kmsg_socket_pair) < 0) {
2739 log_error("Failed to create kmsg socket pair: %m");
2743 sd_notify(0, "READY=1");
/* Block the signals that are handled later while the pty is processed. */
2745 assert_se(sigemptyset(&mask) == 0);
2746 sigset_add_many(&mask, SIGCHLD, SIGWINCH, SIGTERM, SIGINT, -1);
2747 assert_se(sigprocmask(SIG_BLOCK, &mask, NULL) == 0);
/* Two eventfds form a handshake: the child signals child_ready_fd, the
 * parent signals parent_ready_fd. */
2750 int parent_ready_fd = -1, child_ready_fd = -1;
2754 parent_ready_fd = eventfd(0, EFD_CLOEXEC);
2755 if (parent_ready_fd < 0) {
2756 log_error("Failed to create event fd: %m");
2760 child_ready_fd = eventfd(0, EFD_CLOEXEC);
2761 if (child_ready_fd < 0) {
2762 log_error("Failed to create event fd: %m");
/* A new mount namespace is always requested; IPC, PID and UTS namespaces
 * unless arg_share_system; a network namespace if arg_private_network. */
2766 pid = syscall(__NR_clone,
2767 SIGCHLD|CLONE_NEWNS|
2768 (arg_share_system ? 0 : CLONE_NEWIPC|CLONE_NEWPID|CLONE_NEWUTS)|
2769 (arg_private_network ? CLONE_NEWNET : 0), NULL);
2771 if (errno == EINVAL)
2772 log_error("clone() failed, do you have namespace support enabled in your kernel? (You need UTS, IPC, PID and NET namespacing built in): %m");
2774 log_error("clone() failed: %m");
/* From here on: code run by the child (presumably under a pid == 0
 * condition that is not visible in this excerpt). */
2781 _cleanup_free_ char *home = NULL;
2783 const char *envp[] = {
2784 "PATH=" DEFAULT_PATH_SPLIT_USR,
2785 "container=systemd-nspawn", /* LXC sets container=lxc, so follow the scheme here */
2790 NULL, /* container_uuid */
2791 NULL, /* LISTEN_FDS */
2792 NULL, /* LISTEN_PID */
2797 envp[n_env] = strv_find_prefix(environ, "TERM=");
2801 master = safe_close(master);
2803 close_nointr(STDIN_FILENO);
2804 close_nointr(STDOUT_FILENO);
2805 close_nointr(STDERR_FILENO);
2807 kmsg_socket_pair[0] = safe_close(kmsg_socket_pair[0]);
2809 reset_all_signal_handlers();
2811 assert_se(sigemptyset(&mask) == 0);
2812 assert_se(sigprocmask(SIG_SETMASK, &mask, NULL) == 0);
/* Standard input was closed above, so the console is expected to be
 * opened as descriptor 0; it is then duplicated to 1 and 2. */
2814 k = open_terminal(console, O_RDWR);
2815 if (k != STDIN_FILENO) {
2821 log_error("Failed to open console: %s", strerror(-k));
2825 if (dup2(STDIN_FILENO, STDOUT_FILENO) != STDOUT_FILENO ||
2826 dup2(STDIN_FILENO, STDERR_FILENO) != STDERR_FILENO) {
2827 log_error("Failed to duplicate console: %m");
2832 log_error("setsid() failed: %m");
2836 if (reset_audit_loginuid() < 0)
/* Have the kernel send SIGKILL to the child when the parent dies. */
2839 if (prctl(PR_SET_PDEATHSIG, SIGKILL) < 0) {
2840 log_error("PR_SET_PDEATHSIG failed: %m");
2844 /* Mark everything as slave, so that we still
2845 * receive mounts from the real root, but don't
2846 * propagate mounts to the real root. */
2847 if (mount(NULL, "/", NULL, MS_SLAVE|MS_REC, NULL) < 0) {
2848 log_error("MS_SLAVE|MS_REC failed: %m");
2852 if (mount_devices(arg_directory,
2853 root_device, root_device_rw,
2854 home_device, home_device_rw,
2855 srv_device, srv_device_rw) < 0)
2858 /* Turn directory into bind mount */
2859 if (mount(arg_directory, arg_directory, "bind", MS_BIND|MS_REC, NULL) < 0) {
2860 log_error("Failed to make bind mount.");
2865 if (mount(arg_directory, arg_directory, "bind", MS_BIND|MS_REMOUNT|MS_RDONLY|MS_REC, NULL) < 0) {
2866 log_error("Failed to make read-only.");
/* Populate the container tree; every helper below receives the
 * container root directory. */
2870 if (mount_all(arg_directory) < 0)
2873 if (copy_devnodes(arg_directory) < 0)
2876 if (setup_ptmx(arg_directory) < 0)
2879 dev_setup(arg_directory);
2881 if (audit_still_doesnt_work_in_containers() < 0)
2884 if (setup_dev_console(arg_directory, console) < 0)
2887 if (setup_kmsg(arg_directory, kmsg_socket_pair[1]) < 0)
2890 kmsg_socket_pair[1] = safe_close(kmsg_socket_pair[1]);
2892 if (setup_boot_id(arg_directory) < 0)
2895 if (setup_timezone(arg_directory) < 0)
2898 if (setup_resolv_conf(arg_directory) < 0)
2901 if (setup_journal(arg_directory) < 0)
2904 if (mount_binds(arg_directory, arg_bind, 0) < 0)
2907 if (mount_binds(arg_directory, arg_bind_ro, MS_RDONLY) < 0)
2910 if (setup_kdbus(arg_directory, kdbus_domain) < 0)
2913 /* Tell the parent that we are ready, and that
2914 * it can cgroupify us so that we lack access
2915 * to certain devices and resources. */
2916 eventfd_write(child_ready_fd, 1);
2917 child_ready_fd = safe_close(child_ready_fd);
/* Switch the root: move the container mount onto /, then chroot into
 * it. */
2919 if (chdir(arg_directory) < 0) {
2920 log_error("chdir(%s) failed: %m", arg_directory);
2924 if (mount(arg_directory, "/", NULL, MS_MOVE, NULL) < 0) {
2925 log_error("mount(MS_MOVE) failed: %m");
2929 if (chroot(".") < 0) {
2930 log_error("chroot() failed: %m");
2934 if (chdir("/") < 0) {
2935 log_error("chdir() failed: %m");
2941 if (arg_private_network)
2944 if (drop_capabilities() < 0) {
2945 log_error("drop_capabilities() failed: %m");
2949 r = change_uid_gid(&home);
/* Fill the reserved NULL slots of envp; n_env advances with each
 * entry. */
2953 if ((asprintf((char**)(envp + n_env++), "HOME=%s", home ? home: "/root") < 0) ||
2954 (asprintf((char**)(envp + n_env++), "USER=%s", arg_user ? arg_user : "root") < 0) ||
2955 (asprintf((char**)(envp + n_env++), "LOGNAME=%s", arg_user ? arg_user : "root") < 0)) {
2960 if (!sd_id128_equal(arg_uuid, SD_ID128_NULL)) {
2963 if (asprintf((char**)(envp + n_env++), "container_uuid=%s", id128_format_as_uuid(arg_uuid, as_uuid)) < 0) {
/* Passed descriptors must survive the exec, so O_CLOEXEC is cleared;
 * LISTEN_PID is set to 1. */
2969 if (fdset_size(fds) > 0) {
2970 k = fdset_cloexec(fds, false);
2972 log_error("Failed to unset O_CLOEXEC for file descriptors.");
2976 if ((asprintf((char **)(envp + n_env++), "LISTEN_FDS=%u", n_fd_passed) < 0) ||
2977 (asprintf((char **)(envp + n_env++), "LISTEN_PID=1") < 0)) {
/* An explicitly requested personality takes precedence; otherwise
 * PER_LINUX32 is used when the secondary flag is set. */
2985 if (arg_personality != 0xffffffffLU) {
2986 if (personality(arg_personality) < 0) {
2987 log_error("personality() failed: %m");
2990 } else if (secondary) {
2991 if (personality(PER_LINUX32) < 0) {
2992 log_error("personality() failed: %m");
2998 if (arg_selinux_context)
2999 if (setexeccon((security_context_t) arg_selinux_context) < 0) {
3000 log_error("setexeccon(\"%s\") failed: %m", arg_selinux_context);
3005 if (!strv_isempty(arg_setenv)) {
3008 n = strv_env_merge(2, envp, arg_setenv);
3016 env_use = (char**) envp;
3018 /* Wait until the parent is ready with the setup, too... */
3019 eventfd_read(parent_ready_fd, &x);
3020 parent_ready_fd = safe_close(parent_ready_fd);
3026 /* Automatically search for the init system */
/* The argument vector keeps slot 0 free for the init path; each
 * execve() only returns on failure, so the next candidate is tried. */
3028 l = 1 + argc - optind;
3029 a = newa(char*, l + 1);
3030 memcpy(a + 1, argv + optind, l * sizeof(char*));
3032 a[0] = (char*) "/usr/lib/systemd/systemd";
3033 execve(a[0], a, env_use);
3035 a[0] = (char*) "/lib/systemd/systemd";
3036 execve(a[0], a, env_use);
3038 a[0] = (char*) "/sbin/init";
3039 execve(a[0], a, env_use);
3040 } else if (argc > optind)
3041 execvpe(argv[optind], argv + optind, env_use);
/* Fallback: a shell started from the home directory, with bash tried
 * before sh. */
3043 chdir(home ? home : "/root");
3044 execle("/bin/bash", "-bash", NULL, env_use);
3045 execle("/bin/sh", "-sh", NULL, env_use);
3048 log_error("execv() failed: %m");
3051 _exit(EXIT_FAILURE);
3057 /* Wait until the child reported that it is ready with
3058 * all it needs to do with privileges. After we got
3059 * the notification we can make the process join its
3060 * cgroup which might limit what it can do */
3061 eventfd_read(child_ready_fd, &x);
3063 r = register_machine(pid);
3067 r = move_network_interfaces(pid);
3071 r = setup_veth(pid, veth_name);
3075 r = setup_bridge(veth_name);
3079 r = setup_macvlan(pid);
3083 /* Notify the child that the parent is ready with all
3084 * its setup, and that the child can now hand over
3085 * control to the code to run inside the container. */
3086 eventfd_write(parent_ready_fd, 1);
/* Forward data between the terminal and the pty; pid and SIGRTMIN+3 are
 * only handed over in boot mode. */
3088 k = process_pty(master, &mask, arg_boot ? pid : 0, SIGRTMIN+3);
3097 /* Kill if it is not dead yet anyway */
3098 terminate_machine(pid);
3100 /* Redundant, but better safe than sorry */
3103 k = wait_for_terminate(pid, &status);
/* Interpret how the container payload ended. */
3111 if (status.si_code == CLD_EXITED) {
3112 r = status.si_status;
3113 if (status.si_status != 0) {
3114 log_error("Container %s failed with error code %i.", arg_machine, status.si_status);
3119 log_debug("Container %s exited successfully.", arg_machine);
/* Termination by SIGINT is reported as a shutdown, termination by
 * SIGHUP as a reboot. */
3121 } else if (status.si_code == CLD_KILLED &&
3122 status.si_status == SIGINT) {
3125 log_info("Container %s has been shut down.", arg_machine);
3128 } else if (status.si_code == CLD_KILLED &&
3129 status.si_status == SIGHUP) {
3132 log_info("Container %s is being rebooted.", arg_machine);
3134 } else if (status.si_code == CLD_KILLED ||
3135 status.si_code == CLD_DUMPED) {
3137 log_error("Container %s terminated by signal %s.", arg_machine, signal_to_string(status.si_status));
3141 log_error("Container %s failed due to unknown reason.", arg_machine);
/* Cleanup: detach the loop device and free the argument storage. */
3148 loop_remove(loop_nr, &image_fd);
3153 free(arg_directory);
3156 strv_free(arg_setenv);
3157 strv_free(arg_network_interfaces);
3158 strv_free(arg_network_macvlan);
3159 strv_free(arg_bind);
3160 strv_free(arg_bind_ro);