1 /*-*- Mode: C; c-basic-offset: 8; indent-tabs-mode: nil -*-*/
4 This file is part of systemd.
6 Copyright 2010 Lennart Poettering
8 systemd is free software; you can redistribute it and/or modify it
9 under the terms of the GNU Lesser General Public License as published by
10 the Free Software Foundation; either version 2.1 of the License, or
11 (at your option) any later version.
13 systemd is distributed in the hope that it will be useful, but
14 WITHOUT ANY WARRANTY; without even the implied warranty of
15 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
16 Lesser General Public License for more details.
18 You should have received a copy of the GNU Lesser General Public License
19 along with systemd; If not, see <http://www.gnu.org/licenses/>.
25 #include <sys/types.h>
26 #include <sys/syscall.h>
27 #include <sys/mount.h>
33 #include <sys/prctl.h>
34 #include <sys/capability.h>
37 #include <sys/signalfd.h>
41 #include <sys/socket.h>
42 #include <linux/netlink.h>
43 #include <sys/eventfd.h>
45 #include <linux/veth.h>
46 #include <sys/personality.h>
47 #include <linux/loop.h>
50 #include <selinux/selinux.h>
58 #include <blkid/blkid.h>
61 #include "sd-daemon.h"
71 #include "cgroup-util.h"
73 #include "path-util.h"
74 #include "loopback-setup.h"
75 #include "dev-setup.h"
80 #include "bus-error.h"
82 #include "bus-kernel.h"
85 #include "rtnl-util.h"
86 #include "udev-util.h"
87 #include "eventfd-util.h"
88 #include "blkid-util.h"
90 #include "siphash24.h"
93 #include "seccomp-util.h"
/* Enumeration named ContainerStatus; its enumerators are not visible in
 * this excerpt. */
96 typedef enum ContainerStatus {
/* Journal link mode selected with --link-journal=. This file refers to
 * the values LINK_AUTO, LINK_NO, LINK_GUEST and LINK_HOST; the enumerator
 * list itself is not visible in this excerpt. */
101 typedef enum LinkJournal {
/* Settings collected from the command line. The initial values below are
 * the defaults; parse_argv() overwrites them as options are processed. */
108 static char *arg_directory = NULL;
109 static char *arg_user = NULL;
110 static sd_id128_t arg_uuid = {};
111 static char *arg_machine = NULL;
112 static const char *arg_selinux_context = NULL;
113 static const char *arg_selinux_apifs_context = NULL;
114 static const char *arg_slice = NULL;
115 static bool arg_private_network = false;
116 static bool arg_read_only = false;
117 static bool arg_boot = false;
118 static LinkJournal arg_link_journal = LINK_AUTO;
/* Bitmask of capabilities to retain, one bit per CAP_* number.
 * parse_argv() adjusts it for --capability=/--drop-capability=, and
 * drop_capabilities() passes its complement to
 * capability_bounding_set_drop(). The line that terminates this
 * initializer is not visible in this excerpt. */
119 static uint64_t arg_retain =
120 (1ULL << CAP_CHOWN) |
121 (1ULL << CAP_DAC_OVERRIDE) |
122 (1ULL << CAP_DAC_READ_SEARCH) |
123 (1ULL << CAP_FOWNER) |
124 (1ULL << CAP_FSETID) |
125 (1ULL << CAP_IPC_OWNER) |
127 (1ULL << CAP_LEASE) |
128 (1ULL << CAP_LINUX_IMMUTABLE) |
129 (1ULL << CAP_NET_BIND_SERVICE) |
130 (1ULL << CAP_NET_BROADCAST) |
131 (1ULL << CAP_NET_RAW) |
132 (1ULL << CAP_SETGID) |
133 (1ULL << CAP_SETFCAP) |
134 (1ULL << CAP_SETPCAP) |
135 (1ULL << CAP_SETUID) |
136 (1ULL << CAP_SYS_ADMIN) |
137 (1ULL << CAP_SYS_CHROOT) |
138 (1ULL << CAP_SYS_NICE) |
139 (1ULL << CAP_SYS_PTRACE) |
140 (1ULL << CAP_SYS_TTY_CONFIG) |
141 (1ULL << CAP_SYS_RESOURCE) |
142 (1ULL << CAP_SYS_BOOT) |
143 (1ULL << CAP_AUDIT_WRITE) |
144 (1ULL << CAP_AUDIT_CONTROL) |
/* --bind=/--bind-ro= entries are stored as consecutive (source,
 * destination) pairs; mount_binds() walks them with STRV_FOREACH_PAIR. */
146 static char **arg_bind = NULL;
147 static char **arg_bind_ro = NULL;
148 static char **arg_setenv = NULL;
149 static bool arg_quiet = false;
150 static bool arg_share_system = false;
151 static bool arg_register = true;
152 static bool arg_keep_unit = false;
153 static char **arg_network_interfaces = NULL;
154 static char **arg_network_macvlan = NULL;
155 static bool arg_network_veth = false;
156 static const char *arg_network_bridge = NULL;
/* 0xffffffffLU is the same value parse_argv() treats as a failed
 * personality_from_string() lookup. */
157 static unsigned long arg_personality = 0xffffffffLU;
158 static const char *arg_image = NULL;
/* Prints the command-line usage summary to stdout, with
 * program_invocation_short_name substituted for the leading %s. Some
 * help-text lines and the end of the function are not visible in this
 * excerpt. */
160 static int help(void) {
162 printf("%s [OPTIONS...] [PATH] [ARGUMENTS...]\n\n"
163 "Spawn a minimal namespace container for debugging, testing and building.\n\n"
164 " -h --help Show this help\n"
165 " --version Print version string\n"
166 " -q --quiet Do not show status information\n"
167 " -D --directory=PATH Root directory for the container\n"
168 " -i --image=PATH File system device or image for the container\n"
169 " -b --boot Boot up full system (i.e. invoke init)\n"
170 " -u --user=USER Run the command under specified user or uid\n"
171 " -M --machine=NAME Set the machine name for the container\n"
172 " --uuid=UUID Set a specific machine UUID for the container\n"
173 " -S --slice=SLICE Place the container in the specified slice\n"
174 " --private-network Disable network in container\n"
175 " --network-interface=INTERFACE\n"
176 " Assign an existing network interface to the\n"
178 " --network-macvlan=INTERFACE\n"
179 " Create a macvlan network interface based on an\n"
180 " existing network interface to the container\n"
181 " --network-veth Add a virtual ethernet connection between host\n"
183 " --network-bridge=INTERFACE\n"
184 " Add a virtual ethernet connection between host\n"
185 " and container and add it to an existing bridge on\n"
187 " -Z --selinux-context=SECLABEL\n"
188 " Set the SELinux security context to be used by\n"
189 " processes in the container\n"
190 " -L --selinux-apifs-context=SECLABEL\n"
191 " Set the SELinux security context to be used by\n"
192 " API/tmpfs file systems in the container\n"
193 " --capability=CAP In addition to the default, retain specified\n"
195 " --drop-capability=CAP Drop the specified capability from the default set\n"
196 " --link-journal=MODE Link up guest journal, one of no, auto, guest, host\n"
197 " -j Equivalent to --link-journal=host\n"
198 " --read-only Mount the root directory read-only\n"
199 " --bind=PATH[:PATH] Bind mount a file or directory from the host into\n"
201 " --bind-ro=PATH[:PATH] Similar, but creates a read-only bind mount\n"
202 " --setenv=NAME=VALUE Pass an environment variable to PID 1\n"
203 " --share-system Share system namespaces with host\n"
204 " --register=BOOLEAN Register container as machine\n"
205 " --keep-unit Do not register a scope for the machine, reuse\n"
206 " the service unit nspawn is running in\n",
207 program_invocation_short_name);
/* Parses the command line with getopt_long() and stores the results in
 * the arg_* globals. After the option loop it rejects incompatible
 * combinations (--boot with --share-system, --keep-unit from a user
 * session, --directory= with --image=) and folds the --capability= and
 * --drop-capability= masks into arg_retain. Many lines of this function
 * (case labels, returns, closing braces) are not visible in this
 * excerpt. */
212 static int parse_argv(int argc, char *argv[]) {
228 ARG_NETWORK_INTERFACE,
235 static const struct option options[] = {
236 { "help", no_argument, NULL, 'h' },
237 { "version", no_argument, NULL, ARG_VERSION },
238 { "directory", required_argument, NULL, 'D' },
239 { "user", required_argument, NULL, 'u' },
240 { "private-network", no_argument, NULL, ARG_PRIVATE_NETWORK },
241 { "boot", no_argument, NULL, 'b' },
242 { "uuid", required_argument, NULL, ARG_UUID },
243 { "read-only", no_argument, NULL, ARG_READ_ONLY },
244 { "capability", required_argument, NULL, ARG_CAPABILITY },
245 { "drop-capability", required_argument, NULL, ARG_DROP_CAPABILITY },
246 { "link-journal", required_argument, NULL, ARG_LINK_JOURNAL },
247 { "bind", required_argument, NULL, ARG_BIND },
248 { "bind-ro", required_argument, NULL, ARG_BIND_RO },
249 { "machine", required_argument, NULL, 'M' },
250 { "slice", required_argument, NULL, 'S' },
251 { "setenv", required_argument, NULL, ARG_SETENV },
252 { "selinux-context", required_argument, NULL, 'Z' },
253 { "selinux-apifs-context", required_argument, NULL, 'L' },
254 { "quiet", no_argument, NULL, 'q' },
255 { "share-system", no_argument, NULL, ARG_SHARE_SYSTEM },
256 { "register", required_argument, NULL, ARG_REGISTER },
257 { "keep-unit", no_argument, NULL, ARG_KEEP_UNIT },
258 { "network-interface", required_argument, NULL, ARG_NETWORK_INTERFACE },
259 { "network-macvlan", required_argument, NULL, ARG_NETWORK_MACVLAN },
260 { "network-veth", no_argument, NULL, ARG_NETWORK_VETH },
261 { "network-bridge", required_argument, NULL, ARG_NETWORK_BRIDGE },
262 { "personality", required_argument, NULL, ARG_PERSONALITY },
263 { "image", required_argument, NULL, 'i' },
268 uint64_t plus = 0, minus = 0;
273 while ((c = getopt_long(argc, argv, "+hD:u:bL:M:jS:Z:qi:", options, NULL)) >= 0) {
281 puts(PACKAGE_STRING);
282 puts(SYSTEMD_FEATURES);
287 arg_directory = canonicalize_file_name(optarg);
288 if (!arg_directory) {
289 log_error("Invalid root directory: %m");
301 arg_user = strdup(optarg);
307 case ARG_NETWORK_BRIDGE:
308 arg_network_bridge = optarg;
312 case ARG_NETWORK_VETH:
313 arg_network_veth = true;
314 arg_private_network = true;
317 case ARG_NETWORK_INTERFACE:
318 if (strv_extend(&arg_network_interfaces, optarg) < 0)
321 arg_private_network = true;
324 case ARG_NETWORK_MACVLAN:
325 if (strv_extend(&arg_network_macvlan, optarg) < 0)
330 case ARG_PRIVATE_NETWORK:
331 arg_private_network = true;
339 r = sd_id128_from_string(optarg, &arg_uuid);
341 log_error("Invalid UUID: %s", optarg);
351 if (isempty(optarg)) {
356 if (!hostname_is_valid(optarg)) {
357 log_error("Invalid machine name: %s", optarg);
362 arg_machine = strdup(optarg);
370 arg_selinux_context = optarg;
374 arg_selinux_apifs_context = optarg;
378 arg_read_only = true;
382 case ARG_DROP_CAPABILITY: {
386 FOREACH_WORD_SEPARATOR(word, length, optarg, ",", state) {
387 _cleanup_free_ char *t;
390 t = strndup(word, length);
394 if (streq(t, "all")) {
395 if (c == ARG_CAPABILITY)
396 plus = (uint64_t) -1;
398 minus = (uint64_t) -1;
400 if (cap_from_name(t, &cap) < 0) {
401 log_error("Failed to parse capability %s.", t);
405 if (c == ARG_CAPABILITY)
406 plus |= 1ULL << (uint64_t) cap;
408 minus |= 1ULL << (uint64_t) cap;
/* NOTE(review): the case label for the next assignment is not visible
 * here. If it is the -j switch, help() describes -j as
 * "--link-journal=host" while this sets LINK_GUEST -- confirm which
 * one is intended. */
416 arg_link_journal = LINK_GUEST;
419 case ARG_LINK_JOURNAL:
420 if (streq(optarg, "auto"))
421 arg_link_journal = LINK_AUTO;
422 else if (streq(optarg, "no"))
423 arg_link_journal = LINK_NO;
424 else if (streq(optarg, "guest"))
425 arg_link_journal = LINK_GUEST;
426 else if (streq(optarg, "host"))
427 arg_link_journal = LINK_HOST;
429 log_error("Failed to parse link journal mode %s", optarg);
437 _cleanup_free_ char *a = NULL, *b = NULL;
441 x = c == ARG_BIND ? &arg_bind : &arg_bind_ro;
443 e = strchr(optarg, ':');
445 a = strndup(optarg, e - optarg);
455 if (!path_is_absolute(a) || !path_is_absolute(b)) {
456 log_error("Invalid bind mount specification: %s", optarg);
460 r = strv_extend(x, a);
464 r = strv_extend(x, b);
474 if (!env_assignment_is_valid(optarg)) {
475 log_error("Environment variable assignment '%s' is not valid.", optarg);
479 n = strv_env_set(arg_setenv, optarg);
483 strv_free(arg_setenv);
492 case ARG_SHARE_SYSTEM:
493 arg_share_system = true;
497 r = parse_boolean(optarg);
499 log_error("Failed to parse --register= argument: %s", optarg);
507 arg_keep_unit = true;
510 case ARG_PERSONALITY:
512 arg_personality = personality_from_string(optarg);
513 if (arg_personality == 0xffffffffLU) {
514 log_error("Unknown or unsupported personality '%s'.", optarg);
524 assert_not_reached("Unhandled option");
/* --share-system turns machine registration off. */
528 if (arg_share_system)
529 arg_register = false;
531 if (arg_boot && arg_share_system) {
532 log_error("--boot and --share-system may not be combined.");
536 if (arg_keep_unit && cg_pid_get_owner_uid(0, NULL) >= 0) {
537 log_error("--keep-unit may not be used when invoked from a user session.");
541 if (arg_directory && arg_image) {
542 log_error("--directory= and --image= may not be combined.");
/* Final retained set: the defaults, plus everything named with
 * --capability=, plus CAP_NET_ADMIN when a private network is in use,
 * minus everything named with --drop-capability=. */
546 arg_retain = (arg_retain | plus | (arg_private_network ? 1ULL << CAP_NET_ADMIN : 0)) & ~minus;
/* Mounts the entries of mount_table beneath dest. For each entry the
 * target path is built as dest + "/" + where, checked with
 * path_is_mount_point(), created with mkdir_p() and then passed to
 * mount(). A mount() failure is reported with log_error() for entries
 * whose fatal field is set. The MountPoint field list and several
 * statements are not visible in this excerpt. */
551 static int mount_all(const char *dest) {
553 typedef struct MountPoint {
562 static const MountPoint mount_table[] = {
563 { "proc", "/proc", "proc", NULL, MS_NOSUID|MS_NOEXEC|MS_NODEV, true },
564 { "/proc/sys", "/proc/sys", NULL, NULL, MS_BIND, true }, /* Bind mount first */
565 { NULL, "/proc/sys", NULL, NULL, MS_BIND|MS_RDONLY|MS_REMOUNT, true }, /* Then, make it r/o */
566 { "sysfs", "/sys", "sysfs", NULL, MS_RDONLY|MS_NOSUID|MS_NOEXEC|MS_NODEV, true },
567 { "tmpfs", "/dev", "tmpfs", "mode=755", MS_NOSUID|MS_STRICTATIME, true },
568 { "devpts", "/dev/pts", "devpts","newinstance,ptmxmode=0666,mode=620,gid=" STRINGIFY(TTY_GID), MS_NOSUID|MS_NOEXEC, true },
569 { "tmpfs", "/dev/shm", "tmpfs", "mode=1777", MS_NOSUID|MS_NODEV|MS_STRICTATIME, true },
570 { "tmpfs", "/run", "tmpfs", "mode=755", MS_NOSUID|MS_NODEV|MS_STRICTATIME, true },
572 { "/sys/fs/selinux", "/sys/fs/selinux", NULL, NULL, MS_BIND, false }, /* Bind mount first */
573 { NULL, "/sys/fs/selinux", NULL, NULL, MS_BIND|MS_RDONLY|MS_REMOUNT, false }, /* Then, make it r/o */
580 for (k = 0; k < ELEMENTSOF(mount_table); k++) {
581 _cleanup_free_ char *where = NULL;
583 _cleanup_free_ char *options = NULL;
588 where = strjoin(dest, "/", mount_table[k].where, NULL);
592 t = path_is_mount_point(where, true);
594 log_error("Failed to detect whether %s is a mount point: %s", where, strerror(-t));
602 /* Skip this entry if it is not a remount. */
603 if (mount_table[k].what && t > 0)
606 mkdir_p(where, 0755);
/* When an SELinux API file system context was given, tmpfs and devpts
 * mounts get a context="..." option appended to their option string. */
609 if (arg_selinux_apifs_context &&
610 (streq_ptr(mount_table[k].what, "tmpfs") || streq_ptr(mount_table[k].what, "devpts"))) {
611 options = strjoin(mount_table[k].options, ",context=\"", arg_selinux_apifs_context, "\"", NULL);
618 o = mount_table[k].options;
621 if (mount(mount_table[k].what,
624 mount_table[k].flags,
626 mount_table[k].fatal) {
628 log_error("mount(%s) failed: %m", where);
/* Bind mounts every (source, destination) pair of l into the tree at
 * dest. The source is stat()ed first; if the destination already exists
 * its file type must match the source, otherwise the parent directories
 * and a mount point of the matching type are created. A call to
 * bind_remount_recursive() follows the mount; the condition guarding it
 * (presumably the ro parameter) is not visible in this excerpt. */
638 static int mount_binds(const char *dest, char **l, bool ro) {
641 STRV_FOREACH_PAIR(x, y, l) {
643 struct stat source_st, dest_st;
646 if (stat(*x, &source_st) < 0) {
647 log_error("Failed to stat %s: %m", *x);
651 where = strappenda(dest, *y);
652 r = stat(where, &dest_st);
654 if ((source_st.st_mode & S_IFMT) != (dest_st.st_mode & S_IFMT)) {
655 log_error("The file types of %s and %s do not match. Refusing bind mount",
659 } else if (errno == ENOENT) {
660 r = mkdir_parents_label(where, 0755);
662 log_error("Failed to bind mount %s: %s", *x, strerror(-r));
666 log_error("Failed to bind mount %s: %m", *x);
669 /* Create the mount point, but be conservative -- refuse to create block
670 * and char devices. */
671 if (S_ISDIR(source_st.st_mode))
672 mkdir_label(where, 0755);
673 else if (S_ISFIFO(source_st.st_mode))
675 else if (S_ISSOCK(source_st.st_mode))
676 mknod(where, 0644 | S_IFSOCK, 0);
677 else if (S_ISREG(source_st.st_mode))
680 log_error("Refusing to create mountpoint for file: %s", *x);
684 if (mount(*x, where, "bind", MS_BIND, NULL) < 0) {
685 log_error("mount(%s) failed: %m", where);
690 r = bind_remount_recursive(where, true);
692 log_error("Read-Only bind mount failed: %s", strerror(-r));
/* Tries to give the container the host's time zone. The host's
 * /etc/localtime symlink is read and must point into
 * /usr/share/zoneinfo/ (relative or absolute form); the zone name that
 * follows that prefix is compared with the container's own
 * /etc/localtime link. If they differ and the zone file exists inside
 * the container, dest/etc/localtime is made a symlink to
 * ../usr/share/zoneinfo/<zone>. Problems on the host side only produce
 * warnings. Some statements are not visible in this excerpt. */
701 static int setup_timezone(const char *dest) {
702 _cleanup_free_ char *where = NULL, *p = NULL, *q = NULL, *check = NULL, *what = NULL;
708 /* Fix the timezone, if possible */
709 r = readlink_malloc("/etc/localtime", &p);
711 log_warning("/etc/localtime is not a symlink, not updating container timezone.");
715 z = path_startswith(p, "../usr/share/zoneinfo/");
717 z = path_startswith(p, "/usr/share/zoneinfo/");
719 log_warning("/etc/localtime does not point into /usr/share/zoneinfo/, not updating container timezone.");
723 where = strappend(dest, "/etc/localtime");
727 r = readlink_malloc(where, &q);
729 y = path_startswith(q, "../usr/share/zoneinfo/");
731 y = path_startswith(q, "/usr/share/zoneinfo/");
734 /* Already pointing to the right place? Then do nothing .. */
735 if (y && streq(y, z))
739 check = strjoin(dest, "/usr/share/zoneinfo/", z, NULL);
743 if (access(check, F_OK) < 0) {
744 log_warning("Timezone %s does not exist in container, not updating container timezone.", z);
748 what = strappend("../usr/share/zoneinfo/", z);
753 if (symlink(what, where) < 0) {
754 log_error("Failed to correct timezone of container: %m");
/* Copies the host's /etc/resolv.conf to dest/etc/resolv.conf on a
 * best-effort basis; the result of copy_file() is ignored. The
 * statement under the arg_private_network check is not visible in this
 * excerpt (presumably an early return -- confirm). */
761 static int setup_resolv_conf(const char *dest) {
762 char _cleanup_free_ *where = NULL;
766 if (arg_private_network)
769 /* Fix resolv.conf, if possible */
770 where = strappend(dest, "/etc/resolv.conf");
774 /* We don't really care about the result of this. If it
775 * fails, it fails, but meh... */
776 copy_file("/etc/resolv.conf", where, O_TRUNC|O_NOFOLLOW);
/* Formats the 16 bytes of id as lowercase hex in 8-4-4-4-12 groups
 * separated by dashes, i.e. 36 characters plus the terminating NUL, which
 * is why s has 37 elements. The formatting call itself and the return
 * statement are not visible in this excerpt. */
781 static char* id128_format_as_uuid(sd_id128_t id, char s[37]) {
784 "%02x%02x%02x%02x-%02x%02x-%02x%02x-%02x%02x-%02x%02x%02x%02x%02x%02x",
785 SD_ID128_FORMAT_VAL(id));
/* Gives the container its own boot ID: a random 128-bit ID is formatted
 * as a UUID, written to dest/dev/proc-sys-kernel-random-boot-id and that
 * file is bind mounted over dest/proc/sys/kernel/random/boot_id, then
 * remounted read-only (a failure of the read-only remount is only a
 * warning). The statement under the arg_share_system check is not
 * visible in this excerpt. */
790 static int setup_boot_id(const char *dest) {
791 _cleanup_free_ char *from = NULL, *to = NULL;
798 if (arg_share_system)
801 /* Generate a new randomized boot ID, so that each boot-up of
802 * the container gets a new one */
804 from = strappend(dest, "/dev/proc-sys-kernel-random-boot-id");
805 to = strappend(dest, "/proc/sys/kernel/random/boot_id");
809 r = sd_id128_randomize(&rnd);
811 log_error("Failed to generate random boot id: %s", strerror(-r));
815 id128_format_as_uuid(rnd, as_uuid);
817 r = write_string_file(from, as_uuid);
819 log_error("Failed to write boot id: %s", strerror(-r));
823 if (mount(from, to, "bind", MS_BIND, NULL) < 0) {
824 log_error("Failed to bind mount boot id: %m");
826 } else if (mount(from, to, "bind", MS_BIND|MS_REMOUNT|MS_RDONLY, NULL))
827 log_warning("Failed to make boot id read-only: %m");
/* Recreates a fixed list of host device nodes inside the container. For
 * every name in devnodes, /dev/<name> is stat()ed on the host and a node
 * with the same mode and device number is created at dest/dev/<name>
 * with mknod(). A missing host node (ENOENT) is not reported as an
 * error; anything that is neither a character nor a block device is
 * rejected. The devnodes list itself is not visible in this excerpt. */
833 static int copy_devnodes(const char *dest) {
835 static const char devnodes[] =
845 _cleanup_umask_ mode_t u;
851 NULSTR_FOREACH(d, devnodes) {
852 _cleanup_free_ char *from = NULL, *to = NULL;
855 from = strappend("/dev/", d);
856 to = strjoin(dest, "/dev/", d, NULL);
860 if (stat(from, &st) < 0) {
862 if (errno != ENOENT) {
863 log_error("Failed to stat %s: %m", from);
867 } else if (!S_ISCHR(st.st_mode) && !S_ISBLK(st.st_mode)) {
869 log_error("%s is not a char or block device, cannot copy", from);
872 } else if (mknod(to, st.st_mode, st.st_rdev) < 0) {
/* NOTE(review): the message prints dest although mknod() was called on
 * to -- confirm whether to was intended. */
874 log_error("mknod(%s) failed: %m", dest);
/* Creates dest/dev/ptmx as a symlink to pts/ptmx and logs an error if
 * symlink() fails. Return statements are not visible in this excerpt. */
882 static int setup_ptmx(const char *dest) {
883 _cleanup_free_ char *p = NULL;
885 p = strappend(dest, "/dev/ptmx");
889 if (symlink("pts/ptmx", p) < 0) {
890 log_error("Failed to create /dev/ptmx symlink: %m");
/* Makes the TTY named by console appear as dest/dev/console. The TTY's
 * access mode and ownership are set to 0600, 0:0; a placeholder device
 * node copied from the host's /dev/null (mode 0600) is created at
 * dest/dev/console, and console is bind mounted on top of it. */
897 static int setup_dev_console(const char *dest, const char *console) {
898 _cleanup_umask_ mode_t u;
908 if (stat("/dev/null", &st) < 0) {
909 log_error("Failed to stat /dev/null: %m");
913 r = chmod_and_chown(console, 0600, 0, 0);
915 log_error("Failed to correct access mode for TTY: %s", strerror(-r));
919 /* We need to bind mount the right tty to /dev/console since
920 * ptys can only exist on pts file systems. To have something
921 * to bind mount things on we create a device node first, and
922 * use /dev/null for that since the cgroups device policy
923 * allows us to create that freely, while we cannot create
924 * /dev/console. (Note that the major minor doesn't actually
925 * matter here, since we mount it over anyway). */
927 to = strappenda(dest, "/dev/console");
928 if (mknod(to, (st.st_mode & ~07777) | 0600, st.st_rdev) < 0) {
929 log_error("mknod() for /dev/console failed: %m");
933 if (mount(console, to, "bind", MS_BIND, NULL) < 0) {
934 log_error("Bind mount for /dev/console failed: %m");
/* Provides a FIFO-backed /proc/kmsg for the container. A FIFO is created
 * at dest/dev/kmsg (mode 0600, owner 0:0), bind mounted onto
 * dest/proc/kmsg and opened; the resulting file descriptor is sent over
 * kmsg_socket as SCM_RIGHTS ancillary data so that it stays open while
 * the child runs. The final step that removes the FIFO from /dev/kmsg is
 * not visible in this excerpt. kmsg_socket must be a valid descriptor
 * (asserted). */
941 static int setup_kmsg(const char *dest, int kmsg_socket) {
942 _cleanup_free_ char *from = NULL, *to = NULL;
944 _cleanup_umask_ mode_t u;
946 struct cmsghdr cmsghdr;
947 uint8_t buf[CMSG_SPACE(sizeof(int))];
950 .msg_control = &control,
951 .msg_controllen = sizeof(control),
953 struct cmsghdr *cmsg;
956 assert(kmsg_socket >= 0);
960 /* We create the kmsg FIFO as /dev/kmsg, but immediately
961 * delete it after bind mounting it to /proc/kmsg. While FIFOs
962 * on the reading side behave very similar to /proc/kmsg,
963 * their writing side behaves differently from /dev/kmsg in
964 * that writing blocks when nothing is reading. In order to
965 * avoid any problems with containers deadlocking due to this
966 * we simply make /dev/kmsg unavailable to the container. */
967 if (asprintf(&from, "%s/dev/kmsg", dest) < 0 ||
968 asprintf(&to, "%s/proc/kmsg", dest) < 0)
971 if (mkfifo(from, 0600) < 0) {
972 log_error("mkfifo() for /dev/kmsg failed: %m");
976 r = chmod_and_chown(from, 0600, 0, 0);
978 log_error("Failed to correct access mode for /dev/kmsg: %s", strerror(-r));
982 if (mount(from, to, "bind", MS_BIND, NULL) < 0) {
983 log_error("Bind mount for /proc/kmsg failed: %m");
987 fd = open(from, O_RDWR|O_NDELAY|O_CLOEXEC);
989 log_error("Failed to open fifo: %m");
993 cmsg = CMSG_FIRSTHDR(&mh);
994 cmsg->cmsg_level = SOL_SOCKET;
995 cmsg->cmsg_type = SCM_RIGHTS;
996 cmsg->cmsg_len = CMSG_LEN(sizeof(int));
997 memcpy(CMSG_DATA(cmsg), &fd, sizeof(int));
999 mh.msg_controllen = cmsg->cmsg_len;
1001 /* Store away the fd in the socket, so that it stays open as
1002 * long as we run the child */
1003 k = sendmsg(kmsg_socket, &mh, MSG_DONTWAIT|MSG_NOSIGNAL);
1007 log_error("Failed to send FIFO fd: %m");
1011 /* And now make the FIFO unavailable as /dev/kmsg... */
/* Sets the hostname to arg_machine via sethostname(). The statements
 * under the arg_share_system check and under the failure check are not
 * visible in this excerpt. */
1016 static int setup_hostname(void) {
1018 if (arg_share_system)
1021 if (sethostname(arg_machine, strlen(arg_machine)) < 0)
/* Connects the container's journal directory with the host's, according
 * to arg_link_journal. The container's machine ID is read from
 * directory/etc/machine-id and validated; linking is refused when it
 * equals the host's machine ID. With that ID, p is the host path
 * /var/log/journal/<id> and q is the same path inside the container
 * tree. Paths that are already mount points are not touched. For
 * LINK_GUEST the host path p becomes a symlink to q; for LINK_HOST the
 * host directory p is created; the function ends by bind mounting p
 * onto q. In LINK_AUTO mode several conditions are checked separately
 * from the explicit modes; the statements they guard are not visible in
 * this excerpt, nor are a number of other lines. */
1027 static int setup_journal(const char *directory) {
1028 sd_id128_t machine_id, this_id;
1029 _cleanup_free_ char *p = NULL, *b = NULL, *q = NULL, *d = NULL;
1033 p = strappend(directory, "/etc/machine-id");
1037 r = read_one_line_file(p, &b);
1038 if (r == -ENOENT && arg_link_journal == LINK_AUTO)
1041 log_error("Failed to read machine ID from %s: %s", p, strerror(-r));
1046 if (isempty(id) && arg_link_journal == LINK_AUTO)
1049 /* Verify validity */
1050 r = sd_id128_from_string(id, &machine_id);
1052 log_error("Failed to parse machine ID from %s: %s", p, strerror(-r));
1056 r = sd_id128_get_machine(&this_id);
1058 log_error("Failed to retrieve machine ID: %s", strerror(-r));
1062 if (sd_id128_equal(machine_id, this_id)) {
1063 log_full(arg_link_journal == LINK_AUTO ? LOG_WARNING : LOG_ERR,
1064 "Host and machine ids are equal (%s): refusing to link journals", id);
1065 if (arg_link_journal == LINK_AUTO)
1071 if (arg_link_journal == LINK_NO)
1075 p = strappend("/var/log/journal/", id);
1076 q = strjoin(directory, "/var/log/journal/", id, NULL);
1080 if (path_is_mount_point(p, false) > 0) {
1081 if (arg_link_journal != LINK_AUTO) {
1082 log_error("%s: already a mount point, refusing to use for journal", p);
1089 if (path_is_mount_point(q, false) > 0) {
1090 if (arg_link_journal != LINK_AUTO) {
1091 log_error("%s: already a mount point, refusing to use for journal", q);
1098 r = readlink_and_make_absolute(p, &d);
1100 if ((arg_link_journal == LINK_GUEST ||
1101 arg_link_journal == LINK_AUTO) &&
1104 r = mkdir_p(q, 0755);
1106 log_warning("failed to create directory %s: %m", q);
1110 if (unlink(p) < 0) {
1111 log_error("Failed to remove symlink %s: %m", p);
1114 } else if (r == -EINVAL) {
1116 if (arg_link_journal == LINK_GUEST &&
1119 if (errno == ENOTDIR) {
1120 log_error("%s already exists and is neither a symlink nor a directory", p);
1123 log_error("Failed to remove %s: %m", p);
1127 } else if (r != -ENOENT) {
1128 log_error("readlink(%s) failed: %m", p);
1132 if (arg_link_journal == LINK_GUEST) {
1134 if (symlink(q, p) < 0) {
1135 log_error("Failed to symlink %s to %s: %m", q, p);
1139 r = mkdir_p(q, 0755);
1141 log_warning("failed to create directory %s: %m", q);
1145 if (arg_link_journal == LINK_HOST) {
1146 r = mkdir_p(p, 0755);
1148 log_error("Failed to create %s: %m", p);
1152 } else if (access(p, F_OK) < 0)
1155 if (dir_is_empty(q) == 0)
1156 log_warning("%s is not empty, proceeding anyway.", q);
1158 r = mkdir_p(q, 0755);
1160 log_error("Failed to create %s: %m", q);
1164 if (mount(p, q, "bind", MS_BIND, NULL) < 0) {
1165 log_error("Failed to bind mount journal from host into guest: %m");
/* Creates the directory dest/dev/kdbus (mode 0755) and bind mounts path
 * onto it. Both failures are logged; the return statements are not
 * visible in this excerpt. */
1172 static int setup_kdbus(const char *dest, const char *path) {
1178 p = strappenda(dest, "/dev/kdbus");
1179 if (mkdir(p, 0755) < 0) {
1180 log_error("Failed to create kdbus path: %m");
1184 if (mount(path, p, "bind", MS_BIND, NULL) < 0) {
1185 log_error("Failed to mount kdbus domain path: %m");
/* Drops every capability that is not set in arg_retain from the
 * bounding set by passing the complement of arg_retain to
 * capability_bounding_set_drop(), and returns that call's result. */
1192 static int drop_capabilities(void) {
1193 return capability_bounding_set_drop(~arg_retain, false);
/* Registers the container with the org.freedesktop.machine1 service on
 * the system bus (interface org.freedesktop.machine1.Manager). With
 * --keep-unit a single sd_bus_call_method() is issued. Otherwise the
 * method call is assembled by hand so that an array of (sv) properties
 * can be appended: Slice (when arg_slice is set), DevicePolicy=strict
 * and a DeviceAllow list. Both variants pass arg_uuid and arg_directory.
 * The method names, the remaining arguments and the use of pid are not
 * visible in this excerpt. */
1196 static int register_machine(pid_t pid) {
1197 _cleanup_bus_error_free_ sd_bus_error error = SD_BUS_ERROR_NULL;
1198 _cleanup_bus_unref_ sd_bus *bus = NULL;
1204 r = sd_bus_default_system(&bus);
1206 log_error("Failed to open system bus: %s", strerror(-r));
1210 if (arg_keep_unit) {
1211 r = sd_bus_call_method(
1213 "org.freedesktop.machine1",
1214 "/org/freedesktop/machine1",
1215 "org.freedesktop.machine1.Manager",
1221 SD_BUS_MESSAGE_APPEND_ID128(arg_uuid),
1225 strempty(arg_directory));
1227 _cleanup_bus_message_unref_ sd_bus_message *m = NULL;
1229 r = sd_bus_message_new_method_call(
1232 "org.freedesktop.machine1",
1233 "/org/freedesktop/machine1",
1234 "org.freedesktop.machine1.Manager",
1237 log_error("Failed to create message: %s", strerror(-r));
1241 r = sd_bus_message_append(
1245 SD_BUS_MESSAGE_APPEND_ID128(arg_uuid),
1249 strempty(arg_directory));
1251 log_error("Failed to append message arguments: %s", strerror(-r));
1255 r = sd_bus_message_open_container(m, 'a', "(sv)");
1257 log_error("Failed to open container: %s", strerror(-r));
1261 if (!isempty(arg_slice)) {
1262 r = sd_bus_message_append(m, "(sv)", "Slice", "s", arg_slice);
1264 log_error("Failed to append slice: %s", strerror(-r));
1269 r = sd_bus_message_append(m, "(sv)", "DevicePolicy", "s", "strict");
1271 log_error("Failed to add device policy: %s", strerror(-r));
1275 r = sd_bus_message_append(m, "(sv)", "DeviceAllow", "a(ss)", 10,
1276 /* Allow the container to
1277 * access and create the API
1278 * device nodes, so that
1279 * PrivateDevices= in the
1280 * container can work
1285 "/dev/random", "rwm",
1286 "/dev/urandom", "rwm",
1288 /* Allow the container
1289 * access to ptys. However,
1291 * container to ever create
1292 * these device nodes. */
1293 "/dev/pts/ptmx", "rw",
1295 /* Allow the container
1296 * access to all kdbus
1297 * devices. Again, the
1298 * container cannot create
1299 * these nodes, only use
1300 * them. We use a pretty
1301 * open match here, so that
1302 * the kernel API can still
1305 "char-kdbus/*", "rw");
1307 log_error("Failed to add device whitelist: %s", strerror(-r));
1311 r = sd_bus_message_close_container(m);
1313 log_error("Failed to close container: %s", strerror(-r));
1317 r = sd_bus_call(bus, m, 0, &error, NULL);
1321 log_error("Failed to register machine: %s", bus_error_message(&error, r));
// Asks the org.freedesktop.machine1 service on the system bus to
// terminate the machine. A first call on the Manager interface yields an
// object path (read from the reply with signature "o"); a second call is
// then made on the org.freedesktop.machine1.Machine interface. Failures
// of either call are logged at debug level only, because the machine may
// already have been cleaned up. The method names and the use of pid are
// not visible in this excerpt.
1328 static int terminate_machine(pid_t pid) {
1329 _cleanup_bus_error_free_ sd_bus_error error = SD_BUS_ERROR_NULL;
1330 _cleanup_bus_message_unref_ sd_bus_message *reply = NULL;
1331 _cleanup_bus_unref_ sd_bus *bus = NULL;
1338 r = sd_bus_default_system(&bus);
1340 log_error("Failed to open system bus: %s", strerror(-r));
1344 r = sd_bus_call_method(
1346 "org.freedesktop.machine1",
1347 "/org/freedesktop/machine1",
1348 "org.freedesktop.machine1.Manager",
1355 /* Note that the machine might already have been
1356 * cleaned up automatically, hence don't consider it a
1357 * failure if we cannot get the machine object. */
1358 log_debug("Failed to get machine: %s", bus_error_message(&error, r));
1362 r = sd_bus_message_read(reply, "o", &path);
1364 return bus_log_parse_error(r);
1366 r = sd_bus_call_method(
1368 "org.freedesktop.machine1",
1370 "org.freedesktop.machine1.Machine",
1376 log_debug("Failed to terminate machine: %s", bus_error_message(&error, r));
/* Resets the audit login UID of the current process: reads
 * /proc/self/loginuid and, unless it already holds 4294967295, writes
 * that value back. A failed write produces a long explanatory error
 * message. The statement under the arg_share_system check and the
 * return statements are not visible in this excerpt. */
1383 static int reset_audit_loginuid(void) {
1384 _cleanup_free_ char *p = NULL;
1387 if (arg_share_system)
1390 r = read_one_line_file("/proc/self/loginuid", &p);
1394 log_error("Failed to read /proc/self/loginuid: %s", strerror(-r));
1398 /* Already reset? */
1399 if (streq(p, "4294967295"))
1402 r = write_string_file("/proc/self/loginuid", "4294967295");
/* NOTE(review): the message below reads "or to off auditing"; a word
 * such as "turn" appears to be missing -- confirm before changing the
 * user-visible text. */
1404 log_error("Failed to reset audit login UID. This probably means that your kernel is too\n"
1405 "old and you have audit enabled. Note that the auditing subsystem is known to\n"
1406 "be incompatible with containers on old kernels. Please make sure to upgrade\n"
1407 "your kernel or to off auditing with 'audit=0' on the kernel command line before\n"
1408 "using systemd-nspawn. Sleeping for 5s... (%s)\n", strerror(-r));
/* Fixed 128-bit key passed to siphash24() in get_mac(). */
1416 #define HASH_KEY SD_ID128_MAKE(c3,c4,f9,19,b5,57,b2,1c,e6,cf,14,27,03,9c,ee,a2)
/* Derives a MAC address from the host's machine ID and the container
 * name. The machine ID followed by the bytes of arg_machine is hashed
 * with siphash24() under HASH_KEY, and the first ETH_ALEN bytes of the
 * result are written to mac. The multicast bit is then cleared and the
 * locally-assigned bit set. The allocation of the buffer v and the
 * return statements are not visible in this excerpt. */
1418 static int get_mac(struct ether_addr *mac) {
1425 l = strlen(arg_machine);
1426 sz = sizeof(sd_id128_t) + l;
1429 /* fetch some persistent data unique to the host */
1430 r = sd_id128_get_machine((sd_id128_t*) v);
1434 /* combine with some data unique (on this host) to this
1435 * container instance */
1436 memcpy(v + sizeof(sd_id128_t), arg_machine, l);
1438 /* Let's hash the host machine ID plus the container name. We
1439 * use a fixed, but originally randomly created hash key here. */
1440 siphash24(result, v, sz, HASH_KEY.bytes);
1442 assert_cc(ETH_ALEN <= sizeof(result));
1443 memcpy(mac->ether_addr_octet, result, ETH_ALEN);
1445 /* see eth_random_addr in the kernel */
1446 mac->ether_addr_octet[0] &= 0xfe; /* clear multicast bit */
1447 mac->ether_addr_octet[0] |= 0x02; /* set local assignment bit (IEEE802) */
/* Creates a veth pair through rtnetlink. The host-side interface name is
 * written to iface_name as "vb-" (bridge mode) or "ve-" followed by the
 * machine name; the peer is named host0, receives the MAC address stored
 * in mac and is assigned to the network namespace of pid
 * (IFLA_NET_NS_PID). Checks on arg_private_network and arg_network_veth
 * come first; the statements they guard are not visible in this excerpt
 * (presumably early returns -- confirm), nor is the call that fills
 * mac. */
1452 static int setup_veth(pid_t pid, char iface_name[IFNAMSIZ]) {
1453 _cleanup_rtnl_message_unref_ sd_rtnl_message *m = NULL;
1454 _cleanup_rtnl_unref_ sd_rtnl *rtnl = NULL;
1455 struct ether_addr mac;
1458 if (!arg_private_network)
1461 if (!arg_network_veth)
1464 /* Use two different interface name prefixes depending whether
1465 * we are in bridge mode or not. */
1466 if (arg_network_bridge)
1467 memcpy(iface_name, "vb-", 3);
1469 memcpy(iface_name, "ve-", 3);
/* NOTE(review): strncpy() does not NUL-terminate when arg_machine has
 * IFNAMSIZ - 3 or more characters -- confirm how the caller's buffer
 * guarantees termination. */
1470 strncpy(iface_name+3, arg_machine, IFNAMSIZ - 3);
1474 log_error("Failed to generate predictable MAC address for host0");
1478 r = sd_rtnl_open(&rtnl, 0);
1480 log_error("Failed to connect to netlink: %s", strerror(-r));
1484 r = sd_rtnl_message_new_link(rtnl, &m, RTM_NEWLINK, 0);
1486 log_error("Failed to allocate netlink message: %s", strerror(-r));
1490 r = sd_rtnl_message_append_string(m, IFLA_IFNAME, iface_name);
1492 log_error("Failed to add netlink interface name: %s", strerror(-r));
1496 r = sd_rtnl_message_open_container(m, IFLA_LINKINFO);
1498 log_error("Failed to open netlink container: %s", strerror(-r));
1502 r = sd_rtnl_message_open_container_union(m, IFLA_INFO_DATA, "veth");
1504 log_error("Failed to open netlink container: %s", strerror(-r));
1508 r = sd_rtnl_message_open_container(m, VETH_INFO_PEER);
1510 log_error("Failed to open netlink container: %s", strerror(-r));
1514 r = sd_rtnl_message_append_string(m, IFLA_IFNAME, "host0");
1516 log_error("Failed to add netlink interface name: %s", strerror(-r));
1520 r = sd_rtnl_message_append_ether_addr(m, IFLA_ADDRESS, &mac);
1522 log_error("Failed to add netlink MAC address: %s", strerror(-r));
1526 r = sd_rtnl_message_append_u32(m, IFLA_NET_NS_PID, pid);
1528 log_error("Failed to add netlink namespace field: %s", strerror(-r));
1532 r = sd_rtnl_message_close_container(m);
1534 log_error("Failed to close netlink container: %s", strerror(-r));
1538 r = sd_rtnl_message_close_container(m);
1540 log_error("Failed to close netlink container: %s", strerror(-r));
1544 r = sd_rtnl_message_close_container(m);
1546 log_error("Failed to close netlink container: %s", strerror(-r));
1550 r = sd_rtnl_call(rtnl, m, 0, NULL);
1552 log_error("Failed to add new veth interfaces: %s", strerror(-r));
/* Attaches the host-side veth interface veth_name to the bridge named by
 * arg_network_bridge. The bridge's interface index is resolved with
 * if_nametoindex(), and an RTM_SETLINK message is sent that sets IFF_UP
 * on the interface and sets IFLA_MASTER to the bridge index. Checks on
 * arg_private_network, arg_network_veth and arg_network_bridge come
 * first; the statements they guard are not visible in this excerpt
 * (presumably early returns -- confirm). */
1559 static int setup_bridge(const char veth_name[]) {
1560 _cleanup_rtnl_message_unref_ sd_rtnl_message *m = NULL;
1561 _cleanup_rtnl_unref_ sd_rtnl *rtnl = NULL;
1564 if (!arg_private_network)
1567 if (!arg_network_veth)
1570 if (!arg_network_bridge)
1573 bridge = (int) if_nametoindex(arg_network_bridge);
1575 log_error("Failed to resolve interface %s: %m", arg_network_bridge);
1579 r = sd_rtnl_open(&rtnl, 0);
1581 log_error("Failed to connect to netlink: %s", strerror(-r));
1585 r = sd_rtnl_message_new_link(rtnl, &m, RTM_SETLINK, 0);
1587 log_error("Failed to allocate netlink message: %s", strerror(-r));
1591 r = sd_rtnl_message_link_set_flags(m, IFF_UP, IFF_UP);
1593 log_error("Failed to set IFF_UP flag: %s", strerror(-r));
1597 r = sd_rtnl_message_append_string(m, IFLA_IFNAME, veth_name);
1599 log_error("Failed to add netlink interface name field: %s", strerror(-r));
1603 r = sd_rtnl_message_append_u32(m, IFLA_MASTER, bridge);
1605 log_error("Failed to add netlink master field: %s", strerror(-r));
1609 r = sd_rtnl_call(rtnl, m, 0, NULL);
1611 log_error("Failed to add veth interface to bridge: %s", strerror(-r));
/* Resolves the network interface name to its index with
 * if_nametoindex() and looks up the matching udev device through the
 * device ID "n<ifindex>". Interfaces that udev does not report as
 * initialized are rejected with an error message. The return statements
 * are not visible in this excerpt; callers store the result as the
 * interface index. */
1618 static int parse_interface(struct udev *udev, const char *name) {
1619 _cleanup_udev_device_unref_ struct udev_device *d = NULL;
1620 char ifi_str[2 + DECIMAL_STR_MAX(int)];
1623 ifi = (int) if_nametoindex(name);
1625 log_error("Failed to resolve interface %s: %m", name);
1629 sprintf(ifi_str, "n%i", ifi);
1630 d = udev_device_new_from_device_id(udev, ifi_str);
1632 log_error("Failed to get udev device for interface %s: %m", name);
1636 if (udev_device_get_is_initialized(d) <= 0) {
1637 log_error("Network interface %s is not initialized yet.", name);
/* For every name in arg_network_interfaces: resolve it with
 * parse_interface(), build an RTM_NEWLINK message for that interface index,
 * attach IFLA_NET_NS_PID = pid and send it, which asks the kernel to move
 * the interface into the network namespace of process pid.
 * Guarded by arg_private_network and by an empty interface list. */
1644 static int move_network_interfaces(pid_t pid) {
1645 _cleanup_udev_unref_ struct udev *udev = NULL;
1646 _cleanup_rtnl_unref_ sd_rtnl *rtnl = NULL;
1650 if (!arg_private_network)
1653 if (strv_isempty(arg_network_interfaces))
1656 r = sd_rtnl_open(&rtnl, 0);
1658 log_error("Failed to connect to netlink: %s", strerror(-r));
1664 log_error("Failed to connect to udev.");
1668 STRV_FOREACH(i, arg_network_interfaces) {
/* A fresh message per interface; released automatically per iteration */
1669 _cleanup_rtnl_message_unref_ sd_rtnl_message *m = NULL;
1672 ifi = parse_interface(udev, *i);
1676 r = sd_rtnl_message_new_link(rtnl, &m, RTM_NEWLINK, ifi);
1678 log_error("Failed to allocate netlink message: %s", strerror(-r));
1682 r = sd_rtnl_message_append_u32(m, IFLA_NET_NS_PID, pid);
1684 log_error("Failed to append namespace PID to netlink message: %s", strerror(-r));
1688 r = sd_rtnl_call(rtnl, m, 0, NULL);
1690 log_error("Failed to move interface %s to namespace: %s", *i, strerror(-r));
/* For every name in arg_network_macvlan, creates a macvlan link on top of
 * that interface with one RTM_NEWLINK request:
 *   - IFLA_LINK        the interface index resolved by parse_interface()
 *   - IFLA_IFNAME      "mv-" + name, shortened to IFNAMSIZ-1 characters
 *   - IFLA_NET_NS_PID  pid, so the new link is placed in that process'
 *                      network namespace
 *   - IFLA_LINKINFO / IFLA_INFO_DATA "macvlan" with IFLA_MACVLAN_MODE set
 *     to MACVLAN_MODE_BRIDGE
 * Guarded by arg_private_network and by an empty macvlan list. */
1698 static int setup_macvlan(pid_t pid) {
1699 _cleanup_udev_unref_ struct udev *udev = NULL;
1700 _cleanup_rtnl_unref_ sd_rtnl *rtnl = NULL;
1704 if (!arg_private_network)
1707 if (strv_isempty(arg_network_macvlan))
1710 r = sd_rtnl_open(&rtnl, 0);
1712 log_error("Failed to connect to netlink: %s", strerror(-r));
1718 log_error("Failed to connect to udev.");
1722 STRV_FOREACH(i, arg_network_macvlan) {
1723 _cleanup_rtnl_message_unref_ sd_rtnl_message *m = NULL;
1724 _cleanup_free_ char *n = NULL;
1727 ifi = parse_interface(udev, *i);
/* Index 0: a new link is being created, not an existing one modified */
1731 r = sd_rtnl_message_new_link(rtnl, &m, RTM_NEWLINK, 0);
1733 log_error("Failed to allocate netlink message: %s", strerror(-r));
1737 r = sd_rtnl_message_append_u32(m, IFLA_LINK, ifi);
1739 log_error("Failed to add netlink interface index: %s", strerror(-r));
/* Derive the new link's name from the parent name and cut it to fit */
1743 n = strappend("mv-", *i);
1747 strshorten(n, IFNAMSIZ-1);
1749 r = sd_rtnl_message_append_string(m, IFLA_IFNAME, n);
1751 log_error("Failed to add netlink interface name: %s", strerror(-r));
1755 r = sd_rtnl_message_append_u32(m, IFLA_NET_NS_PID, pid);
1757 log_error("Failed to add netlink namespace field: %s", strerror(-r));
1761 r = sd_rtnl_message_open_container(m, IFLA_LINKINFO);
1763 log_error("Failed to open netlink container: %s", strerror(-r));
1767 r = sd_rtnl_message_open_container_union(m, IFLA_INFO_DATA, "macvlan");
1769 log_error("Failed to open netlink container: %s", strerror(-r));
1773 r = sd_rtnl_message_append_u32(m, IFLA_MACVLAN_MODE, MACVLAN_MODE_BRIDGE);
1775 log_error("Failed to append macvlan mode: %s", strerror(-r));
/* Two containers were opened above, so two are closed here */
1779 r = sd_rtnl_message_close_container(m);
1781 log_error("Failed to close netlink container: %s", strerror(-r));
1785 r = sd_rtnl_message_close_container(m);
1787 log_error("Failed to close netlink container: %s", strerror(-r));
1791 r = sd_rtnl_call(rtnl, m, 0, NULL);
1793 log_error("Failed to add new macvlan interfaces: %s", strerror(-r));
/* Installs a seccomp filter that allows everything by default but makes one
 * call fail with EAFNOSUPPORT: the one whose first argument equals
 * AF_NETLINK and whose third argument equals NETLINK_AUDIT. The
 * SCMP_FLTATR_CTL_NNP attribute is set to 0 before the filter is loaded,
 * and the filter context is released afterwards. */
1801 static int audit_still_doesnt_work_in_containers(void) {
1804 scmp_filter_ctx seccomp;
1808 Audit is broken in containers, much of the userspace audit
1809 hookup will fail if running inside a container. We don't
1810 care and just turn off creation of audit sockets.
1812 This will make socket(AF_NETLINK, *, NETLINK_AUDIT) fail
1813 with EAFNOSUPPORT which audit userspace uses as indication
1814 that audit is disabled in the kernel.
/* Default action: allow, so only the explicit rule below restricts anything */
1817 seccomp = seccomp_init(SCMP_ACT_ALLOW);
1821 r = seccomp_add_secondary_archs(seccomp);
1823 log_error("Failed to add secondary archs to seccomp filter: %s", strerror(-r));
1827 r = seccomp_rule_add(
1829 SCMP_ACT_ERRNO(EAFNOSUPPORT),
1832 SCMP_A0(SCMP_CMP_EQ, AF_NETLINK),
1833 SCMP_A2(SCMP_CMP_EQ, NETLINK_AUDIT));
1835 log_error("Failed to add audit seccomp rule: %s", strerror(-r));
1839 r = seccomp_attr_set(seccomp, SCMP_FLTATR_CTL_NNP, 0);
1841 log_error("Failed to unset NO_NEW_PRIVS: %s", strerror(-r));
1845 r = seccomp_load(seccomp);
1847 log_error("Failed to install seccomp audit filter: %s", strerror(-r));
1850 seccomp_release(seccomp);
/* Opens arg_image (read-only when arg_read_only is set) and prepares a block
 * device for it. A block device image has its path duplicated for the
 * caller. A regular file is attached to a free loop device obtained from
 * /dev/loop-control (LOOP_CTL_GET_FREE, LOOP_SET_FD, LOOP_SET_STATUS64 with
 * LO_FLAGS_AUTOCLEAR|LO_FLAGS_PARTSCAN), and the loop device path is handed
 * out through *device_path. Anything else is rejected.
 * main() stores the return value as the image file descriptor. */
1858 static int setup_image(char **device_path, int *loop_nr) {
1859 struct loop_info64 info = {
1860 .lo_flags = LO_FLAGS_AUTOCLEAR|LO_FLAGS_PARTSCAN
1862 _cleanup_close_ int fd = -1, control = -1, loop = -1;
1863 _cleanup_free_ char* loopdev = NULL;
1867 assert(device_path);
1870 fd = open(arg_image, O_CLOEXEC|(arg_read_only ? O_RDONLY : O_RDWR)|O_NONBLOCK|O_NOCTTY);
1872 log_error("Failed to open %s: %m", arg_image);
1876 if (fstat(fd, &st) < 0) {
1877 log_error("Failed to stat %s: %m", arg_image);
/* Block devices are used directly, no loop device is needed */
1881 if (S_ISBLK(st.st_mode)) {
1884 p = strdup(arg_image);
1898 if (!S_ISREG(st.st_mode)) {
/* No system call failed here, so errno is meaningless: do not print %m */
1899 log_error("%s is not a regular file or block device.", arg_image);
1903 control = open("/dev/loop-control", O_RDWR|O_CLOEXEC|O_NOCTTY|O_NONBLOCK);
1905 log_error("Failed to open /dev/loop-control: %m");
/* Ask the kernel for the number of an unused loop device */
1909 nr = ioctl(control, LOOP_CTL_GET_FREE);
1911 log_error("Failed to allocate loop device: %m");
1915 if (asprintf(&loopdev, "/dev/loop%i", nr) < 0)
1918 loop = open(loopdev, O_CLOEXEC|(arg_read_only ? O_RDONLY : O_RDWR)|O_NONBLOCK|O_NOCTTY);
1920 log_error("Failed to open loop device %s: %m", loopdev);
/* Bind the image file to the loop device */
1924 if (ioctl(loop, LOOP_SET_FD, fd) < 0) {
1925 log_error("Failed to set loopback file descriptor on %s: %m", loopdev);
1930 info.lo_flags |= LO_FLAGS_READ_ONLY;
1932 if (ioctl(loop, LOOP_SET_STATUS64, &info) < 0) {
1933 log_error("Failed to set loopback settings on %s: %m", loopdev);
1937 *device_path = loopdev;
/* Inspects the block device open on fd and picks the partitions to mount.
 *
 * libblkid is used to probe for a partition table; anything other than
 * PTTYPE "gpt" is rejected. The partitions are then enumerated as udev
 * children of the block device. Partitions flagged GPT_FLAG_NO_AUTO are
 * skipped, the rest are classified by GPT type UUID (GPT_HOME, GPT_SRV and,
 * where defined, GPT_ROOT_NATIVE and GPT_ROOT_SECONDARY). For each match
 * the device node and whether GPT_FLAG_READ_ONLY is clear are recorded.
 *
 * Results are handed out through root_device/root_device_rw (the secondary
 * root is used in the branch taken when secondary_root is set),
 * home_device/home_device_rw and srv_device_rw. It is an error if neither a
 * root nor a secondary root partition was found.
 *
 * When built without blkid support only an error is logged. */
1948 static int dissect_image(
1950 char **root_device, bool *root_device_rw,
1951 char **home_device, bool *home_device_rw,
1952 char **srv_device, bool *srv_device_rw,
1956 int home_nr = -1, root_nr = -1, secondary_root_nr = -1, srv_nr = -1;
1957 _cleanup_free_ char *home = NULL, *root = NULL, *secondary_root = NULL, *srv = NULL;
1958 _cleanup_udev_enumerate_unref_ struct udev_enumerate *e = NULL;
1959 _cleanup_udev_device_unref_ struct udev_device *d = NULL;
1960 _cleanup_blkid_free_probe_ blkid_probe b = NULL;
1961 _cleanup_udev_unref_ struct udev *udev = NULL;
1962 struct udev_list_entry *first, *item;
1963 bool home_rw = true, root_rw = true, secondary_root_rw = true, srv_rw = true;
1964 const char *pttype = NULL;
1970 assert(root_device);
1971 assert(home_device);
1975 b = blkid_new_probe();
1980 r = blkid_probe_set_device(b, fd, 0, 0);
1985 log_error("Failed to set device on blkid probe: %m");
/* Probe for partition tables and request per-entry details */
1989 blkid_probe_enable_partitions(b, 1);
1990 blkid_probe_set_partitions_flags(b, BLKID_PARTS_ENTRY_DETAILS);
1993 r = blkid_do_safeprobe(b);
/* -2 and 1 are both reported as "no partition table identified" */
1994 if (r == -2 || r == 1) {
1995 log_error("Failed to identify any partition table on %s.\n"
1996 "Note that the disk image needs to follow http://www.freedesktop.org/wiki/Specifications/DiscoverablePartitionsSpec/ to be supported by systemd-nspawn.", arg_image);
1998 } else if (r != 0) {
2001 log_error("Failed to probe: %m");
2005 blkid_probe_lookup_value(b, "PTTYPE", &pttype, NULL);
2006 if (!streq_ptr(pttype, "gpt")) {
2007 log_error("Image %s does not carry a GUID Partition Table.\n"
2008 "Note that the disk image needs to follow http://www.freedesktop.org/wiki/Specifications/DiscoverablePartitionsSpec/ to be supported by systemd-nspawn.", arg_image);
2013 pl = blkid_probe_get_partitions(b);
2018 log_error("Failed to list partitions of %s", arg_image);
/* Find the udev device of the whole disk, then enumerate its children */
2026 if (fstat(fd, &st) < 0) {
2027 log_error("Failed to stat block device: %m");
2031 d = udev_device_new_from_devnum(udev, 'b', st.st_rdev);
2035 e = udev_enumerate_new(udev);
2039 r = udev_enumerate_add_match_parent(e, d);
2043 r = udev_enumerate_scan_devices(e);
2045 log_error("Failed to scan for partition devices of %s: %s", arg_image, strerror(-r));
2049 first = udev_enumerate_get_list_entry(e);
2050 udev_list_entry_foreach(item, first) {
2051 _cleanup_udev_device_unref_ struct udev_device *q;
2052 const char *stype, *node;
2053 unsigned long long flags;
2060 q = udev_device_new_from_syspath(udev, udev_list_entry_get_name(item));
2065 log_error("Failed to get partition device of %s: %m", arg_image);
2069 qn = udev_device_get_devnum(q);
/* The enumeration result can contain the whole-disk device itself */
2073 if (st.st_rdev == qn)
2076 node = udev_device_get_devnode(q);
/* Map the udev device number back to libblkid's partition entry */
2080 pp = blkid_partlist_devno_to_partition(pl, qn);
2084 flags = blkid_partition_get_flags(pp);
2085 if (flags & GPT_FLAG_NO_AUTO)
2088 nr = blkid_partition_get_partno(pp);
2092 stype = blkid_partition_get_type_string(pp);
2096 if (sd_id128_from_string(stype, &type_id) < 0)
2099 if (sd_id128_equal(type_id, GPT_HOME)) {
/* NOTE(review): a candidate is compared against the partition number
 * already recorded for this type; presumably the lowest-numbered
 * partition wins - confirm against the full source */
2101 if (home && nr >= home_nr)
2105 home_rw = !(flags & GPT_FLAG_READ_ONLY);
2108 home = strdup(node);
2111 } else if (sd_id128_equal(type_id, GPT_SRV)) {
2113 if (srv && nr >= srv_nr)
2117 srv_rw = !(flags & GPT_FLAG_READ_ONLY);
2124 #ifdef GPT_ROOT_NATIVE
2125 else if (sd_id128_equal(type_id, GPT_ROOT_NATIVE)) {
2127 if (root && nr >= root_nr)
2131 root_rw = !(flags & GPT_FLAG_READ_ONLY);
2134 root = strdup(node);
2139 #ifdef GPT_ROOT_SECONDARY
2140 else if (sd_id128_equal(type_id, GPT_ROOT_SECONDARY)) {
2142 if (secondary_root && nr >= secondary_root_nr)
2145 secondary_root_nr = nr;
2146 secondary_root_rw = !(flags & GPT_FLAG_READ_ONLY);
2149 free(secondary_root);
2150 secondary_root = strdup(node);
2151 if (!secondary_root)
2157 if (!root && !secondary_root) {
2158 log_error("Failed to identify root partition in disk image %s.\n"
2159 "Note that the disk image needs to follow http://www.freedesktop.org/wiki/Specifications/DiscoverablePartitionsSpec/ to be supported by systemd-nspawn.", arg_image);
/* Hand the collected results over to the caller */
2164 *root_device = root;
2167 *root_device_rw = root_rw;
2169 } else if (secondary_root) {
2170 *root_device = secondary_root;
/* Ownership moved to the caller: keep the cleanup handler from freeing it */
2171 secondary_root = NULL;
2173 *root_device_rw = secondary_root_rw;
2178 *home_device = home;
2181 *home_device_rw = home_rw;
2188 *srv_device_rw = srv_rw;
2193 log_error("--image= is not supported, compiled without blkid support.");
/* Mounts the block device what below where.
 *
 * The mount point is built with strappenda(where, directory). The file
 * system type is detected with libblkid superblock probing; images whose
 * type is "crypto_LUKS" are refused. The device is mounted with MS_NODEV,
 * plus MS_RDONLY when rw is false.
 *
 * When built without blkid support only an error is logged. */
2198 static int mount_device(const char *what, const char *where, const char *directory, bool rw) {
2200 _cleanup_blkid_free_probe_ blkid_probe b = NULL;
2201 const char *fstype, *p;
2211 p = strappenda(where, directory);
2216 b = blkid_new_probe_from_filename(what);
2220 log_error("Failed to allocate prober for %s: %m", what);
2224 blkid_probe_enable_superblocks(b, 1);
2225 blkid_probe_set_superblocks_flags(b, BLKID_SUBLKS_TYPE);
2228 r = blkid_do_safeprobe(b);
/* blkid_do_safeprobe() returns 1 when nothing was detected and -2 when the
 * result is ambivalent; both mean the type cannot be determined. -1 is a
 * genuine probing error and belongs to the branch below, matching the
 * check used in dissect_image(). */
2229 if (r == -2 || r == 1) {
2230 log_error("Cannot determine file system type of %s", what);
2232 } else if (r != 0) {
2235 log_error("Failed to probe %s: %m", what);
2240 if (blkid_probe_lookup_value(b, "TYPE", &fstype, NULL) < 0) {
2243 log_error("Failed to determine file system type of %s", what);
2247 if (streq(fstype, "crypto_LUKS")) {
2248 log_error("nspawn currently does not support LUKS disk images.");
2252 if (mount(what, p, fstype, MS_NODEV|(rw ? 0 : MS_RDONLY), NULL) < 0) {
2253 log_error("Failed to mount %s: %m", what);
2259 log_error("--image= is not supported, compiled without blkid support.");
/* Mounts the partitions selected for the container with mount_device(),
 * all relative to arg_directory: the root device at the top, the home
 * device at "/home" and the server data device at "/srv". The *_rw flags
 * are passed through to mount_device(). A failure of any step is logged. */
2264 static int mount_devices(
2266 const char *root_device, bool root_device_rw,
2267 const char *home_device, bool home_device_rw,
2268 const char *srv_device, bool srv_device_rw) {
2274 r = mount_device(root_device, arg_directory, NULL, root_device_rw);
2276 log_error("Failed to mount root directory: %s", strerror(-r));
2282 r = mount_device(home_device, arg_directory, "/home", home_device_rw);
2284 log_error("Failed to mount home directory: %s", strerror(-r));
2290 r = mount_device(srv_device, arg_directory, "/srv", srv_device_rw);
2292 log_error("Failed to mount server data directory: %s", strerror(-r));
/* Best-effort teardown of loop device number nr. If *image_fd refers to an
 * open descriptor, LOOP_CLR_FD is issued on it and it is closed (the
 * variable is overwritten with safe_close()'s result). Afterwards
 * LOOP_CTL_REMOVE is issued on /dev/loop-control for nr. The ioctl results
 * are not checked. */
2300 static void loop_remove(int nr, int *image_fd) {
2301 _cleanup_close_ int control = -1;
2306 if (image_fd && *image_fd >= 0) {
2307 ioctl(*image_fd, LOOP_CLR_FD);
2308 *image_fd = safe_close(*image_fd);
2311 control = open("/dev/loop-control", O_RDWR|O_CLOEXEC|O_NOCTTY|O_NONBLOCK);
2315 ioctl(control, LOOP_CTL_REMOVE, nr);
/* Forks a child that runs "getent <database> <key>" with its standard
 * output connected to a pipe, its standard input and error connected to
 * /dev/null and an empty environment. /usr/bin/getent is tried first, then
 * /bin/getent. In the parent the write end of the pipe is closed.
 * change_uid_gid() uses the return value as a readable file descriptor and
 * waits for the process id stored through rpid. */
2318 static int spawn_getent(const char *database, const char *key, pid_t *rpid) {
2326 if (pipe2(pipe_fds, O_CLOEXEC) < 0) {
2327 log_error("Failed to allocate pipe: %m");
2333 log_error("Failed to fork getent child: %m");
2335 } else if (pid == 0) {
2337 char *empty_env = NULL;
/* Child: route stdout into the pipe */
2339 if (dup3(pipe_fds[1], STDOUT_FILENO, 0) < 0)
2340 _exit(EXIT_FAILURE);
/* Only close the pipe ends if they are not one of the stdio descriptors */
2342 if (pipe_fds[0] > 2)
2343 safe_close(pipe_fds[0]);
2344 if (pipe_fds[1] > 2)
2345 safe_close(pipe_fds[1]);
2347 nullfd = open("/dev/null", O_RDWR);
2349 _exit(EXIT_FAILURE);
2351 if (dup3(nullfd, STDIN_FILENO, 0) < 0)
2352 _exit(EXIT_FAILURE);
2354 if (dup3(nullfd, STDERR_FILENO, 0) < 0)
2355 _exit(EXIT_FAILURE);
2360 reset_all_signal_handlers();
2361 close_all_fds(NULL, 0);
/* The second exec is only reached if the first one failed */
2363 execle("/usr/bin/getent", "getent", database, key, NULL, &empty_env);
2364 execle("/bin/getent", "getent", database, key, NULL, &empty_env);
2365 _exit(EXIT_FAILURE);
/* Parent: the write end belongs to the child only */
2368 pipe_fds[1] = safe_close(pipe_fds[1]);
/* Switches the calling process to the user named by arg_user.
 *
 * If arg_user is unset, "root" or "0", the supplementary groups are cleared
 * and all group and user IDs are reset to 0.
 *
 * Otherwise "getent passwd <user>" is run through spawn_getent() and its
 * colon-separated output line is split to obtain UID, GID and home
 * directory; "getent initgroups <user>" supplies the supplementary group
 * list. The home directory is then created, the standard file descriptors
 * are chowned to the user (best effort, results ignored), and finally
 * setgroups(), setresgid() and setresuid() are applied in that order.
 *
 * NOTE(review): how the home directory is handed back through _home is not
 * visible here; main() passes &home and later uses it for HOME= - confirm. */
2375 static int change_uid_gid(char **_home) {
2376 char line[LINE_MAX], *w, *x, *state, *u, *g, *h;
2377 _cleanup_free_ uid_t *uids = NULL;
2378 _cleanup_free_ char *home = NULL;
2379 _cleanup_fclose_ FILE *f = NULL;
2380 _cleanup_close_ int fd = -1;
2381 unsigned n_uids = 0;
2390 if (!arg_user || streq(arg_user, "root") || streq(arg_user, "0")) {
2391 /* Reset everything fully to 0, just in case */
2393 if (setgroups(0, NULL) < 0) {
2394 log_error("setgroups() failed: %m");
2398 if (setresgid(0, 0, 0) < 0) {
2399 log_error("setresgid() failed: %m");
2403 if (setresuid(0, 0, 0) < 0) {
2404 log_error("setresuid() failed: %m");
2412 /* First, get user credentials */
2413 fd = spawn_getent("passwd", arg_user, &pid);
2417 f = fdopen(fd, "r");
2422 if (!fgets(line, sizeof(line), f)) {
2425 log_error("Failed to resolve user %s.", arg_user);
2429 log_error("Failed to read from getent: %m");
2435 wait_for_terminate_and_warn("getent passwd", pid);
/* Walk the passwd line field by field, using ':' as separator */
2437 x = strchr(line, ':');
2439 log_error("/etc/passwd entry has invalid user field.");
2443 u = strchr(x+1, ':');
2445 log_error("/etc/passwd entry has invalid password field.");
2452 log_error("/etc/passwd entry has invalid UID field.");
2460 log_error("/etc/passwd entry has invalid GID field.");
2465 h = strchr(x+1, ':');
2467 log_error("/etc/passwd entry has invalid GECOS field.");
2474 log_error("/etc/passwd entry has invalid home directory field.");
2480 r = parse_uid(u, &uid);
2482 log_error("Failed to parse UID of user.");
2486 r = parse_gid(g, &gid);
2488 log_error("Failed to parse GID of user.");
2496 /* Second, get group memberships */
2497 fd = spawn_getent("initgroups", arg_user, &pid);
2502 f = fdopen(fd, "r");
2507 if (!fgets(line, sizeof(line), f)) {
2509 log_error("Failed to resolve user %s.", arg_user);
2513 log_error("Failed to read from getent: %m");
2519 wait_for_terminate_and_warn("getent initgroups", pid);
2521 /* Skip over the username and subsequent separator whitespace */
2523 x += strcspn(x, WHITESPACE);
2524 x += strspn(x, WHITESPACE);
/* Despite its name, uids collects the IDs later passed to setgroups() */
2526 FOREACH_WORD(w, l, x, state) {
2532 if (!GREEDY_REALLOC(uids, sz, n_uids+1))
2535 r = parse_uid(c, &uids[n_uids++]);
2537 log_error("Failed to parse group data from getent.");
2542 r = mkdir_parents(home, 0775);
2544 log_error("Failed to make home root directory: %s", strerror(-r));
2548 r = mkdir_safe(home, 0755, uid, gid);
/* An already existing home directory is fine */
2549 if (r < 0 && r != -EEXIST) {
2550 log_error("Failed to make home directory: %s", strerror(-r));
/* Best effort: give the user ownership of the stdio descriptors */
2554 fchown(STDIN_FILENO, uid, gid);
2555 fchown(STDOUT_FILENO, uid, gid);
2556 fchown(STDERR_FILENO, uid, gid);
/* Groups first, then GID, then UID: each later step drops the privilege
 * the earlier ones need */
2558 if (setgroups(n_uids, uids) < 0) {
2559 log_error("Failed to set auxiliary groups: %m");
2563 if (setresgid(gid, gid, gid) < 0) {
2564 log_error("setresgid() failed: %m");
2568 if (setresuid(uid, uid, uid) < 0) {
2569 log_error("setresuid() failed: %m");
2582 * Return 0 in case the container is being rebooted, has been shut
2583 * down or exited successfully. On failures a negative value is
2586 * The status of the container "CONTAINER_TERMINATED" or
2587 * "CONTAINER_REBOOTED" will be saved in the container argument
/* Waits for process pid with wait_for_terminate() and interprets the
 * resulting status by si_code and si_status:
 *   - an exit is logged as either successful or failed with the exit code;
 *   - termination by SIGINT is logged as a shutdown and stores
 *     CONTAINER_TERMINATED in *container;
 *   - termination by SIGHUP is logged as a reboot and stores
 *     CONTAINER_REBOOTED in *container;
 *   - any other signal is logged as termination by that signal;
 *   - anything else is logged as failure for an unknown reason. */
2589 static int wait_for_container(pid_t pid, ContainerStatus *container) {
2593 r = wait_for_terminate(pid, &status);
2597 switch (status.si_code) {
2599 r = status.si_status;
2602 log_debug("Container %s exited successfully.",
2605 *container = CONTAINER_TERMINATED;
2607 log_error("Container %s failed with error code %i.",
2608 arg_machine, status.si_status);
2614 if (status.si_status == SIGINT) {
2616 log_info("Container %s has been shut down.",
2619 *container = CONTAINER_TERMINATED;
2622 } else if (status.si_status == SIGHUP) {
2624 log_info("Container %s is being rebooted.",
2627 *container = CONTAINER_REBOOTED;
2631 /* CLD_KILLED fallthrough */
2634 log_error("Container %s terminated by signal %s.",
2635 arg_machine, signal_to_string(status.si_status));
2640 log_error("Container %s failed due to unknown reason.",
/* Signal handler that deliberately does nothing. main() installs it for
 * SIGCHLD so that the signal interrupts blocking calls instead of being
 * ignored. */
2649 static void nop_handler(int sig) {}
/* Program entry point: spawns a container and supervises it.
 *
 * Visible sequence:
 *   1. Parse the command line, normalize arg_directory and derive
 *      arg_machine from the image or directory name if needed.
 *   2. Require root privileges and a systemd-booted host; collect passed
 *      file descriptors.
 *   3. Validate the container directory, or (for an image) create a
 *      temporary root directory, set up the image device and dissect it.
 *   4. Allocate a pseudo tty, optionally a kdbus domain, and a kmsg socket
 *      pair; block the signals that are handled later.
 *   5. Per container run: clone a child into new namespaces. The child
 *      builds the container file system, changes root, drops privileges and
 *      executes the payload. The parent registers the machine, sets up
 *      networking, forwards the pty and waits for the container.
 *      CONTAINER_REBOOTED starts another run.
 *   6. Remove the loop device and free the argument storage. */
2651 int main(int argc, char *argv[]) {
2653 _cleanup_free_ char *kdbus_domain = NULL, *device_path = NULL, *root_device = NULL, *home_device = NULL, *srv_device = NULL;
2654 bool root_device_rw = true, home_device_rw = true, srv_device_rw = true;
2655 _cleanup_close_ int master = -1, kdbus_fd = -1, image_fd = -1;
2656 _cleanup_close_pair_ int kmsg_socket_pair[2] = { -1, -1 };
2657 _cleanup_fdset_free_ FDSet *fds = NULL;
2658 int r = EXIT_FAILURE, k, n_fd_passed, loop_nr = -1;
2659 const char *console = NULL;
2660 char veth_name[IFNAMSIZ];
2661 bool secondary = false;
2662 sigset_t mask, mask_chld;
2665 log_parse_environment();
2668 k = parse_argv(argc, argv);
/* Turn the container directory into an absolute, normalized path */
2677 if (arg_directory) {
2680 p = path_make_absolute_cwd(arg_directory);
2681 free(arg_directory);
2684 arg_directory = get_current_dir_name();
2686 if (!arg_directory) {
2687 log_error("Failed to determine path, please use -D.");
2690 path_kill_slashes(arg_directory);
/* Derive the machine name from the last path component */
2694 arg_machine = strdup(basename(arg_image ? arg_image : arg_directory));
2700 hostname_cleanup(arg_machine, false);
2701 if (isempty(arg_machine)) {
2702 log_error("Failed to determine machine name automatically, please use -M.");
2707 if (geteuid() != 0) {
2708 log_error("Need to be root.");
2712 if (sd_booted() <= 0) {
2713 log_error("Not running on a systemd system.");
/* Keep the passed file descriptors and close everything else */
2718 n_fd_passed = sd_listen_fds(false);
2719 if (n_fd_passed > 0) {
2720 k = fdset_new_listen_fds(&fds, false);
2722 log_error("Failed to collect file descriptors: %s", strerror(-k));
2726 fdset_close_others(fds);
2729 if (arg_directory) {
2730 if (path_equal(arg_directory, "/")) {
2731 log_error("Spawning container on root directory not supported.");
2736 if (path_is_os_tree(arg_directory) <= 0) {
2737 log_error("Directory %s doesn't look like an OS root directory (/etc/os-release is missing). Refusing.", arg_directory);
/* Check for the binary to execute if an absolute one was given, else /usr/bin/ */
2743 p = strappenda(arg_directory,
2744 argc > optind && path_is_absolute(argv[optind]) ? argv[optind] : "/usr/bin/");
2745 if (access(p, F_OK) < 0) {
2746 log_error("Directory %s lacks the binary to execute or doesn't look like a binary tree. Refusing.", arg_directory);
/* Image mode: the image is mounted onto a fresh temporary directory */
2752 char template[] = "/tmp/nspawn-root-XXXXXX";
2754 if (!mkdtemp(template)) {
2755 log_error("Failed to create temporary directory: %m");
2760 arg_directory = strdup(template);
2761 if (!arg_directory) {
2766 image_fd = setup_image(&device_path, &loop_nr);
2772 r = dissect_image(image_fd, &root_device, &root_device_rw, &home_device, &home_device_rw, &srv_device, &srv_device_rw, &secondary);
/* Allocate the pseudo tty that becomes the container's console */
2777 master = posix_openpt(O_RDWR|O_NOCTTY|O_CLOEXEC|O_NDELAY);
2779 log_error("Failed to acquire pseudo tty: %m");
2783 console = ptsname(master);
2785 log_error("Failed to determine tty name: %m");
2790 log_info("Spawning container %s on %s.\nPress ^] three times within 1s to kill container.",
2791 arg_machine, arg_image ? arg_image : arg_directory);
2793 if (unlockpt(master) < 0) {
2794 log_error("Failed to unlock tty: %m");
2798 if (access("/dev/kdbus/control", F_OK) >= 0) {
2800 if (arg_share_system) {
2801 kdbus_domain = strdup("/dev/kdbus");
2802 if (!kdbus_domain) {
2809 ns = strappenda("machine-", arg_machine);
2810 kdbus_fd = bus_kernel_create_domain(ns, &kdbus_domain);
/* NOTE(review): the result of bus_kernel_create_domain() is stored in
 * kdbus_fd, yet the message below formats r - confirm which variable is
 * meant */
2812 log_debug("Failed to create kdbus domain: %s", strerror(-r));
2814 log_debug("Successfully created kdbus domain as %s", kdbus_domain);
2818 if (socketpair(AF_UNIX, SOCK_DGRAM|SOCK_NONBLOCK|SOCK_CLOEXEC, 0, kmsg_socket_pair) < 0) {
2819 log_error("Failed to create kmsg socket pair: %m");
2823 sd_notify(0, "READY=1");
/* Block the signals that are handled explicitly later on */
2825 assert_se(sigemptyset(&mask) == 0);
2826 assert_se(sigemptyset(&mask_chld) == 0);
2827 sigaddset(&mask_chld, SIGCHLD);
2828 sigset_add_many(&mask, SIGCHLD, SIGWINCH, SIGTERM, SIGINT, -1);
2829 assert_se(sigprocmask(SIG_BLOCK, &mask, NULL) == 0);
2832 ContainerStatus container_status;
2833 int eventfds[2] = { -1, -1 };
2834 struct sigaction sa = {
2835 .sa_handler = nop_handler,
2836 .sa_flags = SA_NOCLDSTOP,
2839 /* Child can be killed before execv(), so handle SIGCHLD
2840 * in order to interrupt parent's blocking calls and
2841 * give it a chance to call wait() and terminate. */
2842 r = sigprocmask(SIG_UNBLOCK, &mask_chld, NULL);
2844 log_error("Failed to change the signal mask: %m");
2848 r = sigaction(SIGCHLD, &sa, NULL);
2850 log_error("Failed to install SIGCHLD handler: %m");
/* A mount namespace is always created; the others depend on the options */
2854 pid = clone_with_eventfd(SIGCHLD|CLONE_NEWNS|
2855 (arg_share_system ? 0 : CLONE_NEWIPC|CLONE_NEWPID|CLONE_NEWUTS)|
2856 (arg_private_network ? CLONE_NEWNET : 0), eventfds);
2858 if (errno == EINVAL)
2859 log_error("clone() failed, do you have namespace support enabled in your kernel? (You need UTS, IPC, PID and NET namespacing built in): %m");
2861 log_error("clone() failed: %m");
2869 _cleanup_free_ char *home = NULL;
2871 const char *envp[] = {
2872 "PATH=" DEFAULT_PATH_SPLIT_USR,
2873 "container=systemd-nspawn", /* LXC sets container=lxc, so follow the scheme here */
2878 NULL, /* container_uuid */
2879 NULL, /* LISTEN_FDS */
2880 NULL, /* LISTEN_PID */
2885 envp[n_env] = strv_find_prefix(environ, "TERM=");
/* Child: release what belongs to the parent and reopen stdio on the console */
2889 master = safe_close(master);
2891 close_nointr(STDIN_FILENO);
2892 close_nointr(STDOUT_FILENO);
2893 close_nointr(STDERR_FILENO);
2895 kmsg_socket_pair[0] = safe_close(kmsg_socket_pair[0]);
2897 reset_all_signal_handlers();
2899 assert_se(sigemptyset(&mask) == 0);
2900 assert_se(sigprocmask(SIG_SETMASK, &mask, NULL) == 0);
/* With fds 0-2 closed, the console is expected to land on STDIN_FILENO */
2902 k = open_terminal(console, O_RDWR);
2903 if (k != STDIN_FILENO) {
2909 log_error("Failed to open console: %s", strerror(-k));
2913 if (dup2(STDIN_FILENO, STDOUT_FILENO) != STDOUT_FILENO ||
2914 dup2(STDIN_FILENO, STDERR_FILENO) != STDERR_FILENO) {
2915 log_error("Failed to duplicate console: %m");
2920 log_error("setsid() failed: %m");
2924 if (reset_audit_loginuid() < 0)
/* Make sure the child is killed when the parent dies */
2927 if (prctl(PR_SET_PDEATHSIG, SIGKILL) < 0) {
2928 log_error("PR_SET_PDEATHSIG failed: %m");
2932 /* Mark everything as slave, so that we still
2933 * receive mounts from the real root, but don't
2934 * propagate mounts to the real root. */
2935 if (mount(NULL, "/", NULL, MS_SLAVE|MS_REC, NULL) < 0) {
2936 log_error("MS_SLAVE|MS_REC failed: %m");
2940 if (mount_devices(arg_directory,
2941 root_device, root_device_rw,
2942 home_device, home_device_rw,
2943 srv_device, srv_device_rw) < 0)
2946 /* Turn directory into bind mount */
2947 if (mount(arg_directory, arg_directory, "bind", MS_BIND|MS_REC, NULL) < 0) {
2948 log_error("Failed to make bind mount: %m");
2952 if (arg_read_only) {
2953 k = bind_remount_recursive(arg_directory, true);
2955 log_error("Failed to make tree read-only: %s", strerror(-k));
/* Populate the container tree: API mounts, device nodes, console, etc. */
2960 if (mount_all(arg_directory) < 0)
2963 if (copy_devnodes(arg_directory) < 0)
2966 if (setup_ptmx(arg_directory) < 0)
2969 dev_setup(arg_directory);
2971 if (audit_still_doesnt_work_in_containers() < 0)
2974 if (setup_dev_console(arg_directory, console) < 0)
2977 if (setup_kmsg(arg_directory, kmsg_socket_pair[1]) < 0)
2980 kmsg_socket_pair[1] = safe_close(kmsg_socket_pair[1]);
2982 if (setup_boot_id(arg_directory) < 0)
2985 if (setup_timezone(arg_directory) < 0)
2988 if (setup_resolv_conf(arg_directory) < 0)
2991 if (setup_journal(arg_directory) < 0)
2994 if (mount_binds(arg_directory, arg_bind, false) < 0)
2997 if (mount_binds(arg_directory, arg_bind_ro, true) < 0)
3000 if (setup_kdbus(arg_directory, kdbus_domain) < 0)
3003 /* Tell the parent that we are ready, and that
3004 * it can cgroupify us so that we lack access
3005 * to certain devices and resources. */
3006 r = eventfd_send_state(eventfds[1],
3007 EVENTFD_CHILD_SUCCEEDED);
3008 eventfds[1] = safe_close(eventfds[1]);
/* Switch the root: move the container tree onto "/" and chroot into it */
3012 if (chdir(arg_directory) < 0) {
3013 log_error("chdir(%s) failed: %m", arg_directory);
3017 if (mount(arg_directory, "/", NULL, MS_MOVE, NULL) < 0) {
3018 log_error("mount(MS_MOVE) failed: %m");
3022 if (chroot(".") < 0) {
3023 log_error("chroot() failed: %m");
3027 if (chdir("/") < 0) {
3028 log_error("chdir() failed: %m");
3034 if (arg_private_network)
3037 if (drop_capabilities() < 0) {
3038 log_error("drop_capabilities() failed: %m");
3042 r = change_uid_gid(&home);
/* Fill the remaining environment slots reserved in envp[] */
3046 if ((asprintf((char**)(envp + n_env++), "HOME=%s", home ? home: "/root") < 0) ||
3047 (asprintf((char**)(envp + n_env++), "USER=%s", arg_user ? arg_user : "root") < 0) ||
3048 (asprintf((char**)(envp + n_env++), "LOGNAME=%s", arg_user ? arg_user : "root") < 0)) {
3053 if (!sd_id128_equal(arg_uuid, SD_ID128_NULL)) {
3056 if (asprintf((char**)(envp + n_env++), "container_uuid=%s", id128_format_as_uuid(arg_uuid, as_uuid)) < 0) {
/* Passed file descriptors must survive the exec; the payload is told
 * LISTEN_PID=1 */
3062 if (fdset_size(fds) > 0) {
3063 k = fdset_cloexec(fds, false);
3065 log_error("Failed to unset O_CLOEXEC for file descriptors.");
3069 if ((asprintf((char **)(envp + n_env++), "LISTEN_FDS=%u", n_fd_passed) < 0) ||
3070 (asprintf((char **)(envp + n_env++), "LISTEN_PID=1") < 0)) {
/* An explicitly requested personality wins; otherwise PER_LINUX32 is
 * used when the secondary flag from dissect_image() is set */
3078 if (arg_personality != 0xffffffffLU) {
3079 if (personality(arg_personality) < 0) {
3080 log_error("personality() failed: %m");
3083 } else if (secondary) {
3084 if (personality(PER_LINUX32) < 0) {
3085 log_error("personality() failed: %m");
3091 if (arg_selinux_context)
3092 if (setexeccon((security_context_t) arg_selinux_context) < 0) {
3093 log_error("setexeccon(\"%s\") failed: %m", arg_selinux_context);
3098 if (!strv_isempty(arg_setenv)) {
3101 n = strv_env_merge(2, envp, arg_setenv);
3109 env_use = (char**) envp;
3111 /* Wait until the parent is ready with the setup, too... */
3112 r = eventfd_parent_succeeded(eventfds[0]);
3113 eventfds[0] = safe_close(eventfds[0]);
3121 /* Automatically search for the init system */
3123 l = 1 + argc - optind;
3124 a = newa(char*, l + 1);
3125 memcpy(a + 1, argv + optind, l * sizeof(char*));
/* Each execve() only returns on failure, so the next candidate is tried */
3127 a[0] = (char*) "/usr/lib/systemd/systemd";
3128 execve(a[0], a, env_use);
3130 a[0] = (char*) "/lib/systemd/systemd";
3131 execve(a[0], a, env_use);
3133 a[0] = (char*) "/sbin/init";
3134 execve(a[0], a, env_use);
3135 } else if (argc > optind)
3136 execvpe(argv[optind], argv + optind, env_use);
/* No command given: start a login shell in the home directory */
3138 chdir(home ? home : "/root");
3139 execle("/bin/bash", "-bash", NULL, env_use);
3140 execle("/bin/sh", "-sh", NULL, env_use);
3143 log_error("execv() failed: %m");
3146 /* Tell the parent that the setup failed, so he
3147 * can clean up resources and terminate. */
3148 if (eventfds[1] != -1)
3149 eventfd_send_state(eventfds[1],
3150 EVENTFD_CHILD_FAILED);
3151 _exit(EXIT_FAILURE);
3157 /* Wait for the child event:
3158 * If EVENTFD_CHILD_FAILED, the child will terminate soon.
3159 * If EVENTFD_CHILD_SUCCEEDED, the child is reporting that
3160 * it is ready with all it needs to do with privileges.
3161 * After we got the notification we can make the process
3162 * join its cgroup which might limit what it can do */
3163 r = eventfd_child_succeeded(eventfds[1]);
3164 eventfds[1] = safe_close(eventfds[1]);
3166 goto check_container_status;
/* Parent-side setup: machine registration and networking for the child */
3168 r = register_machine(pid);
3172 r = move_network_interfaces(pid);
3176 r = setup_veth(pid, veth_name);
3180 r = setup_bridge(veth_name);
3184 r = setup_macvlan(pid);
3188 /* Block SIGCHLD here, before notifying child.
3189 * process_pty() will handle it with the other signals. */
3190 r = sigprocmask(SIG_BLOCK, &mask_chld, NULL);
3194 /* Reset signal to default */
3195 r = default_signals(SIGCHLD, -1);
3199 /* Notify the child that the parent is ready with all
3200 * its setup, and that the child can now hand over
3201 * control to the code to run inside the container. */
3202 r = eventfd_send_state(eventfds[0],
3203 EVENTFD_PARENT_SUCCEEDED);
3204 eventfds[0] = safe_close(eventfds[0]);
/* Forward data between our terminal and the container's pty */
3208 k = process_pty(master, &mask, arg_boot ? pid : 0, SIGRTMIN+3);
3217 /* Kill if it is not dead yet anyway */
3218 terminate_machine(pid);
3220 check_container_status:
3221 /* Redundant, but better safe than sorry */
3224 r = wait_for_container(pid, &container_status);
3230 } else if (container_status == CONTAINER_TERMINATED)
3233 /* CONTAINER_REBOOTED, loop again */
/* Final cleanup: detach the loop device and free argument storage */
3237 loop_remove(loop_nr, &image_fd);
3242 free(arg_directory);
3245 strv_free(arg_setenv);
3246 strv_free(arg_network_interfaces);
3247 strv_free(arg_network_macvlan);
3248 strv_free(arg_bind);
3249 strv_free(arg_bind_ro);