1 /*-*- Mode: C; c-basic-offset: 8; indent-tabs-mode: nil -*-*/
4 This file is part of systemd.
6 Copyright 2010 Lennart Poettering
8 systemd is free software; you can redistribute it and/or modify it
9 under the terms of the GNU Lesser General Public License as published by
10 the Free Software Foundation; either version 2.1 of the License, or
11 (at your option) any later version.
13 systemd is distributed in the hope that it will be useful, but
14 WITHOUT ANY WARRANTY; without even the implied warranty of
15 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
16 Lesser General Public License for more details.
18 You should have received a copy of the GNU Lesser General Public License
19 along with systemd; If not, see <http://www.gnu.org/licenses/>.
25 #include <sys/types.h>
26 #include <sys/syscall.h>
27 #include <sys/mount.h>
33 #include <sys/prctl.h>
34 #include <sys/capability.h>
37 #include <sys/signalfd.h>
41 #include <sys/socket.h>
42 #include <linux/netlink.h>
43 #include <sys/eventfd.h>
45 #include <linux/veth.h>
46 #include <sys/personality.h>
47 #include <linux/loop.h>
50 #include <selinux/selinux.h>
58 #include <blkid/blkid.h>
61 #include "sd-daemon.h"
71 #include "cgroup-util.h"
73 #include "path-util.h"
74 #include "loopback-setup.h"
75 #include "dev-setup.h"
80 #include "bus-error.h"
82 #include "bus-kernel.h"
85 #include "rtnl-util.h"
86 #include "udev-util.h"
87 #include "eventfd-util.h"
88 #include "blkid-util.h"
90 #include "siphash24.h"
94 #include "seccomp-util.h"
97 typedef enum ContainerStatus {
102 typedef enum LinkJournal {
/* Command-line configuration. These file-scope variables carry the built-in
 * defaults; most of them are assigned by parse_argv() and read by the setup
 * helpers further down in this file. */
109 static char *arg_directory = NULL;
110 static char *arg_user = NULL;
111 static sd_id128_t arg_uuid = {};
112 static char *arg_machine = NULL;
113 static const char *arg_selinux_context = NULL;
114 static const char *arg_selinux_apifs_context = NULL;
115 static const char *arg_slice = NULL;
116 static bool arg_private_network = false;
117 static bool arg_read_only = false;
118 static bool arg_boot = false;
119 static LinkJournal arg_link_journal = LINK_AUTO;
/* Capability mask kept for the container. parse_argv() ORs in the
 * --capability= additions (and CAP_NET_ADMIN when a private network is
 * requested) and then masks out the --drop-capability= entries;
 * drop_capabilities() passes the complement of this mask to
 * capability_bounding_set_drop(). */
120 static uint64_t arg_retain =
121 (1ULL << CAP_CHOWN) |
122 (1ULL << CAP_DAC_OVERRIDE) |
123 (1ULL << CAP_DAC_READ_SEARCH) |
124 (1ULL << CAP_FOWNER) |
125 (1ULL << CAP_FSETID) |
126 (1ULL << CAP_IPC_OWNER) |
128 (1ULL << CAP_LEASE) |
129 (1ULL << CAP_LINUX_IMMUTABLE) |
130 (1ULL << CAP_NET_BIND_SERVICE) |
131 (1ULL << CAP_NET_BROADCAST) |
132 (1ULL << CAP_NET_RAW) |
133 (1ULL << CAP_SETGID) |
134 (1ULL << CAP_SETFCAP) |
135 (1ULL << CAP_SETPCAP) |
136 (1ULL << CAP_SETUID) |
137 (1ULL << CAP_SYS_ADMIN) |
138 (1ULL << CAP_SYS_CHROOT) |
139 (1ULL << CAP_SYS_NICE) |
140 (1ULL << CAP_SYS_PTRACE) |
141 (1ULL << CAP_SYS_TTY_CONFIG) |
142 (1ULL << CAP_SYS_RESOURCE) |
143 (1ULL << CAP_SYS_BOOT) |
144 (1ULL << CAP_AUDIT_WRITE) |
145 (1ULL << CAP_AUDIT_CONTROL) |
147 static char **arg_bind = NULL;
148 static char **arg_bind_ro = NULL;
149 static char **arg_setenv = NULL;
150 static bool arg_quiet = false;
151 static bool arg_share_system = false;
152 static bool arg_register = true;
153 static bool arg_keep_unit = false;
154 static char **arg_network_interfaces = NULL;
155 static char **arg_network_macvlan = NULL;
156 static bool arg_network_veth = false;
157 static const char *arg_network_bridge = NULL;
/* 0xffffffffLU is also the value parse_argv() treats as a failed
 * personality_from_string() lookup. */
158 static unsigned long arg_personality = 0xffffffffLU;
159 static const char *arg_image = NULL;
/* Print the usage summary with printf(); the single %s placeholder is filled
 * with program_invocation_short_name. */
161 static int help(void) {
163 printf("%s [OPTIONS...] [PATH] [ARGUMENTS...]\n\n"
164 "Spawn a minimal namespace container for debugging, testing and building.\n\n"
165 " -h --help Show this help\n"
166 " --version Print version string\n"
167 " -q --quiet Do not show status information\n"
168 " -D --directory=PATH Root directory for the container\n"
169 " -i --image=PATH File system device or image for the container\n"
170 " -b --boot Boot up full system (i.e. invoke init)\n"
171 " -u --user=USER Run the command under specified user or uid\n"
172 " -M --machine=NAME Set the machine name for the container\n"
173 " --uuid=UUID Set a specific machine UUID for the container\n"
174 " -S --slice=SLICE Place the container in the specified slice\n"
175 " --private-network Disable network in container\n"
176 " --network-interface=INTERFACE\n"
177 " Assign an existing network interface to the\n"
179 " --network-macvlan=INTERFACE\n"
180 " Create a macvlan network interface based on an\n"
181 " existing network interface to the container\n"
182 " --network-veth Add a virtual ethernet connection between host\n"
184 " --network-bridge=INTERFACE\n"
185 " Add a virtual ethernet connection between host\n"
186 " and container and add it to an existing bridge on\n"
188 " -Z --selinux-context=SECLABEL\n"
189 " Set the SELinux security context to be used by\n"
190 " processes in the container\n"
191 " -L --selinux-apifs-context=SECLABEL\n"
192 " Set the SELinux security context to be used by\n"
193 " API/tmpfs file systems in the container\n"
194 " --capability=CAP In addition to the default, retain specified\n"
196 " --drop-capability=CAP Drop the specified capability from the default set\n"
197 " --link-journal=MODE Link up guest journal, one of no, auto, guest, host\n"
198 " -j Equivalent to --link-journal=host\n"
199 " --read-only Mount the root directory read-only\n"
200 " --bind=PATH[:PATH] Bind mount a file or directory from the host into\n"
202 " --bind-ro=PATH[:PATH] Similar, but creates a read-only bind mount\n"
203 " --setenv=NAME=VALUE Pass an environment variable to PID 1\n"
204 " --share-system Share system namespaces with host\n"
205 " --register=BOOLEAN Register container as machine\n"
206 " --keep-unit Do not register a scope for the machine, reuse\n"
207 " the service unit nspawn is running in\n",
208 program_invocation_short_name);
/* Parse the command line with getopt_long() and store the results in the
 * file-scope arg_* variables.
 *
 * Capability names given to --capability= and --drop-capability= are
 * collected in the local masks plus and minus and folded into arg_retain at
 * the end. After the option loop a few combinations are rejected with an
 * error message: --boot together with --share-system, --keep-unit when
 * cg_pid_get_owner_uid() succeeds for this process, and --directory=
 * together with --image=. --share-system also switches arg_register off. */
213 static int parse_argv(int argc, char *argv[]) {
229 ARG_NETWORK_INTERFACE,
236 static const struct option options[] = {
237 { "help", no_argument, NULL, 'h' },
238 { "version", no_argument, NULL, ARG_VERSION },
239 { "directory", required_argument, NULL, 'D' },
240 { "user", required_argument, NULL, 'u' },
241 { "private-network", no_argument, NULL, ARG_PRIVATE_NETWORK },
242 { "boot", no_argument, NULL, 'b' },
243 { "uuid", required_argument, NULL, ARG_UUID },
244 { "read-only", no_argument, NULL, ARG_READ_ONLY },
245 { "capability", required_argument, NULL, ARG_CAPABILITY },
246 { "drop-capability", required_argument, NULL, ARG_DROP_CAPABILITY },
247 { "link-journal", required_argument, NULL, ARG_LINK_JOURNAL },
248 { "bind", required_argument, NULL, ARG_BIND },
249 { "bind-ro", required_argument, NULL, ARG_BIND_RO },
250 { "machine", required_argument, NULL, 'M' },
251 { "slice", required_argument, NULL, 'S' },
252 { "setenv", required_argument, NULL, ARG_SETENV },
253 { "selinux-context", required_argument, NULL, 'Z' },
254 { "selinux-apifs-context", required_argument, NULL, 'L' },
255 { "quiet", no_argument, NULL, 'q' },
256 { "share-system", no_argument, NULL, ARG_SHARE_SYSTEM },
257 { "register", required_argument, NULL, ARG_REGISTER },
258 { "keep-unit", no_argument, NULL, ARG_KEEP_UNIT },
259 { "network-interface", required_argument, NULL, ARG_NETWORK_INTERFACE },
260 { "network-macvlan", required_argument, NULL, ARG_NETWORK_MACVLAN },
261 { "network-veth", no_argument, NULL, ARG_NETWORK_VETH },
262 { "network-bridge", required_argument, NULL, ARG_NETWORK_BRIDGE },
263 { "personality", required_argument, NULL, ARG_PERSONALITY },
264 { "image", required_argument, NULL, 'i' },
/* Capability bits to add to / remove from the default arg_retain mask. */
269 uint64_t plus = 0, minus = 0;
274 while ((c = getopt_long(argc, argv, "+hD:u:bL:M:jS:Z:qi:", options, NULL)) >= 0) {
282 puts(PACKAGE_STRING);
283 puts(SYSTEMD_FEATURES);
288 arg_directory = canonicalize_file_name(optarg);
289 if (!arg_directory) {
290 log_error("Invalid root directory: %m");
302 arg_user = strdup(optarg);
308 case ARG_NETWORK_BRIDGE:
309 arg_network_bridge = optarg;
/* --network-veth and --network-interface= both switch on the private
 * network as a side effect. */
313 case ARG_NETWORK_VETH:
314 arg_network_veth = true;
315 arg_private_network = true;
318 case ARG_NETWORK_INTERFACE:
319 if (strv_extend(&arg_network_interfaces, optarg) < 0)
322 arg_private_network = true;
325 case ARG_NETWORK_MACVLAN:
326 if (strv_extend(&arg_network_macvlan, optarg) < 0)
331 case ARG_PRIVATE_NETWORK:
332 arg_private_network = true;
340 r = sd_id128_from_string(optarg, &arg_uuid);
342 log_error("Invalid UUID: %s", optarg);
352 if (isempty(optarg)) {
357 if (!hostname_is_valid(optarg)) {
358 log_error("Invalid machine name: %s", optarg);
363 arg_machine = strdup(optarg);
371 arg_selinux_context = optarg;
375 arg_selinux_apifs_context = optarg;
379 arg_read_only = true;
383 case ARG_DROP_CAPABILITY: {
/* The argument is a comma-separated list; the word all selects every
 * bit of the respective mask. */
387 FOREACH_WORD_SEPARATOR(word, length, optarg, ",", state) {
388 _cleanup_free_ char *t;
391 t = strndup(word, length);
395 if (streq(t, "all")) {
396 if (c == ARG_CAPABILITY)
397 plus = (uint64_t) -1;
399 minus = (uint64_t) -1;
401 if (cap_from_name(t, &cap) < 0) {
402 log_error("Failed to parse capability %s.", t);
406 if (c == ARG_CAPABILITY)
407 plus |= 1ULL << (uint64_t) cap;
409 minus |= 1ULL << (uint64_t) cap;
/* NOTE(review): the case label for this assignment is not visible here; if
 * it is the -j handler, it selects LINK_GUEST while the help text describes
 * -j as --link-journal=host -- confirm which one is intended. */
417 arg_link_journal = LINK_GUEST;
420 case ARG_LINK_JOURNAL:
421 if (streq(optarg, "auto"))
422 arg_link_journal = LINK_AUTO;
423 else if (streq(optarg, "no"))
424 arg_link_journal = LINK_NO;
425 else if (streq(optarg, "guest"))
426 arg_link_journal = LINK_GUEST;
427 else if (streq(optarg, "host"))
428 arg_link_journal = LINK_HOST;
430 log_error("Failed to parse link journal mode %s", optarg);
/* Bind mount specification: the part before the first colon goes into a;
 * both a and b must be absolute and are appended as a pair to arg_bind or
 * arg_bind_ro. */
438 _cleanup_free_ char *a = NULL, *b = NULL;
442 x = c == ARG_BIND ? &arg_bind : &arg_bind_ro;
444 e = strchr(optarg, ':');
446 a = strndup(optarg, e - optarg);
456 if (!path_is_absolute(a) || !path_is_absolute(b)) {
457 log_error("Invalid bind mount specification: %s", optarg);
461 r = strv_extend(x, a);
465 r = strv_extend(x, b);
475 if (!env_assignment_is_valid(optarg)) {
476 log_error("Environment variable assignment '%s' is not valid.", optarg);
480 n = strv_env_set(arg_setenv, optarg);
484 strv_free(arg_setenv);
493 case ARG_SHARE_SYSTEM:
494 arg_share_system = true;
498 r = parse_boolean(optarg);
500 log_error("Failed to parse --register= argument: %s", optarg);
508 arg_keep_unit = true;
511 case ARG_PERSONALITY:
513 arg_personality = personality_from_string(optarg);
514 if (arg_personality == 0xffffffffLU) {
515 log_error("Unknown or unsupported personality '%s'.", optarg);
525 assert_not_reached("Unhandled option");
529 if (arg_share_system)
530 arg_register = false;
532 if (arg_boot && arg_share_system) {
533 log_error("--boot and --share-system may not be combined.");
537 if (arg_keep_unit && cg_pid_get_owner_uid(0, NULL) >= 0) {
538 log_error("--keep-unit may not be used when invoked from a user session.");
542 if (arg_directory && arg_image) {
543 log_error("--directory= and --image= may not be combined.");
/* Final mask: defaults plus additions, plus CAP_NET_ADMIN for a private
 * network; the drop mask is applied last, so drops take precedence. */
547 arg_retain = (arg_retain | plus | (arg_private_network ? 1ULL << CAP_NET_ADMIN : 0)) & ~minus;
/* Mount the API file systems listed in mount_table below dest.
 *
 * For each entry the target path is built from dest, a slash and the where
 * field of the entry. path_is_mount_point() is consulted first; entries that
 * have a source (what) and whose target already is a mount point are
 * skipped. When arg_selinux_apifs_context is set, tmpfs and devpts entries
 * get a context= mount option appended to their options. The last column of
 * the table is the fatal flag, which takes part in the condition guarding the
 * mount() error path. */
552 static int mount_all(const char *dest) {
554 typedef struct MountPoint {
563 static const MountPoint mount_table[] = {
564 { "proc", "/proc", "proc", NULL, MS_NOSUID|MS_NOEXEC|MS_NODEV, true },
565 { "/proc/sys", "/proc/sys", NULL, NULL, MS_BIND, true }, /* Bind mount first */
566 { NULL, "/proc/sys", NULL, NULL, MS_BIND|MS_RDONLY|MS_REMOUNT, true }, /* Then, make it r/o */
567 { "sysfs", "/sys", "sysfs", NULL, MS_RDONLY|MS_NOSUID|MS_NOEXEC|MS_NODEV, true },
568 { "tmpfs", "/dev", "tmpfs", "mode=755", MS_NOSUID|MS_STRICTATIME, true },
569 { "devpts", "/dev/pts", "devpts","newinstance,ptmxmode=0666,mode=620,gid=" STRINGIFY(TTY_GID), MS_NOSUID|MS_NOEXEC, true },
570 { "tmpfs", "/dev/shm", "tmpfs", "mode=1777", MS_NOSUID|MS_NODEV|MS_STRICTATIME, true },
571 { "tmpfs", "/run", "tmpfs", "mode=755", MS_NOSUID|MS_NODEV|MS_STRICTATIME, true },
573 { "/sys/fs/selinux", "/sys/fs/selinux", NULL, NULL, MS_BIND, false }, /* Bind mount first */
574 { NULL, "/sys/fs/selinux", NULL, NULL, MS_BIND|MS_RDONLY|MS_REMOUNT, false }, /* Then, make it r/o */
581 for (k = 0; k < ELEMENTSOF(mount_table); k++) {
582 _cleanup_free_ char *where = NULL;
584 _cleanup_free_ char *options = NULL;
589 where = strjoin(dest, "/", mount_table[k].where, NULL);
593 t = path_is_mount_point(where, true);
595 log_error("Failed to detect whether %s is a mount point: %s", where, strerror(-t));
603 /* Skip this entry if it is not a remount. */
604 if (mount_table[k].what && t > 0)
/* The result of mkdir_p() is not checked on this line. */
607 mkdir_p(where, 0755);
610 if (arg_selinux_apifs_context &&
611 (streq_ptr(mount_table[k].what, "tmpfs") || streq_ptr(mount_table[k].what, "devpts"))) {
612 options = strjoin(mount_table[k].options, ",context=\"", arg_selinux_apifs_context, "\"", NULL);
619 o = mount_table[k].options;
622 if (mount(mount_table[k].what,
625 mount_table[k].flags,
627 mount_table[k].fatal) {
629 log_error("mount(%s) failed: %m", where);
/* Bind mount the source/destination pairs in l into the tree at dest.
 *
 * l is walked pairwise: *x is the host path, *y the path that is appended to
 * dest. If the destination exists, its file type must match the source. If
 * it does not exist (ENOENT), the parent directories are created and a mount
 * point matching the source type is made; block and character devices are
 * refused. After the bind mount, bind_remount_recursive() performs the
 * read-only remount (presumably only when ro is set; the guard is not
 * visible here). */
639 static int mount_binds(const char *dest, char **l, bool ro) {
642 STRV_FOREACH_PAIR(x, y, l) {
644 struct stat source_st, dest_st;
647 if (stat(*x, &source_st) < 0) {
648 log_error("Failed to stat %s: %m", *x);
652 where = strappenda(dest, *y);
653 r = stat(where, &dest_st);
655 if ((source_st.st_mode & S_IFMT) != (dest_st.st_mode & S_IFMT)) {
656 log_error("The file types of %s and %s do not match. Refusing bind mount",
660 } else if (errno == ENOENT) {
661 r = mkdir_parents_label(where, 0755);
663 log_error("Failed to bind mount %s: %s", *x, strerror(-r));
667 log_error("Failed to bind mount %s: %m", *x);
670 /* Create the mount point, but be conservative -- refuse to create block
671 * and char devices. */
672 if (S_ISDIR(source_st.st_mode))
673 mkdir_label(where, 0755);
674 else if (S_ISFIFO(source_st.st_mode))
676 else if (S_ISSOCK(source_st.st_mode))
677 mknod(where, 0644 | S_IFSOCK, 0);
678 else if (S_ISREG(source_st.st_mode))
681 log_error("Refusing to create mountpoint for file: %s", *x);
685 if (mount(*x, where, "bind", MS_BIND, NULL) < 0) {
686 log_error("mount(%s) failed: %m", where);
691 r = bind_remount_recursive(where, true);
693 log_error("Read-Only bind mount failed: %s", strerror(-r));
/* Make /etc/localtime below dest name the same time zone as on the host.
 *
 * The host link is read with readlink_malloc() and has to lead into
 * /usr/share/zoneinfo/ (either as an absolute path or via ../), otherwise a
 * warning is logged. If the link below dest already names the same zone,
 * nothing is changed. A warning is also logged when the zone file does not
 * exist below dest. The new link target is ../usr/share/zoneinfo/<zone>. */
702 static int setup_timezone(const char *dest) {
703 _cleanup_free_ char *where = NULL, *p = NULL, *q = NULL, *check = NULL, *what = NULL;
709 /* Fix the timezone, if possible */
710 r = readlink_malloc("/etc/localtime", &p);
712 log_warning("/etc/localtime is not a symlink, not updating container timezone.");
/* z: zone name of the host, relative to the zoneinfo directory. */
716 z = path_startswith(p, "../usr/share/zoneinfo/");
718 z = path_startswith(p, "/usr/share/zoneinfo/");
720 log_warning("/etc/localtime does not point into /usr/share/zoneinfo/, not updating container timezone.");
724 where = strappend(dest, "/etc/localtime");
728 r = readlink_malloc(where, &q);
/* y: zone name currently configured below dest, if any. */
730 y = path_startswith(q, "../usr/share/zoneinfo/");
732 y = path_startswith(q, "/usr/share/zoneinfo/");
735 /* Already pointing to the right place? Then do nothing .. */
736 if (y && streq(y, z))
740 check = strjoin(dest, "/usr/share/zoneinfo/", z, NULL);
744 if (access(check, F_OK) < 0) {
745 log_warning("Timezone %s does not exist in container, not updating container timezone.", z);
749 what = strappend("../usr/share/zoneinfo/", z);
754 if (symlink(what, where) < 0) {
755 log_error("Failed to correct timezone of container: %m");
/* Copy the host /etc/resolv.conf to /etc/resolv.conf below dest.
 *
 * The copy is best effort: the return value of copy_file() is ignored.
 * arg_private_network is checked first (the branch body is not visible
 * here). */
762 static int setup_resolv_conf(const char *dest) {
763 char _cleanup_free_ *where = NULL;
767 if (arg_private_network)
770 /* Fix resolv.conf, if possible */
771 where = strappend(dest, "/etc/resolv.conf");
775 /* We don't really care for the results of this really. If it
776 * fails, it fails, but meh... */
777 copy_file("/etc/resolv.conf", where, O_TRUNC|O_NOFOLLOW, 0644);
/* Format id into s as lower-case hex digits grouped 8-4-4-4-12 with dashes,
 * as given by the format string below. The parameter is declared with room
 * for 37 bytes. */
782 static char* id128_format_as_uuid(sd_id128_t id, char s[37]) {
785 "%02x%02x%02x%02x-%02x%02x-%02x%02x-%02x%02x-%02x%02x%02x%02x%02x%02x",
786 SD_ID128_FORMAT_VAL(id));
/* Give the container a boot ID of its own.
 *
 * A random ID from sd_id128_randomize() is formatted as a UUID, written to
 * /dev/proc-sys-kernel-random-boot-id below dest and bind mounted over
 * /proc/sys/kernel/random/boot_id below dest. The read-only remount that
 * follows is best effort: its failure only logs a warning.
 * arg_share_system is checked first (the branch body is not visible here). */
791 static int setup_boot_id(const char *dest) {
792 _cleanup_free_ char *from = NULL, *to = NULL;
799 if (arg_share_system)
802 /* Generate a new randomized boot ID, so that each boot-up of
803 * the container gets a new one */
805 from = strappend(dest, "/dev/proc-sys-kernel-random-boot-id");
806 to = strappend(dest, "/proc/sys/kernel/random/boot_id");
810 r = sd_id128_randomize(&rnd);
812 log_error("Failed to generate random boot id: %s", strerror(-r));
816 id128_format_as_uuid(rnd, as_uuid);
818 r = write_string_file(from, as_uuid);
820 log_error("Failed to write boot id: %s", strerror(-r));
824 if (mount(from, to, "bind", MS_BIND, NULL) < 0) {
825 log_error("Failed to bind mount boot id: %m");
827 } else if (mount(from, to, "bind", MS_BIND|MS_REMOUNT|MS_RDONLY, NULL))
828 log_warning("Failed to make boot id read-only: %m");
/* Recreate the device nodes named in devnodes in the /dev directory below
 * dest.
 *
 * Each host node /dev/<name> is examined with stat() and a node with the
 * same mode and device number is created with mknod() at the matching path
 * below dest. A stat() failure other than ENOENT, and a source that is
 * neither a character nor a block device, are logged as errors. */
834 static int copy_devnodes(const char *dest) {
836 static const char devnodes[] =
846 _cleanup_umask_ mode_t u;
852 NULSTR_FOREACH(d, devnodes) {
853 _cleanup_free_ char *from = NULL, *to = NULL;
856 from = strappend("/dev/", d);
857 to = strjoin(dest, "/dev/", d, NULL);
861 if (stat(from, &st) < 0) {
863 if (errno != ENOENT) {
864 log_error("Failed to stat %s: %m", from);
868 } else if (!S_ISCHR(st.st_mode) && !S_ISBLK(st.st_mode)) {
870 log_error("%s is not a char or block device, cannot copy", from);
873 } else if (mknod(to, st.st_mode, st.st_rdev) < 0) {
/* NOTE(review): the message prints dest although mknod() was called on
 * the node path in to -- confirm whether to was intended. */
875 log_error("mknod(%s) failed: %m", dest);
/* Create /dev/ptmx below dest as a symlink to pts/ptmx; a failure of
 * symlink() is logged. */
883 static int setup_ptmx(const char *dest) {
884 _cleanup_free_ char *p = NULL;
886 p = strappend(dest, "/dev/ptmx");
890 if (symlink("pts/ptmx", p) < 0) {
891 log_error("Failed to create /dev/ptmx symlink: %m");
/* Provide /dev/console below dest by bind mounting the tty named by console.
 *
 * console is first passed to chmod_and_chown() with mode 0600 and owner 0:0.
 * A device node is then created at /dev/console below dest, using the file
 * type and device number of the host /dev/null with mode 0600, and console
 * is bind mounted on top of it. */
898 static int setup_dev_console(const char *dest, const char *console) {
899 _cleanup_umask_ mode_t u;
909 if (stat("/dev/null", &st) < 0) {
910 log_error("Failed to stat /dev/null: %m");
914 r = chmod_and_chown(console, 0600, 0, 0);
916 log_error("Failed to correct access mode for TTY: %s", strerror(-r));
920 /* We need to bind mount the right tty to /dev/console since
921 * ptys can only exist on pts file systems. To have something
922 * to bind mount things on we create a device node first, and
923 * use /dev/null for that since the cgroups device policy
924 * allows us to create that freely, while we cannot create
925 * /dev/console. (Note that the major minor doesn't actually
926 * matter here, since we mount it over anyway). */
928 to = strappenda(dest, "/dev/console");
929 if (mknod(to, (st.st_mode & ~07777) | 0600, st.st_rdev) < 0) {
930 log_error("mknod() for /dev/console failed: %m");
934 if (mount(console, to, "bind", MS_BIND, NULL) < 0) {
935 log_error("Bind mount for /dev/console failed: %m");
/* Set up a FIFO that stands in for /proc/kmsg below dest.
 *
 * A FIFO is created at /dev/kmsg below dest, given mode 0600 and owner 0:0,
 * bind mounted to /proc/kmsg below dest and opened. The open fd is then sent
 * over kmsg_socket as an SCM_RIGHTS control message with a non-blocking
 * sendmsg(). kmsg_socket must be a valid fd (asserted). */
942 static int setup_kmsg(const char *dest, int kmsg_socket) {
943 _cleanup_free_ char *from = NULL, *to = NULL;
945 _cleanup_umask_ mode_t u;
947 struct cmsghdr cmsghdr;
948 uint8_t buf[CMSG_SPACE(sizeof(int))];
951 .msg_control = &control,
952 .msg_controllen = sizeof(control),
954 struct cmsghdr *cmsg;
957 assert(kmsg_socket >= 0);
961 /* We create the kmsg FIFO as /dev/kmsg, but immediately
962 * delete it after bind mounting it to /proc/kmsg. While FIFOs
963 * on the reading side behave very similar to /proc/kmsg,
964 * their writing side behaves differently from /dev/kmsg in
965 * that writing blocks when nothing is reading. In order to
966 * avoid any problems with containers deadlocking due to this
967 * we simply make /dev/kmsg unavailable to the container. */
968 if (asprintf(&from, "%s/dev/kmsg", dest) < 0 ||
969 asprintf(&to, "%s/proc/kmsg", dest) < 0)
972 if (mkfifo(from, 0600) < 0) {
973 log_error("mkfifo() for /dev/kmsg failed: %m");
977 r = chmod_and_chown(from, 0600, 0, 0);
979 log_error("Failed to correct access mode for /dev/kmsg: %s", strerror(-r));
983 if (mount(from, to, "bind", MS_BIND, NULL) < 0) {
984 log_error("Bind mount for /proc/kmsg failed: %m");
988 fd = open(from, O_RDWR|O_NDELAY|O_CLOEXEC);
990 log_error("Failed to open fifo: %m");
/* Build a single SCM_RIGHTS control message carrying fd. */
994 cmsg = CMSG_FIRSTHDR(&mh);
995 cmsg->cmsg_level = SOL_SOCKET;
996 cmsg->cmsg_type = SCM_RIGHTS;
997 cmsg->cmsg_len = CMSG_LEN(sizeof(int));
998 memcpy(CMSG_DATA(cmsg), &fd, sizeof(int));
1000 mh.msg_controllen = cmsg->cmsg_len;
1002 /* Store away the fd in the socket, so that it stays open as
1003 * long as we run the child */
1004 k = sendmsg(kmsg_socket, &mh, MSG_DONTWAIT|MSG_NOSIGNAL);
1008 log_error("Failed to send FIFO fd: %m");
1012 /* And now make the FIFO unavailable as /dev/kmsg... */
/* Set the hostname to arg_machine with sethostname(). arg_share_system is
 * checked first (the branch body is not visible here). */
1017 static int setup_hostname(void) {
1019 if (arg_share_system)
1022 if (sethostname(arg_machine, strlen(arg_machine)) < 0)
/* Connect the journal directory of the container with the one of the host,
 * according to arg_link_journal.
 *
 * The machine ID is read from /etc/machine-id below directory and parsed.
 * When it equals the host machine ID, linking is refused (logged as a
 * warning in LINK_AUTO mode, as an error otherwise). The two paths used are
 *   p = /var/log/journal/<id>                (host side)
 *   q = /var/log/journal/<id> below directory
 * Either path already being a mount point is reported as an error unless
 * the mode is LINK_AUTO. In LINK_GUEST mode p is made a symlink to q; the
 * last step visible here bind mounts p onto q.
 *
 * NOTE(review): several messages after mkdir_p() and
 * readlink_and_make_absolute() print %m although the result is held in r,
 * whereas mount_binds() prints strerror(-r) for mkdir_parents_label() --
 * confirm that errno is really what should be reported here. */
1028 static int setup_journal(const char *directory) {
1029 sd_id128_t machine_id, this_id;
1030 _cleanup_free_ char *p = NULL, *b = NULL, *q = NULL, *d = NULL;
1034 p = strappend(directory, "/etc/machine-id");
1038 r = read_one_line_file(p, &b);
1039 if (r == -ENOENT && arg_link_journal == LINK_AUTO)
1042 log_error("Failed to read machine ID from %s: %s", p, strerror(-r));
1047 if (isempty(id) && arg_link_journal == LINK_AUTO)
1050 /* Verify validity */
1051 r = sd_id128_from_string(id, &machine_id);
1053 log_error("Failed to parse machine ID from %s: %s", p, strerror(-r));
1057 r = sd_id128_get_machine(&this_id);
1059 log_error("Failed to retrieve machine ID: %s", strerror(-r));
1063 if (sd_id128_equal(machine_id, this_id)) {
1064 log_full(arg_link_journal == LINK_AUTO ? LOG_WARNING : LOG_ERR,
1065 "Host and machine ids are equal (%s): refusing to link journals", id);
1066 if (arg_link_journal == LINK_AUTO)
1072 if (arg_link_journal == LINK_NO)
1076 p = strappend("/var/log/journal/", id);
1077 q = strjoin(directory, "/var/log/journal/", id, NULL);
1081 if (path_is_mount_point(p, false) > 0) {
1082 if (arg_link_journal != LINK_AUTO) {
1083 log_error("%s: already a mount point, refusing to use for journal", p);
1090 if (path_is_mount_point(q, false) > 0) {
1091 if (arg_link_journal != LINK_AUTO) {
1092 log_error("%s: already a mount point, refusing to use for journal", q);
/* Inspect what currently sits at p: the branches below distinguish an
 * existing symlink, -EINVAL and errors other than -ENOENT. */
1099 r = readlink_and_make_absolute(p, &d);
1101 if ((arg_link_journal == LINK_GUEST ||
1102 arg_link_journal == LINK_AUTO) &&
1105 r = mkdir_p(q, 0755);
1107 log_warning("failed to create directory %s: %m", q);
1111 if (unlink(p) < 0) {
1112 log_error("Failed to remove symlink %s: %m", p);
1115 } else if (r == -EINVAL) {
1117 if (arg_link_journal == LINK_GUEST &&
1120 if (errno == ENOTDIR) {
1121 log_error("%s already exists and is neither a symlink nor a directory", p);
1124 log_error("Failed to remove %s: %m", p);
1128 } else if (r != -ENOENT) {
1129 log_error("readlink(%s) failed: %m", p);
1133 if (arg_link_journal == LINK_GUEST) {
1135 if (symlink(q, p) < 0) {
1136 log_error("Failed to symlink %s to %s: %m", q, p);
1140 r = mkdir_p(q, 0755);
1142 log_warning("failed to create directory %s: %m", q);
1146 if (arg_link_journal == LINK_HOST) {
1147 r = mkdir_p(p, 0755);
1149 log_error("Failed to create %s: %m", p);
1153 } else if (access(p, F_OK) < 0)
1156 if (dir_is_empty(q) == 0)
1157 log_warning("%s is not empty, proceeding anyway.", q);
1159 r = mkdir_p(q, 0755);
1161 log_error("Failed to create %s: %m", q);
1165 if (mount(p, q, "bind", MS_BIND, NULL) < 0) {
1166 log_error("Failed to bind mount journal from host into guest: %m");
/* Create the directory /dev/kdbus below dest (mode 0755) and bind mount path
 * onto it; failures of mkdir() and mount() are logged. */
1173 static int setup_kdbus(const char *dest, const char *path) {
1179 p = strappenda(dest, "/dev/kdbus");
1180 if (mkdir(p, 0755) < 0) {
1181 log_error("Failed to create kdbus path: %m");
1185 if (mount(path, p, "bind", MS_BIND, NULL) < 0) {
1186 log_error("Failed to mount kdbus domain path: %m");
/* Pass the complement of arg_retain (every capability bit that is not to be
 * kept) to capability_bounding_set_drop() and return its result. */
1193 static int drop_capabilities(void) {
1194 return capability_bounding_set_drop(~arg_retain, false);
/* Announce the container to org.freedesktop.machine1 over the system bus.
 *
 * With arg_keep_unit a single sd_bus_call_method() is issued. Otherwise the
 * method call message is assembled by hand so that an array of (sv)
 * properties can be attached: Slice (only when arg_slice is non-empty),
 * DevicePolicy set to strict, and a DeviceAllow list. Both variants pass
 * arg_uuid and arg_directory (empty string when unset). Failures are
 * logged. */
1197 static int register_machine(pid_t pid) {
1198 _cleanup_bus_error_free_ sd_bus_error error = SD_BUS_ERROR_NULL;
1199 _cleanup_bus_unref_ sd_bus *bus = NULL;
1205 r = sd_bus_default_system(&bus);
1207 log_error("Failed to open system bus: %s", strerror(-r));
1211 if (arg_keep_unit) {
1212 r = sd_bus_call_method(
1214 "org.freedesktop.machine1",
1215 "/org/freedesktop/machine1",
1216 "org.freedesktop.machine1.Manager",
1222 SD_BUS_MESSAGE_APPEND_ID128(arg_uuid),
1226 strempty(arg_directory));
1228 _cleanup_bus_message_unref_ sd_bus_message *m = NULL;
1230 r = sd_bus_message_new_method_call(
1233 "org.freedesktop.machine1",
1234 "/org/freedesktop/machine1",
1235 "org.freedesktop.machine1.Manager",
1238 log_error("Failed to create message: %s", strerror(-r));
1242 r = sd_bus_message_append(
1246 SD_BUS_MESSAGE_APPEND_ID128(arg_uuid),
1250 strempty(arg_directory));
1252 log_error("Failed to append message arguments: %s", strerror(-r));
1256 r = sd_bus_message_open_container(m, 'a', "(sv)");
1258 log_error("Failed to open container: %s", strerror(-r));
1262 if (!isempty(arg_slice)) {
1263 r = sd_bus_message_append(m, "(sv)", "Slice", "s", arg_slice);
1265 log_error("Failed to append slice: %s", strerror(-r));
1270 r = sd_bus_message_append(m, "(sv)", "DevicePolicy", "s", "strict");
1272 log_error("Failed to add device policy: %s", strerror(-r));
1276 r = sd_bus_message_append(m, "(sv)", "DeviceAllow", "a(ss)", 10,
1277 /* Allow the container to
1278 * access and create the API
1279 * device nodes, so that
1280 * PrivateDevices= in the
1281 * container can work
1286 "/dev/random", "rwm",
1287 "/dev/urandom", "rwm",
1289 /* Allow the container
1290 * access to ptys. However,
1292 * container to ever create
1293 * these device nodes. */
1294 "/dev/pts/ptmx", "rw",
1296 /* Allow the container
1297 * access to all kdbus
1298 * devices. Again, the
1299 * container cannot create
1300 * these nodes, only use
1301 * them. We use a pretty
1302 * open match here, so that
1303 * the kernel API can still
1306 "char-kdbus/*", "rw");
1308 log_error("Failed to add device whitelist: %s", strerror(-r));
1312 r = sd_bus_message_close_container(m);
1314 log_error("Failed to close container: %s", strerror(-r));
1318 r = sd_bus_call(bus, m, 0, &error, NULL);
1322 log_error("Failed to register machine: %s", bus_error_message(&error, r));
// Ask org.freedesktop.machine1 over the system bus to terminate the machine.
//
// A first call on the Manager interface yields a reply from which an object
// path is read; a second call is then made on the
// org.freedesktop.machine1.Machine interface. Failures of either call are
// only logged at debug level, since the machine may already be gone.
1329 static int terminate_machine(pid_t pid) {
1330 _cleanup_bus_error_free_ sd_bus_error error = SD_BUS_ERROR_NULL;
1331 _cleanup_bus_message_unref_ sd_bus_message *reply = NULL;
1332 _cleanup_bus_unref_ sd_bus *bus = NULL;
1339 r = sd_bus_default_system(&bus);
1341 log_error("Failed to open system bus: %s", strerror(-r));
1345 r = sd_bus_call_method(
1347 "org.freedesktop.machine1",
1348 "/org/freedesktop/machine1",
1349 "org.freedesktop.machine1.Manager",
1356 /* Note that the machine might already have been
1357 * cleaned up automatically, hence don't consider it a
1358 * failure if we cannot get the machine object. */
1359 log_debug("Failed to get machine: %s", bus_error_message(&error, r));
1363 r = sd_bus_message_read(reply, "o", &path);
1365 return bus_log_parse_error(r);
1367 r = sd_bus_call_method(
1369 "org.freedesktop.machine1",
1371 "org.freedesktop.machine1.Machine",
1377 log_debug("Failed to terminate machine: %s", bus_error_message(&error, r));
/* Reset the audit login UID of this process unless it is already reset.
 *
 * /proc/self/loginuid is read; unless it already holds 4294967295, that
 * value is written to it. If the write fails, a long explanatory error is
 * logged (the text mentions sleeping for 5s). arg_share_system is checked
 * first (the branch body is not visible here).
 *
 * NOTE(review): the log text reads "or to off auditing"; a word such as
 * "turn" appears to be missing -- confirm before changing the message. */
1384 static int reset_audit_loginuid(void) {
1385 _cleanup_free_ char *p = NULL;
1388 if (arg_share_system)
1391 r = read_one_line_file("/proc/self/loginuid", &p);
1395 log_error("Failed to read /proc/self/loginuid: %s", strerror(-r));
1399 /* Already reset? */
1400 if (streq(p, "4294967295"))
1403 r = write_string_file("/proc/self/loginuid", "4294967295");
1405 log_error("Failed to reset audit login UID. This probably means that your kernel is too\n"
1406 "old and you have audit enabled. Note that the auditing subsystem is known to\n"
1407 "be incompatible with containers on old kernels. Please make sure to upgrade\n"
1408 "your kernel or to off auditing with 'audit=0' on the kernel command line before\n"
1409 "using systemd-nspawn. Sleeping for 5s... (%s)\n", strerror(-r));
/* Fixed key passed to siphash24() in get_mac(). */
1417 #define HASH_KEY SD_ID128_MAKE(c3,c4,f9,19,b5,57,b2,1c,e6,cf,14,27,03,9c,ee,a2)
/* Derive a MAC address from the host machine ID and arg_machine and store
 * it in *mac.
 *
 * The buffer v holds the sd_id128_t obtained from sd_id128_get_machine()
 * followed by the bytes of arg_machine (without a terminating NUL). It is
 * hashed with siphash24() under HASH_KEY and the first ETH_ALEN bytes of the
 * result become the address. The multicast bit is then cleared and the
 * locally-assigned bit set. */
1419 static int get_mac(struct ether_addr *mac) {
1426 l = strlen(arg_machine);
1427 sz = sizeof(sd_id128_t) + l;
1430 /* fetch some persistent data unique to the host */
1431 r = sd_id128_get_machine((sd_id128_t*) v);
1435 /* combine with some data unique (on this host) to this
1436 * container instance */
1437 memcpy(v + sizeof(sd_id128_t), arg_machine, l);
1439 /* Let's hash the host machine ID plus the container name. We
1440 * use a fixed, but originally randomly created hash key here. */
1441 siphash24(result, v, sz, HASH_KEY.bytes);
1443 assert_cc(ETH_ALEN <= sizeof(result));
1444 memcpy(mac->ether_addr_octet, result, ETH_ALEN);
1446 /* see eth_random_addr in the kernel */
1447 mac->ether_addr_octet[0] &= 0xfe; /* clear multicast bit */
1448 mac->ether_addr_octet[0] |= 0x02; /* set local assignment bit (IEEE802) */
/* Create a veth pair between host and container via rtnetlink.
 *
 * arg_private_network and arg_network_veth are checked first (the branch
 * bodies are not visible here). The host-side name is written into
 * iface_name: the prefix vb- in bridge mode, ve- otherwise, followed by
 * arg_machine. The RTM_NEWLINK message nests LINKINFO / INFO_DATA (veth) /
 * VETH_INFO_PEER; the peer is named host0, gets the address in mac
 * (presumably filled by get_mac(); the call is not visible here) and is
 * placed in the network namespace of pid via IFLA_NET_NS_PID. */
1453 static int setup_veth(pid_t pid, char iface_name[IFNAMSIZ]) {
1454 _cleanup_rtnl_message_unref_ sd_rtnl_message *m = NULL;
1455 _cleanup_rtnl_unref_ sd_rtnl *rtnl = NULL;
1456 struct ether_addr mac;
1459 if (!arg_private_network)
1462 if (!arg_network_veth)
1465 /* Use two different interface name prefixes depending on whether
1466 * we are in bridge mode or not. */
1467 if (arg_network_bridge)
1468 memcpy(iface_name, "vb-", 3);
1470 memcpy(iface_name, "ve-", 3);
/* NOTE(review): strncpy() does not NUL-terminate when arg_machine has
 * IFNAMSIZ - 3 or more characters -- confirm that the caller zero-fills
 * iface_name or that the machine name length is bounded elsewhere. */
1471 strncpy(iface_name+3, arg_machine, IFNAMSIZ - 3);
1475 log_error("Failed to generate predictable MAC address for host0");
1479 r = sd_rtnl_open(&rtnl, 0);
1481 log_error("Failed to connect to netlink: %s", strerror(-r));
1485 r = sd_rtnl_message_new_link(rtnl, &m, RTM_NEWLINK, 0);
1487 log_error("Failed to allocate netlink message: %s", strerror(-r));
1491 r = sd_rtnl_message_append_string(m, IFLA_IFNAME, iface_name);
1493 log_error("Failed to add netlink interface name: %s", strerror(-r));
1497 r = sd_rtnl_message_open_container(m, IFLA_LINKINFO);
1499 log_error("Failed to open netlink container: %s", strerror(-r));
1503 r = sd_rtnl_message_open_container_union(m, IFLA_INFO_DATA, "veth");
1505 log_error("Failed to open netlink container: %s", strerror(-r));
1509 r = sd_rtnl_message_open_container(m, VETH_INFO_PEER);
1511 log_error("Failed to open netlink container: %s", strerror(-r));
1515 r = sd_rtnl_message_append_string(m, IFLA_IFNAME, "host0");
1517 log_error("Failed to add netlink interface name: %s", strerror(-r));
1521 r = sd_rtnl_message_append_ether_addr(m, IFLA_ADDRESS, &mac);
1523 log_error("Failed to add netlink MAC address: %s", strerror(-r));
1527 r = sd_rtnl_message_append_u32(m, IFLA_NET_NS_PID, pid);
1529 log_error("Failed to add netlink namespace field: %s", strerror(-r));
/* Three close calls, one for each container opened above. */
1533 r = sd_rtnl_message_close_container(m);
1535 log_error("Failed to close netlink container: %s", strerror(-r));
1539 r = sd_rtnl_message_close_container(m);
1541 log_error("Failed to close netlink container: %s", strerror(-r));
1545 r = sd_rtnl_message_close_container(m);
1547 log_error("Failed to close netlink container: %s", strerror(-r));
1551 r = sd_rtnl_call(rtnl, m, 0, NULL);
1553 log_error("Failed to add new veth interfaces: %s", strerror(-r));
/* Attach the host-side veth interface veth_name to arg_network_bridge.
 *
 * arg_private_network, arg_network_veth and arg_network_bridge are checked
 * first (the branch bodies are not visible here). The bridge name is
 * resolved with if_nametoindex(); an RTM_SETLINK message then sets IFF_UP,
 * names the interface via IFLA_IFNAME and sets IFLA_MASTER to the bridge
 * index. */
1560 static int setup_bridge(const char veth_name[]) {
1561 _cleanup_rtnl_message_unref_ sd_rtnl_message *m = NULL;
1562 _cleanup_rtnl_unref_ sd_rtnl *rtnl = NULL;
1565 if (!arg_private_network)
1568 if (!arg_network_veth)
1571 if (!arg_network_bridge)
1574 bridge = (int) if_nametoindex(arg_network_bridge);
1576 log_error("Failed to resolve interface %s: %m", arg_network_bridge);
1580 r = sd_rtnl_open(&rtnl, 0);
1582 log_error("Failed to connect to netlink: %s", strerror(-r));
1586 r = sd_rtnl_message_new_link(rtnl, &m, RTM_SETLINK, 0);
1588 log_error("Failed to allocate netlink message: %s", strerror(-r));
1592 r = sd_rtnl_message_link_set_flags(m, IFF_UP, IFF_UP);
1594 log_error("Failed to set IFF_UP flag: %s", strerror(-r));
1598 r = sd_rtnl_message_append_string(m, IFLA_IFNAME, veth_name);
1600 log_error("Failed to add netlink interface name field: %s", strerror(-r));
1604 r = sd_rtnl_message_append_u32(m, IFLA_MASTER, bridge);
1606 log_error("Failed to add netlink master field: %s", strerror(-r));
1610 r = sd_rtnl_call(rtnl, m, 0, NULL);
1612 log_error("Failed to add veth interface to bridge: %s", strerror(-r));
/* Resolve the network interface name and check it against udev.
 *
 * The index from if_nametoindex() is formatted as the udev device ID
 * n<index>; an error is logged when the index or the udev device cannot be
 * obtained, or when udev does not report the device as initialized. Callers
 * in this file use the return value as the interface index. */
1619 static int parse_interface(struct udev *udev, const char *name) {
1620 _cleanup_udev_device_unref_ struct udev_device *d = NULL;
1621 char ifi_str[2 + DECIMAL_STR_MAX(int)];
1624 ifi = (int) if_nametoindex(name);
1626 log_error("Failed to resolve interface %s: %m", name);
1630 sprintf(ifi_str, "n%i", ifi);
1631 d = udev_device_new_from_device_id(udev, ifi_str);
1633 log_error("Failed to get udev device for interface %s: %m", name);
1637 if (udev_device_get_is_initialized(d) <= 0) {
1638 log_error("Network interface %s is not initialized yet.", name);
/* move_network_interfaces(): for every name in arg_network_interfaces, send
 * an RTM_NEWLINK request for that interface index carrying
 * IFLA_NET_NS_PID = pid, i.e. ask the kernel to move the link into the
 * network namespace of process `pid`.
 *
 * NOTE(review): some original lines are elided in this listing (udev
 * context creation, error returns, loop variable declarations), so only the
 * visible calls are described. */
1645 static int move_network_interfaces(pid_t pid) {
1646 _cleanup_udev_unref_ struct udev *udev = NULL;
1647 _cleanup_rtnl_unref_ sd_rtnl *rtnl = NULL;
/* Tested first: private networking and a non-empty interface list
 * (presumably early exits otherwise -- TODO confirm). */
1651 if (!arg_private_network)
1654 if (strv_isempty(arg_network_interfaces))
1657 r = sd_rtnl_open(&rtnl, 0);
1659 log_error("Failed to connect to netlink: %s", strerror(-r));
1665 log_error("Failed to connect to udev.");
1669 STRV_FOREACH(i, arg_network_interfaces) {
/* A fresh message per interface; released by the cleanup attribute. */
1670 _cleanup_rtnl_message_unref_ sd_rtnl_message *m = NULL;
/* parse_interface() does the name lookup and the udev initialization check. */
1673 ifi = parse_interface(udev, *i);
1677 r = sd_rtnl_message_new_link(rtnl, &m, RTM_NEWLINK, ifi);
1679 log_error("Failed to allocate netlink message: %s", strerror(-r));
1683 r = sd_rtnl_message_append_u32(m, IFLA_NET_NS_PID, pid);
1685 log_error("Failed to append namespace PID to netlink message: %s", strerror(-r));
1689 r = sd_rtnl_call(rtnl, m, 0, NULL);
1691 log_error("Failed to move interface %s to namespace: %s", *i, strerror(-r));
/* setup_macvlan(): for every parent interface named in arg_network_macvlan,
 * build and send an RTM_NEWLINK request that creates a macvlan link on top
 * of it (IFLA_LINK = parent index), in mode MACVLAN_MODE_BRIDGE, placed in
 * the network namespace of process `pid` (IFLA_NET_NS_PID).
 *
 * The new link is named "mv-" followed by the parent name, shortened to
 * IFNAMSIZ-1 characters.
 *
 * NOTE(review): some original lines are elided in this listing (udev
 * context creation, error returns, allocation checks), so only the visible
 * calls are described. */
1699 static int setup_macvlan(pid_t pid) {
1700 _cleanup_udev_unref_ struct udev *udev = NULL;
1701 _cleanup_rtnl_unref_ sd_rtnl *rtnl = NULL;
1705 if (!arg_private_network)
1708 if (strv_isempty(arg_network_macvlan))
1711 r = sd_rtnl_open(&rtnl, 0);
1713 log_error("Failed to connect to netlink: %s", strerror(-r));
1719 log_error("Failed to connect to udev.");
1723 STRV_FOREACH(i, arg_network_macvlan) {
/* Per-iteration message and name buffer, both freed by cleanup attributes. */
1724 _cleanup_rtnl_message_unref_ sd_rtnl_message *m = NULL;
1725 _cleanup_free_ char *n = NULL;
1728 ifi = parse_interface(udev, *i);
/* Index 0 here: a new link is being created; the parent goes in IFLA_LINK. */
1732 r = sd_rtnl_message_new_link(rtnl, &m, RTM_NEWLINK, 0);
1734 log_error("Failed to allocate netlink message: %s", strerror(-r));
1738 r = sd_rtnl_message_append_u32(m, IFLA_LINK, ifi);
1740 log_error("Failed to add netlink interface index: %s", strerror(-r));
/* Derive the macvlan name from the parent name and clamp its length. */
1744 n = strappend("mv-", *i);
1748 strshorten(n, IFNAMSIZ-1);
1750 r = sd_rtnl_message_append_string(m, IFLA_IFNAME, n);
1752 log_error("Failed to add netlink interface name: %s", strerror(-r));
1756 r = sd_rtnl_message_append_u32(m, IFLA_NET_NS_PID, pid);
1758 log_error("Failed to add netlink namespace field: %s", strerror(-r));
/* Nested attributes: IFLA_LINKINFO containing IFLA_INFO_DATA of kind
 * "macvlan", which in turn holds the macvlan mode. */
1762 r = sd_rtnl_message_open_container(m, IFLA_LINKINFO);
1764 log_error("Failed to open netlink container: %s", strerror(-r));
1768 r = sd_rtnl_message_open_container_union(m, IFLA_INFO_DATA, "macvlan");
1770 log_error("Failed to open netlink container: %s", strerror(-r));
1774 r = sd_rtnl_message_append_u32(m, IFLA_MACVLAN_MODE, MACVLAN_MODE_BRIDGE);
1776 log_error("Failed to append macvlan mode: %s", strerror(-r));
/* Two closes, matching the two containers opened above. */
1780 r = sd_rtnl_message_close_container(m);
1782 log_error("Failed to close netlink container: %s", strerror(-r));
1786 r = sd_rtnl_message_close_container(m);
1788 log_error("Failed to close netlink container: %s", strerror(-r));
1792 r = sd_rtnl_call(rtnl, m, 0, NULL);
1794 log_error("Failed to add new macvlan interfaces: %s", strerror(-r));
/* audit_still_doesnt_work_in_containers(): install a seccomp filter whose
 * default action is SCMP_ACT_ALLOW, plus one rule that returns the errno
 * EAFNOSUPPORT when argument 0 equals AF_NETLINK and argument 2 equals
 * NETLINK_AUDIT. Before loading the filter, the SCMP_FLTATR_CTL_NNP
 * attribute is set to 0 (the matching log message reads "Failed to unset
 * NO_NEW_PRIVS"). The filter context is released with seccomp_release().
 *
 * NOTE(review): this listing elides some original lines; the syscall the
 * rule is attached to, the comment delimiters around the explanatory text
 * in the body, and the return statements are not visible. The explanatory
 * text itself refers to socket(AF_NETLINK, *, NETLINK_AUDIT). */
1802 static int audit_still_doesnt_work_in_containers(void) {
1805 scmp_filter_ctx seccomp;
1809 Audit is broken in containers, much of the userspace audit
1810 hookup will fail if running inside a container. We don't
1811 care and just turn off creation of audit sockets.
1813 This will make socket(AF_NETLINK, *, NETLINK_AUDIT) fail
1814 with EAFNOSUPPORT which audit userspace uses as indication
1815 that audit is disabled in the kernel.
1818 seccomp = seccomp_init(SCMP_ACT_ALLOW);
1822 r = seccomp_add_secondary_archs(seccomp);
1824 log_error("Failed to add secondary archs to seccomp filter: %s", strerror(-r));
1828 r = seccomp_rule_add(
1830 SCMP_ACT_ERRNO(EAFNOSUPPORT),
1833 SCMP_A0(SCMP_CMP_EQ, AF_NETLINK),
1834 SCMP_A2(SCMP_CMP_EQ, NETLINK_AUDIT));
1836 log_error("Failed to add audit seccomp rule: %s", strerror(-r));
1840 r = seccomp_attr_set(seccomp, SCMP_FLTATR_CTL_NNP, 0);
1842 log_error("Failed to unset NO_NEW_PRIVS: %s", strerror(-r));
1846 r = seccomp_load(seccomp);
1848 log_error("Failed to install seccomp audit filter: %s", strerror(-r));
1851 seccomp_release(seccomp);
/* setup_image(): open arg_image and, when it is a regular file, attach it
 * to a freshly allocated loop device.
 *
 * Visible behaviour:
 *  - arg_image is opened read-only or read-write depending on arg_read_only.
 *  - If it is a block device, its path is duplicated with strdup(); the
 *    rest of that branch is not visible in this listing.
 *  - Otherwise it must be a regular file. A free loop device number is
 *    obtained from /dev/loop-control (LOOP_CTL_GET_FREE), /dev/loop<nr> is
 *    opened, the image fd is attached with LOOP_SET_FD, and the loop flags
 *    LO_FLAGS_AUTOCLEAR|LO_FLAGS_PARTSCAN (optionally LO_FLAGS_READ_ONLY)
 *    are applied with LOOP_SET_STATUS64.
 *  - On that path *device_path receives the loop device path.
 *
 * NOTE(review): some original lines are elided (error returns, the
 * condition guarding LO_FLAGS_READ_ONLY, the use of loop_nr and the return
 * value). main() stores the result in image_fd, so presumably an open file
 * descriptor is returned on success -- TODO confirm. */
1859 static int setup_image(char **device_path, int *loop_nr) {
1860 struct loop_info64 info = {
1861 .lo_flags = LO_FLAGS_AUTOCLEAR|LO_FLAGS_PARTSCAN
/* All three descriptors are closed automatically by the cleanup attribute. */
1863 _cleanup_close_ int fd = -1, control = -1, loop = -1;
1864 _cleanup_free_ char* loopdev = NULL;
1868 assert(device_path);
1871 fd = open(arg_image, O_CLOEXEC|(arg_read_only ? O_RDONLY : O_RDWR)|O_NONBLOCK|O_NOCTTY);
1873 log_error("Failed to open %s: %m", arg_image);
1877 if (fstat(fd, &st) < 0) {
1878 log_error("Failed to stat %s: %m", arg_image);
1882 if (S_ISBLK(st.st_mode)) {
1885 p = strdup(arg_image);
/* NOTE(review): the message below uses %m, but the S_ISREG() test is not a
 * failing system call, so the appended errno text looks unrelated to the
 * actual problem -- consider dropping %m. */
1899 if (!S_ISREG(st.st_mode)) {
1900 log_error("%s is not a regular file or block device: %m", arg_image);
1904 control = open("/dev/loop-control", O_RDWR|O_CLOEXEC|O_NOCTTY|O_NONBLOCK);
1906 log_error("Failed to open /dev/loop-control: %m");
/* Ask the loop control device for an unused loop device number. */
1910 nr = ioctl(control, LOOP_CTL_GET_FREE);
1912 log_error("Failed to allocate loop device: %m");
1916 if (asprintf(&loopdev, "/dev/loop%i", nr) < 0)
1919 loop = open(loopdev, O_CLOEXEC|(arg_read_only ? O_RDONLY : O_RDWR)|O_NONBLOCK|O_NOCTTY);
1921 log_error("Failed to open loop device %s: %m", loopdev);
/* Bind the image file to the loop device. */
1925 if (ioctl(loop, LOOP_SET_FD, fd) < 0) {
1926 log_error("Failed to set loopback file descriptor on %s: %m", loopdev);
1931 info.lo_flags |= LO_FLAGS_READ_ONLY;
1933 if (ioctl(loop, LOOP_SET_STATUS64, &info) < 0) {
1934 log_error("Failed to set loopback settings on %s: %m", loopdev);
1938 *device_path = loopdev;
/* dissect_image(): inspect the partition table of the image and pick the
 * partitions to use for the root, /home and /srv file systems.
 *
 * Visible behaviour:
 *  - A libblkid probe is run on `fd` with partition probing enabled; the
 *    partition table type (PTTYPE) must be "gpt".
 *  - The block device behind `fd` is looked up in udev and its child
 *    devices are enumerated.
 *  - For each enumerated device the matching blkid partition is fetched,
 *    its GPT flags, partition number and type string are read, and the type
 *    is compared against GPT_HOME, GPT_SRV, GPT_ROOT_NATIVE and
 *    GPT_ROOT_SECONDARY (the latter two only when defined).
 *  - The read-write flag of each selected partition is the negation of
 *    GPT_FLAG_READ_ONLY.
 *  - At the end, a native root is preferred over a secondary root; device
 *    node strings and rw flags are handed back through the out parameters.
 *  - Without blkid support only an error is logged.
 *
 * NOTE(review): this listing elides many original lines (part of the
 * parameter list, error returns, the bodies of most conditionals), so the
 * skip/continue behaviour of the individual checks and the return value are
 * not visible here. */
1949 static int dissect_image(
1951 char **root_device, bool *root_device_rw,
1952 char **home_device, bool *home_device_rw,
1953 char **srv_device, bool *srv_device_rw,
/* Partition numbers of the current candidates; -1 means none chosen yet. */
1957 int home_nr = -1, root_nr = -1, secondary_root_nr = -1, srv_nr = -1;
1958 _cleanup_free_ char *home = NULL, *root = NULL, *secondary_root = NULL, *srv = NULL;
1959 _cleanup_udev_enumerate_unref_ struct udev_enumerate *e = NULL;
1960 _cleanup_udev_device_unref_ struct udev_device *d = NULL;
1961 _cleanup_blkid_free_probe_ blkid_probe b = NULL;
1962 _cleanup_udev_unref_ struct udev *udev = NULL;
1963 struct udev_list_entry *first, *item;
1964 bool home_rw = true, root_rw = true, secondary_root_rw = true, srv_rw = true;
1965 const char *pttype = NULL;
1971 assert(root_device);
1972 assert(home_device);
1976 b = blkid_new_probe();
1981 r = blkid_probe_set_device(b, fd, 0, 0);
1986 log_error("Failed to set device on blkid probe: %m");
1990 blkid_probe_enable_partitions(b, 1);
1991 blkid_probe_set_partitions_flags(b, BLKID_PARTS_ENTRY_DETAILS);
/* Results -2 and 1 are reported as "no partition table identified"; any
 * other non-zero result is reported as a probe failure. */
1994 r = blkid_do_safeprobe(b);
1995 if (r == -2 || r == 1) {
1996 log_error("Failed to identify any partition table on %s.\n"
1997 "Note that the disk image needs to follow http://www.freedesktop.org/wiki/Specifications/DiscoverablePartitionsSpec/ to be supported by systemd-nspawn.", arg_image);
1999 } else if (r != 0) {
2002 log_error("Failed to probe: %m");
/* Only GPT images are accepted; streq_ptr() is used because pttype stays
 * NULL when the lookup yields nothing. */
2006 blkid_probe_lookup_value(b, "PTTYPE", &pttype, NULL);
2007 if (!streq_ptr(pttype, "gpt")) {
2008 log_error("Image %s does not carry a GUID Partition Table.\n"
2009 "Note that the disk image needs to follow http://www.freedesktop.org/wiki/Specifications/DiscoverablePartitionsSpec/ to be supported by systemd-nspawn.", arg_image);
2014 pl = blkid_probe_get_partitions(b);
2019 log_error("Failed to list partitions of %s", arg_image);
/* Find the udev device for the block device behind fd and enumerate the
 * devices that have it as parent. */
2027 if (fstat(fd, &st) < 0) {
2028 log_error("Failed to stat block device: %m");
2032 d = udev_device_new_from_devnum(udev, 'b', st.st_rdev);
2036 e = udev_enumerate_new(udev);
2040 r = udev_enumerate_add_match_parent(e, d);
2044 r = udev_enumerate_scan_devices(e);
2046 log_error("Failed to scan for partition devices of %s: %s", arg_image, strerror(-r));
2050 first = udev_enumerate_get_list_entry(e);
2051 udev_list_entry_foreach(item, first) {
2052 _cleanup_udev_device_unref_ struct udev_device *q;
2053 const char *stype, *node;
2054 unsigned long long flags;
2061 q = udev_device_new_from_syspath(udev, udev_list_entry_get_name(item));
2066 log_error("Failed to get partition device of %s: %m", arg_image);
2070 qn = udev_device_get_devnum(q);
/* Same device number as the whole disk: this entry is the parent itself
 * (presumably skipped -- TODO confirm). */
2074 if (st.st_rdev == qn)
2077 node = udev_device_get_devnode(q);
/* Map the udev device back to its blkid partition entry by device number. */
2081 pp = blkid_partlist_devno_to_partition(pl, qn);
2085 flags = blkid_partition_get_flags(pp);
2086 if (flags & GPT_FLAG_NO_AUTO)
2089 nr = blkid_partition_get_partno(pp);
2093 stype = blkid_partition_get_type_string(pp);
2097 if (sd_id128_from_string(stype, &type_id) < 0)
2100 if (sd_id128_equal(type_id, GPT_HOME)) {
/* An existing candidate is compared by partition number; presumably the
 * lowest-numbered partition of each type wins -- TODO confirm. */
2102 if (home && nr >= home_nr)
2106 home_rw = !(flags & GPT_FLAG_READ_ONLY);
2109 home = strdup(node);
2112 } else if (sd_id128_equal(type_id, GPT_SRV)) {
2114 if (srv && nr >= srv_nr)
2118 srv_rw = !(flags & GPT_FLAG_READ_ONLY);
2125 #ifdef GPT_ROOT_NATIVE
2126 else if (sd_id128_equal(type_id, GPT_ROOT_NATIVE)) {
2128 if (root && nr >= root_nr)
2132 root_rw = !(flags & GPT_FLAG_READ_ONLY);
2135 root = strdup(node);
2140 #ifdef GPT_ROOT_SECONDARY
2141 else if (sd_id128_equal(type_id, GPT_ROOT_SECONDARY)) {
2143 if (secondary_root && nr >= secondary_root_nr)
2146 secondary_root_nr = nr;
2147 secondary_root_rw = !(flags & GPT_FLAG_READ_ONLY);
2150 free(secondary_root);
2151 secondary_root = strdup(node);
2152 if (!secondary_root)
/* At least one kind of root partition is required. */
2158 if (!root && !secondary_root) {
2159 log_error("Failed to identify root partition in disk image %s.\n"
2160 "Note that the disk image needs to follow http://www.freedesktop.org/wiki/Specifications/DiscoverablePartitionsSpec/ to be supported by systemd-nspawn.", arg_image);
/* Hand the chosen strings to the caller. The local pointer is reset to
 * NULL after the hand-over (visible for secondary_root) so the cleanup
 * attribute does not free what the caller now owns. */
2165 *root_device = root;
2168 *root_device_rw = root_rw;
2170 } else if (secondary_root) {
2171 *root_device = secondary_root;
2172 secondary_root = NULL;
2174 *root_device_rw = secondary_root_rw;
2179 *home_device = home;
2182 *home_device_rw = home_rw;
2189 *srv_device_rw = srv_rw;
2194 log_error("--image= is not supported, compiled without blkid support.");
/* mount_device(): detect the file system type of the block device `what`
 * with libblkid and mount it on the path formed from `where` and
 * `directory`.
 *
 * The mount uses MS_NODEV, plus MS_RDONLY when `rw` is false. Devices whose
 * detected type is "crypto_LUKS" are refused with an error. Without blkid
 * support only an error is logged.
 *
 * NOTE(review): some original lines are elided in this listing (error
 * returns, the condition around the path concatenation, the return value),
 * so only the visible calls are described. */
2199 static int mount_device(const char *what, const char *where, const char *directory, bool rw) {
2201 _cleanup_blkid_free_probe_ blkid_probe b = NULL;
2202 const char *fstype, *p;
/* Target mount point: `where` with `directory` appended. */
2212 p = strappenda(where, directory);
2217 b = blkid_new_probe_from_filename(what);
2221 log_error("Failed to allocate prober for %s: %m", what);
/* Only the superblock TYPE value is requested from the probe. */
2225 blkid_probe_enable_superblocks(b, 1);
2226 blkid_probe_set_superblocks_flags(b, BLKID_SUBLKS_TYPE);
/* NOTE(review): dissect_image() treats results -2 and 1 of
 * blkid_do_safeprobe() as "nothing usable detected", while this function
 * tests -1 and 1. libblkid documents -2 as the ambivalent-result code and
 * -1 as the error code, so -1 here looks like it should be -2 -- TODO
 * confirm against the libblkid reference. */
2229 r = blkid_do_safeprobe(b);
2230 if (r == -1 || r == 1) {
2231 log_error("Cannot determine file system type of %s", what);
2233 } else if (r != 0) {
2236 log_error("Failed to probe %s: %m", what);
2241 if (blkid_probe_lookup_value(b, "TYPE", &fstype, NULL) < 0) {
2244 log_error("Failed to determine file system type of %s", what);
2248 if (streq(fstype, "crypto_LUKS")) {
2249 log_error("nspawn currently does not support LUKS disk images.");
2253 if (mount(what, p, fstype, MS_NODEV|(rw ? 0 : MS_RDONLY), NULL) < 0) {
2254 log_error("Failed to mount %s: %m", what);
2260 log_error("--image= is not supported, compiled without blkid support.");
/* mount_devices(): mount the partitions selected from a disk image below
 * the container root: the root device on arg_directory itself, the home
 * device on "/home" and the srv device on "/srv" beneath it. Each mount is
 * delegated to mount_device() with the matching read-write flag, and a
 * failure is logged with the error text for the returned code.
 *
 * NOTE(review): the first parameter, the conditions guarding each mount and
 * the return statements are elided in this listing. All three calls use the
 * global arg_directory as the base path. */
2265 static int mount_devices(
2267 const char *root_device, bool root_device_rw,
2268 const char *home_device, bool home_device_rw,
2269 const char *srv_device, bool srv_device_rw) {
2275 r = mount_device(root_device, arg_directory, NULL, root_device_rw);
2277 log_error("Failed to mount root directory: %s", strerror(-r));
2283 r = mount_device(home_device, arg_directory, "/home", home_device_rw);
2285 log_error("Failed to mount home directory: %s", strerror(-r));
2291 r = mount_device(srv_device, arg_directory, "/srv", srv_device_rw);
2293 log_error("Failed to mount server data directory: %s", strerror(-r));
/* loop_remove(): tear down the loop device with number `nr`.
 *
 * If *image_fd refers to an open descriptor, LOOP_CLR_FD is issued on it
 * and the descriptor is closed; the value returned by safe_close() is
 * stored back into *image_fd. Afterwards /dev/loop-control is opened and
 * LOOP_CTL_REMOVE is issued for `nr`. The ioctl results are not checked in
 * the visible code.
 *
 * NOTE(review): some original lines are elided in this listing (for example
 * any check on `nr` or on the result of open()). */
2301 static void loop_remove(int nr, int *image_fd) {
2302 _cleanup_close_ int control = -1;
2307 if (image_fd && *image_fd >= 0) {
2308 ioctl(*image_fd, LOOP_CLR_FD);
2309 *image_fd = safe_close(*image_fd);
2312 control = open("/dev/loop-control", O_RDWR|O_CLOEXEC|O_NOCTTY|O_NONBLOCK);
2316 ioctl(control, LOOP_CTL_REMOVE, nr);
/* spawn_getent(): run "getent <database> <key>" in a forked child whose
 * standard output is connected to a pipe.
 *
 * In the child, the write end of the pipe becomes stdout, stdin and stderr
 * are redirected to /dev/null, signal handlers are reset, all other file
 * descriptors are closed and getent is executed with an empty environment,
 * trying /usr/bin/getent first and /bin/getent second. The child exits with
 * EXIT_FAILURE if any step fails. The parent closes its copy of the write
 * end.
 *
 * NOTE(review): the fork call, the use of `rpid` and the return statement
 * are elided in this listing. Callers pass the result to fdopen() for
 * reading, so presumably the read end of the pipe is returned and the child
 * pid is stored in *rpid -- TODO confirm. */
2319 static int spawn_getent(const char *database, const char *key, pid_t *rpid) {
2327 if (pipe2(pipe_fds, O_CLOEXEC) < 0) {
2328 log_error("Failed to allocate pipe: %m");
2334 log_error("Failed to fork getent child: %m");
2336 } else if (pid == 0) {
/* A NULL-only array: passing its address to execle() gives an empty environment. */
2338 char *empty_env = NULL;
2340 if (dup3(pipe_fds[1], STDOUT_FILENO, 0) < 0)
2341 _exit(EXIT_FAILURE);
/* Close the original pipe descriptors unless they already occupy one of
 * the standard descriptor slots 0..2. */
2343 if (pipe_fds[0] > 2)
2344 safe_close(pipe_fds[0]);
2345 if (pipe_fds[1] > 2)
2346 safe_close(pipe_fds[1]);
2348 nullfd = open("/dev/null", O_RDWR);
2350 _exit(EXIT_FAILURE);
2352 if (dup3(nullfd, STDIN_FILENO, 0) < 0)
2353 _exit(EXIT_FAILURE);
2355 if (dup3(nullfd, STDERR_FILENO, 0) < 0)
2356 _exit(EXIT_FAILURE);
2361 reset_all_signal_handlers();
2362 close_all_fds(NULL, 0);
/* The second execle() and the _exit() are reached only if the preceding
 * exec attempt returned, i.e. failed. */
2364 execle("/usr/bin/getent", "getent", database, key, NULL, &empty_env);
2365 execle("/bin/getent", "getent", database, key, NULL, &empty_env);
2366 _exit(EXIT_FAILURE);
/* Parent side: drop the write end so only the child holds it. */
2369 pipe_fds[1] = safe_close(pipe_fds[1]);
/* change_uid_gid(): switch the process credentials to the user named by
 * arg_user.
 *
 * Visible behaviour:
 *  - When arg_user is unset, "root" or "0": supplementary groups are
 *    cleared and gid/uid are reset to 0.
 *  - Otherwise the passwd entry of the user is obtained by running
 *    "getent passwd <user>" through spawn_getent(); the first output line
 *    is split on ':' and the UID and GID fields are parsed.
 *  - Supplementary groups are obtained from "getent initgroups <user>";
 *    the words after the user name are parsed as numeric ids.
 *  - The home directory is created (parents with mode 0775, the directory
 *    itself with mode 0755 owned by uid:gid; -EEXIST is tolerated).
 *  - stdin, stdout and stderr are chowned to the user, then setgroups(),
 *    setresgid() and setresuid() are applied in that order.
 *
 * NOTE(review): many original lines are elided in this listing (error
 * returns, field terminators, the hand-over of `home` through `_home`), so
 * the return value and the use of `_home` are not visible here. */
2376 static int change_uid_gid(char **_home) {
2377 char line[LINE_MAX], *w, *x, *state, *u, *g, *h;
/* Despite its name, this array collects supplementary group ids; it is
 * what gets passed to setgroups() below. */
2378 _cleanup_free_ uid_t *uids = NULL;
2379 _cleanup_free_ char *home = NULL;
2380 _cleanup_fclose_ FILE *f = NULL;
2381 _cleanup_close_ int fd = -1;
2382 unsigned n_uids = 0;
2391 if (!arg_user || streq(arg_user, "root") || streq(arg_user, "0")) {
2392 /* Reset everything fully to 0, just in case */
2394 if (setgroups(0, NULL) < 0) {
2395 log_error("setgroups() failed: %m");
/* NOTE(review): the two messages below name setregid()/setreuid(), but the
 * calls are setresgid()/setresuid(). */
2399 if (setresgid(0, 0, 0) < 0) {
2400 log_error("setregid() failed: %m");
2404 if (setresuid(0, 0, 0) < 0) {
2405 log_error("setreuid() failed: %m");
2413 /* First, get user credentials */
2414 fd = spawn_getent("passwd", arg_user, &pid);
2418 f = fdopen(fd, "r");
2423 if (!fgets(line, sizeof(line), f)) {
2426 log_error("Failed to resolve user %s.", arg_user);
2430 log_error("Failed to read from getent: %m");
2436 wait_for_terminate_and_warn("getent passwd", pid);
/* Walk the colon-separated passwd fields; each missing separator is
 * reported as an invalid field. */
2438 x = strchr(line, ':');
2440 log_error("/etc/passwd entry has invalid user field.");
2444 u = strchr(x+1, ':');
2446 log_error("/etc/passwd entry has invalid password field.");
2453 log_error("/etc/passwd entry has invalid UID field.");
2461 log_error("/etc/passwd entry has invalid GID field.");
2466 h = strchr(x+1, ':');
2468 log_error("/etc/passwd entry has invalid GECOS field.");
2475 log_error("/etc/passwd entry has invalid home directory field.");
2481 r = parse_uid(u, &uid);
2483 log_error("Failed to parse UID of user.");
2487 r = parse_gid(g, &gid);
2489 log_error("Failed to parse GID of user.");
2497 /* Second, get group memberships */
2498 fd = spawn_getent("initgroups", arg_user, &pid);
2503 f = fdopen(fd, "r");
2508 if (!fgets(line, sizeof(line), f)) {
2510 log_error("Failed to resolve user %s.", arg_user);
2514 log_error("Failed to read from getent: %m");
2520 wait_for_terminate_and_warn("getent initgroups", pid);
2522 /* Skip over the username and subsequent separator whitespace */
2524 x += strcspn(x, WHITESPACE);
2525 x += strspn(x, WHITESPACE);
/* Each remaining word is parsed as one numeric id and appended to uids. */
2527 FOREACH_WORD(w, l, x, state) {
2533 if (!GREEDY_REALLOC(uids, sz, n_uids+1))
2536 r = parse_uid(c, &uids[n_uids++]);
2538 log_error("Failed to parse group data from getent.");
2543 r = mkdir_parents(home, 0775);
2545 log_error("Failed to make home root directory: %s", strerror(-r));
2549 r = mkdir_safe(home, 0755, uid, gid);
2550 if (r < 0 && r != -EEXIST) {
2551 log_error("Failed to make home directory: %s", strerror(-r));
/* The results of these three fchown() calls are not checked. */
2555 fchown(STDIN_FILENO, uid, gid);
2556 fchown(STDOUT_FILENO, uid, gid);
2557 fchown(STDERR_FILENO, uid, gid);
/* Order as written: supplementary groups, then gid, then uid. */
2559 if (setgroups(n_uids, uids) < 0) {
2560 log_error("Failed to set auxiliary groups: %m");
/* NOTE(review): same message/call mismatch as in the root branch above. */
2564 if (setresgid(gid, gid, gid) < 0) {
2565 log_error("setregid() failed: %m");
2569 if (setresuid(uid, uid, uid) < 0) {
2570 log_error("setreuid() failed: %m");
2583 * Return 0 in case the container is being rebooted, has been shut
2584 * down or exited successfully. On failures a negative value is
2587 * The status of the container "CONTAINER_TERMINATED" or
2588 * "CONTAINER_REBOOTED" will be saved in the container argument
/* wait_for_container(): wait for the container process `pid` to terminate
 * and classify the outcome from the siginfo returned by
 * wait_for_terminate().
 *
 * Visible behaviour of the switch on status.si_code:
 *  - an exit: r takes status.si_status; success is logged at debug level
 *    and *container is set to CONTAINER_TERMINATED, otherwise the error
 *    code is logged;
 *  - termination by SIGINT: logged as shut down, CONTAINER_TERMINATED;
 *  - termination by SIGHUP: logged as being rebooted, CONTAINER_REBOOTED;
 *  - any other signal: logged as an error with the signal name;
 *  - anything else: logged as failure for an unknown reason.
 *
 * NOTE(review): the case labels, the tests selecting between the log
 * branches and the return statements are elided in this listing. */
2590 static int wait_for_container(pid_t pid, ContainerStatus *container) {
2594 r = wait_for_terminate(pid, &status);
2598 switch (status.si_code) {
2600 r = status.si_status;
2603 log_debug("Container %s exited successfully.",
2606 *container = CONTAINER_TERMINATED;
2608 log_error("Container %s failed with error code %i.",
2609 arg_machine, status.si_status);
2615 if (status.si_status == SIGINT) {
2617 log_info("Container %s has been shut down.",
2620 *container = CONTAINER_TERMINATED;
2623 } else if (status.si_status == SIGHUP) {
2625 log_info("Container %s is being rebooted.",
2628 *container = CONTAINER_REBOOTED;
2632 /* CLD_KILLED fallthrough */
2635 log_error("Container %s terminated by signal %s.",
2636 arg_machine, signal_to_string(status.si_status));
2641 log_error("Container %s failed due to unknown reason.",
/* nop_handler(): signal handler that intentionally does nothing. main()
 * installs it for SIGCHLD via sigaction(), so that the arrival of the
 * signal interrupts blocking calls in the parent without any further
 * action being taken in the handler. */
2650 static void nop_handler(int sig) {}
2652 int main(int argc, char *argv[]) {
2654 _cleanup_free_ char *kdbus_domain = NULL, *device_path = NULL, *root_device = NULL, *home_device = NULL, *srv_device = NULL;
2655 bool root_device_rw = true, home_device_rw = true, srv_device_rw = true;
2656 _cleanup_close_ int master = -1, kdbus_fd = -1, image_fd = -1;
2657 _cleanup_close_pair_ int kmsg_socket_pair[2] = { -1, -1 };
2658 _cleanup_fdset_free_ FDSet *fds = NULL;
2659 int r = EXIT_FAILURE, k, n_fd_passed, loop_nr = -1;
2660 const char *console = NULL;
2661 char veth_name[IFNAMSIZ];
2662 bool secondary = false;
2663 sigset_t mask, mask_chld;
2666 log_parse_environment();
2669 k = parse_argv(argc, argv);
2678 if (arg_directory) {
2681 p = path_make_absolute_cwd(arg_directory);
2682 free(arg_directory);
2685 arg_directory = get_current_dir_name();
2687 if (!arg_directory) {
2688 log_error("Failed to determine path, please use -D.");
2691 path_kill_slashes(arg_directory);
2695 arg_machine = strdup(basename(arg_image ? arg_image : arg_directory));
2701 hostname_cleanup(arg_machine, false);
2702 if (isempty(arg_machine)) {
2703 log_error("Failed to determine machine name automatically, please use -M.");
2708 if (geteuid() != 0) {
2709 log_error("Need to be root.");
2713 if (sd_booted() <= 0) {
2714 log_error("Not running on a systemd system.");
2719 n_fd_passed = sd_listen_fds(false);
2720 if (n_fd_passed > 0) {
2721 k = fdset_new_listen_fds(&fds, false);
2723 log_error("Failed to collect file descriptors: %s", strerror(-k));
2727 fdset_close_others(fds);
2730 if (arg_directory) {
2731 if (path_equal(arg_directory, "/")) {
2732 log_error("Spawning container on root directory not supported.");
2737 if (path_is_os_tree(arg_directory) <= 0) {
2738 log_error("Directory %s doesn't look like an OS root directory (/etc/os-release is missing). Refusing.", arg_directory);
2744 p = strappenda(arg_directory,
2745 argc > optind && path_is_absolute(argv[optind]) ? argv[optind] : "/usr/bin/");
2746 if (access(p, F_OK) < 0) {
2747 log_error("Directory %s lacks the binary to execute or doesn't look like a binary tree. Refusing.", arg_directory);
2753 char template[] = "/tmp/nspawn-root-XXXXXX";
2755 if (!mkdtemp(template)) {
2756 log_error("Failed to create temporary directory: %m");
2761 arg_directory = strdup(template);
2762 if (!arg_directory) {
2767 image_fd = setup_image(&device_path, &loop_nr);
2773 r = dissect_image(image_fd, &root_device, &root_device_rw, &home_device, &home_device_rw, &srv_device, &srv_device_rw, &secondary);
2778 master = posix_openpt(O_RDWR|O_NOCTTY|O_CLOEXEC|O_NDELAY);
2780 log_error("Failed to acquire pseudo tty: %m");
2784 console = ptsname(master);
2786 log_error("Failed to determine tty name: %m");
2791 log_info("Spawning container %s on %s.\nPress ^] three times within 1s to kill container.",
2792 arg_machine, arg_image ? arg_image : arg_directory);
2794 if (unlockpt(master) < 0) {
2795 log_error("Failed to unlock tty: %m");
2799 if (access("/dev/kdbus/control", F_OK) >= 0) {
2801 if (arg_share_system) {
2802 kdbus_domain = strdup("/dev/kdbus");
2803 if (!kdbus_domain) {
2810 ns = strappenda("machine-", arg_machine);
2811 kdbus_fd = bus_kernel_create_domain(ns, &kdbus_domain);
2813 log_debug("Failed to create kdbus domain: %s", strerror(-r));
2815 log_debug("Successfully created kdbus domain as %s", kdbus_domain);
2819 if (socketpair(AF_UNIX, SOCK_DGRAM|SOCK_NONBLOCK|SOCK_CLOEXEC, 0, kmsg_socket_pair) < 0) {
2820 log_error("Failed to create kmsg socket pair: %m");
2824 sd_notify(0, "READY=1");
2826 assert_se(sigemptyset(&mask) == 0);
2827 assert_se(sigemptyset(&mask_chld) == 0);
2828 sigaddset(&mask_chld, SIGCHLD);
2829 sigset_add_many(&mask, SIGCHLD, SIGWINCH, SIGTERM, SIGINT, -1);
2830 assert_se(sigprocmask(SIG_BLOCK, &mask, NULL) == 0);
2833 ContainerStatus container_status;
2834 int eventfds[2] = { -1, -1 };
2835 struct sigaction sa = {
2836 .sa_handler = nop_handler,
2837 .sa_flags = SA_NOCLDSTOP,
2840 /* Child can be killed before execv(), so handle SIGCHLD
2841 * in order to interrupt parent's blocking calls and
2842 * give it a chance to call wait() and terminate. */
2843 r = sigprocmask(SIG_UNBLOCK, &mask_chld, NULL);
2845 log_error("Failed to change the signal mask: %m");
2849 r = sigaction(SIGCHLD, &sa, NULL);
2851 log_error("Failed to install SIGCHLD handler: %m");
2855 pid = clone_with_eventfd(SIGCHLD|CLONE_NEWNS|
2856 (arg_share_system ? 0 : CLONE_NEWIPC|CLONE_NEWPID|CLONE_NEWUTS)|
2857 (arg_private_network ? CLONE_NEWNET : 0), eventfds);
2859 if (errno == EINVAL)
2860 log_error("clone() failed, do you have namespace support enabled in your kernel? (You need UTS, IPC, PID and NET namespacing built in): %m");
2862 log_error("clone() failed: %m");
2870 _cleanup_free_ char *home = NULL;
2872 const char *envp[] = {
2873 "PATH=" DEFAULT_PATH_SPLIT_USR,
2874 "container=systemd-nspawn", /* LXC sets container=lxc, so follow the scheme here */
2879 NULL, /* container_uuid */
2880 NULL, /* LISTEN_FDS */
2881 NULL, /* LISTEN_PID */
2886 envp[n_env] = strv_find_prefix(environ, "TERM=");
2890 master = safe_close(master);
2892 close_nointr(STDIN_FILENO);
2893 close_nointr(STDOUT_FILENO);
2894 close_nointr(STDERR_FILENO);
2896 kmsg_socket_pair[0] = safe_close(kmsg_socket_pair[0]);
2898 reset_all_signal_handlers();
2900 assert_se(sigemptyset(&mask) == 0);
2901 assert_se(sigprocmask(SIG_SETMASK, &mask, NULL) == 0);
2903 k = open_terminal(console, O_RDWR);
2904 if (k != STDIN_FILENO) {
2910 log_error("Failed to open console: %s", strerror(-k));
2914 if (dup2(STDIN_FILENO, STDOUT_FILENO) != STDOUT_FILENO ||
2915 dup2(STDIN_FILENO, STDERR_FILENO) != STDERR_FILENO) {
2916 log_error("Failed to duplicate console: %m");
2921 log_error("setsid() failed: %m");
2925 if (reset_audit_loginuid() < 0)
2928 if (prctl(PR_SET_PDEATHSIG, SIGKILL) < 0) {
2929 log_error("PR_SET_PDEATHSIG failed: %m");
2933 /* Mark everything as slave, so that we still
2934 * receive mounts from the real root, but don't
2935 * propagate mounts to the real root. */
2936 if (mount(NULL, "/", NULL, MS_SLAVE|MS_REC, NULL) < 0) {
2937 log_error("MS_SLAVE|MS_REC failed: %m");
2941 if (mount_devices(arg_directory,
2942 root_device, root_device_rw,
2943 home_device, home_device_rw,
2944 srv_device, srv_device_rw) < 0)
2947 /* Turn directory into bind mount */
2948 if (mount(arg_directory, arg_directory, "bind", MS_BIND|MS_REC, NULL) < 0) {
2949 log_error("Failed to make bind mount: %m");
2953 if (arg_read_only) {
2954 k = bind_remount_recursive(arg_directory, true);
2956 log_error("Failed to make tree read-only: %s", strerror(-k));
2961 if (mount_all(arg_directory) < 0)
2964 if (copy_devnodes(arg_directory) < 0)
2967 if (setup_ptmx(arg_directory) < 0)
2970 dev_setup(arg_directory);
2972 if (audit_still_doesnt_work_in_containers() < 0)
2975 if (setup_dev_console(arg_directory, console) < 0)
2978 if (setup_kmsg(arg_directory, kmsg_socket_pair[1]) < 0)
2981 kmsg_socket_pair[1] = safe_close(kmsg_socket_pair[1]);
2983 if (setup_boot_id(arg_directory) < 0)
2986 if (setup_timezone(arg_directory) < 0)
2989 if (setup_resolv_conf(arg_directory) < 0)
2992 if (setup_journal(arg_directory) < 0)
2995 if (mount_binds(arg_directory, arg_bind, false) < 0)
2998 if (mount_binds(arg_directory, arg_bind_ro, true) < 0)
3001 if (setup_kdbus(arg_directory, kdbus_domain) < 0)
3004 /* Tell the parent that we are ready, and that
3005 * it can cgroupify us so that we lack access
3006 * to certain devices and resources. */
3007 r = eventfd_send_state(eventfds[1],
3008 EVENTFD_CHILD_SUCCEEDED);
3009 eventfds[1] = safe_close(eventfds[1]);
3013 if (chdir(arg_directory) < 0) {
3014 log_error("chdir(%s) failed: %m", arg_directory);
3018 if (mount(arg_directory, "/", NULL, MS_MOVE, NULL) < 0) {
3019 log_error("mount(MS_MOVE) failed: %m");
3023 if (chroot(".") < 0) {
3024 log_error("chroot() failed: %m");
3028 if (chdir("/") < 0) {
3029 log_error("chdir() failed: %m");
3035 if (arg_private_network)
3038 if (drop_capabilities() < 0) {
3039 log_error("drop_capabilities() failed: %m");
3043 r = change_uid_gid(&home);
3047 if ((asprintf((char**)(envp + n_env++), "HOME=%s", home ? home: "/root") < 0) ||
3048 (asprintf((char**)(envp + n_env++), "USER=%s", arg_user ? arg_user : "root") < 0) ||
3049 (asprintf((char**)(envp + n_env++), "LOGNAME=%s", arg_user ? arg_user : "root") < 0)) {
3054 if (!sd_id128_equal(arg_uuid, SD_ID128_NULL)) {
3057 if (asprintf((char**)(envp + n_env++), "container_uuid=%s", id128_format_as_uuid(arg_uuid, as_uuid)) < 0) {
3063 if (fdset_size(fds) > 0) {
3064 k = fdset_cloexec(fds, false);
3066 log_error("Failed to unset O_CLOEXEC for file descriptors.");
3070 if ((asprintf((char **)(envp + n_env++), "LISTEN_FDS=%u", n_fd_passed) < 0) ||
3071 (asprintf((char **)(envp + n_env++), "LISTEN_PID=1") < 0)) {
3079 if (arg_personality != 0xffffffffLU) {
3080 if (personality(arg_personality) < 0) {
3081 log_error("personality() failed: %m");
3084 } else if (secondary) {
3085 if (personality(PER_LINUX32) < 0) {
3086 log_error("personality() failed: %m");
3092 if (arg_selinux_context)
3093 if (setexeccon((security_context_t) arg_selinux_context) < 0) {
3094 log_error("setexeccon(\"%s\") failed: %m", arg_selinux_context);
3099 if (!strv_isempty(arg_setenv)) {
3102 n = strv_env_merge(2, envp, arg_setenv);
3110 env_use = (char**) envp;
3112 /* Wait until the parent is ready with the setup, too... */
3113 r = eventfd_parent_succeeded(eventfds[0]);
3114 eventfds[0] = safe_close(eventfds[0]);
3122 /* Automatically search for the init system */
3124 l = 1 + argc - optind;
3125 a = newa(char*, l + 1);
3126 memcpy(a + 1, argv + optind, l * sizeof(char*));
3128 a[0] = (char*) "/usr/lib/systemd/systemd";
3129 execve(a[0], a, env_use);
3131 a[0] = (char*) "/lib/systemd/systemd";
3132 execve(a[0], a, env_use);
3134 a[0] = (char*) "/sbin/init";
3135 execve(a[0], a, env_use);
3136 } else if (argc > optind)
3137 execvpe(argv[optind], argv + optind, env_use);
3139 chdir(home ? home : "/root");
3140 execle("/bin/bash", "-bash", NULL, env_use);
3141 execle("/bin/sh", "-sh", NULL, env_use);
3144 log_error("execv() failed: %m");
3147 /* Tell the parent that the setup failed, so he
3148 * can clean up resources and terminate. */
3149 if (eventfds[1] != -1)
3150 eventfd_send_state(eventfds[1],
3151 EVENTFD_CHILD_FAILED);
3152 _exit(EXIT_FAILURE);
3158 /* Wait for the child event:
3159 * If EVENTFD_CHILD_FAILED, the child will terminate soon.
3160 * If EVENTFD_CHILD_SUCCEEDED, the child is reporting that
3161 * it is ready with all it needs to do with privileges.
3162 * After we got the notification we can make the process
3163 * join its cgroup which might limit what it can do */
3164 r = eventfd_child_succeeded(eventfds[1]);
3165 eventfds[1] = safe_close(eventfds[1]);
3167 goto check_container_status;
3169 r = register_machine(pid);
3173 r = move_network_interfaces(pid);
3177 r = setup_veth(pid, veth_name);
3181 r = setup_bridge(veth_name);
3185 r = setup_macvlan(pid);
3189 /* Block SIGCHLD here, before notifying child.
3190 * process_pty() will handle it with the other signals. */
3191 r = sigprocmask(SIG_BLOCK, &mask_chld, NULL);
3195 /* Reset signal to default */
3196 r = default_signals(SIGCHLD, -1);
3200 /* Notify the child that the parent is ready with all
3201 * its setup, and that the child can now hand over
3202 * control to the code to run inside the container. */
3203 r = eventfd_send_state(eventfds[0],
3204 EVENTFD_PARENT_SUCCEEDED);
3205 eventfds[0] = safe_close(eventfds[0]);
3209 k = process_pty(master, &mask, arg_boot ? pid : 0, SIGRTMIN+3);
3218 /* Kill if it is not dead yet anyway */
3219 terminate_machine(pid);
3221 check_container_status:
3222 /* Redundant, but better safe than sorry */
3225 r = wait_for_container(pid, &container_status);
3231 } else if (container_status == CONTAINER_TERMINATED)
3234 /* CONTAINER_REBOOTED, loop again */
3238 loop_remove(loop_nr, &image_fd);
3243 free(arg_directory);
3246 strv_free(arg_setenv);
3247 strv_free(arg_network_interfaces);
3248 strv_free(arg_network_macvlan);
3249 strv_free(arg_bind);
3250 strv_free(arg_bind_ro);