1 /*-*- Mode: C; c-basic-offset: 8; indent-tabs-mode: nil -*-*/
4 This file is part of systemd.
6 Copyright 2010 Lennart Poettering
8 systemd is free software; you can redistribute it and/or modify it
9 under the terms of the GNU Lesser General Public License as published by
10 the Free Software Foundation; either version 2.1 of the License, or
11 (at your option) any later version.
13 systemd is distributed in the hope that it will be useful, but
14 WITHOUT ANY WARRANTY; without even the implied warranty of
15 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
16 Lesser General Public License for more details.
18 You should have received a copy of the GNU Lesser General Public License
19 along with systemd; If not, see <http://www.gnu.org/licenses/>.
25 #include <sys/types.h>
26 #include <sys/syscall.h>
27 #include <sys/mount.h>
33 #include <sys/prctl.h>
34 #include <sys/capability.h>
37 #include <sys/signalfd.h>
41 #include <sys/socket.h>
42 #include <linux/netlink.h>
44 #include <linux/veth.h>
45 #include <sys/personality.h>
46 #include <linux/loop.h>
49 #include <selinux/selinux.h>
57 #include <blkid/blkid.h>
60 #include "sd-daemon.h"
70 #include "cgroup-util.h"
72 #include "path-util.h"
73 #include "loopback-setup.h"
74 #include "dev-setup.h"
79 #include "bus-error.h"
81 #include "bus-kernel.h"
84 #include "rtnl-util.h"
85 #include "udev-util.h"
86 #include "blkid-util.h"
88 #include "siphash24.h"
90 #include "base-filesystem.h"
92 #include "event-util.h"
96 #include "seccomp-util.h"
99 typedef enum ContainerStatus {
100 CONTAINER_TERMINATED,
/* LinkJournal is the type of arg_link_journal; LINK_AUTO, LINK_NO, LINK_GUEST
 * and LINK_HOST are the values used in this file. */
104 typedef enum LinkJournal {
/* Volatile is the type of arg_volatile; VOLATILE_NO, VOLATILE_YES and
 * VOLATILE_STATE are the values used in this file. */
111 typedef enum Volatile {
/* Command line settings. The initializers below are the defaults;
 * parse_argv() overwrites them according to the options given. */
117 static char *arg_directory = NULL;
118 static char *arg_user = NULL;
119 static sd_id128_t arg_uuid = {};
120 static char *arg_machine = NULL;
121 static const char *arg_selinux_context = NULL;
122 static const char *arg_selinux_apifs_context = NULL;
123 static const char *arg_slice = NULL;
124 static bool arg_private_network = false;
125 static bool arg_read_only = false;
126 static bool arg_boot = false;
127 static LinkJournal arg_link_journal = LINK_AUTO;
128 static bool arg_link_journal_try = false;
/* Bit mask with one bit per CAP_* number. parse_argv() ORs in the
 * --capability= bits (and CAP_NET_ADMIN when private networking is on) and
 * masks out the --drop-capability= bits; drop_capabilities() passes the
 * complement of this mask to capability_bounding_set_drop(). */
129 static uint64_t arg_retain =
130 (1ULL << CAP_CHOWN) |
131 (1ULL << CAP_DAC_OVERRIDE) |
132 (1ULL << CAP_DAC_READ_SEARCH) |
133 (1ULL << CAP_FOWNER) |
134 (1ULL << CAP_FSETID) |
135 (1ULL << CAP_IPC_OWNER) |
137 (1ULL << CAP_LEASE) |
138 (1ULL << CAP_LINUX_IMMUTABLE) |
139 (1ULL << CAP_NET_BIND_SERVICE) |
140 (1ULL << CAP_NET_BROADCAST) |
141 (1ULL << CAP_NET_RAW) |
142 (1ULL << CAP_SETGID) |
143 (1ULL << CAP_SETFCAP) |
144 (1ULL << CAP_SETPCAP) |
145 (1ULL << CAP_SETUID) |
146 (1ULL << CAP_SYS_ADMIN) |
147 (1ULL << CAP_SYS_CHROOT) |
148 (1ULL << CAP_SYS_NICE) |
149 (1ULL << CAP_SYS_PTRACE) |
150 (1ULL << CAP_SYS_TTY_CONFIG) |
151 (1ULL << CAP_SYS_RESOURCE) |
152 (1ULL << CAP_SYS_BOOT) |
153 (1ULL << CAP_AUDIT_WRITE) |
154 (1ULL << CAP_AUDIT_CONTROL) |
/* arg_bind and arg_bind_ro hold flat lists of (source, destination) pairs;
 * arg_tmpfs holds (path, mount options) pairs. They are filled pairwise by
 * parse_argv() and walked with STRV_FOREACH_PAIR() by the mount helpers. */
156 static char **arg_bind = NULL;
157 static char **arg_bind_ro = NULL;
158 static char **arg_tmpfs = NULL;
159 static char **arg_setenv = NULL;
160 static bool arg_quiet = false;
161 static bool arg_share_system = false;
162 static bool arg_register = true;
163 static bool arg_keep_unit = false;
164 static char **arg_network_interfaces = NULL;
165 static char **arg_network_macvlan = NULL;
166 static bool arg_network_veth = false;
167 static const char *arg_network_bridge = NULL;
/* 0xffffffffLU is also the value parse_argv() rejects as an unknown
 * personality, so here it presumably means "no personality requested". */
168 static unsigned long arg_personality = 0xffffffffLU;
169 static const char *arg_image = NULL;
170 static Volatile arg_volatile = VOLATILE_NO;
/* Prints the usage summary to stdout. The single %s in the format string is
 * filled with program_invocation_short_name. */
172 static void help(void) {
173 printf("%s [OPTIONS...] [PATH] [ARGUMENTS...]\n\n"
174 "Spawn a minimal namespace container for debugging, testing and building.\n\n"
175 " -h --help Show this help\n"
176 " --version Print version string\n"
177 " -q --quiet Do not show status information\n"
178 " -D --directory=PATH Root directory for the container\n"
179 " -i --image=PATH File system device or image for the container\n"
180 " -b --boot Boot up full system (i.e. invoke init)\n"
181 " -u --user=USER Run the command under specified user or uid\n"
182 " -M --machine=NAME Set the machine name for the container\n"
183 " --uuid=UUID Set a specific machine UUID for the container\n"
184 " -S --slice=SLICE Place the container in the specified slice\n"
185 " --private-network Disable network in container\n"
186 " --network-interface=INTERFACE\n"
187 " Assign an existing network interface to the\n"
189 " --network-macvlan=INTERFACE\n"
190 " Create a macvlan network interface based on an\n"
191 " existing network interface to the container\n"
192 " --network-veth Add a virtual ethernet connection between host\n"
194 " --network-bridge=INTERFACE\n"
195 " Add a virtual ethernet connection between host\n"
196 " and container and add it to an existing bridge on\n"
198 " -Z --selinux-context=SECLABEL\n"
199 " Set the SELinux security context to be used by\n"
200 " processes in the container\n"
201 " -L --selinux-apifs-context=SECLABEL\n"
202 " Set the SELinux security context to be used by\n"
203 " API/tmpfs file systems in the container\n"
204 " --capability=CAP In addition to the default, retain specified\n"
206 " --drop-capability=CAP Drop the specified capability from the default set\n"
207 " --link-journal=MODE Link up guest journal, one of no, auto, guest, host,\n"
208 " try-guest, try-host\n"
209 " -j Equivalent to --link-journal=try-guest\n"
210 " --read-only Mount the root directory read-only\n"
211 " --bind=PATH[:PATH] Bind mount a file or directory from the host into\n"
213 " --bind-ro=PATH[:PATH] Similar, but creates a read-only bind mount\n"
214 " --tmpfs=PATH:[OPTIONS] Mount an empty tmpfs to the specified directory\n"
215 " --setenv=NAME=VALUE Pass an environment variable to PID 1\n"
216 " --share-system Share system namespaces with host\n"
217 " --register=BOOLEAN Register container as machine\n"
218 " --keep-unit Do not register a scope for the machine, reuse\n"
219 " the service unit nspawn is running in\n"
220 " --volatile[=MODE] Run the system in volatile mode\n",
221 program_invocation_short_name);
/* Parses the command line with getopt_long() and stores the results in the
 * arg_* globals. Capability additions and removals are collected in the
 * locals "plus" and "minus" and folded into arg_retain at the very end.
 * After the option loop several option combinations are rejected with an
 * error message. */
224 static int parse_argv(int argc, char *argv[]) {
241 ARG_NETWORK_INTERFACE,
/* Long options; entries whose last column is a character share their
 * handling with the short option of the same letter. */
249 static const struct option options[] = {
250 { "help", no_argument, NULL, 'h' },
251 { "version", no_argument, NULL, ARG_VERSION },
252 { "directory", required_argument, NULL, 'D' },
253 { "user", required_argument, NULL, 'u' },
254 { "private-network", no_argument, NULL, ARG_PRIVATE_NETWORK },
255 { "boot", no_argument, NULL, 'b' },
256 { "uuid", required_argument, NULL, ARG_UUID },
257 { "read-only", no_argument, NULL, ARG_READ_ONLY },
258 { "capability", required_argument, NULL, ARG_CAPABILITY },
259 { "drop-capability", required_argument, NULL, ARG_DROP_CAPABILITY },
260 { "link-journal", required_argument, NULL, ARG_LINK_JOURNAL },
261 { "bind", required_argument, NULL, ARG_BIND },
262 { "bind-ro", required_argument, NULL, ARG_BIND_RO },
263 { "tmpfs", required_argument, NULL, ARG_TMPFS },
264 { "machine", required_argument, NULL, 'M' },
265 { "slice", required_argument, NULL, 'S' },
266 { "setenv", required_argument, NULL, ARG_SETENV },
267 { "selinux-context", required_argument, NULL, 'Z' },
268 { "selinux-apifs-context", required_argument, NULL, 'L' },
269 { "quiet", no_argument, NULL, 'q' },
270 { "share-system", no_argument, NULL, ARG_SHARE_SYSTEM },
271 { "register", required_argument, NULL, ARG_REGISTER },
272 { "keep-unit", no_argument, NULL, ARG_KEEP_UNIT },
273 { "network-interface", required_argument, NULL, ARG_NETWORK_INTERFACE },
274 { "network-macvlan", required_argument, NULL, ARG_NETWORK_MACVLAN },
275 { "network-veth", no_argument, NULL, ARG_NETWORK_VETH },
276 { "network-bridge", required_argument, NULL, ARG_NETWORK_BRIDGE },
277 { "personality", required_argument, NULL, ARG_PERSONALITY },
278 { "image", required_argument, NULL, 'i' },
279 { "volatile", optional_argument, NULL, ARG_VOLATILE },
/* Capability bits requested via --capability= (plus) and
 * --drop-capability= (minus). */
284 uint64_t plus = 0, minus = 0;
/* The leading '+' in the option string is the GNU getopt convention for
 * stopping at the first non-option argument, so that options following
 * [PATH] are left for the command run in the container. */
289 while ((c = getopt_long(argc, argv, "+hD:u:bL:M:jS:Z:qi:", options, NULL)) >= 0)
298 puts(PACKAGE_STRING);
299 puts(SYSTEMD_FEATURES);
/* The root directory is stored in canonicalized form; a NULL result is
 * reported using errno. */
304 arg_directory = canonicalize_file_name(optarg);
305 if (!arg_directory) {
306 log_error_errno(errno, "Invalid root directory: %m");
318 arg_user = strdup(optarg);
324 case ARG_NETWORK_BRIDGE:
325 arg_network_bridge = optarg;
/* Requesting a veth link or moving an interface into the container also
 * turns on private networking. */
329 case ARG_NETWORK_VETH:
330 arg_network_veth = true;
331 arg_private_network = true;
334 case ARG_NETWORK_INTERFACE:
335 if (strv_extend(&arg_network_interfaces, optarg) < 0)
338 arg_private_network = true;
341 case ARG_NETWORK_MACVLAN:
342 if (strv_extend(&arg_network_macvlan, optarg) < 0)
347 case ARG_PRIVATE_NETWORK:
348 arg_private_network = true;
356 r = sd_id128_from_string(optarg, &arg_uuid);
358 log_error("Invalid UUID: %s", optarg);
/* Machine name: checked with isempty() and machine_name_is_valid() before
 * a copy is stored in arg_machine. */
368 if (isempty(optarg)) {
372 if (!machine_name_is_valid(optarg)) {
373 log_error("Invalid machine name: %s", optarg);
377 r = free_and_strdup(&arg_machine, optarg);
385 arg_selinux_context = optarg;
389 arg_selinux_apifs_context = optarg;
393 arg_read_only = true;
/* The argument is a comma-separated list of capability names. The word
 * "all" sets every bit of the respective mask; any other word is resolved
 * with capability_from_name(). The option that was given (c) decides
 * whether the bit goes into "plus" or "minus". */
397 case ARG_DROP_CAPABILITY: {
398 const char *state, *word;
401 FOREACH_WORD_SEPARATOR(word, length, optarg, ",", state) {
402 _cleanup_free_ char *t;
404 t = strndup(word, length);
408 if (streq(t, "all")) {
409 if (c == ARG_CAPABILITY)
410 plus = (uint64_t) -1;
412 minus = (uint64_t) -1;
416 cap = capability_from_name(t);
418 log_error("Failed to parse capability %s.", t);
422 if (c == ARG_CAPABILITY)
423 plus |= 1ULL << (uint64_t) cap;
425 minus |= 1ULL << (uint64_t) cap;
/* Same settings as the "try-guest" mode of --link-journal= below. */
433 arg_link_journal = LINK_GUEST;
434 arg_link_journal_try = true;
/* The "try-" variants select the same mode as their plain counterpart and
 * additionally set arg_link_journal_try. */
437 case ARG_LINK_JOURNAL:
438 if (streq(optarg, "auto"))
439 arg_link_journal = LINK_AUTO;
440 else if (streq(optarg, "no"))
441 arg_link_journal = LINK_NO;
442 else if (streq(optarg, "guest"))
443 arg_link_journal = LINK_GUEST;
444 else if (streq(optarg, "host"))
445 arg_link_journal = LINK_HOST;
446 else if (streq(optarg, "try-guest")) {
447 arg_link_journal = LINK_GUEST;
448 arg_link_journal_try = true;
449 } else if (streq(optarg, "try-host")) {
450 arg_link_journal = LINK_HOST;
451 arg_link_journal_try = true;
453 log_error("Failed to parse link journal mode %s", optarg);
/* --bind= and --bind-ro= share this code and differ only in the list (x)
 * they append to. The argument is split at the first ':' into source (a)
 * and destination (b); both must be absolute paths and are appended to the
 * list as a pair. */
461 _cleanup_free_ char *a = NULL, *b = NULL;
465 x = c == ARG_BIND ? &arg_bind : &arg_bind_ro;
467 e = strchr(optarg, ':');
469 a = strndup(optarg, e - optarg);
479 if (!path_is_absolute(a) || !path_is_absolute(b)) {
480 log_error("Invalid bind mount specification: %s", optarg);
484 r = strv_extend(x, a);
488 r = strv_extend(x, b);
/* --tmpfs=: a is the mount path (must be absolute), b the mount options.
 * "mode=0755" is presumably the default used when the argument carries no
 * options -- confirm. Both strings are pushed onto arg_tmpfs as a pair. */
496 _cleanup_free_ char *a = NULL, *b = NULL;
499 e = strchr(optarg, ':');
501 a = strndup(optarg, e - optarg);
505 b = strdup("mode=0755");
511 if (!path_is_absolute(a)) {
512 log_error("Invalid tmpfs specification: %s", optarg);
516 r = strv_push(&arg_tmpfs, a);
522 r = strv_push(&arg_tmpfs, b);
/* --setenv=: the assignment is validated first; strv_env_set() yields a
 * new list (n) and the previous arg_setenv is freed. */
534 if (!env_assignment_is_valid(optarg)) {
535 log_error("Environment variable assignment '%s' is not valid.", optarg);
539 n = strv_env_set(arg_setenv, optarg);
543 strv_free(arg_setenv);
552 case ARG_SHARE_SYSTEM:
553 arg_share_system = true;
557 r = parse_boolean(optarg);
559 log_error("Failed to parse --register= argument: %s", optarg);
567 arg_keep_unit = true;
/* personality_from_string() result 0xffffffffLU is treated as "unknown". */
570 case ARG_PERSONALITY:
572 arg_personality = personality_from_string(optarg);
573 if (arg_personality == 0xffffffffLU) {
574 log_error("Unknown or unsupported personality '%s'.", optarg);
/* --volatile takes an optional argument. The argument is tried as a
 * boolean; the literal "state" selects VOLATILE_STATE. */
583 arg_volatile = VOLATILE_YES;
585 r = parse_boolean(optarg);
587 if (streq(optarg, "state"))
588 arg_volatile = VOLATILE_STATE;
590 log_error("Failed to parse --volatile= argument: %s", optarg);
594 arg_volatile = r ? VOLATILE_YES : VOLATILE_NO;
603 assert_not_reached("Unhandled option");
/* Cross-option checks. --share-system switches registration off. */
606 if (arg_share_system)
607 arg_register = false;
609 if (arg_boot && arg_share_system) {
610 log_error("--boot and --share-system may not be combined.");
614 if (arg_keep_unit && cg_pid_get_owner_uid(0, NULL) >= 0) {
615 log_error("--keep-unit may not be used when invoked from a user session.");
619 if (arg_directory && arg_image) {
620 log_error("--directory= and --image= may not be combined.");
624 if (arg_volatile != VOLATILE_NO && arg_read_only) {
625 log_error("Cannot combine --read-only with --volatile. Note that --volatile already implies a read-only base hierarchy.");
/* Final capability set: defaults plus additions, plus CAP_NET_ADMIN when
 * the container has its own network; removals are applied last and
 * therefore win over additions. */
629 arg_retain = (arg_retain | plus | (arg_private_network ? 1ULL << CAP_NET_ADMIN : 0)) & ~minus;
/* Mounts the entries of mount_table below the directory "dest". */
634 static int mount_all(const char *dest) {
636 typedef struct MountPoint {
/* The code below reads the fields what, where, options, flags and fatal of
 * each entry. Entries with a NULL first column carry MS_REMOUNT and turn
 * the bind mount made by the preceding entry read-only. The last column
 * (fatal) decides whether a failure is logged as an error or a warning. */
645 static const MountPoint mount_table[] = {
646 { "proc", "/proc", "proc", NULL, MS_NOSUID|MS_NOEXEC|MS_NODEV, true },
647 { "/proc/sys", "/proc/sys", NULL, NULL, MS_BIND, true }, /* Bind mount first */
648 { NULL, "/proc/sys", NULL, NULL, MS_BIND|MS_RDONLY|MS_REMOUNT, true }, /* Then, make it r/o */
649 { "sysfs", "/sys", "sysfs", NULL, MS_RDONLY|MS_NOSUID|MS_NOEXEC|MS_NODEV, true },
650 { "tmpfs", "/dev", "tmpfs", "mode=755", MS_NOSUID|MS_STRICTATIME, true },
651 { "devpts", "/dev/pts", "devpts","newinstance,ptmxmode=0666,mode=620,gid=" STRINGIFY(TTY_GID), MS_NOSUID|MS_NOEXEC, true },
652 { "tmpfs", "/dev/shm", "tmpfs", "mode=1777", MS_NOSUID|MS_NODEV|MS_STRICTATIME, true },
653 { "tmpfs", "/run", "tmpfs", "mode=755", MS_NOSUID|MS_NODEV|MS_STRICTATIME, true },
655 { "/sys/fs/selinux", "/sys/fs/selinux", NULL, NULL, MS_BIND, false }, /* Bind mount first */
656 { NULL, "/sys/fs/selinux", NULL, NULL, MS_BIND|MS_RDONLY|MS_REMOUNT, false }, /* Then, make it r/o */
663 for (k = 0; k < ELEMENTSOF(mount_table); k++) {
664 _cleanup_free_ char *where = NULL;
666 _cleanup_free_ char *options = NULL;
/* Target path of this entry inside the container tree. */
671 where = strjoin(dest, "/", mount_table[k].where, NULL);
675 t = path_is_mount_point(where, true);
677 log_error_errno(t, "Failed to detect whether %s is a mount point: %m", where);
685 /* Skip this entry if it is not a remount. */
686 if (mount_table[k].what && t > 0)
689 t = mkdir_p(where, 0755);
691 if (mount_table[k].fatal) {
692 log_error_errno(t, "Failed to create directory %s: %m", where);
697 log_warning_errno(t, "Failed to create directory %s: %m", where);
/* With --selinux-apifs-context= set, tmpfs and devpts entries get a
 * context="..." option appended to their option string; otherwise the
 * options from the table are used unchanged. */
703 if (arg_selinux_apifs_context &&
704 (streq_ptr(mount_table[k].what, "tmpfs") || streq_ptr(mount_table[k].what, "devpts"))) {
705 options = strjoin(mount_table[k].options, ",context=\"", arg_selinux_apifs_context, "\"", NULL);
712 o = mount_table[k].options;
715 if (mount(mount_table[k].what,
718 mount_table[k].flags,
721 if (mount_table[k].fatal) {
722 log_error_errno(errno, "mount(%s) failed: %m", where);
727 log_warning_errno(errno, "mount(%s) failed: %m", where);
/* Bind mounts every (source, destination) pair of the list "l" below
 * "dest". "ro" presumably selects the read-only remount near the end --
 * confirm against the full function. */
734 static int mount_binds(const char *dest, char **l, bool ro) {
737 STRV_FOREACH_PAIR(x, y, l) {
738 _cleanup_free_ char *where = NULL;
739 struct stat source_st, dest_st;
/* *x is the source path on the host, *y the destination path inside the
 * container; "where" is the destination prefixed with "dest". */
742 if (stat(*x, &source_st) < 0)
743 return log_error_errno(errno, "Failed to stat %s: %m", *x);
745 where = strappend(dest, *y);
/* If the destination already exists it must have the same file type as
 * the source; if it does not exist (ENOENT) its parent directories are
 * created. */
749 r = stat(where, &dest_st);
751 if ((source_st.st_mode & S_IFMT) != (dest_st.st_mode & S_IFMT)) {
752 log_error("The file types of %s and %s do not match. Refusing bind mount", *x, where);
755 } else if (errno == ENOENT) {
756 r = mkdir_parents_label(where, 0755);
758 return log_error_errno(r, "Failed to bind mount %s: %m", *x);
760 log_error_errno(errno, "Failed to bind mount %s: %m", *x);
764 /* Create the mount point, but be conservative -- refuse to create block
765 * and char devices. */
766 if (S_ISDIR(source_st.st_mode)) {
767 r = mkdir_label(where, 0755);
/* NOTE(review): mount_tmpfs() compares the return value of mkdir_label()
 * against -EEXIST, whereas this check looks at errno while the message
 * below logs r -- one of the two conventions is probably wrong; confirm. */
768 if (r < 0 && errno != EEXIST)
769 return log_error_errno(r, "Failed to create mount point %s: %m", where);
770 } else if (S_ISFIFO(source_st.st_mode)) {
771 r = mkfifo(where, 0644);
772 if (r < 0 && errno != EEXIST)
773 return log_error_errno(errno, "Failed to create mount point %s: %m", where);
774 } else if (S_ISSOCK(source_st.st_mode)) {
775 r = mknod(where, 0644 | S_IFSOCK, 0);
776 if (r < 0 && errno != EEXIST)
777 return log_error_errno(errno, "Failed to create mount point %s: %m", where);
778 } else if (S_ISREG(source_st.st_mode)) {
781 return log_error_errno(r, "Failed to create mount point %s: %m", where);
783 log_error("Refusing to create mountpoint for file: %s", *x);
787 if (mount(*x, where, "bind", MS_BIND, NULL) < 0)
788 return log_error_errno(errno, "mount(%s) failed: %m", where);
/* Turn the freshly created bind mount read-only. */
791 r = bind_remount_recursive(where, true);
793 return log_error_errno(r, "Read-Only bind mount failed: %m");
/* Mounts a tmpfs for every (path, options) pair stored in arg_tmpfs. The
 * path is taken relative to "dest"; the options string is handed to
 * mount() as the data argument. */
800 static int mount_tmpfs(const char *dest) {
803 STRV_FOREACH_PAIR(i, o, arg_tmpfs) {
804 _cleanup_free_ char *where = NULL;
807 where = strappend(dest, *i);
/* An already existing mount point directory is not an error. */
811 r = mkdir_label(where, 0755);
812 if (r < 0 && r != -EEXIST)
813 return log_error_errno(r, "Creating mount point for tmpfs %s failed: %m", where);
815 if (mount("tmpfs", where, "tmpfs", MS_NODEV|MS_STRICTATIME, *o) < 0)
816 return log_error_errno(errno, "tmpfs mount to %s failed: %m", where);
/* Points /etc/localtime inside the container ("dest") at the same zone the
 * host's /etc/localtime symlink refers to, provided that zone exists in
 * the container's /usr/share/zoneinfo. */
822 static int setup_timezone(const char *dest) {
823 _cleanup_free_ char *where = NULL, *p = NULL, *q = NULL, *check = NULL, *what = NULL;
829 /* Fix the timezone, if possible */
830 r = readlink_malloc("/etc/localtime", &p);
832 log_warning("/etc/localtime is not a symlink, not updating container timezone.");
/* z is the zone name: the part of the host link target that follows the
 * zoneinfo directory, in either its relative or its absolute spelling. */
836 z = path_startswith(p, "../usr/share/zoneinfo/");
838 z = path_startswith(p, "/usr/share/zoneinfo/");
840 log_warning("/etc/localtime does not point into /usr/share/zoneinfo/, not updating container timezone.");
844 where = strappend(dest, "/etc/localtime");
/* y is the zone name the container's link currently points to, if any. */
848 r = readlink_malloc(where, &q);
850 y = path_startswith(q, "../usr/share/zoneinfo/");
852 y = path_startswith(q, "/usr/share/zoneinfo/");
854 /* Already pointing to the right place? Then do nothing .. */
855 if (y && streq(y, z))
859 check = strjoin(dest, "/usr/share/zoneinfo/", z, NULL);
863 if (access(check, F_OK) < 0) {
864 log_warning("Timezone %s does not exist in container, not updating container timezone.", z);
/* The new link always uses the relative spelling. */
868 what = strappend("../usr/share/zoneinfo/", z);
872 r = mkdir_parents(where, 0755);
874 log_error_errno(r, "Failed to create directory for timezone info %s in container: %m", where);
880 if (r < 0 && errno != ENOENT) {
881 log_error_errno(errno, "Failed to remove existing timezone info %s in container: %m", where);
886 if (symlink(what, where) < 0) {
887 log_error_errno(errno, "Failed to correct timezone of container: %m");
/* Copies the host's /etc/resolv.conf to the same path below "dest".
 * Failures are only logged as warnings. */
894 static int setup_resolv_conf(const char *dest) {
895 _cleanup_free_ char *where = NULL;
/* Private-network containers are special-cased before anything is copied
 * -- presumably the copy is skipped for them; confirm. */
900 if (arg_private_network)
903 /* Fix resolv.conf, if possible */
904 where = strappend(dest, "/etc/resolv.conf");
908 /* We don't really care for the results of this really. If it
909 * fails, it fails, but meh... */
910 r = mkdir_parents(where, 0755);
912 log_warning_errno(r, "Failed to create parent directory for resolv.conf %s: %m", where);
/* O_TRUNC replaces the contents of an existing file; O_NOFOLLOW refuses
 * to write through a symlink at the destination. */
917 r = copy_file("/etc/resolv.conf", where, O_TRUNC|O_NOFOLLOW, 0644);
919 log_warning_errno(r, "Failed to copy /etc/resolv.conf to %s: %m", where);
/* Implements --volatile=state for the tree at "directory": the tree is
 * remounted read-only and a fresh tmpfs is mounted on its /var. Only acts
 * in VOLATILE_STATE mode. */
927 static int setup_volatile_state(const char *directory) {
933 if (arg_volatile != VOLATILE_STATE)
936 /* --volatile=state means we simply overmount /var
937 with a tmpfs, and the rest read-only. */
939 r = bind_remount_recursive(directory, true);
941 return log_error_errno(r, "Failed to remount %s read-only: %m", directory);
943 p = strappenda(directory, "/var");
/* NOTE(review): the path being handled at this point is p (".../var"),
 * but the message prints "directory" -- probably meant to print p;
 * confirm. */
945 if (r < 0 && errno != EEXIST)
946 return log_error_errno(errno, "Failed to create %s: %m", directory);
948 if (mount("tmpfs", p, "tmpfs", MS_STRICTATIME, "mode=755") < 0)
949 return log_error_errno(errno, "Failed to mount tmpfs to /var: %m");
/* Implements --volatile=yes for the tree at "directory": a tmpfs is
 * prepared in a temporary directory, the tree's /usr is bind mounted into
 * it read-only, and the result is then moved over "directory". Only acts
 * in VOLATILE_YES mode. */
954 static int setup_volatile(const char *directory) {
/* Track which mounts were established so far, presumably for cleanup on
 * the error paths -- confirm. */
955 bool tmpfs_mounted = false, bind_mounted = false;
956 char template[] = "/tmp/nspawn-volatile-XXXXXX";
962 if (arg_volatile != VOLATILE_YES)
965 /* --volatile=yes means we mount a tmpfs to the root dir, and
966 the original /usr to use inside it, and that read-only. */
/* mkdtemp() replaces the XXXXXX suffix of "template" in place. */
968 if (!mkdtemp(template))
969 return log_error_errno(errno, "Failed to create temporary directory: %m");
971 if (mount("tmpfs", template, "tmpfs", MS_STRICTATIME, "mode=755") < 0) {
972 log_error_errno(errno, "Failed to mount tmpfs for root directory: %m");
977 tmpfs_mounted = true;
/* f: the original /usr; t: its place inside the new tmpfs root. */
979 f = strappenda(directory, "/usr");
980 t = strappenda(template, "/usr");
983 if (r < 0 && errno != EEXIST) {
984 log_error_errno(errno, "Failed to create %s: %m", t);
989 if (mount(f, t, "bind", MS_BIND|MS_REC, NULL) < 0) {
990 log_error_errno(errno, "Failed to create /usr bind mount: %m");
997 r = bind_remount_recursive(t, true);
999 log_error_errno(r, "Failed to remount %s read-only: %m", t);
/* MS_MOVE relocates the prepared tmpfs onto the container root. */
1003 if (mount(template, directory, NULL, MS_MOVE, NULL) < 0) {
1004 log_error_errno(errno, "Failed to move root mount: %m");
/* Formats the 16 bytes of "id" as lower-case hex in the 8-4-4-4-12 UUID
 * layout. That is 32 hex digits plus 4 dashes, which together with the
 * terminating NUL accounts for the 37 bytes of "s". */
1022 static char* id128_format_as_uuid(sd_id128_t id, char s[37]) {
1025 "%02x%02x%02x%02x-%02x%02x-%02x%02x-%02x%02x-%02x%02x%02x%02x%02x%02x",
1026 SD_ID128_FORMAT_VAL(id));
/* Gives the container its own boot ID: a random 128 bit ID is written, in
 * UUID notation, to a file below dest's /dev, and that file is bind
 * mounted over /proc/sys/kernel/random/boot_id of the container. */
1031 static int setup_boot_id(const char *dest) {
1032 _cleanup_free_ char *from = NULL, *to = NULL;
1033 sd_id128_t rnd = {};
1039 if (arg_share_system)
1042 /* Generate a new randomized boot ID, so that each boot-up of
1043 * the container gets a new one */
1045 from = strappend(dest, "/dev/proc-sys-kernel-random-boot-id");
1046 to = strappend(dest, "/proc/sys/kernel/random/boot_id");
1050 r = sd_id128_randomize(&rnd);
1052 return log_error_errno(r, "Failed to generate random boot id: %m");
1054 id128_format_as_uuid(rnd, as_uuid);
1056 r = write_string_file(from, as_uuid);
1058 return log_error_errno(r, "Failed to write boot id: %m");
/* First establish the bind mount, then remount it read-only. A failing
 * read-only remount is only a warning. */
1060 if (mount(from, to, "bind", MS_BIND, NULL) < 0) {
1061 log_error_errno(errno, "Failed to bind mount boot id: %m");
1063 } else if (mount(from, to, "bind", MS_BIND|MS_REMOUNT|MS_RDONLY, NULL))
1064 log_warning_errno(errno, "Failed to make boot id read-only: %m");
/* Recreates the device nodes named in "devnodes" below dest's /dev, using
 * the mode and device number of the corresponding host node in /dev. */
1070 static int copy_devnodes(const char *dest) {
1072 static const char devnodes[] =
1083 _cleanup_umask_ mode_t u;
1089 NULSTR_FOREACH(d, devnodes) {
1090 _cleanup_free_ char *from = NULL, *to = NULL;
/* from: node on the host; to: node to create in the container. */
1093 from = strappend("/dev/", d);
1094 to = strjoin(dest, "/dev/", d, NULL);
/* A host node that does not exist (ENOENT) is not an error; a host path
 * that is neither a character nor a block device is. */
1098 if (stat(from, &st) < 0) {
1100 if (errno != ENOENT)
1101 return log_error_errno(errno, "Failed to stat %s: %m", from);
1103 } else if (!S_ISCHR(st.st_mode) && !S_ISBLK(st.st_mode)) {
1105 log_error("%s is not a char or block device, cannot copy", from);
1109 r = mkdir_parents(to, 0775);
1111 log_error_errno(r, "Failed to create parent directory of %s: %m", to);
/* NOTE(review): the node being created is "to", but the message prints
 * "dest" (the container root) -- probably meant to print "to"; confirm. */
1115 if (mknod(to, st.st_mode, st.st_rdev) < 0)
1116 return log_error_errno(errno, "mknod(%s) failed: %m", dest);
/* Creates dest's /dev/ptmx as a relative symlink to pts/ptmx, i.e. to the
 * ptmx node of the devpts instance mounted at /dev/pts. */
1123 static int setup_ptmx(const char *dest) {
1124 _cleanup_free_ char *p = NULL;
1126 p = strappend(dest, "/dev/ptmx");
1130 if (symlink("pts/ptmx", p) < 0)
1131 return log_error_errno(errno, "Failed to create /dev/ptmx symlink: %m");
/* Makes the TTY at path "console" available as /dev/console below "dest":
 * a placeholder device node is created there and "console" is bind mounted
 * on top of it. */
1136 static int setup_dev_console(const char *dest, const char *console) {
1137 _cleanup_umask_ mode_t u;
/* /dev/null supplies the file type and device number for the placeholder
 * node created below. */
1147 if (stat("/dev/null", &st) < 0)
1148 return log_error_errno(errno, "Failed to stat /dev/null: %m");
/* Restrict the TTY to mode 0600, owner and group 0. */
1150 r = chmod_and_chown(console, 0600, 0, 0);
1152 return log_error_errno(r, "Failed to correct access mode for TTY: %m");
1154 /* We need to bind mount the right tty to /dev/console since
1155 * ptys can only exist on pts file systems. To have something
1156 * to bind mount things on we create a device node first, and
1157 * use /dev/null for that since the cgroups device policy
1158 * allows us to create that freely, while we cannot create
1159 * /dev/console. (Note that the major minor doesn't actually
1160 * matter here, since we mount it over anyway). */
1162 to = strappenda(dest, "/dev/console");
/* Keep the file type bits of /dev/null, replace the permission bits by
 * 0600. */
1163 if (mknod(to, (st.st_mode & ~07777) | 0600, st.st_rdev) < 0)
1164 return log_error_errno(errno, "mknod() for /dev/console failed: %m");
1166 if (mount(console, to, "bind", MS_BIND, NULL) < 0)
1167 return log_error_errno(errno, "Bind mount for /dev/console failed: %m");
/* Provides a FIFO as /proc/kmsg in the container at "dest" and passes a
 * file descriptor of that FIFO over the socket "kmsg_socket" so that it
 * stays open. */
1172 static int setup_kmsg(const char *dest, int kmsg_socket) {
1173 _cleanup_free_ char *from = NULL, *to = NULL;
1175 _cleanup_umask_ mode_t u;
/* Control message buffer sized for exactly one file descriptor. */
1177 struct cmsghdr cmsghdr;
1178 uint8_t buf[CMSG_SPACE(sizeof(int))];
1180 struct msghdr mh = {
1181 .msg_control = &control,
1182 .msg_controllen = sizeof(control),
1184 struct cmsghdr *cmsg;
1187 assert(kmsg_socket >= 0);
1191 /* We create the kmsg FIFO as /dev/kmsg, but immediately
1192 * delete it after bind mounting it to /proc/kmsg. While FIFOs
1193 * on the reading side behave very similar to /proc/kmsg,
1194 * their writing side behaves differently from /dev/kmsg in
1195 * that writing blocks when nothing is reading. In order to
1196 * avoid any problems with containers deadlocking due to this
1197 * we simply make /dev/kmsg unavailable to the container. */
1198 if (asprintf(&from, "%s/dev/kmsg", dest) < 0 ||
1199 asprintf(&to, "%s/proc/kmsg", dest) < 0)
1202 if (mkfifo(from, 0600) < 0)
1203 return log_error_errno(errno, "mkfifo() for /dev/kmsg failed: %m");
1205 r = chmod_and_chown(from, 0600, 0, 0);
1207 return log_error_errno(r, "Failed to correct access mode for /dev/kmsg: %m");
1209 if (mount(from, to, "bind", MS_BIND, NULL) < 0)
1210 return log_error_errno(errno, "Bind mount for /proc/kmsg failed: %m");
/* Opened read-write and non-blocking (O_NDELAY), so the open() itself
 * does not wait for a peer on the FIFO. */
1212 fd = open(from, O_RDWR|O_NDELAY|O_CLOEXEC);
1214 return log_error_errno(errno, "Failed to open fifo: %m");
/* Build an SCM_RIGHTS control message carrying fd. */
1216 cmsg = CMSG_FIRSTHDR(&mh);
1217 cmsg->cmsg_level = SOL_SOCKET;
1218 cmsg->cmsg_type = SCM_RIGHTS;
1219 cmsg->cmsg_len = CMSG_LEN(sizeof(int));
1220 memcpy(CMSG_DATA(cmsg), &fd, sizeof(int));
1222 mh.msg_controllen = cmsg->cmsg_len;
1224 /* Store away the fd in the socket, so that it stays open as
1225 * long as we run the child */
1226 k = sendmsg(kmsg_socket, &mh, MSG_DONTWAIT|MSG_NOSIGNAL);
1230 return log_error_errno(errno, "Failed to send FIFO fd: %m");
1232 /* And now make the FIFO unavailable as /dev/kmsg... */
/* Sets the host name to the machine name (arg_machine) via
 * sethostname_idempotent(). --share-system is special-cased first --
 * presumably the host name is then left untouched; confirm. */
1237 static int setup_hostname(void) {
1239 if (arg_share_system)
1242 if (sethostname_idempotent(arg_machine) < 0)
/* Connects the journal directory of the container at "directory" with the
 * host according to arg_link_journal. Both sides use
 * /var/log/journal/<machine id of the container>; in LINK_GUEST mode the
 * host path becomes a symlink to the container's directory, in LINK_HOST
 * mode the host directory is bind mounted into the container. With
 * arg_link_journal_try set, failures to create the host-side entry are
 * only logged at debug level. */
1248 static int setup_journal(const char *directory) {
1249 sd_id128_t machine_id, this_id;
1250 _cleanup_free_ char *p = NULL, *b = NULL, *q = NULL, *d = NULL;
/* Read the container's machine ID. A missing or empty file is
 * special-cased in LINK_AUTO mode. */
1254 p = strappend(directory, "/etc/machine-id");
1258 r = read_one_line_file(p, &b);
1259 if (r == -ENOENT && arg_link_journal == LINK_AUTO)
1262 return log_error_errno(r, "Failed to read machine ID from %s: %m", p);
1265 if (isempty(id) && arg_link_journal == LINK_AUTO)
1268 /* Verify validity */
1269 r = sd_id128_from_string(id, &machine_id);
1271 return log_error_errno(r, "Failed to parse machine ID from %s: %m", p);
1273 r = sd_id128_get_machine(&this_id);
1275 return log_error_errno(r, "Failed to retrieve machine ID: %m");
/* A container carrying the host's own machine ID is refused; the message
 * is a warning in LINK_AUTO mode and an error otherwise. */
1277 if (sd_id128_equal(machine_id, this_id)) {
1278 log_full(arg_link_journal == LINK_AUTO ? LOG_WARNING : LOG_ERR,
1279 "Host and machine ids are equal (%s): refusing to link journals", id);
1280 if (arg_link_journal == LINK_AUTO)
1286 if (arg_link_journal == LINK_NO)
/* p: journal directory on the host; q: journal directory inside the
 * container tree. */
1290 p = strappend("/var/log/journal/", id);
1291 q = strjoin(directory, "/var/log/journal/", id, NULL);
/* Neither path may already be a mount point; this is only reported as an
 * error outside LINK_AUTO mode. */
1295 if (path_is_mount_point(p, false) > 0) {
1296 if (arg_link_journal != LINK_AUTO) {
1297 log_error("%s: already a mount point, refusing to use for journal", p);
1304 if (path_is_mount_point(q, false) > 0) {
1305 if (arg_link_journal != LINK_AUTO) {
1306 log_error("%s: already a mount point, refusing to use for journal", q);
/* Inspect what currently exists at the host path: a symlink (target read
 * into d), something that is not a symlink (-EINVAL), or nothing
 * (-ENOENT). */
1313 r = readlink_and_make_absolute(p, &d);
1315 if ((arg_link_journal == LINK_GUEST ||
1316 arg_link_journal == LINK_AUTO) &&
1319 r = mkdir_p(q, 0755);
/* NOTE(review): mkdir_p() reports its error through the return value
 * (mount_all() logs it that way); logging errno here and in the similar
 * mkdir_p() failure messages further down may print an unrelated error --
 * confirm. */
1321 log_warning_errno(errno, "Failed to create directory %s: %m", q);
1326 return log_error_errno(errno, "Failed to remove symlink %s: %m", p);
1327 } else if (r == -EINVAL) {
1329 if (arg_link_journal == LINK_GUEST &&
1332 if (errno == ENOTDIR) {
1333 log_error("%s already exists and is neither a symlink nor a directory", p);
1336 log_error_errno(errno, "Failed to remove %s: %m", p);
1340 } else if (r != -ENOENT) {
/* NOTE(review): the failing call stored its error in r, yet errno is
 * logged -- confirm. */
1341 log_error_errno(errno, "readlink(%s) failed: %m", p);
/* LINK_GUEST: make the host path a symlink pointing at the container's
 * journal directory. */
1345 if (arg_link_journal == LINK_GUEST) {
1347 if (symlink(q, p) < 0) {
1348 if (arg_link_journal_try) {
1349 log_debug_errno(errno, "Failed to symlink %s to %s, skipping journal setup: %m", q, p);
1352 log_error_errno(errno, "Failed to symlink %s to %s: %m", q, p);
1357 r = mkdir_p(q, 0755);
1359 log_warning_errno(errno, "Failed to create directory %s: %m", q);
1363 if (arg_link_journal == LINK_HOST) {
1364 /* don't create parents here -- if the host doesn't have
1365 * permanent journal set up, don't force it here */
1368 if (arg_link_journal_try) {
1369 log_debug_errno(errno, "Failed to create %s, skipping journal setup: %m", p);
1372 log_error_errno(errno, "Failed to create %s: %m", p);
1377 } else if (access(p, F_OK) < 0)
/* dir_is_empty() returning 0 is treated as "directory has entries". */
1380 if (dir_is_empty(q) == 0)
1381 log_warning("%s is not empty, proceeding anyway.", q);
1383 r = mkdir_p(q, 0755);
1385 log_error_errno(errno, "Failed to create %s: %m", q);
/* Make the host's journal directory visible inside the container. */
1389 if (mount(p, q, "bind", MS_BIND, NULL) < 0)
1390 return log_error_errno(errno, "Failed to bind mount journal from host into guest: %m");
/* Passes the complement of arg_retain, i.e. every capability that is not
 * to be kept, to capability_bounding_set_drop() and returns its result.
 * The meaning of the second argument (false) is defined by that helper
 * and is not visible here. */
1395 static int drop_capabilities(void) {
1396 return capability_bounding_set_drop(~arg_retain, false);
/* Announces the container to org.freedesktop.machine1 over the system
 * bus. With --keep-unit the method RegisterMachineWithNetwork is called;
 * otherwise CreateMachineWithNetwork is called with an additional list of
 * (name, value) properties: Slice (if set), DevicePolicy and DeviceAllow.
 * A positive local_ifindex is sent as a one-element interface index list,
 * otherwise the list is empty. */
1399 static int register_machine(pid_t pid, int local_ifindex) {
1400 _cleanup_bus_error_free_ sd_bus_error error = SD_BUS_ERROR_NULL;
1401 _cleanup_bus_close_unref_ sd_bus *bus = NULL;
1407 r = sd_bus_default_system(&bus);
1409 return log_error_errno(r, "Failed to open system bus: %m");
1411 if (arg_keep_unit) {
1412 r = sd_bus_call_method(
1414 "org.freedesktop.machine1",
1415 "/org/freedesktop/machine1",
1416 "org.freedesktop.machine1.Manager",
1417 "RegisterMachineWithNetwork",
1422 SD_BUS_MESSAGE_APPEND_ID128(arg_uuid),
1426 strempty(arg_directory),
1427 local_ifindex > 0 ? 1 : 0, local_ifindex);
1429 _cleanup_bus_message_unref_ sd_bus_message *m = NULL;
/* The message is assembled by hand because the property array is built
 * conditionally. */
1431 r = sd_bus_message_new_method_call(
1434 "org.freedesktop.machine1",
1435 "/org/freedesktop/machine1",
1436 "org.freedesktop.machine1.Manager",
1437 "CreateMachineWithNetwork");
1439 return log_error_errno(r, "Failed to create message: %m");
1441 r = sd_bus_message_append(
1445 SD_BUS_MESSAGE_APPEND_ID128(arg_uuid),
1449 strempty(arg_directory),
1450 local_ifindex > 0 ? 1 : 0, local_ifindex);
1452 return log_error_errno(r, "Failed to append message arguments: %m");
/* Array of (property name, variant value) pairs. */
1454 r = sd_bus_message_open_container(m, 'a', "(sv)");
1456 return log_error_errno(r, "Failed to open container: %m");
1458 if (!isempty(arg_slice)) {
1459 r = sd_bus_message_append(m, "(sv)", "Slice", "s", arg_slice);
1461 return log_error_errno(r, "Failed to append slice: %m");
1464 r = sd_bus_message_append(m, "(sv)", "DevicePolicy", "s", "strict");
1466 return log_error_errno(r, "Failed to add device policy: %m");
/* The 9 is the number of (device path, access string) pairs that
 * follow. */
1468 r = sd_bus_message_append(m, "(sv)", "DeviceAllow", "a(ss)", 9,
1469 /* Allow the container to
1470 * access and create the API
1471 * device nodes, so that
1472 * PrivateDevices= in the
1473 * container can work
1478 "/dev/random", "rwm",
1479 "/dev/urandom", "rwm",
1481 "/dev/net/tun", "rwm",
1482 /* Allow the container
1483 * access to ptys. However,
1485 * container to ever create
1486 * these device nodes. */
1487 "/dev/pts/ptmx", "rw",
1490 return log_error_errno(r, "Failed to add device whitelist: %m");
1492 r = sd_bus_message_close_container(m);
1494 return log_error_errno(r, "Failed to close container: %m");
1496 r = sd_bus_call(bus, m, 0, &error, NULL);
1500 log_error("Failed to register machine: %s", bus_error_message(&error, r));
/* Asks org.freedesktop.machine1 over the system bus to terminate the
 * machine: a first call on the Manager interface yields the machine's
 * object path, a second call on that object's Machine interface requests
 * termination. Failures of either call are only logged at debug level. */
1507 static int terminate_machine(pid_t pid) {
1508 _cleanup_bus_error_free_ sd_bus_error error = SD_BUS_ERROR_NULL;
1509 _cleanup_bus_message_unref_ sd_bus_message *reply = NULL;
1510 _cleanup_bus_close_unref_ sd_bus *bus = NULL;
1517 r = sd_bus_default_system(&bus);
1519 return log_error_errno(r, "Failed to open system bus: %m");
1521 r = sd_bus_call_method(
1523 "org.freedesktop.machine1",
1524 "/org/freedesktop/machine1",
1525 "org.freedesktop.machine1.Manager",
1532 /* Note that the machine might already have been
1533 * cleaned up automatically, hence don't consider it a
1534 * failure if we cannot get the machine object. */
1535 log_debug("Failed to get machine: %s", bus_error_message(&error, r));
/* The reply carries a single object path ("o"). */
1539 r = sd_bus_message_read(reply, "o", &path);
1541 return bus_log_parse_error(r);
1543 r = sd_bus_call_method(
1545 "org.freedesktop.machine1",
1547 "org.freedesktop.machine1.Machine",
1553 log_debug("Failed to terminate machine: %s", bus_error_message(&error, r));
/* Resets the audit login UID of the calling process.
 *
 * arg_share_system is tested before anything is read or written. Otherwise
 * /proc/self/loginuid is read; if it does not already contain 4294967295
 * that value is written back. A failing read is logged and returned. A
 * failing write produces the long explanatory message below, which
 * announces a 5s sleep.
 *
 * NOTE(review): the message text reads "or to off auditing"; probably
 * "or to turn off auditing" was intended. It is a runtime string, so it is
 * left unchanged here. */
1560 static int reset_audit_loginuid(void) {
1561 _cleanup_free_ char *p = NULL;
1564 if (arg_share_system)
1567 r = read_one_line_file("/proc/self/loginuid", &p);
1571 return log_error_errno(r, "Failed to read /proc/self/loginuid: %m");
1573 /* Already reset? */
1574 if (streq(p, "4294967295"))
/* 4294967295 == UINT32_MAX; the same literal is what gets written back. */
1577 r = write_string_file("/proc/self/loginuid", "4294967295");
1579 log_error("Failed to reset audit login UID. This probably means that your kernel is too\n"
1580 "old and you have audit enabled. Note that the auditing subsystem is known to\n"
1581 "be incompatible with containers on old kernels. Please make sure to upgrade\n"
1582 "your kernel or to off auditing with 'audit=0' on the kernel command line before\n"
1583 "using systemd-nspawn. Sleeping for 5s... (%s)\n", strerror(-r));
/* Fixed 128-bit keys handed to generate_mac() as 'hash_key'. There is one per
 * kind of interface (host side of the veth pair, container side of the veth
 * pair, macvlan), so that each kind gets a different address for the same
 * machine. */
1591 #define HOST_HASH_KEY SD_ID128_MAKE(1a,37,6f,c7,46,ec,45,0b,ad,a3,d5,31,06,60,5d,b1)
1592 #define CONTAINER_HASH_KEY SD_ID128_MAKE(c3,c4,f9,19,b5,57,b2,1c,e6,cf,14,27,03,9c,ee,a2)
1593 #define MACVLAN_HASH_KEY SD_ID128_MAKE(00,13,6d,bc,66,83,44,81,bb,0c,f9,51,1f,24,a6,6f)
/* Derives a predictable MAC address and stores it in *mac.
 *
 * A buffer holding the host machine ID (sd_id128_get_machine()) followed by
 * the bytes of arg_machine is hashed with siphash24() under 'hash_key'. The
 * first ETH_ALEN bytes of the hash become the address, with the multicast
 * bit cleared and the locally-administered bit set.
 *
 * 'idx' can be copied in directly behind the machine name; setup_macvlan()
 * passes an increasing value per interface.
 * NOTE(review): the size computed at the top covers only ID + name;
 * presumably it is enlarged when idx is included -- confirm. */
1595 static int generate_mac(struct ether_addr *mac, sd_id128_t hash_key, uint64_t idx) {
1601 l = strlen(arg_machine);
1602 sz = sizeof(sd_id128_t) + l;
1608 /* fetch some persistent data unique to the host */
1609 r = sd_id128_get_machine((sd_id128_t*) v);
1613 /* combine with some data unique (on this host) to this
1614 * container instance */
/* mempcpy() returns the position just past the copied name. */
1615 i = mempcpy(v + sizeof(sd_id128_t), arg_machine, l);
1618 memcpy(i, &idx, sizeof(idx));
1621 /* Let's hash the host machine ID plus the container name. We
1622 * use a fixed, but originally randomly created hash key here. */
1623 siphash24(result, v, sz, hash_key.bytes);
/* Compile-time check that the hash output is large enough for a MAC. */
1625 assert_cc(ETH_ALEN <= sizeof(result));
1626 memcpy(mac->ether_addr_octet, result, ETH_ALEN);
1628 /* see eth_random_addr in the kernel */
1629 mac->ether_addr_octet[0] &= 0xfe; /* clear multicast bit */
1630 mac->ether_addr_octet[0] |= 0x02; /* set local assignment bit (IEEE802) */
/* Creates the veth pair for the container through rtnetlink.
 *
 * arg_private_network and arg_network_veth are both tested before any work
 * is done. The host-side interface name is written into 'iface_name' as
 * "vb-<machine>" when arg_network_bridge is set and "ve-<machine>"
 * otherwise (snprintf() truncates it to the given size). A single
 * RTM_NEWLINK request then describes:
 *   - the host side: IFLA_IFNAME = iface_name, IFLA_ADDRESS = mac_host
 *   - nested in LINKINFO / INFO_DATA "veth" / VETH_INFO_PEER, the peer:
 *     IFLA_IFNAME = "host0", IFLA_ADDRESS = mac_container,
 *     IFLA_NET_NS_PID = pid
 * Both addresses come from generate_mac(). After the request succeeds the
 * host-side name is resolved with if_nametoindex().
 *
 * NOTE(review): presumably the resolved index is handed back through 'ifi'
 * -- confirm. */
1635 static int setup_veth(pid_t pid, char iface_name[IFNAMSIZ], int *ifi) {
1636 _cleanup_rtnl_message_unref_ sd_rtnl_message *m = NULL;
1637 _cleanup_rtnl_unref_ sd_rtnl *rtnl = NULL;
1638 struct ether_addr mac_host, mac_container;
1641 if (!arg_private_network)
1644 if (!arg_network_veth)
1647 /* Use two different interface name prefixes depending whether
1648 * we are in bridge mode or not. */
1649 snprintf(iface_name, IFNAMSIZ - 1, "%s-%s",
1650 arg_network_bridge ? "vb" : "ve", arg_machine);
1652 r = generate_mac(&mac_container, CONTAINER_HASH_KEY, 0);
1654 return log_error_errno(r, "Failed to generate predictable MAC address for container side: %m");
1656 r = generate_mac(&mac_host, HOST_HASH_KEY, 0);
1658 return log_error_errno(r, "Failed to generate predictable MAC address for host side: %m");
1660 r = sd_rtnl_open(&rtnl, 0);
1662 return log_error_errno(r, "Failed to connect to netlink: %m");
1664 r = sd_rtnl_message_new_link(rtnl, &m, RTM_NEWLINK, 0);
1666 return log_error_errno(r, "Failed to allocate netlink message: %m");
/* Attributes of the host-side end. */
1668 r = sd_rtnl_message_append_string(m, IFLA_IFNAME, iface_name);
1670 return log_error_errno(r, "Failed to add netlink interface name: %m");
1672 r = sd_rtnl_message_append_ether_addr(m, IFLA_ADDRESS, &mac_host);
1674 return log_error_errno(r, "Failed to add netlink MAC address: %m");
/* Three nested containers are opened here and closed again below. */
1676 r = sd_rtnl_message_open_container(m, IFLA_LINKINFO);
1678 return log_error_errno(r, "Failed to open netlink container: %m");
1680 r = sd_rtnl_message_open_container_union(m, IFLA_INFO_DATA, "veth");
1682 return log_error_errno(r, "Failed to open netlink container: %m");
1684 r = sd_rtnl_message_open_container(m, VETH_INFO_PEER);
1686 return log_error_errno(r, "Failed to open netlink container: %m");
/* Attributes of the peer end, which is placed in the namespace of 'pid'. */
1688 r = sd_rtnl_message_append_string(m, IFLA_IFNAME, "host0");
1690 return log_error_errno(r, "Failed to add netlink interface name: %m");
1692 r = sd_rtnl_message_append_ether_addr(m, IFLA_ADDRESS, &mac_container);
1694 return log_error_errno(r, "Failed to add netlink MAC address: %m");
1696 r = sd_rtnl_message_append_u32(m, IFLA_NET_NS_PID, pid);
1698 return log_error_errno(r, "Failed to add netlink namespace field: %m");
1700 r = sd_rtnl_message_close_container(m);
1702 return log_error_errno(r, "Failed to close netlink container: %m");
1704 r = sd_rtnl_message_close_container(m);
1706 return log_error_errno(r, "Failed to close netlink container: %m");
1708 r = sd_rtnl_message_close_container(m);
1710 return log_error_errno(r, "Failed to close netlink container: %m");
1712 r = sd_rtnl_call(rtnl, m, 0, NULL);
1714 return log_error_errno(r, "Failed to add new veth interfaces: %m");
1716 i = (int) if_nametoindex(iface_name);
1718 return log_error_errno(errno, "Failed to resolve interface %s: %m", iface_name);
/* Attaches the interface named 'veth_name' to the bridge arg_network_bridge.
 *
 * arg_private_network, arg_network_veth and arg_network_bridge are tested
 * first. The bridge name is resolved with if_nametoindex(). One RTM_SETLINK
 * request, addressed by IFLA_IFNAME = veth_name, then sets the IFF_UP flag
 * and IFLA_MASTER = <bridge index>.
 *
 * NOTE(review): how 'ifi' is used is not established by the statements
 * below -- confirm against callers. */
1725 static int setup_bridge(const char veth_name[], int *ifi) {
1726 _cleanup_rtnl_message_unref_ sd_rtnl_message *m = NULL;
1727 _cleanup_rtnl_unref_ sd_rtnl *rtnl = NULL;
1730 if (!arg_private_network)
1733 if (!arg_network_veth)
1736 if (!arg_network_bridge)
1739 bridge = (int) if_nametoindex(arg_network_bridge);
1741 return log_error_errno(errno, "Failed to resolve interface %s: %m", arg_network_bridge);
1745 r = sd_rtnl_open(&rtnl, 0);
1747 return log_error_errno(r, "Failed to connect to netlink: %m");
1749 r = sd_rtnl_message_new_link(rtnl, &m, RTM_SETLINK, 0);
1751 return log_error_errno(r, "Failed to allocate netlink message: %m");
/* Bring the link up in the same request that enslaves it. */
1753 r = sd_rtnl_message_link_set_flags(m, IFF_UP, IFF_UP);
1755 return log_error_errno(r, "Failed to set IFF_UP flag: %m");
1757 r = sd_rtnl_message_append_string(m, IFLA_IFNAME, veth_name);
1759 return log_error_errno(r, "Failed to add netlink interface name field: %m");
1761 r = sd_rtnl_message_append_u32(m, IFLA_MASTER, bridge);
1763 return log_error_errno(r, "Failed to add netlink master field: %m");
1765 r = sd_rtnl_call(rtnl, m, 0, NULL);
1767 return log_error_errno(r, "Failed to add veth interface to bridge: %m");
/* Resolves a host network interface name and checks that udev knows it.
 *
 * 'name' is converted to an interface index with if_nametoindex(). The
 * matching udev device is then looked up by the device id "n<ifindex>" and
 * must report itself as initialized. Each failure path logs an error that
 * names the interface.
 *
 * Callers (move_network_interfaces(), setup_macvlan()) use the return value
 * as the interface index. */
1772 static int parse_interface(struct udev *udev, const char *name) {
1773 _cleanup_udev_device_unref_ struct udev_device *d = NULL;
1774 char ifi_str[2 + DECIMAL_STR_MAX(int)];
1777 ifi = (int) if_nametoindex(name);
1779 return log_error_errno(errno, "Failed to resolve interface %s: %m", name);
/* udev device id for a network interface: 'n' followed by the ifindex. */
1781 sprintf(ifi_str, "n%i", ifi);
1782 d = udev_device_new_from_device_id(udev, ifi_str);
1784 return log_error_errno(errno, "Failed to get udev device for interface %s: %m", name);
1786 if (udev_device_get_is_initialized(d) <= 0) {
1787 log_error("Network interface %s is not initialized yet.", name);
/* Moves the interfaces listed in arg_network_interfaces into the network
 * namespace of 'pid'.
 *
 * arg_private_network and an empty arg_network_interfaces list are tested
 * first. For every entry the name is validated with parse_interface(), and
 * an RTM_SETLINK request for the returned ifindex is sent carrying
 * IFLA_NET_NS_PID = pid. The first netlink failure is logged and returned. */
1794 static int move_network_interfaces(pid_t pid) {
1795 _cleanup_udev_unref_ struct udev *udev = NULL;
1796 _cleanup_rtnl_unref_ sd_rtnl *rtnl = NULL;
1800 if (!arg_private_network)
1803 if (strv_isempty(arg_network_interfaces))
1806 r = sd_rtnl_open(&rtnl, 0);
1808 return log_error_errno(r, "Failed to connect to netlink: %m");
1812 log_error("Failed to connect to udev.");
1816 STRV_FOREACH(i, arg_network_interfaces) {
/* A fresh message per interface, released automatically at scope end. */
1817 _cleanup_rtnl_message_unref_ sd_rtnl_message *m = NULL;
1820 ifi = parse_interface(udev, *i);
1824 r = sd_rtnl_message_new_link(rtnl, &m, RTM_SETLINK, ifi);
1826 return log_error_errno(r, "Failed to allocate netlink message: %m");
1828 r = sd_rtnl_message_append_u32(m, IFLA_NET_NS_PID, pid);
1830 return log_error_errno(r, "Failed to append namespace PID to netlink message: %m");
1832 r = sd_rtnl_call(rtnl, m, 0, NULL);
1834 return log_error_errno(r, "Failed to move interface %s to namespace: %m", *i);
/* Creates one macvlan interface per entry of arg_network_macvlan, placed in
 * the network namespace of 'pid'.
 *
 * arg_private_network and an empty arg_network_macvlan list are tested
 * first. For each host interface name:
 *   - parse_interface() validates it and yields the ifindex (IFLA_LINK)
 *   - generate_mac() is called with MACVLAN_HASH_KEY and an index that
 *     increases per interface
 *   - the new link is named "mv-<name>", shortened to IFNAMSIZ-1 characters
 *   - the mode is MACVLAN_MODE_BRIDGE
 * Each interface is created with its own RTM_NEWLINK request. The first
 * failure is logged and returned. */
1840 static int setup_macvlan(pid_t pid) {
1841 _cleanup_udev_unref_ struct udev *udev = NULL;
1842 _cleanup_rtnl_unref_ sd_rtnl *rtnl = NULL;
1847 if (!arg_private_network)
1850 if (strv_isempty(arg_network_macvlan))
1853 r = sd_rtnl_open(&rtnl, 0);
1855 return log_error_errno(r, "Failed to connect to netlink: %m");
1859 log_error("Failed to connect to udev.");
1863 STRV_FOREACH(i, arg_network_macvlan) {
1864 _cleanup_rtnl_message_unref_ sd_rtnl_message *m = NULL;
1865 _cleanup_free_ char *n = NULL;
1866 struct ether_addr mac;
1869 ifi = parse_interface(udev, *i);
/* idx differs for every interface of this container. */
1873 r = generate_mac(&mac, MACVLAN_HASH_KEY, idx++);
1875 return log_error_errno(r, "Failed to create MACVLAN MAC address: %m");
1877 r = sd_rtnl_message_new_link(rtnl, &m, RTM_NEWLINK, 0);
1879 return log_error_errno(r, "Failed to allocate netlink message: %m");
/* The host interface the macvlan is stacked on. */
1881 r = sd_rtnl_message_append_u32(m, IFLA_LINK, ifi);
1883 return log_error_errno(r, "Failed to add netlink interface index: %m");
1885 n = strappend("mv-", *i);
1889 strshorten(n, IFNAMSIZ-1);
1891 r = sd_rtnl_message_append_string(m, IFLA_IFNAME, n);
1893 return log_error_errno(r, "Failed to add netlink interface name: %m");
1895 r = sd_rtnl_message_append_ether_addr(m, IFLA_ADDRESS, &mac);
1897 return log_error_errno(r, "Failed to add netlink MAC address: %m");
1899 r = sd_rtnl_message_append_u32(m, IFLA_NET_NS_PID, pid);
1901 return log_error_errno(r, "Failed to add netlink namespace field: %m");
1903 r = sd_rtnl_message_open_container(m, IFLA_LINKINFO);
1905 return log_error_errno(r, "Failed to open netlink container: %m");
1907 r = sd_rtnl_message_open_container_union(m, IFLA_INFO_DATA, "macvlan");
1909 return log_error_errno(r, "Failed to open netlink container: %m");
1911 r = sd_rtnl_message_append_u32(m, IFLA_MACVLAN_MODE, MACVLAN_MODE_BRIDGE);
1913 return log_error_errno(r, "Failed to append macvlan mode: %m");
1915 r = sd_rtnl_message_close_container(m);
1917 return log_error_errno(r, "Failed to close netlink container: %m");
1919 r = sd_rtnl_message_close_container(m);
1921 return log_error_errno(r, "Failed to close netlink container: %m");
1923 r = sd_rtnl_call(rtnl, m, 0, NULL);
1925 return log_error_errno(r, "Failed to add new macvlan interfaces: %m");
/* Builds and loads the seccomp filter for the container payload.
 *
 * The filter starts from SCMP_ACT_ALLOW, so only what is added here is
 * restricted:
 *   - every syscall in 'blacklist' fails with EPERM; the loop has a
 *     'continue' path marked "unknown syscall"
 *   - one rule on AF_NETLINK / NETLINK_AUDIT arguments fails with
 *     EAFNOSUPPORT (rationale in the text further down)
 * SCMP_FLTATR_CTL_NNP is set to 0 before seccomp_load(); the matching log
 * message calls this "unset NO_NEW_PRIVS". The filter context is released
 * with seccomp_release() at the end. Failures are logged with
 * log_error_errno(). */
1931 static int setup_seccomp(void) {
1934 static const int blacklist[] = {
1935 SCMP_SYS(kexec_load),
1936 SCMP_SYS(open_by_handle_at),
1937 SCMP_SYS(init_module),
1938 SCMP_SYS(finit_module),
1939 SCMP_SYS(delete_module),
1946 scmp_filter_ctx seccomp;
1950 seccomp = seccomp_init(SCMP_ACT_ALLOW);
1954 r = seccomp_add_secondary_archs(seccomp);
1956 log_error_errno(r, "Failed to add secondary archs to seccomp filter: %m");
1960 for (i = 0; i < ELEMENTSOF(blacklist); i++) {
1961 r = seccomp_rule_add(seccomp, SCMP_ACT_ERRNO(EPERM), blacklist[i], 0);
1963 continue; /* unknown syscall */
1965 log_error_errno(r, "Failed to block syscall: %m");
1971 Audit is broken in containers, much of the userspace audit
1972 hookup will fail if running inside a container. We don't
1973 care and just turn off creation of audit sockets.
1975 This will make socket(AF_NETLINK, *, NETLINK_AUDIT) fail
1976 with EAFNOSUPPORT which audit userspace uses as indication
1977 that audit is disabled in the kernel.
1980 r = seccomp_rule_add(
1982 SCMP_ACT_ERRNO(EAFNOSUPPORT),
1985 SCMP_A0(SCMP_CMP_EQ, AF_NETLINK),
1986 SCMP_A2(SCMP_CMP_EQ, NETLINK_AUDIT));
1988 log_error_errno(r, "Failed to add audit seccomp rule: %m");
1992 r = seccomp_attr_set(seccomp, SCMP_FLTATR_CTL_NNP, 0);
1994 log_error_errno(r, "Failed to unset NO_NEW_PRIVS: %m");
1998 r = seccomp_load(seccomp);
2000 log_error_errno(r, "Failed to install seccomp audit filter: %m");
2003 seccomp_release(seccomp);
/* Opens arg_image and makes it available as a block device.
 *
 * arg_image is opened read-only or read-write depending on arg_read_only.
 * If it already is a block device its path is duplicated with strdup().
 * Otherwise it must be a regular file: a free loop device number is
 * obtained from /dev/loop-control (LOOP_CTL_GET_FREE), /dev/loop<nr> is
 * opened, the image is attached with LOOP_SET_FD, and the loop flags are
 * applied with LOOP_SET_STATUS64. The flags are LO_FLAGS_AUTOCLEAR and
 * LO_FLAGS_PARTSCAN; LO_FLAGS_READ_ONLY can be or'ed in as well. The device
 * path is handed to the caller through *device_path.
 *
 * main() stores the return value as 'image_fd' and passes it on to
 * dissect_image().
 * NOTE(review): presumably *loop_nr receives the allocated loop number --
 * confirm. */
2011 static int setup_image(char **device_path, int *loop_nr) {
2012 struct loop_info64 info = {
2013 .lo_flags = LO_FLAGS_AUTOCLEAR|LO_FLAGS_PARTSCAN
2015 _cleanup_close_ int fd = -1, control = -1, loop = -1;
2016 _cleanup_free_ char* loopdev = NULL;
2020 assert(device_path);
2023 fd = open(arg_image, O_CLOEXEC|(arg_read_only ? O_RDONLY : O_RDWR)|O_NONBLOCK|O_NOCTTY);
2025 return log_error_errno(errno, "Failed to open %s: %m", arg_image);
2027 if (fstat(fd, &st) < 0)
2028 return log_error_errno(errno, "Failed to stat %s: %m", arg_image);
/* The image is already a block device: no loop device is involved. */
2030 if (S_ISBLK(st.st_mode)) {
2033 p = strdup(arg_image);
2047 if (!S_ISREG(st.st_mode)) {
/* NOTE(review): no call has failed on this path (the fstat() above
 * succeeded), so errno is not meaningful here and the %m expansion likely
 * prints an unrelated value -- consider a plain log_error(). */
2048 log_error_errno(errno, "%s is not a regular file or block device: %m", arg_image);
2052 control = open("/dev/loop-control", O_RDWR|O_CLOEXEC|O_NOCTTY|O_NONBLOCK);
2054 return log_error_errno(errno, "Failed to open /dev/loop-control: %m");
2056 nr = ioctl(control, LOOP_CTL_GET_FREE);
2058 return log_error_errno(errno, "Failed to allocate loop device: %m");
2060 if (asprintf(&loopdev, "/dev/loop%i", nr) < 0)
2063 loop = open(loopdev, O_CLOEXEC|(arg_read_only ? O_RDONLY : O_RDWR)|O_NONBLOCK|O_NOCTTY);
2065 return log_error_errno(errno, "Failed to open loop device %s: %m", loopdev);
/* Attach the image file as the backing store of the loop device. */
2067 if (ioctl(loop, LOOP_SET_FD, fd) < 0)
2068 return log_error_errno(errno, "Failed to set loopback file descriptor on %s: %m", loopdev);
2071 info.lo_flags |= LO_FLAGS_READ_ONLY;
2073 if (ioctl(loop, LOOP_SET_STATUS64, &info) < 0)
2074 return log_error_errno(errno, "Failed to set loopback settings on %s: %m", loopdev);
2076 *device_path = loopdev;
/* Finds the root, /home and /srv partitions of a GPT disk image.
 *
 * The block device behind 'fd' is probed with libblkid. A partition table
 * must be found and its PTTYPE must be "gpt"; otherwise an error pointing
 * at the Discoverable Partitions Spec is logged. The partition device
 * nodes are then enumerated through udev (children of the device given by
 * st_rdev) and matched against the blkid partition list by device number.
 *
 * Each partition's type string is parsed as a 128-bit ID and compared with
 * GPT_HOME, GPT_SRV and, where defined, GPT_ROOT_NATIVE and
 * GPT_ROOT_SECONDARY. When several partitions share a type, a later one
 * only replaces the remembered one if its partition number is lower. The
 * GPT_FLAG_READ_ONLY bit decides the corresponding *_rw result.
 *
 * Results: *root_device / *root_device_rw receive the native root if one
 * was found, else the secondary root; finding neither is an error.
 * *home_device, *home_device_rw and *srv_device_rw are filled in likewise.
 *
 * NOTE(review): partitions with GPT_FLAG_NO_AUTO set are presumably
 * skipped; the final "compiled without blkid support" message presumably
 * belongs to a build-time fallback branch -- confirm both. */
2087 static int dissect_image(
2089 char **root_device, bool *root_device_rw,
2090 char **home_device, bool *home_device_rw,
2091 char **srv_device, bool *srv_device_rw,
2095 int home_nr = -1, srv_nr = -1;
2096 #ifdef GPT_ROOT_NATIVE
2099 #ifdef GPT_ROOT_SECONDARY
2100 int secondary_root_nr = -1;
2103 _cleanup_free_ char *home = NULL, *root = NULL, *secondary_root = NULL, *srv = NULL;
2104 _cleanup_udev_enumerate_unref_ struct udev_enumerate *e = NULL;
2105 _cleanup_udev_device_unref_ struct udev_device *d = NULL;
2106 _cleanup_blkid_free_probe_ blkid_probe b = NULL;
2107 _cleanup_udev_unref_ struct udev *udev = NULL;
2108 struct udev_list_entry *first, *item;
2109 bool home_rw = true, root_rw = true, secondary_root_rw = true, srv_rw = true;
2110 const char *pttype = NULL;
2116 assert(root_device);
2117 assert(home_device);
2121 b = blkid_new_probe();
2126 r = blkid_probe_set_device(b, fd, 0, 0);
2131 log_error_errno(errno, "Failed to set device on blkid probe: %m");
2135 blkid_probe_enable_partitions(b, 1);
2136 blkid_probe_set_partitions_flags(b, BLKID_PARTS_ENTRY_DETAILS);
2139 r = blkid_do_safeprobe(b);
/* -2 and 1 are reported as "no partition table identified"; any other
 * non-zero result is reported as a probe failure. */
2140 if (r == -2 || r == 1) {
2141 log_error("Failed to identify any partition table on %s.\n"
2142 "Note that the disk image needs to follow http://www.freedesktop.org/wiki/Specifications/DiscoverablePartitionsSpec/ to be supported by systemd-nspawn.", arg_image);
2144 } else if (r != 0) {
2147 log_error_errno(errno, "Failed to probe: %m");
2151 blkid_probe_lookup_value(b, "PTTYPE", &pttype, NULL);
/* streq_ptr() is used because pttype stays NULL if the lookup set nothing. */
2152 if (!streq_ptr(pttype, "gpt")) {
2153 log_error("Image %s does not carry a GUID Partition Table.\n"
2154 "Note that the disk image needs to follow http://www.freedesktop.org/wiki/Specifications/DiscoverablePartitionsSpec/ to be supported by systemd-nspawn.", arg_image);
2159 pl = blkid_probe_get_partitions(b);
2164 log_error("Failed to list partitions of %s", arg_image);
2172 if (fstat(fd, &st) < 0)
2173 return log_error_errno(errno, "Failed to stat block device: %m");
/* Enumerate the udev devices whose parent is the whole-disk device. */
2175 d = udev_device_new_from_devnum(udev, 'b', st.st_rdev);
2179 e = udev_enumerate_new(udev);
2183 r = udev_enumerate_add_match_parent(e, d);
2187 r = udev_enumerate_scan_devices(e);
2189 return log_error_errno(r, "Failed to scan for partition devices of %s: %m", arg_image);
2191 first = udev_enumerate_get_list_entry(e);
2192 udev_list_entry_foreach(item, first) {
/* NOTE(review): 'q' has a cleanup attribute but no initializer; it must be
 * assigned before any exit from this scope -- confirm. */
2193 _cleanup_udev_device_unref_ struct udev_device *q;
2194 const char *stype, *node;
2195 unsigned long long flags;
2202 q = udev_device_new_from_syspath(udev, udev_list_entry_get_name(item));
2207 log_error_errno(errno, "Failed to get partition device of %s: %m", arg_image);
2211 qn = udev_device_get_devnum(q);
/* Same device number as the whole disk: this is not a partition. */
2215 if (st.st_rdev == qn)
2218 node = udev_device_get_devnode(q);
2222 pp = blkid_partlist_devno_to_partition(pl, qn);
2226 flags = blkid_partition_get_flags(pp);
2227 if (flags & GPT_FLAG_NO_AUTO)
2230 nr = blkid_partition_get_partno(pp);
2234 stype = blkid_partition_get_type_string(pp);
2238 if (sd_id128_from_string(stype, &type_id) < 0)
2241 if (sd_id128_equal(type_id, GPT_HOME)) {
2243 if (home && nr >= home_nr)
2247 home_rw = !(flags & GPT_FLAG_READ_ONLY);
2250 home = strdup(node);
2253 } else if (sd_id128_equal(type_id, GPT_SRV)) {
2255 if (srv && nr >= srv_nr)
2259 srv_rw = !(flags & GPT_FLAG_READ_ONLY);
2266 #ifdef GPT_ROOT_NATIVE
2267 else if (sd_id128_equal(type_id, GPT_ROOT_NATIVE)) {
2269 if (root && nr >= root_nr)
2273 root_rw = !(flags & GPT_FLAG_READ_ONLY);
2276 root = strdup(node);
2281 #ifdef GPT_ROOT_SECONDARY
2282 else if (sd_id128_equal(type_id, GPT_ROOT_SECONDARY)) {
2284 if (secondary_root && nr >= secondary_root_nr)
2287 secondary_root_nr = nr;
2288 secondary_root_rw = !(flags & GPT_FLAG_READ_ONLY);
2291 free(secondary_root);
2292 secondary_root = strdup(node);
2293 if (!secondary_root)
2299 if (!root && !secondary_root) {
2300 log_error("Failed to identify root partition in disk image %s.\n"
2301 "Note that the disk image needs to follow http://www.freedesktop.org/wiki/Specifications/DiscoverablePartitionsSpec/ to be supported by systemd-nspawn.", arg_image);
/* The native root is preferred; the secondary root is only a fallback. */
2306 *root_device = root;
2309 *root_device_rw = root_rw;
2311 } else if (secondary_root) {
2312 *root_device = secondary_root;
/* Ownership moved to the caller; keep the cleanup handler from freeing it. */
2313 secondary_root = NULL;
2315 *root_device_rw = secondary_root_rw;
2320 *home_device = home;
2323 *home_device_rw = home_rw;
2330 *srv_device_rw = srv_rw;
2335 log_error("--image= is not supported, compiled without blkid support.");
/* Mounts the block device 'what' below 'where'.
 *
 * The target path is built from 'where' and 'directory'. The file system
 * type is detected with a libblkid superblock probe (TYPE value).
 * "crypto_LUKS" is rejected with an error. The mount always uses MS_NODEV
 * and adds MS_RDONLY when 'rw' is false.
 *
 * NOTE(review): mount_devices() passes directory == NULL for the root
 * device, so the strappenda() call is presumably guarded -- confirm.
 * NOTE(review): the final "compiled without blkid support" message
 * presumably belongs to a build-time fallback branch -- confirm. */
2340 static int mount_device(const char *what, const char *where, const char *directory, bool rw) {
2342 _cleanup_blkid_free_probe_ blkid_probe b = NULL;
2343 const char *fstype, *p;
2353 p = strappenda(where, directory);
2358 b = blkid_new_probe_from_filename(what);
2362 log_error_errno(errno, "Failed to allocate prober for %s: %m", what);
2366 blkid_probe_enable_superblocks(b, 1);
2367 blkid_probe_set_superblocks_flags(b, BLKID_SUBLKS_TYPE);
2370 r = blkid_do_safeprobe(b);
/* NOTE(review): dissect_image() tests the same call for (-2 || 1), here it
 * is (-1 || 1). libblkid documents -2 as "ambivalent result" and -1 as
 * "error", so -1 looks like it was meant to be -2 -- confirm. */
2371 if (r == -1 || r == 1) {
2372 log_error("Cannot determine file system type of %s", what);
2374 } else if (r != 0) {
2377 log_error_errno(errno, "Failed to probe %s: %m", what);
2382 if (blkid_probe_lookup_value(b, "TYPE", &fstype, NULL) < 0) {
2385 log_error("Failed to determine file system type of %s", what);
2389 if (streq(fstype, "crypto_LUKS")) {
2390 log_error("nspawn currently does not support LUKS disk images.");
2394 if (mount(what, p, fstype, MS_NODEV|(rw ? 0 : MS_RDONLY), NULL) < 0)
2395 return log_error_errno(errno, "Failed to mount %s: %m", what);
2399 log_error("--image= is not supported, compiled without blkid support.");
/* Mounts the partitions found by dissect_image() into arg_directory.
 *
 * mount_device() is called up to three times, always with arg_directory as
 * the base: the root device with no sub-directory, the home device on
 * "/home", and the srv device on "/srv". Each *_rw flag is passed through
 * unchanged. A failing mount is logged and its error returned. */
2404 static int mount_devices(
2406 const char *root_device, bool root_device_rw,
2407 const char *home_device, bool home_device_rw,
2408 const char *srv_device, bool srv_device_rw) {
2414 r = mount_device(root_device, arg_directory, NULL, root_device_rw);
2416 return log_error_errno(r, "Failed to mount root directory: %m");
2420 r = mount_device(home_device, arg_directory, "/home", home_device_rw);
2422 return log_error_errno(r, "Failed to mount home directory: %m");
2426 r = mount_device(srv_device, arg_directory, "/srv", srv_device_rw);
2428 return log_error_errno(r, "Failed to mount server data directory: %m");
/* Best-effort teardown of a loop device; failures only produce warnings.
 *
 * If 'image_fd' points at an open descriptor, LOOP_CLR_FD is issued on it
 * and the descriptor is closed (*image_fd is overwritten with the result
 * of safe_close()). Afterwards /dev/loop-control is opened and
 * LOOP_CTL_REMOVE is issued for loop number 'nr'. */
2434 static void loop_remove(int nr, int *image_fd) {
2435 _cleanup_close_ int control = -1;
2441 if (image_fd && *image_fd >= 0) {
/* Detach the backing file from the loop device. */
2442 r = ioctl(*image_fd, LOOP_CLR_FD);
2444 log_warning_errno(errno, "Failed to close loop image: %m");
2445 *image_fd = safe_close(*image_fd);
2448 control = open("/dev/loop-control", O_RDWR|O_CLOEXEC|O_NOCTTY|O_NONBLOCK);
2450 log_warning_errno(errno, "Failed to open /dev/loop-control: %m");
2454 r = ioctl(control, LOOP_CTL_REMOVE, nr);
2456 log_warning_errno(errno, "Failed to remove loop %d: %m", nr);
/* Forks a child that runs "getent <database> <key>" with its output on a
 * pipe.
 *
 * In the child, the write end of the pipe becomes stdout, /dev/null becomes
 * stdin and stderr, signal handlers are reset and all other descriptors are
 * closed. getent is then executed with an empty environment, trying
 * /usr/bin/getent first and /bin/getent second. Any failure in the child
 * ends it with _exit(EXIT_FAILURE).
 *
 * In the parent the write end of the pipe is closed. Callers fdopen() the
 * return value for reading.
 * NOTE(review): presumably *rpid receives the child's PID, which callers
 * later wait on -- confirm. */
2459 static int spawn_getent(const char *database, const char *key, pid_t *rpid) {
2467 if (pipe2(pipe_fds, O_CLOEXEC) < 0)
2468 return log_error_errno(errno, "Failed to allocate pipe: %m");
2472 return log_error_errno(errno, "Failed to fork getent child: %m");
2473 else if (pid == 0) {
2475 char *empty_env = NULL;
2477 if (dup3(pipe_fds[1], STDOUT_FILENO, 0) < 0)
2478 _exit(EXIT_FAILURE);
/* Only close the pipe ends if they do not coincide with stdin/out/err. */
2480 if (pipe_fds[0] > 2)
2481 safe_close(pipe_fds[0]);
2482 if (pipe_fds[1] > 2)
2483 safe_close(pipe_fds[1]);
2485 nullfd = open("/dev/null", O_RDWR);
2487 _exit(EXIT_FAILURE);
2489 if (dup3(nullfd, STDIN_FILENO, 0) < 0)
2490 _exit(EXIT_FAILURE);
2492 if (dup3(nullfd, STDERR_FILENO, 0) < 0)
2493 _exit(EXIT_FAILURE);
2498 reset_all_signal_handlers();
2499 close_all_fds(NULL, 0);
/* The second execle() is reached only if the first one failed. */
2501 execle("/usr/bin/getent", "getent", database, key, NULL, &empty_env);
2502 execle("/bin/getent", "getent", database, key, NULL, &empty_env);
2503 _exit(EXIT_FAILURE);
2506 pipe_fds[1] = safe_close(pipe_fds[1]);
/* Switches the calling process to the user named by arg_user.
 *
 * If arg_user is unset, "root" or "0", the supplementary groups are cleared
 * and real/effective/saved GID and UID are set to 0.
 *
 * Otherwise the user is resolved with two getent helper processes (see
 * spawn_getent()):
 *   1. "getent passwd <user>": one line is read and split at ':' into
 *      its fields; UID and GID are parsed with parse_uid() / parse_gid().
 *   2. "getent initgroups <user>": one line is read, the leading user name
 *      is skipped, and every remaining word is parsed as a group ID.
 * The home directory and its parents are then created. stdin, stdout and
 * stderr are chown()ed to the user. Finally setgroups(), setresgid() and
 * setresuid() are applied, in that order.
 *
 * NOTE(review): presumably the home directory is handed back through
 * '_home' -- confirm. */
2513 static int change_uid_gid(char **_home) {
2514 char line[LINE_MAX], *x, *u, *g, *h;
2515 const char *word, *state;
2516 _cleanup_free_ uid_t *uids = NULL;
2517 _cleanup_free_ char *home = NULL;
2518 _cleanup_fclose_ FILE *f = NULL;
2519 _cleanup_close_ int fd = -1;
2520 unsigned n_uids = 0;
2529 if (!arg_user || streq(arg_user, "root") || streq(arg_user, "0")) {
2530 /* Reset everything fully to 0, just in case */
2532 if (setgroups(0, NULL) < 0)
2533 return log_error_errno(errno, "setgroups() failed: %m");
/* NOTE(review): the two messages below name setregid()/setreuid(), but the
 * calls are setresgid()/setresuid(). The same mismatch exists at the end
 * of this function. */
2535 if (setresgid(0, 0, 0) < 0)
2536 return log_error_errno(errno, "setregid() failed: %m");
2538 if (setresuid(0, 0, 0) < 0)
2539 return log_error_errno(errno, "setreuid() failed: %m");
2545 /* First, get user credentials */
2546 fd = spawn_getent("passwd", arg_user, &pid);
2550 f = fdopen(fd, "r");
2555 if (!fgets(line, sizeof(line), f)) {
2558 log_error("Failed to resolve user %s.", arg_user);
2562 log_error_errno(errno, "Failed to read from getent: %m");
2568 wait_for_terminate_and_warn("getent passwd", pid, true);
/* Walk the ':'-separated passwd fields one separator at a time. */
2570 x = strchr(line, ':');
2572 log_error("/etc/passwd entry has invalid user field.");
2576 u = strchr(x+1, ':');
2578 log_error("/etc/passwd entry has invalid password field.");
2585 log_error("/etc/passwd entry has invalid UID field.");
2593 log_error("/etc/passwd entry has invalid GID field.");
2598 h = strchr(x+1, ':');
2600 log_error("/etc/passwd entry has invalid GECOS field.");
2607 log_error("/etc/passwd entry has invalid home directory field.");
2613 r = parse_uid(u, &uid);
2615 log_error("Failed to parse UID of user.");
2619 r = parse_gid(g, &gid);
2621 log_error("Failed to parse GID of user.");
2629 /* Second, get group memberships */
2630 fd = spawn_getent("initgroups", arg_user, &pid);
2635 f = fdopen(fd, "r");
2640 if (!fgets(line, sizeof(line), f)) {
2642 log_error("Failed to resolve user %s.", arg_user);
2646 log_error_errno(errno, "Failed to read from getent: %m");
2652 wait_for_terminate_and_warn("getent initgroups", pid, true);
2654 /* Skip over the username and subsequent separator whitespace */
2656 x += strcspn(x, WHITESPACE);
2657 x += strspn(x, WHITESPACE);
2659 FOREACH_WORD(word, l, x, state) {
/* Grow the array by one slot before storing the next parsed ID. */
2665 if (!GREEDY_REALLOC(uids, sz, n_uids+1))
2668 r = parse_uid(c, &uids[n_uids++]);
2670 log_error("Failed to parse group data from getent.");
2675 r = mkdir_parents(home, 0775);
2677 return log_error_errno(r, "Failed to make home root directory: %m");
2679 r = mkdir_safe(home, 0755, uid, gid);
/* An already existing home directory is not an error. */
2680 if (r < 0 && r != -EEXIST)
2681 return log_error_errno(r, "Failed to make home directory: %m");
/* The results of these three fchown() calls are not checked. */
2683 fchown(STDIN_FILENO, uid, gid);
2684 fchown(STDOUT_FILENO, uid, gid);
2685 fchown(STDERR_FILENO, uid, gid);
2687 if (setgroups(n_uids, uids) < 0)
2688 return log_error_errno(errno, "Failed to set auxiliary groups: %m");
2690 if (setresgid(gid, gid, gid) < 0)
2691 return log_error_errno(errno, "setregid() failed: %m");
2693 if (setresuid(uid, uid, uid) < 0)
2694 return log_error_errno(errno, "setreuid() failed: %m");
2706 * < 0 : wait_for_terminate() failed to get the state of the
2707 * container, the container was terminated by a signal, or
2708 * failed for an unknown reason. No change is made to the
2709 * container argument.
2710 * > 0 : The program executed in the container terminated with an
2711 * error. The exit code of the program executed in the
2712 * container is returned. The container argument has been set
2713 * to CONTAINER_TERMINATED.
2714 * 0 : The container is being rebooted, has been shut down or exited
2715 * successfully. The container argument has been set to either
2716 * CONTAINER_TERMINATED or CONTAINER_REBOOTED.
2718 * That is, success is indicated by a return value of zero, and an
2719 * error is indicated by a non-zero value.
/* Waits for the container process 'pid' and classifies how it ended.
 *
 * The siginfo filled in by wait_for_terminate() is examined:
 *   - a normal exit is logged as success (status 0) or as a failure with
 *     its code; *container is set to CONTAINER_TERMINATED and the exit
 *     status is returned
 *   - si_status == SIGINT is logged as a shutdown (CONTAINER_TERMINATED)
 *   - si_status == SIGHUP is logged as a reboot (CONTAINER_REBOOTED)
 *   - other signals, and unrecognized si_code values, are logged as errors
 * Informational messages are demoted to LOG_DEBUG when arg_quiet is set. */
2721 static int wait_for_container(pid_t pid, ContainerStatus *container) {
2725 r = wait_for_terminate(pid, &status);
2727 return log_warning_errno(r, "Failed to wait for container: %m");
2729 switch (status.si_code) {
2732 if (status.si_status == 0) {
2733 log_full(arg_quiet ? LOG_DEBUG : LOG_INFO, "Container %s exited successfully.", arg_machine);
2736 log_full(arg_quiet ? LOG_DEBUG : LOG_INFO, "Container %s failed with error code %i.", arg_machine, status.si_status);
2738 *container = CONTAINER_TERMINATED;
2739 return status.si_status;
2742 if (status.si_status == SIGINT) {
2744 log_full(arg_quiet ? LOG_DEBUG : LOG_INFO, "Container %s has been shut down.", arg_machine);
2745 *container = CONTAINER_TERMINATED;
2748 } else if (status.si_status == SIGHUP) {
2750 log_full(arg_quiet ? LOG_DEBUG : LOG_INFO, "Container %s is being rebooted.", arg_machine);
2751 *container = CONTAINER_REBOOTED;
2755 /* CLD_KILLED fallthrough */
2758 log_error("Container %s terminated by signal %s.", arg_machine, signal_to_string(status.si_status));
2762 log_error("Container %s failed due to unknown reason.", arg_machine);
/* Signal handler that does nothing. main() installs it for SIGCHLD so that
 * the signal interrupts blocking calls instead of being ignored. */
2769 static void nop_handler(int sig) {}
/* sd-event signal callback that asks the container to shut down.
 *
 * 'userdata' carries the container PID (decoded with PTR_TO_UINT32). The
 * container is sent SIGRTMIN+3. If that succeeds a hint is logged and the
 * event source's userdata is cleared. The function also contains a call
 * that makes the event loop exit with code 0.
 *
 * NOTE(review): presumably clearing the userdata makes a repeated signal
 * skip the kill() and go straight to sd_event_exit(), as the log text
 * suggests -- confirm. */
2771 static int on_orderly_shutdown(sd_event_source *s, const struct signalfd_siginfo *si, void *userdata) {
2774 pid = PTR_TO_UINT32(userdata);
2776 if (kill(pid, SIGRTMIN+3) >= 0) {
2777 log_info("Trying to halt container. Send SIGTERM again to trigger immediate termination.");
2778 sd_event_source_set_userdata(s, NULL);
2783 sd_event_exit(sd_event_source_get_event(s), 0);
2787 int main(int argc, char *argv[]) {
2789 _cleanup_free_ char *device_path = NULL, *root_device = NULL, *home_device = NULL, *srv_device = NULL;
2790 bool root_device_rw = true, home_device_rw = true, srv_device_rw = true;
2791 _cleanup_close_ int master = -1, image_fd = -1;
2792 _cleanup_close_pair_ int kmsg_socket_pair[2] = { -1, -1 };
2793 _cleanup_fdset_free_ FDSet *fds = NULL;
2794 int r = EXIT_FAILURE, k, n_fd_passed, loop_nr = -1;
2795 const char *console = NULL;
2796 char veth_name[IFNAMSIZ];
2797 bool secondary = false;
2798 sigset_t mask, mask_chld;
2801 log_parse_environment();
2804 k = parse_argv(argc, argv);
2813 if (arg_directory) {
2816 p = path_make_absolute_cwd(arg_directory);
2817 free(arg_directory);
2820 arg_directory = get_current_dir_name();
2822 if (!arg_directory) {
2823 log_error("Failed to determine path, please use -D.");
2826 path_kill_slashes(arg_directory);
2830 arg_machine = strdup(basename(arg_image ? arg_image : arg_directory));
2836 hostname_cleanup(arg_machine, false);
2837 if (isempty(arg_machine)) {
2838 log_error("Failed to determine machine name automatically, please use -M.");
2843 if (geteuid() != 0) {
2844 log_error("Need to be root.");
2848 if (sd_booted() <= 0) {
2849 log_error("Not running on a systemd system.");
2854 n_fd_passed = sd_listen_fds(false);
2855 if (n_fd_passed > 0) {
2856 k = fdset_new_listen_fds(&fds, false);
2858 log_error_errno(k, "Failed to collect file descriptors: %m");
2862 fdset_close_others(fds);
2865 if (arg_directory) {
2866 if (path_equal(arg_directory, "/")) {
2867 log_error("Spawning container on root directory not supported.");
2872 if (path_is_os_tree(arg_directory) <= 0) {
2873 log_error("Directory %s doesn't look like an OS root directory (os-release file is missing). Refusing.", arg_directory);
2879 p = strappenda(arg_directory,
2880 argc > optind && path_is_absolute(argv[optind]) ? argv[optind] : "/usr/bin/");
2881 if (access(p, F_OK) < 0) {
2882 log_error("Directory %s lacks the binary to execute or doesn't look like a binary tree. Refusing.", arg_directory);
2888 char template[] = "/tmp/nspawn-root-XXXXXX";
2890 if (!mkdtemp(template)) {
2891 log_error_errno(errno, "Failed to create temporary directory: %m");
2896 arg_directory = strdup(template);
2897 if (!arg_directory) {
2902 image_fd = setup_image(&device_path, &loop_nr);
2908 r = dissect_image(image_fd,
2909 &root_device, &root_device_rw,
2910 &home_device, &home_device_rw,
2911 &srv_device, &srv_device_rw,
2917 master = posix_openpt(O_RDWR|O_NOCTTY|O_CLOEXEC|O_NDELAY);
2919 log_error_errno(errno, "Failed to acquire pseudo tty: %m");
2923 console = ptsname(master);
2925 log_error_errno(errno, "Failed to determine tty name: %m");
2930 log_info("Spawning container %s on %s.\nPress ^] three times within 1s to kill container.",
2931 arg_machine, arg_image ? arg_image : arg_directory);
2933 if (unlockpt(master) < 0) {
2934 log_error_errno(errno, "Failed to unlock tty: %m");
2938 if (socketpair(AF_UNIX, SOCK_DGRAM|SOCK_NONBLOCK|SOCK_CLOEXEC, 0, kmsg_socket_pair) < 0) {
2939 log_error_errno(errno, "Failed to create kmsg socket pair: %m");
2945 "STATUS=Container running.");
2947 assert_se(sigemptyset(&mask) == 0);
2948 sigset_add_many(&mask, SIGCHLD, SIGWINCH, SIGTERM, SIGINT, -1);
2949 assert_se(sigprocmask(SIG_BLOCK, &mask, NULL) == 0);
2951 assert_se(sigemptyset(&mask_chld) == 0);
2952 assert_se(sigaddset(&mask_chld, SIGCHLD) == 0);
2955 ContainerStatus container_status;
2956 _cleanup_(barrier_destroy) Barrier barrier = BARRIER_NULL;
2957 struct sigaction sa = {
2958 .sa_handler = nop_handler,
2959 .sa_flags = SA_NOCLDSTOP,
2962 r = barrier_create(&barrier);
2964 log_error_errno(r, "Cannot initialize IPC barrier: %m");
2968 /* Child can be killed before execv(), so handle SIGCHLD
2969 * in order to interrupt parent's blocking calls and
2970 * give it a chance to call wait() and terminate. */
2971 r = sigprocmask(SIG_UNBLOCK, &mask_chld, NULL);
2973 log_error_errno(errno, "Failed to change the signal mask: %m");
2977 r = sigaction(SIGCHLD, &sa, NULL);
2979 log_error_errno(errno, "Failed to install SIGCHLD handler: %m");
2983 pid = syscall(__NR_clone, SIGCHLD|CLONE_NEWNS|
2984 (arg_share_system ? 0 : CLONE_NEWIPC|CLONE_NEWPID|CLONE_NEWUTS)|
2985 (arg_private_network ? CLONE_NEWNET : 0), NULL);
2987 if (errno == EINVAL)
2988 log_error_errno(errno, "clone() failed, do you have namespace support enabled in your kernel? (You need UTS, IPC, PID and NET namespacing built in): %m");
2990 log_error_errno(errno, "clone() failed: %m");
2998 _cleanup_free_ char *home = NULL;
3000 const char *envp[] = {
3001 "PATH=" DEFAULT_PATH_SPLIT_USR,
3002 "container=systemd-nspawn", /* LXC sets container=lxc, so follow the scheme here */
3007 NULL, /* container_uuid */
3008 NULL, /* LISTEN_FDS */
3009 NULL, /* LISTEN_PID */
3014 barrier_set_role(&barrier, BARRIER_CHILD);
3016 envp[n_env] = strv_find_prefix(environ, "TERM=");
3020 master = safe_close(master);
3022 close_nointr(STDIN_FILENO);
3023 close_nointr(STDOUT_FILENO);
3024 close_nointr(STDERR_FILENO);
3026 kmsg_socket_pair[0] = safe_close(kmsg_socket_pair[0]);
3028 reset_all_signal_handlers();
3029 reset_signal_mask();
3031 k = open_terminal(console, O_RDWR);
3032 if (k != STDIN_FILENO) {
3038 log_error_errno(k, "Failed to open console: %m");
3039 _exit(EXIT_FAILURE);
3042 if (dup2(STDIN_FILENO, STDOUT_FILENO) != STDOUT_FILENO ||
3043 dup2(STDIN_FILENO, STDERR_FILENO) != STDERR_FILENO) {
3044 log_error_errno(errno, "Failed to duplicate console: %m");
3045 _exit(EXIT_FAILURE);
3049 log_error_errno(errno, "setsid() failed: %m");
3050 _exit(EXIT_FAILURE);
3053 if (reset_audit_loginuid() < 0)
3054 _exit(EXIT_FAILURE);
3056 if (prctl(PR_SET_PDEATHSIG, SIGKILL) < 0) {
3057 log_error_errno(errno, "PR_SET_PDEATHSIG failed: %m");
3058 _exit(EXIT_FAILURE);
3061 /* Mark everything as slave, so that we still
3062 * receive mounts from the real root, but don't
3063 * propagate mounts to the real root. */
3064 if (mount(NULL, "/", NULL, MS_SLAVE|MS_REC, NULL) < 0) {
3065 log_error_errno(errno, "MS_SLAVE|MS_REC failed: %m");
3066 _exit(EXIT_FAILURE);
3069 if (mount_devices(arg_directory,
3070 root_device, root_device_rw,
3071 home_device, home_device_rw,
3072 srv_device, srv_device_rw) < 0)
3073 _exit(EXIT_FAILURE);
3075 /* Turn directory into bind mount */
3076 if (mount(arg_directory, arg_directory, "bind", MS_BIND|MS_REC, NULL) < 0) {
3077 log_error_errno(errno, "Failed to make bind mount: %m");
3078 _exit(EXIT_FAILURE);
3081 r = setup_volatile(arg_directory);
3083 _exit(EXIT_FAILURE);
3085 if (setup_volatile_state(arg_directory) < 0)
3086 _exit(EXIT_FAILURE);
3088 r = base_filesystem_create(arg_directory);
3090 _exit(EXIT_FAILURE);
3092 if (arg_read_only) {
3093 k = bind_remount_recursive(arg_directory, true);
3095 log_error_errno(k, "Failed to make tree read-only: %m");
3096 _exit(EXIT_FAILURE);
3100 if (mount_all(arg_directory) < 0)
3101 _exit(EXIT_FAILURE);
3103 if (copy_devnodes(arg_directory) < 0)
3104 _exit(EXIT_FAILURE);
3106 if (setup_ptmx(arg_directory) < 0)
3107 _exit(EXIT_FAILURE);
3109 dev_setup(arg_directory);
3111 if (setup_seccomp() < 0)
3112 _exit(EXIT_FAILURE);
3114 if (setup_dev_console(arg_directory, console) < 0)
3115 _exit(EXIT_FAILURE);
3117 if (setup_kmsg(arg_directory, kmsg_socket_pair[1]) < 0)
3118 _exit(EXIT_FAILURE);
3120 kmsg_socket_pair[1] = safe_close(kmsg_socket_pair[1]);
3122 if (setup_boot_id(arg_directory) < 0)
3123 _exit(EXIT_FAILURE);
3125 if (setup_timezone(arg_directory) < 0)
3126 _exit(EXIT_FAILURE);
3128 if (setup_resolv_conf(arg_directory) < 0)
3129 _exit(EXIT_FAILURE);
3131 if (setup_journal(arg_directory) < 0)
3132 _exit(EXIT_FAILURE);
3134 if (mount_binds(arg_directory, arg_bind, false) < 0)
3135 _exit(EXIT_FAILURE);
3137 if (mount_binds(arg_directory, arg_bind_ro, true) < 0)
3138 _exit(EXIT_FAILURE);
3140 if (mount_tmpfs(arg_directory) < 0)
3141 _exit(EXIT_FAILURE);
3143 /* Tell the parent that we are ready, and that
3144 * it can cgroupify us so that we lack access
3145 * to certain devices and resources. */
3146 (void)barrier_place(&barrier);
3148 if (chdir(arg_directory) < 0) {
3149 log_error_errno(errno, "chdir(%s) failed: %m", arg_directory);
3150 _exit(EXIT_FAILURE);
3153 if (mount(arg_directory, "/", NULL, MS_MOVE, NULL) < 0) {
3154 log_error_errno(errno, "mount(MS_MOVE) failed: %m");
3155 _exit(EXIT_FAILURE);
3158 if (chroot(".") < 0) {
3159 log_error_errno(errno, "chroot() failed: %m");
3160 _exit(EXIT_FAILURE);
3163 if (chdir("/") < 0) {
3164 log_error_errno(errno, "chdir() failed: %m");
3165 _exit(EXIT_FAILURE);
3170 if (arg_private_network)
3173 if (drop_capabilities() < 0) {
3174 log_error_errno(errno, "drop_capabilities() failed: %m");
3175 _exit(EXIT_FAILURE);
3178 r = change_uid_gid(&home);
3180 _exit(EXIT_FAILURE);
3182 if ((asprintf((char**)(envp + n_env++), "HOME=%s", home ? home: "/root") < 0) ||
3183 (asprintf((char**)(envp + n_env++), "USER=%s", arg_user ? arg_user : "root") < 0) ||
3184 (asprintf((char**)(envp + n_env++), "LOGNAME=%s", arg_user ? arg_user : "root") < 0)) {
3186 _exit(EXIT_FAILURE);
3189 if (!sd_id128_equal(arg_uuid, SD_ID128_NULL)) {
3192 if (asprintf((char**)(envp + n_env++), "container_uuid=%s", id128_format_as_uuid(arg_uuid, as_uuid)) < 0) {
3194 _exit(EXIT_FAILURE);
3198 if (fdset_size(fds) > 0) {
3199 k = fdset_cloexec(fds, false);
3201 log_error("Failed to unset O_CLOEXEC for file descriptors.");
3202 _exit(EXIT_FAILURE);
3205 if ((asprintf((char **)(envp + n_env++), "LISTEN_FDS=%u", n_fd_passed) < 0) ||
3206 (asprintf((char **)(envp + n_env++), "LISTEN_PID=1") < 0)) {
3208 _exit(EXIT_FAILURE);
3214 if (arg_personality != 0xffffffffLU) {
3215 if (personality(arg_personality) < 0) {
3216 log_error_errno(errno, "personality() failed: %m");
3217 _exit(EXIT_FAILURE);
3219 } else if (secondary) {
3220 if (personality(PER_LINUX32) < 0) {
3221 log_error_errno(errno, "personality() failed: %m");
3222 _exit(EXIT_FAILURE);
3227 if (arg_selinux_context)
3228 if (setexeccon((security_context_t) arg_selinux_context) < 0) {
3229 log_error_errno(errno, "setexeccon(\"%s\") failed: %m", arg_selinux_context);
3230 _exit(EXIT_FAILURE);
3234 if (!strv_isempty(arg_setenv)) {
3237 n = strv_env_merge(2, envp, arg_setenv);
3240 _exit(EXIT_FAILURE);
3245 env_use = (char**) envp;
3247 /* Wait until the parent is ready with the setup, too... */
3248 if (!barrier_place_and_sync(&barrier))
3249 _exit(EXIT_FAILURE);
3255 /* Automatically search for the init system */
3257 l = 1 + argc - optind;
3258 a = newa(char*, l + 1);
3259 memcpy(a + 1, argv + optind, l * sizeof(char*));
3261 a[0] = (char*) "/usr/lib/systemd/systemd";
3262 execve(a[0], a, env_use);
3264 a[0] = (char*) "/lib/systemd/systemd";
3265 execve(a[0], a, env_use);
3267 a[0] = (char*) "/sbin/init";
3268 execve(a[0], a, env_use);
3269 } else if (argc > optind)
3270 execvpe(argv[optind], argv + optind, env_use);
3272 chdir(home ? home : "/root");
3273 execle("/bin/bash", "-bash", NULL, env_use);
3274 execle("/bin/sh", "-sh", NULL, env_use);
3277 log_error_errno(errno, "execv() failed: %m");
3278 _exit(EXIT_FAILURE);
3281 barrier_set_role(&barrier, BARRIER_PARENT);
3285 /* wait for child-setup to be done */
3286 if (barrier_place_and_sync(&barrier)) {
3287 _cleanup_event_unref_ sd_event *event = NULL;
3288 _cleanup_(pty_forward_freep) PTYForward *forward = NULL;
3291 r = move_network_interfaces(pid);
3295 r = setup_veth(pid, veth_name, &ifi);
3299 r = setup_bridge(veth_name, &ifi);
3303 r = setup_macvlan(pid);
3307 r = register_machine(pid, ifi);
3311 /* Block SIGCHLD here, before notifying child.
3312 * process_pty() will handle it with the other signals. */
3313 r = sigprocmask(SIG_BLOCK, &mask_chld, NULL);
3317 /* Reset signal to default */
3318 r = default_signals(SIGCHLD, -1);
3322 /* Notify the child that the parent is ready with all
3323 * its setup, and that the child can now hand over
3324 * control to the code to run inside the container. */
3325 (void)barrier_place(&barrier);
3327 r = sd_event_new(&event);
3329 log_error_errno(r, "Failed to get default event source: %m");
3334 /* Try to kill the init system on SIGINT or SIGTERM */
3335 sd_event_add_signal(event, NULL, SIGINT, on_orderly_shutdown, UINT32_TO_PTR(pid));
3336 sd_event_add_signal(event, NULL, SIGTERM, on_orderly_shutdown, UINT32_TO_PTR(pid));
3338 /* Immediately exit */
3339 sd_event_add_signal(event, NULL, SIGINT, NULL, NULL);
3340 sd_event_add_signal(event, NULL, SIGTERM, NULL, NULL);
3343 /* simply exit on sigchld */
3344 sd_event_add_signal(event, NULL, SIGCHLD, NULL, NULL);
3346 r = pty_forward_new(event, master, &forward);
3348 log_error_errno(r, "Failed to create PTY forwarder: %m");
3352 r = sd_event_loop(event);
3354 return log_error_errno(r, "Failed to run event loop: %m");
3356 forward = pty_forward_free(forward);
3361 /* Kill if it is not dead yet anyway */
3362 terminate_machine(pid);
3365 /* Normally redundant, but better safe than sorry */
3368 r = wait_for_container(pid, &container_status);
3372 /* We failed to wait for the container, or the
3373 * container exited abnormally */
3376 } else if (r > 0 || container_status == CONTAINER_TERMINATED)
3377 /* The container exited with a non-zero
3378 * status, or with zero status and no reboot
3382 /* CONTAINER_REBOOTED, loop again */
3384 if (arg_keep_unit) {
3385 /* Special handling if we are running as a
3386 * service: instead of simply restarting the
3387 * machine we want to restart the entire
3388 * service, so let's inform systemd about this
3389 * with the special exit code 133. The service
3390 * file uses RestartForceExitStatus=133 so
3391 * that this results in a full nspawn
3392 * restart. This is necessary since we might
3393 * have cgroup parameters set we want to have
3403 "STATUS=Terminating...");
3405 loop_remove(loop_nr, &image_fd);
3410 free(arg_directory);
3413 strv_free(arg_setenv);
3414 strv_free(arg_network_interfaces);
3415 strv_free(arg_network_macvlan);
3416 strv_free(arg_bind);
3417 strv_free(arg_bind_ro);
3418 strv_free(arg_tmpfs);