1 /*-*- Mode: C; c-basic-offset: 8; indent-tabs-mode: nil -*-*/
4 This file is part of systemd.
6 Copyright 2010 Lennart Poettering
8 systemd is free software; you can redistribute it and/or modify it
9 under the terms of the GNU Lesser General Public License as published by
10 the Free Software Foundation; either version 2.1 of the License, or
11 (at your option) any later version.
13 systemd is distributed in the hope that it will be useful, but
14 WITHOUT ANY WARRANTY; without even the implied warranty of
15 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
16 Lesser General Public License for more details.
18 You should have received a copy of the GNU Lesser General Public License
19 along with systemd; If not, see <http://www.gnu.org/licenses/>.
25 #include <sys/types.h>
26 #include <sys/syscall.h>
27 #include <sys/mount.h>
33 #include <sys/prctl.h>
34 #include <sys/capability.h>
37 #include <sys/signalfd.h>
41 #include <sys/socket.h>
42 #include <linux/netlink.h>
44 #include <linux/veth.h>
45 #include <sys/personality.h>
46 #include <linux/loop.h>
49 #include <selinux/selinux.h>
57 #include <blkid/blkid.h>
60 #include "sd-daemon.h"
70 #include "cgroup-util.h"
72 #include "path-util.h"
73 #include "loopback-setup.h"
74 #include "dev-setup.h"
79 #include "bus-error.h"
81 #include "bus-kernel.h"
84 #include "rtnl-util.h"
85 #include "udev-util.h"
86 #include "blkid-util.h"
88 #include "siphash24.h"
90 #include "base-filesystem.h"
92 #include "event-util.h"
96 #include "seccomp-util.h"
99 typedef enum ContainerStatus {
100 CONTAINER_TERMINATED,
104 typedef enum LinkJournal {
111 typedef enum Volatile {
/* Command-line configuration. The initializers below are the defaults;
 * parse_argv() overwrites them according to the options given. */
117 static char *arg_directory = NULL;
118 static char *arg_user = NULL;
119 static sd_id128_t arg_uuid = {};
120 static char *arg_machine = NULL;
121 static const char *arg_selinux_context = NULL;
122 static const char *arg_selinux_apifs_context = NULL;
123 static const char *arg_slice = NULL;
124 static bool arg_private_network = false;
125 static bool arg_read_only = false;
126 static bool arg_boot = false;
127 static LinkJournal arg_link_journal = LINK_AUTO;
128 static bool arg_link_journal_try = false;
/* Bit mask with one bit per CAP_* number. parse_argv() ORs in the
 * --capability= bits (and CAP_NET_ADMIN for private networking) and then
 * masks out the --drop-capability= bits. drop_capabilities() passes the
 * inverted mask to capability_bounding_set_drop().
 * NOTE(review): the initializer is cut off in this listing (it ends on a
 * trailing | operator); the remaining terms are not visible here. */
129 static uint64_t arg_retain =
130 (1ULL << CAP_CHOWN) |
131 (1ULL << CAP_DAC_OVERRIDE) |
132 (1ULL << CAP_DAC_READ_SEARCH) |
133 (1ULL << CAP_FOWNER) |
134 (1ULL << CAP_FSETID) |
135 (1ULL << CAP_IPC_OWNER) |
137 (1ULL << CAP_LEASE) |
138 (1ULL << CAP_LINUX_IMMUTABLE) |
139 (1ULL << CAP_NET_BIND_SERVICE) |
140 (1ULL << CAP_NET_BROADCAST) |
141 (1ULL << CAP_NET_RAW) |
142 (1ULL << CAP_SETGID) |
143 (1ULL << CAP_SETFCAP) |
144 (1ULL << CAP_SETPCAP) |
145 (1ULL << CAP_SETUID) |
146 (1ULL << CAP_SYS_ADMIN) |
147 (1ULL << CAP_SYS_CHROOT) |
148 (1ULL << CAP_SYS_NICE) |
149 (1ULL << CAP_SYS_PTRACE) |
150 (1ULL << CAP_SYS_TTY_CONFIG) |
151 (1ULL << CAP_SYS_RESOURCE) |
152 (1ULL << CAP_SYS_BOOT) |
153 (1ULL << CAP_AUDIT_WRITE) |
154 (1ULL << CAP_AUDIT_CONTROL) |
/* Path lists: arg_bind, arg_bind_ro and arg_tmpfs are stored as flat
 * string vectors holding consecutive pairs (see the two strv_extend() /
 * strv_push() calls per option in parse_argv()). */
156 static char **arg_bind = NULL;
157 static char **arg_bind_ro = NULL;
158 static char **arg_tmpfs = NULL;
159 static char **arg_setenv = NULL;
160 static bool arg_quiet = false;
161 static bool arg_share_system = false;
162 static bool arg_register = true;
163 static bool arg_keep_unit = false;
164 static char **arg_network_interfaces = NULL;
165 static char **arg_network_macvlan = NULL;
166 static bool arg_network_veth = false;
167 static const char *arg_network_bridge = NULL;
/* 0xffffffffLU is also the value parse_argv() rejects as an unknown or
 * unsupported personality, so here it stands for "not set". */
168 static unsigned long arg_personality = 0xffffffffLU;
169 static const char *arg_image = NULL;
170 static Volatile arg_volatile = VOLATILE_NO;
/* Print the usage text to stdout with printf(). The program name in the
 * first line is taken from program_invocation_short_name.
 * NOTE(review): the option table in parse_argv() accepts --personality=,
 * but no matching entry is visible in this text -- confirm whether it
 * should be listed. */
172 static void help(void) {
173 printf("%s [OPTIONS...] [PATH] [ARGUMENTS...]\n\n"
174 "Spawn a minimal namespace container for debugging, testing and building.\n\n"
175 " -h --help Show this help\n"
176 " --version Print version string\n"
177 " -q --quiet Do not show status information\n"
178 " -D --directory=PATH Root directory for the container\n"
179 " -i --image=PATH File system device or image for the container\n"
180 " -b --boot Boot up full system (i.e. invoke init)\n"
181 " -u --user=USER Run the command under specified user or uid\n"
182 " -M --machine=NAME Set the machine name for the container\n"
183 " --uuid=UUID Set a specific machine UUID for the container\n"
184 " -S --slice=SLICE Place the container in the specified slice\n"
185 " --private-network Disable network in container\n"
186 " --network-interface=INTERFACE\n"
187 " Assign an existing network interface to the\n"
189 " --network-macvlan=INTERFACE\n"
190 " Create a macvlan network interface based on an\n"
191 " existing network interface to the container\n"
192 " --network-veth Add a virtual ethernet connection between host\n"
194 " --network-bridge=INTERFACE\n"
195 " Add a virtual ethernet connection between host\n"
196 " and container and add it to an existing bridge on\n"
198 " -Z --selinux-context=SECLABEL\n"
199 " Set the SELinux security context to be used by\n"
200 " processes in the container\n"
201 " -L --selinux-apifs-context=SECLABEL\n"
202 " Set the SELinux security context to be used by\n"
203 " API/tmpfs file systems in the container\n"
204 " --capability=CAP In addition to the default, retain specified\n"
206 " --drop-capability=CAP Drop the specified capability from the default set\n"
207 " --link-journal=MODE Link up guest journal, one of no, auto, guest, host,\n"
208 " try-guest, try-host\n"
209 " -j Equivalent to --link-journal=try-guest\n"
210 " --read-only Mount the root directory read-only\n"
211 " --bind=PATH[:PATH] Bind mount a file or directory from the host into\n"
213 " --bind-ro=PATH[:PATH] Similar, but creates a read-only bind mount\n"
214 " --tmpfs=PATH:[OPTIONS] Mount an empty tmpfs to the specified directory\n"
215 " --setenv=NAME=VALUE Pass an environment variable to PID 1\n"
216 " --share-system Share system namespaces with host\n"
217 " --register=BOOLEAN Register container as machine\n"
218 " --keep-unit Do not register a scope for the machine, reuse\n"
219 " the service unit nspawn is running in\n"
220 " --volatile[=MODE] Run the system in volatile mode\n",
221 program_invocation_short_name);
/* Parse the command line with getopt_long() and store the results in the
 * arg_* globals. After the option loop, mutually exclusive combinations
 * are rejected with an error message and the final capability mask is
 * computed into arg_retain.
 * NOTE(review): the return statements are not visible in this listing, so
 * the exact return values are not documented here. */
224 static int parse_argv(int argc, char *argv[]) {
241 ARG_NETWORK_INTERFACE,
249 static const struct option options[] = {
250 { "help", no_argument, NULL, 'h' },
251 { "version", no_argument, NULL, ARG_VERSION },
252 { "directory", required_argument, NULL, 'D' },
253 { "user", required_argument, NULL, 'u' },
254 { "private-network", no_argument, NULL, ARG_PRIVATE_NETWORK },
255 { "boot", no_argument, NULL, 'b' },
256 { "uuid", required_argument, NULL, ARG_UUID },
257 { "read-only", no_argument, NULL, ARG_READ_ONLY },
258 { "capability", required_argument, NULL, ARG_CAPABILITY },
259 { "drop-capability", required_argument, NULL, ARG_DROP_CAPABILITY },
260 { "link-journal", required_argument, NULL, ARG_LINK_JOURNAL },
261 { "bind", required_argument, NULL, ARG_BIND },
262 { "bind-ro", required_argument, NULL, ARG_BIND_RO },
263 { "tmpfs", required_argument, NULL, ARG_TMPFS },
264 { "machine", required_argument, NULL, 'M' },
265 { "slice", required_argument, NULL, 'S' },
266 { "setenv", required_argument, NULL, ARG_SETENV },
267 { "selinux-context", required_argument, NULL, 'Z' },
268 { "selinux-apifs-context", required_argument, NULL, 'L' },
269 { "quiet", no_argument, NULL, 'q' },
270 { "share-system", no_argument, NULL, ARG_SHARE_SYSTEM },
271 { "register", required_argument, NULL, ARG_REGISTER },
272 { "keep-unit", no_argument, NULL, ARG_KEEP_UNIT },
273 { "network-interface", required_argument, NULL, ARG_NETWORK_INTERFACE },
274 { "network-macvlan", required_argument, NULL, ARG_NETWORK_MACVLAN },
275 { "network-veth", no_argument, NULL, ARG_NETWORK_VETH },
276 { "network-bridge", required_argument, NULL, ARG_NETWORK_BRIDGE },
277 { "personality", required_argument, NULL, ARG_PERSONALITY },
278 { "image", required_argument, NULL, 'i' },
279 { "volatile", optional_argument, NULL, ARG_VOLATILE },
/* Capability bits collected from --capability= (plus) and
 * --drop-capability= (minus); applied to arg_retain at the very end. */
284 uint64_t plus = 0, minus = 0;
/* NOTE(review): the leading "+" in the short option string presumably asks
 * getopt to stop at the first non-option argument so that the container
 * command line is left untouched -- confirm against the getopt docs. */
289 while ((c = getopt_long(argc, argv, "+hD:u:bL:M:jS:Z:qi:", options, NULL)) >= 0)
298 puts(PACKAGE_STRING);
299 puts(SYSTEMD_FEATURES);
304 arg_directory = canonicalize_file_name(optarg);
305 if (!arg_directory) {
306 log_error_errno(errno, "Invalid root directory: %m");
318 arg_user = strdup(optarg);
324 case ARG_NETWORK_BRIDGE:
325 arg_network_bridge = optarg;
329 case ARG_NETWORK_VETH:
330 arg_network_veth = true;
331 arg_private_network = true;
334 case ARG_NETWORK_INTERFACE:
335 if (strv_extend(&arg_network_interfaces, optarg) < 0)
338 arg_private_network = true;
341 case ARG_NETWORK_MACVLAN:
342 if (strv_extend(&arg_network_macvlan, optarg) < 0)
347 case ARG_PRIVATE_NETWORK:
348 arg_private_network = true;
356 r = sd_id128_from_string(optarg, &arg_uuid);
358 log_error("Invalid UUID: %s", optarg);
368 if (isempty(optarg)) {
373 if (!hostname_is_valid(optarg)) {
374 log_error("Invalid machine name: %s", optarg);
379 arg_machine = strdup(optarg);
387 arg_selinux_context = optarg;
391 arg_selinux_apifs_context = optarg;
395 arg_read_only = true;
/* This handler also serves ARG_CAPABILITY (see the c == ARG_CAPABILITY
 * tests below); that case label is not visible in this listing. The
 * argument is a comma-separated list; "all" selects every bit. */
399 case ARG_DROP_CAPABILITY: {
400 const char *state, *word;
403 FOREACH_WORD_SEPARATOR(word, length, optarg, ",", state) {
404 _cleanup_free_ char *t;
406 t = strndup(word, length);
410 if (streq(t, "all")) {
411 if (c == ARG_CAPABILITY)
412 plus = (uint64_t) -1;
414 minus = (uint64_t) -1;
418 cap = capability_from_name(t);
420 log_error("Failed to parse capability %s.", t);
424 if (c == ARG_CAPABILITY)
425 plus |= 1ULL << (uint64_t) cap;
427 minus |= 1ULL << (uint64_t) cap;
435 arg_link_journal = LINK_GUEST;
436 arg_link_journal_try = true;
/* The try-* modes select the same link mode as their plain counterpart
 * and additionally set arg_link_journal_try. */
439 case ARG_LINK_JOURNAL:
440 if (streq(optarg, "auto"))
441 arg_link_journal = LINK_AUTO;
442 else if (streq(optarg, "no"))
443 arg_link_journal = LINK_NO;
444 else if (streq(optarg, "guest"))
445 arg_link_journal = LINK_GUEST;
446 else if (streq(optarg, "host"))
447 arg_link_journal = LINK_HOST;
448 else if (streq(optarg, "try-guest")) {
449 arg_link_journal = LINK_GUEST;
450 arg_link_journal_try = true;
451 } else if (streq(optarg, "try-host")) {
452 arg_link_journal = LINK_HOST;
453 arg_link_journal_try = true;
455 log_error("Failed to parse link journal mode %s", optarg);
463 _cleanup_free_ char *a = NULL, *b = NULL;
/* --bind= and --bind-ro= share this code and differ only in the target
 * list. Both paths must be absolute; they are appended as a pair. */
467 x = c == ARG_BIND ? &arg_bind : &arg_bind_ro;
469 e = strchr(optarg, ':');
471 a = strndup(optarg, e - optarg);
481 if (!path_is_absolute(a) || !path_is_absolute(b)) {
482 log_error("Invalid bind mount specification: %s", optarg);
486 r = strv_extend(x, a);
490 r = strv_extend(x, b);
498 _cleanup_free_ char *a = NULL, *b = NULL;
501 e = strchr(optarg, ':');
503 a = strndup(optarg, e - optarg);
507 b = strdup("mode=0755");
513 if (!path_is_absolute(a)) {
514 log_error("Invalid tmpfs specification: %s", optarg);
518 r = strv_push(&arg_tmpfs, a);
524 r = strv_push(&arg_tmpfs, b);
536 if (!env_assignment_is_valid(optarg)) {
537 log_error("Environment variable assignment '%s' is not valid.", optarg);
541 n = strv_env_set(arg_setenv, optarg);
545 strv_free(arg_setenv);
554 case ARG_SHARE_SYSTEM:
555 arg_share_system = true;
559 r = parse_boolean(optarg);
561 log_error("Failed to parse --register= argument: %s", optarg);
569 arg_keep_unit = true;
572 case ARG_PERSONALITY:
574 arg_personality = personality_from_string(optarg);
575 if (arg_personality == 0xffffffffLU) {
576 log_error("Unknown or unsupported personality '%s'.", optarg);
585 arg_volatile = VOLATILE_YES;
587 r = parse_boolean(optarg);
589 if (streq(optarg, "state"))
590 arg_volatile = VOLATILE_STATE;
592 log_error("Failed to parse --volatile= argument: %s", optarg);
596 arg_volatile = r ? VOLATILE_YES : VOLATILE_NO;
605 assert_not_reached("Unhandled option");
/* Sharing the system namespaces switches machine registration off. */
608 if (arg_share_system)
609 arg_register = false;
611 if (arg_boot && arg_share_system) {
612 log_error("--boot and --share-system may not be combined.");
616 if (arg_keep_unit && cg_pid_get_owner_uid(0, NULL) >= 0) {
617 log_error("--keep-unit may not be used when invoked from a user session.");
621 if (arg_directory && arg_image) {
622 log_error("--directory= and --image= may not be combined.");
626 if (arg_volatile != VOLATILE_NO && arg_read_only) {
627 log_error("Cannot combine --read-only with --volatile. Note that --volatile already implies a read-only base hierarchy.");
/* Final capability mask: defaults plus requested additions, plus
 * CAP_NET_ADMIN when a private network is used. The drop mask is applied
 * last, so a dropped capability wins over an added one. */
631 arg_retain = (arg_retain | plus | (arg_private_network ? 1ULL << CAP_NET_ADMIN : 0)) & ~minus;
/* Mount the file systems listed in mount_table below dest. For every entry
 * the target path is built as dest + "/" + where, the directory is created
 * with mkdir_p() and mount() is called with the flags and options of the
 * entry. Failures are logged as errors for entries whose last field (fatal)
 * is true and as warnings otherwise. */
636 static int mount_all(const char *dest) {
638 typedef struct MountPoint {
647 static const MountPoint mount_table[] = {
648 { "proc", "/proc", "proc", NULL, MS_NOSUID|MS_NOEXEC|MS_NODEV, true },
649 { "/proc/sys", "/proc/sys", NULL, NULL, MS_BIND, true }, /* Bind mount first */
650 { NULL, "/proc/sys", NULL, NULL, MS_BIND|MS_RDONLY|MS_REMOUNT, true }, /* Then, make it r/o */
651 { "sysfs", "/sys", "sysfs", NULL, MS_RDONLY|MS_NOSUID|MS_NOEXEC|MS_NODEV, true },
652 { "tmpfs", "/dev", "tmpfs", "mode=755", MS_NOSUID|MS_STRICTATIME, true },
653 { "devpts", "/dev/pts", "devpts","newinstance,ptmxmode=0666,mode=620,gid=" STRINGIFY(TTY_GID), MS_NOSUID|MS_NOEXEC, true },
654 { "tmpfs", "/dev/shm", "tmpfs", "mode=1777", MS_NOSUID|MS_NODEV|MS_STRICTATIME, true },
655 { "tmpfs", "/run", "tmpfs", "mode=755", MS_NOSUID|MS_NODEV|MS_STRICTATIME, true },
657 { "/sys/fs/selinux", "/sys/fs/selinux", NULL, NULL, MS_BIND, false }, /* Bind mount first */
658 { NULL, "/sys/fs/selinux", NULL, NULL, MS_BIND|MS_RDONLY|MS_REMOUNT, false }, /* Then, make it r/o */
665 for (k = 0; k < ELEMENTSOF(mount_table); k++) {
666 _cleanup_free_ char *where = NULL;
668 _cleanup_free_ char *options = NULL;
673 where = strjoin(dest, "/", mount_table[k].where, NULL);
677 t = path_is_mount_point(where, true);
679 log_error_errno(t, "Failed to detect whether %s is a mount point: %m", where);
/* Entries with what == NULL are the remount steps of the table; only
 * entries that name a source are tested against an existing mount. */
687 /* Skip this entry if it is not a remount. */
688 if (mount_table[k].what && t > 0)
691 t = mkdir_p(where, 0755);
693 if (mount_table[k].fatal) {
694 log_error_errno(t, "Failed to create directory %s: %m", where);
699 log_warning_errno(t, "Failed to create directory %s: %m", where);
/* With an SELinux API file system context configured, tmpfs and devpts
 * entries get a context= option appended to their option string. */
705 if (arg_selinux_apifs_context &&
706 (streq_ptr(mount_table[k].what, "tmpfs") || streq_ptr(mount_table[k].what, "devpts"))) {
707 options = strjoin(mount_table[k].options, ",context=\"", arg_selinux_apifs_context, "\"", NULL);
714 o = mount_table[k].options;
717 if (mount(mount_table[k].what,
720 mount_table[k].flags,
723 if (mount_table[k].fatal) {
724 log_error_errno(errno, "mount(%s) failed: %m", where);
729 log_warning_errno(errno, "mount(%s) failed: %m", where);
/* Bind mount every (source, destination) pair of the string vector l into
 * the tree below dest. *x is the source path on the host, *y the path
 * inside the container; the mount target is dest + *y. If the target
 * exists its file type must match the source; otherwise a mount point of
 * the same kind (directory, FIFO, socket or regular file) is created first.
 * NOTE(review): no use of the ro parameter is visible in this listing; the
 * bind_remount_recursive(where, true) call near the end is presumably
 * guarded by it -- confirm. */
736 static int mount_binds(const char *dest, char **l, bool ro) {
739 STRV_FOREACH_PAIR(x, y, l) {
740 _cleanup_free_ char *where = NULL;
741 struct stat source_st, dest_st;
744 if (stat(*x, &source_st) < 0)
745 return log_error_errno(errno, "Failed to stat %s: %m", *x);
747 where = strappend(dest, *y);
751 r = stat(where, &dest_st);
753 if ((source_st.st_mode & S_IFMT) != (dest_st.st_mode & S_IFMT)) {
754 log_error("The file types of %s and %s do not match. Refusing bind mount", *x, where);
757 } else if (errno == ENOENT) {
758 r = mkdir_parents_label(where, 0755);
760 return log_error_errno(r, "Failed to bind mount %s: %m", *x);
762 log_error_errno(errno, "Failed to bind mount %s: %m", *x);
766 /* Create the mount point, but be conservative -- refuse to create block
767 * and char devices. */
768 if (S_ISDIR(source_st.st_mode)) {
/* NOTE(review): mount_tmpfs() tests the mkdir_label() result with
 * r != -EEXIST, while this branch tests errno and then logs r. The two
 * call sites disagree -- confirm which convention mkdir_label() follows
 * and whether this should read r != -EEXIST. */
769 r = mkdir_label(where, 0755);
770 if (r < 0 && errno != EEXIST)
771 return log_error_errno(r, "Failed to create mount point %s: %m", where);
772 } else if (S_ISFIFO(source_st.st_mode)) {
773 r = mkfifo(where, 0644);
774 if (r < 0 && errno != EEXIST)
775 return log_error_errno(errno, "Failed to create mount point %s: %m", where);
776 } else if (S_ISSOCK(source_st.st_mode)) {
777 r = mknod(where, 0644 | S_IFSOCK, 0);
778 if (r < 0 && errno != EEXIST)
779 return log_error_errno(errno, "Failed to create mount point %s: %m", where);
780 } else if (S_ISREG(source_st.st_mode)) {
783 return log_error_errno(r, "Failed to create mount point %s: %m", where);
785 log_error("Refusing to create mountpoint for file: %s", *x);
789 if (mount(*x, where, "bind", MS_BIND, NULL) < 0)
790 return log_error_errno(errno, "mount(%s) failed: %m", where);
793 r = bind_remount_recursive(where, true);
795 return log_error_errno(r, "Read-Only bind mount failed: %m");
/* Mount a tmpfs for every (path, options) pair in arg_tmpfs. The mount
 * point dest + path is created with mkdir_label() (an already existing
 * directory is accepted) and the options string is handed to mount() as
 * the data argument. Returns the logged negative error on failure. */
802 static int mount_tmpfs(const char *dest) {
805 STRV_FOREACH_PAIR(i, o, arg_tmpfs) {
806 _cleanup_free_ char *where = NULL;
809 where = strappend(dest, *i);
813 r = mkdir_label(where, 0755);
814 if (r < 0 && r != -EEXIST)
815 return log_error_errno(r, "Creating mount point for tmpfs %s failed: %m", where);
817 if (mount("tmpfs", where, "tmpfs", MS_NODEV|MS_STRICTATIME, *o) < 0)
818 return log_error_errno(errno, "tmpfs mount to %s failed: %m", where);
/* Make /etc/localtime in the container point to the same zone as on the
 * host. The host link must be a symlink into /usr/share/zoneinfo/ (relative
 * or absolute form); the zone name z is the part after that prefix. If the
 * container link already names the same zone nothing is changed. Otherwise
 * the zone file must exist below dest, and the container link is replaced
 * by a relative symlink "../usr/share/zoneinfo/" + z. Problems on the way
 * are logged as warnings or errors. */
824 static int setup_timezone(const char *dest) {
825 _cleanup_free_ char *where = NULL, *p = NULL, *q = NULL, *check = NULL, *what = NULL;
831 /* Fix the timezone, if possible */
832 r = readlink_malloc("/etc/localtime", &p);
834 log_warning("/etc/localtime is not a symlink, not updating container timezone.");
838 z = path_startswith(p, "../usr/share/zoneinfo/");
840 z = path_startswith(p, "/usr/share/zoneinfo/");
842 log_warning("/etc/localtime does not point into /usr/share/zoneinfo/, not updating container timezone.");
846 where = strappend(dest, "/etc/localtime");
850 r = readlink_malloc(where, &q);
852 y = path_startswith(q, "../usr/share/zoneinfo/");
854 y = path_startswith(q, "/usr/share/zoneinfo/");
856 /* Already pointing to the right place? Then do nothing .. */
857 if (y && streq(y, z))
861 check = strjoin(dest, "/usr/share/zoneinfo/", z, NULL);
865 if (access(check, F_OK) < 0) {
866 log_warning("Timezone %s does not exist in container, not updating container timezone.", z);
870 what = strappend("../usr/share/zoneinfo/", z);
874 r = mkdir_parents(where, 0755);
876 log_error_errno(r, "Failed to create directory for timezone info %s in container: %m", where);
/* A missing old link (ENOENT) is not treated as an error here. */
882 if (r < 0 && errno != ENOENT) {
883 log_error_errno(errno, "Failed to remove existing timezone info %s in container: %m", where);
888 if (symlink(what, where) < 0) {
889 log_error_errno(errno, "Failed to correct timezone of container: %m");
/* Copy the host /etc/resolv.conf to dest + "/etc/resolv.conf" using
 * copy_file() with O_TRUNC|O_NOFOLLOW and mode 0644. Failures to create
 * the parent directory or to copy are only logged as warnings.
 * arg_private_network is checked first; the body of that branch is not
 * visible in this listing. */
896 static int setup_resolv_conf(const char *dest) {
897 _cleanup_free_ char *where = NULL;
902 if (arg_private_network)
905 /* Fix resolv.conf, if possible */
906 where = strappend(dest, "/etc/resolv.conf");
910 /* The outcome of these steps is not critical: if they fail,
911 * the failure is logged as a warning only. */
912 r = mkdir_parents(where, 0755);
914 log_warning_errno(r, "Failed to create parent directory for resolv.conf %s: %m", where);
919 r = copy_file("/etc/resolv.conf", where, O_TRUNC|O_NOFOLLOW, 0644);
921 log_warning_errno(r, "Failed to copy /etc/resolv.conf to %s: %m", where);
/* Implement --volatile=state: remount the whole tree at directory
 * read-only and mount a fresh tmpfs (mode=755) on directory + "/var".
 * Only relevant when arg_volatile is VOLATILE_STATE (see the check at the
 * top). Returns the logged negative error on failure. */
929 static int setup_volatile_state(const char *directory) {
935 if (arg_volatile != VOLATILE_STATE)
938 /* --volatile=state means we simply overmount /var
939 with a tmpfs, and the rest read-only. */
941 r = bind_remount_recursive(directory, true);
943 return log_error_errno(r, "Failed to remount %s read-only: %m", directory);
945 p = strappenda(directory, "/var");
/* NOTE(review): the creation call itself is not visible in this listing.
 * The message below prints directory, although the path being created is
 * presumably p -- confirm. */
947 if (r < 0 && errno != EEXIST)
948 return log_error_errno(errno, "Failed to create %s: %m", directory);
950 if (mount("tmpfs", p, "tmpfs", MS_STRICTATIME, "mode=755") < 0)
951 return log_error_errno(errno, "Failed to mount tmpfs to /var: %m");
/* Implement --volatile=yes. A tmpfs is mounted on a temporary directory
 * created with mkdtemp(), directory + "/usr" is bind mounted (recursively)
 * to "/usr" inside it and remounted read-only, and finally the temporary
 * mount is moved over directory with MS_MOVE. The tmpfs_mounted and
 * bind_mounted flags record progress, presumably for cleanup code that is
 * not visible in this listing. */
956 static int setup_volatile(const char *directory) {
957 bool tmpfs_mounted = false, bind_mounted = false;
958 char template[] = "/tmp/nspawn-volatile-XXXXXX";
964 if (arg_volatile != VOLATILE_YES)
967 /* --volatile=yes means we mount a tmpfs to the root dir, and
968 the original /usr to use inside it, and that read-only. */
970 if (!mkdtemp(template))
971 return log_error_errno(errno, "Failed to create temporary directory: %m");
973 if (mount("tmpfs", template, "tmpfs", MS_STRICTATIME, "mode=755") < 0) {
974 log_error_errno(errno, "Failed to mount tmpfs for root directory: %m");
979 tmpfs_mounted = true;
/* f is the original /usr, t its new location inside the tmpfs. */
981 f = strappenda(directory, "/usr");
982 t = strappenda(template, "/usr");
985 if (r < 0 && errno != EEXIST) {
986 log_error_errno(errno, "Failed to create %s: %m", t);
991 if (mount(f, t, "bind", MS_BIND|MS_REC, NULL) < 0) {
992 log_error_errno(errno, "Failed to create /usr bind mount: %m");
999 r = bind_remount_recursive(t, true);
1001 log_error_errno(r, "Failed to remount %s read-only: %m", t);
1005 if (mount(template, directory, NULL, MS_MOVE, NULL) < 0) {
1006 log_error_errno(errno, "Failed to move root mount: %m");
/* Format the 16 bytes of id as lower-case hex in the dashed 8-4-4-4-12
 * UUID layout. The pattern yields 36 characters, so the 37 byte buffer s
 * leaves room for the terminating NUL. */
1024 static char* id128_format_as_uuid(sd_id128_t id, char s[37]) {
1027 "%02x%02x%02x%02x-%02x%02x-%02x%02x-%02x%02x-%02x%02x%02x%02x%02x%02x",
1028 SD_ID128_FORMAT_VAL(id));
/* Give the container its own boot ID. A random 128 bit ID is generated,
 * written in UUID form to a helper file below dest + "/dev" and bind
 * mounted over dest + "/proc/sys/kernel/random/boot_id". A failing bind
 * mount is logged as an error; failing to make the mount read-only
 * afterwards is only a warning. arg_share_system is checked first; the
 * body of that branch is not visible in this listing. */
1033 static int setup_boot_id(const char *dest) {
1034 _cleanup_free_ char *from = NULL, *to = NULL;
1035 sd_id128_t rnd = {};
1041 if (arg_share_system)
1044 /* Generate a new randomized boot ID, so that each boot-up of
1045 * the container gets a new one */
1047 from = strappend(dest, "/dev/proc-sys-kernel-random-boot-id");
1048 to = strappend(dest, "/proc/sys/kernel/random/boot_id");
1052 r = sd_id128_randomize(&rnd);
1054 return log_error_errno(r, "Failed to generate random boot id: %m");
1056 id128_format_as_uuid(rnd, as_uuid);
1058 r = write_string_file(from, as_uuid);
1060 return log_error_errno(r, "Failed to write boot id: %m");
1062 if (mount(from, to, "bind", MS_BIND, NULL) < 0) {
1063 log_error_errno(errno, "Failed to bind mount boot id: %m");
1065 } else if (mount(from, to, "bind", MS_BIND|MS_REMOUNT|MS_RDONLY, NULL))
1066 log_warning_errno(errno, "Failed to make boot id read-only: %m");
/* Recreate a fixed list of host device nodes inside the container. For
 * every name d in devnodes the host node "/dev/" + d is examined with
 * stat(); a node that does not exist on the host (ENOENT) is not an error.
 * Anything that is neither a char nor a block device is rejected. The node
 * is then created at dest + "/dev/" + d with the mode and device number of
 * the host node. */
1072 static int copy_devnodes(const char *dest) {
1074 static const char devnodes[] =
1085 _cleanup_umask_ mode_t u;
1091 NULSTR_FOREACH(d, devnodes) {
1092 _cleanup_free_ char *from = NULL, *to = NULL;
1095 from = strappend("/dev/", d);
1096 to = strjoin(dest, "/dev/", d, NULL);
1100 if (stat(from, &st) < 0) {
1102 if (errno != ENOENT)
1103 return log_error_errno(errno, "Failed to stat %s: %m", from);
1105 } else if (!S_ISCHR(st.st_mode) && !S_ISBLK(st.st_mode)) {
1107 log_error("%s is not a char or block device, cannot copy", from);
1111 r = mkdir_parents(to, 0775);
1113 log_error_errno(r, "Failed to create parent directory of %s: %m", to);
/* NOTE(review): the node is created at to, but the error message prints
 * dest (the container root). This looks like it should print to. */
1117 if (mknod(to, st.st_mode, st.st_rdev) < 0)
1118 return log_error_errno(errno, "mknod(%s) failed: %m", dest);
/* Create dest + "/dev/ptmx" as a relative symlink to "pts/ptmx". Returns
 * the logged negative errno if symlink() fails. */
1125 static int setup_ptmx(const char *dest) {
1126 _cleanup_free_ char *p = NULL;
1128 p = strappend(dest, "/dev/ptmx");
1132 if (symlink("pts/ptmx", p) < 0)
1133 return log_error_errno(errno, "Failed to create /dev/ptmx symlink: %m");
/* Provide /dev/console inside the container. The TTY named by console is
 * set to mode 0600 and owner 0:0, a placeholder device node carrying the
 * device number of /dev/null is created at dest + "/dev/console", and the
 * TTY is bind mounted over it. Returns the logged negative error on
 * failure. */
1138 static int setup_dev_console(const char *dest, const char *console) {
1139 _cleanup_umask_ mode_t u;
1149 if (stat("/dev/null", &st) < 0)
1150 return log_error_errno(errno, "Failed to stat /dev/null: %m");
1152 r = chmod_and_chown(console, 0600, 0, 0);
1154 return log_error_errno(r, "Failed to correct access mode for TTY: %m");
1156 /* We need to bind mount the right tty to /dev/console since
1157 * ptys can only exist on pts file systems. To have something
1158 * to bind mount things on we create a device node first, and
1159 * use /dev/null for that since the cgroups device policy
1160 * allows us to create that freely, while we cannot create
1161 * /dev/console. (Note that the major minor doesn't actually
1162 * matter here, since we mount it over anyway). */
1164 to = strappenda(dest, "/dev/console");
/* Keep the file type bits of /dev/null, but force the permissions
 * to 0600. */
1165 if (mknod(to, (st.st_mode & ~07777) | 0600, st.st_rdev) < 0)
1166 return log_error_errno(errno, "mknod() for /dev/console failed: %m");
1168 if (mount(console, to, "bind", MS_BIND, NULL) < 0)
1169 return log_error_errno(errno, "Bind mount for /dev/console failed: %m");
/* Provide a fake /proc/kmsg for the container. A FIFO is created at
 * dest + "/dev/kmsg" (mode 0600, owner 0:0) and bind mounted onto
 * dest + "/proc/kmsg". The FIFO is then opened and its file descriptor is
 * sent over kmsg_socket as an SCM_RIGHTS control message, so that it stays
 * open for as long as the receiving side keeps it. kmsg_socket must be a
 * valid descriptor (asserted). */
1174 static int setup_kmsg(const char *dest, int kmsg_socket) {
1175 _cleanup_free_ char *from = NULL, *to = NULL;
1177 _cleanup_umask_ mode_t u;
1179 struct cmsghdr cmsghdr;
1180 uint8_t buf[CMSG_SPACE(sizeof(int))];
1182 struct msghdr mh = {
1183 .msg_control = &control,
1184 .msg_controllen = sizeof(control),
1186 struct cmsghdr *cmsg;
1189 assert(kmsg_socket >= 0);
1193 /* We create the kmsg FIFO as /dev/kmsg, but immediately
1194 * delete it after bind mounting it to /proc/kmsg. While FIFOs
1195 * on the reading side behave very similar to /proc/kmsg,
1196 * their writing side behaves differently from /dev/kmsg in
1197 * that writing blocks when nothing is reading. In order to
1198 * avoid any problems with containers deadlocking due to this
1199 * we simply make /dev/kmsg unavailable to the container. */
1200 if (asprintf(&from, "%s/dev/kmsg", dest) < 0 ||
1201 asprintf(&to, "%s/proc/kmsg", dest) < 0)
1204 if (mkfifo(from, 0600) < 0)
1205 return log_error_errno(errno, "mkfifo() for /dev/kmsg failed: %m");
1207 r = chmod_and_chown(from, 0600, 0, 0);
1209 return log_error_errno(r, "Failed to correct access mode for /dev/kmsg: %m");
1211 if (mount(from, to, "bind", MS_BIND, NULL) < 0)
1212 return log_error_errno(errno, "Bind mount for /proc/kmsg failed: %m");
1214 fd = open(from, O_RDWR|O_NDELAY|O_CLOEXEC);
1216 return log_error_errno(errno, "Failed to open fifo: %m");
/* Build a single SCM_RIGHTS control message that carries fd. */
1218 cmsg = CMSG_FIRSTHDR(&mh);
1219 cmsg->cmsg_level = SOL_SOCKET;
1220 cmsg->cmsg_type = SCM_RIGHTS;
1221 cmsg->cmsg_len = CMSG_LEN(sizeof(int));
1222 memcpy(CMSG_DATA(cmsg), &fd, sizeof(int));
1224 mh.msg_controllen = cmsg->cmsg_len;
1226 /* Store away the fd in the socket, so that it stays open as
1227 * long as we run the child */
1228 k = sendmsg(kmsg_socket, &mh, MSG_DONTWAIT|MSG_NOSIGNAL);
1232 return log_error_errno(errno, "Failed to send FIFO fd: %m");
1234 /* And now make the FIFO unavailable as /dev/kmsg... */
/* Set the hostname to arg_machine via sethostname_idempotent().
 * arg_share_system is checked first; the bodies of both branches are not
 * visible in this listing, so the return values are not documented here. */
1239 static int setup_hostname(void) {
1241 if (arg_share_system)
1244 if (sethostname_idempotent(arg_machine) < 0)
/* Link the journal directory of the container with the host according to
 * arg_link_journal. The machine ID is read from directory +
 * "/etc/machine-id"; p is the host path "/var/log/journal/" + id and q the
 * same path below directory. Linking is refused when host and container
 * share the same machine ID or when p or q is already a mount point. For
 * LINK_GUEST p becomes a symlink to q; for LINK_HOST p is bind mounted onto
 * q. With arg_link_journal_try set, failures to set up p are logged at
 * debug level only.
 * NOTE(review): several calls below store their result in r but the log
 * call that follows passes errno (mkdir_p() and
 * readlink_and_make_absolute()). mount_all() logs the mkdir_p() result
 * itself, so errno may not hold the real error here -- confirm. */
1250 static int setup_journal(const char *directory) {
1251 sd_id128_t machine_id, this_id;
1252 _cleanup_free_ char *p = NULL, *b = NULL, *q = NULL, *d = NULL;
1256 p = strappend(directory, "/etc/machine-id");
1260 r = read_one_line_file(p, &b);
1261 if (r == -ENOENT && arg_link_journal == LINK_AUTO)
1264 return log_error_errno(r, "Failed to read machine ID from %s: %m", p);
1267 if (isempty(id) && arg_link_journal == LINK_AUTO)
1270 /* Verify validity */
1271 r = sd_id128_from_string(id, &machine_id);
1273 return log_error_errno(r, "Failed to parse machine ID from %s: %m", p);
1275 r = sd_id128_get_machine(&this_id);
1277 return log_error_errno(r, "Failed to retrieve machine ID: %m");
/* In auto mode equal IDs only produce a warning, otherwise an error. */
1279 if (sd_id128_equal(machine_id, this_id)) {
1280 log_full(arg_link_journal == LINK_AUTO ? LOG_WARNING : LOG_ERR,
1281 "Host and machine ids are equal (%s): refusing to link journals", id);
1282 if (arg_link_journal == LINK_AUTO)
1288 if (arg_link_journal == LINK_NO)
1292 p = strappend("/var/log/journal/", id);
1293 q = strjoin(directory, "/var/log/journal/", id, NULL);
1297 if (path_is_mount_point(p, false) > 0) {
1298 if (arg_link_journal != LINK_AUTO) {
1299 log_error("%s: already a mount point, refusing to use for journal", p);
1306 if (path_is_mount_point(q, false) > 0) {
1307 if (arg_link_journal != LINK_AUTO) {
1308 log_error("%s: already a mount point, refusing to use for journal", q);
1315 r = readlink_and_make_absolute(p, &d);
1317 if ((arg_link_journal == LINK_GUEST ||
1318 arg_link_journal == LINK_AUTO) &&
1321 r = mkdir_p(q, 0755);
1323 log_warning_errno(errno, "Failed to create directory %s: %m", q);
1328 return log_error_errno(errno, "Failed to remove symlink %s: %m", p);
1329 } else if (r == -EINVAL) {
1331 if (arg_link_journal == LINK_GUEST &&
1334 if (errno == ENOTDIR) {
1335 log_error("%s already exists and is neither a symlink nor a directory", p);
1338 log_error_errno(errno, "Failed to remove %s: %m", p);
1342 } else if (r != -ENOENT) {
1343 log_error_errno(errno, "readlink(%s) failed: %m", p);
1347 if (arg_link_journal == LINK_GUEST) {
1349 if (symlink(q, p) < 0) {
1350 if (arg_link_journal_try) {
1351 log_debug_errno(errno, "Failed to symlink %s to %s, skipping journal setup: %m", q, p);
1354 log_error_errno(errno, "Failed to symlink %s to %s: %m", q, p);
1359 r = mkdir_p(q, 0755);
1361 log_warning_errno(errno, "Failed to create directory %s: %m", q);
1365 if (arg_link_journal == LINK_HOST) {
1366 /* don't create parents here -- if the host doesn't have
1367 * permanent journal set up, don't force it here */
1370 if (arg_link_journal_try) {
1371 log_debug_errno(errno, "Failed to create %s, skipping journal setup: %m", p);
1374 log_error_errno(errno, "Failed to create %s: %m", p);
1379 } else if (access(p, F_OK) < 0)
1382 if (dir_is_empty(q) == 0)
1383 log_warning("%s is not empty, proceeding anyway.", q);
1385 r = mkdir_p(q, 0755);
1387 log_error_errno(errno, "Failed to create %s: %m", q);
1391 if (mount(p, q, "bind", MS_BIND, NULL) < 0)
1392 return log_error_errno(errno, "Failed to bind mount journal from host into guest: %m");
/* Drop every capability that is not set in arg_retain from the bounding
 * set: the inverted mask is passed to capability_bounding_set_drop() and
 * its result is returned unchanged. */
1397 static int drop_capabilities(void) {
1398 return capability_bounding_set_drop(~arg_retain, false);
/* Register the container with org.freedesktop.machine1 over the system
 * bus. With arg_keep_unit set, RegisterMachineWithNetwork is called
 * directly. Otherwise a CreateMachineWithNetwork message is built by hand
 * so that an array of (sv) unit properties can be appended: the slice (if
 * arg_slice is non-empty), a strict DevicePolicy and a DeviceAllow list.
 * A positive local_ifindex is passed on as a one-element interface list,
 * otherwise the list is empty. Errors are logged.
 * NOTE(review): several argument lines of the bus calls are missing from
 * this listing, so the full message signature is not documented here. */
1401 static int register_machine(pid_t pid, int local_ifindex) {
1402 _cleanup_bus_error_free_ sd_bus_error error = SD_BUS_ERROR_NULL;
1403 _cleanup_bus_close_unref_ sd_bus *bus = NULL;
1409 r = sd_bus_default_system(&bus);
1411 return log_error_errno(r, "Failed to open system bus: %m");
1413 if (arg_keep_unit) {
1414 r = sd_bus_call_method(
1416 "org.freedesktop.machine1",
1417 "/org/freedesktop/machine1",
1418 "org.freedesktop.machine1.Manager",
1419 "RegisterMachineWithNetwork",
1424 SD_BUS_MESSAGE_APPEND_ID128(arg_uuid),
1428 strempty(arg_directory),
1429 local_ifindex > 0 ? 1 : 0, local_ifindex);
1431 _cleanup_bus_message_unref_ sd_bus_message *m = NULL;
1433 r = sd_bus_message_new_method_call(
1436 "org.freedesktop.machine1",
1437 "/org/freedesktop/machine1",
1438 "org.freedesktop.machine1.Manager",
1439 "CreateMachineWithNetwork");
1441 return log_error_errno(r, "Failed to create message: %m");
1443 r = sd_bus_message_append(
1447 SD_BUS_MESSAGE_APPEND_ID128(arg_uuid),
1451 strempty(arg_directory),
1452 local_ifindex > 0 ? 1 : 0, local_ifindex);
1454 return log_error_errno(r, "Failed to append message arguments: %m");
1456 r = sd_bus_message_open_container(m, 'a', "(sv)");
1458 return log_error_errno(r, "Failed to open container: %m");
1460 if (!isempty(arg_slice)) {
1461 r = sd_bus_message_append(m, "(sv)", "Slice", "s", arg_slice);
1463 return log_error_errno(r, "Failed to append slice: %m");
1466 r = sd_bus_message_append(m, "(sv)", "DevicePolicy", "s", "strict");
1468 return log_error_errno(r, "Failed to add device policy: %m");
1470 r = sd_bus_message_append(m, "(sv)", "DeviceAllow", "a(ss)", 9,
1471 /* Allow the container to
1472 * access and create the API
1473 * device nodes, so that
1474 * PrivateDevices= in the
1475 * container can work
1480 "/dev/random", "rwm",
1481 "/dev/urandom", "rwm",
1483 "/dev/net/tun", "rwm",
1484 /* Allow the container
1485 * access to ptys. However,
1487 * container to ever create
1488 * these device nodes. */
1489 "/dev/pts/ptmx", "rw",
1492 return log_error_errno(r, "Failed to add device whitelist: %m");
1494 r = sd_bus_message_close_container(m);
1496 return log_error_errno(r, "Failed to close container: %m");
1498 r = sd_bus_call(bus, m, 0, &error, NULL);
1502 log_error("Failed to register machine: %s", bus_error_message(&error, r));
/* Ask org.freedesktop.machine1 to terminate the machine. A first call on
 * the Manager interface yields the object path of the machine (read from
 * the reply as type "o"); a second call on the Machine interface of that
 * path performs the termination. Failures of either bus call are logged at
 * debug level only, since the machine may already be gone.
 * NOTE(review): the method names and the use of pid are on lines missing
 * from this listing. */
1509 static int terminate_machine(pid_t pid) {
1510 _cleanup_bus_error_free_ sd_bus_error error = SD_BUS_ERROR_NULL;
1511 _cleanup_bus_message_unref_ sd_bus_message *reply = NULL;
1512 _cleanup_bus_close_unref_ sd_bus *bus = NULL;
1519 r = sd_bus_default_system(&bus);
1521 return log_error_errno(r, "Failed to open system bus: %m");
1523 r = sd_bus_call_method(
1525 "org.freedesktop.machine1",
1526 "/org/freedesktop/machine1",
1527 "org.freedesktop.machine1.Manager",
1534 /* Note that the machine might already have been
1535 * cleaned up automatically, hence don't consider it a
1536 * failure if we cannot get the machine object. */
1537 log_debug("Failed to get machine: %s", bus_error_message(&error, r));
1541 r = sd_bus_message_read(reply, "o", &path);
1543 return bus_log_parse_error(r);
1545 r = sd_bus_call_method(
1547 "org.freedesktop.machine1",
1549 "org.freedesktop.machine1.Machine",
1555 log_debug("Failed to terminate machine: %s", bus_error_message(&error, r));
/* Resets the audit login UID of the calling process by writing
 * "4294967295" (the 32-bit representation of -1) to /proc/self/loginuid,
 * unless the file already contains that value. arg_share_system is
 * checked before anything else is done. If the write fails, a long
 * explanatory message is logged that also announces a 5s sleep. */
1562 static int reset_audit_loginuid(void) {
1563 _cleanup_free_ char *p = NULL;
1566 if (arg_share_system)
1569 r = read_one_line_file("/proc/self/loginuid", &p);
1573 return log_error_errno(r, "Failed to read /proc/self/loginuid: %m");
1575 /* Already reset? */
1576 if (streq(p, "4294967295"))
1579 r = write_string_file("/proc/self/loginuid", "4294967295");
1581 log_error("Failed to reset audit login UID. This probably means that your kernel is too\n"
1582 "old and you have audit enabled. Note that the auditing subsystem is known to\n"
1583 "be incompatible with containers on old kernels. Please make sure to upgrade\n"
1584 "your kernel or to off auditing with 'audit=0' on the kernel command line before\n"
1585 "using systemd-nspawn. Sleeping for 5s... (%s)\n", strerror(-r));
/* Fixed 128-bit keys handed to generate_mac() as hash_key: one for the
 * host side of the veth link, one for the container side, and one for
 * macvlan interfaces, so that each kind gets a different address. */
1593 #define HOST_HASH_KEY SD_ID128_MAKE(1a,37,6f,c7,46,ec,45,0b,ad,a3,d5,31,06,60,5d,b1)
1594 #define CONTAINER_HASH_KEY SD_ID128_MAKE(c3,c4,f9,19,b5,57,b2,1c,e6,cf,14,27,03,9c,ee,a2)
1595 #define MACVLAN_HASH_KEY SD_ID128_MAKE(00,13,6d,bc,66,83,44,81,bb,0c,f9,51,1f,24,a6,6f)
/* Derives a predictable MAC address and stores it in *mac.
 *
 * The input buffer v consists of the host's machine ID followed by
 * arg_machine; idx may be appended after the name as well. The buffer is
 * hashed with siphash24 under hash_key, and the first ETH_ALEN bytes of
 * the hash become the address, adjusted to be unicast and locally
 * administered. */
1597 static int generate_mac(struct ether_addr *mac, sd_id128_t hash_key, uint64_t idx) {
1603 l = strlen(arg_machine);
1604 sz = sizeof(sd_id128_t) + l;
1610 /* fetch some persistent data unique to the host */
1611 r = sd_id128_get_machine((sd_id128_t*) v);
1615 /* combine with some data unique (on this host) to this
1616 * container instance */
1617 i = mempcpy(v + sizeof(sd_id128_t), arg_machine, l);
/* i points just past the copied machine name; idx goes there. */
1620 memcpy(i, &idx, sizeof(idx));
1623 /* Let's hash the host machine ID plus the container name. We
1624 * use a fixed, but originally randomly created hash key here. */
1625 siphash24(result, v, sz, hash_key.bytes);
/* Compile-time guarantee that the hash output is large enough to
 * fill a whole Ethernet address. */
1627 assert_cc(ETH_ALEN <= sizeof(result));
1628 memcpy(mac->ether_addr_octet, result, ETH_ALEN);
1630 /* see eth_random_addr in the kernel */
1631 mac->ether_addr_octet[0] &= 0xfe; /* clear multicast bit */
1632 mac->ether_addr_octet[0] |= 0x02; /* set local assignment bit (IEEE802) */
/* Creates a veth pair with a single RTM_NEWLINK rtnetlink request.
 *
 * The host side is named "ve-<machine>" or, when arg_network_bridge is
 * set, "vb-<machine>"; that name is written to iface_name. The peer is
 * named "host0" and is assigned to the network namespace of pid via
 * IFLA_NET_NS_PID. Both ends get predictable MAC addresses from
 * generate_mac(). arg_private_network and arg_network_veth are checked
 * before any work is done. After the request succeeded, the host-side
 * name is resolved to an interface index with if_nametoindex(). */
1637 static int setup_veth(pid_t pid, char iface_name[IFNAMSIZ], int *ifi) {
1638 _cleanup_rtnl_message_unref_ sd_rtnl_message *m = NULL;
1639 _cleanup_rtnl_unref_ sd_rtnl *rtnl = NULL;
1640 struct ether_addr mac_host, mac_container;
1643 if (!arg_private_network)
1646 if (!arg_network_veth)
1649 /* Use two different interface name prefixes depending whether
1650 * we are in bridge mode or not. */
/* NOTE(review): the size passed is IFNAMSIZ - 1, so snprintf() keeps at
 * most IFNAMSIZ - 2 characters of the name plus the terminating NUL. */
1651 snprintf(iface_name, IFNAMSIZ - 1, "%s-%s",
1652 arg_network_bridge ? "vb" : "ve", arg_machine);
1654 r = generate_mac(&mac_container, CONTAINER_HASH_KEY, 0);
1656 return log_error_errno(r, "Failed to generate predictable MAC address for container side: %m");
1658 r = generate_mac(&mac_host, HOST_HASH_KEY, 0);
1660 return log_error_errno(r, "Failed to generate predictable MAC address for host side: %m");
1662 r = sd_rtnl_open(&rtnl, 0);
1664 return log_error_errno(r, "Failed to connect to netlink: %m");
1666 r = sd_rtnl_message_new_link(rtnl, &m, RTM_NEWLINK, 0);
1668 return log_error_errno(r, "Failed to allocate netlink message: %m");
/* Attributes of the host-side interface. */
1670 r = sd_rtnl_message_append_string(m, IFLA_IFNAME, iface_name);
1672 return log_error_errno(r, "Failed to add netlink interface name: %m");
1674 r = sd_rtnl_message_append_ether_addr(m, IFLA_ADDRESS, &mac_host);
1676 return log_error_errno(r, "Failed to add netlink MAC address: %m");
/* Three nested containers: IFLA_LINKINFO > IFLA_INFO_DATA ("veth") >
 * VETH_INFO_PEER. The innermost one describes the peer interface. */
1678 r = sd_rtnl_message_open_container(m, IFLA_LINKINFO);
1680 return log_error_errno(r, "Failed to open netlink container: %m");
1682 r = sd_rtnl_message_open_container_union(m, IFLA_INFO_DATA, "veth");
1684 return log_error_errno(r, "Failed to open netlink container: %m");
1686 r = sd_rtnl_message_open_container(m, VETH_INFO_PEER);
1688 return log_error_errno(r, "Failed to open netlink container: %m");
1690 r = sd_rtnl_message_append_string(m, IFLA_IFNAME, "host0");
1692 return log_error_errno(r, "Failed to add netlink interface name: %m");
1694 r = sd_rtnl_message_append_ether_addr(m, IFLA_ADDRESS, &mac_container);
1696 return log_error_errno(r, "Failed to add netlink MAC address: %m");
1698 r = sd_rtnl_message_append_u32(m, IFLA_NET_NS_PID, pid);
1700 return log_error_errno(r, "Failed to add netlink namespace field: %m");
/* Close the three containers again, innermost first. */
1702 r = sd_rtnl_message_close_container(m);
1704 return log_error_errno(r, "Failed to close netlink container: %m");
1706 r = sd_rtnl_message_close_container(m);
1708 return log_error_errno(r, "Failed to close netlink container: %m");
1710 r = sd_rtnl_message_close_container(m);
1712 return log_error_errno(r, "Failed to close netlink container: %m");
1714 r = sd_rtnl_call(rtnl, m, 0, NULL);
1716 return log_error_errno(r, "Failed to add new veth interfaces: %m");
1718 i = (int) if_nametoindex(iface_name);
1720 return log_error_errno(errno, "Failed to resolve interface %s: %m", iface_name);
/* Attaches the interface named veth_name to the bridge named by
 * arg_network_bridge: one RTM_SETLINK request sets IFF_UP on the
 * interface and sets its IFLA_MASTER to the bridge's interface index.
 * arg_private_network, arg_network_veth and arg_network_bridge are
 * checked before any work is done. */
1727 static int setup_bridge(const char veth_name[], int *ifi) {
1728 _cleanup_rtnl_message_unref_ sd_rtnl_message *m = NULL;
1729 _cleanup_rtnl_unref_ sd_rtnl *rtnl = NULL;
1732 if (!arg_private_network)
1735 if (!arg_network_veth)
1738 if (!arg_network_bridge)
/* Resolve the bridge name to its interface index. */
1741 bridge = (int) if_nametoindex(arg_network_bridge);
1743 return log_error_errno(errno, "Failed to resolve interface %s: %m", arg_network_bridge);
1747 r = sd_rtnl_open(&rtnl, 0);
1749 return log_error_errno(r, "Failed to connect to netlink: %m");
1751 r = sd_rtnl_message_new_link(rtnl, &m, RTM_SETLINK, 0);
1753 return log_error_errno(r, "Failed to allocate netlink message: %m");
1755 r = sd_rtnl_message_link_set_flags(m, IFF_UP, IFF_UP);
1757 return log_error_errno(r, "Failed to set IFF_UP flag: %m");
/* The link is addressed by name here; the message was created with
 * index 0. */
1759 r = sd_rtnl_message_append_string(m, IFLA_IFNAME, veth_name);
1761 return log_error_errno(r, "Failed to add netlink interface name field: %m");
1763 r = sd_rtnl_message_append_u32(m, IFLA_MASTER, bridge);
1765 return log_error_errno(r, "Failed to add netlink master field: %m");
1767 r = sd_rtnl_call(rtnl, m, 0, NULL);
1769 return log_error_errno(r, "Failed to add veth interface to bridge: %m");
/* Resolves the network interface called name to its interface index and
 * looks up the matching udev device (device id "n<ifindex>"). An error is
 * logged if udev does not report the device as initialized. */
1774 static int parse_interface(struct udev *udev, const char *name) {
1775 _cleanup_udev_device_unref_ struct udev_device *d = NULL;
/* Room for the "n" prefix, the decimal index and the trailing NUL. */
1776 char ifi_str[2 + DECIMAL_STR_MAX(int)];
1779 ifi = (int) if_nametoindex(name);
1781 return log_error_errno(errno, "Failed to resolve interface %s: %m", name);
1783 sprintf(ifi_str, "n%i", ifi);
1784 d = udev_device_new_from_device_id(udev, ifi_str);
1786 return log_error_errno(errno, "Failed to get udev device for interface %s: %m", name);
1788 if (udev_device_get_is_initialized(d) <= 0) {
1789 log_error("Network interface %s is not initialized yet.", name);
/* Moves each interface listed in arg_network_interfaces into the network
 * namespace of pid. Every name is resolved through parse_interface(), and
 * one RTM_SETLINK request carrying IFLA_NET_NS_PID is sent per interface.
 * arg_private_network and an empty list are checked for first. */
1796 static int move_network_interfaces(pid_t pid) {
1797 _cleanup_udev_unref_ struct udev *udev = NULL;
1798 _cleanup_rtnl_unref_ sd_rtnl *rtnl = NULL;
1802 if (!arg_private_network)
1805 if (strv_isempty(arg_network_interfaces))
1808 r = sd_rtnl_open(&rtnl, 0);
1810 return log_error_errno(r, "Failed to connect to netlink: %m");
1814 log_error("Failed to connect to udev.");
1818 STRV_FOREACH(i, arg_network_interfaces) {
1819 _cleanup_rtnl_message_unref_ sd_rtnl_message *m = NULL;
1822 ifi = parse_interface(udev, *i);
/* The link is addressed by the index obtained above. */
1826 r = sd_rtnl_message_new_link(rtnl, &m, RTM_SETLINK, ifi);
1828 return log_error_errno(r, "Failed to allocate netlink message: %m");
1830 r = sd_rtnl_message_append_u32(m, IFLA_NET_NS_PID, pid);
1832 return log_error_errno(r, "Failed to append namespace PID to netlink message: %m");
1834 r = sd_rtnl_call(rtnl, m, 0, NULL);
1836 return log_error_errno(r, "Failed to move interface %s to namespace: %m", *i);
/* Creates one macvlan interface per entry of arg_network_macvlan.
 *
 * Each new link is stacked on the listed interface (IFLA_LINK), named
 * "mv-<name>" shortened to IFNAMSIZ-1 characters, given a MAC address
 * from generate_mac() with a per-entry index, put into bridge mode and
 * assigned to the network namespace of pid. arg_private_network and an
 * empty list are checked for first. */
1842 static int setup_macvlan(pid_t pid) {
1843 _cleanup_udev_unref_ struct udev *udev = NULL;
1844 _cleanup_rtnl_unref_ sd_rtnl *rtnl = NULL;
1849 if (!arg_private_network)
1852 if (strv_isempty(arg_network_macvlan))
1855 r = sd_rtnl_open(&rtnl, 0);
1857 return log_error_errno(r, "Failed to connect to netlink: %m");
1861 log_error("Failed to connect to udev.");
1865 STRV_FOREACH(i, arg_network_macvlan) {
1866 _cleanup_rtnl_message_unref_ sd_rtnl_message *m = NULL;
1867 _cleanup_free_ char *n = NULL;
1868 struct ether_addr mac;
1871 ifi = parse_interface(udev, *i);
/* idx is incremented per entry, so every macvlan link hashes to a
 * different address. */
1875 r = generate_mac(&mac, MACVLAN_HASH_KEY, idx++);
1877 return log_error_errno(r, "Failed to create MACVLAN MAC address: %m");
1879 r = sd_rtnl_message_new_link(rtnl, &m, RTM_NEWLINK, 0);
1881 return log_error_errno(r, "Failed to allocate netlink message: %m");
/* ifi is the interface the macvlan link sits on top of. */
1883 r = sd_rtnl_message_append_u32(m, IFLA_LINK, ifi);
1885 return log_error_errno(r, "Failed to add netlink interface index: %m");
1887 n = strappend("mv-", *i);
/* Cut the generated name down to what fits into an interface name. */
1891 strshorten(n, IFNAMSIZ-1);
1893 r = sd_rtnl_message_append_string(m, IFLA_IFNAME, n);
1895 return log_error_errno(r, "Failed to add netlink interface name: %m");
1897 r = sd_rtnl_message_append_ether_addr(m, IFLA_ADDRESS, &mac);
1899 return log_error_errno(r, "Failed to add netlink MAC address: %m");
1901 r = sd_rtnl_message_append_u32(m, IFLA_NET_NS_PID, pid);
1903 return log_error_errno(r, "Failed to add netlink namespace field: %m");
/* IFLA_LINKINFO > IFLA_INFO_DATA ("macvlan") carries the mode. */
1905 r = sd_rtnl_message_open_container(m, IFLA_LINKINFO);
1907 return log_error_errno(r, "Failed to open netlink container: %m");
1909 r = sd_rtnl_message_open_container_union(m, IFLA_INFO_DATA, "macvlan");
1911 return log_error_errno(r, "Failed to open netlink container: %m");
1913 r = sd_rtnl_message_append_u32(m, IFLA_MACVLAN_MODE, MACVLAN_MODE_BRIDGE);
1915 return log_error_errno(r, "Failed to append macvlan mode: %m");
1917 r = sd_rtnl_message_close_container(m);
1919 return log_error_errno(r, "Failed to close netlink container: %m");
1921 r = sd_rtnl_message_close_container(m);
1923 return log_error_errno(r, "Failed to close netlink container: %m");
1925 r = sd_rtnl_call(rtnl, m, 0, NULL);
1927 return log_error_errno(r, "Failed to add new macvlan interfaces: %m");
/* Builds and loads a seccomp filter whose default action is to allow.
 *
 * Two kinds of rules are added: every syscall in blacklist[] fails with
 * EPERM, and one rule comparing argument 0 with AF_NETLINK and argument 2
 * with NETLINK_AUDIT fails with EAFNOSUPPORT. SCMP_FLTATR_CTL_NNP is set
 * to 0 before the filter is loaded, and the filter context is released
 * afterwards. */
1933 static int setup_seccomp(void) {
/* Syscalls to be refused with EPERM. */
1936 static const int blacklist[] = {
1937 SCMP_SYS(kexec_load),
1938 SCMP_SYS(open_by_handle_at),
1939 SCMP_SYS(init_module),
1940 SCMP_SYS(finit_module),
1941 SCMP_SYS(delete_module),
1948 scmp_filter_ctx seccomp;
1952 seccomp = seccomp_init(SCMP_ACT_ALLOW);
1956 r = seccomp_add_secondary_archs(seccomp);
1958 log_error_errno(r, "Failed to add secondary archs to seccomp filter: %m");
1962 for (i = 0; i < ELEMENTSOF(blacklist); i++) {
1963 r = seccomp_rule_add(seccomp, SCMP_ACT_ERRNO(EPERM), blacklist[i], 0);
1965 continue; /* unknown syscall */
1967 log_error_errno(r, "Failed to block syscall: %m");
1973 Audit is broken in containers, much of the userspace audit
1974 hookup will fail if running inside a container. We don't
1975 care and just turn off creation of audit sockets.
1977 This will make socket(AF_NETLINK, *, NETLINK_AUDIT) fail
1978 with EAFNOSUPPORT which audit userspace uses as indication
1979 that audit is disabled in the kernel.
1982 r = seccomp_rule_add(
1984 SCMP_ACT_ERRNO(EAFNOSUPPORT),
1987 SCMP_A0(SCMP_CMP_EQ, AF_NETLINK),
1988 SCMP_A2(SCMP_CMP_EQ, NETLINK_AUDIT));
1990 log_error_errno(r, "Failed to add audit seccomp rule: %m");
/* Keep loading the filter from setting NO_NEW_PRIVS. */
1994 r = seccomp_attr_set(seccomp, SCMP_FLTATR_CTL_NNP, 0);
1996 log_error_errno(r, "Failed to unset NO_NEW_PRIVS: %m");
2000 r = seccomp_load(seccomp);
2002 log_error_errno(r, "Failed to install seccomp audit filter: %m");
2005 seccomp_release(seccomp);
/* Opens the disk image arg_image and makes it available as a block device.
 *
 * The image is opened read-only or read-write depending on arg_read_only.
 * A block device is used as-is (its path is duplicated). For a regular
 * file, a free loop device is obtained from /dev/loop-control, the image
 * is attached with LOOP_SET_FD, and the loop device is configured with
 * autoclear and partition scanning (plus the read-only flag). The path of
 * the loop device is handed back through device_path. Anything that is
 * neither a block device nor a regular file is rejected with an error
 * message. */
2013 static int setup_image(char **device_path, int *loop_nr) {
2014 struct loop_info64 info = {
2015 .lo_flags = LO_FLAGS_AUTOCLEAR|LO_FLAGS_PARTSCAN
2017 _cleanup_close_ int fd = -1, control = -1, loop = -1;
2018 _cleanup_free_ char* loopdev = NULL;
2022 assert(device_path);
2025 fd = open(arg_image, O_CLOEXEC|(arg_read_only ? O_RDONLY : O_RDWR)|O_NONBLOCK|O_NOCTTY);
2027 return log_error_errno(errno, "Failed to open %s: %m", arg_image);
2029 if (fstat(fd, &st) < 0)
2030 return log_error_errno(errno, "Failed to stat %s: %m", arg_image);
2032 if (S_ISBLK(st.st_mode)) {
2035 p = strdup(arg_image);
2049 if (!S_ISREG(st.st_mode)) {
/* No system call failed here, so errno holds no meaningful value:
 * log a plain message instead of expanding a stale errno via %m. */
2050 log_error("%s is not a regular file or block device.", arg_image);
2054 control = open("/dev/loop-control", O_RDWR|O_CLOEXEC|O_NOCTTY|O_NONBLOCK);
2056 return log_error_errno(errno, "Failed to open /dev/loop-control: %m");
/* LOOP_CTL_GET_FREE yields the number of an unused loop device. */
2058 nr = ioctl(control, LOOP_CTL_GET_FREE);
2060 return log_error_errno(errno, "Failed to allocate loop device: %m");
2062 if (asprintf(&loopdev, "/dev/loop%i", nr) < 0)
2065 loop = open(loopdev, O_CLOEXEC|(arg_read_only ? O_RDONLY : O_RDWR)|O_NONBLOCK|O_NOCTTY);
2067 return log_error_errno(errno, "Failed to open loop device %s: %m", loopdev);
/* Attach the image file to the loop device. */
2069 if (ioctl(loop, LOOP_SET_FD, fd) < 0)
2070 return log_error_errno(errno, "Failed to set loopback file descriptor on %s: %m", loopdev);
2073 info.lo_flags |= LO_FLAGS_READ_ONLY;
2075 if (ioctl(loop, LOOP_SET_STATUS64, &info) < 0)
2076 return log_error_errno(errno, "Failed to set loopback settings on %s: %m", loopdev);
2078 *device_path = loopdev;
/* Finds the partitions to mount in the GPT disk image behind fd.
 *
 * libblkid probes the device for a partition table; anything but "gpt"
 * is rejected. The udev devices below the image's block device are then
 * enumerated, and each one is matched to its libblkid partition entry and
 * classified by GPT type UUID: GPT_HOME, GPT_SRV, and (where defined at
 * build time) GPT_ROOT_NATIVE and GPT_ROOT_SECONDARY. The
 * GPT_FLAG_NO_AUTO flag is tested for every partition, and
 * GPT_FLAG_READ_ONLY determines the *_rw results. When a partition of a
 * type was already chosen, partition numbers are compared (nr >= *_nr),
 * presumably so that the lowest-numbered one is kept.
 *
 * Results are handed back through root_device/home_device/srv_device and
 * the matching *_rw flags. It is an error if neither a root nor a
 * secondary root partition was found. The final log_error() belongs to
 * the build without blkid support. */
2089 static int dissect_image(
2091 char **root_device, bool *root_device_rw,
2092 char **home_device, bool *home_device_rw,
2093 char **srv_device, bool *srv_device_rw,
2097 int home_nr = -1, srv_nr = -1;
2098 #ifdef GPT_ROOT_NATIVE
2101 #ifdef GPT_ROOT_SECONDARY
2102 int secondary_root_nr = -1;
2105 _cleanup_free_ char *home = NULL, *root = NULL, *secondary_root = NULL, *srv = NULL;
2106 _cleanup_udev_enumerate_unref_ struct udev_enumerate *e = NULL;
2107 _cleanup_udev_device_unref_ struct udev_device *d = NULL;
2108 _cleanup_blkid_free_probe_ blkid_probe b = NULL;
2109 _cleanup_udev_unref_ struct udev *udev = NULL;
2110 struct udev_list_entry *first, *item;
2111 bool home_rw = true, root_rw = true, secondary_root_rw = true, srv_rw = true;
2112 const char *pttype = NULL;
2118 assert(root_device);
2119 assert(home_device);
/* Probe the image for a partition table. */
2123 b = blkid_new_probe();
2128 r = blkid_probe_set_device(b, fd, 0, 0);
2133 log_error_errno(errno, "Failed to set device on blkid probe: %m");
2137 blkid_probe_enable_partitions(b, 1);
2138 blkid_probe_set_partitions_flags(b, BLKID_PARTS_ENTRY_DETAILS);
2141 r = blkid_do_safeprobe(b);
/* -2 and 1 are both reported as "no partition table identified". */
2142 if (r == -2 || r == 1) {
2143 log_error("Failed to identify any partition table on %s.\n"
2144 "Note that the disk image needs to follow http://www.freedesktop.org/wiki/Specifications/DiscoverablePartitionsSpec/ to be supported by systemd-nspawn.", arg_image);
2146 } else if (r != 0) {
2149 log_error_errno(errno, "Failed to probe: %m");
/* Only GUID partition tables are accepted. */
2153 blkid_probe_lookup_value(b, "PTTYPE", &pttype, NULL);
2154 if (!streq_ptr(pttype, "gpt")) {
2155 log_error("Image %s does not carry a GUID Partition Table.\n"
2156 "Note that the disk image needs to follow http://www.freedesktop.org/wiki/Specifications/DiscoverablePartitionsSpec/ to be supported by systemd-nspawn.", arg_image);
2161 pl = blkid_probe_get_partitions(b);
2166 log_error("Failed to list partitions of %s", arg_image);
/* Enumerate the udev devices whose parent is the image's block device. */
2174 if (fstat(fd, &st) < 0)
2175 return log_error_errno(errno, "Failed to stat block device: %m");
2177 d = udev_device_new_from_devnum(udev, 'b', st.st_rdev);
2181 e = udev_enumerate_new(udev);
2185 r = udev_enumerate_add_match_parent(e, d);
2189 r = udev_enumerate_scan_devices(e);
2191 return log_error_errno(r, "Failed to scan for partition devices of %s: %m", arg_image);
2193 first = udev_enumerate_get_list_entry(e);
2194 udev_list_entry_foreach(item, first) {
2195 _cleanup_udev_device_unref_ struct udev_device *q;
2196 const char *stype, *node;
2197 unsigned long long flags;
2204 q = udev_device_new_from_syspath(udev, udev_list_entry_get_name(item));
2209 log_error_errno(errno, "Failed to get partition device of %s: %m", arg_image);
2213 qn = udev_device_get_devnum(q);
/* A device number equal to st.st_rdev means this entry is the
 * image's own block device rather than one of its partitions. */
2217 if (st.st_rdev == qn)
2220 node = udev_device_get_devnode(q);
/* Map the udev device to its libblkid partition entry. */
2224 pp = blkid_partlist_devno_to_partition(pl, qn);
2228 flags = blkid_partition_get_flags(pp);
2229 if (flags & GPT_FLAG_NO_AUTO)
2232 nr = blkid_partition_get_partno(pp);
2236 stype = blkid_partition_get_type_string(pp);
2240 if (sd_id128_from_string(stype, &type_id) < 0)
/* Classify the partition by its GPT type UUID. */
2243 if (sd_id128_equal(type_id, GPT_HOME)) {
2245 if (home && nr >= home_nr)
2249 home_rw = !(flags & GPT_FLAG_READ_ONLY);
2252 home = strdup(node);
2255 } else if (sd_id128_equal(type_id, GPT_SRV)) {
2257 if (srv && nr >= srv_nr)
2261 srv_rw = !(flags & GPT_FLAG_READ_ONLY);
2268 #ifdef GPT_ROOT_NATIVE
2269 else if (sd_id128_equal(type_id, GPT_ROOT_NATIVE)) {
2271 if (root && nr >= root_nr)
2275 root_rw = !(flags & GPT_FLAG_READ_ONLY);
2278 root = strdup(node);
2283 #ifdef GPT_ROOT_SECONDARY
2284 else if (sd_id128_equal(type_id, GPT_ROOT_SECONDARY)) {
2286 if (secondary_root && nr >= secondary_root_nr)
2289 secondary_root_nr = nr;
2290 secondary_root_rw = !(flags & GPT_FLAG_READ_ONLY);
2293 free(secondary_root);
2294 secondary_root = strdup(node);
2295 if (!secondary_root)
2301 if (!root && !secondary_root) {
2302 log_error("Failed to identify root partition in disk image %s.\n"
2303 "Note that the disk image needs to follow http://www.freedesktop.org/wiki/Specifications/DiscoverablePartitionsSpec/ to be supported by systemd-nspawn.", arg_image);
/* Hand the results to the caller; the secondary root partition is only
 * used in the else branch of the root partition. */
2308 *root_device = root;
2311 *root_device_rw = root_rw;
2313 } else if (secondary_root) {
2314 *root_device = secondary_root;
/* Ownership moved to the caller; keep the cleanup handler off it. */
2315 secondary_root = NULL;
2317 *root_device_rw = secondary_root_rw;
2322 *home_device = home;
2325 *home_device_rw = home_rw;
2332 *srv_device_rw = srv_rw;
2337 log_error("--image= is not supported, compiled without blkid support.");
/* Mounts the block device what below where.
 *
 * The mount point is the concatenation of where and directory. The file
 * system type is detected with a libblkid superblock probe ("TYPE"
 * value); "crypto_LUKS" is refused. The mount uses MS_NODEV, plus
 * MS_RDONLY unless rw is set. The final log_error() belongs to the build
 * without blkid support. */
2342 static int mount_device(const char *what, const char *where, const char *directory, bool rw) {
2344 _cleanup_blkid_free_probe_ blkid_probe b = NULL;
2345 const char *fstype, *p;
2355 p = strappenda(where, directory);
2360 b = blkid_new_probe_from_filename(what);
2364 log_error_errno(errno, "Failed to allocate prober for %s: %m", what);
2368 blkid_probe_enable_superblocks(b, 1);
2369 blkid_probe_set_superblocks_flags(b, BLKID_SUBLKS_TYPE);
2372 r = blkid_do_safeprobe(b);
/* blkid_do_safeprobe() returns 1 when nothing was detected and -2 when
 * the result is ambivalent; both mean the type cannot be determined.
 * -1 is a genuine probing error and must reach the errno-reporting
 * branch below. This matches the check in dissect_image(). */
2373 if (r == -2 || r == 1) {
2374 log_error("Cannot determine file system type of %s", what);
2376 } else if (r != 0) {
2379 log_error_errno(errno, "Failed to probe %s: %m", what);
2384 if (blkid_probe_lookup_value(b, "TYPE", &fstype, NULL) < 0) {
2387 log_error("Failed to determine file system type of %s", what);
2391 if (streq(fstype, "crypto_LUKS")) {
2392 log_error("nspawn currently does not support LUKS disk images.");
2396 if (mount(what, p, fstype, MS_NODEV|(rw ? 0 : MS_RDONLY), NULL) < 0)
2397 return log_error_errno(errno, "Failed to mount %s: %m", what);
2401 log_error("--image= is not supported, compiled without blkid support.");
/* Mounts the dissected partitions below arg_directory through
 * mount_device(): the root device onto arg_directory itself, the home
 * device onto "/home" and the srv device onto "/srv" beneath it. Each
 * partition is mounted with its own read-write flag. */
2406 static int mount_devices(
2408 const char *root_device, bool root_device_rw,
2409 const char *home_device, bool home_device_rw,
2410 const char *srv_device, bool srv_device_rw) {
2416 r = mount_device(root_device, arg_directory, NULL, root_device_rw);
2418 return log_error_errno(r, "Failed to mount root directory: %m");
2422 r = mount_device(home_device, arg_directory, "/home", home_device_rw);
2424 return log_error_errno(r, "Failed to mount home directory: %m");
2428 r = mount_device(srv_device, arg_directory, "/srv", srv_device_rw);
2430 return log_error_errno(r, "Failed to mount server data directory: %m");
/* Tears down the loop device with number nr.
 *
 * If *image_fd refers to an open descriptor, LOOP_CLR_FD is issued on it
 * and it is closed (safe_close()'s return value is stored back). Then
 * LOOP_CTL_REMOVE is issued on /dev/loop-control for nr. Failures are
 * only logged as warnings; the function returns nothing. */
2436 static void loop_remove(int nr, int *image_fd) {
2437 _cleanup_close_ int control = -1;
2443 if (image_fd && *image_fd >= 0) {
2444 r = ioctl(*image_fd, LOOP_CLR_FD);
2446 log_warning_errno(errno, "Failed to close loop image: %m");
2447 *image_fd = safe_close(*image_fd);
2450 control = open("/dev/loop-control", O_RDWR|O_CLOEXEC|O_NOCTTY|O_NONBLOCK);
2452 log_warning_errno(errno, "Failed to open /dev/loop-control: %m");
2456 r = ioctl(control, LOOP_CTL_REMOVE, nr);
2458 log_warning_errno(errno, "Failed to remove loop %d: %m", nr);
/* Runs "getent <database> <key>" in a child process, with its standard
 * output connected to a pipe.
 *
 * In the child, stdout becomes the write end of the pipe, stdin and
 * stderr are redirected to /dev/null, signal handlers are reset and all
 * other file descriptors are closed. getent is then executed with an
 * empty environment, trying /usr/bin/getent first and /bin/getent
 * second. The child exits with EXIT_FAILURE if any step fails. The
 * parent closes its copy of the pipe's write end. */
2461 static int spawn_getent(const char *database, const char *key, pid_t *rpid) {
2469 if (pipe2(pipe_fds, O_CLOEXEC) < 0)
2470 return log_error_errno(errno, "Failed to allocate pipe: %m");
2474 return log_error_errno(errno, "Failed to fork getent child: %m");
2475 else if (pid == 0) {
2477 char *empty_env = NULL;
/* flags 0: the duplicate on stdout does not carry O_CLOEXEC. */
2479 if (dup3(pipe_fds[1], STDOUT_FILENO, 0) < 0)
2480 _exit(EXIT_FAILURE);
/* Only close the originals if they are not one of the stdio fds. */
2482 if (pipe_fds[0] > 2)
2483 safe_close(pipe_fds[0]);
2484 if (pipe_fds[1] > 2)
2485 safe_close(pipe_fds[1]);
2487 nullfd = open("/dev/null", O_RDWR);
2489 _exit(EXIT_FAILURE);
2491 if (dup3(nullfd, STDIN_FILENO, 0) < 0)
2492 _exit(EXIT_FAILURE);
2494 if (dup3(nullfd, STDERR_FILENO, 0) < 0)
2495 _exit(EXIT_FAILURE);
2500 reset_all_signal_handlers();
2501 close_all_fds(NULL, 0);
/* The second execle() is only reached if the first one failed. */
2503 execle("/usr/bin/getent", "getent", database, key, NULL, &empty_env);
2504 execle("/bin/getent", "getent", database, key, NULL, &empty_env);
2505 _exit(EXIT_FAILURE);
2508 pipe_fds[1] = safe_close(pipe_fds[1]);
/* Switches the calling process to the credentials of arg_user.
 *
 * If arg_user is unset, "root" or "0", the supplementary groups, GID and
 * UID are reset to 0. Otherwise "getent passwd" and "getent initgroups"
 * are run through spawn_getent() and one line of output of each is
 * parsed: the passwd line supplies UID, GID and home directory, the
 * initgroups line the supplementary group IDs. The home directory and
 * its parents are created, the three stdio descriptors are chowned to
 * the user, and finally setgroups(), setresgid() and setresuid() are
 * applied, in that order.
 *
 * The error messages name the calls that are actually made (setresgid()
 * and setresuid()). */
2515 static int change_uid_gid(char **_home) {
2516 char line[LINE_MAX], *x, *u, *g, *h;
2517 const char *word, *state;
2518 _cleanup_free_ uid_t *uids = NULL;
2519 _cleanup_free_ char *home = NULL;
2520 _cleanup_fclose_ FILE *f = NULL;
2521 _cleanup_close_ int fd = -1;
2522 unsigned n_uids = 0;
2531 if (!arg_user || streq(arg_user, "root") || streq(arg_user, "0")) {
2532 /* Reset everything fully to 0, just in case */
2534 if (setgroups(0, NULL) < 0)
2535 return log_error_errno(errno, "setgroups() failed: %m");
2537 if (setresgid(0, 0, 0) < 0)
2538 return log_error_errno(errno, "setresgid() failed: %m");
2540 if (setresuid(0, 0, 0) < 0)
2541 return log_error_errno(errno, "setresuid() failed: %m");
2547 /* First, get user credentials */
2548 fd = spawn_getent("passwd", arg_user, &pid);
2552 f = fdopen(fd, "r");
2557 if (!fgets(line, sizeof(line), f)) {
2560 log_error("Failed to resolve user %s.", arg_user);
2564 log_error_errno(errno, "Failed to read from getent: %m");
2570 wait_for_terminate_and_warn("getent passwd", pid, true);
/* Walk the colon-separated fields of the passwd line. */
2572 x = strchr(line, ':');
2574 log_error("/etc/passwd entry has invalid user field.");
2578 u = strchr(x+1, ':');
2580 log_error("/etc/passwd entry has invalid password field.");
2587 log_error("/etc/passwd entry has invalid UID field.");
2595 log_error("/etc/passwd entry has invalid GID field.");
2600 h = strchr(x+1, ':');
2602 log_error("/etc/passwd entry has invalid GECOS field.");
2609 log_error("/etc/passwd entry has invalid home directory field.");
2615 r = parse_uid(u, &uid);
2617 log_error("Failed to parse UID of user.");
2621 r = parse_gid(g, &gid);
2623 log_error("Failed to parse GID of user.");
2631 /* Second, get group memberships */
2632 fd = spawn_getent("initgroups", arg_user, &pid);
2637 f = fdopen(fd, "r");
2642 if (!fgets(line, sizeof(line), f)) {
2644 log_error("Failed to resolve user %s.", arg_user);
2648 log_error_errno(errno, "Failed to read from getent: %m");
2654 wait_for_terminate_and_warn("getent initgroups", pid, true);
2656 /* Skip over the username and subsequent separator whitespace */
2658 x += strcspn(x, WHITESPACE);
2659 x += strspn(x, WHITESPACE);
/* Each remaining word is one supplementary group ID; the array is
 * grown as needed. */
2661 FOREACH_WORD(word, l, x, state) {
2667 if (!GREEDY_REALLOC(uids, sz, n_uids+1))
2670 r = parse_uid(c, &uids[n_uids++]);
2672 log_error("Failed to parse group data from getent.");
2677 r = mkdir_parents(home, 0775);
2679 return log_error_errno(r, "Failed to make home root directory: %m");
/* An already existing home directory is not an error. */
2681 r = mkdir_safe(home, 0755, uid, gid);
2682 if (r < 0 && r != -EEXIST)
2683 return log_error_errno(r, "Failed to make home directory: %m");
/* The return values of these fchown() calls are not checked. */
2685 fchown(STDIN_FILENO, uid, gid);
2686 fchown(STDOUT_FILENO, uid, gid);
2687 fchown(STDERR_FILENO, uid, gid);
/* Groups first, then GID, then UID. */
2689 if (setgroups(n_uids, uids) < 0)
2690 return log_error_errno(errno, "Failed to set auxiliary groups: %m");
2692 if (setresgid(gid, gid, gid) < 0)
2693 return log_error_errno(errno, "setresgid() failed: %m");
2695 if (setresuid(uid, uid, uid) < 0)
2696 return log_error_errno(errno, "setresuid() failed: %m");
2708 * < 0 : wait_for_terminate() failed to get the state of the
2709 * container, the container was terminated by a signal, or
2710 * failed for an unknown reason. No change is made to the
2711 * container argument.
2712 * > 0 : The program executed in the container terminated with an
2713 * error. The exit code of the program executed in the
2714 * container is returned. The container argument has been set
2715 * to CONTAINER_TERMINATED.
2716 * 0 : The container is being rebooted, has been shut down or exited
2717 * successfully. The container argument has been set to either
2718 * CONTAINER_TERMINATED or CONTAINER_REBOOTED.
2720 * That is, success is indicated by a return value of zero, and an
2721 * error is indicated by a non-zero value.
2723 static int wait_for_container(pid_t pid, ContainerStatus *container) {
/* Waits for the process pid to terminate and translates the resulting
 * siginfo (si_code/si_status) into a return value and a
 * ContainerStatus stored in *container. */
2727 r = wait_for_terminate(pid, &status);
2729 return log_warning_errno(r, "Failed to wait for container: %m");
2731 switch (status.si_code) {
/* The process exited by itself: its exit status becomes the return
 * value, and the verdict is logged at debug level when arg_quiet is
 * set, at info level otherwise. */
2734 if (status.si_status == 0) {
2735 log_full(arg_quiet ? LOG_DEBUG : LOG_INFO, "Container %s exited successfully.", arg_machine);
2738 log_full(arg_quiet ? LOG_DEBUG : LOG_INFO, "Container %s failed with error code %i.", arg_machine, status.si_status);
2740 *container = CONTAINER_TERMINATED;
2741 return status.si_status;
/* Here si_status is a signal number: SIGINT is treated as a shutdown,
 * SIGHUP as a reboot request. */
2744 if (status.si_status == SIGINT) {
2746 log_full(arg_quiet ? LOG_DEBUG : LOG_INFO, "Container %s has been shut down.", arg_machine);
2747 *container = CONTAINER_TERMINATED;
2750 } else if (status.si_status == SIGHUP) {
2752 log_full(arg_quiet ? LOG_DEBUG : LOG_INFO, "Container %s is being rebooted.", arg_machine);
2753 *container = CONTAINER_REBOOTED;
2757 /* CLD_KILLED fallthrough */
2760 log_error("Container %s terminated by signal %s.", arg_machine, signal_to_string(status.si_status));
2764 log_error("Container %s failed due to unknown reason.", arg_machine);
/* Signal handler that intentionally does nothing; main() installs it for
 * SIGCHLD. */
2771 static void nop_handler(int sig) {}
/* sd-event signal callback implementing a two-stage shutdown.
 *
 * userdata carries a PID. SIGRTMIN+3 is sent to that PID; if that
 * succeeds, a message is logged and the PID is cleared from the event
 * source's userdata. The log message says a second SIGTERM triggers
 * immediate termination. The callback also contains a request to exit
 * the event loop with code 0. */
2773 static int on_orderly_shutdown(sd_event_source *s, const struct signalfd_siginfo *si, void *userdata) {
2776 pid = PTR_TO_UINT32(userdata);
2778 if (kill(pid, SIGRTMIN+3) >= 0) {
2779 log_info("Trying to halt container. Send SIGTERM again to trigger immediate termination.");
2780 sd_event_source_set_userdata(s, NULL);
2785 sd_event_exit(sd_event_source_get_event(s), 0);
2789 int main(int argc, char *argv[]) {
2791 _cleanup_free_ char *device_path = NULL, *root_device = NULL, *home_device = NULL, *srv_device = NULL;
2792 bool root_device_rw = true, home_device_rw = true, srv_device_rw = true;
2793 _cleanup_close_ int master = -1, image_fd = -1;
2794 _cleanup_close_pair_ int kmsg_socket_pair[2] = { -1, -1 };
2795 _cleanup_fdset_free_ FDSet *fds = NULL;
2796 int r = EXIT_FAILURE, k, n_fd_passed, loop_nr = -1;
2797 const char *console = NULL;
2798 char veth_name[IFNAMSIZ];
2799 bool secondary = false;
2800 sigset_t mask, mask_chld;
2803 log_parse_environment();
2806 k = parse_argv(argc, argv);
2815 if (arg_directory) {
2818 p = path_make_absolute_cwd(arg_directory);
2819 free(arg_directory);
2822 arg_directory = get_current_dir_name();
2824 if (!arg_directory) {
2825 log_error("Failed to determine path, please use -D.");
2828 path_kill_slashes(arg_directory);
2832 arg_machine = strdup(basename(arg_image ? arg_image : arg_directory));
2838 hostname_cleanup(arg_machine, false);
2839 if (isempty(arg_machine)) {
2840 log_error("Failed to determine machine name automatically, please use -M.");
2845 if (geteuid() != 0) {
2846 log_error("Need to be root.");
2850 if (sd_booted() <= 0) {
2851 log_error("Not running on a systemd system.");
2856 n_fd_passed = sd_listen_fds(false);
2857 if (n_fd_passed > 0) {
2858 k = fdset_new_listen_fds(&fds, false);
2860 log_error_errno(k, "Failed to collect file descriptors: %m");
2864 fdset_close_others(fds);
2867 if (arg_directory) {
2868 if (path_equal(arg_directory, "/")) {
2869 log_error("Spawning container on root directory not supported.");
2874 if (path_is_os_tree(arg_directory) <= 0) {
2875 log_error("Directory %s doesn't look like an OS root directory (os-release file is missing). Refusing.", arg_directory);
2881 p = strappenda(arg_directory,
2882 argc > optind && path_is_absolute(argv[optind]) ? argv[optind] : "/usr/bin/");
2883 if (access(p, F_OK) < 0) {
2884 log_error("Directory %s lacks the binary to execute or doesn't look like a binary tree. Refusing.", arg_directory);
2890 char template[] = "/tmp/nspawn-root-XXXXXX";
2892 if (!mkdtemp(template)) {
2893 log_error_errno(errno, "Failed to create temporary directory: %m");
2898 arg_directory = strdup(template);
2899 if (!arg_directory) {
2904 image_fd = setup_image(&device_path, &loop_nr);
2910 r = dissect_image(image_fd,
2911 &root_device, &root_device_rw,
2912 &home_device, &home_device_rw,
2913 &srv_device, &srv_device_rw,
2919 master = posix_openpt(O_RDWR|O_NOCTTY|O_CLOEXEC|O_NDELAY);
2921 log_error_errno(errno, "Failed to acquire pseudo tty: %m");
2925 console = ptsname(master);
2927 log_error_errno(errno, "Failed to determine tty name: %m");
2932 log_info("Spawning container %s on %s.\nPress ^] three times within 1s to kill container.",
2933 arg_machine, arg_image ? arg_image : arg_directory);
2935 if (unlockpt(master) < 0) {
2936 log_error_errno(errno, "Failed to unlock tty: %m");
2940 if (socketpair(AF_UNIX, SOCK_DGRAM|SOCK_NONBLOCK|SOCK_CLOEXEC, 0, kmsg_socket_pair) < 0) {
2941 log_error_errno(errno, "Failed to create kmsg socket pair: %m");
2947 "STATUS=Container running.");
2949 assert_se(sigemptyset(&mask) == 0);
2950 sigset_add_many(&mask, SIGCHLD, SIGWINCH, SIGTERM, SIGINT, -1);
2951 assert_se(sigprocmask(SIG_BLOCK, &mask, NULL) == 0);
2953 assert_se(sigemptyset(&mask_chld) == 0);
2954 assert_se(sigaddset(&mask_chld, SIGCHLD) == 0);
2957 ContainerStatus container_status;
2958 _cleanup_(barrier_destroy) Barrier barrier = BARRIER_NULL;
2959 struct sigaction sa = {
2960 .sa_handler = nop_handler,
2961 .sa_flags = SA_NOCLDSTOP,
2964 r = barrier_create(&barrier);
2966 log_error_errno(r, "Cannot initialize IPC barrier: %m");
2970 /* Child can be killed before execv(), so handle SIGCHLD
2971 * in order to interrupt parent's blocking calls and
2972 * give it a chance to call wait() and terminate. */
2973 r = sigprocmask(SIG_UNBLOCK, &mask_chld, NULL);
2975 log_error_errno(errno, "Failed to change the signal mask: %m");
2979 r = sigaction(SIGCHLD, &sa, NULL);
2981 log_error_errno(errno, "Failed to install SIGCHLD handler: %m");
2985 pid = syscall(__NR_clone, SIGCHLD|CLONE_NEWNS|
2986 (arg_share_system ? 0 : CLONE_NEWIPC|CLONE_NEWPID|CLONE_NEWUTS)|
2987 (arg_private_network ? CLONE_NEWNET : 0), NULL);
2989 if (errno == EINVAL)
2990 log_error_errno(errno, "clone() failed, do you have namespace support enabled in your kernel? (You need UTS, IPC, PID and NET namespacing built in): %m");
2992 log_error_errno(errno, "clone() failed: %m");
3000 _cleanup_free_ char *home = NULL;
3002 const char *envp[] = {
3003 "PATH=" DEFAULT_PATH_SPLIT_USR,
3004 "container=systemd-nspawn", /* LXC sets container=lxc, so follow the scheme here */
3009 NULL, /* container_uuid */
3010 NULL, /* LISTEN_FDS */
3011 NULL, /* LISTEN_PID */
3016 barrier_set_role(&barrier, BARRIER_CHILD);
3018 envp[n_env] = strv_find_prefix(environ, "TERM=");
3022 master = safe_close(master);
3024 close_nointr(STDIN_FILENO);
3025 close_nointr(STDOUT_FILENO);
3026 close_nointr(STDERR_FILENO);
3028 kmsg_socket_pair[0] = safe_close(kmsg_socket_pair[0]);
3030 reset_all_signal_handlers();
3031 reset_signal_mask();
3033 k = open_terminal(console, O_RDWR);
3034 if (k != STDIN_FILENO) {
3040 log_error_errno(k, "Failed to open console: %m");
3041 _exit(EXIT_FAILURE);
3044 if (dup2(STDIN_FILENO, STDOUT_FILENO) != STDOUT_FILENO ||
3045 dup2(STDIN_FILENO, STDERR_FILENO) != STDERR_FILENO) {
3046 log_error_errno(errno, "Failed to duplicate console: %m");
3047 _exit(EXIT_FAILURE);
3051 log_error_errno(errno, "setsid() failed: %m");
3052 _exit(EXIT_FAILURE);
3055 if (reset_audit_loginuid() < 0)
3056 _exit(EXIT_FAILURE);
3058 if (prctl(PR_SET_PDEATHSIG, SIGKILL) < 0) {
3059 log_error_errno(errno, "PR_SET_PDEATHSIG failed: %m");
3060 _exit(EXIT_FAILURE);
3063 /* Mark everything as slave, so that we still
3064 * receive mounts from the real root, but don't
3065 * propagate mounts to the real root. */
3066 if (mount(NULL, "/", NULL, MS_SLAVE|MS_REC, NULL) < 0) {
3067 log_error_errno(errno, "MS_SLAVE|MS_REC failed: %m");
3068 _exit(EXIT_FAILURE);
3071 if (mount_devices(arg_directory,
3072 root_device, root_device_rw,
3073 home_device, home_device_rw,
3074 srv_device, srv_device_rw) < 0)
3075 _exit(EXIT_FAILURE);
3077 /* Turn directory into bind mount */
3078 if (mount(arg_directory, arg_directory, "bind", MS_BIND|MS_REC, NULL) < 0) {
3079 log_error_errno(errno, "Failed to make bind mount: %m");
3080 _exit(EXIT_FAILURE);
3083 r = setup_volatile(arg_directory);
3085 _exit(EXIT_FAILURE);
3087 if (setup_volatile_state(arg_directory) < 0)
3088 _exit(EXIT_FAILURE);
3090 r = base_filesystem_create(arg_directory);
3092 _exit(EXIT_FAILURE);
3094 if (arg_read_only) {
3095 k = bind_remount_recursive(arg_directory, true);
3097 log_error_errno(k, "Failed to make tree read-only: %m");
3098 _exit(EXIT_FAILURE);
3102 if (mount_all(arg_directory) < 0)
3103 _exit(EXIT_FAILURE);
3105 if (copy_devnodes(arg_directory) < 0)
3106 _exit(EXIT_FAILURE);
3108 if (setup_ptmx(arg_directory) < 0)
3109 _exit(EXIT_FAILURE);
3111 dev_setup(arg_directory);
3113 if (setup_seccomp() < 0)
3114 _exit(EXIT_FAILURE);
3116 if (setup_dev_console(arg_directory, console) < 0)
3117 _exit(EXIT_FAILURE);
3119 if (setup_kmsg(arg_directory, kmsg_socket_pair[1]) < 0)
3120 _exit(EXIT_FAILURE);
3122 kmsg_socket_pair[1] = safe_close(kmsg_socket_pair[1]);
3124 if (setup_boot_id(arg_directory) < 0)
3125 _exit(EXIT_FAILURE);
3127 if (setup_timezone(arg_directory) < 0)
3128 _exit(EXIT_FAILURE);
3130 if (setup_resolv_conf(arg_directory) < 0)
3131 _exit(EXIT_FAILURE);
3133 if (setup_journal(arg_directory) < 0)
3134 _exit(EXIT_FAILURE);
3136 if (mount_binds(arg_directory, arg_bind, false) < 0)
3137 _exit(EXIT_FAILURE);
3139 if (mount_binds(arg_directory, arg_bind_ro, true) < 0)
3140 _exit(EXIT_FAILURE);
3142 if (mount_tmpfs(arg_directory) < 0)
3143 _exit(EXIT_FAILURE);
3145 /* Tell the parent that we are ready, and that
3146 * it can cgroupify us so that we lack access
3147 * to certain devices and resources. */
3148 (void)barrier_place(&barrier);
3150 if (chdir(arg_directory) < 0) {
3151 log_error_errno(errno, "chdir(%s) failed: %m", arg_directory);
3152 _exit(EXIT_FAILURE);
3155 if (mount(arg_directory, "/", NULL, MS_MOVE, NULL) < 0) {
3156 log_error_errno(errno, "mount(MS_MOVE) failed: %m");
3157 _exit(EXIT_FAILURE);
3160 if (chroot(".") < 0) {
3161 log_error_errno(errno, "chroot() failed: %m");
3162 _exit(EXIT_FAILURE);
3165 if (chdir("/") < 0) {
3166 log_error_errno(errno, "chdir() failed: %m");
3167 _exit(EXIT_FAILURE);
3172 if (arg_private_network)
3175 if (drop_capabilities() < 0) {
3176 log_error_errno(errno, "drop_capabilities() failed: %m");
3177 _exit(EXIT_FAILURE);
3180 r = change_uid_gid(&home);
3182 _exit(EXIT_FAILURE);
3184 if ((asprintf((char**)(envp + n_env++), "HOME=%s", home ? home: "/root") < 0) ||
3185 (asprintf((char**)(envp + n_env++), "USER=%s", arg_user ? arg_user : "root") < 0) ||
3186 (asprintf((char**)(envp + n_env++), "LOGNAME=%s", arg_user ? arg_user : "root") < 0)) {
3188 _exit(EXIT_FAILURE);
3191 if (!sd_id128_equal(arg_uuid, SD_ID128_NULL)) {
3194 if (asprintf((char**)(envp + n_env++), "container_uuid=%s", id128_format_as_uuid(arg_uuid, as_uuid)) < 0) {
3196 _exit(EXIT_FAILURE);
3200 if (fdset_size(fds) > 0) {
3201 k = fdset_cloexec(fds, false);
3203 log_error("Failed to unset O_CLOEXEC for file descriptors.");
3204 _exit(EXIT_FAILURE);
3207 if ((asprintf((char **)(envp + n_env++), "LISTEN_FDS=%u", n_fd_passed) < 0) ||
3208 (asprintf((char **)(envp + n_env++), "LISTEN_PID=1") < 0)) {
3210 _exit(EXIT_FAILURE);
3216 if (arg_personality != 0xffffffffLU) {
3217 if (personality(arg_personality) < 0) {
3218 log_error_errno(errno, "personality() failed: %m");
3219 _exit(EXIT_FAILURE);
3221 } else if (secondary) {
3222 if (personality(PER_LINUX32) < 0) {
3223 log_error_errno(errno, "personality() failed: %m");
3224 _exit(EXIT_FAILURE);
3229 if (arg_selinux_context)
3230 if (setexeccon((security_context_t) arg_selinux_context) < 0) {
3231 log_error_errno(errno, "setexeccon(\"%s\") failed: %m", arg_selinux_context);
3232 _exit(EXIT_FAILURE);
3236 if (!strv_isempty(arg_setenv)) {
3239 n = strv_env_merge(2, envp, arg_setenv);
3242 _exit(EXIT_FAILURE);
3247 env_use = (char**) envp;
3249 /* Wait until the parent is ready with the setup, too... */
3250 if (!barrier_place_and_sync(&barrier))
3251 _exit(EXIT_FAILURE);
3257 /* Automatically search for the init system */
3259 l = 1 + argc - optind;
3260 a = newa(char*, l + 1);
3261 memcpy(a + 1, argv + optind, l * sizeof(char*));
3263 a[0] = (char*) "/usr/lib/systemd/systemd";
3264 execve(a[0], a, env_use);
3266 a[0] = (char*) "/lib/systemd/systemd";
3267 execve(a[0], a, env_use);
3269 a[0] = (char*) "/sbin/init";
3270 execve(a[0], a, env_use);
3271 } else if (argc > optind)
3272 execvpe(argv[optind], argv + optind, env_use);
3274 chdir(home ? home : "/root");
3275 execle("/bin/bash", "-bash", NULL, env_use);
3276 execle("/bin/sh", "-sh", NULL, env_use);
3279 log_error_errno(errno, "execv() failed: %m");
3280 _exit(EXIT_FAILURE);
3283 barrier_set_role(&barrier, BARRIER_PARENT);
3287 /* wait for child-setup to be done */
3288 if (barrier_place_and_sync(&barrier)) {
3289 _cleanup_event_unref_ sd_event *event = NULL;
3290 _cleanup_(pty_forward_freep) PTYForward *forward = NULL;
3293 r = move_network_interfaces(pid);
3297 r = setup_veth(pid, veth_name, &ifi);
3301 r = setup_bridge(veth_name, &ifi);
3305 r = setup_macvlan(pid);
3309 r = register_machine(pid, ifi);
3313 /* Block SIGCHLD here, before notifying child.
3314 * process_pty() will handle it with the other signals. */
3315 r = sigprocmask(SIG_BLOCK, &mask_chld, NULL);
3319 /* Reset signal to default */
3320 r = default_signals(SIGCHLD, -1);
3324 /* Notify the child that the parent is ready with all
3325 * its setup, and that the child can now hand over
3326 * control to the code to run inside the container. */
3327 (void)barrier_place(&barrier);
3329 r = sd_event_new(&event);
3331 log_error_errno(r, "Failed to get default event source: %m");
3336 /* Try to kill the init system on SIGINT or SIGTERM */
3337 sd_event_add_signal(event, NULL, SIGINT, on_orderly_shutdown, UINT32_TO_PTR(pid));
3338 sd_event_add_signal(event, NULL, SIGTERM, on_orderly_shutdown, UINT32_TO_PTR(pid));
3340 /* Immediately exit */
3341 sd_event_add_signal(event, NULL, SIGINT, NULL, NULL);
3342 sd_event_add_signal(event, NULL, SIGTERM, NULL, NULL);
3345 /* simply exit on sigchld */
3346 sd_event_add_signal(event, NULL, SIGCHLD, NULL, NULL);
3348 r = pty_forward_new(event, master, &forward);
3350 log_error_errno(r, "Failed to create PTY forwarder: %m");
3354 r = sd_event_loop(event);
3356 return log_error_errno(r, "Failed to run event loop: %m");
3358 forward = pty_forward_free(forward);
3363 /* Kill if it is not dead yet anyway */
3364 terminate_machine(pid);
3367 /* Normally redundant, but better safe than sorry */
3370 r = wait_for_container(pid, &container_status);
3374 /* We failed to wait for the container, or the
3375 * container exited abnormally */
3378 } else if (r > 0 || container_status == CONTAINER_TERMINATED)
3379 /* The container exited with a non-zero
3380 * status, or with zero status and no reboot
3384 /* CONTAINER_REBOOTED, loop again */
3386 if (arg_keep_unit) {
3387 /* Special handling if we are running as a
3388 * service: instead of simply restarting the
3389 * machine we want to restart the entire
3390 * service, so let's inform systemd about this
3391 * with the special exit code 133. The service
3392 * file uses RestartForceExitStatus=133 so
3393 * that this results in a full nspawn
3394 * restart. This is necessary since we might
3395 * have cgroup parameters set we want to have
3405 "STATUS=Terminating...");
3407 loop_remove(loop_nr, &image_fd);
3412 free(arg_directory);
3415 strv_free(arg_setenv);
3416 strv_free(arg_network_interfaces);
3417 strv_free(arg_network_macvlan);
3418 strv_free(arg_bind);
3419 strv_free(arg_bind_ro);
3420 strv_free(arg_tmpfs);