1 /*-*- Mode: C; c-basic-offset: 8; indent-tabs-mode: nil -*-*/
4 This file is part of systemd.
6 Copyright 2010 Lennart Poettering
8 systemd is free software; you can redistribute it and/or modify it
9 under the terms of the GNU Lesser General Public License as published by
10 the Free Software Foundation; either version 2.1 of the License, or
11 (at your option) any later version.
13 systemd is distributed in the hope that it will be useful, but
14 WITHOUT ANY WARRANTY; without even the implied warranty of
15 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
16 Lesser General Public License for more details.
18 You should have received a copy of the GNU Lesser General Public License
19 along with systemd; If not, see <http://www.gnu.org/licenses/>.
25 #include <sys/types.h>
26 #include <sys/syscall.h>
27 #include <sys/mount.h>
33 #include <sys/prctl.h>
34 #include <sys/capability.h>
37 #include <sys/signalfd.h>
41 #include <sys/socket.h>
42 #include <linux/netlink.h>
44 #include <linux/veth.h>
45 #include <sys/personality.h>
46 #include <linux/loop.h>
49 #include <selinux/selinux.h>
57 #include <blkid/blkid.h>
60 #include "sd-daemon.h"
70 #include "cgroup-util.h"
72 #include "path-util.h"
73 #include "loopback-setup.h"
74 #include "dev-setup.h"
79 #include "bus-error.h"
81 #include "bus-kernel.h"
84 #include "rtnl-util.h"
85 #include "udev-util.h"
86 #include "blkid-util.h"
88 #include "siphash24.h"
90 #include "base-filesystem.h"
92 #include "event-util.h"
95 #include "seccomp-util.h"
98 typedef enum ContainerStatus {
103 typedef enum LinkJournal {
110 typedef enum Volatile {
116 static char *arg_directory = NULL;
117 static char *arg_user = NULL;
118 static sd_id128_t arg_uuid = {};
119 static char *arg_machine = NULL;
120 static const char *arg_selinux_context = NULL;
121 static const char *arg_selinux_apifs_context = NULL;
122 static const char *arg_slice = NULL;
123 static bool arg_private_network = false;
124 static bool arg_read_only = false;
125 static bool arg_boot = false;
126 static LinkJournal arg_link_journal = LINK_AUTO;
127 static uint64_t arg_retain =
128 (1ULL << CAP_CHOWN) |
129 (1ULL << CAP_DAC_OVERRIDE) |
130 (1ULL << CAP_DAC_READ_SEARCH) |
131 (1ULL << CAP_FOWNER) |
132 (1ULL << CAP_FSETID) |
133 (1ULL << CAP_IPC_OWNER) |
135 (1ULL << CAP_LEASE) |
136 (1ULL << CAP_LINUX_IMMUTABLE) |
137 (1ULL << CAP_NET_BIND_SERVICE) |
138 (1ULL << CAP_NET_BROADCAST) |
139 (1ULL << CAP_NET_RAW) |
140 (1ULL << CAP_SETGID) |
141 (1ULL << CAP_SETFCAP) |
142 (1ULL << CAP_SETPCAP) |
143 (1ULL << CAP_SETUID) |
144 (1ULL << CAP_SYS_ADMIN) |
145 (1ULL << CAP_SYS_CHROOT) |
146 (1ULL << CAP_SYS_NICE) |
147 (1ULL << CAP_SYS_PTRACE) |
148 (1ULL << CAP_SYS_TTY_CONFIG) |
149 (1ULL << CAP_SYS_RESOURCE) |
150 (1ULL << CAP_SYS_BOOT) |
151 (1ULL << CAP_AUDIT_WRITE) |
152 (1ULL << CAP_AUDIT_CONTROL) |
154 static char **arg_bind = NULL;
155 static char **arg_bind_ro = NULL;
156 static char **arg_tmpfs = NULL;
157 static char **arg_setenv = NULL;
158 static bool arg_quiet = false;
159 static bool arg_share_system = false;
160 static bool arg_register = true;
161 static bool arg_keep_unit = false;
162 static char **arg_network_interfaces = NULL;
163 static char **arg_network_macvlan = NULL;
164 static bool arg_network_veth = false;
165 static const char *arg_network_bridge = NULL;
166 static unsigned long arg_personality = 0xffffffffLU;
167 static const char *arg_image = NULL;
168 static Volatile arg_volatile = VOLATILE_NO;
170 static void help(void) {
171 printf("%s [OPTIONS...] [PATH] [ARGUMENTS...]\n\n"
172 "Spawn a minimal namespace container for debugging, testing and building.\n\n"
173 " -h --help Show this help\n"
174 " --version Print version string\n"
175 " -q --quiet Do not show status information\n"
176 " -D --directory=PATH Root directory for the container\n"
177 " -i --image=PATH File system device or image for the container\n"
178 " -b --boot Boot up full system (i.e. invoke init)\n"
179 " -u --user=USER Run the command under specified user or uid\n"
180 " -M --machine=NAME Set the machine name for the container\n"
181 " --uuid=UUID Set a specific machine UUID for the container\n"
182 " -S --slice=SLICE Place the container in the specified slice\n"
183 " --private-network Disable network in container\n"
184 " --network-interface=INTERFACE\n"
185 " Assign an existing network interface to the\n"
187 " --network-macvlan=INTERFACE\n"
188 " Create a macvlan network interface based on an\n"
189 " existing network interface to the container\n"
190 " --network-veth Add a virtual ethernet connection between host\n"
192 " --network-bridge=INTERFACE\n"
193 " Add a virtual ethernet connection between host\n"
194 " and container and add it to an existing bridge on\n"
196 " -Z --selinux-context=SECLABEL\n"
197 " Set the SELinux security context to be used by\n"
198 " processes in the container\n"
199 " -L --selinux-apifs-context=SECLABEL\n"
200 " Set the SELinux security context to be used by\n"
201 " API/tmpfs file systems in the container\n"
202 " --capability=CAP In addition to the default, retain specified\n"
204 " --drop-capability=CAP Drop the specified capability from the default set\n"
205 " --link-journal=MODE Link up guest journal, one of no, auto, guest, host\n"
206 " -j Equivalent to --link-journal=host\n"
207 " --read-only Mount the root directory read-only\n"
208 " --bind=PATH[:PATH] Bind mount a file or directory from the host into\n"
210 " --bind-ro=PATH[:PATH] Similar, but creates a read-only bind mount\n"
211 " --tmpfs=PATH:[OPTIONS] Mount an empty tmpfs to the specified directory\n"
212 " --setenv=NAME=VALUE Pass an environment variable to PID 1\n"
213 " --share-system Share system namespaces with host\n"
214 " --register=BOOLEAN Register container as machine\n"
215 " --keep-unit Do not register a scope for the machine, reuse\n"
216 " the service unit nspawn is running in\n"
217 " --volatile[=MODE] Run the system in volatile mode\n",
218 program_invocation_short_name);
221 static int parse_argv(int argc, char *argv[]) {
238 ARG_NETWORK_INTERFACE,
246 static const struct option options[] = {
247 { "help", no_argument, NULL, 'h' },
248 { "version", no_argument, NULL, ARG_VERSION },
249 { "directory", required_argument, NULL, 'D' },
250 { "user", required_argument, NULL, 'u' },
251 { "private-network", no_argument, NULL, ARG_PRIVATE_NETWORK },
252 { "boot", no_argument, NULL, 'b' },
253 { "uuid", required_argument, NULL, ARG_UUID },
254 { "read-only", no_argument, NULL, ARG_READ_ONLY },
255 { "capability", required_argument, NULL, ARG_CAPABILITY },
256 { "drop-capability", required_argument, NULL, ARG_DROP_CAPABILITY },
257 { "link-journal", required_argument, NULL, ARG_LINK_JOURNAL },
258 { "bind", required_argument, NULL, ARG_BIND },
259 { "bind-ro", required_argument, NULL, ARG_BIND_RO },
260 { "tmpfs", required_argument, NULL, ARG_TMPFS },
261 { "machine", required_argument, NULL, 'M' },
262 { "slice", required_argument, NULL, 'S' },
263 { "setenv", required_argument, NULL, ARG_SETENV },
264 { "selinux-context", required_argument, NULL, 'Z' },
265 { "selinux-apifs-context", required_argument, NULL, 'L' },
266 { "quiet", no_argument, NULL, 'q' },
267 { "share-system", no_argument, NULL, ARG_SHARE_SYSTEM },
268 { "register", required_argument, NULL, ARG_REGISTER },
269 { "keep-unit", no_argument, NULL, ARG_KEEP_UNIT },
270 { "network-interface", required_argument, NULL, ARG_NETWORK_INTERFACE },
271 { "network-macvlan", required_argument, NULL, ARG_NETWORK_MACVLAN },
272 { "network-veth", no_argument, NULL, ARG_NETWORK_VETH },
273 { "network-bridge", required_argument, NULL, ARG_NETWORK_BRIDGE },
274 { "personality", required_argument, NULL, ARG_PERSONALITY },
275 { "image", required_argument, NULL, 'i' },
276 { "volatile", optional_argument, NULL, ARG_VOLATILE },
281 uint64_t plus = 0, minus = 0;
286 while ((c = getopt_long(argc, argv, "+hD:u:bL:M:jS:Z:qi:", options, NULL)) >= 0)
295 puts(PACKAGE_STRING);
296 puts(SYSTEMD_FEATURES);
301 arg_directory = canonicalize_file_name(optarg);
302 if (!arg_directory) {
303 log_error("Invalid root directory: %m");
315 arg_user = strdup(optarg);
321 case ARG_NETWORK_BRIDGE:
322 arg_network_bridge = optarg;
326 case ARG_NETWORK_VETH:
327 arg_network_veth = true;
328 arg_private_network = true;
331 case ARG_NETWORK_INTERFACE:
332 if (strv_extend(&arg_network_interfaces, optarg) < 0)
335 arg_private_network = true;
338 case ARG_NETWORK_MACVLAN:
339 if (strv_extend(&arg_network_macvlan, optarg) < 0)
344 case ARG_PRIVATE_NETWORK:
345 arg_private_network = true;
353 r = sd_id128_from_string(optarg, &arg_uuid);
355 log_error("Invalid UUID: %s", optarg);
365 if (isempty(optarg)) {
370 if (!hostname_is_valid(optarg)) {
371 log_error("Invalid machine name: %s", optarg);
376 arg_machine = strdup(optarg);
384 arg_selinux_context = optarg;
388 arg_selinux_apifs_context = optarg;
392 arg_read_only = true;
396 case ARG_DROP_CAPABILITY: {
397 const char *state, *word;
400 FOREACH_WORD_SEPARATOR(word, length, optarg, ",", state) {
401 _cleanup_free_ char *t;
404 t = strndup(word, length);
408 if (streq(t, "all")) {
409 if (c == ARG_CAPABILITY)
410 plus = (uint64_t) -1;
412 minus = (uint64_t) -1;
414 if (cap_from_name(t, &cap) < 0) {
415 log_error("Failed to parse capability %s.", t);
419 if (c == ARG_CAPABILITY)
420 plus |= 1ULL << (uint64_t) cap;
422 minus |= 1ULL << (uint64_t) cap;
430 arg_link_journal = LINK_GUEST;
433 case ARG_LINK_JOURNAL:
434 if (streq(optarg, "auto"))
435 arg_link_journal = LINK_AUTO;
436 else if (streq(optarg, "no"))
437 arg_link_journal = LINK_NO;
438 else if (streq(optarg, "guest"))
439 arg_link_journal = LINK_GUEST;
440 else if (streq(optarg, "host"))
441 arg_link_journal = LINK_HOST;
443 log_error("Failed to parse link journal mode %s", optarg);
451 _cleanup_free_ char *a = NULL, *b = NULL;
455 x = c == ARG_BIND ? &arg_bind : &arg_bind_ro;
457 e = strchr(optarg, ':');
459 a = strndup(optarg, e - optarg);
469 if (!path_is_absolute(a) || !path_is_absolute(b)) {
470 log_error("Invalid bind mount specification: %s", optarg);
474 r = strv_extend(x, a);
478 r = strv_extend(x, b);
486 _cleanup_free_ char *a = NULL, *b = NULL;
489 e = strchr(optarg, ':');
491 a = strndup(optarg, e - optarg);
495 b = strdup("mode=0755");
501 if (!path_is_absolute(a)) {
502 log_error("Invalid tmpfs specification: %s", optarg);
506 r = strv_push(&arg_tmpfs, a);
512 r = strv_push(&arg_tmpfs, b);
524 if (!env_assignment_is_valid(optarg)) {
525 log_error("Environment variable assignment '%s' is not valid.", optarg);
529 n = strv_env_set(arg_setenv, optarg);
533 strv_free(arg_setenv);
542 case ARG_SHARE_SYSTEM:
543 arg_share_system = true;
547 r = parse_boolean(optarg);
549 log_error("Failed to parse --register= argument: %s", optarg);
557 arg_keep_unit = true;
560 case ARG_PERSONALITY:
562 arg_personality = personality_from_string(optarg);
563 if (arg_personality == 0xffffffffLU) {
564 log_error("Unknown or unsupported personality '%s'.", optarg);
573 arg_volatile = VOLATILE_YES;
575 r = parse_boolean(optarg);
577 if (streq(optarg, "state"))
578 arg_volatile = VOLATILE_STATE;
580 log_error("Failed to parse --volatile= argument: %s", optarg);
584 arg_volatile = r ? VOLATILE_YES : VOLATILE_NO;
593 assert_not_reached("Unhandled option");
596 if (arg_share_system)
597 arg_register = false;
599 if (arg_boot && arg_share_system) {
600 log_error("--boot and --share-system may not be combined.");
604 if (arg_keep_unit && cg_pid_get_owner_uid(0, NULL) >= 0) {
605 log_error("--keep-unit may not be used when invoked from a user session.");
609 if (arg_directory && arg_image) {
610 log_error("--directory= and --image= may not be combined.");
614 if (arg_volatile != VOLATILE_NO && arg_read_only) {
615 log_error("Cannot combine --read-only with --volatile. Note that --volatile already implies a read-only base hierarchy.");
619 arg_retain = (arg_retain | plus | (arg_private_network ? 1ULL << CAP_NET_ADMIN : 0)) & ~minus;
624 static int mount_all(const char *dest) {
626 typedef struct MountPoint {
635 static const MountPoint mount_table[] = {
636 { "proc", "/proc", "proc", NULL, MS_NOSUID|MS_NOEXEC|MS_NODEV, true },
637 { "/proc/sys", "/proc/sys", NULL, NULL, MS_BIND, true }, /* Bind mount first */
638 { NULL, "/proc/sys", NULL, NULL, MS_BIND|MS_RDONLY|MS_REMOUNT, true }, /* Then, make it r/o */
639 { "sysfs", "/sys", "sysfs", NULL, MS_RDONLY|MS_NOSUID|MS_NOEXEC|MS_NODEV, true },
640 { "tmpfs", "/dev", "tmpfs", "mode=755", MS_NOSUID|MS_STRICTATIME, true },
641 { "devpts", "/dev/pts", "devpts","newinstance,ptmxmode=0666,mode=620,gid=" STRINGIFY(TTY_GID), MS_NOSUID|MS_NOEXEC, true },
642 { "tmpfs", "/dev/shm", "tmpfs", "mode=1777", MS_NOSUID|MS_NODEV|MS_STRICTATIME, true },
643 { "tmpfs", "/run", "tmpfs", "mode=755", MS_NOSUID|MS_NODEV|MS_STRICTATIME, true },
645 { "/sys/fs/selinux", "/sys/fs/selinux", NULL, NULL, MS_BIND, false }, /* Bind mount first */
646 { NULL, "/sys/fs/selinux", NULL, NULL, MS_BIND|MS_RDONLY|MS_REMOUNT, false }, /* Then, make it r/o */
653 for (k = 0; k < ELEMENTSOF(mount_table); k++) {
654 _cleanup_free_ char *where = NULL;
656 _cleanup_free_ char *options = NULL;
661 where = strjoin(dest, "/", mount_table[k].where, NULL);
665 t = path_is_mount_point(where, true);
667 log_error("Failed to detect whether %s is a mount point: %s", where, strerror(-t));
675 /* Skip this entry if it is not a remount. */
676 if (mount_table[k].what && t > 0)
679 t = mkdir_p(where, 0755);
681 if (mount_table[k].fatal) {
682 log_error("Failed to create directory %s: %s", where, strerror(-t));
687 log_warning("Failed to create directory %s: %s", where, strerror(-t));
693 if (arg_selinux_apifs_context &&
694 (streq_ptr(mount_table[k].what, "tmpfs") || streq_ptr(mount_table[k].what, "devpts"))) {
695 options = strjoin(mount_table[k].options, ",context=\"", arg_selinux_apifs_context, "\"", NULL);
702 o = mount_table[k].options;
705 if (mount(mount_table[k].what,
708 mount_table[k].flags,
711 if (mount_table[k].fatal) {
712 log_error("mount(%s) failed: %m", where);
717 log_warning("mount(%s) failed: %m", where);
724 static int mount_binds(const char *dest, char **l, bool ro) {
727 STRV_FOREACH_PAIR(x, y, l) {
728 _cleanup_free_ char *where = NULL;
729 struct stat source_st, dest_st;
732 if (stat(*x, &source_st) < 0) {
733 log_error("Failed to stat %s: %m", *x);
737 where = strappend(dest, *y);
741 r = stat(where, &dest_st);
743 if ((source_st.st_mode & S_IFMT) != (dest_st.st_mode & S_IFMT)) {
744 log_error("The file types of %s and %s do not match. Refusing bind mount", *x, where);
747 } else if (errno == ENOENT) {
748 r = mkdir_parents_label(where, 0755);
750 log_error("Failed to bind mount %s: %s", *x, strerror(-r));
754 log_error("Failed to bind mount %s: %m", *x);
758 /* Create the mount point, but be conservative -- refuse to create block
759 * and char devices. */
760 if (S_ISDIR(source_st.st_mode)) {
761 r = mkdir_label(where, 0755);
762 if (r < 0 && errno != EEXIST) {
763 log_error("Failed to create mount point %s: %s", where, strerror(-r));
767 } else if (S_ISFIFO(source_st.st_mode)) {
768 r = mkfifo(where, 0644);
769 if (r < 0 && errno != EEXIST) {
770 log_error("Failed to create mount point %s: %m", where);
774 } else if (S_ISSOCK(source_st.st_mode)) {
775 r = mknod(where, 0644 | S_IFSOCK, 0);
776 if (r < 0 && errno != EEXIST) {
777 log_error("Failed to create mount point %s: %m", where);
781 } else if (S_ISREG(source_st.st_mode)) {
784 log_error("Failed to create mount point %s: %s", where, strerror(-r));
789 log_error("Refusing to create mountpoint for file: %s", *x);
793 if (mount(*x, where, "bind", MS_BIND, NULL) < 0) {
794 log_error("mount(%s) failed: %m", where);
799 r = bind_remount_recursive(where, true);
801 log_error("Read-Only bind mount failed: %s", strerror(-r));
810 static int mount_tmpfs(const char *dest) {
813 STRV_FOREACH_PAIR(i, o, arg_tmpfs) {
814 _cleanup_free_ char *where = NULL;
817 where = strappend(dest, *i);
821 r = mkdir_label(where, 0755);
823 log_error("creating mount point for tmpfs %s failed: %s", where, strerror(-r));
828 if (mount("tmpfs", where, "tmpfs", MS_NODEV|MS_STRICTATIME, *o) < 0) {
829 log_error("tmpfs mount to %s failed: %m", where);
837 static int setup_timezone(const char *dest) {
838 _cleanup_free_ char *where = NULL, *p = NULL, *q = NULL, *check = NULL, *what = NULL;
844 /* Fix the timezone, if possible */
845 r = readlink_malloc("/etc/localtime", &p);
847 log_warning("/etc/localtime is not a symlink, not updating container timezone.");
851 z = path_startswith(p, "../usr/share/zoneinfo/");
853 z = path_startswith(p, "/usr/share/zoneinfo/");
855 log_warning("/etc/localtime does not point into /usr/share/zoneinfo/, not updating container timezone.");
859 where = strappend(dest, "/etc/localtime");
863 r = readlink_malloc(where, &q);
865 y = path_startswith(q, "../usr/share/zoneinfo/");
867 y = path_startswith(q, "/usr/share/zoneinfo/");
869 /* Already pointing to the right place? Then do nothing .. */
870 if (y && streq(y, z))
874 check = strjoin(dest, "/usr/share/zoneinfo/", z, NULL);
878 if (access(check, F_OK) < 0) {
879 log_warning("Timezone %s does not exist in container, not updating container timezone.", z);
883 what = strappend("../usr/share/zoneinfo/", z);
887 r = mkdir_parents(where, 0755);
889 log_error("Failed to create directory for timezone info %s in container: %s", where, strerror(-r));
895 if (r < 0 && errno != ENOENT) {
896 log_error("Failed to remove existing timezone info %s in container: %m", where);
901 if (symlink(what, where) < 0) {
902 log_error("Failed to correct timezone of container: %m");
909 static int setup_resolv_conf(const char *dest) {
910 _cleanup_free_ char *where = NULL;
915 if (arg_private_network)
918 /* Fix resolv.conf, if possible */
919 where = strappend(dest, "/etc/resolv.conf");
923 /* We don't really care about the result of this. If it
924 * fails, it fails, but meh... */
925 r = mkdir_parents(where, 0755);
927 log_warning("Failed to create parent directory for resolv.conf %s: %s", where, strerror(-r));
932 r = copy_file("/etc/resolv.conf", where, O_TRUNC|O_NOFOLLOW, 0644);
934 log_warning("Failed to copy /etc/resolv.conf to %s: %s", where, strerror(-r));
942 static int setup_volatile_state(const char *directory) {
948 if (arg_volatile != VOLATILE_STATE)
951 /* --volatile=state means we simply overmount /var
952 with a tmpfs, and the rest read-only. */
954 r = bind_remount_recursive(directory, true);
956 log_error("Failed to remount %s read-only: %s", directory, strerror(-r));
960 p = strappenda(directory, "/var");
962 if (r < 0 && errno != EEXIST) {
963 log_error("Failed to create %s: %m", directory);
967 if (mount("tmpfs", p, "tmpfs", MS_STRICTATIME, "mode=755") < 0) {
968 log_error("Failed to mount tmpfs to /var: %m");
975 static int setup_volatile(const char *directory) {
976 bool tmpfs_mounted = false, bind_mounted = false;
977 char template[] = "/tmp/nspawn-volatile-XXXXXX";
983 if (arg_volatile != VOLATILE_YES)
986 /* --volatile=yes means we mount a tmpfs to the root dir, and
987 bind mount the original /usr into it, read-only. */
989 if (!mkdtemp(template)) {
990 log_error("Failed to create temporary directory: %m");
994 if (mount("tmpfs", template, "tmpfs", MS_STRICTATIME, "mode=755") < 0) {
995 log_error("Failed to mount tmpfs for root directory: %m");
1000 tmpfs_mounted = true;
1002 f = strappenda(directory, "/usr");
1003 t = strappenda(template, "/usr");
1006 if (r < 0 && errno != EEXIST) {
1007 log_error("Failed to create %s: %m", t);
1012 if (mount(f, t, "bind", MS_BIND|MS_REC, NULL) < 0) {
1013 log_error("Failed to create /usr bind mount: %m");
1018 bind_mounted = true;
1020 r = bind_remount_recursive(t, true);
1022 log_error("Failed to remount %s read-only: %s", t, strerror(-r));
1026 if (mount(template, directory, NULL, MS_MOVE, NULL) < 0) {
1027 log_error("Failed to move root mount: %m");
1045 static char* id128_format_as_uuid(sd_id128_t id, char s[37]) {
1048 "%02x%02x%02x%02x-%02x%02x-%02x%02x-%02x%02x-%02x%02x%02x%02x%02x%02x",
1049 SD_ID128_FORMAT_VAL(id));
1054 static int setup_boot_id(const char *dest) {
1055 _cleanup_free_ char *from = NULL, *to = NULL;
1056 sd_id128_t rnd = {};
1062 if (arg_share_system)
1065 /* Generate a new randomized boot ID, so that each boot-up of
1066 * the container gets a new one */
1068 from = strappend(dest, "/dev/proc-sys-kernel-random-boot-id");
1069 to = strappend(dest, "/proc/sys/kernel/random/boot_id");
1073 r = sd_id128_randomize(&rnd);
1075 log_error("Failed to generate random boot id: %s", strerror(-r));
1079 id128_format_as_uuid(rnd, as_uuid);
1081 r = write_string_file(from, as_uuid);
1083 log_error("Failed to write boot id: %s", strerror(-r));
1087 if (mount(from, to, "bind", MS_BIND, NULL) < 0) {
1088 log_error("Failed to bind mount boot id: %m");
1090 } else if (mount(from, to, "bind", MS_BIND|MS_REMOUNT|MS_RDONLY, NULL))
1091 log_warning("Failed to make boot id read-only: %m");
1097 static int copy_devnodes(const char *dest) {
1099 static const char devnodes[] =
1110 _cleanup_umask_ mode_t u;
1116 NULSTR_FOREACH(d, devnodes) {
1117 _cleanup_free_ char *from = NULL, *to = NULL;
1120 from = strappend("/dev/", d);
1121 to = strjoin(dest, "/dev/", d, NULL);
1125 if (stat(from, &st) < 0) {
1127 if (errno != ENOENT) {
1128 log_error("Failed to stat %s: %m", from);
1132 } else if (!S_ISCHR(st.st_mode) && !S_ISBLK(st.st_mode)) {
1134 log_error("%s is not a char or block device, cannot copy", from);
1138 r = mkdir_parents(to, 0775);
1140 log_error("Failed to create parent directory of %s: %s", to, strerror(-r));
1144 if (mknod(to, st.st_mode, st.st_rdev) < 0) {
1145 log_error("mknod(%s) failed: %m", dest);
1154 static int setup_ptmx(const char *dest) {
1155 _cleanup_free_ char *p = NULL;
1157 p = strappend(dest, "/dev/ptmx");
1161 if (symlink("pts/ptmx", p) < 0) {
1162 log_error("Failed to create /dev/ptmx symlink: %m");
1169 static int setup_dev_console(const char *dest, const char *console) {
1170 _cleanup_umask_ mode_t u;
1180 if (stat("/dev/null", &st) < 0) {
1181 log_error("Failed to stat /dev/null: %m");
1185 r = chmod_and_chown(console, 0600, 0, 0);
1187 log_error("Failed to correct access mode for TTY: %s", strerror(-r));
1191 /* We need to bind mount the right tty to /dev/console since
1192 * ptys can only exist on pts file systems. To have something
1193 * to bind mount things on we create a device node first, and
1194 * use /dev/null for that since the cgroups device policy
1195 * allows us to create that freely, while we cannot create
1196 * /dev/console. (Note that the major minor doesn't actually
1197 * matter here, since we mount it over anyway). */
1199 to = strappenda(dest, "/dev/console");
1200 if (mknod(to, (st.st_mode & ~07777) | 0600, st.st_rdev) < 0) {
1201 log_error("mknod() for /dev/console failed: %m");
1205 if (mount(console, to, "bind", MS_BIND, NULL) < 0) {
1206 log_error("Bind mount for /dev/console failed: %m");
1213 static int setup_kmsg(const char *dest, int kmsg_socket) {
1214 _cleanup_free_ char *from = NULL, *to = NULL;
1216 _cleanup_umask_ mode_t u;
1218 struct cmsghdr cmsghdr;
1219 uint8_t buf[CMSG_SPACE(sizeof(int))];
1221 struct msghdr mh = {
1222 .msg_control = &control,
1223 .msg_controllen = sizeof(control),
1225 struct cmsghdr *cmsg;
1228 assert(kmsg_socket >= 0);
1232 /* We create the kmsg FIFO as /dev/kmsg, but immediately
1233 * delete it after bind mounting it to /proc/kmsg. While FIFOs
1234 * on the reading side behave very similarly to /proc/kmsg,
1235 * their writing side behaves differently from /dev/kmsg in
1236 * that writing blocks when nothing is reading. In order to
1237 * avoid any problems with containers deadlocking due to this
1238 * we simply make /dev/kmsg unavailable to the container. */
1239 if (asprintf(&from, "%s/dev/kmsg", dest) < 0 ||
1240 asprintf(&to, "%s/proc/kmsg", dest) < 0)
1243 if (mkfifo(from, 0600) < 0) {
1244 log_error("mkfifo() for /dev/kmsg failed: %m");
1248 r = chmod_and_chown(from, 0600, 0, 0);
1250 log_error("Failed to correct access mode for /dev/kmsg: %s", strerror(-r));
1254 if (mount(from, to, "bind", MS_BIND, NULL) < 0) {
1255 log_error("Bind mount for /proc/kmsg failed: %m");
1259 fd = open(from, O_RDWR|O_NDELAY|O_CLOEXEC);
1261 log_error("Failed to open fifo: %m");
1265 cmsg = CMSG_FIRSTHDR(&mh);
1266 cmsg->cmsg_level = SOL_SOCKET;
1267 cmsg->cmsg_type = SCM_RIGHTS;
1268 cmsg->cmsg_len = CMSG_LEN(sizeof(int));
1269 memcpy(CMSG_DATA(cmsg), &fd, sizeof(int));
1271 mh.msg_controllen = cmsg->cmsg_len;
1273 /* Store away the fd in the socket, so that it stays open as
1274 * long as we run the child */
1275 k = sendmsg(kmsg_socket, &mh, MSG_DONTWAIT|MSG_NOSIGNAL);
1279 log_error("Failed to send FIFO fd: %m");
1283 /* And now make the FIFO unavailable as /dev/kmsg... */
1288 static int setup_hostname(void) {
1290 if (arg_share_system)
1293 if (sethostname_idempotent(arg_machine) < 0)
1299 static int setup_journal(const char *directory) {
1300 sd_id128_t machine_id, this_id;
1301 _cleanup_free_ char *p = NULL, *b = NULL, *q = NULL, *d = NULL;
1305 p = strappend(directory, "/etc/machine-id");
1309 r = read_one_line_file(p, &b);
1310 if (r == -ENOENT && arg_link_journal == LINK_AUTO)
1313 log_error("Failed to read machine ID from %s: %s", p, strerror(-r));
1318 if (isempty(id) && arg_link_journal == LINK_AUTO)
1321 /* Verify validity */
1322 r = sd_id128_from_string(id, &machine_id);
1324 log_error("Failed to parse machine ID from %s: %s", p, strerror(-r));
1328 r = sd_id128_get_machine(&this_id);
1330 log_error("Failed to retrieve machine ID: %s", strerror(-r));
1334 if (sd_id128_equal(machine_id, this_id)) {
1335 log_full(arg_link_journal == LINK_AUTO ? LOG_WARNING : LOG_ERR,
1336 "Host and machine ids are equal (%s): refusing to link journals", id);
1337 if (arg_link_journal == LINK_AUTO)
1343 if (arg_link_journal == LINK_NO)
1347 p = strappend("/var/log/journal/", id);
1348 q = strjoin(directory, "/var/log/journal/", id, NULL);
1352 if (path_is_mount_point(p, false) > 0) {
1353 if (arg_link_journal != LINK_AUTO) {
1354 log_error("%s: already a mount point, refusing to use for journal", p);
1361 if (path_is_mount_point(q, false) > 0) {
1362 if (arg_link_journal != LINK_AUTO) {
1363 log_error("%s: already a mount point, refusing to use for journal", q);
1370 r = readlink_and_make_absolute(p, &d);
1372 if ((arg_link_journal == LINK_GUEST ||
1373 arg_link_journal == LINK_AUTO) &&
1376 r = mkdir_p(q, 0755);
1378 log_warning("Failed to create directory %s: %m", q);
1382 if (unlink(p) < 0) {
1383 log_error("Failed to remove symlink %s: %m", p);
1386 } else if (r == -EINVAL) {
1388 if (arg_link_journal == LINK_GUEST &&
1391 if (errno == ENOTDIR) {
1392 log_error("%s already exists and is neither a symlink nor a directory", p);
1395 log_error("Failed to remove %s: %m", p);
1399 } else if (r != -ENOENT) {
1400 log_error("readlink(%s) failed: %m", p);
1404 if (arg_link_journal == LINK_GUEST) {
1406 if (symlink(q, p) < 0) {
1407 log_error("Failed to symlink %s to %s: %m", q, p);
1411 r = mkdir_p(q, 0755);
1413 log_warning("Failed to create directory %s: %m", q);
1417 if (arg_link_journal == LINK_HOST) {
1418 r = mkdir_p(p, 0755);
1420 log_error("Failed to create %s: %m", p);
1424 } else if (access(p, F_OK) < 0)
1427 if (dir_is_empty(q) == 0)
1428 log_warning("%s is not empty, proceeding anyway.", q);
1430 r = mkdir_p(q, 0755);
1432 log_error("Failed to create %s: %m", q);
1436 if (mount(p, q, "bind", MS_BIND, NULL) < 0) {
1437 log_error("Failed to bind mount journal from host into guest: %m");
1444 static int drop_capabilities(void) {
1445 return capability_bounding_set_drop(~arg_retain, false);
1448 static int register_machine(pid_t pid, int local_ifindex) {
1449 _cleanup_bus_error_free_ sd_bus_error error = SD_BUS_ERROR_NULL;
1450 _cleanup_bus_close_unref_ sd_bus *bus = NULL;
1456 r = sd_bus_default_system(&bus);
1458 log_error("Failed to open system bus: %s", strerror(-r));
1462 if (arg_keep_unit) {
1463 r = sd_bus_call_method(
1465 "org.freedesktop.machine1",
1466 "/org/freedesktop/machine1",
1467 "org.freedesktop.machine1.Manager",
1468 "RegisterMachineWithNetwork",
1473 SD_BUS_MESSAGE_APPEND_ID128(arg_uuid),
1477 strempty(arg_directory),
1478 local_ifindex > 0 ? 1 : 0, local_ifindex);
1480 _cleanup_bus_message_unref_ sd_bus_message *m = NULL;
1482 r = sd_bus_message_new_method_call(
1485 "org.freedesktop.machine1",
1486 "/org/freedesktop/machine1",
1487 "org.freedesktop.machine1.Manager",
1488 "CreateMachineWithNetwork");
1490 log_error("Failed to create message: %s", strerror(-r));
1494 r = sd_bus_message_append(
1498 SD_BUS_MESSAGE_APPEND_ID128(arg_uuid),
1502 strempty(arg_directory),
1503 local_ifindex > 0 ? 1 : 0, local_ifindex);
1505 log_error("Failed to append message arguments: %s", strerror(-r));
1509 r = sd_bus_message_open_container(m, 'a', "(sv)");
1511 log_error("Failed to open container: %s", strerror(-r));
1515 if (!isempty(arg_slice)) {
1516 r = sd_bus_message_append(m, "(sv)", "Slice", "s", arg_slice);
1518 log_error("Failed to append slice: %s", strerror(-r));
1523 r = sd_bus_message_append(m, "(sv)", "DevicePolicy", "s", "strict");
1525 log_error("Failed to add device policy: %s", strerror(-r));
1529 r = sd_bus_message_append(m, "(sv)", "DeviceAllow", "a(ss)", 9,
1530 /* Allow the container to
1531 * access and create the API
1532 * device nodes, so that
1533 * PrivateDevices= in the
1534 * container can work
1539 "/dev/random", "rwm",
1540 "/dev/urandom", "rwm",
1542 "/dev/net/tun", "rwm",
1543 /* Allow the container
1544 * access to ptys. However,
1546 * container to ever create
1547 * these device nodes. */
1548 "/dev/pts/ptmx", "rw",
1551 log_error("Failed to add device whitelist: %s", strerror(-r));
1555 r = sd_bus_message_close_container(m);
1557 log_error("Failed to close container: %s", strerror(-r));
1561 r = sd_bus_call(bus, m, 0, &error, NULL);
1565 log_error("Failed to register machine: %s", bus_error_message(&error, r));
1572 static int terminate_machine(pid_t pid) {
1573 _cleanup_bus_error_free_ sd_bus_error error = SD_BUS_ERROR_NULL;
1574 _cleanup_bus_message_unref_ sd_bus_message *reply = NULL;
1575 _cleanup_bus_close_unref_ sd_bus *bus = NULL;
1582 r = sd_bus_default_system(&bus);
1584 log_error("Failed to open system bus: %s", strerror(-r));
1588 r = sd_bus_call_method(
1590 "org.freedesktop.machine1",
1591 "/org/freedesktop/machine1",
1592 "org.freedesktop.machine1.Manager",
1599 /* Note that the machine might already have been
1600 * cleaned up automatically, hence don't consider it a
1601 * failure if we cannot get the machine object. */
1602 log_debug("Failed to get machine: %s", bus_error_message(&error, r));
1606 r = sd_bus_message_read(reply, "o", &path);
1608 return bus_log_parse_error(r);
1610 r = sd_bus_call_method(
1612 "org.freedesktop.machine1",
1614 "org.freedesktop.machine1.Machine",
1620 log_debug("Failed to terminate machine: %s", bus_error_message(&error, r));
1627 static int reset_audit_loginuid(void) {
1628 _cleanup_free_ char *p = NULL;
1631 if (arg_share_system)
1634 r = read_one_line_file("/proc/self/loginuid", &p);
1638 log_error("Failed to read /proc/self/loginuid: %s", strerror(-r));
1642 /* Already reset? */
1643 if (streq(p, "4294967295"))
1646 r = write_string_file("/proc/self/loginuid", "4294967295");
1648 log_error("Failed to reset audit login UID. This probably means that your kernel is too\n"
1649 "old and you have audit enabled. Note that the auditing subsystem is known to\n"
1650 "be incompatible with containers on old kernels. Please make sure to upgrade\n"
1651 "your kernel or to off auditing with 'audit=0' on the kernel command line before\n"
1652 "using systemd-nspawn. Sleeping for 5s... (%s)\n", strerror(-r));
1660 #define HOST_HASH_KEY SD_ID128_MAKE(1a,37,6f,c7,46,ec,45,0b,ad,a3,d5,31,06,60,5d,b1)
1661 #define CONTAINER_HASH_KEY SD_ID128_MAKE(c3,c4,f9,19,b5,57,b2,1c,e6,cf,14,27,03,9c,ee,a2)
/* Derive an Ethernet address from the host machine ID and arg_machine.
 *
 * A buffer v of sizeof(sd_id128_t) + strlen(arg_machine) bytes is filled
 * with the ID returned by sd_id128_get_machine() followed by the bytes of
 * arg_machine (no terminating NUL). That buffer is hashed with siphash24()
 * keyed by hash_key.bytes, and the first ETH_ALEN bytes of the result become
 * the address, after clearing the multicast bit and setting the
 * locally-assigned bit in the first octet.
 *
 * mac:      receives the generated address.
 * hash_key: siphash key; callers in this file pass HOST_HASH_KEY or
 *           CONTAINER_HASH_KEY.
 *
 * The allocation of v, the declaration of result and the return statements
 * are on lines that are not part of this listing. */
1663 static int generate_mac(struct ether_addr *mac, sd_id128_t hash_key) {
1670 l = strlen(arg_machine);
1671 sz = sizeof(sd_id128_t) + l;
1674 /* fetch some persistent data unique to the host */
1675 r = sd_id128_get_machine((sd_id128_t*) v);
1679 /* combine with some data unique (on this host) to this
1680 * container instance */
1681 memcpy(v + sizeof(sd_id128_t), arg_machine, l);
1683 /* Let's hash the host machine ID plus the container name. We
1684 * use a fixed, but originally randomly created hash key here. */
1685 siphash24(result, v, sz, hash_key.bytes);
/* Compile-time guarantee that the hash output is large enough to supply
 * all ETH_ALEN address bytes. */
1687 assert_cc(ETH_ALEN <= sizeof(result));
1688 memcpy(mac->ether_addr_octet, result, ETH_ALEN);
1690 /* see eth_random_addr in the kernel */
1691 mac->ether_addr_octet[0] &= 0xfe; /* clear multicast bit */
1692 mac->ether_addr_octet[0] |= 0x02; /* set local assignment bit (IEEE802) */
/* Create a veth pair over rtnetlink, with one end placed in the network
 * namespace of pid.
 *
 * pid:        written into the peer's IFLA_NET_NS_PID attribute.
 * iface_name: buffer that receives the host-side interface name,
 *             "vb-<arg_machine>" when arg_network_bridge is set and
 *             "ve-<arg_machine>" otherwise.
 * ifi:        output parameter; its assignment is not part of this listing
 *             (presumably the index obtained from if_nametoindex() - confirm).
 *
 * arg_private_network and arg_network_veth are tested first; the actions
 * taken for those tests are elided from this listing.
 *
 * Message layout built below (RTM_NEWLINK):
 *   IFLA_IFNAME  = iface_name, IFLA_ADDRESS = mac_host
 *   IFLA_LINKINFO { IFLA_INFO_DATA "veth" { VETH_INFO_PEER {
 *       IFLA_IFNAME = "host0", IFLA_ADDRESS = mac_container,
 *       IFLA_NET_NS_PID = pid } } }
 * The three sd_rtnl_message_close_container() calls match the three
 * containers opened before them. */
1697 static int setup_veth(pid_t pid, char iface_name[IFNAMSIZ], int *ifi) {
1698 _cleanup_rtnl_message_unref_ sd_rtnl_message *m = NULL;
1699 _cleanup_rtnl_unref_ sd_rtnl *rtnl = NULL;
1700 struct ether_addr mac_host, mac_container;
1703 if (!arg_private_network)
1706 if (!arg_network_veth)
1709 /* Use two different interface name prefixes depending whether
1710 * we are in bridge mode or not. */
/* NOTE(review): the size passed is IFNAMSIZ - 1, so snprintf() keeps at most
 * IFNAMSIZ - 2 characters plus the NUL. Long machine names are silently
 * truncated, and two names sharing a long prefix would yield the same
 * interface name - confirm this is acceptable. */
1711 snprintf(iface_name, IFNAMSIZ - 1, "%s-%s",
1712 arg_network_bridge ? "vb" : "ve", arg_machine);
1714 r = generate_mac(&mac_container, CONTAINER_HASH_KEY);
1716 log_error("Failed to generate predictable MAC address for container side");
1720 r = generate_mac(&mac_host, HOST_HASH_KEY);
1722 log_error("Failed to generate predictable MAC address for host side");
1726 r = sd_rtnl_open(&rtnl, 0);
1728 log_error("Failed to connect to netlink: %s", strerror(-r));
1732 r = sd_rtnl_message_new_link(rtnl, &m, RTM_NEWLINK, 0);
1734 log_error("Failed to allocate netlink message: %s", strerror(-r));
/* Attributes of the host-side end. */
1738 r = sd_rtnl_message_append_string(m, IFLA_IFNAME, iface_name);
1740 log_error("Failed to add netlink interface name: %s", strerror(-r));
1744 r = sd_rtnl_message_append_ether_addr(m, IFLA_ADDRESS, &mac_host);
1746 log_error("Failed to add netlink MAC address: %s", strerror(-r));
1750 r = sd_rtnl_message_open_container(m, IFLA_LINKINFO);
1752 log_error("Failed to open netlink container: %s", strerror(-r));
1756 r = sd_rtnl_message_open_container_union(m, IFLA_INFO_DATA, "veth");
1758 log_error("Failed to open netlink container: %s", strerror(-r));
/* Attributes of the peer end, which is named "host0". */
1762 r = sd_rtnl_message_open_container(m, VETH_INFO_PEER);
1764 log_error("Failed to open netlink container: %s", strerror(-r));
1768 r = sd_rtnl_message_append_string(m, IFLA_IFNAME, "host0");
1770 log_error("Failed to add netlink interface name: %s", strerror(-r));
1774 r = sd_rtnl_message_append_ether_addr(m, IFLA_ADDRESS, &mac_container);
1776 log_error("Failed to add netlink MAC address: %s", strerror(-r));
1780 r = sd_rtnl_message_append_u32(m, IFLA_NET_NS_PID, pid);
1782 log_error("Failed to add netlink namespace field: %s", strerror(-r));
1786 r = sd_rtnl_message_close_container(m);
1788 log_error("Failed to close netlink container: %s", strerror(-r));
1792 r = sd_rtnl_message_close_container(m);
1794 log_error("Failed to close netlink container: %s", strerror(-r));
1798 r = sd_rtnl_message_close_container(m);
1800 log_error("Failed to close netlink container: %s", strerror(-r));
1804 r = sd_rtnl_call(rtnl, m, 0, NULL);
1806 log_error("Failed to add new veth interfaces: %s", strerror(-r));
/* Look up the index of the freshly created host-side interface. */
1810 i = (int) if_nametoindex(iface_name);
1812 log_error("Failed to resolve interface %s: %m", iface_name);
/* Bring the interface named veth_name up and enslave it to the bridge
 * named by arg_network_bridge.
 *
 * arg_private_network, arg_network_veth and arg_network_bridge are tested
 * first; the actions taken for those tests are elided from this listing.
 * The bridge name is resolved with if_nametoindex(), then a single
 * RTM_SETLINK message is sent carrying the IFF_UP flag, IFLA_IFNAME =
 * veth_name and IFLA_MASTER = bridge index.
 *
 * veth_name: name of the interface to attach.
 * ifi:       output parameter; its assignment is not part of this listing. */
1821 static int setup_bridge(const char veth_name[], int *ifi) {
1822 _cleanup_rtnl_message_unref_ sd_rtnl_message *m = NULL;
1823 _cleanup_rtnl_unref_ sd_rtnl *rtnl = NULL;
1826 if (!arg_private_network)
1829 if (!arg_network_veth)
1832 if (!arg_network_bridge)
1835 bridge = (int) if_nametoindex(arg_network_bridge);
1837 log_error("Failed to resolve interface %s: %m", arg_network_bridge);
1843 r = sd_rtnl_open(&rtnl, 0);
1845 log_error("Failed to connect to netlink: %s", strerror(-r));
/* The message is created with index 0; the target link is identified by
 * the IFLA_IFNAME attribute appended below. */
1849 r = sd_rtnl_message_new_link(rtnl, &m, RTM_SETLINK, 0);
1851 log_error("Failed to allocate netlink message: %s", strerror(-r));
1855 r = sd_rtnl_message_link_set_flags(m, IFF_UP, IFF_UP);
1857 log_error("Failed to set IFF_UP flag: %s", strerror(-r));
1861 r = sd_rtnl_message_append_string(m, IFLA_IFNAME, veth_name);
1863 log_error("Failed to add netlink interface name field: %s", strerror(-r));
1867 r = sd_rtnl_message_append_u32(m, IFLA_MASTER, bridge);
1869 log_error("Failed to add netlink master field: %s", strerror(-r));
1873 r = sd_rtnl_call(rtnl, m, 0, NULL);
1875 log_error("Failed to add veth interface to bridge: %s", strerror(-r));
/* Resolve a network interface name and check it against udev.
 *
 * The name is turned into an interface index with if_nametoindex(), the
 * matching udev device is looked up by the device id "n<index>", and
 * udev_device_get_is_initialized() must report a positive value or the
 * "not initialized yet" error is logged.
 *
 * udev: udev context used for the lookup.
 * name: interface name to resolve.
 *
 * Return statements are elided from this listing; callers in this file
 * store the result in a variable named ifi, so a successful call
 * presumably returns the interface index - confirm. */
1882 static int parse_interface(struct udev *udev, const char *name) {
1883 _cleanup_udev_device_unref_ struct udev_device *d = NULL;
/* Room for the "n" prefix plus the decimal form of an int.
 * NOTE(review): assumes DECIMAL_STR_MAX() leaves space for the terminating
 * NUL, since sprintf() below is unbounded - confirm. */
1884 char ifi_str[2 + DECIMAL_STR_MAX(int)];
1887 ifi = (int) if_nametoindex(name);
1889 log_error("Failed to resolve interface %s: %m", name);
1893 sprintf(ifi_str, "n%i", ifi);
1894 d = udev_device_new_from_device_id(udev, ifi_str);
1896 log_error("Failed to get udev device for interface %s: %m", name);
1900 if (udev_device_get_is_initialized(d) <= 0) {
1901 log_error("Network interface %s is not initialized yet.", name);
/* Move every interface listed in arg_network_interfaces into the network
 * namespace of pid.
 *
 * arg_private_network and an empty arg_network_interfaces list are tested
 * first; the actions taken for those tests are elided from this listing.
 * For each list entry the name is resolved with parse_interface() and an
 * RTM_SETLINK message for that index is sent with IFLA_NET_NS_PID = pid.
 *
 * The creation of the udev context and all return statements are on lines
 * that are not part of this listing. */
1908 static int move_network_interfaces(pid_t pid) {
1909 _cleanup_udev_unref_ struct udev *udev = NULL;
1910 _cleanup_rtnl_unref_ sd_rtnl *rtnl = NULL;
1914 if (!arg_private_network)
1917 if (strv_isempty(arg_network_interfaces))
1920 r = sd_rtnl_open(&rtnl, 0);
1922 log_error("Failed to connect to netlink: %s", strerror(-r));
1928 log_error("Failed to connect to udev.");
1932 STRV_FOREACH(i, arg_network_interfaces) {
/* A fresh message per interface; the cleanup attribute releases it at the
 * end of each iteration. */
1933 _cleanup_rtnl_message_unref_ sd_rtnl_message *m = NULL;
1936 ifi = parse_interface(udev, *i);
1940 r = sd_rtnl_message_new_link(rtnl, &m, RTM_SETLINK, ifi);
1942 log_error("Failed to allocate netlink message: %s", strerror(-r));
1946 r = sd_rtnl_message_append_u32(m, IFLA_NET_NS_PID, pid);
1948 log_error("Failed to append namespace PID to netlink message: %s", strerror(-r));
1952 r = sd_rtnl_call(rtnl, m, 0, NULL);
1954 log_error("Failed to move interface %s to namespace: %s", *i, strerror(-r));
/* Create one macvlan interface per entry of arg_network_macvlan and place
 * it in the network namespace of pid.
 *
 * arg_private_network and an empty arg_network_macvlan list are tested
 * first; the actions taken for those tests are elided from this listing.
 * For each entry the parent interface is resolved with parse_interface()
 * and an RTM_NEWLINK message is sent with:
 *   IFLA_LINK       = parent interface index
 *   IFLA_IFNAME     = "mv-<entry>" shortened to IFNAMSIZ-1 characters
 *   IFLA_NET_NS_PID = pid
 *   IFLA_LINKINFO { IFLA_INFO_DATA "macvlan" {
 *       IFLA_MACVLAN_MODE = MACVLAN_MODE_BRIDGE } }
 *
 * The creation of the udev context and all return statements are on lines
 * that are not part of this listing. */
1962 static int setup_macvlan(pid_t pid) {
1963 _cleanup_udev_unref_ struct udev *udev = NULL;
1964 _cleanup_rtnl_unref_ sd_rtnl *rtnl = NULL;
1968 if (!arg_private_network)
1971 if (strv_isempty(arg_network_macvlan))
1974 r = sd_rtnl_open(&rtnl, 0);
1976 log_error("Failed to connect to netlink: %s", strerror(-r));
1982 log_error("Failed to connect to udev.");
1986 STRV_FOREACH(i, arg_network_macvlan) {
1987 _cleanup_rtnl_message_unref_ sd_rtnl_message *m = NULL;
1988 _cleanup_free_ char *n = NULL;
1991 ifi = parse_interface(udev, *i);
1995 r = sd_rtnl_message_new_link(rtnl, &m, RTM_NEWLINK, 0);
1997 log_error("Failed to allocate netlink message: %s", strerror(-r));
2001 r = sd_rtnl_message_append_u32(m, IFLA_LINK, ifi);
2003 log_error("Failed to add netlink interface index: %s", strerror(-r));
/* Name of the new interface: parent name with an "mv-" prefix, cut down to
 * at most IFNAMSIZ-1 characters.
 * NOTE(review): parent names sharing a long prefix would collide after
 * shortening - confirm this is acceptable. */
2007 n = strappend("mv-", *i);
2011 strshorten(n, IFNAMSIZ-1);
2013 r = sd_rtnl_message_append_string(m, IFLA_IFNAME, n);
2015 log_error("Failed to add netlink interface name: %s", strerror(-r));
2019 r = sd_rtnl_message_append_u32(m, IFLA_NET_NS_PID, pid);
2021 log_error("Failed to add netlink namespace field: %s", strerror(-r));
2025 r = sd_rtnl_message_open_container(m, IFLA_LINKINFO);
2027 log_error("Failed to open netlink container: %s", strerror(-r));
2031 r = sd_rtnl_message_open_container_union(m, IFLA_INFO_DATA, "macvlan");
2033 log_error("Failed to open netlink container: %s", strerror(-r));
2037 r = sd_rtnl_message_append_u32(m, IFLA_MACVLAN_MODE, MACVLAN_MODE_BRIDGE);
2039 log_error("Failed to append macvlan mode: %s", strerror(-r));
/* Two closes for the two containers opened above. */
2043 r = sd_rtnl_message_close_container(m);
2045 log_error("Failed to close netlink container: %s", strerror(-r));
2049 r = sd_rtnl_message_close_container(m);
2051 log_error("Failed to close netlink container: %s", strerror(-r));
2055 r = sd_rtnl_call(rtnl, m, 0, NULL);
2057 log_error("Failed to add new macvlan interfaces: %s", strerror(-r));
/* Install the seccomp filter used for the container payload.
 *
 * The filter is created with a default action of SCMP_ACT_ALLOW, extended
 * with secondary architectures, and then given:
 *   - one rule per entry of blacklist[] that fails the syscall with EPERM;
 *   - one rule that fails with EAFNOSUPPORT when argument 0 equals
 *     AF_NETLINK and argument 2 equals NETLINK_AUDIT (the syscall argument
 *     of that rule is elided from this listing; the prose below names
 *     socket()).
 * SCMP_FLTATR_CTL_NNP is set to 0 before seccomp_load(), and the context is
 * released with seccomp_release().
 *
 * NOTE(review): listing lines 2105-2111 are explanatory prose whose comment
 * delimiters are not part of this listing.
 * Any further blacklist entries, the conditions guarding the log calls and
 * the return statements are likewise not shown. */
2065 static int setup_seccomp(void) {
2068 static const int blacklist[] = {
2069 SCMP_SYS(kexec_load),
2070 SCMP_SYS(open_by_handle_at),
2071 SCMP_SYS(init_module),
2072 SCMP_SYS(finit_module),
2073 SCMP_SYS(delete_module),
2080 scmp_filter_ctx seccomp;
2084 seccomp = seccomp_init(SCMP_ACT_ALLOW);
2088 r = seccomp_add_secondary_archs(seccomp);
2090 log_error("Failed to add secondary archs to seccomp filter: %s", strerror(-r));
2094 for (i = 0; i < ELEMENTSOF(blacklist); i++) {
2095 r = seccomp_rule_add(seccomp, SCMP_ACT_ERRNO(EPERM), blacklist[i], 0);
2097 continue; /* unknown syscall */
2099 log_error("Failed to block syscall: %s", strerror(-r));
2105 Audit is broken in containers, much of the userspace audit
2106 hookup will fail if running inside a container. We don't
2107 care and just turn off creation of audit sockets.
2109 This will make socket(AF_NETLINK, *, NETLINK_AUDIT) fail
2110 with EAFNOSUPPORT which audit userspace uses as indication
2111 that audit is disabled in the kernel.
2114 r = seccomp_rule_add(
2116 SCMP_ACT_ERRNO(EAFNOSUPPORT),
2119 SCMP_A0(SCMP_CMP_EQ, AF_NETLINK),
2120 SCMP_A2(SCMP_CMP_EQ, NETLINK_AUDIT));
2122 log_error("Failed to add audit seccomp rule: %s", strerror(-r));
/* Per the log message below, writing 0 here is meant to keep loading the
 * filter from turning on NO_NEW_PRIVS. */
2126 r = seccomp_attr_set(seccomp, SCMP_FLTATR_CTL_NNP, 0);
2128 log_error("Failed to unset NO_NEW_PRIVS: %s", strerror(-r));
2132 r = seccomp_load(seccomp);
2134 log_error("Failed to install seccomp audit filter: %s", strerror(-r));
2137 seccomp_release(seccomp);
/* Open arg_image and, when it is a regular file, attach it to a free loop
 * device.
 *
 * device_path: receives the path of the device to use (the loop device
 *              path on the regular-file route shown below).
 * loop_nr:     output parameter; its assignment is not part of this listing.
 *
 * Visible flow: open arg_image read-only or read-write depending on
 * arg_read_only, fstat() it, and branch on the file type. For a block
 * device the path is duplicated with strdup() (the rest of that branch is
 * elided). Otherwise the file must be regular; a free loop number is
 * obtained from /dev/loop-control with LOOP_CTL_GET_FREE, /dev/loop<nr> is
 * opened, the image is attached with LOOP_SET_FD and configured with
 * LOOP_SET_STATUS64.
 *
 * The return statements are elided; main() stores the result in image_fd,
 * so a file descriptor is presumably returned on success - confirm. */
2145 static int setup_image(char **device_path, int *loop_nr) {
/* AUTOCLEAR and PARTSCAN are always requested; READ_ONLY is OR-ed in
 * further down (the condition for that is elided from this listing). */
2146 struct loop_info64 info = {
2147 .lo_flags = LO_FLAGS_AUTOCLEAR|LO_FLAGS_PARTSCAN
2149 _cleanup_close_ int fd = -1, control = -1, loop = -1;
2150 _cleanup_free_ char* loopdev = NULL;
2154 assert(device_path);
2157 fd = open(arg_image, O_CLOEXEC|(arg_read_only ? O_RDONLY : O_RDWR)|O_NONBLOCK|O_NOCTTY);
2159 log_error("Failed to open %s: %m", arg_image);
2163 if (fstat(fd, &st) < 0) {
2164 log_error("Failed to stat %s: %m", arg_image);
2168 if (S_ISBLK(st.st_mode)) {
2171 p = strdup(arg_image);
/* NOTE(review): the message below ends in %m, but the visible call before
 * it is an fstat() that succeeded, so errno is probably unrelated to this
 * condition - confirm and consider dropping the %m. */
2185 if (!S_ISREG(st.st_mode)) {
2186 log_error("%s is not a regular file or block device: %m", arg_image);
2190 control = open("/dev/loop-control", O_RDWR|O_CLOEXEC|O_NOCTTY|O_NONBLOCK);
2192 log_error("Failed to open /dev/loop-control: %m");
2196 nr = ioctl(control, LOOP_CTL_GET_FREE);
2198 log_error("Failed to allocate loop device: %m");
2202 if (asprintf(&loopdev, "/dev/loop%i", nr) < 0)
/* The loop device is opened with the same access mode as the image. */
2205 loop = open(loopdev, O_CLOEXEC|(arg_read_only ? O_RDONLY : O_RDWR)|O_NONBLOCK|O_NOCTTY);
2207 log_error("Failed to open loop device %s: %m", loopdev);
2211 if (ioctl(loop, LOOP_SET_FD, fd) < 0) {
2212 log_error("Failed to set loopback file descriptor on %s: %m", loopdev);
2217 info.lo_flags |= LO_FLAGS_READ_ONLY;
2219 if (ioctl(loop, LOOP_SET_STATUS64, &info) < 0) {
2220 log_error("Failed to set loopback settings on %s: %m", loopdev);
/* Hands the loopdev string to the caller. loopdev is declared
 * _cleanup_free_, so it presumably gets reset to NULL on an elided line to
 * avoid freeing what the caller now holds - confirm. */
2224 *device_path = loopdev;
/* Locate the root, /home and /srv partitions inside a GPT disk image.
 *
 * Parameters visible in this listing (others are elided):
 *   root_device / root_device_rw: device node chosen for the root file
 *       system and whether it may be mounted writable.
 *   home_device / home_device_rw: same for the GPT_HOME partition.
 *   srv_device  / srv_device_rw:  same for the GPT_SRV partition.
 * The blkid probe is attached to a descriptor named fd.
 *
 * Visible flow:
 *   1. Probe fd with libblkid with partition probing enabled and require
 *      the PTTYPE value to be "gpt".
 *   2. Fetch the blkid partition list, fstat() fd, look up the udev device
 *      for st.st_rdev and enumerate the devices that have it as parent.
 *   3. For every enumerated device, map its device number to a blkid
 *      partition, test GPT_FLAG_NO_AUTO, and parse the partition type
 *      string as a 128-bit id that is compared with GPT_HOME, GPT_SRV,
 *      GPT_ROOT_NATIVE and GPT_ROOT_SECONDARY (the latter two only when
 *      those macros are defined).
 *   4. Report an error when neither a root nor a secondary root partition
 *      was found; otherwise hand the chosen device strings and their
 *      read/write flags to the caller, a native root taking precedence
 *      over a secondary one.
 *
 * The *_rw flags are the negation of GPT_FLAG_READ_ONLY on the partition.
 * The final log_error() is the fallback for builds without blkid support;
 * the preprocessor lines selecting it are elided from this listing, as are
 * most error checks and all return statements. */
2235 static int dissect_image(
2237 char **root_device, bool *root_device_rw,
2238 char **home_device, bool *home_device_rw,
2239 char **srv_device, bool *srv_device_rw,
/* Partition number of the current candidate of each kind; -1 means none. */
2243 int home_nr = -1, root_nr = -1, secondary_root_nr = -1, srv_nr = -1;
2244 _cleanup_free_ char *home = NULL, *root = NULL, *secondary_root = NULL, *srv = NULL;
2245 _cleanup_udev_enumerate_unref_ struct udev_enumerate *e = NULL;
2246 _cleanup_udev_device_unref_ struct udev_device *d = NULL;
2247 _cleanup_blkid_free_probe_ blkid_probe b = NULL;
2248 _cleanup_udev_unref_ struct udev *udev = NULL;
2249 struct udev_list_entry *first, *item;
2250 bool home_rw = true, root_rw = true, secondary_root_rw = true, srv_rw = true;
2251 const char *pttype = NULL;
2257 assert(root_device);
2258 assert(home_device);
2262 b = blkid_new_probe();
2267 r = blkid_probe_set_device(b, fd, 0, 0);
2272 log_error("Failed to set device on blkid probe: %m");
2276 blkid_probe_enable_partitions(b, 1);
2277 blkid_probe_set_partitions_flags(b, BLKID_PARTS_ENTRY_DETAILS);
2280 r = blkid_do_safeprobe(b);
/* Results -2 and 1 are both reported as "no partition table identified";
 * any other non-zero result is reported as a probe failure. */
2281 if (r == -2 || r == 1) {
2282 log_error("Failed to identify any partition table on %s.\n"
2283 "Note that the disk image needs to follow http://www.freedesktop.org/wiki/Specifications/DiscoverablePartitionsSpec/ to be supported by systemd-nspawn.", arg_image);
2285 } else if (r != 0) {
2288 log_error("Failed to probe: %m");
/* The return value of the lookup is not checked; pttype stays NULL when
 * the lookup does not set it, and streq_ptr() accepts NULL. */
2292 blkid_probe_lookup_value(b, "PTTYPE", &pttype, NULL);
2293 if (!streq_ptr(pttype, "gpt")) {
2294 log_error("Image %s does not carry a GUID Partition Table.\n"
2295 "Note that the disk image needs to follow http://www.freedesktop.org/wiki/Specifications/DiscoverablePartitionsSpec/ to be supported by systemd-nspawn.", arg_image);
2300 pl = blkid_probe_get_partitions(b);
2305 log_error("Failed to list partitions of %s", arg_image);
2313 if (fstat(fd, &st) < 0) {
2314 log_error("Failed to stat block device: %m");
2318 d = udev_device_new_from_devnum(udev, 'b', st.st_rdev);
2322 e = udev_enumerate_new(udev);
2326 r = udev_enumerate_add_match_parent(e, d);
2330 r = udev_enumerate_scan_devices(e);
2332 log_error("Failed to scan for partition devices of %s: %s", arg_image, strerror(-r));
2336 first = udev_enumerate_get_list_entry(e);
2337 udev_list_entry_foreach(item, first) {
2338 _cleanup_udev_device_unref_ struct udev_device *q;
2339 const char *stype, *node;
2340 unsigned long long flags;
2347 q = udev_device_new_from_syspath(udev, udev_list_entry_get_name(item));
2352 log_error("Failed to get partition device of %s: %m", arg_image);
2356 qn = udev_device_get_devnum(q);
/* Equal device numbers mean the enumerated entry is the device being
 * dissected itself (the action is elided; presumably it is skipped). */
2360 if (st.st_rdev == qn)
2363 node = udev_device_get_devnode(q);
2367 pp = blkid_partlist_devno_to_partition(pl, qn);
2371 flags = blkid_partition_get_flags(pp);
2372 if (flags & GPT_FLAG_NO_AUTO)
2375 nr = blkid_partition_get_partno(pp);
2379 stype = blkid_partition_get_type_string(pp);
2383 if (sd_id128_from_string(stype, &type_id) < 0)
2386 if (sd_id128_equal(type_id, GPT_HOME)) {
/* An existing candidate is compared by partition number (the action is
 * elided; presumably the lower-numbered partition is kept - confirm). */
2388 if (home && nr >= home_nr)
2392 home_rw = !(flags & GPT_FLAG_READ_ONLY);
2395 home = strdup(node);
2398 } else if (sd_id128_equal(type_id, GPT_SRV)) {
2400 if (srv && nr >= srv_nr)
2404 srv_rw = !(flags & GPT_FLAG_READ_ONLY);
2411 #ifdef GPT_ROOT_NATIVE
2412 else if (sd_id128_equal(type_id, GPT_ROOT_NATIVE)) {
2414 if (root && nr >= root_nr)
2418 root_rw = !(flags & GPT_FLAG_READ_ONLY);
2421 root = strdup(node);
2426 #ifdef GPT_ROOT_SECONDARY
2427 else if (sd_id128_equal(type_id, GPT_ROOT_SECONDARY)) {
2429 if (secondary_root && nr >= secondary_root_nr)
2432 secondary_root_nr = nr;
2433 secondary_root_rw = !(flags & GPT_FLAG_READ_ONLY);
2436 free(secondary_root);
2437 secondary_root = strdup(node);
2438 if (!secondary_root)
2444 if (!root && !secondary_root) {
2445 log_error("Failed to identify root partition in disk image %s.\n"
2446 "Note that the disk image needs to follow http://www.freedesktop.org/wiki/Specifications/DiscoverablePartitionsSpec/ to be supported by systemd-nspawn.", arg_image);
/* Hand the results to the caller. The secondary root is used only in the
 * else-branch, and its local pointer is cleared so the _cleanup_free_
 * attribute does not free the string that was handed over. */
2451 *root_device = root;
2454 *root_device_rw = root_rw;
2456 } else if (secondary_root) {
2457 *root_device = secondary_root;
2458 secondary_root = NULL;
2460 *root_device_rw = secondary_root_rw;
2465 *home_device = home;
2468 *home_device_rw = home_rw;
2475 *srv_device_rw = srv_rw;
2480 log_error("--image= is not supported, compiled without blkid support.");
/* Detect the file system on a block device and mount it.
 *
 * what:      device node to probe and mount.
 * where:     base directory of the mount point.
 * directory: path appended to where to form the mount point p. mount_devices()
 *            passes NULL for the root device, so a NULL check is presumably
 *            on an elided line - confirm.
 * rw:        false adds MS_RDONLY to the mount flags.
 *
 * Visible flow: create a blkid prober for what, enable superblock probing
 * restricted to the TYPE value, run blkid_do_safeprobe(), read the "TYPE"
 * value, refuse "crypto_LUKS", and finally mount() with MS_NODEV. The last
 * log_error() is the fallback for builds without blkid support; the
 * preprocessor lines selecting it, and all return statements, are elided
 * from this listing.
 *
 * NOTE(review): the probe result is tested against -1 and 1 here, while
 * dissect_image() in this file tests -2 and 1. libblkid documents -1 as
 * "error" and -2 as "ambivalent result", so as written a hard probe error
 * is reported as "Cannot determine file system type" and an ambivalent
 * result takes the errno-based branch. The -1 is probably meant to be -2 -
 * confirm against the libblkid documentation before changing. */
2485 static int mount_device(const char *what, const char *where, const char *directory, bool rw) {
2487 _cleanup_blkid_free_probe_ blkid_probe b = NULL;
2488 const char *fstype, *p;
2498 p = strappenda(where, directory);
2503 b = blkid_new_probe_from_filename(what);
2507 log_error("Failed to allocate prober for %s: %m", what);
2511 blkid_probe_enable_superblocks(b, 1);
2512 blkid_probe_set_superblocks_flags(b, BLKID_SUBLKS_TYPE);
2515 r = blkid_do_safeprobe(b);
2516 if (r == -1 || r == 1) {
2517 log_error("Cannot determine file system type of %s", what);
2519 } else if (r != 0) {
2522 log_error("Failed to probe %s: %m", what);
2527 if (blkid_probe_lookup_value(b, "TYPE", &fstype, NULL) < 0) {
2530 log_error("Failed to determine file system type of %s", what);
2534 if (streq(fstype, "crypto_LUKS")) {
2535 log_error("nspawn currently does not support LUKS disk images.");
/* Device nodes on the mounted file system are never honoured (MS_NODEV). */
2539 if (mount(what, p, fstype, MS_NODEV|(rw ? 0 : MS_RDONLY), NULL) < 0) {
2540 log_error("Failed to mount %s: %m", what);
2546 log_error("--image= is not supported, compiled without blkid support.");
/* Mount the partitions found in the image below arg_directory.
 *
 * The root device is mounted at arg_directory itself (directory argument
 * NULL), the home device at arg_directory + "/home" and the srv device at
 * arg_directory + "/srv", each through mount_device() with its own
 * read/write flag.
 *
 * The first parameter, the conditions guarding each mount and the return
 * statements are on lines that are not part of this listing; main() passes
 * arg_directory as the first argument. */
2551 static int mount_devices(
2553 const char *root_device, bool root_device_rw,
2554 const char *home_device, bool home_device_rw,
2555 const char *srv_device, bool srv_device_rw) {
2561 r = mount_device(root_device, arg_directory, NULL, root_device_rw);
2563 log_error("Failed to mount root directory: %s", strerror(-r));
2569 r = mount_device(home_device, arg_directory, "/home", home_device_rw);
2571 log_error("Failed to mount home directory: %s", strerror(-r));
2577 r = mount_device(srv_device, arg_directory, "/srv", srv_device_rw);
2579 log_error("Failed to mount server data directory: %s", strerror(-r));
/* Detach and remove a loop device.
 *
 * nr:       loop device number passed to LOOP_CTL_REMOVE.
 * image_fd: optional pointer to a descriptor; when it is non-NULL and holds
 *           a valid descriptor, LOOP_CLR_FD is issued on it and the
 *           descriptor is closed, with *image_fd updated to the value
 *           returned by safe_close().
 *
 * Every failure is reported with log_warning() only; the function returns
 * void. Any early exit (for example on the value of nr) would be on lines
 * that are not part of this listing. */
2587 static void loop_remove(int nr, int *image_fd) {
2588 _cleanup_close_ int control = -1;
2594 if (image_fd && *image_fd >= 0) {
2595 r = ioctl(*image_fd, LOOP_CLR_FD);
2597 log_warning("Failed to close loop image: %m");
2598 *image_fd = safe_close(*image_fd);
2601 control = open("/dev/loop-control", O_RDWR|O_CLOEXEC|O_NOCTTY|O_NONBLOCK);
2603 log_warning("Failed to open /dev/loop-control: %m");
2607 r = ioctl(control, LOOP_CTL_REMOVE, nr);
2609 log_warning("Failed to remove loop %d: %m", nr);
/* Fork a child that runs "getent <database> <key>" with its standard
 * output connected to a pipe.
 *
 * database, key: passed verbatim as the two getent arguments.
 * rpid:          output parameter; its assignment is not part of this
 *                listing (callers pass &pid and later wait on pid).
 *
 * Child: stdout becomes the write end of the pipe, stdin and stderr are
 * redirected to /dev/null, signal handlers are reset, all other
 * descriptors are closed, and getent is executed with an empty environment,
 * trying /usr/bin/getent first and /bin/getent second. Any failure ends the
 * child with _exit(EXIT_FAILURE).
 *
 * Parent: closes the write end of the pipe. The return statement is elided;
 * callers store the result in fd and hand it to fdopen(), so the read end
 * of the pipe is presumably returned - confirm. */
2612 static int spawn_getent(const char *database, const char *key, pid_t *rpid) {
2620 if (pipe2(pipe_fds, O_CLOEXEC) < 0) {
2621 log_error("Failed to allocate pipe: %m");
2627 log_error("Failed to fork getent child: %m");
2629 } else if (pid == 0) {
2631 char *empty_env = NULL;
2633 if (dup3(pipe_fds[1], STDOUT_FILENO, 0) < 0)
2634 _exit(EXIT_FAILURE);
/* Only close the pipe ends when they are not themselves one of the
 * standard descriptors 0-2. */
2636 if (pipe_fds[0] > 2)
2637 safe_close(pipe_fds[0]);
2638 if (pipe_fds[1] > 2)
2639 safe_close(pipe_fds[1]);
2641 nullfd = open("/dev/null", O_RDWR);
2643 _exit(EXIT_FAILURE);
2645 if (dup3(nullfd, STDIN_FILENO, 0) < 0)
2646 _exit(EXIT_FAILURE);
2648 if (dup3(nullfd, STDERR_FILENO, 0) < 0)
2649 _exit(EXIT_FAILURE);
2654 reset_all_signal_handlers();
2655 close_all_fds(NULL, 0);
/* The second execle() is reached only when the first one failed. */
2657 execle("/usr/bin/getent", "getent", database, key, NULL, &empty_env);
2658 execle("/bin/getent", "getent", database, key, NULL, &empty_env);
2659 _exit(EXIT_FAILURE);
2662 pipe_fds[1] = safe_close(pipe_fds[1]);
/* Switch the calling process to the user named by arg_user.
 *
 * _home: output parameter; its assignment is not part of this listing
 *        (main() passes &home and later uses it for the HOME= variable).
 *
 * When arg_user is unset, "root" or "0", the supplementary groups are
 * cleared and the real, effective and saved GID and UID are set to 0.
 *
 * Otherwise the function:
 *   1. runs "getent passwd <arg_user>" through spawn_getent(), reads one
 *      line and splits it at ':' into the fields named by the error
 *      messages (user, password, UID, GID, GECOS, home directory);
 *   2. parses the UID and GID fields with parse_uid() and parse_gid();
 *   3. runs "getent initgroups <arg_user>", skips the user name and the
 *      following whitespace, and parses every remaining word into the
 *      uids array;
 *   4. creates the parents of the home directory (mode 0775) and the home
 *      directory itself (mode 0755, owned by uid:gid), tolerating EEXIST;
 *   5. changes the owner of stdin, stdout and stderr to uid:gid;
 *   6. calls setgroups(), setresgid() and setresuid() in that order.
 *
 * NOTE(review): the log messages say "setregid()" and "setreuid()" while
 * the calls are setresgid() and setresuid().
 * NOTE(review): the three fchown() results are ignored; if that is
 * deliberate best-effort it deserves a (void) cast or a remark.
 * NOTE(review): the supplementary group ids are parsed with parse_uid()
 * into a uid_t array that is then passed to setgroups(), which takes
 * gid_t - presumably the two types are identical on supported targets;
 * confirm.
 * Most error checks and all return statements are elided from this
 * listing. */
2669 static int change_uid_gid(char **_home) {
2670 char line[LINE_MAX], *x, *u, *g, *h;
2671 const char *word, *state;
2672 _cleanup_free_ uid_t *uids = NULL;
2673 _cleanup_free_ char *home = NULL;
2674 _cleanup_fclose_ FILE *f = NULL;
2675 _cleanup_close_ int fd = -1;
2676 unsigned n_uids = 0;
2685 if (!arg_user || streq(arg_user, "root") || streq(arg_user, "0")) {
2686 /* Reset everything fully to 0, just in case */
2688 if (setgroups(0, NULL) < 0) {
2689 log_error("setgroups() failed: %m");
2693 if (setresgid(0, 0, 0) < 0) {
2694 log_error("setregid() failed: %m");
2698 if (setresuid(0, 0, 0) < 0) {
2699 log_error("setreuid() failed: %m");
2707 /* First, get user credentials */
2708 fd = spawn_getent("passwd", arg_user, &pid);
2712 f = fdopen(fd, "r");
2717 if (!fgets(line, sizeof(line), f)) {
2720 log_error("Failed to resolve user %s.", arg_user);
2724 log_error("Failed to read from getent: %m");
2730 wait_for_terminate_and_warn("getent passwd", pid);
/* Walk the ':'-separated fields of the passwd line. */
2732 x = strchr(line, ':');
2734 log_error("/etc/passwd entry has invalid user field.");
2738 u = strchr(x+1, ':');
2740 log_error("/etc/passwd entry has invalid password field.");
2747 log_error("/etc/passwd entry has invalid UID field.");
2755 log_error("/etc/passwd entry has invalid GID field.");
2760 h = strchr(x+1, ':');
2762 log_error("/etc/passwd entry has invalid GECOS field.");
2769 log_error("/etc/passwd entry has invalid home directory field.");
2775 r = parse_uid(u, &uid);
2777 log_error("Failed to parse UID of user.");
2781 r = parse_gid(g, &gid);
2783 log_error("Failed to parse GID of user.");
2791 /* Second, get group memberships */
2792 fd = spawn_getent("initgroups", arg_user, &pid);
2797 f = fdopen(fd, "r");
2802 if (!fgets(line, sizeof(line), f)) {
2804 log_error("Failed to resolve user %s.", arg_user);
2808 log_error("Failed to read from getent: %m");
2814 wait_for_terminate_and_warn("getent initgroups", pid);
2816 /* Skip over the username and subsequent separator whitespace */
2818 x += strcspn(x, WHITESPACE);
2819 x += strspn(x, WHITESPACE);
2821 FOREACH_WORD(word, l, x, state) {
/* Grow the array by one slot before parsing into it. */
2827 if (!GREEDY_REALLOC(uids, sz, n_uids+1))
2830 r = parse_uid(c, &uids[n_uids++]);
2832 log_error("Failed to parse group data from getent.");
2837 r = mkdir_parents(home, 0775);
2839 log_error("Failed to make home root directory: %s", strerror(-r));
2843 r = mkdir_safe(home, 0755, uid, gid);
2844 if (r < 0 && r != -EEXIST) {
2845 log_error("Failed to make home directory: %s", strerror(-r));
2849 fchown(STDIN_FILENO, uid, gid);
2850 fchown(STDOUT_FILENO, uid, gid);
2851 fchown(STDERR_FILENO, uid, gid);
/* Groups first, then GID, then UID: the UID change comes last. */
2853 if (setgroups(n_uids, uids) < 0) {
2854 log_error("Failed to set auxiliary groups: %m");
2858 if (setresgid(gid, gid, gid) < 0) {
2859 log_error("setregid() failed: %m");
2863 if (setresuid(uid, uid, uid) < 0) {
2864 log_error("setreuid() failed: %m");
2878 * < 0 : wait_for_terminate() failed to get the state of the
2879 * container, the container was terminated by a signal, or
2880 * failed for an unknown reason. No change is made to the
2881 * container argument.
2882 * > 0 : The program executed in the container terminated with an
2883 * error. The exit code of the program executed in the
2884 * container is returned. The container argument has been set
2885 * to CONTAINER_TERMINATED.
2886 * 0 : The container is being rebooted, has been shut down or exited
2887 * successfully. The container argument has been set to either
2888 * CONTAINER_TERMINATED or CONTAINER_REBOOTED.
2890 * That is, success is indicated by a return value of zero, and an
2891 * error is indicated by a non-zero value.
/* Wait for the container process and classify how it ended.
 *
 * pid:       process to wait for.
 * container: receives CONTAINER_TERMINATED or CONTAINER_REBOOTED on the
 *            paths shown below.
 *
 * The siginfo obtained from wait_for_terminate() is dispatched on
 * status.si_code (the case labels are elided from this listing):
 *   - a normal exit logs success or the exit code, marks the container
 *     terminated and returns status.si_status;
 *   - a termination with status.si_status == SIGINT is logged as a
 *     shutdown, and one with SIGHUP as a reboot;
 *   - other signals are logged as "terminated by signal", and anything
 *     else as "failed due to unknown reason".
 * Informational messages use LOG_DEBUG instead of LOG_INFO when arg_quiet
 * is set. */
2893 static int wait_for_container(pid_t pid, ContainerStatus *container) {
2897 r = wait_for_terminate(pid, &status);
2899 log_warning("Failed to wait for container: %s", strerror(-r));
2903 switch (status.si_code) {
2906 if (status.si_status == 0) {
2907 log_full(arg_quiet ? LOG_DEBUG : LOG_INFO, "Container %s exited successfully.", arg_machine);
2910 log_full(arg_quiet ? LOG_DEBUG : LOG_INFO, "Container %s failed with error code %i.", arg_machine, status.si_status);
2912 *container = CONTAINER_TERMINATED;
2913 return status.si_status;
2916 if (status.si_status == SIGINT) {
2918 log_full(arg_quiet ? LOG_DEBUG : LOG_INFO, "Container %s has been shut down.", arg_machine);
2919 *container = CONTAINER_TERMINATED;
2922 } else if (status.si_status == SIGHUP) {
2924 log_full(arg_quiet ? LOG_DEBUG : LOG_INFO, "Container %s is being rebooted.", arg_machine);
2925 *container = CONTAINER_REBOOTED;
2929 /* CLD_KILLED fallthrough */
2932 log_error("Container %s terminated by signal %s.", arg_machine, signal_to_string(status.si_status));
2936 log_error("Container %s failed due to unknown reason.", arg_machine);
/* Signal handler that deliberately does nothing. main() installs it for
 * SIGCHLD so that the signal interrupts the parent's blocking calls
 * instead of being ignored. */
2943 static void nop_handler(int sig) {}
/* sd-event signal callback that asks the container to shut down.
 *
 * s:        the event source that fired.
 * si:       signal details (not used on the lines shown here).
 * userdata: carries the PID of the container, decoded with PTR_TO_UINT32().
 *
 * SIGRTMIN+3 is sent to that PID. When the kill() succeeds a hint is
 * logged and the userdata of the source is cleared to NULL. The event loop
 * is ended with sd_event_exit(..., 0) further down.
 *
 * The checks on pid and the statements between the visible lines are
 * elided from this listing; presumably a cleared userdata makes the next
 * signal reach sd_event_exit() directly - confirm. */
2945 static int on_orderly_shutdown(sd_event_source *s, const struct signalfd_siginfo *si, void *userdata) {
2948 pid = PTR_TO_UINT32(userdata);
2950 if (kill(pid, SIGRTMIN+3) >= 0) {
2951 log_info("Trying to halt container. Send SIGTERM again to trigger immediate termination.");
2952 sd_event_source_set_userdata(s, NULL);
2957 sd_event_exit(sd_event_source_get_event(s), 0);
2961 int main(int argc, char *argv[]) {
2963 _cleanup_free_ char *device_path = NULL, *root_device = NULL, *home_device = NULL, *srv_device = NULL;
2964 bool root_device_rw = true, home_device_rw = true, srv_device_rw = true;
2965 _cleanup_close_ int master = -1, image_fd = -1;
2966 _cleanup_close_pair_ int kmsg_socket_pair[2] = { -1, -1 };
2967 _cleanup_fdset_free_ FDSet *fds = NULL;
2968 int r = EXIT_FAILURE, k, n_fd_passed, loop_nr = -1;
2969 const char *console = NULL;
2970 char veth_name[IFNAMSIZ];
2971 bool secondary = false;
2972 sigset_t mask, mask_chld;
2975 log_parse_environment();
2978 k = parse_argv(argc, argv);
2987 if (arg_directory) {
2990 p = path_make_absolute_cwd(arg_directory);
2991 free(arg_directory);
2994 arg_directory = get_current_dir_name();
2996 if (!arg_directory) {
2997 log_error("Failed to determine path, please use -D.");
3000 path_kill_slashes(arg_directory);
3004 arg_machine = strdup(basename(arg_image ? arg_image : arg_directory));
3010 hostname_cleanup(arg_machine, false);
3011 if (isempty(arg_machine)) {
3012 log_error("Failed to determine machine name automatically, please use -M.");
3017 if (geteuid() != 0) {
3018 log_error("Need to be root.");
3022 if (sd_booted() <= 0) {
3023 log_error("Not running on a systemd system.");
3028 n_fd_passed = sd_listen_fds(false);
3029 if (n_fd_passed > 0) {
3030 k = fdset_new_listen_fds(&fds, false);
3032 log_error("Failed to collect file descriptors: %s", strerror(-k));
3036 fdset_close_others(fds);
3039 if (arg_directory) {
3040 if (path_equal(arg_directory, "/")) {
3041 log_error("Spawning container on root directory not supported.");
3046 if (path_is_os_tree(arg_directory) <= 0) {
3047 log_error("Directory %s doesn't look like an OS root directory (os-release file is missing). Refusing.", arg_directory);
3053 p = strappenda(arg_directory,
3054 argc > optind && path_is_absolute(argv[optind]) ? argv[optind] : "/usr/bin/");
3055 if (access(p, F_OK) < 0) {
3056 log_error("Directory %s lacks the binary to execute or doesn't look like a binary tree. Refusing.", arg_directory);
3062 char template[] = "/tmp/nspawn-root-XXXXXX";
3064 if (!mkdtemp(template)) {
3065 log_error("Failed to create temporary directory: %m");
3070 arg_directory = strdup(template);
3071 if (!arg_directory) {
3076 image_fd = setup_image(&device_path, &loop_nr);
3082 r = dissect_image(image_fd,
3083 &root_device, &root_device_rw,
3084 &home_device, &home_device_rw,
3085 &srv_device, &srv_device_rw,
3091 master = posix_openpt(O_RDWR|O_NOCTTY|O_CLOEXEC|O_NDELAY);
3093 log_error("Failed to acquire pseudo tty: %m");
3097 console = ptsname(master);
3099 log_error("Failed to determine tty name: %m");
3104 log_info("Spawning container %s on %s.\nPress ^] three times within 1s to kill container.",
3105 arg_machine, arg_image ? arg_image : arg_directory);
3107 if (unlockpt(master) < 0) {
3108 log_error("Failed to unlock tty: %m");
3112 if (socketpair(AF_UNIX, SOCK_DGRAM|SOCK_NONBLOCK|SOCK_CLOEXEC, 0, kmsg_socket_pair) < 0) {
3113 log_error("Failed to create kmsg socket pair: %m");
3119 "STATUS=Container running.");
3121 assert_se(sigemptyset(&mask) == 0);
3122 sigset_add_many(&mask, SIGCHLD, SIGWINCH, SIGTERM, SIGINT, -1);
3123 assert_se(sigprocmask(SIG_BLOCK, &mask, NULL) == 0);
3125 assert_se(sigemptyset(&mask_chld) == 0);
3126 assert_se(sigaddset(&mask_chld, SIGCHLD) == 0);
3129 ContainerStatus container_status;
3130 _cleanup_(barrier_destroy) Barrier barrier = BARRIER_NULL;
3131 struct sigaction sa = {
3132 .sa_handler = nop_handler,
3133 .sa_flags = SA_NOCLDSTOP,
3136 r = barrier_create(&barrier);
3138 log_error("Cannot initialize IPC barrier: %s", strerror(-r));
3142 /* Child can be killed before execv(), so handle SIGCHLD
3143 * in order to interrupt parent's blocking calls and
3144 * give it a chance to call wait() and terminate. */
3145 r = sigprocmask(SIG_UNBLOCK, &mask_chld, NULL);
3147 log_error("Failed to change the signal mask: %m");
3151 r = sigaction(SIGCHLD, &sa, NULL);
3153 log_error("Failed to install SIGCHLD handler: %m");
3157 pid = syscall(__NR_clone, SIGCHLD|CLONE_NEWNS|
3158 (arg_share_system ? 0 : CLONE_NEWIPC|CLONE_NEWPID|CLONE_NEWUTS)|
3159 (arg_private_network ? CLONE_NEWNET : 0), NULL);
3161 if (errno == EINVAL)
3162 log_error("clone() failed, do you have namespace support enabled in your kernel? (You need UTS, IPC, PID and NET namespacing built in): %m");
3164 log_error("clone() failed: %m");
3172 _cleanup_free_ char *home = NULL;
3174 const char *envp[] = {
3175 "PATH=" DEFAULT_PATH_SPLIT_USR,
3176 "container=systemd-nspawn", /* LXC sets container=lxc, so follow the scheme here */
3181 NULL, /* container_uuid */
3182 NULL, /* LISTEN_FDS */
3183 NULL, /* LISTEN_PID */
3188 barrier_set_role(&barrier, BARRIER_CHILD);
3190 envp[n_env] = strv_find_prefix(environ, "TERM=");
3194 master = safe_close(master);
3196 close_nointr(STDIN_FILENO);
3197 close_nointr(STDOUT_FILENO);
3198 close_nointr(STDERR_FILENO);
3200 kmsg_socket_pair[0] = safe_close(kmsg_socket_pair[0]);
3202 reset_all_signal_handlers();
3203 reset_signal_mask();
3205 k = open_terminal(console, O_RDWR);
3206 if (k != STDIN_FILENO) {
3212 log_error("Failed to open console: %s", strerror(-k));
3213 _exit(EXIT_FAILURE);
3216 if (dup2(STDIN_FILENO, STDOUT_FILENO) != STDOUT_FILENO ||
3217 dup2(STDIN_FILENO, STDERR_FILENO) != STDERR_FILENO) {
3218 log_error("Failed to duplicate console: %m");
3219 _exit(EXIT_FAILURE);
3223 log_error("setsid() failed: %m");
3224 _exit(EXIT_FAILURE);
3227 if (reset_audit_loginuid() < 0)
3228 _exit(EXIT_FAILURE);
3230 if (prctl(PR_SET_PDEATHSIG, SIGKILL) < 0) {
3231 log_error("PR_SET_PDEATHSIG failed: %m");
3232 _exit(EXIT_FAILURE);
3235 /* Mark everything as slave, so that we still
3236 * receive mounts from the real root, but don't
3237 * propagate mounts to the real root. */
3238 if (mount(NULL, "/", NULL, MS_SLAVE|MS_REC, NULL) < 0) {
3239 log_error("MS_SLAVE|MS_REC failed: %m");
3240 _exit(EXIT_FAILURE);
3243 if (mount_devices(arg_directory,
3244 root_device, root_device_rw,
3245 home_device, home_device_rw,
3246 srv_device, srv_device_rw) < 0)
3247 _exit(EXIT_FAILURE);
3249 /* Turn directory into bind mount */
3250 if (mount(arg_directory, arg_directory, "bind", MS_BIND|MS_REC, NULL) < 0) {
3251 log_error("Failed to make bind mount: %m");
3252 _exit(EXIT_FAILURE);
3255 r = setup_volatile(arg_directory);
3257 _exit(EXIT_FAILURE);
3259 if (setup_volatile_state(arg_directory) < 0)
3260 _exit(EXIT_FAILURE);
3262 r = base_filesystem_create(arg_directory);
3264 _exit(EXIT_FAILURE);
3266 if (arg_read_only) {
3267 k = bind_remount_recursive(arg_directory, true);
3269 log_error("Failed to make tree read-only: %s", strerror(-k));
3270 _exit(EXIT_FAILURE);
3274 if (mount_all(arg_directory) < 0)
3275 _exit(EXIT_FAILURE);
3277 if (copy_devnodes(arg_directory) < 0)
3278 _exit(EXIT_FAILURE);
3280 if (setup_ptmx(arg_directory) < 0)
3281 _exit(EXIT_FAILURE);
3283 dev_setup(arg_directory);
3285 if (setup_seccomp() < 0)
3286 _exit(EXIT_FAILURE);
3288 if (setup_dev_console(arg_directory, console) < 0)
3289 _exit(EXIT_FAILURE);
3291 if (setup_kmsg(arg_directory, kmsg_socket_pair[1]) < 0)
3292 _exit(EXIT_FAILURE);
3294 kmsg_socket_pair[1] = safe_close(kmsg_socket_pair[1]);
3296 if (setup_boot_id(arg_directory) < 0)
3297 _exit(EXIT_FAILURE);
3299 if (setup_timezone(arg_directory) < 0)
3300 _exit(EXIT_FAILURE);
3302 if (setup_resolv_conf(arg_directory) < 0)
3303 _exit(EXIT_FAILURE);
3305 if (setup_journal(arg_directory) < 0)
3306 _exit(EXIT_FAILURE);
3308 if (mount_binds(arg_directory, arg_bind, false) < 0)
3309 _exit(EXIT_FAILURE);
3311 if (mount_binds(arg_directory, arg_bind_ro, true) < 0)
3312 _exit(EXIT_FAILURE);
3314 if (mount_tmpfs(arg_directory) < 0)
3315 _exit(EXIT_FAILURE);
3317 /* Tell the parent that we are ready, and that
3318 * it can cgroupify us so that we lack access
3319 * to certain devices and resources. */
3320 (void)barrier_place(&barrier);
3322 if (chdir(arg_directory) < 0) {
3323 log_error("chdir(%s) failed: %m", arg_directory);
3324 _exit(EXIT_FAILURE);
3327 if (mount(arg_directory, "/", NULL, MS_MOVE, NULL) < 0) {
3328 log_error("mount(MS_MOVE) failed: %m");
3329 _exit(EXIT_FAILURE);
3332 if (chroot(".") < 0) {
3333 log_error("chroot() failed: %m");
3334 _exit(EXIT_FAILURE);
3337 if (chdir("/") < 0) {
3338 log_error("chdir() failed: %m");
3339 _exit(EXIT_FAILURE);
3344 if (arg_private_network)
3347 if (drop_capabilities() < 0) {
3348 log_error("drop_capabilities() failed: %m");
3349 _exit(EXIT_FAILURE);
3352 r = change_uid_gid(&home);
3354 _exit(EXIT_FAILURE);
3356 if ((asprintf((char**)(envp + n_env++), "HOME=%s", home ? home: "/root") < 0) ||
3357 (asprintf((char**)(envp + n_env++), "USER=%s", arg_user ? arg_user : "root") < 0) ||
3358 (asprintf((char**)(envp + n_env++), "LOGNAME=%s", arg_user ? arg_user : "root") < 0)) {
3360 _exit(EXIT_FAILURE);
3363 if (!sd_id128_equal(arg_uuid, SD_ID128_NULL)) {
3366 if (asprintf((char**)(envp + n_env++), "container_uuid=%s", id128_format_as_uuid(arg_uuid, as_uuid)) < 0) {
3368 _exit(EXIT_FAILURE);
3372 if (fdset_size(fds) > 0) {
3373 k = fdset_cloexec(fds, false);
3375 log_error("Failed to unset O_CLOEXEC for file descriptors.");
3376 _exit(EXIT_FAILURE);
3379 if ((asprintf((char **)(envp + n_env++), "LISTEN_FDS=%u", n_fd_passed) < 0) ||
3380 (asprintf((char **)(envp + n_env++), "LISTEN_PID=1") < 0)) {
3382 _exit(EXIT_FAILURE);
3388 if (arg_personality != 0xffffffffLU) {
3389 if (personality(arg_personality) < 0) {
3390 log_error("personality() failed: %m");
3391 _exit(EXIT_FAILURE);
3393 } else if (secondary) {
3394 if (personality(PER_LINUX32) < 0) {
3395 log_error("personality() failed: %m");
3396 _exit(EXIT_FAILURE);
3401 if (arg_selinux_context)
3402 if (setexeccon((security_context_t) arg_selinux_context) < 0) {
3403 log_error("setexeccon(\"%s\") failed: %m", arg_selinux_context);
3404 _exit(EXIT_FAILURE);
3408 if (!strv_isempty(arg_setenv)) {
3411 n = strv_env_merge(2, envp, arg_setenv);
3414 _exit(EXIT_FAILURE);
3419 env_use = (char**) envp;
3421 /* Wait until the parent is ready with the setup, too... */
3422 if (!barrier_place_and_sync(&barrier))
3423 _exit(EXIT_FAILURE);
3429 /* Automatically search for the init system */
3431 l = 1 + argc - optind;
3432 a = newa(char*, l + 1);
3433 memcpy(a + 1, argv + optind, l * sizeof(char*));
3435 a[0] = (char*) "/usr/lib/systemd/systemd";
3436 execve(a[0], a, env_use);
3438 a[0] = (char*) "/lib/systemd/systemd";
3439 execve(a[0], a, env_use);
3441 a[0] = (char*) "/sbin/init";
3442 execve(a[0], a, env_use);
3443 } else if (argc > optind)
3444 execvpe(argv[optind], argv + optind, env_use);
3446 chdir(home ? home : "/root");
3447 execle("/bin/bash", "-bash", NULL, env_use);
3448 execle("/bin/sh", "-sh", NULL, env_use);
3451 log_error("execv() failed: %m");
3452 _exit(EXIT_FAILURE);
3455 barrier_set_role(&barrier, BARRIER_PARENT);
3459 /* wait for child-setup to be done */
3460 if (barrier_place_and_sync(&barrier)) {
3461 _cleanup_event_unref_ sd_event *event = NULL;
3462 _cleanup_(pty_forward_freep) PTYForward *forward = NULL;
3465 r = move_network_interfaces(pid);
3469 r = setup_veth(pid, veth_name, &ifi);
3473 r = setup_bridge(veth_name, &ifi);
3477 r = setup_macvlan(pid);
3481 r = register_machine(pid, ifi);
3485 /* Block SIGCHLD here, before notifying child.
3486 * process_pty() will handle it with the other signals. */
3487 r = sigprocmask(SIG_BLOCK, &mask_chld, NULL);
3491 /* Reset signal to default */
3492 r = default_signals(SIGCHLD, -1);
3496 /* Notify the child that the parent is ready with all
3497 * its setup, and that the child can now hand over
3498 * control to the code to run inside the container. */
3499 (void)barrier_place(&barrier);
3501 r = sd_event_new(&event);
3503 log_error("Failed to get default event source: %s", strerror(-r));
3508 /* Try to kill the init system on SIGINT or SIGTERM */
3509 sd_event_add_signal(event, NULL, SIGINT, on_orderly_shutdown, UINT32_TO_PTR(pid));
3510 sd_event_add_signal(event, NULL, SIGTERM, on_orderly_shutdown, UINT32_TO_PTR(pid));
3512 /* Immediately exit */
3513 sd_event_add_signal(event, NULL, SIGINT, NULL, NULL);
3514 sd_event_add_signal(event, NULL, SIGTERM, NULL, NULL);
3517 /* simply exit on sigchld */
3518 sd_event_add_signal(event, NULL, SIGCHLD, NULL, NULL);
3520 r = pty_forward_new(event, master, &forward);
3522 log_error("Failed to create PTY forwarder: %s", strerror(-r));
3526 r = sd_event_loop(event);
3528 log_error("Failed to run event loop: %s", strerror(-r));
3532 forward = pty_forward_free(forward);
3537 /* Kill if it is not dead yet anyway */
3538 terminate_machine(pid);
3541 /* Normally redundant, but better safe than sorry */
3544 r = wait_for_container(pid, &container_status);
3548 /* We failed to wait for the container, or the
3549 * container exited abnormally */
3552 } else if (r > 0 || container_status == CONTAINER_TERMINATED)
3553 /* The container exited with a non-zero
3554 * status, or with zero status and no reboot
3558 /* CONTAINER_REBOOTED, loop again */
3560 if (arg_keep_unit) {
3561 /* Special handling if we are running as a
3562 * service: instead of simply restarting the
3563 * machine we want to restart the entire
3564 * service, so let's inform systemd about this
3565 * with the special exit code 133. The service
3566 * file uses RestartForceExitStatus=133 so
3567 * that this results in a full nspawn
3568 * restart. This is necessary since we might
3569 * have cgroup parameters set we want to have
3579 "STATUS=Terminating...");
3581 loop_remove(loop_nr, &image_fd);
3586 free(arg_directory);
3589 strv_free(arg_setenv);
3590 strv_free(arg_network_interfaces);
3591 strv_free(arg_network_macvlan);
3592 strv_free(arg_bind);
3593 strv_free(arg_bind_ro);
3594 strv_free(arg_tmpfs);