1 /*-*- Mode: C; c-basic-offset: 8; indent-tabs-mode: nil -*-*/
4 This file is part of systemd.
6 Copyright 2010 Lennart Poettering
8 systemd is free software; you can redistribute it and/or modify it
9 under the terms of the GNU Lesser General Public License as published by
10 the Free Software Foundation; either version 2.1 of the License, or
11 (at your option) any later version.
13 systemd is distributed in the hope that it will be useful, but
14 WITHOUT ANY WARRANTY; without even the implied warranty of
15 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
16 Lesser General Public License for more details.
18 You should have received a copy of the GNU Lesser General Public License
19 along with systemd; If not, see <http://www.gnu.org/licenses/>.
25 #include <sys/types.h>
26 #include <sys/syscall.h>
27 #include <sys/mount.h>
33 #include <sys/prctl.h>
34 #include <sys/capability.h>
37 #include <sys/signalfd.h>
41 #include <sys/socket.h>
42 #include <linux/netlink.h>
44 #include <linux/veth.h>
45 #include <sys/personality.h>
46 #include <linux/loop.h>
49 #include <selinux/selinux.h>
57 #include <blkid/blkid.h>
60 #include "sd-daemon.h"
70 #include "cgroup-util.h"
72 #include "path-util.h"
73 #include "loopback-setup.h"
74 #include "dev-setup.h"
79 #include "bus-error.h"
81 #include "bus-kernel.h"
84 #include "rtnl-util.h"
85 #include "udev-util.h"
86 #include "blkid-util.h"
88 #include "siphash24.h"
90 #include "base-filesystem.h"
92 #include "event-util.h"
95 #include "seccomp-util.h"
/* Only the opening lines of the three enums below are visible in this
 * excerpt; their enumerators are not shown here. */
98 typedef enum ContainerStatus {
103 typedef enum LinkJournal {
110 typedef enum Volatile {
/* Command-line state. Each arg_* variable holds the setting of one option;
 * the initializers are the defaults in effect until parse_argv() stores a
 * value taken from the command line. */
116 static char *arg_directory = NULL;
117 static char *arg_user = NULL;
118 static sd_id128_t arg_uuid = {};
119 static char *arg_machine = NULL;
120 static const char *arg_selinux_context = NULL;
121 static const char *arg_selinux_apifs_context = NULL;
122 static const char *arg_slice = NULL;
123 static bool arg_private_network = false;
124 static bool arg_read_only = false;
125 static bool arg_boot = false;
126 static LinkJournal arg_link_journal = LINK_AUTO;
/* Bit mask (one bit per CAP_* number) of the capabilities kept by default.
 * parse_argv() folds the masks built from --capability= and
 * --drop-capability= into it, and drop_capabilities() passes its complement
 * to capability_bounding_set_drop(). */
127 static uint64_t arg_retain =
128 (1ULL << CAP_CHOWN) |
129 (1ULL << CAP_DAC_OVERRIDE) |
130 (1ULL << CAP_DAC_READ_SEARCH) |
131 (1ULL << CAP_FOWNER) |
132 (1ULL << CAP_FSETID) |
133 (1ULL << CAP_IPC_OWNER) |
135 (1ULL << CAP_LEASE) |
136 (1ULL << CAP_LINUX_IMMUTABLE) |
137 (1ULL << CAP_NET_BIND_SERVICE) |
138 (1ULL << CAP_NET_BROADCAST) |
139 (1ULL << CAP_NET_RAW) |
140 (1ULL << CAP_SETGID) |
141 (1ULL << CAP_SETFCAP) |
142 (1ULL << CAP_SETPCAP) |
143 (1ULL << CAP_SETUID) |
144 (1ULL << CAP_SYS_ADMIN) |
145 (1ULL << CAP_SYS_CHROOT) |
146 (1ULL << CAP_SYS_NICE) |
147 (1ULL << CAP_SYS_PTRACE) |
148 (1ULL << CAP_SYS_TTY_CONFIG) |
149 (1ULL << CAP_SYS_RESOURCE) |
150 (1ULL << CAP_SYS_BOOT) |
151 (1ULL << CAP_AUDIT_WRITE) |
152 (1ULL << CAP_AUDIT_CONTROL) |
/* The string vectors below are filled by parse_argv(). arg_bind, arg_bind_ro
 * and arg_tmpfs are stored as flat lists of pairs (two entries are appended
 * per option and they are walked with STRV_FOREACH_PAIR). */
154 static char **arg_bind = NULL;
155 static char **arg_bind_ro = NULL;
156 static char **arg_tmpfs = NULL;
157 static char **arg_setenv = NULL;
158 static bool arg_quiet = false;
159 static bool arg_share_system = false;
160 static bool arg_register = true;
161 static bool arg_keep_unit = false;
162 static char **arg_network_interfaces = NULL;
163 static char **arg_network_macvlan = NULL;
164 static bool arg_network_veth = false;
165 static const char *arg_network_bridge = NULL;
/* NOTE(review): 0xffffffffLU is also the value parse_argv() rejects as an
 * unknown personality, so it presumably doubles as "not set" here -- confirm. */
166 static unsigned long arg_personality = 0xffffffffLU;
167 static const char *arg_image = NULL;
168 static Volatile arg_volatile = VOLATILE_NO;
/* Prints the usage line and the option summary to stdout via printf(); the
 * program name shown is program_invocation_short_name.
 *
 * Several continuation fragments of the option descriptions are not visible
 * in this excerpt.
 *
 * NOTE(review): the "-j" line says it is equivalent to --link-journal=host,
 * but parse_argv() contains a standalone "arg_link_journal = LINK_GUEST"
 * assignment right before the --link-journal= handling, which looks like the
 * -j handler -- confirm which of the two is intended and align the text. */
170 static void help(void) {
171 printf("%s [OPTIONS...] [PATH] [ARGUMENTS...]\n\n"
172 "Spawn a minimal namespace container for debugging, testing and building.\n\n"
173 " -h --help Show this help\n"
174 " --version Print version string\n"
175 " -q --quiet Do not show status information\n"
176 " -D --directory=PATH Root directory for the container\n"
177 " -i --image=PATH File system device or image for the container\n"
178 " -b --boot Boot up full system (i.e. invoke init)\n"
179 " -u --user=USER Run the command under specified user or uid\n"
180 " -M --machine=NAME Set the machine name for the container\n"
181 " --uuid=UUID Set a specific machine UUID for the container\n"
182 " -S --slice=SLICE Place the container in the specified slice\n"
183 " --private-network Disable network in container\n"
184 " --network-interface=INTERFACE\n"
185 " Assign an existing network interface to the\n"
187 " --network-macvlan=INTERFACE\n"
188 " Create a macvlan network interface based on an\n"
189 " existing network interface to the container\n"
190 " --network-veth Add a virtual ethernet connection between host\n"
192 " --network-bridge=INTERFACE\n"
193 " Add a virtual ethernet connection between host\n"
194 " and container and add it to an existing bridge on\n"
196 " -Z --selinux-context=SECLABEL\n"
197 " Set the SELinux security context to be used by\n"
198 " processes in the container\n"
199 " -L --selinux-apifs-context=SECLABEL\n"
200 " Set the SELinux security context to be used by\n"
201 " API/tmpfs file systems in the container\n"
202 " --capability=CAP In addition to the default, retain specified\n"
204 " --drop-capability=CAP Drop the specified capability from the default set\n"
205 " --link-journal=MODE Link up guest journal, one of no, auto, guest, host\n"
206 " -j Equivalent to --link-journal=host\n"
207 " --read-only Mount the root directory read-only\n"
208 " --bind=PATH[:PATH] Bind mount a file or directory from the host into\n"
210 " --bind-ro=PATH[:PATH] Similar, but creates a read-only bind mount\n"
211 " --tmpfs=PATH:[OPTIONS] Mount an empty tmpfs to the specified directory\n"
212 " --setenv=NAME=VALUE Pass an environment variable to PID 1\n"
213 " --share-system Share system namespaces with host\n"
214 " --register=BOOLEAN Register container as machine\n"
215 " --keep-unit Do not register a scope for the machine, reuse\n"
216 " the service unit nspawn is running in\n"
217 " --volatile[=MODE] Run the system in volatile mode\n",
218 program_invocation_short_name);
/* Parses the command line with getopt_long() and stores the results in the
 * file-level arg_* variables.
 *
 * After the option loop it applies cross-option rules (for example
 * --share-system turns registration off, and several combinations are
 * rejected with an error message) and finally computes arg_retain.
 *
 * Many case labels, error branches and all return statements are not visible
 * in this excerpt, so the return-value convention cannot be stated from here. */
221 static int parse_argv(int argc, char *argv[]) {
238 ARG_NETWORK_INTERFACE,
/* Long option table; options without a short form map to ARG_* values. */
246 static const struct option options[] = {
247 { "help", no_argument, NULL, 'h' },
248 { "version", no_argument, NULL, ARG_VERSION },
249 { "directory", required_argument, NULL, 'D' },
250 { "user", required_argument, NULL, 'u' },
251 { "private-network", no_argument, NULL, ARG_PRIVATE_NETWORK },
252 { "boot", no_argument, NULL, 'b' },
253 { "uuid", required_argument, NULL, ARG_UUID },
254 { "read-only", no_argument, NULL, ARG_READ_ONLY },
255 { "capability", required_argument, NULL, ARG_CAPABILITY },
256 { "drop-capability", required_argument, NULL, ARG_DROP_CAPABILITY },
257 { "link-journal", required_argument, NULL, ARG_LINK_JOURNAL },
258 { "bind", required_argument, NULL, ARG_BIND },
259 { "bind-ro", required_argument, NULL, ARG_BIND_RO },
260 { "tmpfs", required_argument, NULL, ARG_TMPFS },
261 { "machine", required_argument, NULL, 'M' },
262 { "slice", required_argument, NULL, 'S' },
263 { "setenv", required_argument, NULL, ARG_SETENV },
264 { "selinux-context", required_argument, NULL, 'Z' },
265 { "selinux-apifs-context", required_argument, NULL, 'L' },
266 { "quiet", no_argument, NULL, 'q' },
267 { "share-system", no_argument, NULL, ARG_SHARE_SYSTEM },
268 { "register", required_argument, NULL, ARG_REGISTER },
269 { "keep-unit", no_argument, NULL, ARG_KEEP_UNIT },
270 { "network-interface", required_argument, NULL, ARG_NETWORK_INTERFACE },
271 { "network-macvlan", required_argument, NULL, ARG_NETWORK_MACVLAN },
272 { "network-veth", no_argument, NULL, ARG_NETWORK_VETH },
273 { "network-bridge", required_argument, NULL, ARG_NETWORK_BRIDGE },
274 { "personality", required_argument, NULL, ARG_PERSONALITY },
275 { "image", required_argument, NULL, 'i' },
276 { "volatile", optional_argument, NULL, ARG_VOLATILE },
/* Capability bits collected from --capability= (plus) and --drop-capability=
 * (minus); both are merged into arg_retain after the loop. */
281 uint64_t plus = 0, minus = 0;
/* With GNU getopt a leading '+' in the option string stops option parsing at
 * the first non-option argument, which matches the "[PATH] [ARGUMENTS...]"
 * usage printed by help(). */
286 while ((c = getopt_long(argc, argv, "+hD:u:bL:M:jS:Z:qi:", options, NULL)) >= 0)
295 puts(PACKAGE_STRING);
296 puts(SYSTEMD_FEATURES);
301 arg_directory = canonicalize_file_name(optarg);
302 if (!arg_directory) {
303 log_error("Invalid root directory: %m");
315 arg_user = strdup(optarg);
321 case ARG_NETWORK_BRIDGE:
322 arg_network_bridge = optarg;
/* The veth and interface options visibly switch on arg_private_network as
 * well. */
326 case ARG_NETWORK_VETH:
327 arg_network_veth = true;
328 arg_private_network = true;
331 case ARG_NETWORK_INTERFACE:
332 if (strv_extend(&arg_network_interfaces, optarg) < 0)
335 arg_private_network = true;
338 case ARG_NETWORK_MACVLAN:
339 if (strv_extend(&arg_network_macvlan, optarg) < 0)
344 case ARG_PRIVATE_NETWORK:
345 arg_private_network = true;
353 r = sd_id128_from_string(optarg, &arg_uuid);
355 log_error("Invalid UUID: %s", optarg);
365 if (isempty(optarg)) {
370 if (!hostname_is_valid(optarg)) {
371 log_error("Invalid machine name: %s", optarg);
376 arg_machine = strdup(optarg);
384 arg_selinux_context = optarg;
388 arg_selinux_apifs_context = optarg;
392 arg_read_only = true;
/* Capability list handling: the argument is split on ",", and the code tests
 * c == ARG_CAPABILITY to decide between plus and minus, so this body serves
 * both capability options. The word "all" selects every bit. */
396 case ARG_DROP_CAPABILITY: {
397 const char *state, *word;
400 FOREACH_WORD_SEPARATOR(word, length, optarg, ",", state) {
401 _cleanup_free_ char *t;
404 t = strndup(word, length);
408 if (streq(t, "all")) {
409 if (c == ARG_CAPABILITY)
410 plus = (uint64_t) -1;
412 minus = (uint64_t) -1;
414 if (cap_from_name(t, &cap) < 0) {
415 log_error("Failed to parse capability %s.", t);
419 if (c == ARG_CAPABILITY)
420 plus |= 1ULL << (uint64_t) cap;
422 minus |= 1ULL << (uint64_t) cap;
/* NOTE(review): the case label for this assignment is not visible; it looks
 * like the -j handler, and help() describes -j as --link-journal=host --
 * confirm. */
430 arg_link_journal = LINK_GUEST;
433 case ARG_LINK_JOURNAL:
434 if (streq(optarg, "auto"))
435 arg_link_journal = LINK_AUTO;
436 else if (streq(optarg, "no"))
437 arg_link_journal = LINK_NO;
438 else if (streq(optarg, "guest"))
439 arg_link_journal = LINK_GUEST;
440 else if (streq(optarg, "host"))
441 arg_link_journal = LINK_HOST;
443 log_error("Failed to parse link journal mode %s", optarg);
/* Bind mount options: the argument is split at the first ':' into a and b,
 * both must be absolute, and the pair is appended to arg_bind or arg_bind_ro. */
451 _cleanup_free_ char *a = NULL, *b = NULL;
455 x = c == ARG_BIND ? &arg_bind : &arg_bind_ro;
457 e = strchr(optarg, ':');
459 a = strndup(optarg, e - optarg);
469 if (!path_is_absolute(a) || !path_is_absolute(b)) {
470 log_error("Invalid bind mount specification: %s", optarg);
474 r = strv_extend(x, a);
478 r = strv_extend(x, b);
/* tmpfs option: a is the path part before ':'; "mode=0755" is used as the
 * option string in one of the branches (the branch condition is not visible).
 * The pair is pushed onto arg_tmpfs. */
486 _cleanup_free_ char *a = NULL, *b = NULL;
489 e = strchr(optarg, ':');
491 a = strndup(optarg, e - optarg);
495 b = strdup("mode=0755");
501 if (!path_is_absolute(a)) {
502 log_error("Invalid tmpfs specification: %s", optarg);
506 r = strv_push(&arg_tmpfs, a);
512 r = strv_push(&arg_tmpfs, b);
524 if (!env_assignment_is_valid(optarg)) {
525 log_error("Environment variable assignment '%s' is not valid.", optarg);
529 n = strv_env_set(arg_setenv, optarg);
533 strv_free(arg_setenv);
542 case ARG_SHARE_SYSTEM:
543 arg_share_system = true;
547 r = parse_boolean(optarg);
549 log_error("Failed to parse --register= argument: %s", optarg);
557 arg_keep_unit = true;
560 case ARG_PERSONALITY:
562 arg_personality = personality_from_string(optarg);
563 if (arg_personality == 0xffffffffLU) {
564 log_error("Unknown or unsupported personality '%s'.", optarg);
/* --volatile[=MODE]: besides boolean values the literal "state" is accepted. */
573 arg_volatile = VOLATILE_YES;
575 r = parse_boolean(optarg);
577 if (streq(optarg, "state"))
578 arg_volatile = VOLATILE_STATE;
580 log_error("Failed to parse --volatile= argument: %s", optarg);
584 arg_volatile = r ? VOLATILE_YES : VOLATILE_NO;
593 assert_not_reached("Unhandled option");
/* Cross-option consistency rules. */
596 if (arg_share_system)
597 arg_register = false;
599 if (arg_boot && arg_share_system) {
600 log_error("--boot and --share-system may not be combined.");
604 if (arg_keep_unit && cg_pid_get_owner_uid(0, NULL) >= 0) {
605 log_error("--keep-unit may not be used when invoked from a user session.");
609 if (arg_directory && arg_image) {
610 log_error("--directory= and --image= may not be combined.");
614 if (arg_volatile != VOLATILE_NO && arg_read_only) {
615 log_error("Cannot combine --read-only with --volatile. Note that --volatile already implies a read-only base hierarchy.");
/* Final retained set: the defaults plus the requested additions, plus
 * CAP_NET_ADMIN when a private network is in use, with every explicitly
 * dropped bit cleared last (so drops take precedence). */
619 arg_retain = (arg_retain | plus | (arg_private_network ? 1ULL << CAP_NET_ADMIN : 0)) & ~minus;
/* Mounts the entries of the built-in mount_table below the directory dest.
 *
 * For each entry the target path is dest + "/" + the entry's "where" field.
 * An entry that has a "what" field and whose target is already a mount point
 * is skipped. The target directory is created with mkdir_p(), and a failure
 * there or in mount() is logged as an error for entries whose "fatal" field
 * is set and as a warning otherwise.
 *
 * When arg_selinux_apifs_context is set, a context="..." option is appended
 * to the options of entries whose "what" is "tmpfs" or "devpts".
 *
 * The MountPoint field list and the return statements are not visible in
 * this excerpt. */
624 static int mount_all(const char *dest) {
626 typedef struct MountPoint {
/* Column meaning, as far as the uses below show: what, where, (presumably)
 * file system type, options, mount flags, fatal -- confirm against the
 * struct definition. */
635 static const MountPoint mount_table[] = {
636 { "proc", "/proc", "proc", NULL, MS_NOSUID|MS_NOEXEC|MS_NODEV, true },
637 { "/proc/sys", "/proc/sys", NULL, NULL, MS_BIND, true }, /* Bind mount first */
638 { NULL, "/proc/sys", NULL, NULL, MS_BIND|MS_RDONLY|MS_REMOUNT, true }, /* Then, make it r/o */
639 { "sysfs", "/sys", "sysfs", NULL, MS_RDONLY|MS_NOSUID|MS_NOEXEC|MS_NODEV, true },
640 { "tmpfs", "/dev", "tmpfs", "mode=755", MS_NOSUID|MS_STRICTATIME, true },
641 { "devpts", "/dev/pts", "devpts","newinstance,ptmxmode=0666,mode=620,gid=" STRINGIFY(TTY_GID), MS_NOSUID|MS_NOEXEC, true },
642 { "tmpfs", "/dev/shm", "tmpfs", "mode=1777", MS_NOSUID|MS_NODEV|MS_STRICTATIME, true },
643 { "tmpfs", "/run", "tmpfs", "mode=755", MS_NOSUID|MS_NODEV|MS_STRICTATIME, true },
645 { "/sys/fs/selinux", "/sys/fs/selinux", NULL, NULL, MS_BIND, false }, /* Bind mount first */
646 { NULL, "/sys/fs/selinux", NULL, NULL, MS_BIND|MS_RDONLY|MS_REMOUNT, false }, /* Then, make it r/o */
653 for (k = 0; k < ELEMENTSOF(mount_table); k++) {
654 _cleanup_free_ char *where = NULL;
656 _cleanup_free_ char *options = NULL;
661 where = strjoin(dest, "/", mount_table[k].where, NULL);
665 t = path_is_mount_point(where, true);
667 log_error("Failed to detect whether %s is a mount point: %s", where, strerror(-t));
675 /* Skip this entry if it is not a remount. */
676 if (mount_table[k].what && t > 0)
679 t = mkdir_p(where, 0755);
681 if (mount_table[k].fatal) {
682 log_error("Failed to create directory %s: %s", where, strerror(-t));
687 log_warning("Failed to create directory %s: %s", where, strerror(-t));
/* API file systems get the SELinux context appended to their mount options. */
693 if (arg_selinux_apifs_context &&
694 (streq_ptr(mount_table[k].what, "tmpfs") || streq_ptr(mount_table[k].what, "devpts"))) {
695 options = strjoin(mount_table[k].options, ",context=\"", arg_selinux_apifs_context, "\"", NULL);
702 o = mount_table[k].options;
705 if (mount(mount_table[k].what,
708 mount_table[k].flags,
711 if (mount_table[k].fatal) {
712 log_error("mount(%s) failed: %m", where);
717 log_warning("mount(%s) failed: %m", where);
/* Bind mounts host paths into the container tree.
 *
 * l is walked as a list of pairs (x, y): *x is the source on the host and
 * dest + *y is the mount point. If the mount point already exists its file
 * type must match the source; if it does not exist, parent directories and
 * then a mount point of a matching type (directory, FIFO, socket, regular
 * file) are created. Other source types are refused.
 *
 * A read-only recursive remount follows the bind mount; presumably only when
 * ro is set, but the guarding condition is not visible in this excerpt.
 *
 * NOTE(review): after mkdir_label() the code tests errno against EEXIST while
 * the message prints strerror(-r); the two error conventions are mixed --
 * confirm which one mkdir_label() guarantees. */
724 static int mount_binds(const char *dest, char **l, bool ro) {
727 STRV_FOREACH_PAIR(x, y, l) {
728 _cleanup_free_ char *where = NULL;
729 struct stat source_st, dest_st;
732 if (stat(*x, &source_st) < 0) {
733 log_error("Failed to stat %s: %m", *x);
737 where = strappend(dest, *y);
741 r = stat(where, &dest_st);
743 if ((source_st.st_mode & S_IFMT) != (dest_st.st_mode & S_IFMT)) {
744 log_error("The file types of %s and %s do not match. Refusing bind mount", *x, where);
747 } else if (errno == ENOENT) {
748 r = mkdir_parents_label(where, 0755);
750 log_error("Failed to bind mount %s: %s", *x, strerror(-r));
754 log_error("Failed to bind mount %s: %m", *x);
758 /* Create the mount point, but be conservative -- refuse to create block
759 * and char devices. */
760 if (S_ISDIR(source_st.st_mode)) {
761 r = mkdir_label(where, 0755);
762 if (r < 0 && errno != EEXIST) {
763 log_error("Failed to create mount point %s: %s", where, strerror(-r));
767 } else if (S_ISFIFO(source_st.st_mode)) {
768 r = mkfifo(where, 0644);
769 if (r < 0 && errno != EEXIST) {
770 log_error("Failed to create mount point %s: %m", where);
774 } else if (S_ISSOCK(source_st.st_mode)) {
775 r = mknod(where, 0644 | S_IFSOCK, 0);
776 if (r < 0 && errno != EEXIST) {
777 log_error("Failed to create mount point %s: %m", where);
781 } else if (S_ISREG(source_st.st_mode)) {
784 log_error("Failed to create mount point %s: %s", where, strerror(-r));
789 log_error("Refusing to create mountpoint for file: %s", *x);
793 if (mount(*x, where, "bind", MS_BIND, NULL) < 0) {
794 log_error("mount(%s) failed: %m", where);
799 r = bind_remount_recursive(where, true);
801 log_error("Read-Only bind mount failed: %s", strerror(-r));
/* Mounts the tmpfs instances requested on the command line.
 *
 * arg_tmpfs is walked as a list of pairs (i, o): dest + *i is the mount
 * point, created with mkdir_label(..., 0755), and *o is passed to mount() as
 * the option string together with MS_NODEV|MS_STRICTATIME.
 *
 * The checks around the logged failures and the return statements are not
 * visible in this excerpt. */
810 static int mount_tmpfs(const char *dest) {
813 STRV_FOREACH_PAIR(i, o, arg_tmpfs) {
814 _cleanup_free_ char *where = NULL;
817 where = strappend(dest, *i);
821 r = mkdir_label(where, 0755);
823 log_error("creating mount point for tmpfs %s failed: %s", where, strerror(-r));
828 if (mount("tmpfs", where, "tmpfs", MS_NODEV|MS_STRICTATIME, *o) < 0) {
829 log_error("tmpfs mount to %s failed: %m", where);
/* Tries to make /etc/localtime inside the container (below dest) refer to the
 * same zone as the host's /etc/localtime symlink.
 *
 * Steps visible here: read the host symlink and take the part after
 * "/usr/share/zoneinfo/" (relative or absolute form) as the zone name z; read
 * the container's symlink the same way and stop if it already names z; check
 * that dest/usr/share/zoneinfo/z exists; then create the parent directory and
 * a symlink "../usr/share/zoneinfo/" + z at dest/etc/localtime.
 *
 * The call that removes an existing entry before symlink() is not visible;
 * only its error handling is. Host-side problems are logged as warnings. */
837 static int setup_timezone(const char *dest) {
838 _cleanup_free_ char *where = NULL, *p = NULL, *q = NULL, *check = NULL, *what = NULL;
844 /* Fix the timezone, if possible */
845 r = readlink_malloc("/etc/localtime", &p);
847 log_warning("/etc/localtime is not a symlink, not updating container timezone.");
851 z = path_startswith(p, "../usr/share/zoneinfo/");
853 z = path_startswith(p, "/usr/share/zoneinfo/");
855 log_warning("/etc/localtime does not point into /usr/share/zoneinfo/, not updating container timezone.");
859 where = strappend(dest, "/etc/localtime");
863 r = readlink_malloc(where, &q);
865 y = path_startswith(q, "../usr/share/zoneinfo/");
867 y = path_startswith(q, "/usr/share/zoneinfo/");
869 /* Already pointing to the right place? Then do nothing .. */
870 if (y && streq(y, z))
874 check = strjoin(dest, "/usr/share/zoneinfo/", z, NULL);
878 if (access(check, F_OK) < 0) {
879 log_warning("Timezone %s does not exist in container, not updating container timezone.", z);
883 what = strappend("../usr/share/zoneinfo/", z);
887 r = mkdir_parents(where, 0755);
889 log_error("Failed to create directory for timezone info %s in container: %s", where, strerror(-r));
895 if (r < 0 && errno != ENOENT) {
896 log_error("Failed to remove existing timezone info %s in container: %m", where);
901 if (symlink(what, where) < 0) {
902 log_error("Failed to correct timezone of container: %m");
/* Copies the host's /etc/resolv.conf to dest/etc/resolv.conf.
 *
 * The parent directory is created first; both that step and the copy only
 * log a warning when they fail. The function tests arg_private_network up
 * front; the statement under that test is not visible in this excerpt, but it
 * presumably returns early since a private network has no use for the host's
 * resolver configuration -- confirm. */
909 static int setup_resolv_conf(const char *dest) {
910 _cleanup_free_ char *where = NULL;
915 if (arg_private_network)
918 /* Fix resolv.conf, if possible */
919 where = strappend(dest, "/etc/resolv.conf");
923 /* We don't really care about the result of this. If it
924 * fails, it fails, but meh... */
925 r = mkdir_parents(where, 0755);
927 log_warning("Failed to create parent directory for resolv.conf %s: %s", where, strerror(-r));
932 r = copy_file("/etc/resolv.conf", where, O_TRUNC|O_NOFOLLOW, 0644);
934 log_warning("Failed to copy /etc/resolv.conf to %s: %s", where, strerror(-r));
/* Handles arg_volatile == VOLATILE_STATE: remounts directory read-only
 * (recursively) and mounts a fresh tmpfs with mode=755 on directory + "/var".
 *
 * For any other arg_volatile value the function tests and, presumably,
 * returns early (the statement under the test is not visible).
 *
 * NOTE(review): the "Failed to create %s" message prints directory, although
 * the path being created appears to be p (the creation call itself is not
 * visible in this excerpt) -- confirm. */
942 static int setup_volatile_state(const char *directory) {
948 if (arg_volatile != VOLATILE_STATE)
951 /* --volatile=state means we simply overmount /var
952 with a tmpfs, and the rest read-only. */
954 r = bind_remount_recursive(directory, true);
956 log_error("Failed to remount %s read-only: %s", directory, strerror(-r));
960 p = strappenda(directory, "/var");
962 if (r < 0 && errno != EEXIST) {
963 log_error("Failed to create %s: %m", directory);
967 if (mount("tmpfs", p, "tmpfs", MS_STRICTATIME, "mode=755") < 0) {
968 log_error("Failed to mount tmpfs to /var: %m");
/* Handles arg_volatile == VOLATILE_YES.
 *
 * Sequence visible here: create a temporary directory from the mkdtemp()
 * template, mount a tmpfs (mode=755) on it, bind mount directory + "/usr"
 * recursively onto template + "/usr", remount that bind mount read-only, and
 * finally move the tmpfs mount onto directory with MS_MOVE.
 *
 * tmpfs_mounted and bind_mounted record how far the sequence got; they are
 * presumably consumed by failure cleanup code that is not visible in this
 * excerpt. */
975 static int setup_volatile(const char *directory) {
976 bool tmpfs_mounted = false, bind_mounted = false;
977 char template[] = "/tmp/nspawn-volatile-XXXXXX";
983 if (arg_volatile != VOLATILE_YES)
986 /* --volatile=yes means we mount a tmpfs to the root dir, and
987 the original /usr to use inside it, and that read-only. */
989 if (!mkdtemp(template)) {
990 log_error("Failed to create temporary directory: %m");
994 if (mount("tmpfs", template, "tmpfs", MS_STRICTATIME, "mode=755") < 0) {
995 log_error("Failed to mount tmpfs for root directory: %m");
1000 tmpfs_mounted = true;
/* f is the source /usr inside the original tree, t the target inside the
 * new tmpfs root. */
1002 f = strappenda(directory, "/usr");
1003 t = strappenda(template, "/usr");
1006 if (r < 0 && errno != EEXIST) {
1007 log_error("Failed to create %s: %m", t);
1012 if (mount(f, t, "bind", MS_BIND|MS_REC, NULL) < 0) {
1013 log_error("Failed to create /usr bind mount: %m");
1018 bind_mounted = true;
1020 r = bind_remount_recursive(t, true);
1022 log_error("Failed to remount %s read-only: %s", t, strerror(-r));
1026 if (mount(template, directory, NULL, MS_MOVE, NULL) < 0) {
1027 log_error("Failed to move root mount: %m");
/* Formats the 16 bytes of id as lowercase hex in the 8-4-4-4-12 UUID layout.
 *
 * The format string yields 32 hex digits plus 4 dashes, i.e. 36 characters,
 * which is why s is declared with room for 37 bytes (text plus NUL).
 * The formatting call and the return statement are not visible in this
 * excerpt; the function presumably returns s -- confirm. */
1045 static char* id128_format_as_uuid(sd_id128_t id, char s[37]) {
1048 "%02x%02x%02x%02x-%02x%02x-%02x%02x-%02x%02x-%02x%02x%02x%02x%02x%02x",
1049 SD_ID128_FORMAT_VAL(id));
/* Gives the container its own boot ID.
 *
 * A random 128-bit ID is generated, formatted as a UUID and written to
 * dest/dev/proc-sys-kernel-random-boot-id; that file is then bind mounted
 * over dest/proc/sys/kernel/random/boot_id and the bind mount is remounted
 * read-only. Failing to make it read-only only produces a warning.
 *
 * The function tests arg_share_system first; the statement under that test is
 * not visible in this excerpt. */
1054 static int setup_boot_id(const char *dest) {
1055 _cleanup_free_ char *from = NULL, *to = NULL;
1056 sd_id128_t rnd = {};
1062 if (arg_share_system)
1065 /* Generate a new randomized boot ID, so that each boot-up of
1066 * the container gets a new one */
1068 from = strappend(dest, "/dev/proc-sys-kernel-random-boot-id");
1069 to = strappend(dest, "/proc/sys/kernel/random/boot_id");
1073 r = sd_id128_randomize(&rnd);
1075 log_error("Failed to generate random boot id: %s", strerror(-r));
1079 id128_format_as_uuid(rnd, as_uuid);
1081 r = write_string_file(from, as_uuid);
1083 log_error("Failed to write boot id: %s", strerror(-r));
1087 if (mount(from, to, "bind", MS_BIND, NULL) < 0) {
1088 log_error("Failed to bind mount boot id: %m");
1090 } else if (mount(from, to, "bind", MS_BIND|MS_REMOUNT|MS_RDONLY, NULL))
1091 log_warning("Failed to make boot id read-only: %m");
/* Recreates a fixed set of host device nodes below dest/dev.
 *
 * devnodes is iterated with NULSTR_FOREACH; its entries are not visible in
 * this excerpt. For each name, /dev/<name> is stat()ed on the host: a missing
 * node (ENOENT) is not reported, any other stat() failure is logged, and a
 * node that is neither a character nor a block device is rejected. Otherwise
 * the parent directory of the target is created and the node is recreated
 * with mknod() using the host node's st_mode and st_rdev.
 *
 * NOTE(review): the mknod() failure message prints dest rather than the node
 * path held in to -- confirm and consider logging to instead. */
1097 static int copy_devnodes(const char *dest) {
1099 static const char devnodes[] =
1110 _cleanup_umask_ mode_t u;
1116 NULSTR_FOREACH(d, devnodes) {
1117 _cleanup_free_ char *from = NULL, *to = NULL;
1120 from = strappend("/dev/", d);
1121 to = strjoin(dest, "/dev/", d, NULL);
1125 if (stat(from, &st) < 0) {
1127 if (errno != ENOENT) {
1128 log_error("Failed to stat %s: %m", from);
1132 } else if (!S_ISCHR(st.st_mode) && !S_ISBLK(st.st_mode)) {
1134 log_error("%s is not a char or block device, cannot copy", from);
1138 r = mkdir_parents(to, 0775);
1140 log_error("Failed to create parent directory of %s: %s", to, strerror(-r));
1144 if (mknod(to, st.st_mode, st.st_rdev) < 0) {
1145 log_error("mknod(%s) failed: %m", dest);
/* Creates dest/dev/ptmx as a symlink to the relative target "pts/ptmx" and
 * logs an error if symlink() fails. The return statements are not visible in
 * this excerpt. */
1154 static int setup_ptmx(const char *dest) {
1155 _cleanup_free_ char *p = NULL;
1157 p = strappend(dest, "/dev/ptmx");
1161 if (symlink("pts/ptmx", p) < 0) {
1162 log_error("Failed to create /dev/ptmx symlink: %m");
/* Makes the TTY named by console available as dest/dev/console.
 *
 * The TTY is first set to mode 0600 with owner and group 0. A placeholder
 * device node is then created at dest/dev/console, reusing the file type and
 * device number of the host's /dev/null with permission bits 0600, and the
 * TTY is bind mounted on top of it. */
1169 static int setup_dev_console(const char *dest, const char *console) {
1170 _cleanup_umask_ mode_t u;
1180 if (stat("/dev/null", &st) < 0) {
1181 log_error("Failed to stat /dev/null: %m");
1185 r = chmod_and_chown(console, 0600, 0, 0);
1187 log_error("Failed to correct access mode for TTY: %s", strerror(-r));
1191 /* We need to bind mount the right tty to /dev/console since
1192 * ptys can only exist on pts file systems. To have something
1193 * to bind mount things on we create a device node first, and
1194 * use /dev/null for that since the cgroups device policy
1195 * allows us to create that freely, while we cannot create
1196 * /dev/console. (Note that the major minor doesn't actually
1197 * matter here, since we mount it over anyway). */
1199 to = strappenda(dest, "/dev/console");
1200 if (mknod(to, (st.st_mode & ~07777) | 0600, st.st_rdev) < 0) {
1201 log_error("mknod() for /dev/console failed: %m");
1205 if (mount(console, to, "bind", MS_BIND, NULL) < 0) {
1206 log_error("Bind mount for /dev/console failed: %m");
/* Provides a FIFO-backed /proc/kmsg inside the container.
 *
 * A FIFO is created at dest/dev/kmsg (mode 0600, owner and group 0) and bind
 * mounted onto dest/proc/kmsg. The FIFO is then opened and the descriptor is
 * sent over kmsg_socket as an SCM_RIGHTS control message, so that it stays
 * open while the child runs. The final step, described by the last comment
 * below, is not visible in this excerpt.
 *
 * kmsg_socket must be a valid descriptor (asserted). */
1213 static int setup_kmsg(const char *dest, int kmsg_socket) {
1214 _cleanup_free_ char *from = NULL, *to = NULL;
1216 _cleanup_umask_ mode_t u;
1218 struct cmsghdr cmsghdr;
1219 uint8_t buf[CMSG_SPACE(sizeof(int))];
1221 struct msghdr mh = {
1222 .msg_control = &control,
1223 .msg_controllen = sizeof(control),
1225 struct cmsghdr *cmsg;
1228 assert(kmsg_socket >= 0);
1232 /* We create the kmsg FIFO as /dev/kmsg, but immediately
1233 * delete it after bind mounting it to /proc/kmsg. While FIFOs
1234 * on the reading side behave very similar to /proc/kmsg,
1235 * their writing side behaves differently from /dev/kmsg in
1236 * that writing blocks when nothing is reading. In order to
1237 * avoid any problems with containers deadlocking due to this
1238 * we simply make /dev/kmsg unavailable to the container. */
1239 if (asprintf(&from, "%s/dev/kmsg", dest) < 0 ||
1240 asprintf(&to, "%s/proc/kmsg", dest) < 0)
1243 if (mkfifo(from, 0600) < 0) {
1244 log_error("mkfifo() for /dev/kmsg failed: %m");
1248 r = chmod_and_chown(from, 0600, 0, 0);
1250 log_error("Failed to correct access mode for /dev/kmsg: %s", strerror(-r));
1254 if (mount(from, to, "bind", MS_BIND, NULL) < 0) {
1255 log_error("Bind mount for /proc/kmsg failed: %m");
1259 fd = open(from, O_RDWR|O_NDELAY|O_CLOEXEC);
1261 log_error("Failed to open fifo: %m");
/* Build a single SCM_RIGHTS control message carrying fd. */
1265 cmsg = CMSG_FIRSTHDR(&mh);
1266 cmsg->cmsg_level = SOL_SOCKET;
1267 cmsg->cmsg_type = SCM_RIGHTS;
1268 cmsg->cmsg_len = CMSG_LEN(sizeof(int));
1269 memcpy(CMSG_DATA(cmsg), &fd, sizeof(int));
1271 mh.msg_controllen = cmsg->cmsg_len;
1273 /* Store away the fd in the socket, so that it stays open as
1274 * long as we run the child */
1275 k = sendmsg(kmsg_socket, &mh, MSG_DONTWAIT|MSG_NOSIGNAL);
1279 log_error("Failed to send FIFO fd: %m");
1283 /* And now make the FIFO unavailable as /dev/kmsg... */
/* Sets the hostname to arg_machine through sethostname_idempotent().
 *
 * The function tests arg_share_system first; the statement under that test
 * and the handling of a negative result are not visible in this excerpt. */
1288 static int setup_hostname(void) {
1290 if (arg_share_system)
1293 if (sethostname_idempotent(arg_machine) < 0)
/* Connects the container's journal directory with the host's, as selected by
 * arg_link_journal.
 *
 * The machine ID is read from directory/etc/machine-id and validated; when it
 * equals the host's machine ID the journals are not linked (a warning in
 * LINK_AUTO mode, an error otherwise). The two paths involved are
 *   p = /var/log/journal/<id> on the host, and
 *   q = directory/var/log/journal/<id> inside the container tree.
 * If either is already a mount point, that is an error unless the mode is
 * LINK_AUTO.
 *
 * In LINK_GUEST mode p becomes a symlink pointing to q. Otherwise p is bind
 * mounted onto q; in LINK_HOST mode p is created first, in the remaining
 * modes an access() check on p gates the bind mount (the action taken when
 * the check fails is not visible).
 *
 * Several conditions, the derivation of id, and all return statements are
 * not visible in this excerpt.
 *
 * NOTE(review): some messages use %m right after mkdir_p(), whose result is
 * kept in r and is treated as a negative errno elsewhere in this file
 * (strerror(-t) in mount_all) -- confirm that errno is what is wanted here. */
1299 static int setup_journal(const char *directory) {
1300 sd_id128_t machine_id, this_id;
1301 _cleanup_free_ char *p = NULL, *b = NULL, *q = NULL, *d = NULL;
1305 p = strappend(directory, "/etc/machine-id");
1309 r = read_one_line_file(p, &b);
1310 if (r == -ENOENT && arg_link_journal == LINK_AUTO)
1313 log_error("Failed to read machine ID from %s: %s", p, strerror(-r));
1318 if (isempty(id) && arg_link_journal == LINK_AUTO)
1321 /* Verify validity */
1322 r = sd_id128_from_string(id, &machine_id);
1324 log_error("Failed to parse machine ID from %s: %s", p, strerror(-r));
1328 r = sd_id128_get_machine(&this_id);
1330 log_error("Failed to retrieve machine ID: %s", strerror(-r));
1334 if (sd_id128_equal(machine_id, this_id)) {
1335 log_full(arg_link_journal == LINK_AUTO ? LOG_WARNING : LOG_ERR,
1336 "Host and machine ids are equal (%s): refusing to link journals", id);
1337 if (arg_link_journal == LINK_AUTO)
1343 if (arg_link_journal == LINK_NO)
/* From here on p is reused for the host-side journal directory. */
1347 p = strappend("/var/log/journal/", id);
1348 q = strjoin(directory, "/var/log/journal/", id, NULL);
1352 if (path_is_mount_point(p, false) > 0) {
1353 if (arg_link_journal != LINK_AUTO) {
1354 log_error("%s: already a mount point, refusing to use for journal", p);
1361 if (path_is_mount_point(q, false) > 0) {
1362 if (arg_link_journal != LINK_AUTO) {
1363 log_error("%s: already a mount point, refusing to use for journal", q);
/* Inspect what currently exists at p: a symlink, something else (-EINVAL),
 * or nothing (-ENOENT). */
1370 r = readlink_and_make_absolute(p, &d);
1372 if ((arg_link_journal == LINK_GUEST ||
1373 arg_link_journal == LINK_AUTO) &&
1376 r = mkdir_p(q, 0755);
1378 log_warning("Failed to create directory %s: %m", q);
1382 if (unlink(p) < 0) {
1383 log_error("Failed to remove symlink %s: %m", p);
1386 } else if (r == -EINVAL) {
1388 if (arg_link_journal == LINK_GUEST &&
1391 if (errno == ENOTDIR) {
1392 log_error("%s already exists and is neither a symlink nor a directory", p);
1395 log_error("Failed to remove %s: %m", p);
1399 } else if (r != -ENOENT) {
1400 log_error("readlink(%s) failed: %m", p);
1404 if (arg_link_journal == LINK_GUEST) {
1406 if (symlink(q, p) < 0) {
1407 log_error("Failed to symlink %s to %s: %m", q, p);
1411 r = mkdir_p(q, 0755);
1413 log_warning("Failed to create directory %s: %m", q);
1417 if (arg_link_journal == LINK_HOST) {
1418 r = mkdir_p(p, 0755);
1420 log_error("Failed to create %s: %m", p);
1424 } else if (access(p, F_OK) < 0)
1427 if (dir_is_empty(q) == 0)
1428 log_warning("%s is not empty, proceeding anyway.", q);
1430 r = mkdir_p(q, 0755);
1432 log_error("Failed to create %s: %m", q);
1436 if (mount(p, q, "bind", MS_BIND, NULL) < 0) {
1437 log_error("Failed to bind mount journal from host into guest: %m");
/* Creates the directory dest/dev/kdbus (mode 0755) and bind mounts path onto
 * it. Both failures are logged as errors; the return statements are not
 * visible in this excerpt. */
1444 static int setup_kdbus(const char *dest, const char *path) {
1450 p = strappenda(dest, "/dev/kdbus");
1451 if (mkdir(p, 0755) < 0) {
1452 log_error("Failed to create kdbus path: %m");
1456 if (mount(path, p, "bind", MS_BIND, NULL) < 0) {
1457 log_error("Failed to mount kdbus domain path: %m");
/* Passes the complement of arg_retain to capability_bounding_set_drop() and
 * returns its result. Presumably this removes every capability that is not
 * listed in arg_retain from the bounding set; the helper is defined elsewhere
 * -- confirm its exact semantics (including the meaning of the second
 * argument) there. */
1464 static int drop_capabilities(void) {
1465 return capability_bounding_set_drop(~arg_retain, false);
/* Announces the container to org.freedesktop.machine1 over the system bus.
 *
 * With arg_keep_unit set, the Manager method RegisterMachineWithNetwork is
 * called directly. Otherwise a CreateMachineWithNetwork call is assembled by
 * hand so that an array of (name, variant) properties can be attached: Slice
 * (only when arg_slice is non-empty), DevicePolicy set to "strict", and a
 * DeviceAllow list.
 *
 * In both variants local_ifindex is passed as an array that holds one entry
 * when it is positive and is empty otherwise.
 *
 * The DeviceAllow call announces 11 entries; fewer than that are visible in
 * this excerpt, as are parts of the surrounding comments. Several arguments
 * of the bus calls and all return statements are likewise not visible. */
1468 static int register_machine(pid_t pid, int local_ifindex) {
1469 _cleanup_bus_error_free_ sd_bus_error error = SD_BUS_ERROR_NULL;
1470 _cleanup_bus_close_unref_ sd_bus *bus = NULL;
1476 r = sd_bus_default_system(&bus);
1478 log_error("Failed to open system bus: %s", strerror(-r));
1482 if (arg_keep_unit) {
1483 r = sd_bus_call_method(
1485 "org.freedesktop.machine1",
1486 "/org/freedesktop/machine1",
1487 "org.freedesktop.machine1.Manager",
1488 "RegisterMachineWithNetwork",
1493 SD_BUS_MESSAGE_APPEND_ID128(arg_uuid),
1497 strempty(arg_directory),
1498 local_ifindex > 0 ? 1 : 0, local_ifindex);
1500 _cleanup_bus_message_unref_ sd_bus_message *m = NULL;
1502 r = sd_bus_message_new_method_call(
1505 "org.freedesktop.machine1",
1506 "/org/freedesktop/machine1",
1507 "org.freedesktop.machine1.Manager",
1508 "CreateMachineWithNetwork");
1510 log_error("Failed to create message: %s", strerror(-r));
1514 r = sd_bus_message_append(
1518 SD_BUS_MESSAGE_APPEND_ID128(arg_uuid),
1522 strempty(arg_directory),
1523 local_ifindex > 0 ? 1 : 0, local_ifindex);
1525 log_error("Failed to append message arguments: %s", strerror(-r));
/* Properties are appended inside an array of (string, variant) structs. */
1529 r = sd_bus_message_open_container(m, 'a', "(sv)");
1531 log_error("Failed to open container: %s", strerror(-r));
1535 if (!isempty(arg_slice)) {
1536 r = sd_bus_message_append(m, "(sv)", "Slice", "s", arg_slice);
1538 log_error("Failed to append slice: %s", strerror(-r));
1543 r = sd_bus_message_append(m, "(sv)", "DevicePolicy", "s", "strict");
1545 log_error("Failed to add device policy: %s", strerror(-r));
1549 r = sd_bus_message_append(m, "(sv)", "DeviceAllow", "a(ss)", 11,
1550 /* Allow the container to
1551 * access and create the API
1552 * device nodes, so that
1553 * PrivateDevices= in the
1554 * container can work
1559 "/dev/random", "rwm",
1560 "/dev/urandom", "rwm",
1562 "/dev/net/tun", "rwm",
1563 /* Allow the container
1564 * access to ptys. However,
1566 * container to ever create
1567 * these device nodes. */
1568 "/dev/pts/ptmx", "rw",
1570 /* Allow the container
1571 * access to all kdbus
1572 * devices. Again, the
1573 * container cannot create
1574 * these nodes, only use
1575 * them. We use a pretty
1576 * open match here, so that
1577 * the kernel API can still
1580 "char-kdbus/*", "rw");
1582 log_error("Failed to add device whitelist: %s", strerror(-r));
1586 r = sd_bus_message_close_container(m);
1588 log_error("Failed to close container: %s", strerror(-r));
1592 r = sd_bus_call(bus, m, 0, &error, NULL);
1596 log_error("Failed to register machine: %s", bus_error_message(&error, r));
/* Asks org.freedesktop.machine1 to terminate the machine belonging to pid.
 *
 * Two bus calls are made on the system bus: one on the Manager interface
 * whose reply carries the machine's object path (read with signature "o"),
 * and one on the Machine interface of that object. The method names and
 * remaining arguments are not visible in this excerpt.
 *
 * Failures of either call are only logged at debug level; as the comment
 * below explains for the first call, the machine may already be gone. */
1603 static int terminate_machine(pid_t pid) {
1604 _cleanup_bus_error_free_ sd_bus_error error = SD_BUS_ERROR_NULL;
1605 _cleanup_bus_message_unref_ sd_bus_message *reply = NULL;
1606 _cleanup_bus_close_unref_ sd_bus *bus = NULL;
1613 r = sd_bus_default_system(&bus);
1615 log_error("Failed to open system bus: %s", strerror(-r));
1619 r = sd_bus_call_method(
1621 "org.freedesktop.machine1",
1622 "/org/freedesktop/machine1",
1623 "org.freedesktop.machine1.Manager",
1630 /* Note that the machine might already have been
1631 * cleaned up automatically, hence don't consider it a
1632 * failure if we cannot get the machine object. */
1633 log_debug("Failed to get machine: %s", bus_error_message(&error, r));
1637 r = sd_bus_message_read(reply, "o", &path);
1639 return bus_log_parse_error(r);
1641 r = sd_bus_call_method(
1643 "org.freedesktop.machine1",
1645 "org.freedesktop.machine1.Machine",
1651 log_debug("Failed to terminate machine: %s", bus_error_message(&error, r));
/* Resets the audit login UID of the current process.
 *
 * /proc/self/loginuid is read; if it already contains "4294967295" nothing
 * more is done, otherwise that value is written to it. A failed write is
 * reported with a long explanatory message. The message announces a five
 * second sleep; the call performing it is not visible in this excerpt.
 *
 * The function tests arg_share_system first; the statement under that test is
 * not visible either. */
1658 static int reset_audit_loginuid(void) {
1659 _cleanup_free_ char *p = NULL;
1662 if (arg_share_system)
1665 r = read_one_line_file("/proc/self/loginuid", &p);
1669 log_error("Failed to read /proc/self/loginuid: %s", strerror(-r));
1673 /* Already reset? */
1674 if (streq(p, "4294967295"))
1677 r = write_string_file("/proc/self/loginuid", "4294967295");
1679 log_error("Failed to reset audit login UID. This probably means that your kernel is too\n"
1680 "old and you have audit enabled. Note that the auditing subsystem is known to\n"
1681 "be incompatible with containers on old kernels. Please make sure to upgrade\n"
1682 "your kernel or to off auditing with 'audit=0' on the kernel command line before\n"
1683 "using systemd-nspawn. Sleeping for 5s... (%s)\n", strerror(-r));
/* Fixed 128-bit keys handed to generate_mac() as hash_key: one for the host
 * side and one for the container side of the veth link, so that the two ends
 * are hashed with different keys. */
1691 #define HOST_HASH_KEY SD_ID128_MAKE(1a,37,6f,c7,46,ec,45,0b,ad,a3,d5,31,06,60,5d,b1)
1692 #define CONTAINER_HASH_KEY SD_ID128_MAKE(c3,c4,f9,19,b5,57,b2,1c,e6,cf,14,27,03,9c,ee,a2)
/* Derives a MAC address from the host machine ID and the container name.
 *
 * The buffer v holds the machine ID (sd_id128_get_machine) followed by the
 * bytes of arg_machine. It is hashed with siphash24 using hash_key, and the
 * first ETH_ALEN bytes of the hash result are copied into mac. Finally the
 * multicast bit is cleared and the locally-assigned bit is set in the first
 * octet. */
1694 static int generate_mac(struct ether_addr *mac, sd_id128_t hash_key) {
/* Total input size: one machine ID plus the name, without its NUL byte. */
1701 l = strlen(arg_machine);
1702 sz = sizeof(sd_id128_t) + l;
1705 /* fetch some persistent data unique to the host */
1706 r = sd_id128_get_machine((sd_id128_t*) v);
1710 /* combine with some data unique (on this host) to this
1711 * container instance */
1712 memcpy(v + sizeof(sd_id128_t), arg_machine, l);
1714 /* Let's hash the host machine ID plus the container name. We
1715 * use a fixed, but originally randomly created hash key here. */
1716 siphash24(result, v, sz, hash_key.bytes);
/* Compile-time check that the hash output is large enough for a MAC. */
1718 assert_cc(ETH_ALEN <= sizeof(result));
1719 memcpy(mac->ether_addr_octet, result, ETH_ALEN);
1721 /* see eth_random_addr in the kernel */
1722 mac->ether_addr_octet[0] &= 0xfe; /* clear multicast bit */
1723 mac->ether_addr_octet[0] |= 0x02; /* set local assignment bit (IEEE802) */
/* Creates a veth pair through rtnetlink.
 *
 * The host-side interface name is written into iface_name as "vb-<machine>"
 * when arg_network_bridge is set and "ve-<machine>" otherwise. The peer is
 * named "host0" and carries IFLA_NET_NS_PID set to pid. Both ends get a MAC
 * address from generate_mac(), each with its own hash key.
 *
 * arg_private_network and arg_network_veth are checked up front (presumably
 * the function is a no-op when either is unset; confirm). After the netlink
 * call the host-side name is resolved with if_nametoindex(); the result is
 * presumably handed back through ifi (assignment not visible here). */
1728 static int setup_veth(pid_t pid, char iface_name[IFNAMSIZ], int *ifi) {
1729 _cleanup_rtnl_message_unref_ sd_rtnl_message *m = NULL;
1730 _cleanup_rtnl_unref_ sd_rtnl *rtnl = NULL;
1731 struct ether_addr mac_host, mac_container;
1734 if (!arg_private_network)
1737 if (!arg_network_veth)
1740 /* Use two different interface name prefixes depending whether
1741 * we are in bridge mode or not. */
/* NOTE(review): snprintf() is given IFNAMSIZ - 1 as the buffer size, so at
 * most IFNAMSIZ - 2 name characters are stored; longer names are truncated. */
1742 snprintf(iface_name, IFNAMSIZ - 1, "%s-%s",
1743 arg_network_bridge ? "vb" : "ve", arg_machine);
1745 r = generate_mac(&mac_container, CONTAINER_HASH_KEY);
1747 log_error("Failed to generate predictable MAC address for container side");
1751 r = generate_mac(&mac_host, HOST_HASH_KEY);
1753 log_error("Failed to generate predictable MAC address for host side");
1757 r = sd_rtnl_open(&rtnl, 0);
1759 log_error("Failed to connect to netlink: %s", strerror(-r));
/* Build an RTM_NEWLINK request; name and MAC of the host side come first. */
1763 r = sd_rtnl_message_new_link(rtnl, &m, RTM_NEWLINK, 0);
1765 log_error("Failed to allocate netlink message: %s", strerror(-r));
1769 r = sd_rtnl_message_append_string(m, IFLA_IFNAME, iface_name);
1771 log_error("Failed to add netlink interface name: %s", strerror(-r));
1775 r = sd_rtnl_message_append_ether_addr(m, IFLA_ADDRESS, &mac_host);
1777 log_error("Failed to add netlink MAC address: %s", strerror(-r));
/* Three nested containers are opened: IFLA_LINKINFO, then IFLA_INFO_DATA
 * for kind "veth", then VETH_INFO_PEER which describes the peer. */
1781 r = sd_rtnl_message_open_container(m, IFLA_LINKINFO);
1783 log_error("Failed to open netlink container: %s", strerror(-r));
1787 r = sd_rtnl_message_open_container_union(m, IFLA_INFO_DATA, "veth");
1789 log_error("Failed to open netlink container: %s", strerror(-r));
1793 r = sd_rtnl_message_open_container(m, VETH_INFO_PEER);
1795 log_error("Failed to open netlink container: %s", strerror(-r));
1799 r = sd_rtnl_message_append_string(m, IFLA_IFNAME, "host0");
1801 log_error("Failed to add netlink interface name: %s", strerror(-r));
1805 r = sd_rtnl_message_append_ether_addr(m, IFLA_ADDRESS, &mac_container);
1807 log_error("Failed to add netlink MAC address: %s", strerror(-r));
1811 r = sd_rtnl_message_append_u32(m, IFLA_NET_NS_PID, pid);
1813 log_error("Failed to add netlink namespace field: %s", strerror(-r));
/* Close the three containers opened above, innermost first. */
1817 r = sd_rtnl_message_close_container(m);
1819 log_error("Failed to close netlink container: %s", strerror(-r));
1823 r = sd_rtnl_message_close_container(m);
1825 log_error("Failed to close netlink container: %s", strerror(-r));
1829 r = sd_rtnl_message_close_container(m);
1831 log_error("Failed to close netlink container: %s", strerror(-r));
1835 r = sd_rtnl_call(rtnl, m, 0, NULL);
1837 log_error("Failed to add new veth interfaces: %s", strerror(-r));
/* Look up the index of the freshly created host-side interface. */
1841 i = (int) if_nametoindex(iface_name);
1843 log_error("Failed to resolve interface %s: %m", iface_name);
/* Attaches the interface named veth_name to the bridge arg_network_bridge
 * and brings it up.
 *
 * The bridge name is resolved to an index with if_nametoindex(). An
 * RTM_SETLINK message is then sent carrying the IFF_UP flag, the interface
 * name and IFLA_MASTER set to the bridge index.
 *
 * arg_private_network, arg_network_veth and arg_network_bridge are checked
 * first (presumably the function does nothing when any is unset; confirm).
 * How ifi is used is not visible here. */
1852 static int setup_bridge(const char veth_name[], int *ifi) {
1853 _cleanup_rtnl_message_unref_ sd_rtnl_message *m = NULL;
1854 _cleanup_rtnl_unref_ sd_rtnl *rtnl = NULL;
1857 if (!arg_private_network)
1860 if (!arg_network_veth)
1863 if (!arg_network_bridge)
1866 bridge = (int) if_nametoindex(arg_network_bridge);
1868 log_error("Failed to resolve interface %s: %m", arg_network_bridge);
1874 r = sd_rtnl_open(&rtnl, 0);
1876 log_error("Failed to connect to netlink: %s", strerror(-r));
1880 r = sd_rtnl_message_new_link(rtnl, &m, RTM_SETLINK, 0);
1882 log_error("Failed to allocate netlink message: %s", strerror(-r));
/* Mask and value are both IFF_UP, i.e. only that flag is touched and set. */
1886 r = sd_rtnl_message_link_set_flags(m, IFF_UP, IFF_UP);
1888 log_error("Failed to set IFF_UP flag: %s", strerror(-r));
1892 r = sd_rtnl_message_append_string(m, IFLA_IFNAME, veth_name);
1894 log_error("Failed to add netlink interface name field: %s", strerror(-r));
1898 r = sd_rtnl_message_append_u32(m, IFLA_MASTER, bridge);
1900 log_error("Failed to add netlink master field: %s", strerror(-r));
1904 r = sd_rtnl_call(rtnl, m, 0, NULL);
1906 log_error("Failed to add veth interface to bridge: %s", strerror(-r));
/* Resolves the network interface name to an index and checks with udev that
 * the interface has been initialized.
 *
 * The index from if_nametoindex() is formatted as the udev device id
 * "n<index>" and looked up with udev_device_new_from_device_id(). An
 * interface that udev does not report as initialized is rejected with an
 * error message. Callers in this file use the return value as the interface
 * index. */
1913 static int parse_interface(struct udev *udev, const char *name) {
1914 _cleanup_udev_device_unref_ struct udev_device *d = NULL;
1915 char ifi_str[2 + DECIMAL_STR_MAX(int)];
1918 ifi = (int) if_nametoindex(name);
1920 log_error("Failed to resolve interface %s: %m", name);
1924 sprintf(ifi_str, "n%i", ifi);
1925 d = udev_device_new_from_device_id(udev, ifi_str);
1927 log_error("Failed to get udev device for interface %s: %m", name);
1931 if (udev_device_get_is_initialized(d) <= 0) {
1932 log_error("Network interface %s is not initialized yet.", name);
/* Moves every interface listed in arg_network_interfaces into the network
 * namespace of the process pid.
 *
 * For each name the index is obtained via parse_interface(), and an
 * RTM_SETLINK message for that index carrying IFLA_NET_NS_PID = pid is sent
 * over rtnetlink.
 *
 * arg_private_network and an empty interface list are checked first
 * (presumably both lead to an early return; confirm). */
1939 static int move_network_interfaces(pid_t pid) {
1940 _cleanup_udev_unref_ struct udev *udev = NULL;
1941 _cleanup_rtnl_unref_ sd_rtnl *rtnl = NULL;
1945 if (!arg_private_network)
1948 if (strv_isempty(arg_network_interfaces))
1951 r = sd_rtnl_open(&rtnl, 0);
1953 log_error("Failed to connect to netlink: %s", strerror(-r));
1959 log_error("Failed to connect to udev.");
1963 STRV_FOREACH(i, arg_network_interfaces) {
1964 _cleanup_rtnl_message_unref_ sd_rtnl_message *m = NULL;
1967 ifi = parse_interface(udev, *i);
1971 r = sd_rtnl_message_new_link(rtnl, &m, RTM_SETLINK, ifi);
1973 log_error("Failed to allocate netlink message: %s", strerror(-r));
1977 r = sd_rtnl_message_append_u32(m, IFLA_NET_NS_PID, pid);
1979 log_error("Failed to append namespace PID to netlink message: %s", strerror(-r));
1983 r = sd_rtnl_call(rtnl, m, 0, NULL);
1985 log_error("Failed to move interface %s to namespace: %s", *i, strerror(-r));
/* Creates one macvlan interface per entry of arg_network_macvlan and places
 * it in the network namespace of the process pid.
 *
 * For each parent interface name an RTM_NEWLINK message is built with:
 *   - IFLA_LINK set to the parent index from parse_interface(),
 *   - IFLA_IFNAME set to "mv-" plus the parent name, shortened to
 *     IFNAMSIZ-1 characters,
 *   - IFLA_NET_NS_PID set to pid,
 *   - link info of kind "macvlan" with mode MACVLAN_MODE_BRIDGE.
 *
 * arg_private_network and an empty list are checked first (presumably both
 * lead to an early return; confirm). */
1993 static int setup_macvlan(pid_t pid) {
1994 _cleanup_udev_unref_ struct udev *udev = NULL;
1995 _cleanup_rtnl_unref_ sd_rtnl *rtnl = NULL;
1999 if (!arg_private_network)
2002 if (strv_isempty(arg_network_macvlan))
2005 r = sd_rtnl_open(&rtnl, 0);
2007 log_error("Failed to connect to netlink: %s", strerror(-r));
2013 log_error("Failed to connect to udev.");
2017 STRV_FOREACH(i, arg_network_macvlan) {
2018 _cleanup_rtnl_message_unref_ sd_rtnl_message *m = NULL;
2019 _cleanup_free_ char *n = NULL;
2022 ifi = parse_interface(udev, *i);
2026 r = sd_rtnl_message_new_link(rtnl, &m, RTM_NEWLINK, 0);
2028 log_error("Failed to allocate netlink message: %s", strerror(-r));
2032 r = sd_rtnl_message_append_u32(m, IFLA_LINK, ifi);
2034 log_error("Failed to add netlink interface index: %s", strerror(-r));
/* Name of the new interface: "mv-" prefix plus the parent name. */
2038 n = strappend("mv-", *i);
2042 strshorten(n, IFNAMSIZ-1);
2044 r = sd_rtnl_message_append_string(m, IFLA_IFNAME, n);
2046 log_error("Failed to add netlink interface name: %s", strerror(-r));
2050 r = sd_rtnl_message_append_u32(m, IFLA_NET_NS_PID, pid);
2052 log_error("Failed to add netlink namespace field: %s", strerror(-r));
/* Two nested containers: IFLA_LINKINFO, then IFLA_INFO_DATA for "macvlan". */
2056 r = sd_rtnl_message_open_container(m, IFLA_LINKINFO);
2058 log_error("Failed to open netlink container: %s", strerror(-r));
2062 r = sd_rtnl_message_open_container_union(m, IFLA_INFO_DATA, "macvlan");
2064 log_error("Failed to open netlink container: %s", strerror(-r));
2068 r = sd_rtnl_message_append_u32(m, IFLA_MACVLAN_MODE, MACVLAN_MODE_BRIDGE);
2070 log_error("Failed to append macvlan mode: %s", strerror(-r));
2074 r = sd_rtnl_message_close_container(m);
2076 log_error("Failed to close netlink container: %s", strerror(-r));
2080 r = sd_rtnl_message_close_container(m);
2082 log_error("Failed to close netlink container: %s", strerror(-r));
2086 r = sd_rtnl_call(rtnl, m, 0, NULL);
2088 log_error("Failed to add new macvlan interfaces: %s", strerror(-r));
/* Installs a seccomp filter for the container payload.
 *
 * The filter is created with default action SCMP_ACT_ALLOW. Every syscall in
 * the blacklist table gets a rule that fails it with EPERM; a syscall that
 * cannot be added is skipped in one case (see the "unknown syscall" branch).
 * One more rule makes a syscall fail with EAFNOSUPPORT when argument 0 is
 * AF_NETLINK and argument 2 is NETLINK_AUDIT; per the existing comment this
 * targets socket(). The SCMP_FLTATR_CTL_NNP attribute is set to 0 before the
 * filter is loaded, and the filter context is released afterwards. */
2096 static int setup_seccomp(void) {
2099 static const int blacklist[] = {
2100 SCMP_SYS(kexec_load),
2101 SCMP_SYS(open_by_handle_at),
2102 SCMP_SYS(init_module),
2103 SCMP_SYS(finit_module),
2104 SCMP_SYS(delete_module),
2111 scmp_filter_ctx seccomp;
2115 seccomp = seccomp_init(SCMP_ACT_ALLOW);
2119 r = seccomp_add_secondary_archs(seccomp);
2121 log_error("Failed to add secondary archs to seccomp filter: %s", strerror(-r));
2125 for (i = 0; i < ELEMENTSOF(blacklist); i++) {
2126 r = seccomp_rule_add(seccomp, SCMP_ACT_ERRNO(EPERM), blacklist[i], 0);
2128 continue; /* unknown syscall */
2130 log_error("Failed to block syscall: %s", strerror(-r));
2136 Audit is broken in containers, much of the userspace audit
2137 hookup will fail if running inside a container. We don't
2138 care and just turn off creation of audit sockets.
2140 This will make socket(AF_NETLINK, *, NETLINK_AUDIT) fail
2141 with EAFNOSUPPORT which audit userspace uses as indication
2142 that audit is disabled in the kernel.
2145 r = seccomp_rule_add(
2147 SCMP_ACT_ERRNO(EAFNOSUPPORT),
2150 SCMP_A0(SCMP_CMP_EQ, AF_NETLINK),
2151 SCMP_A2(SCMP_CMP_EQ, NETLINK_AUDIT));
2153 log_error("Failed to add audit seccomp rule: %s", strerror(-r));
2157 r = seccomp_attr_set(seccomp, SCMP_FLTATR_CTL_NNP, 0);
2159 log_error("Failed to unset NO_NEW_PRIVS: %s", strerror(-r));
2163 r = seccomp_load(seccomp);
2165 log_error("Failed to install seccomp audit filter: %s", strerror(-r));
2168 seccomp_release(seccomp);
/* Opens arg_image and, for a regular file, binds it to a free loop device.
 *
 * arg_image is opened read-only or read-write depending on arg_read_only.
 * If it is a block device its path is duplicated (the rest of that branch is
 * not visible here). Anything that is neither a block device nor a regular
 * file is rejected. For a regular file a free loop number is requested from
 * /dev/loop-control, /dev/loop<nr> is opened, the image fd is attached with
 * LOOP_SET_FD, and the loop flags (autoclear, partition scan, plus read-only
 * in one branch) are applied with LOOP_SET_STATUS64. The loop device path is
 * stored in *device_path. The caller in this file treats the return value as
 * a file descriptor; how loop_nr is filled in is not visible here.
 *
 * NOTE(review): the "not a regular file or block device" message ends in %m
 * although no failing call is visible right before it, so errno may be
 * unrelated at that point. Confirm. */
2176 static int setup_image(char **device_path, int *loop_nr) {
2177 struct loop_info64 info = {
2178 .lo_flags = LO_FLAGS_AUTOCLEAR|LO_FLAGS_PARTSCAN
2180 _cleanup_close_ int fd = -1, control = -1, loop = -1;
2181 _cleanup_free_ char* loopdev = NULL;
2185 assert(device_path);
2188 fd = open(arg_image, O_CLOEXEC|(arg_read_only ? O_RDONLY : O_RDWR)|O_NONBLOCK|O_NOCTTY);
2190 log_error("Failed to open %s: %m", arg_image);
2194 if (fstat(fd, &st) < 0) {
2195 log_error("Failed to stat %s: %m", arg_image);
2199 if (S_ISBLK(st.st_mode)) {
2202 p = strdup(arg_image);
2216 if (!S_ISREG(st.st_mode)) {
2217 log_error("%s is not a regular file or block device: %m", arg_image);
/* Regular file: allocate a loop device for it. */
2221 control = open("/dev/loop-control", O_RDWR|O_CLOEXEC|O_NOCTTY|O_NONBLOCK);
2223 log_error("Failed to open /dev/loop-control: %m");
2227 nr = ioctl(control, LOOP_CTL_GET_FREE);
2229 log_error("Failed to allocate loop device: %m");
2233 if (asprintf(&loopdev, "/dev/loop%i", nr) < 0)
2236 loop = open(loopdev, O_CLOEXEC|(arg_read_only ? O_RDONLY : O_RDWR)|O_NONBLOCK|O_NOCTTY);
2238 log_error("Failed to open loop device %s: %m", loopdev);
2242 if (ioctl(loop, LOOP_SET_FD, fd) < 0) {
2243 log_error("Failed to set loopback file descriptor on %s: %m", loopdev);
2248 info.lo_flags |= LO_FLAGS_READ_ONLY;
2250 if (ioctl(loop, LOOP_SET_STATUS64, &info) < 0) {
2251 log_error("Failed to set loopback settings on %s: %m", loopdev);
2255 *device_path = loopdev;
/* Finds the root, /home and /srv partitions of a GPT disk image.
 *
 * Steps visible in this function:
 *   1. A libblkid probe is set up on fd with partition probing enabled. A
 *      safeprobe result of -2 or 1 is reported as "no partition table".
 *   2. The PTTYPE value must be "gpt".
 *   3. The block device behind fd is looked up in udev and its child
 *      devices are enumerated.
 *   4. Each child is matched against the blkid partition list by device
 *      number. The partition flags, number and type string are read, and
 *      the type UUID is compared with GPT_HOME, GPT_SRV and, when defined,
 *      GPT_ROOT_NATIVE and GPT_ROOT_SECONDARY.
 *   5. For a matching type the device node path is duplicated and a
 *      read-write flag is derived as the inverse of GPT_FLAG_READ_ONLY.
 *
 * The checks of the form "already have one and nr >= previous nr" suggest
 * that the lowest-numbered partition of each type wins (presumably via
 * continue; confirm). Partitions with GPT_FLAG_NO_AUTO are tested before
 * classification (presumably skipped; confirm).
 *
 * On output *root_device is the native root if found, otherwise the
 * secondary root; having neither is an error. The home and srv outputs are
 * filled from the corresponding matches. The last log_error line belongs to
 * the build variant without blkid support. */
2266 static int dissect_image(
2268 char **root_device, bool *root_device_rw,
2269 char **home_device, bool *home_device_rw,
2270 char **srv_device, bool *srv_device_rw,
2274 int home_nr = -1, root_nr = -1, secondary_root_nr = -1, srv_nr = -1;
2275 _cleanup_free_ char *home = NULL, *root = NULL, *secondary_root = NULL, *srv = NULL;
2276 _cleanup_udev_enumerate_unref_ struct udev_enumerate *e = NULL;
2277 _cleanup_udev_device_unref_ struct udev_device *d = NULL;
2278 _cleanup_blkid_free_probe_ blkid_probe b = NULL;
2279 _cleanup_udev_unref_ struct udev *udev = NULL;
2280 struct udev_list_entry *first, *item;
2281 bool home_rw = true, root_rw = true, secondary_root_rw = true, srv_rw = true;
2282 const char *pttype = NULL;
2288 assert(root_device);
2289 assert(home_device);
2293 b = blkid_new_probe();
2298 r = blkid_probe_set_device(b, fd, 0, 0);
2303 log_error("Failed to set device on blkid probe: %m");
2307 blkid_probe_enable_partitions(b, 1);
2308 blkid_probe_set_partitions_flags(b, BLKID_PARTS_ENTRY_DETAILS);
2311 r = blkid_do_safeprobe(b);
2312 if (r == -2 || r == 1) {
2313 log_error("Failed to identify any partition table on %s.\n"
2314 "Note that the disk image needs to follow http://www.freedesktop.org/wiki/Specifications/DiscoverablePartitionsSpec/ to be supported by systemd-nspawn.", arg_image);
2316 } else if (r != 0) {
2319 log_error("Failed to probe: %m");
/* pttype stays NULL if the lookup finds nothing; streq_ptr is used for the
 * comparison below. */
2323 blkid_probe_lookup_value(b, "PTTYPE", &pttype, NULL);
2324 if (!streq_ptr(pttype, "gpt")) {
2325 log_error("Image %s does not carry a GUID Partition Table.\n"
2326 "Note that the disk image needs to follow http://www.freedesktop.org/wiki/Specifications/DiscoverablePartitionsSpec/ to be supported by systemd-nspawn.", arg_image);
2331 pl = blkid_probe_get_partitions(b);
2336 log_error("Failed to list partitions of %s", arg_image);
2344 if (fstat(fd, &st) < 0) {
2345 log_error("Failed to stat block device: %m");
/* Find the udev device for the whole disk and enumerate its children. */
2349 d = udev_device_new_from_devnum(udev, 'b', st.st_rdev);
2353 e = udev_enumerate_new(udev);
2357 r = udev_enumerate_add_match_parent(e, d);
2361 r = udev_enumerate_scan_devices(e);
2363 log_error("Failed to scan for partition devices of %s: %s", arg_image, strerror(-r));
2367 first = udev_enumerate_get_list_entry(e);
2368 udev_list_entry_foreach(item, first) {
2369 _cleanup_udev_device_unref_ struct udev_device *q;
2370 const char *stype, *node;
2371 unsigned long long flags;
2378 q = udev_device_new_from_syspath(udev, udev_list_entry_get_name(item));
2383 log_error("Failed to get partition device of %s: %m", arg_image);
2387 qn = udev_device_get_devnum(q);
/* An entry with the same device number as fd is the whole disk itself. */
2391 if (st.st_rdev == qn)
2394 node = udev_device_get_devnode(q);
2398 pp = blkid_partlist_devno_to_partition(pl, qn);
2402 flags = blkid_partition_get_flags(pp);
2403 if (flags & GPT_FLAG_NO_AUTO)
2406 nr = blkid_partition_get_partno(pp);
2410 stype = blkid_partition_get_type_string(pp);
2414 if (sd_id128_from_string(stype, &type_id) < 0)
2417 if (sd_id128_equal(type_id, GPT_HOME)) {
2419 if (home && nr >= home_nr)
2423 home_rw = !(flags & GPT_FLAG_READ_ONLY);
2426 home = strdup(node);
2429 } else if (sd_id128_equal(type_id, GPT_SRV)) {
2431 if (srv && nr >= srv_nr)
2435 srv_rw = !(flags & GPT_FLAG_READ_ONLY);
2442 #ifdef GPT_ROOT_NATIVE
2443 else if (sd_id128_equal(type_id, GPT_ROOT_NATIVE)) {
2445 if (root && nr >= root_nr)
2449 root_rw = !(flags & GPT_FLAG_READ_ONLY);
2452 root = strdup(node);
2457 #ifdef GPT_ROOT_SECONDARY
2458 else if (sd_id128_equal(type_id, GPT_ROOT_SECONDARY)) {
2460 if (secondary_root && nr >= secondary_root_nr)
2463 secondary_root_nr = nr;
2464 secondary_root_rw = !(flags & GPT_FLAG_READ_ONLY);
2467 free(secondary_root);
2468 secondary_root = strdup(node);
2469 if (!secondary_root)
2475 if (!root && !secondary_root) {
2476 log_error("Failed to identify root partition in disk image %s.\n"
2477 "Note that the disk image needs to follow http://www.freedesktop.org/wiki/Specifications/DiscoverablePartitionsSpec/ to be supported by systemd-nspawn.", arg_image);
/* Hand the results to the caller. The local secondary_root pointer is set
 * to NULL after being handed over, which keeps the cleanup attribute from
 * freeing it. */
2482 *root_device = root;
2485 *root_device_rw = root_rw;
2487 } else if (secondary_root) {
2488 *root_device = secondary_root;
2489 secondary_root = NULL;
2491 *root_device_rw = secondary_root_rw;
2496 *home_device = home;
2499 *home_device_rw = home_rw;
2506 *srv_device_rw = srv_rw;
2511 log_error("--image= is not supported, compiled without blkid support.");
/* Mounts the block device what below where, after detecting its file system
 * type with libblkid.
 *
 * The mount point p is built from where and directory. The probe is
 * restricted to superblock TYPE information. A device whose TYPE is
 * "crypto_LUKS" is rejected. The mount uses MS_NODEV, plus MS_RDONLY when rw
 * is false. The last log_error line belongs to the build variant without
 * blkid support.
 *
 * NOTE(review): after blkid_do_safeprobe() this function treats -1 or 1 as
 * "cannot determine file system type", whereas dissect_image() in this file
 * tests for -2 or 1. Check the libblkid documentation to see which value is
 * meant here. */
2516 static int mount_device(const char *what, const char *where, const char *directory, bool rw) {
2518 _cleanup_blkid_free_probe_ blkid_probe b = NULL;
2519 const char *fstype, *p;
2529 p = strappenda(where, directory);
2534 b = blkid_new_probe_from_filename(what);
2538 log_error("Failed to allocate prober for %s: %m", what);
2542 blkid_probe_enable_superblocks(b, 1);
2543 blkid_probe_set_superblocks_flags(b, BLKID_SUBLKS_TYPE);
2546 r = blkid_do_safeprobe(b);
2547 if (r == -1 || r == 1) {
2548 log_error("Cannot determine file system type of %s", what);
2550 } else if (r != 0) {
2553 log_error("Failed to probe %s: %m", what);
2558 if (blkid_probe_lookup_value(b, "TYPE", &fstype, NULL) < 0) {
2561 log_error("Failed to determine file system type of %s", what);
2565 if (streq(fstype, "crypto_LUKS")) {
2566 log_error("nspawn currently does not support LUKS disk images.");
/* Device nodes on the image are never honoured; read-only if requested. */
2570 if (mount(what, p, fstype, MS_NODEV|(rw ? 0 : MS_RDONLY), NULL) < 0) {
2571 log_error("Failed to mount %s: %m", what);
2577 log_error("--image= is not supported, compiled without blkid support.");
/* Mounts the partitions found in a disk image via mount_device(): the root
 * device at arg_directory itself (directory argument NULL), the home device
 * at "/home" below it and the srv device at "/srv" below it, each with its
 * own read-write flag. Whether a device that was not found is skipped is not
 * visible here (presumably each call is guarded; confirm). */
2582 static int mount_devices(
2584 const char *root_device, bool root_device_rw,
2585 const char *home_device, bool home_device_rw,
2586 const char *srv_device, bool srv_device_rw) {
2592 r = mount_device(root_device, arg_directory, NULL, root_device_rw);
2594 log_error("Failed to mount root directory: %s", strerror(-r));
2600 r = mount_device(home_device, arg_directory, "/home", home_device_rw);
2602 log_error("Failed to mount home directory: %s", strerror(-r));
2608 r = mount_device(srv_device, arg_directory, "/srv", srv_device_rw);
2610 log_error("Failed to mount server data directory: %s", strerror(-r));
/* Best-effort teardown of the loop device with number nr.
 *
 * If image_fd points to an open descriptor, LOOP_CLR_FD is issued on it and
 * the descriptor is closed; *image_fd is replaced by the value safe_close()
 * returns. The loop device is then removed through LOOP_CTL_REMOVE on
 * /dev/loop-control. Problems are logged as warnings only; the function
 * returns nothing. */
2618 static void loop_remove(int nr, int *image_fd) {
2619 _cleanup_close_ int control = -1;
2625 if (image_fd && *image_fd >= 0) {
2626 r = ioctl(*image_fd, LOOP_CLR_FD);
2628 log_warning("Failed to close loop image: %m");
2629 *image_fd = safe_close(*image_fd);
2632 control = open("/dev/loop-control", O_RDWR|O_CLOEXEC|O_NOCTTY|O_NONBLOCK);
2634 log_warning("Failed to open /dev/loop-control: %m");
2638 r = ioctl(control, LOOP_CTL_REMOVE, nr);
2640 log_warning("Failed to remove loop %d: %m", nr);
/* Forks a child that runs "getent <database> <key>" with its standard output
 * connected to a pipe.
 *
 * In the child the write end of the pipe becomes stdout, stdin and stderr
 * are pointed at /dev/null, signal handlers are reset, all other descriptors
 * are closed, and getent is executed with an empty environment, trying
 * /usr/bin/getent first and /bin/getent second. In the parent the write end
 * of the pipe is closed.
 *
 * Callers in this file use the return value as a readable file descriptor
 * and wait on the pid passed back through rpid (both assignments are not
 * visible here; confirm). */
2643 static int spawn_getent(const char *database, const char *key, pid_t *rpid) {
2651 if (pipe2(pipe_fds, O_CLOEXEC) < 0) {
2652 log_error("Failed to allocate pipe: %m");
2658 log_error("Failed to fork getent child: %m");
2660 } else if (pid == 0) {
2662 char *empty_env = NULL;
2664 if (dup3(pipe_fds[1], STDOUT_FILENO, 0) < 0)
2665 _exit(EXIT_FAILURE);
/* Only close the pipe ends if they are not one of the standard fds. */
2667 if (pipe_fds[0] > 2)
2668 safe_close(pipe_fds[0]);
2669 if (pipe_fds[1] > 2)
2670 safe_close(pipe_fds[1]);
2672 nullfd = open("/dev/null", O_RDWR);
2674 _exit(EXIT_FAILURE);
2676 if (dup3(nullfd, STDIN_FILENO, 0) < 0)
2677 _exit(EXIT_FAILURE);
2679 if (dup3(nullfd, STDERR_FILENO, 0) < 0)
2680 _exit(EXIT_FAILURE);
2685 reset_all_signal_handlers();
2686 close_all_fds(NULL, 0);
/* The second execle() is only reached if the first one failed. */
2688 execle("/usr/bin/getent", "getent", database, key, NULL, &empty_env);
2689 execle("/bin/getent", "getent", database, key, NULL, &empty_env);
2690 _exit(EXIT_FAILURE);
2693 pipe_fds[1] = safe_close(pipe_fds[1]);
/* Switches the calling process to the user named by arg_user.
 *
 * If arg_user is unset, "root" or "0", the supplementary groups are cleared
 * and real, effective and saved GID and UID are all set to 0.
 *
 * Otherwise the user is resolved by running getent:
 *   - "getent passwd <user>": the output line is split at ':' characters and
 *     the UID, GID and home directory fields are extracted and parsed;
 *   - "getent initgroups <user>": the words after the user name are parsed
 *     as numeric IDs and collected in uids.
 * The home directory is then created (parents with mode 0775, the directory
 * itself with mode 0755 owned by uid:gid; an existing directory is fine),
 * ownership of stdin, stdout and stderr is changed to uid:gid without
 * checking the result, and finally the supplementary groups, the GID and the
 * UID are set.
 *
 * The caller in this file reads the home directory back through _home
 * (assignment not visible here; confirm).
 *
 * NOTE(review): the error messages say setregid() and setreuid() while the
 * calls are setresgid() and setresuid(). Left unchanged, since they are
 * runtime strings. */
2700 static int change_uid_gid(char **_home) {
2701 char line[LINE_MAX], *x, *u, *g, *h;
2702 const char *word, *state;
2703 _cleanup_free_ uid_t *uids = NULL;
2704 _cleanup_free_ char *home = NULL;
2705 _cleanup_fclose_ FILE *f = NULL;
2706 _cleanup_close_ int fd = -1;
2707 unsigned n_uids = 0;
2716 if (!arg_user || streq(arg_user, "root") || streq(arg_user, "0")) {
2717 /* Reset everything fully to 0, just in case */
2719 if (setgroups(0, NULL) < 0) {
2720 log_error("setgroups() failed: %m");
2724 if (setresgid(0, 0, 0) < 0) {
2725 log_error("setregid() failed: %m");
2729 if (setresuid(0, 0, 0) < 0) {
2730 log_error("setreuid() failed: %m");
2738 /* First, get user credentials */
2739 fd = spawn_getent("passwd", arg_user, &pid);
2743 f = fdopen(fd, "r");
2748 if (!fgets(line, sizeof(line), f)) {
2751 log_error("Failed to resolve user %s.", arg_user);
2755 log_error("Failed to read from getent: %m");
2761 wait_for_terminate_and_warn("getent passwd", pid);
/* Walk the ':'-separated fields of the passwd line. */
2763 x = strchr(line, ':');
2765 log_error("/etc/passwd entry has invalid user field.");
2769 u = strchr(x+1, ':');
2771 log_error("/etc/passwd entry has invalid password field.");
2778 log_error("/etc/passwd entry has invalid UID field.");
2786 log_error("/etc/passwd entry has invalid GID field.");
2791 h = strchr(x+1, ':');
2793 log_error("/etc/passwd entry has invalid GECOS field.");
2800 log_error("/etc/passwd entry has invalid home directory field.");
2806 r = parse_uid(u, &uid);
2808 log_error("Failed to parse UID of user.");
2812 r = parse_gid(g, &gid);
2814 log_error("Failed to parse GID of user.");
2822 /* Second, get group memberships */
2823 fd = spawn_getent("initgroups", arg_user, &pid);
2828 f = fdopen(fd, "r");
2833 if (!fgets(line, sizeof(line), f)) {
2835 log_error("Failed to resolve user %s.", arg_user);
2839 log_error("Failed to read from getent: %m");
2845 wait_for_terminate_and_warn("getent initgroups", pid);
2847 /* Skip over the username and subsequent separator whitespace */
2849 x += strcspn(x, WHITESPACE);
2850 x += strspn(x, WHITESPACE);
2852 FOREACH_WORD(word, l, x, state) {
2858 if (!GREEDY_REALLOC(uids, sz, n_uids+1))
2861 r = parse_uid(c, &uids[n_uids++]);
2863 log_error("Failed to parse group data from getent.");
2868 r = mkdir_parents(home, 0775);
2870 log_error("Failed to make home root directory: %s", strerror(-r));
2874 r = mkdir_safe(home, 0755, uid, gid);
2875 if (r < 0 && r != -EEXIST) {
2876 log_error("Failed to make home directory: %s", strerror(-r));
/* Return values of the three fchown() calls are not checked. */
2880 fchown(STDIN_FILENO, uid, gid);
2881 fchown(STDOUT_FILENO, uid, gid);
2882 fchown(STDERR_FILENO, uid, gid);
/* Order: supplementary groups first, then GID, then UID. */
2884 if (setgroups(n_uids, uids) < 0) {
2885 log_error("Failed to set auxiliary groups: %m");
2889 if (setresgid(gid, gid, gid) < 0) {
2890 log_error("setregid() failed: %m");
2894 if (setresuid(uid, uid, uid) < 0) {
2895 log_error("setreuid() failed: %m");
2909 * < 0 : wait_for_terminate() failed to get the state of the
2910 * container, the container was terminated by a signal, or
2911 * failed for an unknown reason. No change is made to the
2912 * container argument.
2913 * > 0 : The program executed in the container terminated with an
2914 * error. The exit code of the program executed in the
2915 * container is returned. The container argument has been set
2916 * to CONTAINER_TERMINATED.
2917 * 0 : The container is being rebooted, has been shut down or exited
2918 * successfully. The container argument has been set to either
2919 * CONTAINER_TERMINATED or CONTAINER_REBOOTED.
2921 * That is, success is indicated by a return value of zero, and an
2922 * error is indicated by a non-zero value.
/* Waits for the container process pid to terminate and classifies how it
 * ended, based on the si_code and si_status of the wait result.
 *
 * A normal exit logs success or the error code, sets *container to
 * CONTAINER_TERMINATED and returns the exit status. Termination by SIGINT
 * is logged as a shutdown (CONTAINER_TERMINATED) and termination by SIGHUP
 * as a reboot (CONTAINER_REBOOTED). Any other signal, or an unrecognized
 * si_code, is logged as an error. Informational messages are demoted to
 * debug level when arg_quiet is set. */
2924 static int wait_for_container(pid_t pid, ContainerStatus *container) {
2928 r = wait_for_terminate(pid, &status);
2930 log_warning("Failed to wait for container: %s", strerror(-r));
2934 switch (status.si_code) {
2937 if (status.si_status == 0) {
2938 log_full(arg_quiet ? LOG_DEBUG : LOG_INFO, "Container %s exited successfully.", arg_machine);
2941 log_full(arg_quiet ? LOG_DEBUG : LOG_INFO, "Container %s failed with error code %i.", arg_machine, status.si_status);
2943 *container = CONTAINER_TERMINATED;
2944 return status.si_status;
2947 if (status.si_status == SIGINT) {
2949 log_full(arg_quiet ? LOG_DEBUG : LOG_INFO, "Container %s has been shut down.", arg_machine);
2950 *container = CONTAINER_TERMINATED;
2953 } else if (status.si_status == SIGHUP) {
2955 log_full(arg_quiet ? LOG_DEBUG : LOG_INFO, "Container %s is being rebooted.", arg_machine);
2956 *container = CONTAINER_REBOOTED;
2960 /* CLD_KILLED fallthrough */
2963 log_error("Container %s terminated by signal %s.", arg_machine, signal_to_string(status.si_status));
2967 log_error("Container %s failed due to unknown reason.", arg_machine);
/* Signal handler that intentionally does nothing. main() installs it for
 * SIGCHLD so that the arrival of the signal interrupts blocking calls in the
 * parent instead of being ignored. */
2974 static void nop_handler(int sig) {}
/* sd-event signal callback that tries to shut the container down cleanly.
 *
 * The container PID is carried in userdata. If sending SIGRTMIN+3 to it
 * succeeds, a hint is logged and the userdata of the event source is
 * cleared, so a later invocation no longer has a PID to signal. The final
 * sd_event_exit() call asks the event loop to exit with code 0; it is
 * presumably reached only when no signal was sent (the return in between is
 * not visible here; confirm). */
2976 static int on_orderly_shutdown(sd_event_source *s, const struct signalfd_siginfo *si, void *userdata) {
2979 pid = PTR_TO_UINT32(userdata);
2981 if (kill(pid, SIGRTMIN+3) >= 0) {
2982 log_info("Trying to halt container. Send SIGTERM again to trigger immediate termination.");
2983 sd_event_source_set_userdata(s, NULL);
2988 sd_event_exit(sd_event_source_get_event(s), 0);
2992 int main(int argc, char *argv[]) {
2994 _cleanup_free_ char *kdbus_domain = NULL, *device_path = NULL, *root_device = NULL, *home_device = NULL, *srv_device = NULL;
2995 bool root_device_rw = true, home_device_rw = true, srv_device_rw = true;
2996 _cleanup_close_ int master = -1, kdbus_fd = -1, image_fd = -1;
2997 _cleanup_close_pair_ int kmsg_socket_pair[2] = { -1, -1 };
2998 _cleanup_fdset_free_ FDSet *fds = NULL;
2999 int r = EXIT_FAILURE, k, n_fd_passed, loop_nr = -1;
3000 const char *console = NULL;
3001 char veth_name[IFNAMSIZ];
3002 bool secondary = false;
3003 sigset_t mask, mask_chld;
3006 log_parse_environment();
3009 k = parse_argv(argc, argv);
3018 if (arg_directory) {
3021 p = path_make_absolute_cwd(arg_directory);
3022 free(arg_directory);
3025 arg_directory = get_current_dir_name();
3027 if (!arg_directory) {
3028 log_error("Failed to determine path, please use -D.");
3031 path_kill_slashes(arg_directory);
3035 arg_machine = strdup(basename(arg_image ? arg_image : arg_directory));
3041 hostname_cleanup(arg_machine, false);
3042 if (isempty(arg_machine)) {
3043 log_error("Failed to determine machine name automatically, please use -M.");
3048 if (geteuid() != 0) {
3049 log_error("Need to be root.");
3053 if (sd_booted() <= 0) {
3054 log_error("Not running on a systemd system.");
3059 n_fd_passed = sd_listen_fds(false);
3060 if (n_fd_passed > 0) {
3061 k = fdset_new_listen_fds(&fds, false);
3063 log_error("Failed to collect file descriptors: %s", strerror(-k));
3067 fdset_close_others(fds);
3070 if (arg_directory) {
3071 if (path_equal(arg_directory, "/")) {
3072 log_error("Spawning container on root directory not supported.");
3077 if (path_is_os_tree(arg_directory) <= 0) {
3078 log_error("Directory %s doesn't look like an OS root directory (os-release file is missing). Refusing.", arg_directory);
3084 p = strappenda(arg_directory,
3085 argc > optind && path_is_absolute(argv[optind]) ? argv[optind] : "/usr/bin/");
3086 if (access(p, F_OK) < 0) {
3087 log_error("Directory %s lacks the binary to execute or doesn't look like a binary tree. Refusing.", arg_directory);
3093 char template[] = "/tmp/nspawn-root-XXXXXX";
3095 if (!mkdtemp(template)) {
3096 log_error("Failed to create temporary directory: %m");
3101 arg_directory = strdup(template);
3102 if (!arg_directory) {
3107 image_fd = setup_image(&device_path, &loop_nr);
3113 r = dissect_image(image_fd,
3114 &root_device, &root_device_rw,
3115 &home_device, &home_device_rw,
3116 &srv_device, &srv_device_rw,
3122 master = posix_openpt(O_RDWR|O_NOCTTY|O_CLOEXEC|O_NDELAY);
3124 log_error("Failed to acquire pseudo tty: %m");
3128 console = ptsname(master);
3130 log_error("Failed to determine tty name: %m");
3135 log_info("Spawning container %s on %s.\nPress ^] three times within 1s to kill container.",
3136 arg_machine, arg_image ? arg_image : arg_directory);
3138 if (unlockpt(master) < 0) {
3139 log_error("Failed to unlock tty: %m");
3143 if (access("/dev/kdbus/control", F_OK) >= 0) {
3145 if (arg_share_system) {
3146 kdbus_domain = strdup("/dev/kdbus");
3147 if (!kdbus_domain) {
3154 ns = strappenda("machine-", arg_machine);
3155 kdbus_fd = bus_kernel_create_domain(ns, &kdbus_domain);
3157 log_debug("Failed to create kdbus domain: %s", strerror(-r));
3159 log_debug("Successfully created kdbus domain as %s", kdbus_domain);
3163 if (socketpair(AF_UNIX, SOCK_DGRAM|SOCK_NONBLOCK|SOCK_CLOEXEC, 0, kmsg_socket_pair) < 0) {
3164 log_error("Failed to create kmsg socket pair: %m");
3170 "STATUS=Container running.");
3172 assert_se(sigemptyset(&mask) == 0);
3173 sigset_add_many(&mask, SIGCHLD, SIGWINCH, SIGTERM, SIGINT, -1);
3174 assert_se(sigprocmask(SIG_BLOCK, &mask, NULL) == 0);
3176 assert_se(sigemptyset(&mask_chld) == 0);
3177 assert_se(sigaddset(&mask_chld, SIGCHLD) == 0);
3180 ContainerStatus container_status;
3181 _cleanup_(barrier_destroy) Barrier barrier = BARRIER_NULL;
3182 struct sigaction sa = {
3183 .sa_handler = nop_handler,
3184 .sa_flags = SA_NOCLDSTOP,
3187 r = barrier_create(&barrier);
3189 log_error("Cannot initialize IPC barrier: %s", strerror(-r));
3193 /* Child can be killed before execv(), so handle SIGCHLD
3194 * in order to interrupt parent's blocking calls and
3195 * give it a chance to call wait() and terminate. */
3196 r = sigprocmask(SIG_UNBLOCK, &mask_chld, NULL);
3198 log_error("Failed to change the signal mask: %m");
3202 r = sigaction(SIGCHLD, &sa, NULL);
3204 log_error("Failed to install SIGCHLD handler: %m");
3208 pid = syscall(__NR_clone, SIGCHLD|CLONE_NEWNS|
3209 (arg_share_system ? 0 : CLONE_NEWIPC|CLONE_NEWPID|CLONE_NEWUTS)|
3210 (arg_private_network ? CLONE_NEWNET : 0), NULL);
3212 if (errno == EINVAL)
3213 log_error("clone() failed, do you have namespace support enabled in your kernel? (You need UTS, IPC, PID and NET namespacing built in): %m");
3215 log_error("clone() failed: %m");
3223 _cleanup_free_ char *home = NULL;
3225 const char *envp[] = {
3226 "PATH=" DEFAULT_PATH_SPLIT_USR,
3227 "container=systemd-nspawn", /* LXC sets container=lxc, so follow the scheme here */
3232 NULL, /* container_uuid */
3233 NULL, /* LISTEN_FDS */
3234 NULL, /* LISTEN_PID */
3239 barrier_set_role(&barrier, BARRIER_CHILD);
3241 envp[n_env] = strv_find_prefix(environ, "TERM=");
3245 master = safe_close(master);
3247 close_nointr(STDIN_FILENO);
3248 close_nointr(STDOUT_FILENO);
3249 close_nointr(STDERR_FILENO);
3251 kmsg_socket_pair[0] = safe_close(kmsg_socket_pair[0]);
3253 reset_all_signal_handlers();
3254 reset_signal_mask();
3256 k = open_terminal(console, O_RDWR);
3257 if (k != STDIN_FILENO) {
3263 log_error("Failed to open console: %s", strerror(-k));
3264 _exit(EXIT_FAILURE);
3267 if (dup2(STDIN_FILENO, STDOUT_FILENO) != STDOUT_FILENO ||
3268 dup2(STDIN_FILENO, STDERR_FILENO) != STDERR_FILENO) {
3269 log_error("Failed to duplicate console: %m");
3270 _exit(EXIT_FAILURE);
3274 log_error("setsid() failed: %m");
3275 _exit(EXIT_FAILURE);
3278 if (reset_audit_loginuid() < 0)
3279 _exit(EXIT_FAILURE);
3281 if (prctl(PR_SET_PDEATHSIG, SIGKILL) < 0) {
3282 log_error("PR_SET_PDEATHSIG failed: %m");
3283 _exit(EXIT_FAILURE);
3286 /* Mark everything as slave, so that we still
3287 * receive mounts from the real root, but don't
3288 * propagate mounts to the real root. */
3289 if (mount(NULL, "/", NULL, MS_SLAVE|MS_REC, NULL) < 0) {
3290 log_error("MS_SLAVE|MS_REC failed: %m");
3291 _exit(EXIT_FAILURE);
3294 if (mount_devices(arg_directory,
3295 root_device, root_device_rw,
3296 home_device, home_device_rw,
3297 srv_device, srv_device_rw) < 0)
3298 _exit(EXIT_FAILURE);
3300 /* Turn directory into bind mount */
3301 if (mount(arg_directory, arg_directory, "bind", MS_BIND|MS_REC, NULL) < 0) {
3302 log_error("Failed to make bind mount: %m");
3303 _exit(EXIT_FAILURE);
3306 r = setup_volatile(arg_directory);
3308 _exit(EXIT_FAILURE);
3310 if (setup_volatile_state(arg_directory) < 0)
3311 _exit(EXIT_FAILURE);
3313 r = base_filesystem_create(arg_directory);
3315 _exit(EXIT_FAILURE);
3317 if (arg_read_only) {
3318 k = bind_remount_recursive(arg_directory, true);
3320 log_error("Failed to make tree read-only: %s", strerror(-k));
3321 _exit(EXIT_FAILURE);
3325 if (mount_all(arg_directory) < 0)
3326 _exit(EXIT_FAILURE);
3328 if (copy_devnodes(arg_directory) < 0)
3329 _exit(EXIT_FAILURE);
3331 if (setup_ptmx(arg_directory) < 0)
3332 _exit(EXIT_FAILURE);
3334 dev_setup(arg_directory);
3336 if (setup_seccomp() < 0)
3337 _exit(EXIT_FAILURE);
3339 if (setup_dev_console(arg_directory, console) < 0)
3340 _exit(EXIT_FAILURE);
3342 if (setup_kmsg(arg_directory, kmsg_socket_pair[1]) < 0)
3343 _exit(EXIT_FAILURE);
3345 kmsg_socket_pair[1] = safe_close(kmsg_socket_pair[1]);
3347 if (setup_boot_id(arg_directory) < 0)
3348 _exit(EXIT_FAILURE);
3350 if (setup_timezone(arg_directory) < 0)
3351 _exit(EXIT_FAILURE);
3353 if (setup_resolv_conf(arg_directory) < 0)
3354 _exit(EXIT_FAILURE);
3356 if (setup_journal(arg_directory) < 0)
3357 _exit(EXIT_FAILURE);
3359 if (mount_binds(arg_directory, arg_bind, false) < 0)
3360 _exit(EXIT_FAILURE);
3362 if (mount_binds(arg_directory, arg_bind_ro, true) < 0)
3363 _exit(EXIT_FAILURE);
3365 if (mount_tmpfs(arg_directory) < 0)
3366 _exit(EXIT_FAILURE);
3368 if (setup_kdbus(arg_directory, kdbus_domain) < 0)
3369 _exit(EXIT_FAILURE);
3371 /* Tell the parent that we are ready, and that
3372 * it can cgroupify us so that we lack access
3373 * to certain devices and resources. */
3374 (void)barrier_place(&barrier);
3376 if (chdir(arg_directory) < 0) {
3377 log_error("chdir(%s) failed: %m", arg_directory);
3378 _exit(EXIT_FAILURE);
3381 if (mount(arg_directory, "/", NULL, MS_MOVE, NULL) < 0) {
3382 log_error("mount(MS_MOVE) failed: %m");
3383 _exit(EXIT_FAILURE);
3386 if (chroot(".") < 0) {
3387 log_error("chroot() failed: %m");
3388 _exit(EXIT_FAILURE);
3391 if (chdir("/") < 0) {
3392 log_error("chdir() failed: %m");
3393 _exit(EXIT_FAILURE);
3398 if (arg_private_network)
3401 if (drop_capabilities() < 0) {
3402 log_error("drop_capabilities() failed: %m");
3403 _exit(EXIT_FAILURE);
3406 r = change_uid_gid(&home);
3408 _exit(EXIT_FAILURE);
3410 if ((asprintf((char**)(envp + n_env++), "HOME=%s", home ? home: "/root") < 0) ||
3411 (asprintf((char**)(envp + n_env++), "USER=%s", arg_user ? arg_user : "root") < 0) ||
3412 (asprintf((char**)(envp + n_env++), "LOGNAME=%s", arg_user ? arg_user : "root") < 0)) {
3414 _exit(EXIT_FAILURE);
3417 if (!sd_id128_equal(arg_uuid, SD_ID128_NULL)) {
3420 if (asprintf((char**)(envp + n_env++), "container_uuid=%s", id128_format_as_uuid(arg_uuid, as_uuid)) < 0) {
3422 _exit(EXIT_FAILURE);
3426 if (fdset_size(fds) > 0) {
3427 k = fdset_cloexec(fds, false);
3429 log_error("Failed to unset O_CLOEXEC for file descriptors.");
3430 _exit(EXIT_FAILURE);
3433 if ((asprintf((char **)(envp + n_env++), "LISTEN_FDS=%u", n_fd_passed) < 0) ||
3434 (asprintf((char **)(envp + n_env++), "LISTEN_PID=1") < 0)) {
3436 _exit(EXIT_FAILURE);
3442 if (arg_personality != 0xffffffffLU) {
3443 if (personality(arg_personality) < 0) {
3444 log_error("personality() failed: %m");
3445 _exit(EXIT_FAILURE);
3447 } else if (secondary) {
3448 if (personality(PER_LINUX32) < 0) {
3449 log_error("personality() failed: %m");
3450 _exit(EXIT_FAILURE);
3455 if (arg_selinux_context)
3456 if (setexeccon((security_context_t) arg_selinux_context) < 0) {
3457 log_error("setexeccon(\"%s\") failed: %m", arg_selinux_context);
3458 _exit(EXIT_FAILURE);
3462 if (!strv_isempty(arg_setenv)) {
3465 n = strv_env_merge(2, envp, arg_setenv);
3468 _exit(EXIT_FAILURE);
3473 env_use = (char**) envp;
3475 /* Wait until the parent is ready with the setup, too... */
3476 if (!barrier_place_and_sync(&barrier))
3477 _exit(EXIT_FAILURE);
3483 /* Automatically search for the init system */
3485 l = 1 + argc - optind;
3486 a = newa(char*, l + 1);
3487 memcpy(a + 1, argv + optind, l * sizeof(char*));
3489 a[0] = (char*) "/usr/lib/systemd/systemd";
3490 execve(a[0], a, env_use);
3492 a[0] = (char*) "/lib/systemd/systemd";
3493 execve(a[0], a, env_use);
3495 a[0] = (char*) "/sbin/init";
3496 execve(a[0], a, env_use);
3497 } else if (argc > optind)
3498 execvpe(argv[optind], argv + optind, env_use);
3500 chdir(home ? home : "/root");
3501 execle("/bin/bash", "-bash", NULL, env_use);
3502 execle("/bin/sh", "-sh", NULL, env_use);
3505 log_error("execv() failed: %m");
3506 _exit(EXIT_FAILURE);
3509 barrier_set_role(&barrier, BARRIER_PARENT);
3513 /* wait for child-setup to be done */
3514 if (barrier_place_and_sync(&barrier)) {
3515 _cleanup_event_unref_ sd_event *event = NULL;
3516 _cleanup_(pty_forward_freep) PTYForward *forward = NULL;
3519 r = move_network_interfaces(pid);
3523 r = setup_veth(pid, veth_name, &ifi);
3527 r = setup_bridge(veth_name, &ifi);
3531 r = setup_macvlan(pid);
3535 r = register_machine(pid, ifi);
3539 /* Block SIGCHLD here, before notifying child.
3540 * process_pty() will handle it with the other signals. */
3541 r = sigprocmask(SIG_BLOCK, &mask_chld, NULL);
3545 /* Reset signal to default */
3546 r = default_signals(SIGCHLD, -1);
3550 /* Notify the child that the parent is ready with all
3551 * its setup, and that the child can now hand over
3552 * control to the code to run inside the container. */
3553 (void)barrier_place(&barrier);
3555 r = sd_event_new(&event);
3557 log_error("Failed to get default event source: %s", strerror(-r));
3562 /* Try to kill the init system on SIGINT or SIGTERM */
3563 sd_event_add_signal(event, NULL, SIGINT, on_orderly_shutdown, UINT32_TO_PTR(pid));
3564 sd_event_add_signal(event, NULL, SIGTERM, on_orderly_shutdown, UINT32_TO_PTR(pid));
3566 /* Immediately exit */
3567 sd_event_add_signal(event, NULL, SIGINT, NULL, NULL);
3568 sd_event_add_signal(event, NULL, SIGTERM, NULL, NULL);
3571 /* simply exit on sigchld */
3572 sd_event_add_signal(event, NULL, SIGCHLD, NULL, NULL);
3574 r = pty_forward_new(event, master, &forward);
3576 log_error("Failed to create PTY forwarder: %s", strerror(-r));
3580 r = sd_event_loop(event);
3582 log_error("Failed to run event loop: %s", strerror(-r));
3586 forward = pty_forward_free(forward);
3591 /* Kill if it is not dead yet anyway */
3592 terminate_machine(pid);
3595 /* Normally redundant, but better safe than sorry */
3598 r = wait_for_container(pid, &container_status);
3602 /* We failed to wait for the container, or the
3603 * container exited abnormally */
3606 } else if (r > 0 || container_status == CONTAINER_TERMINATED)
3607 /* The container exited with a non-zero
3608 * status, or with zero status and no reboot
3612 /* CONTAINER_REBOOTED, loop again */
3614 if (arg_keep_unit) {
3615 /* Special handling if we are running as a
3616 * service: instead of simply restarting the
3617 * machine we want to restart the entire
3618 * service, so let's inform systemd about this
3619 * with the special exit code 133. The service
3620 * file uses RestartForceExitStatus=133 so
3621 * that this results in a full nspawn
3622 * restart. This is necessary since we might
3623 * have cgroup parameters set we want to have
3633 "STATUS=Terminating...");
3635 loop_remove(loop_nr, &image_fd);
3640 free(arg_directory);
3643 strv_free(arg_setenv);
3644 strv_free(arg_network_interfaces);
3645 strv_free(arg_network_macvlan);
3646 strv_free(arg_bind);
3647 strv_free(arg_bind_ro);
3648 strv_free(arg_tmpfs);