1 /*-*- Mode: C; c-basic-offset: 8; indent-tabs-mode: nil -*-*/
4 This file is part of systemd.
6 Copyright 2010 Lennart Poettering
8 systemd is free software; you can redistribute it and/or modify it
9 under the terms of the GNU Lesser General Public License as published by
10 the Free Software Foundation; either version 2.1 of the License, or
11 (at your option) any later version.
13 systemd is distributed in the hope that it will be useful, but
14 WITHOUT ANY WARRANTY; without even the implied warranty of
15 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
16 Lesser General Public License for more details.
18 You should have received a copy of the GNU Lesser General Public License
19 along with systemd; If not, see <http://www.gnu.org/licenses/>.
25 #include <sys/types.h>
26 #include <sys/mount.h>
31 #include <sys/prctl.h>
35 #include <sys/socket.h>
36 #include <linux/netlink.h>
38 #include <linux/veth.h>
39 #include <sys/personality.h>
40 #include <linux/loop.h>
44 #include <selinux/selinux.h>
52 #include <blkid/blkid.h>
55 #include "sd-daemon.h"
64 #include "cgroup-util.h"
66 #include "path-util.h"
67 #include "loopback-setup.h"
68 #include "dev-setup.h"
73 #include "bus-error.h"
76 #include "rtnl-util.h"
77 #include "udev-util.h"
78 #include "blkid-util.h"
80 #include "siphash24.h"
82 #include "base-filesystem.h"
84 #include "event-util.h"
85 #include "capability.h"
87 #include "btrfs-util.h"
88 #include "machine-image.h"
90 #include "in-addr-util.h"
92 #include "local-addresses.h"
95 #include "seccomp-util.h"
/* One --port= mapping. parse_argv() fills in protocol, host_port and
 * container_port and chains the entries through the "ports" list fields.
 * (Only some members are visible in this view.) */
98 typedef struct ExposePort {
101 uint16_t container_port;
102 LIST_FIELDS(struct ExposePort, ports);
/* Presumably describes how the container ended; only CONTAINER_TERMINATED
 * is visible here -- TODO confirm the remaining values. */
105 typedef enum ContainerStatus {
106 CONTAINER_TERMINATED,
/* Journal linking mode for --link-journal=. parse_argv() uses the values
 * LINK_AUTO, LINK_NO, LINK_GUEST and LINK_HOST. */
110 typedef enum LinkJournal {
/* Mode for --volatile=. parse_argv() uses the values VOLATILE_NO,
 * VOLATILE_YES and VOLATILE_STATE. */
117 typedef enum Volatile {
/* Runtime configuration. These file-scope variables hold the defaults and
 * are overwritten by parse_argv() from the matching command line options. */
123 static char *arg_directory = NULL;
124 static char *arg_template = NULL;
125 static char *arg_user = NULL;
126 static sd_id128_t arg_uuid = {};
127 static char *arg_machine = NULL;
128 static const char *arg_selinux_context = NULL;
129 static const char *arg_selinux_apifs_context = NULL;
130 static const char *arg_slice = NULL;
131 static bool arg_private_network = false;
132 static bool arg_read_only = false;
133 static bool arg_boot = false;
134 static bool arg_ephemeral = false;
135 static LinkJournal arg_link_journal = LINK_AUTO;
136 static bool arg_link_journal_try = false;
/* Default capability bit mask. After option parsing, parse_argv() ORs in the
 * --capability= bits (and CAP_NET_ADMIN when private networking is on) and
 * then clears the --drop-capability= bits. */
137 static uint64_t arg_retain =
138 (1ULL << CAP_CHOWN) |
139 (1ULL << CAP_DAC_OVERRIDE) |
140 (1ULL << CAP_DAC_READ_SEARCH) |
141 (1ULL << CAP_FOWNER) |
142 (1ULL << CAP_FSETID) |
143 (1ULL << CAP_IPC_OWNER) |
145 (1ULL << CAP_LEASE) |
146 (1ULL << CAP_LINUX_IMMUTABLE) |
147 (1ULL << CAP_NET_BIND_SERVICE) |
148 (1ULL << CAP_NET_BROADCAST) |
149 (1ULL << CAP_NET_RAW) |
150 (1ULL << CAP_SETGID) |
151 (1ULL << CAP_SETFCAP) |
152 (1ULL << CAP_SETPCAP) |
153 (1ULL << CAP_SETUID) |
154 (1ULL << CAP_SYS_ADMIN) |
155 (1ULL << CAP_SYS_CHROOT) |
156 (1ULL << CAP_SYS_NICE) |
157 (1ULL << CAP_SYS_PTRACE) |
158 (1ULL << CAP_SYS_TTY_CONFIG) |
159 (1ULL << CAP_SYS_RESOURCE) |
160 (1ULL << CAP_SYS_BOOT) |
161 (1ULL << CAP_AUDIT_WRITE) |
162 (1ULL << CAP_AUDIT_CONTROL) |
164 static char **arg_bind = NULL;
165 static char **arg_bind_ro = NULL;
/* arg_bind, arg_bind_ro and arg_tmpfs are consumed as consecutive pairs
 * (see STRV_FOREACH_PAIR in mount_binds() and mount_tmpfs()). */
166 static char **arg_tmpfs = NULL;
167 static char **arg_setenv = NULL;
168 static bool arg_quiet = false;
169 static bool arg_share_system = false;
170 static bool arg_register = true;
171 static bool arg_keep_unit = false;
172 static char **arg_network_interfaces = NULL;
173 static char **arg_network_macvlan = NULL;
174 static char **arg_network_ipvlan = NULL;
175 static bool arg_network_veth = false;
176 static const char *arg_network_bridge = NULL;
/* 0xffffffffLU is also the value parse_argv() treats as "unknown
 * personality" when returned by personality_from_string(). */
177 static unsigned long arg_personality = 0xffffffffLU;
178 static char *arg_image = NULL;
179 static Volatile arg_volatile = VOLATILE_NO;
180 static ExposePort *arg_expose_ports = NULL;
181 static char **arg_property = NULL;
182 static uid_t arg_uid_shift = UID_INVALID, arg_uid_range = 0x10000U;
183 static bool arg_userns = false;
184 static int arg_kill_signal = 0;
/* Prints the usage summary to standard output via printf(). The single %s in
 * the format is filled with program_invocation_short_name. */
186 static void help(void) {
187 printf("%s [OPTIONS...] [PATH] [ARGUMENTS...]\n\n"
188 "Spawn a minimal namespace container for debugging, testing and building.\n\n"
189 " -h --help Show this help\n"
190 " --version Print version string\n"
191 " -q --quiet Do not show status information\n"
192 " -D --directory=PATH Root directory for the container\n"
193 " --template=PATH Initialize root directory from template directory,\n"
195 " -x --ephemeral Run container with snapshot of root directory, and\n"
196 " remove it after exit\n"
197 " -i --image=PATH File system device or disk image for the container\n"
198 " -b --boot Boot up full system (i.e. invoke init)\n"
199 " -u --user=USER Run the command under specified user or uid\n"
200 " -M --machine=NAME Set the machine name for the container\n"
201 " --uuid=UUID Set a specific machine UUID for the container\n"
202 " -S --slice=SLICE Place the container in the specified slice\n"
203 " --property=NAME=VALUE Set scope unit property\n"
204 " --private-network Disable network in container\n"
205 " --network-interface=INTERFACE\n"
206 " Assign an existing network interface to the\n"
208 " --network-macvlan=INTERFACE\n"
209 " Create a macvlan network interface based on an\n"
210 " existing network interface to the container\n"
211 " --network-ipvlan=INTERFACE\n"
212 " Create a ipvlan network interface based on an\n"
213 " existing network interface to the container\n"
214 " -n --network-veth Add a virtual ethernet connection between host\n"
216 " --network-bridge=INTERFACE\n"
217 " Add a virtual ethernet connection between host\n"
218 " and container and add it to an existing bridge on\n"
220 " --private-users[=UIDBASE[:NUIDS]]\n"
221 " Run within user namespace\n"
222 " -p --port=[PROTOCOL:]HOSTPORT[:CONTAINERPORT]\n"
223 " Expose a container IP port on the host\n"
224 " -Z --selinux-context=SECLABEL\n"
225 " Set the SELinux security context to be used by\n"
226 " processes in the container\n"
227 " -L --selinux-apifs-context=SECLABEL\n"
228 " Set the SELinux security context to be used by\n"
229 " API/tmpfs file systems in the container\n"
230 " --capability=CAP In addition to the default, retain specified\n"
232 " --drop-capability=CAP Drop the specified capability from the default set\n"
233 " --kill-signal=SIGNAL Select signal to use for shutting down PID 1\n"
234 " --link-journal=MODE Link up guest journal, one of no, auto, guest, host,\n"
235 " try-guest, try-host\n"
236 " -j Equivalent to --link-journal=try-guest\n"
237 " --read-only Mount the root directory read-only\n"
238 " --bind=PATH[:PATH] Bind mount a file or directory from the host into\n"
240 " --bind-ro=PATH[:PATH] Similar, but creates a read-only bind mount\n"
241 " --tmpfs=PATH:[OPTIONS] Mount an empty tmpfs to the specified directory\n"
242 " --setenv=NAME=VALUE Pass an environment variable to PID 1\n"
243 " --share-system Share system namespaces with host\n"
244 " --register=BOOLEAN Register container as machine\n"
245 " --keep-unit Do not register a scope for the machine, reuse\n"
246 " the service unit nspawn is running in\n"
247 " --volatile[=MODE] Run the system in volatile mode\n"
248 , program_invocation_short_name);
/* Normalizes "path" and stores the result in *b. The visible steps call
 * canonicalize_file_name() and path_make_absolute_cwd() and finally assign
 * path_kill_slashes(p) to *b. The branching between the two calls is not
 * visible here -- presumably the second is the fallback when
 * canonicalization fails; TODO confirm. parse_argv() treats a negative return
 * as an error. */
251 static int set_sanitized_path(char **b, const char *path) {
257 p = canonicalize_file_name(path);
262 p = path_make_absolute_cwd(path);
268 *b = path_kill_slashes(p);
/* Parses the command line with getopt_long() and stores the results in the
 * file-scope arg_* variables. After the option loop it rejects conflicting
 * option combinations, computes the final capability mask in arg_retain and
 * picks a default kill signal for --boot.
 *
 * argc/argv: the process arguments, passed straight to getopt_long().
 * Errors are reported through log_error()/log_error_errno(); several paths
 * return the value of log_error_errno() directly. */
272 static int parse_argv(int argc, char *argv[]) {
289 ARG_NETWORK_INTERFACE,
301 static const struct option options[] = {
302 { "help", no_argument, NULL, 'h' },
303 { "version", no_argument, NULL, ARG_VERSION },
304 { "directory", required_argument, NULL, 'D' },
305 { "template", required_argument, NULL, ARG_TEMPLATE },
306 { "ephemeral", no_argument, NULL, 'x' },
307 { "user", required_argument, NULL, 'u' },
308 { "private-network", no_argument, NULL, ARG_PRIVATE_NETWORK },
309 { "boot", no_argument, NULL, 'b' },
310 { "uuid", required_argument, NULL, ARG_UUID },
311 { "read-only", no_argument, NULL, ARG_READ_ONLY },
312 { "capability", required_argument, NULL, ARG_CAPABILITY },
313 { "drop-capability", required_argument, NULL, ARG_DROP_CAPABILITY },
314 { "link-journal", required_argument, NULL, ARG_LINK_JOURNAL },
315 { "bind", required_argument, NULL, ARG_BIND },
316 { "bind-ro", required_argument, NULL, ARG_BIND_RO },
317 { "tmpfs", required_argument, NULL, ARG_TMPFS },
318 { "machine", required_argument, NULL, 'M' },
319 { "slice", required_argument, NULL, 'S' },
320 { "setenv", required_argument, NULL, ARG_SETENV },
321 { "selinux-context", required_argument, NULL, 'Z' },
322 { "selinux-apifs-context", required_argument, NULL, 'L' },
323 { "quiet", no_argument, NULL, 'q' },
324 { "share-system", no_argument, NULL, ARG_SHARE_SYSTEM },
325 { "register", required_argument, NULL, ARG_REGISTER },
326 { "keep-unit", no_argument, NULL, ARG_KEEP_UNIT },
327 { "network-interface", required_argument, NULL, ARG_NETWORK_INTERFACE },
328 { "network-macvlan", required_argument, NULL, ARG_NETWORK_MACVLAN },
329 { "network-ipvlan", required_argument, NULL, ARG_NETWORK_IPVLAN },
330 { "network-veth", no_argument, NULL, 'n' },
331 { "network-bridge", required_argument, NULL, ARG_NETWORK_BRIDGE },
332 { "personality", required_argument, NULL, ARG_PERSONALITY },
333 { "image", required_argument, NULL, 'i' },
334 { "volatile", optional_argument, NULL, ARG_VOLATILE },
335 { "port", required_argument, NULL, 'p' },
336 { "property", required_argument, NULL, ARG_PROPERTY },
337 { "private-users", optional_argument, NULL, ARG_PRIVATE_USERS },
338 { "kill-signal", required_argument, NULL, ARG_KILL_SIGNAL },
/* Capability bits gathered from --capability= (plus) and --drop-capability=
 * (minus); they are applied to arg_retain only after the option loop. */
343 uint64_t plus = 0, minus = 0;
/* The leading '+' in the option string makes glibc's getopt stop at the first
 * non-option argument. */
348 while ((c = getopt_long(argc, argv, "+hD:u:bL:M:jS:Z:qi:xp:n", options, NULL)) >= 0)
357 puts(PACKAGE_STRING);
358 puts(SYSTEMD_FEATURES);
362 r = set_sanitized_path(&arg_directory, optarg);
364 return log_error_errno(r, "Invalid root directory: %m");
369 r = set_sanitized_path(&arg_template, optarg);
371 return log_error_errno(r, "Invalid template directory: %m");
376 r = set_sanitized_path(&arg_image, optarg);
378 return log_error_errno(r, "Invalid image path: %m");
383 arg_ephemeral = true;
388 arg_user = strdup(optarg);
394 case ARG_NETWORK_BRIDGE:
395 arg_network_bridge = optarg;
/* The veth, interface and macvlan options below also switch on private
 * networking. */
400 arg_network_veth = true;
401 arg_private_network = true;
404 case ARG_NETWORK_INTERFACE:
405 if (strv_extend(&arg_network_interfaces, optarg) < 0)
408 arg_private_network = true;
411 case ARG_NETWORK_MACVLAN:
412 if (strv_extend(&arg_network_macvlan, optarg) < 0)
415 arg_private_network = true;
418 case ARG_NETWORK_IPVLAN:
419 if (strv_extend(&arg_network_ipvlan, optarg) < 0)
424 case ARG_PRIVATE_NETWORK:
425 arg_private_network = true;
433 r = sd_id128_from_string(optarg, &arg_uuid);
435 log_error("Invalid UUID: %s", optarg);
445 if (isempty(optarg)) {
449 if (!machine_name_is_valid(optarg)) {
450 log_error("Invalid machine name: %s", optarg);
454 r = free_and_strdup(&arg_machine, optarg);
/* The SELinux context strings point into argv (optarg); they are not copied. */
462 arg_selinux_context = optarg;
466 arg_selinux_apifs_context = optarg;
470 arg_read_only = true;
/* This case body distinguishes ARG_CAPABILITY from the drop variant by
 * testing c. The argument is a comma-separated list of capability names. */
474 case ARG_DROP_CAPABILITY: {
475 const char *state, *word;
478 FOREACH_WORD_SEPARATOR(word, length, optarg, ",", state) {
479 _cleanup_free_ char *t;
481 t = strndup(word, length);
/* "all" selects every capability bit at once. */
485 if (streq(t, "all")) {
486 if (c == ARG_CAPABILITY)
487 plus = (uint64_t) -1;
489 minus = (uint64_t) -1;
493 cap = capability_from_name(t);
495 log_error("Failed to parse capability %s.", t);
499 if (c == ARG_CAPABILITY)
500 plus |= 1ULL << (uint64_t) cap;
502 minus |= 1ULL << (uint64_t) cap;
/* Same settings as --link-journal=try-guest below. */
510 arg_link_journal = LINK_GUEST;
511 arg_link_journal_try = true;
/* The "try-" prefixed modes set arg_link_journal_try; the plain modes clear it. */
514 case ARG_LINK_JOURNAL:
515 if (streq(optarg, "auto")) {
516 arg_link_journal = LINK_AUTO;
517 arg_link_journal_try = false;
518 } else if (streq(optarg, "no")) {
519 arg_link_journal = LINK_NO;
520 arg_link_journal_try = false;
521 } else if (streq(optarg, "guest")) {
522 arg_link_journal = LINK_GUEST;
523 arg_link_journal_try = false;
524 } else if (streq(optarg, "host")) {
525 arg_link_journal = LINK_HOST;
526 arg_link_journal_try = false;
527 } else if (streq(optarg, "try-guest")) {
528 arg_link_journal = LINK_GUEST;
529 arg_link_journal_try = true;
530 } else if (streq(optarg, "try-host")) {
531 arg_link_journal = LINK_HOST;
532 arg_link_journal_try = true;
534 log_error("Failed to parse link journal mode %s", optarg);
/* --bind= / --bind-ro=: the argument is split at the first ':' and both
 * resulting paths must be absolute; they are appended to the chosen list as
 * two consecutive entries. */
542 _cleanup_free_ char *a = NULL, *b = NULL;
546 x = c == ARG_BIND ? &arg_bind : &arg_bind_ro;
548 e = strchr(optarg, ':');
550 a = strndup(optarg, e - optarg);
560 if (!path_is_absolute(a) || !path_is_absolute(b)) {
561 log_error("Invalid bind mount specification: %s", optarg);
565 r = strv_extend(x, a);
569 r = strv_extend(x, b);
/* --tmpfs=: path and mount options are pushed to arg_tmpfs as a pair;
 * "mode=0755" is the option string used on one of the branches (the branch
 * condition is not visible here). */
577 _cleanup_free_ char *a = NULL, *b = NULL;
580 e = strchr(optarg, ':');
582 a = strndup(optarg, e - optarg);
586 b = strdup("mode=0755");
592 if (!path_is_absolute(a)) {
593 log_error("Invalid tmpfs specification: %s", optarg);
597 r = strv_push(&arg_tmpfs, a);
603 r = strv_push(&arg_tmpfs, b);
615 if (!env_assignment_is_valid(optarg)) {
616 log_error("Environment variable assignment '%s' is not valid.", optarg);
620 n = strv_env_set(arg_setenv, optarg);
624 strv_free(arg_setenv);
633 case ARG_SHARE_SYSTEM:
634 arg_share_system = true;
638 r = parse_boolean(optarg);
640 log_error("Failed to parse --register= argument: %s", optarg);
648 arg_keep_unit = true;
651 case ARG_PERSONALITY:
653 arg_personality = personality_from_string(optarg);
654 if (arg_personality == 0xffffffffLU) {
655 log_error("Unknown or unsupported personality '%s'.", optarg);
/* --volatile[=MODE]: accepts a boolean or the literal "state". */
664 arg_volatile = VOLATILE_YES;
666 r = parse_boolean(optarg);
668 if (streq(optarg, "state"))
669 arg_volatile = VOLATILE_STATE;
671 log_error("Failed to parse --volatile= argument: %s", optarg);
675 arg_volatile = r ? VOLATILE_YES : VOLATILE_NO;
/* --port=[PROTOCOL:]HOSTPORT[:CONTAINERPORT]: an optional "tcp:"/"udp:"
 * prefix selects the protocol; with a single port number the host port is
 * set equal to the container port. */
681 const char *split, *e;
682 uint16_t container_port, host_port;
686 if ((e = startswith(optarg, "tcp:")))
687 protocol = IPPROTO_TCP;
688 else if ((e = startswith(optarg, "udp:")))
689 protocol = IPPROTO_UDP;
692 protocol = IPPROTO_TCP;
695 split = strchr(e, ':');
697 char v[split - e + 1];
699 memcpy(v, e, split - e);
702 r = safe_atou16(v, &host_port);
703 if (r < 0 || host_port <= 0) {
704 log_error("Failed to parse host port: %s", optarg);
708 r = safe_atou16(split + 1, &container_port);
710 r = safe_atou16(e, &container_port);
711 host_port = container_port;
/* NOTE(review): this check validates container_port, yet the message below
 * says "host port" -- looks like a copy/paste slip; confirm before changing. */
714 if (r < 0 || container_port <= 0) {
715 log_error("Failed to parse host port: %s", optarg);
/* A (protocol, host port) combination may be given only once. */
719 LIST_FOREACH(ports, p, arg_expose_ports) {
720 if (p->protocol == protocol && p->host_port == host_port) {
721 log_error("Duplicate port specification: %s", optarg);
726 p = new(ExposePort, 1);
730 p->protocol = protocol;
731 p->host_port = host_port;
732 p->container_port = container_port;
734 LIST_PREPEND(ports, arg_expose_ports, p);
740 if (strv_extend(&arg_property, optarg) < 0)
/* --private-users[=UIDBASE[:NUIDS]]: the part before ':' is parsed as the
 * UID shift, the part after it as the range, which must be non-zero. */
745 case ARG_PRIVATE_USERS:
747 _cleanup_free_ char *buffer = NULL;
748 const char *range, *shift;
750 range = strchr(optarg, ':');
752 buffer = strndup(optarg, range - optarg);
758 if (safe_atou32(range, &arg_uid_range) < 0 || arg_uid_range <= 0) {
759 log_error("Failed to parse UID range: %s", range);
765 if (parse_uid(shift, &arg_uid_shift) < 0) {
766 log_error("Failed to parse UID: %s", optarg);
774 case ARG_KILL_SIGNAL:
775 arg_kill_signal = signal_from_string_try_harder(optarg);
776 if (arg_kill_signal < 0) {
777 log_error("Cannot parse signal: %s", optarg);
787 assert_not_reached("Unhandled option");
/* Post-processing: derive implied settings and reject conflicting options. */
790 if (arg_share_system)
791 arg_register = false;
793 if (arg_boot && arg_share_system) {
794 log_error("--boot and --share-system may not be combined.");
798 if (arg_keep_unit && cg_pid_get_owner_uid(0, NULL) >= 0) {
799 log_error("--keep-unit may not be used when invoked from a user session.");
803 if (arg_directory && arg_image) {
804 log_error("--directory= and --image= may not be combined.");
808 if (arg_template && arg_image) {
809 log_error("--template= and --image= may not be combined.");
813 if (arg_template && !(arg_directory || arg_machine)) {
814 log_error("--template= needs --directory= or --machine=.");
818 if (arg_ephemeral && arg_template) {
819 log_error("--ephemeral and --template= may not be combined.");
823 if (arg_ephemeral && arg_image) {
824 log_error("--ephemeral and --image= may not be combined.");
828 if (arg_ephemeral && !IN_SET(arg_link_journal, LINK_NO, LINK_AUTO)) {
829 log_error("--ephemeral and --link-journal= may not be combined.");
833 if (arg_volatile != VOLATILE_NO && arg_read_only) {
834 log_error("Cannot combine --read-only with --volatile. Note that --volatile already implies a read-only base hierarchy.");
838 if (arg_expose_ports && !arg_private_network) {
839 log_error("Cannot use --port= without private networking.");
/* Final capability mask: defaults plus requested additions, plus
 * CAP_NET_ADMIN when the container has its own network, minus everything
 * that was explicitly dropped. Drops take precedence over additions. */
843 arg_retain = (arg_retain | plus | (arg_private_network ? 1ULL << CAP_NET_ADMIN : 0)) & ~minus;
/* When booting and no (positive) kill signal was configured, use SIGRTMIN+3. */
845 if (arg_boot && arg_kill_signal <= 0)
846 arg_kill_signal = SIGRTMIN+3;
/* Mounts the API file systems listed in mount_table below the directory
 * "dest". For every entry the target path is built as dest + "/" + where,
 * the directory is created with mkdir_p() and mount() is called. Failures
 * are logged as errors for entries whose "fatal" field is set and as
 * warnings otherwise.
 *
 * The table columns appear to be: what, where, type, options, flags, fatal
 * (the struct definition itself is not visible here -- TODO confirm). */
851 static int mount_all(const char *dest) {
853 typedef struct MountPoint {
862 static const MountPoint mount_table[] = {
863 { "proc", "/proc", "proc", NULL, MS_NOSUID|MS_NOEXEC|MS_NODEV, true },
864 { "/proc/sys", "/proc/sys", NULL, NULL, MS_BIND, true }, /* Bind mount first */
865 { NULL, "/proc/sys", NULL, NULL, MS_BIND|MS_RDONLY|MS_REMOUNT, true }, /* Then, make it r/o */
866 { "sysfs", "/sys", "sysfs", NULL, MS_RDONLY|MS_NOSUID|MS_NOEXEC|MS_NODEV, true },
867 { "tmpfs", "/dev", "tmpfs", "mode=755", MS_NOSUID|MS_STRICTATIME, true },
868 { "devpts", "/dev/pts", "devpts","newinstance,ptmxmode=0666,mode=620,gid=" STRINGIFY(TTY_GID), MS_NOSUID|MS_NOEXEC, true },
869 { "tmpfs", "/dev/shm", "tmpfs", "mode=1777", MS_NOSUID|MS_NODEV|MS_STRICTATIME, true },
870 { "tmpfs", "/run", "tmpfs", "mode=755", MS_NOSUID|MS_NODEV|MS_STRICTATIME, true },
871 { "tmpfs", "/tmp", "tmpfs", "mode=1777", MS_STRICTATIME, true },
873 { "/sys/fs/selinux", "/sys/fs/selinux", NULL, NULL, MS_BIND, false }, /* Bind mount first */
874 { NULL, "/sys/fs/selinux", NULL, NULL, MS_BIND|MS_RDONLY|MS_REMOUNT, false }, /* Then, make it r/o */
881 for (k = 0; k < ELEMENTSOF(mount_table); k++) {
882 _cleanup_free_ char *where = NULL, *options = NULL;
886 where = strjoin(dest, "/", mount_table[k].where, NULL);
890 t = path_is_mount_point(where, true);
892 log_error_errno(t, "Failed to detect whether %s is a mount point: %m", where);
/* Entries with a NULL "what" are the remount steps of the table; only the
 * other entries are subject to the already-mounted check below. */
900 /* Skip this entry if it is not a remount. */
901 if (mount_table[k].what && t > 0)
904 t = mkdir_p(where, 0755);
906 if (mount_table[k].fatal) {
907 log_error_errno(t, "Failed to create directory %s: %m", where);
912 log_warning_errno(t, "Failed to create directory %s: %m", where);
/* With --selinux-apifs-context= set, tmpfs and devpts mounts get a
 * context="..." mount option appended to the table's options. */
918 if (arg_selinux_apifs_context &&
919 (streq_ptr(mount_table[k].what, "tmpfs") || streq_ptr(mount_table[k].what, "devpts"))) {
920 options = strjoin(mount_table[k].options, ",context=\"", arg_selinux_apifs_context, "\"", NULL);
927 o = mount_table[k].options;
/* With user namespaces and a known UID shift, tmpfs mounts additionally get
 * uid=/gid= options set to arg_uid_shift. */
929 if (arg_userns && arg_uid_shift != UID_INVALID && streq_ptr(mount_table[k].type, "tmpfs")) {
930 char *uid_options = NULL;
/* NOTE(review): the asprintf() return values are not inspected on the lines
 * visible here; presumably uid_options is checked for NULL afterwards --
 * confirm. */
933 asprintf(&uid_options, "%s,uid=" UID_FMT ",gid=" UID_FMT, o, arg_uid_shift, arg_uid_shift);
935 asprintf(&uid_options, "uid=" UID_FMT ",gid=" UID_FMT, arg_uid_shift, arg_uid_shift);
940 o = options = uid_options;
943 if (mount(mount_table[k].what,
946 mount_table[k].flags,
949 if (mount_table[k].fatal) {
950 log_error_errno(errno, "mount(%s) failed: %m", where);
955 log_warning_errno(errno, "mount(%s) failed: %m", where);
/* Establishes the bind mounts described by "l" below "dest". "l" is walked
 * as (source, destination) pairs: the source is stat()ed, the target path is
 * dest + destination, directory-vs-file mismatches between the two are
 * rejected, the mount point is created and the source is bind mounted onto
 * it. bind_remount_recursive(where, true) then makes the mount read-only;
 * the condition guarding that call is not visible here -- presumably it
 * depends on "ro"; TODO confirm.
 *
 * NOTE(review): the mkdir_label() result is compared via "errno != EEXIST",
 * while mount_tmpfs() compares the return value ("r != -EEXIST") for the same
 * helper. One of the two is probably wrong; confirm which convention
 * mkdir_label() follows. */
962 static int mount_binds(const char *dest, char **l, bool ro) {
965 STRV_FOREACH_PAIR(x, y, l) {
966 _cleanup_free_ char *where = NULL;
967 struct stat source_st, dest_st;
970 if (stat(*x, &source_st) < 0)
971 return log_error_errno(errno, "Failed to stat %s: %m", *x);
973 where = strappend(dest, *y);
977 r = stat(where, &dest_st);
979 if (S_ISDIR(source_st.st_mode) && !S_ISDIR(dest_st.st_mode)) {
980 log_error("Cannot bind mount directory %s on file %s.", *x, where);
983 if (!S_ISDIR(source_st.st_mode) && S_ISDIR(dest_st.st_mode)) {
984 log_error("Cannot bind mount file %s on directory %s.", *x, where);
987 } else if (errno == ENOENT) {
988 r = mkdir_parents_label(where, 0755);
990 return log_error_errno(r, "Failed to bind mount %s: %m", *x);
992 log_error_errno(errno, "Failed to bind mount %s: %m", *x);
996 /* Create the mount point. Any non-directory file can be
997 * mounted on any non-directory file (regular, fifo, socket,
1000 if (S_ISDIR(source_st.st_mode)) {
1001 r = mkdir_label(where, 0755);
1002 if (r < 0 && errno != EEXIST)
1003 return log_error_errno(r, "Failed to create mount point %s: %m", where);
1007 return log_error_errno(r, "Failed to create mount point %s: %m", where);
1010 if (mount(*x, where, "bind", MS_BIND, NULL) < 0)
1011 return log_error_errno(errno, "mount(%s) failed: %m", where);
1014 r = bind_remount_recursive(where, true);
1016 return log_error_errno(r, "Read-Only bind mount failed: %m");
/* Mounts one cgroup hierarchy at dest + "/sys/fs/cgroup/" + hierarchy,
 * passing "controller" as the mount data string. The mount itself is done
 * without MS_RDONLY; a following MS_BIND|MS_REMOUNT|MS_RDONLY call makes the
 * mount point read-only (presumably only when "read_only" is set -- the
 * guarding condition is not visible here; TODO confirm). */
1023 static int mount_cgroup_hierarchy(const char *dest, const char *controller, const char *hierarchy, bool read_only) {
1027 to = strjoina(dest, "/sys/fs/cgroup/", hierarchy);
1029 r = path_is_mount_point(to, false);
1031 return log_error_errno(r, "Failed to determine if %s is mounted already: %m", to);
1037 /* The superblock mount options of the mount point need to be
1038 * identical to the host's, and hence writable... */
1039 if (mount("cgroup", to, "cgroup", MS_NOSUID|MS_NOEXEC|MS_NODEV, controller) < 0)
1040 return log_error_errno(errno, "Failed to mount to %s: %m", to);
1042 /* ... hence let's only make the bind mount read-only, not the
1045 if (mount(NULL, to, NULL, MS_BIND|MS_REMOUNT|MS_NOSUID|MS_NOEXEC|MS_NODEV|MS_RDONLY, NULL) < 0)
1046 return log_error_errno(errno, "Failed to remount %s read-only: %m", to);
/* Builds the cgroup tree of the container below dest + "/sys/fs/cgroup":
 *  1. a tmpfs is mounted there,
 *  2. every controller reported by cg_kernel_controllers() is mounted
 *     read-only; when the host entry is a symlink to a combined hierarchy,
 *     the combined hierarchy is mounted and an equivalent symlink is created,
 *  3. the "name=systemd" hierarchy is mounted writable, our own cgroup is
 *     turned into a bind mount and the systemd root is remounted read-only,
 *  4. finally the tmpfs itself is remounted read-only. */
1051 static int mount_cgroup(const char *dest) {
1052 _cleanup_set_free_free_ Set *controllers = NULL;
1053 _cleanup_free_ char *own_cgroup_path = NULL;
1054 const char *cgroup_root, *systemd_root, *systemd_own;
1057 controllers = set_new(&string_hash_ops);
1061 r = cg_kernel_controllers(controllers);
1063 return log_error_errno(r, "Failed to determine cgroup controllers: %m");
1065 r = cg_pid_get_path(NULL, 0, &own_cgroup_path);
1067 return log_error_errno(r, "Failed to determine our own cgroup path: %m");
1069 cgroup_root = strjoina(dest, "/sys/fs/cgroup");
1070 if (mount("tmpfs", cgroup_root, "tmpfs", MS_NOSUID|MS_NOEXEC|MS_NODEV|MS_STRICTATIME, "mode=755") < 0)
1071 return log_error_errno(errno, "Failed to mount tmpfs to /sys/fs/cgroup: %m");
1074 _cleanup_free_ char *controller = NULL, *origin = NULL, *combined = NULL;
/* Controllers are taken out of the set one at a time. */
1076 controller = set_steal_first(controllers);
/* The host's entry for the controller tells us whether it is a directory of
 * its own or a symlink to a combined hierarchy. */
1080 origin = strappend("/sys/fs/cgroup/", controller);
1084 r = readlink_malloc(origin, &combined);
1086 /* Not a symbolic link, but directly a single cgroup hierarchy */
1088 r = mount_cgroup_hierarchy(dest, controller, controller, true);
1093 return log_error_errno(r, "Failed to read link %s: %m", origin);
1095 _cleanup_free_ char *target = NULL;
1097 target = strjoin(dest, "/sys/fs/cgroup/", controller, NULL);
1101 /* A symbolic link, a combination of controllers in one hierarchy */
1103 if (!filename_is_valid(combined)) {
1104 log_warning("Ignoring invalid combined hierarchy %s.", combined);
1108 r = mount_cgroup_hierarchy(dest, combined, combined, true);
1112 if (symlink(combined, target) < 0)
1113 return log_error_errno(errno, "Failed to create symlink for combined hierarchy: %m");
1117 r = mount_cgroup_hierarchy(dest, "name=systemd,xattr", "systemd", false);
1121 /* Make our own cgroup a (writable) bind mount */
1122 systemd_own = strjoina(dest, "/sys/fs/cgroup/systemd", own_cgroup_path);
/* NOTE(review): the error below prints own_cgroup_path, not the systemd_own
 * path that was actually passed to mount(). */
1123 if (mount(systemd_own, systemd_own, NULL, MS_BIND, NULL) < 0)
1124 return log_error_errno(errno, "Failed to turn %s into a bind mount: %m", own_cgroup_path);
1126 /* And then remount the systemd cgroup root read-only */
1127 systemd_root = strjoina(dest, "/sys/fs/cgroup/systemd");
1128 if (mount(NULL, systemd_root, NULL, MS_BIND|MS_REMOUNT|MS_NOSUID|MS_NOEXEC|MS_NODEV|MS_RDONLY, NULL) < 0)
1129 return log_error_errno(errno, "Failed to mount cgroup root read-only: %m");
/* Finally make the tmpfs holding all hierarchies read-only as well. */
1131 if (mount(NULL, cgroup_root, NULL, MS_REMOUNT|MS_NOSUID|MS_NOEXEC|MS_NODEV|MS_STRICTATIME|MS_RDONLY, "mode=755") < 0)
1132 return log_error_errno(errno, "Failed to remount %s read-only: %m", cgroup_root);
/* Mounts the tmpfs instances requested via --tmpfs=. arg_tmpfs is walked as
 * (path, options) pairs; for each one dest + path is created with
 * mkdir_label() (an -EEXIST result is tolerated) and a tmpfs is mounted
 * there with MS_NODEV|MS_STRICTATIME and the given option string. */
1137 static int mount_tmpfs(const char *dest) {
1140 STRV_FOREACH_PAIR(i, o, arg_tmpfs) {
1141 _cleanup_free_ char *where = NULL;
1144 where = strappend(dest, *i);
1148 r = mkdir_label(where, 0755);
1149 if (r < 0 && r != -EEXIST)
1150 return log_error_errno(r, "Creating mount point for tmpfs %s failed: %m", where);
1152 if (mount("tmpfs", where, "tmpfs", MS_NODEV|MS_STRICTATIME, *o) < 0)
1153 return log_error_errno(errno, "tmpfs mount to %s failed: %m", where);
/* Makes dest + "/etc/localtime" point to the same zone as the host's
 * /etc/localtime symlink. The host link must resolve into
 * /usr/share/zoneinfo/ (relative "../usr/share/zoneinfo/" or absolute); if the
 * container link already names the same zone nothing is changed. Otherwise
 * the zone file must exist inside the container, and the link is recreated
 * as "../usr/share/zoneinfo/<zone>". Most problems are only logged. */
1159 static int setup_timezone(const char *dest) {
1160 _cleanup_free_ char *where = NULL, *p = NULL, *q = NULL, *check = NULL, *what = NULL;
1166 /* Fix the timezone, if possible */
1167 r = readlink_malloc("/etc/localtime", &p);
1169 log_warning("/etc/localtime is not a symlink, not updating container timezone.");
/* z ends up pointing at the zone name part of the host's link target. */
1173 z = path_startswith(p, "../usr/share/zoneinfo/");
1175 z = path_startswith(p, "/usr/share/zoneinfo/");
1177 log_warning("/etc/localtime does not point into /usr/share/zoneinfo/, not updating container timezone.");
1181 where = strappend(dest, "/etc/localtime");
/* y is the zone name the container currently links to, if any. */
1185 r = readlink_malloc(where, &q);
1187 y = path_startswith(q, "../usr/share/zoneinfo/");
1189 y = path_startswith(q, "/usr/share/zoneinfo/");
1191 /* Already pointing to the right place? Then do nothing .. */
1192 if (y && streq(y, z))
1196 check = strjoin(dest, "/usr/share/zoneinfo/", z, NULL);
1200 if (access(check, F_OK) < 0) {
1201 log_warning("Timezone %s does not exist in container, not updating container timezone.", z);
1205 what = strappend("../usr/share/zoneinfo/", z);
1209 r = mkdir_parents(where, 0755);
1211 log_error_errno(r, "Failed to create directory for timezone info %s in container: %m", where);
/* A missing old entry (ENOENT) is not treated as a failure. */
1217 if (r < 0 && errno != ENOENT) {
1218 log_error_errno(errno, "Failed to remove existing timezone info %s in container: %m", where);
1223 if (symlink(what, where) < 0) {
1224 log_error_errno(errno, "Failed to correct timezone of container: %m");
/* Copies the host's /etc/resolv.conf to dest + "/etc/resolv.conf". The
 * function checks arg_private_network first (presumably returning early when
 * it is set -- the return itself is not visible here). Failures to create the
 * parent directory or to copy the file are logged as warnings. */
1231 static int setup_resolv_conf(const char *dest) {
1232 _cleanup_free_ char *where = NULL;
1237 if (arg_private_network)
1240 /* Fix resolv.conf, if possible */
1241 where = strappend(dest, "/etc/resolv.conf");
1245 /* We don't really care about the result of this. If it
1246 * fails, it fails, but meh... */
1247 r = mkdir_parents(where, 0755);
1249 log_warning_errno(r, "Failed to create parent directory for resolv.conf %s: %m", where);
/* O_TRUNC overwrites an existing file; O_NOFOLLOW refuses to write through a
 * symlink at the destination. */
1254 r = copy_file("/etc/resolv.conf", where, O_TRUNC|O_NOFOLLOW, 0644, 0);
1256 log_warning_errno(r, "Failed to copy /etc/resolv.conf to %s: %m", where);
/* Implements --volatile=state: the tree at "directory" is remounted
 * read-only via bind_remount_recursive() and a fresh tmpfs (mode=755) is
 * mounted on directory + "/var". Only acts when arg_volatile is
 * VOLATILE_STATE.
 *
 * NOTE(review): the "Failed to create %s" message prints "directory"; the
 * call producing the error is not visible here, but it presumably creates
 * "p" (directory + "/var"), in which case the message names the wrong
 * path -- confirm. */
1264 static int setup_volatile_state(const char *directory) {
1270 if (arg_volatile != VOLATILE_STATE)
1273 /* --volatile=state means we simply overmount /var
1274 with a tmpfs, and the rest read-only. */
1276 r = bind_remount_recursive(directory, true);
1278 return log_error_errno(r, "Failed to remount %s read-only: %m", directory);
1280 p = strjoina(directory, "/var");
1282 if (r < 0 && errno != EEXIST)
1283 return log_error_errno(errno, "Failed to create %s: %m", directory);
1285 if (mount("tmpfs", p, "tmpfs", MS_STRICTATIME, "mode=755") < 0)
1286 return log_error_errno(errno, "Failed to mount tmpfs to /var: %m");
/* Implements --volatile=yes: a tmpfs is prepared in a temporary directory,
 * directory + "/usr" is recursively bind mounted into it and made read-only,
 * and the prepared tree is then moved (MS_MOVE) onto "directory". Only acts
 * when arg_volatile is VOLATILE_YES. The tmpfs_mounted/bind_mounted flags
 * record how far setup got; presumably they drive cleanup on the error
 * paths, which are not visible here. */
1291 static int setup_volatile(const char *directory) {
1292 bool tmpfs_mounted = false, bind_mounted = false;
1293 char template[] = "/tmp/nspawn-volatile-XXXXXX";
1299 if (arg_volatile != VOLATILE_YES)
1302 /* --volatile=yes means we mount a tmpfs to the root dir, and
1303 the original /usr to use inside it, and that read-only. */
1305 if (!mkdtemp(template))
1306 return log_error_errno(errno, "Failed to create temporary directory: %m");
1308 if (mount("tmpfs", template, "tmpfs", MS_STRICTATIME, "mode=755") < 0) {
1309 log_error_errno(errno, "Failed to mount tmpfs for root directory: %m");
1314 tmpfs_mounted = true;
/* f: the original /usr, t: its place inside the new tmpfs root. */
1316 f = strjoina(directory, "/usr");
1317 t = strjoina(template, "/usr");
1320 if (r < 0 && errno != EEXIST) {
1321 log_error_errno(errno, "Failed to create %s: %m", t);
1326 if (mount(f, t, "bind", MS_BIND|MS_REC, NULL) < 0) {
1327 log_error_errno(errno, "Failed to create /usr bind mount: %m");
1332 bind_mounted = true;
1334 r = bind_remount_recursive(t, true);
1336 log_error_errno(r, "Failed to remount %s read-only: %m", t);
/* Put the assembled tree in place of the original root directory. */
1340 if (mount(template, directory, NULL, MS_MOVE, NULL) < 0) {
1341 log_error_errno(errno, "Failed to move root mount: %m");
/* Formats the 128-bit "id" as lowercase hex in the dashed 8-4-4-4-12 UUID
 * layout given by the format string below. The destination is the
 * caller-provided buffer "s", declared with room for 37 bytes (36 characters
 * plus terminator). The formatting call and the return statement are not
 * visible here; presumably "s" is returned -- TODO confirm. */
1359 static char* id128_format_as_uuid(sd_id128_t id, char s[37]) {
1362 "%02x%02x%02x%02x-%02x%02x-%02x%02x-%02x%02x-%02x%02x%02x%02x%02x%02x",
1363 SD_ID128_FORMAT_VAL(id));
/* Gives the container its own boot ID: a random 128-bit ID is generated,
 * written in UUID form to dest + "/dev/proc-sys-kernel-random-boot-id" and
 * that file is bind mounted over dest + "/proc/sys/kernel/random/boot_id".
 * The bind mount is then remounted read-only; a failure of that last step is
 * only logged as a warning. The function checks arg_share_system first
 * (presumably returning early when set -- not visible here). */
1368 static int setup_boot_id(const char *dest) {
1369 _cleanup_free_ char *from = NULL, *to = NULL;
1370 sd_id128_t rnd = {};
1376 if (arg_share_system)
1379 /* Generate a new randomized boot ID, so that each boot-up of
1380 * the container gets a new one */
1382 from = strappend(dest, "/dev/proc-sys-kernel-random-boot-id");
1383 to = strappend(dest, "/proc/sys/kernel/random/boot_id");
1387 r = sd_id128_randomize(&rnd);
1389 return log_error_errno(r, "Failed to generate random boot id: %m");
1391 id128_format_as_uuid(rnd, as_uuid);
1393 r = write_string_file(from, as_uuid);
1395 return log_error_errno(r, "Failed to write boot id: %m");
1397 if (mount(from, to, "bind", MS_BIND, NULL) < 0) {
1398 log_error_errno(errno, "Failed to bind mount boot id: %m");
1400 } else if (mount(from, to, "bind", MS_BIND|MS_REMOUNT|MS_RDONLY, NULL))
1401 log_warning_errno(errno, "Failed to make boot id read-only: %m");
/* Recreates a fixed list of host device nodes (the "devnodes" string list,
 * whose entries are not visible here) below dest + "/dev/". For each name
 * the host node is stat()ed; a missing node (ENOENT) is not an error, any
 * other stat failure is returned. Entries that are neither character nor
 * block devices are rejected. The node is created with mknod() using the
 * host's mode and device number and, with user namespaces and a known UID
 * shift, chowned to arg_uid_shift. */
1407 static int copy_devnodes(const char *dest) {
1409 static const char devnodes[] =
1420 _cleanup_umask_ mode_t u;
1426 NULSTR_FOREACH(d, devnodes) {
1427 _cleanup_free_ char *from = NULL, *to = NULL;
1430 from = strappend("/dev/", d);
1431 to = strjoin(dest, "/dev/", d, NULL);
1435 if (stat(from, &st) < 0) {
1437 if (errno != ENOENT)
1438 return log_error_errno(errno, "Failed to stat %s: %m", from);
1440 } else if (!S_ISCHR(st.st_mode) && !S_ISBLK(st.st_mode)) {
1442 log_error("%s is not a char or block device, cannot copy", from);
1446 r = mkdir_parents(to, 0775);
1448 log_error_errno(r, "Failed to create parent directory of %s: %m", to);
1452 if (mknod(to, st.st_mode, st.st_rdev) < 0)
1453 return log_error_errno(errno, "mknod(%s) failed: %m", to);
1455 if (arg_userns && arg_uid_shift != UID_INVALID)
1456 if (lchown(to, arg_uid_shift, arg_uid_shift) < 0)
1457 return log_error_errno(errno, "chown() of device node %s failed: %m", to);
/* Creates dest + "/dev/ptmx" as a relative symlink to "pts/ptmx". With user
 * namespaces and a known UID shift the symlink itself is chowned to
 * arg_uid_shift (lchown() does not follow the link). */
1464 static int setup_ptmx(const char *dest) {
1465 _cleanup_free_ char *p = NULL;
1467 p = strappend(dest, "/dev/ptmx");
1471 if (symlink("pts/ptmx", p) < 0)
1472 return log_error_errno(errno, "Failed to create /dev/ptmx symlink: %m");
1474 if (arg_userns && arg_uid_shift != UID_INVALID)
1475 if (lchown(p, arg_uid_shift, arg_uid_shift) < 0)
1476 return log_error_errno(errno, "lchown() of symlink %s failed: %m", p);
/* Makes the TTY "console" available as dest + "/dev/console". The TTY is
 * set to mode 0600 owned by 0:0, a placeholder device node is created at
 * the target path using the file type and device number of the host's
 * /dev/null (with permission bits replaced by 0600), and "console" is bind
 * mounted over it. */
1481 static int setup_dev_console(const char *dest, const char *console) {
1482 _cleanup_umask_ mode_t u;
1492 if (stat("/dev/null", &st) < 0)
1493 return log_error_errno(errno, "Failed to stat /dev/null: %m");
1495 r = chmod_and_chown(console, 0600, 0, 0);
1497 return log_error_errno(r, "Failed to correct access mode for TTY: %m");
1499 /* We need to bind mount the right tty to /dev/console since
1500 * ptys can only exist on pts file systems. To have something
1501 * to bind mount things on we create a device node first, and
1502 * use /dev/null for that since the cgroups device policy
1503 * allows us to create that freely, while we cannot create
1504 * /dev/console. (Note that the major minor doesn't actually
1505 * matter here, since we mount it over anyway). */
1507 to = strjoina(dest, "/dev/console");
1508 if (mknod(to, (st.st_mode & ~07777) | 0600, st.st_rdev) < 0)
1509 return log_error_errno(errno, "mknod() for /dev/console failed: %m");
1511 if (mount(console, to, "bind", MS_BIND, NULL) < 0)
1512 return log_error_errno(errno, "Bind mount for /dev/console failed: %m");
/* Provides a /proc/kmsg replacement inside the container root "dest".
 *
 * A FIFO is created at <dest>/dev/kmsg, forced to mode 0600 owner 0:0 and
 * bind mounted onto <dest>/proc/kmsg. The FIFO is then opened and its file
 * descriptor is sent as an SCM_RIGHTS message over "kmsg_socket", so that
 * the descriptor stays referenced for as long as the socket holds it.
 * Failures are logged and returned through log_error_errno(). */
1517 static int setup_kmsg(const char *dest, int kmsg_socket) {
1518 _cleanup_free_ char *from = NULL, *to = NULL;
1519 _cleanup_umask_ mode_t u;
1522 struct cmsghdr cmsghdr;
1523 uint8_t buf[CMSG_SPACE(sizeof(int))];
1525 struct msghdr mh = {
1526 .msg_control = &control,
1527 .msg_controllen = sizeof(control),
1529 struct cmsghdr *cmsg;
1532 assert(kmsg_socket >= 0);
1536 /* We create the kmsg FIFO as /dev/kmsg, but immediately
1537 * delete it after bind mounting it to /proc/kmsg. While FIFOs
1538 * on the reading side behave very similar to /proc/kmsg,
1539 * their writing side behaves differently from /dev/kmsg in
1540 * that writing blocks when nothing is reading. In order to
1541 * avoid any problems with containers deadlocking due to this
1542 * we simply make /dev/kmsg unavailable to the container. */
1543 if (asprintf(&from, "%s/dev/kmsg", dest) < 0 ||
1544 asprintf(&to, "%s/proc/kmsg", dest) < 0)
1547 if (mkfifo(from, 0600) < 0)
1548 return log_error_errno(errno, "mkfifo() for /dev/kmsg failed: %m");
1550 r = chmod_and_chown(from, 0600, 0, 0);
1552 return log_error_errno(r, "Failed to correct access mode for /dev/kmsg: %m");
1554 if (mount(from, to, "bind", MS_BIND, NULL) < 0)
1555 return log_error_errno(errno, "Bind mount for /proc/kmsg failed: %m");
1557 fd = open(from, O_RDWR|O_NDELAY|O_CLOEXEC);
1559 return log_error_errno(errno, "Failed to open fifo: %m");
/* Fill in a single SCM_RIGHTS control message whose payload is the FIFO
 * fd, then shrink msg_controllen to the length actually used. */
1561 cmsg = CMSG_FIRSTHDR(&mh);
1562 cmsg->cmsg_level = SOL_SOCKET;
1563 cmsg->cmsg_type = SCM_RIGHTS;
1564 cmsg->cmsg_len = CMSG_LEN(sizeof(int));
1565 memcpy(CMSG_DATA(cmsg), &fd, sizeof(int));
1567 mh.msg_controllen = cmsg->cmsg_len;
1569 /* Store away the fd in the socket, so that it stays open as
1570 * long as we run the child */
1571 k = sendmsg(kmsg_socket, &mh, MSG_NOSIGNAL);
1575 return log_error_errno(errno, "Failed to send FIFO fd: %m");
1577 /* And now make the FIFO unavailable as /dev/kmsg... */
/* Allocates a non-blocking NETLINK_ROUTE socket and passes its file
 * descriptor over "send_fd" as an SCM_RIGHTS control message. Only relevant
 * when port exposure is configured (arg_expose_ports is checked first).
 * The local descriptor is declared _cleanup_close_, so this function's own
 * copy is closed on return. Failures are logged and returned through
 * log_error_errno(). */
1582 static int send_rtnl(int send_fd) {
1584 struct cmsghdr cmsghdr;
1585 uint8_t buf[CMSG_SPACE(sizeof(int))];
1587 struct msghdr mh = {
1588 .msg_control = &control,
1589 .msg_controllen = sizeof(control),
1591 struct cmsghdr *cmsg;
1592 _cleanup_close_ int fd = -1;
1595 assert(send_fd >= 0);
1597 if (!arg_expose_ports)
1600 fd = socket(PF_NETLINK, SOCK_RAW|SOCK_CLOEXEC|SOCK_NONBLOCK, NETLINK_ROUTE);
1602 return log_error_errno(errno, "failed to allocate container netlink: %m");
/* One SCM_RIGHTS control message carrying exactly one int (the fd). */
1604 cmsg = CMSG_FIRSTHDR(&mh);
1605 cmsg->cmsg_level = SOL_SOCKET;
1606 cmsg->cmsg_type = SCM_RIGHTS;
1607 cmsg->cmsg_len = CMSG_LEN(sizeof(int));
1608 memcpy(CMSG_DATA(cmsg), &fd, sizeof(int));
1610 mh.msg_controllen = cmsg->cmsg_len;
1612 /* Store away the fd in the socket, so that it stays open as
1613 * long as we run the child */
1614 k = sendmsg(send_fd, &mh, MSG_NOSIGNAL);
1616 return log_error_errno(errno, "Failed to send netlink fd: %m");
/* Withdraws the firewall rules for the currently exposed container address.
 *
 * Checks arg_expose_ports and whether *exposed is the null address, then
 * calls fw_add_local_dnat() with its first argument set to false once per
 * entry of arg_expose_ports. A failing firewall call is only logged as a
 * warning. Finally *exposed is reset to IN_ADDR_NULL. Only AF_INET is
 * handled (af is fixed to AF_INET). */
1621 static int flush_ports(union in_addr_union *exposed) {
1623 int r, af = AF_INET;
1627 if (!arg_expose_ports)
1630 if (in_addr_is_null(af, exposed))
1633 log_debug("Lost IP address.");
1635 LIST_FOREACH(ports, p, arg_expose_ports) {
1636 r = fw_add_local_dnat(false,
1647 log_warning_errno(r, "Failed to modify firewall: %m");
1650 *exposed = IN_ADDR_NULL;
/* Re-synchronises the exposed-port firewall rules with the container's
 * current address.
 *
 * The local addresses are enumerated through "rtnl" for AF_INET. The first
 * entry is used when its family matches and its scope is below
 * RT_SCOPE_LINK; flush_ports() is the fallback visible in this function.
 * The chosen address is compared with *exposed; when a new address is
 * adopted it is logged, fw_add_local_dnat() is called with its first
 * argument set to true for every entry of arg_expose_ports (passing the
 * previous address unless it was null), and *exposed is updated. Firewall
 * failures are only logged as warnings; a failure to enumerate addresses
 * is logged and returned. */
1654 static int expose_ports(sd_rtnl *rtnl, union in_addr_union *exposed) {
1655 _cleanup_free_ struct local_address *addresses = NULL;
1656 _cleanup_free_ char *pretty = NULL;
1657 union in_addr_union new_exposed;
1660 int af = AF_INET, r;
1664 /* Invoked each time an address is added or removed inside the
1667 if (!arg_expose_ports)
1670 r = local_addresses(rtnl, 0, af, &addresses);
1672 return log_error_errno(r, "Failed to enumerate local addresses: %m");
1675 addresses[0].family == af &&
1676 addresses[0].scope < RT_SCOPE_LINK;
1679 return flush_ports(exposed);
1681 new_exposed = addresses[0].address;
1682 if (in_addr_equal(af, exposed, &new_exposed))
1685 in_addr_to_string(af, &new_exposed, &pretty);
1686 log_debug("New container IP is %s.", strna(pretty));
1688 LIST_FOREACH(ports, p, arg_expose_ports) {
1690 r = fw_add_local_dnat(true,
1699 in_addr_is_null(af, exposed) ? NULL : exposed);
1701 log_warning_errno(r, "Failed to modify firewall: %m");
1704 *exposed = new_exposed;
/* rtnl match callback, registered by watch_rtnl() for RTM_NEWADDR and
 * RTM_DELADDR. "userdata" is the pointer to the currently exposed address;
 * the handler re-runs expose_ports() and discards its result. */
1708 static int on_address_change(sd_rtnl *rtnl, sd_rtnl_message *m, void *userdata) {
1709 union in_addr_union *exposed = userdata;
1715 expose_ports(rtnl, exposed);
1719 static int watch_rtnl(sd_event *event, int recv_fd, union in_addr_union *exposed, sd_rtnl **ret) {
1721 struct cmsghdr cmsghdr;
1722 uint8_t buf[CMSG_SPACE(sizeof(int))];
1724 struct msghdr mh = {
1725 .msg_control = &control,
1726 .msg_controllen = sizeof(control),
1728 struct cmsghdr *cmsg;
1729 _cleanup_rtnl_unref_ sd_rtnl *rtnl = NULL;
1734 assert(recv_fd >= 0);
1737 if (!arg_expose_ports)
1740 k = recvmsg(recv_fd, &mh, MSG_NOSIGNAL);
1742 return log_error_errno(errno, "Failed to recv netlink fd: %m");
1744 cmsg = CMSG_FIRSTHDR(&mh);
1745 assert(cmsg->cmsg_level == SOL_SOCKET);
1746 assert(cmsg->cmsg_type == SCM_RIGHTS);
1747 assert(cmsg->cmsg_len == CMSG_LEN(sizeof(int)));
1748 memcpy(&fd, CMSG_DATA(cmsg), sizeof(int));
1750 r = sd_rtnl_open_fd(&rtnl, fd, 1, RTNLGRP_IPV4_IFADDR);
1753 return log_error_errno(r, "Failed to create rtnl object: %m");
1756 r = sd_rtnl_add_match(rtnl, RTM_NEWADDR, on_address_change, exposed);
1758 return log_error_errno(r, "Failed to subscribe to RTM_NEWADDR messages: %m");
1760 r = sd_rtnl_add_match(rtnl, RTM_DELADDR, on_address_change, exposed);
1762 return log_error_errno(r, "Failed to subscribe to RTM_DELADDR messages: %m");
1764 r = sd_rtnl_attach_event(rtnl, event, 0);
1766 return log_error_errno(r, "Failed to add to even loop: %m");
/* Sets the hostname to the machine name (arg_machine) through
 * sethostname_idempotent(). The arg_share_system flag is checked first;
 * presumably the hostname is left alone in that mode, but the early return
 * is not visible here, so confirm. */
1774 static int setup_hostname(void) {
1776 if (arg_share_system)
1779 if (sethostname_idempotent(arg_machine) < 0)
1785 static int setup_journal(const char *directory) {
1786 sd_id128_t machine_id, this_id;
1787 _cleanup_free_ char *p = NULL, *b = NULL, *q = NULL, *d = NULL;
1791 /* Don't link journals in ephemeral mode */
1795 p = strappend(directory, "/etc/machine-id");
1799 r = read_one_line_file(p, &b);
1800 if (r == -ENOENT && arg_link_journal == LINK_AUTO)
1803 return log_error_errno(r, "Failed to read machine ID from %s: %m", p);
1806 if (isempty(id) && arg_link_journal == LINK_AUTO)
1809 /* Verify validity */
1810 r = sd_id128_from_string(id, &machine_id);
1812 return log_error_errno(r, "Failed to parse machine ID from %s: %m", p);
1814 r = sd_id128_get_machine(&this_id);
1816 return log_error_errno(r, "Failed to retrieve machine ID: %m");
1818 if (sd_id128_equal(machine_id, this_id)) {
1819 log_full(arg_link_journal == LINK_AUTO ? LOG_WARNING : LOG_ERR,
1820 "Host and machine ids are equal (%s): refusing to link journals", id);
1821 if (arg_link_journal == LINK_AUTO)
1826 if (arg_link_journal == LINK_NO)
1830 p = strappend("/var/log/journal/", id);
1831 q = strjoin(directory, "/var/log/journal/", id, NULL);
1835 if (path_is_mount_point(p, false) > 0) {
1836 if (arg_link_journal != LINK_AUTO) {
1837 log_error("%s: already a mount point, refusing to use for journal", p);
1844 if (path_is_mount_point(q, false) > 0) {
1845 if (arg_link_journal != LINK_AUTO) {
1846 log_error("%s: already a mount point, refusing to use for journal", q);
1853 r = readlink_and_make_absolute(p, &d);
1855 if ((arg_link_journal == LINK_GUEST ||
1856 arg_link_journal == LINK_AUTO) &&
1859 r = mkdir_p(q, 0755);
1861 log_warning_errno(errno, "Failed to create directory %s: %m", q);
1866 return log_error_errno(errno, "Failed to remove symlink %s: %m", p);
1867 } else if (r == -EINVAL) {
1869 if (arg_link_journal == LINK_GUEST &&
1872 if (errno == ENOTDIR) {
1873 log_error("%s already exists and is neither a symlink nor a directory", p);
1876 log_error_errno(errno, "Failed to remove %s: %m", p);
1880 } else if (r != -ENOENT) {
1881 log_error_errno(errno, "readlink(%s) failed: %m", p);
1885 if (arg_link_journal == LINK_GUEST) {
1887 if (symlink(q, p) < 0) {
1888 if (arg_link_journal_try) {
1889 log_debug_errno(errno, "Failed to symlink %s to %s, skipping journal setup: %m", q, p);
1892 log_error_errno(errno, "Failed to symlink %s to %s: %m", q, p);
1897 r = mkdir_p(q, 0755);
1899 log_warning_errno(errno, "Failed to create directory %s: %m", q);
1903 if (arg_link_journal == LINK_HOST) {
1904 /* don't create parents here -- if the host doesn't have
1905 * permanent journal set up, don't force it here */
1908 if (arg_link_journal_try) {
1909 log_debug_errno(errno, "Failed to create %s, skipping journal setup: %m", p);
1912 log_error_errno(errno, "Failed to create %s: %m", p);
1917 } else if (access(p, F_OK) < 0)
1920 if (dir_is_empty(q) == 0)
1921 log_warning("%s is not empty, proceeding anyway.", q);
1923 r = mkdir_p(q, 0755);
1925 log_error_errno(errno, "Failed to create %s: %m", q);
1929 if (mount(p, q, "bind", MS_BIND, NULL) < 0)
1930 return log_error_errno(errno, "Failed to bind mount journal from host into guest: %m");
/* Thin wrapper that passes the bitwise complement of arg_retain to
 * capability_bounding_set_drop() and returns its result unchanged.
 * NOTE(review): the complement is presumably the set of capabilities to
 * drop (i.e. everything not retained); confirm against the helper. */
1935 static int drop_capabilities(void) {
1936 return capability_bounding_set_drop(~arg_retain, false);
/* Registers the container with org.freedesktop.machine1 over the system
 * bus.
 *
 * With arg_keep_unit set, the Manager method RegisterMachineWithNetwork is
 * called directly. Otherwise a CreateMachineWithNetwork call is assembled
 * by hand so that an array of unit properties can be appended: Slice (if
 * arg_slice is non-empty), DevicePolicy=strict, a DeviceAllow list, and
 * one entry per element of arg_property. Both variants pass the machine
 * UUID, the container directory and, if local_ifindex is positive, that
 * one interface index. Errors while building the message go through
 * bus_log_create_error(); a failed call is logged with the bus error
 * message. */
1939 static int register_machine(pid_t pid, int local_ifindex) {
1940 _cleanup_bus_error_free_ sd_bus_error error = SD_BUS_ERROR_NULL;
1941 _cleanup_bus_close_unref_ sd_bus *bus = NULL;
1947 r = sd_bus_default_system(&bus);
1949 return log_error_errno(r, "Failed to open system bus: %m");
1951 if (arg_keep_unit) {
1952 r = sd_bus_call_method(
1954 "org.freedesktop.machine1",
1955 "/org/freedesktop/machine1",
1956 "org.freedesktop.machine1.Manager",
1957 "RegisterMachineWithNetwork",
1962 SD_BUS_MESSAGE_APPEND_ID128(arg_uuid),
1966 strempty(arg_directory),
1967 local_ifindex > 0 ? 1 : 0, local_ifindex);
1969 _cleanup_bus_message_unref_ sd_bus_message *m = NULL;
1972 r = sd_bus_message_new_method_call(
1975 "org.freedesktop.machine1",
1976 "/org/freedesktop/machine1",
1977 "org.freedesktop.machine1.Manager",
1978 "CreateMachineWithNetwork");
1980 return bus_log_create_error(r);
1982 r = sd_bus_message_append(
1986 SD_BUS_MESSAGE_APPEND_ID128(arg_uuid),
1990 strempty(arg_directory),
1991 local_ifindex > 0 ? 1 : 0, local_ifindex);
1993 return bus_log_create_error(r);
1995 r = sd_bus_message_open_container(m, 'a', "(sv)");
1997 return bus_log_create_error(r);
1999 if (!isempty(arg_slice)) {
2000 r = sd_bus_message_append(m, "(sv)", "Slice", "s", arg_slice);
2002 return bus_log_create_error(r);
2005 r = sd_bus_message_append(m, "(sv)", "DevicePolicy", "s", "strict");
2007 return bus_log_create_error(r);
2009 r = sd_bus_message_append(m, "(sv)", "DeviceAllow", "a(ss)", 9,
2010 /* Allow the container to
2011 * access and create the API
2012 * device nodes, so that
2013 * PrivateDevices= in the
2014 * container can work
2019 "/dev/random", "rwm",
2020 "/dev/urandom", "rwm",
2022 "/dev/net/tun", "rwm",
2023 /* Allow the container
2024 * access to ptys. However,
2026 * container to ever create
2027 * these device nodes. */
2028 "/dev/pts/ptmx", "rw",
2031 return log_error_errno(r, "Failed to add device whitelist: %m");
2033 STRV_FOREACH(i, arg_property) {
2034 r = sd_bus_message_open_container(m, 'r', "sv");
2036 return bus_log_create_error(r);
2038 r = bus_append_unit_property_assignment(m, *i);
2042 r = sd_bus_message_close_container(m);
2044 return bus_log_create_error(r);
2047 r = sd_bus_message_close_container(m);
2049 return bus_log_create_error(r);
2051 r = sd_bus_call(bus, m, 0, &error, NULL);
2055 log_error("Failed to register machine: %s", bus_error_message(&error, r));
/* Asks org.freedesktop.machine1 to terminate the machine belonging to
 * "pid".
 *
 * Two bus calls are made on the system bus: one on the Manager interface
 * whose reply carries the machine's object path, and one on the Machine
 * interface of that path. Failure of either call is only logged at debug
 * level, since the machine may already be gone; a failure to open the bus
 * or to parse the reply is reported as an error. */
2062 static int terminate_machine(pid_t pid) {
2063 _cleanup_bus_error_free_ sd_bus_error error = SD_BUS_ERROR_NULL;
2064 _cleanup_bus_message_unref_ sd_bus_message *reply = NULL;
2065 _cleanup_bus_close_unref_ sd_bus *bus = NULL;
2072 r = sd_bus_default_system(&bus);
2074 return log_error_errno(r, "Failed to open system bus: %m");
2076 r = sd_bus_call_method(
2078 "org.freedesktop.machine1",
2079 "/org/freedesktop/machine1",
2080 "org.freedesktop.machine1.Manager",
2087 /* Note that the machine might already have been
2088 * cleaned up automatically, hence don't consider it a
2089 * failure if we cannot get the machine object. */
2090 log_debug("Failed to get machine: %s", bus_error_message(&error, r));
2094 r = sd_bus_message_read(reply, "o", &path);
2096 return bus_log_parse_error(r);
2098 r = sd_bus_call_method(
2100 "org.freedesktop.machine1",
2102 "org.freedesktop.machine1.Machine",
2108 log_debug("Failed to terminate machine: %s", bus_error_message(&error, r));
/* Resets the audit login UID of the current process by writing
 * "4294967295" to /proc/self/loginuid, unless the file already contains
 * that value. The arg_share_system flag is checked first. If the write
 * fails, a long explanatory error is logged (audit on old kernels is
 * incompatible with containers); the message announces a 5s sleep. */
2115 static int reset_audit_loginuid(void) {
2116 _cleanup_free_ char *p = NULL;
2119 if (arg_share_system)
2122 r = read_one_line_file("/proc/self/loginuid", &p);
2126 return log_error_errno(r, "Failed to read /proc/self/loginuid: %m");
2128 /* Already reset? */
2129 if (streq(p, "4294967295"))
2132 r = write_string_file("/proc/self/loginuid", "4294967295");
2134 log_error("Failed to reset audit login UID. This probably means that your kernel is too\n"
2135 "old and you have audit enabled. Note that the auditing subsystem is known to\n"
2136 "be incompatible with containers on old kernels. Please make sure to upgrade\n"
2137 "your kernel or to off auditing with 'audit=0' on the kernel command line before\n"
2138 "using systemd-nspawn. Sleeping for 5s... (%s)\n", strerror(-r));
/* Fixed 128-bit hash keys passed to generate_mac(): one each for the host
 * side of the veth pair, the container side of the veth pair, and MACVLAN
 * interfaces. Using distinct keys yields distinct MAC addresses for the
 * same machine ID and container name. */
2146 #define HOST_HASH_KEY SD_ID128_MAKE(1a,37,6f,c7,46,ec,45,0b,ad,a3,d5,31,06,60,5d,b1)
2147 #define CONTAINER_HASH_KEY SD_ID128_MAKE(c3,c4,f9,19,b5,57,b2,1c,e6,cf,14,27,03,9c,ee,a2)
2148 #define MACVLAN_HASH_KEY SD_ID128_MAKE(00,13,6d,bc,66,83,44,81,bb,0c,f9,51,1f,24,a6,6f)
/* Derives a stable MAC address for this container.
 *
 * A buffer consisting of the host machine ID, the container name
 * (arg_machine) and "idx" is hashed with siphash24 under "hash_key". The
 * first ETH_ALEN bytes of the hash become the address, with the multicast
 * bit cleared and the locally-administered bit set. The result is written
 * to *mac. */
2150 static int generate_mac(struct ether_addr *mac, sd_id128_t hash_key, uint64_t idx) {
2156 l = strlen(arg_machine);
2157 sz = sizeof(sd_id128_t) + l;
2163 /* fetch some persistent data unique to the host */
2164 r = sd_id128_get_machine((sd_id128_t*) v);
2168 /* combine with some data unique (on this host) to this
2169 * container instance */
2170 i = mempcpy(v + sizeof(sd_id128_t), arg_machine, l);
2173 memcpy(i, &idx, sizeof(idx));
2176 /* Let's hash the host machine ID plus the container name. We
2177 * use a fixed, but originally randomly created hash key here. */
2178 siphash24(result, v, sz, hash_key.bytes);
/* Compile-time check that the hash result is large enough to fill a MAC. */
2180 assert_cc(ETH_ALEN <= sizeof(result));
2181 memcpy(mac->ether_addr_octet, result, ETH_ALEN);
2183 /* see eth_random_addr in the kernel */
2184 mac->ether_addr_octet[0] &= 0xfe; /* clear multicast bit */
2185 mac->ether_addr_octet[0] |= 0x02; /* set local assignment bit (IEEE802) */
/* Creates the veth pair connecting host and container.
 *
 * Only relevant with arg_private_network and arg_network_veth (both are
 * checked first). The host-side name is written into "iface_name" as
 * "vb-<machine>" in bridge mode or "ve-<machine>" otherwise, limited by
 * the snprintf() size argument of IFNAMSIZ - 1. The peer is named "host0"
 * and is placed in the network namespace of "pid". Both ends get
 * predictable MAC addresses from generate_mac(). After the netlink call,
 * the host-side interface index is looked up with if_nametoindex().
 * NOTE(review): how the index reaches the caller through "ifi" is not
 * visible here; confirm. Every failing step is logged and returned through
 * log_error_errno(). */
2190 static int setup_veth(pid_t pid, char iface_name[IFNAMSIZ], int *ifi) {
2191 _cleanup_rtnl_message_unref_ sd_rtnl_message *m = NULL;
2192 _cleanup_rtnl_unref_ sd_rtnl *rtnl = NULL;
2193 struct ether_addr mac_host, mac_container;
2196 if (!arg_private_network)
2199 if (!arg_network_veth)
2202 /* Use two different interface name prefixes depending whether
2203 * we are in bridge mode or not. */
2204 snprintf(iface_name, IFNAMSIZ - 1, "%s-%s",
2205 arg_network_bridge ? "vb" : "ve", arg_machine);
2207 r = generate_mac(&mac_container, CONTAINER_HASH_KEY, 0);
2209 return log_error_errno(r, "Failed to generate predictable MAC address for container side: %m");
2211 r = generate_mac(&mac_host, HOST_HASH_KEY, 0);
2213 return log_error_errno(r, "Failed to generate predictable MAC address for host side: %m");
2215 r = sd_rtnl_open(&rtnl, 0);
2217 return log_error_errno(r, "Failed to connect to netlink: %m");
2219 r = sd_rtnl_message_new_link(rtnl, &m, RTM_NEWLINK, 0);
2221 return log_error_errno(r, "Failed to allocate netlink message: %m");
2223 r = sd_rtnl_message_append_string(m, IFLA_IFNAME, iface_name);
2225 return log_error_errno(r, "Failed to add netlink interface name: %m");
2227 r = sd_rtnl_message_append_ether_addr(m, IFLA_ADDRESS, &mac_host);
2229 return log_error_errno(r, "Failed to add netlink MAC address: %m");
/* Nested attributes: IFLA_LINKINFO > IFLA_INFO_DATA("veth") >
 * VETH_INFO_PEER. Everything appended until the matching close calls
 * describes the peer (container-side) interface. */
2231 r = sd_rtnl_message_open_container(m, IFLA_LINKINFO);
2233 return log_error_errno(r, "Failed to open netlink container: %m");
2235 r = sd_rtnl_message_open_container_union(m, IFLA_INFO_DATA, "veth");
2237 return log_error_errno(r, "Failed to open netlink container: %m");
2239 r = sd_rtnl_message_open_container(m, VETH_INFO_PEER);
2241 return log_error_errno(r, "Failed to open netlink container: %m");
2243 r = sd_rtnl_message_append_string(m, IFLA_IFNAME, "host0");
2245 return log_error_errno(r, "Failed to add netlink interface name: %m");
2247 r = sd_rtnl_message_append_ether_addr(m, IFLA_ADDRESS, &mac_container);
2249 return log_error_errno(r, "Failed to add netlink MAC address: %m");
2251 r = sd_rtnl_message_append_u32(m, IFLA_NET_NS_PID, pid);
2253 return log_error_errno(r, "Failed to add netlink namespace field: %m");
2255 r = sd_rtnl_message_close_container(m);
2257 return log_error_errno(r, "Failed to close netlink container: %m");
2259 r = sd_rtnl_message_close_container(m);
2261 return log_error_errno(r, "Failed to close netlink container: %m");
2263 r = sd_rtnl_message_close_container(m);
2265 return log_error_errno(r, "Failed to close netlink container: %m");
2267 r = sd_rtnl_call(rtnl, m, 0, NULL);
2269 return log_error_errno(r, "Failed to add new veth interfaces: %m");
2271 i = (int) if_nametoindex(iface_name);
2273 return log_error_errno(errno, "Failed to resolve interface %s: %m", iface_name);
/* Attaches the host-side veth interface "veth_name" to the bridge named by
 * arg_network_bridge and brings it up.
 *
 * Only relevant with arg_private_network, arg_network_veth and
 * arg_network_bridge (all three are checked first). The bridge name is
 * resolved to an index with if_nametoindex(), and a single RTM_SETLINK
 * message sets IFF_UP and IFLA_MASTER on the veth interface.
 * NOTE(review): what is stored through "ifi" is not visible here; confirm.
 * Every failing step is logged and returned through log_error_errno(). */
2280 static int setup_bridge(const char veth_name[], int *ifi) {
2281 _cleanup_rtnl_message_unref_ sd_rtnl_message *m = NULL;
2282 _cleanup_rtnl_unref_ sd_rtnl *rtnl = NULL;
2285 if (!arg_private_network)
2288 if (!arg_network_veth)
2291 if (!arg_network_bridge)
2294 bridge = (int) if_nametoindex(arg_network_bridge);
2296 return log_error_errno(errno, "Failed to resolve interface %s: %m", arg_network_bridge);
2300 r = sd_rtnl_open(&rtnl, 0);
2302 return log_error_errno(r, "Failed to connect to netlink: %m");
2304 r = sd_rtnl_message_new_link(rtnl, &m, RTM_SETLINK, 0);
2306 return log_error_errno(r, "Failed to allocate netlink message: %m");
2308 r = sd_rtnl_message_link_set_flags(m, IFF_UP, IFF_UP);
2310 return log_error_errno(r, "Failed to set IFF_UP flag: %m");
2312 r = sd_rtnl_message_append_string(m, IFLA_IFNAME, veth_name);
2314 return log_error_errno(r, "Failed to add netlink interface name field: %m");
2316 r = sd_rtnl_message_append_u32(m, IFLA_MASTER, bridge);
2318 return log_error_errno(r, "Failed to add netlink master field: %m");
2320 r = sd_rtnl_call(rtnl, m, 0, NULL);
2322 return log_error_errno(r, "Failed to add veth interface to bridge: %m");
/* Resolves the network interface "name" to its index and checks with udev
 * that the device is initialized.
 *
 * The index from if_nametoindex() is formatted as the udev device ID
 * "n<ifindex>" to look the device up. Callers in this file use the return
 * value as the interface index. A lookup failure is logged and returned
 * through log_error_errno(); an uninitialized device is logged as an
 * error. */
2327 static int parse_interface(struct udev *udev, const char *name) {
2328 _cleanup_udev_device_unref_ struct udev_device *d = NULL;
/* Room for the 'n' prefix, the decimal index and the terminating NUL. */
2329 char ifi_str[2 + DECIMAL_STR_MAX(int)];
2332 ifi = (int) if_nametoindex(name);
2334 return log_error_errno(errno, "Failed to resolve interface %s: %m", name);
2336 sprintf(ifi_str, "n%i", ifi);
2337 d = udev_device_new_from_device_id(udev, ifi_str);
2339 return log_error_errno(errno, "Failed to get udev device for interface %s: %m", name);
2341 if (udev_device_get_is_initialized(d) <= 0) {
2342 log_error("Network interface %s is not initialized yet.", name);
/* Moves every interface listed in arg_network_interfaces into the network
 * namespace of "pid".
 *
 * Only relevant with arg_private_network and a non-empty interface list
 * (both are checked first). Each name is resolved through
 * parse_interface() and moved with an RTM_SETLINK message carrying
 * IFLA_NET_NS_PID. A failure to build or send a message is logged and
 * returned through log_error_errno(). */
2349 static int move_network_interfaces(pid_t pid) {
2350 _cleanup_udev_unref_ struct udev *udev = NULL;
2351 _cleanup_rtnl_unref_ sd_rtnl *rtnl = NULL;
2355 if (!arg_private_network)
2358 if (strv_isempty(arg_network_interfaces))
2361 r = sd_rtnl_open(&rtnl, 0);
2363 return log_error_errno(r, "Failed to connect to netlink: %m");
2367 log_error("Failed to connect to udev.");
2371 STRV_FOREACH(i, arg_network_interfaces) {
2372 _cleanup_rtnl_message_unref_ sd_rtnl_message *m = NULL;
2375 ifi = parse_interface(udev, *i);
2379 r = sd_rtnl_message_new_link(rtnl, &m, RTM_SETLINK, ifi);
2381 return log_error_errno(r, "Failed to allocate netlink message: %m");
2383 r = sd_rtnl_message_append_u32(m, IFLA_NET_NS_PID, pid);
2385 return log_error_errno(r, "Failed to append namespace PID to netlink message: %m");
2387 r = sd_rtnl_call(rtnl, m, 0, NULL);
2389 return log_error_errno(r, "Failed to move interface %s to namespace: %m", *i);
/* Creates one MACVLAN interface in bridge mode for every entry of
 * arg_network_macvlan and places it in the network namespace of "pid".
 *
 * Only relevant with arg_private_network and a non-empty list (both are
 * checked first). For each parent interface the new link is named
 * "mv-<parent>", shortened to IFNAMSIZ-1 characters, and gets a
 * predictable MAC address from generate_mac() using an index that
 * increases per entry. A failure to build or send a message is logged and
 * returned through log_error_errno(). */
2395 static int setup_macvlan(pid_t pid) {
2396 _cleanup_udev_unref_ struct udev *udev = NULL;
2397 _cleanup_rtnl_unref_ sd_rtnl *rtnl = NULL;
2402 if (!arg_private_network)
2405 if (strv_isempty(arg_network_macvlan))
2408 r = sd_rtnl_open(&rtnl, 0);
2410 return log_error_errno(r, "Failed to connect to netlink: %m");
2414 log_error("Failed to connect to udev.");
2418 STRV_FOREACH(i, arg_network_macvlan) {
2419 _cleanup_rtnl_message_unref_ sd_rtnl_message *m = NULL;
2420 _cleanup_free_ char *n = NULL;
2421 struct ether_addr mac;
2424 ifi = parse_interface(udev, *i);
2428 r = generate_mac(&mac, MACVLAN_HASH_KEY, idx++);
2430 return log_error_errno(r, "Failed to create MACVLAN MAC address: %m");
2432 r = sd_rtnl_message_new_link(rtnl, &m, RTM_NEWLINK, 0);
2434 return log_error_errno(r, "Failed to allocate netlink message: %m");
/* IFLA_LINK names the parent interface the MACVLAN sits on. */
2436 r = sd_rtnl_message_append_u32(m, IFLA_LINK, ifi);
2438 return log_error_errno(r, "Failed to add netlink interface index: %m");
2440 n = strappend("mv-", *i);
2444 strshorten(n, IFNAMSIZ-1);
2446 r = sd_rtnl_message_append_string(m, IFLA_IFNAME, n);
2448 return log_error_errno(r, "Failed to add netlink interface name: %m");
2450 r = sd_rtnl_message_append_ether_addr(m, IFLA_ADDRESS, &mac);
2452 return log_error_errno(r, "Failed to add netlink MAC address: %m");
2454 r = sd_rtnl_message_append_u32(m, IFLA_NET_NS_PID, pid);
2456 return log_error_errno(r, "Failed to add netlink namespace field: %m");
2458 r = sd_rtnl_message_open_container(m, IFLA_LINKINFO);
2460 return log_error_errno(r, "Failed to open netlink container: %m");
2462 r = sd_rtnl_message_open_container_union(m, IFLA_INFO_DATA, "macvlan");
2464 return log_error_errno(r, "Failed to open netlink container: %m");
2466 r = sd_rtnl_message_append_u32(m, IFLA_MACVLAN_MODE, MACVLAN_MODE_BRIDGE);
2468 return log_error_errno(r, "Failed to append macvlan mode: %m");
2470 r = sd_rtnl_message_close_container(m);
2472 return log_error_errno(r, "Failed to close netlink container: %m");
2474 r = sd_rtnl_message_close_container(m);
2476 return log_error_errno(r, "Failed to close netlink container: %m");
2478 r = sd_rtnl_call(rtnl, m, 0, NULL);
2480 return log_error_errno(r, "Failed to add new macvlan interfaces: %m");
/* Creates one IPVLAN interface in L2 mode for every entry of
 * arg_network_ipvlan and places it in the network namespace of "pid".
 *
 * Only relevant with arg_private_network and a non-empty list (both are
 * checked first). For each parent interface the new link is named
 * "iv-<parent>", shortened to IFNAMSIZ-1 characters. Unlike
 * setup_macvlan(), no MAC address is set on the new link. A failure to
 * build or send a message is logged and returned through
 * log_error_errno(). */
2486 static int setup_ipvlan(pid_t pid) {
2487 _cleanup_udev_unref_ struct udev *udev = NULL;
2488 _cleanup_rtnl_unref_ sd_rtnl *rtnl = NULL;
2492 if (!arg_private_network)
2495 if (strv_isempty(arg_network_ipvlan))
2498 r = sd_rtnl_open(&rtnl, 0);
2500 return log_error_errno(r, "Failed to connect to netlink: %m");
2504 log_error("Failed to connect to udev.");
2508 STRV_FOREACH(i, arg_network_ipvlan) {
2509 _cleanup_rtnl_message_unref_ sd_rtnl_message *m = NULL;
2510 _cleanup_free_ char *n = NULL;
2513 ifi = parse_interface(udev, *i);
2517 r = sd_rtnl_message_new_link(rtnl, &m, RTM_NEWLINK, 0);
2519 return log_error_errno(r, "Failed to allocate netlink message: %m");
/* IFLA_LINK names the parent interface the IPVLAN sits on. */
2521 r = sd_rtnl_message_append_u32(m, IFLA_LINK, ifi);
2523 return log_error_errno(r, "Failed to add netlink interface index: %m");
2525 n = strappend("iv-", *i);
2529 strshorten(n, IFNAMSIZ-1);
2531 r = sd_rtnl_message_append_string(m, IFLA_IFNAME, n);
2533 return log_error_errno(r, "Failed to add netlink interface name: %m");
2535 r = sd_rtnl_message_append_u32(m, IFLA_NET_NS_PID, pid);
2537 return log_error_errno(r, "Failed to add netlink namespace field: %m");
2539 r = sd_rtnl_message_open_container(m, IFLA_LINKINFO);
2541 return log_error_errno(r, "Failed to open netlink container: %m");
2543 r = sd_rtnl_message_open_container_union(m, IFLA_INFO_DATA, "ipvlan");
2545 return log_error_errno(r, "Failed to open netlink container: %m");
2547 r = sd_rtnl_message_append_u16(m, IFLA_IPVLAN_MODE, IPVLAN_MODE_L2);
2549 return log_error_errno(r, "Failed to add ipvlan mode: %m");
2551 r = sd_rtnl_message_close_container(m);
2553 return log_error_errno(r, "Failed to close netlink container: %m");
2555 r = sd_rtnl_message_close_container(m);
2557 return log_error_errno(r, "Failed to close netlink container: %m");
2559 r = sd_rtnl_call(rtnl, m, 0, NULL);
2561 return log_error_errno(r, "Failed to add new ipvlan interfaces: %m");
/* Installs the container's seccomp filter.
 *
 * The filter allows everything by default (SCMP_ACT_ALLOW). Each syscall
 * in the blacklist table is made to fail with EPERM, except where the
 * capability paired with it is present in arg_retain. In addition,
 * socket(AF_NETLINK, *, NETLINK_AUDIT) is made to fail with EAFNOSUPPORT.
 * The filter attribute SCMP_FLTATR_CTL_NNP is set to 0 before loading
 * (the error text describes this as unsetting NO_NEW_PRIVS). The filter
 * context is released with seccomp_release(). */
2567 static int setup_seccomp(void) {
2570 static const struct {
2571 uint64_t capability;
2574 { CAP_SYS_RAWIO, SCMP_SYS(iopl)},
2575 { CAP_SYS_RAWIO, SCMP_SYS(ioperm)},
2576 { CAP_SYS_BOOT, SCMP_SYS(kexec_load)},
2577 { CAP_SYS_ADMIN, SCMP_SYS(swapon)},
2578 { CAP_SYS_ADMIN, SCMP_SYS(swapoff)},
2579 { CAP_SYS_ADMIN, SCMP_SYS(open_by_handle_at)},
2580 { CAP_SYS_MODULE, SCMP_SYS(init_module)},
2581 { CAP_SYS_MODULE, SCMP_SYS(finit_module)},
2582 { CAP_SYS_MODULE, SCMP_SYS(delete_module)},
2585 scmp_filter_ctx seccomp;
2589 seccomp = seccomp_init(SCMP_ACT_ALLOW);
2593 r = seccomp_add_secondary_archs(seccomp);
2595 log_error_errno(r, "Failed to add secondary archs to seccomp filter: %m");
2599 for (i = 0; i < ELEMENTSOF(blacklist); i++) {
/* arg_retain is used as a bit mask indexed by capability number. */
2600 if (arg_retain & (1ULL << blacklist[i].capability))
2603 r = seccomp_rule_add(seccomp, SCMP_ACT_ERRNO(EPERM), blacklist[i].syscall_num, 0);
2605 continue; /* unknown syscall */
2607 log_error_errno(r, "Failed to block syscall: %m");
2614 Audit is broken in containers, much of the userspace audit
2615 hookup will fail if running inside a container. We don't
2616 care and just turn off creation of audit sockets.
2618 This will make socket(AF_NETLINK, *, NETLINK_AUDIT) fail
2619 with EAFNOSUPPORT which audit userspace uses as indication
2620 that audit is disabled in the kernel.
2623 r = seccomp_rule_add(
2625 SCMP_ACT_ERRNO(EAFNOSUPPORT),
2628 SCMP_A0(SCMP_CMP_EQ, AF_NETLINK),
2629 SCMP_A2(SCMP_CMP_EQ, NETLINK_AUDIT));
2631 log_error_errno(r, "Failed to add audit seccomp rule: %m");
2635 r = seccomp_attr_set(seccomp, SCMP_FLTATR_CTL_NNP, 0);
2637 log_error_errno(r, "Failed to unset NO_NEW_PRIVS: %m");
2641 r = seccomp_load(seccomp);
2643 log_error_errno(r, "Failed to install seccomp audit filter: %m");
2646 seccomp_release(seccomp);
/* Sets up the read-only propagation directory inside the container.
 *
 * /run/systemd/nspawn/propagate/<machine> is created on the host and bind
 * mounted onto <root>/run/systemd/nspawn/incoming, and that bind mount is
 * then remounted read-only. Directory creation is best-effort (results
 * are ignored); mount failures are logged and returned through
 * log_error_errno(). */
2654 static int setup_propagate(const char *root) {
2657 (void) mkdir_p("/run/systemd/nspawn/", 0755);
2658 (void) mkdir_p("/run/systemd/nspawn/propagate", 0600);
2659 p = strjoina("/run/systemd/nspawn/propagate/", arg_machine);
2660 (void) mkdir_p(p, 0600);
2662 q = strjoina(root, "/run/systemd/nspawn/incoming");
2663 mkdir_parents(q, 0755);
2666 if (mount(p, q, NULL, MS_BIND, NULL) < 0)
2667 return log_error_errno(errno, "Failed to install propagation bind mount.");
/* Making a bind mount read-only takes a second, remounting mount() call. */
2669 if (mount(NULL, q, NULL, MS_BIND|MS_REMOUNT|MS_RDONLY, NULL) < 0)
2670 return log_error_errno(errno, "Failed to make propagation mount read-only");
2675 static int setup_image(char **device_path, int *loop_nr) {
2676 struct loop_info64 info = {
2677 .lo_flags = LO_FLAGS_AUTOCLEAR|LO_FLAGS_PARTSCAN
2679 _cleanup_close_ int fd = -1, control = -1, loop = -1;
2680 _cleanup_free_ char* loopdev = NULL;
2684 assert(device_path);
2688 fd = open(arg_image, O_CLOEXEC|(arg_read_only ? O_RDONLY : O_RDWR)|O_NONBLOCK|O_NOCTTY);
2690 return log_error_errno(errno, "Failed to open %s: %m", arg_image);
2692 if (fstat(fd, &st) < 0)
2693 return log_error_errno(errno, "Failed to stat %s: %m", arg_image);
2695 if (S_ISBLK(st.st_mode)) {
2698 p = strdup(arg_image);
2712 if (!S_ISREG(st.st_mode)) {
2713 log_error_errno(errno, "%s is not a regular file or block device: %m", arg_image);
2717 control = open("/dev/loop-control", O_RDWR|O_CLOEXEC|O_NOCTTY|O_NONBLOCK);
2719 return log_error_errno(errno, "Failed to open /dev/loop-control: %m");
2721 nr = ioctl(control, LOOP_CTL_GET_FREE);
2723 return log_error_errno(errno, "Failed to allocate loop device: %m");
2725 if (asprintf(&loopdev, "/dev/loop%i", nr) < 0)
2728 loop = open(loopdev, O_CLOEXEC|(arg_read_only ? O_RDONLY : O_RDWR)|O_NONBLOCK|O_NOCTTY);
2730 return log_error_errno(errno, "Failed to open loop device %s: %m", loopdev);
2732 if (ioctl(loop, LOOP_SET_FD, fd) < 0)
2733 return log_error_errno(errno, "Failed to set loopback file descriptor on %s: %m", loopdev);
2736 info.lo_flags |= LO_FLAGS_READ_ONLY;
2738 if (ioctl(loop, LOOP_SET_STATUS64, &info) < 0)
2739 return log_error_errno(errno, "Failed to set loopback settings on %s: %m", loopdev);
2741 *device_path = loopdev;
/* Multi-line explanatory text that dissect_image() adds to its error
 * messages when no usable partition table is found on the image. It
 * describes the partition layouts systemd-nspawn can boot. */
2752 #define PARTITION_TABLE_BLURB \
2753 "Note that the disk image needs to either contain only a single MBR partition of\n" \
2754 "type 0x83 that is marked bootable, or a single GPT partition of type " \
2755 "0FC63DAF-8483-4772-8E79-3D69D8477DE4 or follow\n" \
2756 " http://www.freedesktop.org/wiki/Specifications/DiscoverablePartitionsSpec/\n" \
2757 "to be bootable with systemd-nspawn."
2759 static int dissect_image(
2761 char **root_device, bool *root_device_rw,
2762 char **home_device, bool *home_device_rw,
2763 char **srv_device, bool *srv_device_rw,
2767 int home_nr = -1, srv_nr = -1;
2768 #ifdef GPT_ROOT_NATIVE
2771 #ifdef GPT_ROOT_SECONDARY
2772 int secondary_root_nr = -1;
2774 _cleanup_free_ char *home = NULL, *root = NULL, *secondary_root = NULL, *srv = NULL, *generic = NULL;
2775 _cleanup_udev_enumerate_unref_ struct udev_enumerate *e = NULL;
2776 _cleanup_udev_device_unref_ struct udev_device *d = NULL;
2777 _cleanup_blkid_free_probe_ blkid_probe b = NULL;
2778 _cleanup_udev_unref_ struct udev *udev = NULL;
2779 struct udev_list_entry *first, *item;
2780 bool home_rw = true, root_rw = true, secondary_root_rw = true, srv_rw = true, generic_rw = true;
2781 bool is_gpt, is_mbr, multiple_generic = false;
2782 const char *pttype = NULL;
2789 assert(root_device);
2790 assert(home_device);
2795 b = blkid_new_probe();
2800 r = blkid_probe_set_device(b, fd, 0, 0);
2805 log_error_errno(errno, "Failed to set device on blkid probe: %m");
2809 blkid_probe_enable_partitions(b, 1);
2810 blkid_probe_set_partitions_flags(b, BLKID_PARTS_ENTRY_DETAILS);
2813 r = blkid_do_safeprobe(b);
2814 if (r == -2 || r == 1) {
2815 log_error("Failed to identify any partition table on\n"
2817 PARTITION_TABLE_BLURB, arg_image);
2819 } else if (r != 0) {
2822 log_error_errno(errno, "Failed to probe: %m");
2826 blkid_probe_lookup_value(b, "PTTYPE", &pttype, NULL);
2828 is_gpt = streq_ptr(pttype, "gpt");
2829 is_mbr = streq_ptr(pttype, "dos");
2831 if (!is_gpt && !is_mbr) {
2832 log_error("No GPT or MBR partition table discovered on\n"
2834 PARTITION_TABLE_BLURB, arg_image);
2839 pl = blkid_probe_get_partitions(b);
2844 log_error("Failed to list partitions of %s", arg_image);
2852 if (fstat(fd, &st) < 0)
2853 return log_error_errno(errno, "Failed to stat block device: %m");
2855 d = udev_device_new_from_devnum(udev, 'b', st.st_rdev);
2863 log_error("Kernel partitions never appeared.");
2867 e = udev_enumerate_new(udev);
2871 r = udev_enumerate_add_match_parent(e, d);
2875 r = udev_enumerate_scan_devices(e);
2877 return log_error_errno(r, "Failed to scan for partition devices of %s: %m", arg_image);
2879 /* Count the partitions enumerated by the kernel */
2881 first = udev_enumerate_get_list_entry(e);
2882 udev_list_entry_foreach(item, first)
2885 /* Count the partitions enumerated by blkid */
2886 m = blkid_partlist_numof_partitions(pl);
2890 log_error("blkid and kernel partition list do not match.");
2896 /* The kernel has probed fewer partitions than
2897 * blkid? Maybe the kernel prober is still
2898 * running or it got EBUSY because udev
2899 * already opened the device. Let's reprobe
2900 * the device, which is a synchronous call
2901 * that waits until probing is complete. */
2903 for (j = 0; j < 20; j++) {
2905 r = ioctl(fd, BLKRRPART, 0);
2908 if (r >= 0 || r != -EBUSY)
2911 /* If something else has the device
2912 * open, such as an udev rule, the
2913 * ioctl will return EBUSY. Since
2914 * there's no way to wait until it
2915 * isn't busy anymore, let's just wait
2916 * a bit, and try again.
2918 * This is really something they
2919 * should fix in the kernel! */
2921 usleep(50 * USEC_PER_MSEC);
2925 return log_error_errno(r, "Failed to reread partition table: %m");
2928 e = udev_enumerate_unref(e);
2931 first = udev_enumerate_get_list_entry(e);
2932 udev_list_entry_foreach(item, first) {
2933 _cleanup_udev_device_unref_ struct udev_device *q;
2935 unsigned long long flags;
2941 q = udev_device_new_from_syspath(udev, udev_list_entry_get_name(item));
2946 log_error_errno(errno, "Failed to get partition device of %s: %m", arg_image);
2950 qn = udev_device_get_devnum(q);
2954 if (st.st_rdev == qn)
2957 node = udev_device_get_devnode(q);
2961 pp = blkid_partlist_devno_to_partition(pl, qn);
2965 flags = blkid_partition_get_flags(pp);
2967 nr = blkid_partition_get_partno(pp);
2975 if (flags & GPT_FLAG_NO_AUTO)
2978 stype = blkid_partition_get_type_string(pp);
2982 if (sd_id128_from_string(stype, &type_id) < 0)
2985 if (sd_id128_equal(type_id, GPT_HOME)) {
2987 if (home && nr >= home_nr)
2991 home_rw = !(flags & GPT_FLAG_READ_ONLY);
2993 r = free_and_strdup(&home, node);
2997 } else if (sd_id128_equal(type_id, GPT_SRV)) {
2999 if (srv && nr >= srv_nr)
3003 srv_rw = !(flags & GPT_FLAG_READ_ONLY);
3005 r = free_and_strdup(&srv, node);
3009 #ifdef GPT_ROOT_NATIVE
3010 else if (sd_id128_equal(type_id, GPT_ROOT_NATIVE)) {
3012 if (root && nr >= root_nr)
3016 root_rw = !(flags & GPT_FLAG_READ_ONLY);
3018 r = free_and_strdup(&root, node);
3023 #ifdef GPT_ROOT_SECONDARY
3024 else if (sd_id128_equal(type_id, GPT_ROOT_SECONDARY)) {
3026 if (secondary_root && nr >= secondary_root_nr)
3029 secondary_root_nr = nr;
3030 secondary_root_rw = !(flags & GPT_FLAG_READ_ONLY);
3032 r = free_and_strdup(&secondary_root, node);
3037 else if (sd_id128_equal(type_id, GPT_LINUX_GENERIC)) {
3040 multiple_generic = true;
3042 generic_rw = !(flags & GPT_FLAG_READ_ONLY);
3044 r = free_and_strdup(&generic, node);
3050 } else if (is_mbr) {
3053 if (flags != 0x80) /* Bootable flag */
3056 type = blkid_partition_get_type(pp);
3057 if (type != 0x83) /* Linux partition */
3061 multiple_generic = true;
3065 r = free_and_strdup(&root, node);
3073 *root_device = root;
3076 *root_device_rw = root_rw;
3078 } else if (secondary_root) {
3079 *root_device = secondary_root;
3080 secondary_root = NULL;
3082 *root_device_rw = secondary_root_rw;
3084 } else if (generic) {
3086 /* There were no partitions with precise meanings
3087 * around, but we found generic partitions. In this
3088 * case, if there's only one, we can go ahead and boot
3089 * it, otherwise we bail out, because we really cannot
3090 * make any sense of it. */
3092 if (multiple_generic) {
3093 log_error("Identified multiple bootable Linux partitions on\n"
3095 PARTITION_TABLE_BLURB, arg_image);
3099 *root_device = generic;
3102 *root_device_rw = generic_rw;
3105 log_error("Failed to identify root partition in disk image\n"
3107 PARTITION_TABLE_BLURB, arg_image);
3112 *home_device = home;
3115 *home_device_rw = home_rw;
3122 *srv_device_rw = srv_rw;
3127 log_error("--image= is not supported, compiled without blkid support.");
/* Detects the file system on the block device `what` with libblkid and
 * mounts it on the path formed by joining `where` and `directory`.
 * The mount always carries MS_NODEV, plus MS_RDONLY unless `rw` is true.
 * Every failure path visible here logs an error; the mount() failure
 * returns the result of log_error_errno().
 * NOTE(review): the return statements of the other error paths and any
 * preprocessor conditionals are not visible in this excerpt. */
3132 static int mount_device(const char *what, const char *where, const char *directory, bool rw) {
3134 _cleanup_blkid_free_probe_ blkid_probe b = NULL;
3135 const char *fstype, *p;
/* Mount target. mount_devices() passes NULL as `directory` for the root
 * device and "/home" or "/srv" for the other two. */
3145 p = strjoina(where, directory);
3150 b = blkid_new_probe_from_filename(what);
3154 log_error_errno(errno, "Failed to allocate prober for %s: %m", what);
/* Only superblock probing is enabled, with the TYPE flag as the sole
 * superblock flag. */
3158 blkid_probe_enable_superblocks(b, 1);
3159 blkid_probe_set_superblocks_flags(b, BLKID_SUBLKS_TYPE);
3162 r = blkid_do_safeprobe(b);
/* Results -1 and 1 are reported as "cannot determine"; any other
 * non-zero result is reported as a probe failure using errno. */
3163 if (r == -1 || r == 1) {
3164 log_error("Cannot determine file system type of %s", what);
3166 } else if (r != 0) {
3169 log_error_errno(errno, "Failed to probe %s: %m", what);
/* The detected type string becomes the file system type given to mount(). */
3174 if (blkid_probe_lookup_value(b, "TYPE", &fstype, NULL) < 0) {
3177 log_error("Failed to determine file system type of %s", what);
/* LUKS containers are rejected explicitly rather than handed to mount(). */
3181 if (streq(fstype, "crypto_LUKS")) {
3182 log_error("nspawn currently does not support LUKS disk images.");
3186 if (mount(what, p, fstype, MS_NODEV|(rw ? 0 : MS_RDONLY), NULL) < 0)
3187 return log_error_errno(errno, "Failed to mount %s: %m", what);
/* NOTE(review): presumably the branch used when built without libblkid;
 * the enclosing conditional is not visible here. */
3191 log_error("--image= is not supported, compiled without blkid support.");
/* Mounts the root, home and srv devices below arg_directory by calling
 * mount_device() for each one: no sub-directory (NULL) for the root
 * device, "/home" for the home device and "/srv" for the srv device.
 * When a call fails, the error is logged and returned through
 * log_error_errno().
 * NOTE(review): the first parameter line and the conditions guarding each
 * call are not visible in this excerpt; main() passes arg_directory as the
 * first argument. */
3196 static int mount_devices(
3198 const char *root_device, bool root_device_rw,
3199 const char *home_device, bool home_device_rw,
3200 const char *srv_device, bool srv_device_rw) {
3206 r = mount_device(root_device, arg_directory, NULL, root_device_rw);
3208 return log_error_errno(r, "Failed to mount root directory: %m");
3212 r = mount_device(home_device, arg_directory, "/home", home_device_rw);
3214 return log_error_errno(r, "Failed to mount home directory: %m");
3218 r = mount_device(srv_device, arg_directory, "/srv", srv_device_rw);
3220 return log_error_errno(r, "Failed to mount server data directory: %m");
/* Tears down the loop device used for a disk image: issues LOOP_CLR_FD on
 * the descriptor behind `image_fd` (when one is given and open), closes
 * it, then opens /dev/loop-control and issues LOOP_CTL_REMOVE for loop
 * device number `nr`. Failures are only logged (debug or warning level);
 * the function returns nothing.
 * NOTE(review): any early-return checks are not visible in this excerpt. */
3226 static void loop_remove(int nr, int *image_fd) {
3227 _cleanup_close_ int control = -1;
3233 if (image_fd && *image_fd >= 0) {
3234 r = ioctl(*image_fd, LOOP_CLR_FD);
3236 log_debug_errno(errno, "Failed to close loop image: %m");
/* The result of safe_close() is stored back into the caller's variable
 * (presumably -1, so the descriptor is not closed twice; confirm). */
3237 *image_fd = safe_close(*image_fd);
3240 control = open("/dev/loop-control", O_RDWR|O_CLOEXEC|O_NOCTTY|O_NONBLOCK);
3242 log_warning_errno(errno, "Failed to open /dev/loop-control: %m");
3246 r = ioctl(control, LOOP_CTL_REMOVE, nr);
3248 log_debug_errno(errno, "Failed to remove loop %d: %m", nr);
/* Runs "getent <database> <key>" in a forked child whose stdout is the
 * write end of a pipe, whose stdin and stderr are /dev/null, and whose
 * environment is empty. The parent closes its copy of the write end.
 * NOTE(review): the fork call, the assignment to `rpid` and the return
 * statement are not visible in this excerpt. Callers in this file pass the
 * return value to fdopen(..., "r") and wait on the pid they got back. */
3251 static int spawn_getent(const char *database, const char *key, pid_t *rpid) {
3259 if (pipe2(pipe_fds, O_CLOEXEC) < 0)
3260 return log_error_errno(errno, "Failed to allocate pipe: %m");
3264 return log_error_errno(errno, "Failed to fork getent child: %m");
3265 else if (pid == 0) {
/* Child side. A NULL-only envp array gives getent an empty environment. */
3267 char *empty_env = NULL;
/* Make the pipe's write end the child's stdout. */
3269 if (dup3(pipe_fds[1], STDOUT_FILENO, 0) < 0)
3270 _exit(EXIT_FAILURE);
/* Close the original pipe descriptors unless they are numbered 0-2. */
3272 if (pipe_fds[0] > 2)
3273 safe_close(pipe_fds[0]);
3274 if (pipe_fds[1] > 2)
3275 safe_close(pipe_fds[1]);
/* stdin and stderr both get /dev/null. */
3277 nullfd = open("/dev/null", O_RDWR);
3279 _exit(EXIT_FAILURE);
3281 if (dup3(nullfd, STDIN_FILENO, 0) < 0)
3282 _exit(EXIT_FAILURE);
3284 if (dup3(nullfd, STDERR_FILENO, 0) < 0)
3285 _exit(EXIT_FAILURE);
3290 reset_all_signal_handlers();
3291 close_all_fds(NULL, 0);
/* Two install locations are tried in turn; _exit() is reached only when
 * neither exec call replaced the process. */
3293 execle("/usr/bin/getent", "getent", database, key, NULL, &empty_env);
3294 execle("/bin/getent", "getent", database, key, NULL, &empty_env);
3295 _exit(EXIT_FAILURE);
/* Parent side: drop the write end so only the child holds it. */
3298 pipe_fds[1] = safe_close(pipe_fds[1]);
/* Switches the calling process to the user named by arg_user.
 * For root (arg_user unset, "root" or "0") the supplementary groups are
 * cleared and all UIDs and GIDs are reset to 0. Otherwise the passwd entry
 * and the group list are obtained by running getent through spawn_getent(),
 * the home directory is created, the three standard descriptors are
 * chown()ed to the user, and finally setgroups(), setresgid() and
 * setresuid() are applied in that order.
 * NOTE(review): many lines of this function are not visible in this
 * excerpt, including most error returns, the splitting of the passwd line
 * into fields and how the `_home` out-parameter is filled in. */
3305 static int change_uid_gid(char **_home) {
3306 char line[LINE_MAX], *x, *u, *g, *h;
3307 const char *word, *state;
3308 _cleanup_free_ uid_t *uids = NULL;
3309 _cleanup_free_ char *home = NULL;
3310 _cleanup_fclose_ FILE *f = NULL;
3311 _cleanup_close_ int fd = -1;
3312 unsigned n_uids = 0;
3321 if (!arg_user || streq(arg_user, "root") || streq(arg_user, "0")) {
3322 /* Reset everything fully to 0, just in case */
3324 if (setgroups(0, NULL) < 0)
3325 return log_error_errno(errno, "setgroups() failed: %m");
/* NOTE(review): the calls below are setresgid() and setresuid(), but the
 * messages name setregid() and setreuid(). */
3327 if (setresgid(0, 0, 0) < 0)
3328 return log_error_errno(errno, "setregid() failed: %m");
3330 if (setresuid(0, 0, 0) < 0)
3331 return log_error_errno(errno, "setreuid() failed: %m");
3337 /* First, get user credentials */
3338 fd = spawn_getent("passwd", arg_user, &pid);
3342 f = fdopen(fd, "r");
/* Only the first line of getent's output is read. */
3347 if (!fgets(line, sizeof(line), f)) {
3350 log_error("Failed to resolve user %s.", arg_user);
3354 log_error_errno(errno, "Failed to read from getent: %m");
3360 wait_for_terminate_and_warn("getent passwd", pid, true);
/* The passwd line is walked from one ':' separator to the next; each
 * missing separator produces the matching "invalid ... field" error. */
3362 x = strchr(line, ':');
3364 log_error("/etc/passwd entry has invalid user field.");
3368 u = strchr(x+1, ':');
3370 log_error("/etc/passwd entry has invalid password field.");
3377 log_error("/etc/passwd entry has invalid UID field.");
3385 log_error("/etc/passwd entry has invalid GID field.");
3390 h = strchr(x+1, ':');
3392 log_error("/etc/passwd entry has invalid GECOS field.");
3399 log_error("/etc/passwd entry has invalid home directory field.");
3405 r = parse_uid(u, &uid);
3407 log_error("Failed to parse UID of user.");
3411 r = parse_gid(g, &gid);
3413 log_error("Failed to parse GID of user.");
3421 /* Second, get group memberships */
3422 fd = spawn_getent("initgroups", arg_user, &pid);
3427 f = fdopen(fd, "r");
3432 if (!fgets(line, sizeof(line), f)) {
3434 log_error("Failed to resolve user %s.", arg_user);
3438 log_error_errno(errno, "Failed to read from getent: %m");
3444 wait_for_terminate_and_warn("getent initgroups", pid, true);
3446 /* Skip over the username and subsequent separator whitespace */
3448 x += strcspn(x, WHITESPACE);
3449 x += strspn(x, WHITESPACE);
/* Each remaining whitespace-separated word is parsed as a numeric ID.
 * NOTE(review): the group IDs are parsed with parse_uid() into a uid_t
 * array that is later handed to setgroups(). */
3451 FOREACH_WORD(word, l, x, state) {
3457 if (!GREEDY_REALLOC(uids, sz, n_uids+1))
3460 r = parse_uid(c, &uids[n_uids++]);
3462 log_error("Failed to parse group data from getent.");
/* Create the home directory and its parents; an already existing home
 * directory (-EEXIST) is not treated as an error. */
3467 r = mkdir_parents(home, 0775);
3469 return log_error_errno(r, "Failed to make home root directory: %m");
3471 r = mkdir_safe(home, 0755, uid, gid);
3472 if (r < 0 && r != -EEXIST)
3473 return log_error_errno(r, "Failed to make home directory: %m");
/* The return values of these fchown() calls are ignored. */
3475 fchown(STDIN_FILENO, uid, gid);
3476 fchown(STDOUT_FILENO, uid, gid);
3477 fchown(STDERR_FILENO, uid, gid);
/* Order used here: supplementary groups, then GID, then UID. */
3479 if (setgroups(n_uids, uids) < 0)
3480 return log_error_errno(errno, "Failed to set auxiliary groups: %m");
/* NOTE(review): same message mismatch as in the root branch above; the
 * calls are setresgid() and setresuid(). */
3482 if (setresgid(gid, gid, gid) < 0)
3483 return log_error_errno(errno, "setregid() failed: %m");
3485 if (setresuid(uid, uid, uid) < 0)
3486 return log_error_errno(errno, "setreuid() failed: %m");
3498 * < 0 : wait_for_terminate() failed to get the state of the
3499 * container, the container was terminated by a signal, or
3500 * failed for an unknown reason. No change is made to the
3501 * container argument.
3502 * > 0 : The program executed in the container terminated with an
3503 * error. The exit code of the program executed in the
3504 * container is returned. The container argument has been set
3505 * to CONTAINER_TERMINATED.
3506 * 0 : The container is being rebooted, has been shut down or exited
3507 * successfully. The container argument has been set to either
3508 * CONTAINER_TERMINATED or CONTAINER_REBOOTED.
3510 * That is, success is indicated by a return value of zero, and an
3511 * error is indicated by a non-zero value.
3513 static int wait_for_container(pid_t pid, ContainerStatus *container) {
/* Waits for `pid` with wait_for_terminate() and dispatches on the si_code
 * of the result. NOTE(review): the case labels and several return
 * statements are not visible in this excerpt. */
3517 r = wait_for_terminate(pid, &status);
3519 return log_warning_errno(r, "Failed to wait for container: %m");
3521 switch (status.si_code) {
/* Exit status path (presumably the CLD_EXITED case): the message is logged
 * at debug level when arg_quiet is set, otherwise at info level, and the
 * exit status itself is returned. */
3524 if (status.si_status == 0) {
3525 log_full(arg_quiet ? LOG_DEBUG : LOG_INFO, "Container %s exited successfully.", arg_machine);
3528 log_full(arg_quiet ? LOG_DEBUG : LOG_INFO, "Container %s failed with error code %i.", arg_machine, status.si_status);
3530 *container = CONTAINER_TERMINATED;
3531 return status.si_status;
/* Signal path: SIGINT is reported as a shutdown and SIGHUP as a reboot
 * request; other signals continue to the error message below. */
3534 if (status.si_status == SIGINT) {
3536 log_full(arg_quiet ? LOG_DEBUG : LOG_INFO, "Container %s has been shut down.", arg_machine);
3537 *container = CONTAINER_TERMINATED;
3540 } else if (status.si_status == SIGHUP) {
3542 log_full(arg_quiet ? LOG_DEBUG : LOG_INFO, "Container %s is being rebooted.", arg_machine);
3543 *container = CONTAINER_REBOOTED;
3547 /* CLD_KILLED fallthrough */
3550 log_error("Container %s terminated by signal %s.", arg_machine, signal_to_string(status.si_status));
/* Message for any si_code not handled above. */
3554 log_error("Container %s failed due to unknown reason.", arg_machine);
/* Signal handler that intentionally does nothing. main() installs it for
 * SIGCHLD through sigaction(). */
3561 static void nop_handler(int sig) {}
/* sd-event signal callback; main() registers it for SIGINT and SIGTERM
 * with the container PID packed into `userdata`. It sends arg_kill_signal
 * to that PID; when that succeeds it logs a hint and clears the source's
 * userdata. The visible code also calls sd_event_exit() with code 0.
 * NOTE(review): the conditions and return statements around these calls
 * are not visible in this excerpt. Presumably the cleared userdata makes a
 * second signal skip the kill() and exit the event loop, as the log
 * message suggests; confirm against the full source. */
3563 static int on_orderly_shutdown(sd_event_source *s, const struct signalfd_siginfo *si, void *userdata) {
3566 pid = PTR_TO_UINT32(userdata);
3568 if (kill(pid, arg_kill_signal) >= 0) {
3569 log_info("Trying to halt container. Send SIGTERM again to trigger immediate termination.");
3570 sd_event_source_set_userdata(s, NULL);
3575 sd_event_exit(sd_event_source_get_event(s), 0);
/* Fills in whichever of arg_image, arg_directory and arg_machine were not
 * given. When neither an image nor a directory is set, the image is looked
 * up by machine name with image_find(), or the current working directory
 * is used. A missing machine name is derived from the host name (when the
 * directory is "/") or from the basename of the image or directory, then
 * cleaned up and validated. Ephemeral machines get a random 64-bit hex
 * suffix appended.
 * NOTE(review): several conditions, else branches and return statements
 * are not visible in this excerpt. */
3579 static int determine_names(void) {
3582 if (!arg_image && !arg_directory) {
3584 _cleanup_(image_unrefp) Image *i = NULL;
3586 r = image_find(arg_machine, &i);
3588 return log_error_errno(r, "Failed to find image for machine '%s': %m", arg_machine);
/* NOTE(review): this message uses %m in a plain log_error() call, so it
 * looks like it expands from whatever errno currently holds rather than
 * from `r`; confirm and consider dropping the %m. */
3590 log_error("No image for machine '%s': %m", arg_machine);
/* Raw images are used as disk images; every other type as a directory. */
3594 if (i->type == IMAGE_RAW)
3595 r = set_sanitized_path(&arg_image, i->path);
3597 r = set_sanitized_path(&arg_directory, i->path);
3599 return log_error_errno(r, "Invalid image directory: %m");
/* A read-only image forces read-only mode. */
3601 arg_read_only = arg_read_only || i->read_only;
3603 arg_directory = get_current_dir_name();
3605 if (!arg_directory && !arg_machine) {
3606 log_error("Failed to determine path, please use -D or -i.");
3612 if (arg_directory && path_equal(arg_directory, "/"))
3613 arg_machine = gethostname_malloc();
3615 arg_machine = strdup(basename(arg_image ?: arg_directory));
3620 hostname_cleanup(arg_machine, false);
3621 if (!machine_name_is_valid(arg_machine)) {
3622 log_error("Failed to determine machine name automatically, please use -M.");
3626 if (arg_ephemeral) {
3629 /* Add a random suffix when this is an
3630 * ephemeral machine, so that we can run many
3631 * instances at once without manually having
3632 * to specify -M each time. */
3634 if (asprintf(&b, "%s-%016" PRIx64, arg_machine, random_u64()) < 0)
/* Determines the UID/GID base used for user namespacing. When
 * arg_uid_shift is still UID_INVALID, the base is taken from the owner of
 * arg_directory: the upper 16 bits of its UID, which must equal the upper
 * 16 bits of its GID, with the range fixed at 0x10000. The base plus range
 * is then checked against the maximum uid_t value, and the result is
 * logged.
 * NOTE(review): the return statements and any earlier guard are not
 * visible in this excerpt. */
3645 static int determine_uid_shift(void) {
3651 if (arg_uid_shift == UID_INVALID) {
3654 r = stat(arg_directory, &st);
3656 return log_error_errno(errno, "Failed to determine UID base of %s: %m", arg_directory);
/* Keep only the upper 16 bits of the owner UID as the base. */
3658 arg_uid_shift = st.st_uid & UINT32_C(0xffff0000);
3660 if (arg_uid_shift != (st.st_gid & UINT32_C(0xffff0000))) {
3661 log_error("UID and GID base of %s don't match.", arg_directory);
3665 arg_uid_range = UINT32_C(0x10000);
/* Reject a base for which base + range would exceed the largest uid_t. */
3668 if (arg_uid_shift > (uid_t) -1 - arg_uid_range) {
3669 log_error("UID base too high for UID range.");
3673 log_info("Using user namespaces with base " UID_FMT " and range " UID_FMT ".", arg_uid_shift, arg_uid_range);
3677 int main(int argc, char *argv[]) {
3679 _cleanup_free_ char *device_path = NULL, *root_device = NULL, *home_device = NULL, *srv_device = NULL, *console = NULL;
3680 bool root_device_rw = true, home_device_rw = true, srv_device_rw = true;
3681 _cleanup_close_ int master = -1, image_fd = -1;
3682 _cleanup_fdset_free_ FDSet *fds = NULL;
3683 int r, n_fd_passed, loop_nr = -1;
3684 char veth_name[IFNAMSIZ];
3685 bool secondary = false, remove_subvol = false;
3686 sigset_t mask, mask_chld;
3688 int ret = EXIT_SUCCESS;
3689 union in_addr_union exposed = {};
3690 _cleanup_release_lock_file_ LockFile tree_global_lock = LOCK_FILE_INIT, tree_local_lock = LOCK_FILE_INIT;
3693 log_parse_environment();
3696 r = parse_argv(argc, argv);
3700 r = determine_names();
3704 if (geteuid() != 0) {
3705 log_error("Need to be root.");
3710 if (sd_booted() <= 0) {
3711 log_error("Not running on a systemd system.");
3717 n_fd_passed = sd_listen_fds(false);
3718 if (n_fd_passed > 0) {
3719 r = fdset_new_listen_fds(&fds, false);
3721 log_error_errno(r, "Failed to collect file descriptors: %m");
3725 fdset_close_others(fds);
3728 if (arg_directory) {
3731 if (path_equal(arg_directory, "/") && !arg_ephemeral) {
3732 log_error("Spawning container on root directory is not supported. Consider using --ephemeral.");
3737 if (arg_ephemeral) {
3740 /* If the specified path is a mount point we
3741 * generate the new snapshot immediately
3742 * inside it under a random name. However, if
3743 * the specified path is not a mount point we
3744 * create the new snapshot in the parent
3745 * directory, just next to it. */
3746 r = path_is_mount_point(arg_directory, false);
3748 log_error_errno(r, "Failed to determine whether directory %s is mount point: %m", arg_directory);
3752 r = tempfn_random_child(arg_directory, &np);
3754 r = tempfn_random(arg_directory, &np);
3756 log_error_errno(r, "Failed to generate name for snapshot: %m");
3760 r = image_path_lock(np, (arg_read_only ? LOCK_SH : LOCK_EX) | LOCK_NB, &tree_global_lock, &tree_local_lock);
3762 log_error_errno(r, "Failed to lock %s: %m", np);
3766 r = btrfs_subvol_snapshot(arg_directory, np, arg_read_only, true);
3769 log_error_errno(r, "Failed to create snapshot %s from %s: %m", np, arg_directory);
3773 free(arg_directory);
3776 remove_subvol = true;
3779 r = image_path_lock(arg_directory, (arg_read_only ? LOCK_SH : LOCK_EX) | LOCK_NB, &tree_global_lock, &tree_local_lock);
3781 log_error_errno(r, "Directory tree %s is currently busy.", arg_directory);
3785 log_error_errno(r, "Failed to lock %s: %m", arg_directory);
3790 r = btrfs_subvol_snapshot(arg_template, arg_directory, arg_read_only, true);
3793 log_info("Directory %s already exists, not populating from template %s.", arg_directory, arg_template);
3795 log_error_errno(r, "Couldn't create snapshot %s from %s: %m", arg_directory, arg_template);
3799 log_info("Populated %s from template %s.", arg_directory, arg_template);
3805 if (path_is_os_tree(arg_directory) <= 0) {
3806 log_error("Directory %s doesn't look like an OS root directory (os-release file is missing). Refusing.", arg_directory);
3813 p = strjoina(arg_directory,
3814 argc > optind && path_is_absolute(argv[optind]) ? argv[optind] : "/usr/bin/");
3815 if (access(p, F_OK) < 0) {
3816 log_error("Directory %s lacks the binary to execute or doesn't look like a binary tree. Refusing.", arg_directory);
3823 char template[] = "/tmp/nspawn-root-XXXXXX";
3826 assert(!arg_template);
3828 r = image_path_lock(arg_image, (arg_read_only ? LOCK_SH : LOCK_EX) | LOCK_NB, &tree_global_lock, &tree_local_lock);
3830 r = log_error_errno(r, "Disk image %s is currently busy.", arg_image);
3834 r = log_error_errno(r, "Failed to create image lock: %m");
3838 if (!mkdtemp(template)) {
3839 log_error_errno(errno, "Failed to create temporary directory: %m");
3844 arg_directory = strdup(template);
3845 if (!arg_directory) {
3850 image_fd = setup_image(&device_path, &loop_nr);
3856 r = dissect_image(image_fd,
3857 &root_device, &root_device_rw,
3858 &home_device, &home_device_rw,
3859 &srv_device, &srv_device_rw,
3865 r = determine_uid_shift();
3869 interactive = isatty(STDIN_FILENO) > 0 && isatty(STDOUT_FILENO) > 0;
3871 master = posix_openpt(O_RDWR|O_NOCTTY|O_CLOEXEC|O_NDELAY);
3873 r = log_error_errno(errno, "Failed to acquire pseudo tty: %m");
3877 r = ptsname_malloc(master, &console);
3879 r = log_error_errno(r, "Failed to determine tty name: %m");
3883 if (unlockpt(master) < 0) {
3884 r = log_error_errno(errno, "Failed to unlock tty: %m");
3889 log_info("Spawning container %s on %s.\nPress ^] three times within 1s to kill container.",
3890 arg_machine, arg_image ?: arg_directory);
3892 assert_se(sigemptyset(&mask) == 0);
3893 sigset_add_many(&mask, SIGCHLD, SIGWINCH, SIGTERM, SIGINT, -1);
3894 assert_se(sigprocmask(SIG_BLOCK, &mask, NULL) == 0);
3896 assert_se(sigemptyset(&mask_chld) == 0);
3897 assert_se(sigaddset(&mask_chld, SIGCHLD) == 0);
3900 _cleanup_close_pair_ int kmsg_socket_pair[2] = { -1, -1 }, rtnl_socket_pair[2] = { -1, -1 };
3901 ContainerStatus container_status;
3902 _cleanup_(barrier_destroy) Barrier barrier = BARRIER_NULL;
3903 struct sigaction sa = {
3904 .sa_handler = nop_handler,
3905 .sa_flags = SA_NOCLDSTOP,
3908 r = barrier_create(&barrier);
3910 log_error_errno(r, "Cannot initialize IPC barrier: %m");
3914 if (socketpair(AF_UNIX, SOCK_DGRAM|SOCK_CLOEXEC, 0, kmsg_socket_pair) < 0) {
3915 r = log_error_errno(errno, "Failed to create kmsg socket pair: %m");
3919 if (socketpair(AF_UNIX, SOCK_DGRAM|SOCK_CLOEXEC, 0, rtnl_socket_pair) < 0) {
3920 r = log_error_errno(errno, "Failed to create rtnl socket pair: %m");
3924 /* Child can be killed before execv(), so handle SIGCHLD
3925 * in order to interrupt parent's blocking calls and
3926 * give it a chance to call wait() and terminate. */
3927 r = sigprocmask(SIG_UNBLOCK, &mask_chld, NULL);
3929 r = log_error_errno(errno, "Failed to change the signal mask: %m");
3933 r = sigaction(SIGCHLD, &sa, NULL);
3935 r = log_error_errno(errno, "Failed to install SIGCHLD handler: %m");
3939 pid = raw_clone(SIGCHLD|CLONE_NEWNS|
3940 (arg_share_system ? 0 : CLONE_NEWIPC|CLONE_NEWPID|CLONE_NEWUTS)|
3941 (arg_private_network ? CLONE_NEWNET : 0), NULL);
3943 if (errno == EINVAL)
3944 r = log_error_errno(errno, "clone() failed, do you have namespace support enabled in your kernel? (You need UTS, IPC, PID and NET namespacing built in): %m");
3946 r = log_error_errno(errno, "clone() failed: %m");
3953 _cleanup_free_ char *home = NULL;
3955 const char *envp[] = {
3956 "PATH=" DEFAULT_PATH_SPLIT_USR,
3957 "container=systemd-nspawn", /* LXC sets container=lxc, so follow the scheme here */
3962 NULL, /* container_uuid */
3963 NULL, /* LISTEN_FDS */
3964 NULL, /* LISTEN_PID */
3969 barrier_set_role(&barrier, BARRIER_CHILD);
3971 envp[n_env] = strv_find_prefix(environ, "TERM=");
3975 master = safe_close(master);
3977 kmsg_socket_pair[0] = safe_close(kmsg_socket_pair[0]);
3978 rtnl_socket_pair[0] = safe_close(rtnl_socket_pair[0]);
3980 reset_all_signal_handlers();
3981 reset_signal_mask();
3984 close_nointr(STDIN_FILENO);
3985 close_nointr(STDOUT_FILENO);
3986 close_nointr(STDERR_FILENO);
3988 r = open_terminal(console, O_RDWR);
3989 if (r != STDIN_FILENO) {
3995 log_error_errno(r, "Failed to open console: %m");
3996 _exit(EXIT_FAILURE);
3999 if (dup2(STDIN_FILENO, STDOUT_FILENO) != STDOUT_FILENO ||
4000 dup2(STDIN_FILENO, STDERR_FILENO) != STDERR_FILENO) {
4001 log_error_errno(errno, "Failed to duplicate console: %m");
4002 _exit(EXIT_FAILURE);
4007 log_error_errno(errno, "setsid() failed: %m");
4008 _exit(EXIT_FAILURE);
4011 if (reset_audit_loginuid() < 0)
4012 _exit(EXIT_FAILURE);
4014 if (prctl(PR_SET_PDEATHSIG, SIGKILL) < 0) {
4015 log_error_errno(errno, "PR_SET_PDEATHSIG failed: %m");
4016 _exit(EXIT_FAILURE);
4019 if (arg_private_network)
4022 /* Mark everything as slave, so that we still
4023 * receive mounts from the real root, but don't
4024 * propagate mounts to the real root. */
4025 if (mount(NULL, "/", NULL, MS_SLAVE|MS_REC, NULL) < 0) {
4026 log_error_errno(errno, "MS_SLAVE|MS_REC failed: %m");
4027 _exit(EXIT_FAILURE);
4030 if (mount_devices(arg_directory,
4031 root_device, root_device_rw,
4032 home_device, home_device_rw,
4033 srv_device, srv_device_rw) < 0)
4034 _exit(EXIT_FAILURE);
4036 /* Turn directory into bind mount */
4037 if (mount(arg_directory, arg_directory, "bind", MS_BIND|MS_REC, NULL) < 0) {
4038 log_error_errno(errno, "Failed to make bind mount: %m");
4039 _exit(EXIT_FAILURE);
4042 r = setup_volatile(arg_directory);
4044 _exit(EXIT_FAILURE);
4046 if (setup_volatile_state(arg_directory) < 0)
4047 _exit(EXIT_FAILURE);
4049 r = base_filesystem_create(arg_directory);
4051 _exit(EXIT_FAILURE);
4053 if (arg_read_only) {
4054 r = bind_remount_recursive(arg_directory, true);
4056 log_error_errno(r, "Failed to make tree read-only: %m");
4057 _exit(EXIT_FAILURE);
4061 if (mount_all(arg_directory) < 0)
4062 _exit(EXIT_FAILURE);
4064 if (copy_devnodes(arg_directory) < 0)
4065 _exit(EXIT_FAILURE);
4067 if (setup_ptmx(arg_directory) < 0)
4068 _exit(EXIT_FAILURE);
4070 dev_setup(arg_directory);
4072 if (setup_propagate(arg_directory) < 0)
4073 _exit(EXIT_FAILURE);
4075 if (setup_seccomp() < 0)
4076 _exit(EXIT_FAILURE);
4078 if (setup_dev_console(arg_directory, console) < 0)
4079 _exit(EXIT_FAILURE);
4081 if (setup_kmsg(arg_directory, kmsg_socket_pair[1]) < 0)
4082 _exit(EXIT_FAILURE);
4083 kmsg_socket_pair[1] = safe_close(kmsg_socket_pair[1]);
4085 if (send_rtnl(rtnl_socket_pair[1]) < 0)
4086 _exit(EXIT_FAILURE);
4087 rtnl_socket_pair[1] = safe_close(rtnl_socket_pair[1]);
4089 /* Tell the parent that we are ready, and that
4090 * it can cgroupify us so that we lack access
4091 * to certain devices and resources. */
4092 (void) barrier_place(&barrier); /* #1 */
4094 if (setup_boot_id(arg_directory) < 0)
4095 _exit(EXIT_FAILURE);
4097 if (setup_timezone(arg_directory) < 0)
4098 _exit(EXIT_FAILURE);
4100 if (setup_resolv_conf(arg_directory) < 0)
4101 _exit(EXIT_FAILURE);
4103 if (setup_journal(arg_directory) < 0)
4104 _exit(EXIT_FAILURE);
4106 if (mount_binds(arg_directory, arg_bind, false) < 0)
4107 _exit(EXIT_FAILURE);
4109 if (mount_binds(arg_directory, arg_bind_ro, true) < 0)
4110 _exit(EXIT_FAILURE);
4112 if (mount_tmpfs(arg_directory) < 0)
4113 _exit(EXIT_FAILURE);
4115 /* Wait until we are cgroup-ified, so that we
4116 * can mount the right cgroup path writable */
4117 (void) barrier_place_and_sync(&barrier); /* #2 */
4119 if (mount_cgroup(arg_directory) < 0)
4120 _exit(EXIT_FAILURE);
4122 if (chdir(arg_directory) < 0) {
4123 log_error_errno(errno, "chdir(%s) failed: %m", arg_directory);
4124 _exit(EXIT_FAILURE);
4127 if (mount(arg_directory, "/", NULL, MS_MOVE, NULL) < 0) {
4128 log_error_errno(errno, "mount(MS_MOVE) failed: %m");
4129 _exit(EXIT_FAILURE);
4132 if (chroot(".") < 0) {
4133 log_error_errno(errno, "chroot() failed: %m");
4134 _exit(EXIT_FAILURE);
4137 if (chdir("/") < 0) {
4138 log_error_errno(errno, "chdir() failed: %m");
4139 _exit(EXIT_FAILURE);
4143 if (unshare(CLONE_NEWUSER) < 0) {
4144 log_error_errno(errno, "unshare(CLONE_NEWUSER) failed: %m");
4145 _exit(EXIT_FAILURE);
4148 /* Tell the parent, that it now can
4149 * write the UID map. */
4150 (void) barrier_place(&barrier); /* #3 */
4152 /* Wait until the parent wrote the UID
4154 (void) barrier_place_and_sync(&barrier); /* #4 */
4159 if (drop_capabilities() < 0) {
4160 log_error_errno(errno, "drop_capabilities() failed: %m");
4161 _exit(EXIT_FAILURE);
4166 if (arg_personality != 0xffffffffLU) {
4167 if (personality(arg_personality) < 0) {
4168 log_error_errno(errno, "personality() failed: %m");
4169 _exit(EXIT_FAILURE);
4171 } else if (secondary) {
4172 if (personality(PER_LINUX32) < 0) {
4173 log_error_errno(errno, "personality() failed: %m");
4174 _exit(EXIT_FAILURE);
4179 if (arg_selinux_context)
4180 if (setexeccon((security_context_t) arg_selinux_context) < 0) {
4181 log_error_errno(errno, "setexeccon(\"%s\") failed: %m", arg_selinux_context);
4182 _exit(EXIT_FAILURE);
4186 r = change_uid_gid(&home);
4188 _exit(EXIT_FAILURE);
4190 if ((asprintf((char**)(envp + n_env++), "HOME=%s", home ? home: "/root") < 0) ||
4191 (asprintf((char**)(envp + n_env++), "USER=%s", arg_user ? arg_user : "root") < 0) ||
4192 (asprintf((char**)(envp + n_env++), "LOGNAME=%s", arg_user ? arg_user : "root") < 0)) {
4194 _exit(EXIT_FAILURE);
4197 if (!sd_id128_equal(arg_uuid, SD_ID128_NULL)) {
4200 if (asprintf((char**)(envp + n_env++), "container_uuid=%s", id128_format_as_uuid(arg_uuid, as_uuid)) < 0) {
4202 _exit(EXIT_FAILURE);
4206 if (fdset_size(fds) > 0) {
4207 r = fdset_cloexec(fds, false);
4209 log_error_errno(r, "Failed to unset O_CLOEXEC for file descriptors.");
4210 _exit(EXIT_FAILURE);
4213 if ((asprintf((char **)(envp + n_env++), "LISTEN_FDS=%u", n_fd_passed) < 0) ||
4214 (asprintf((char **)(envp + n_env++), "LISTEN_PID=1") < 0)) {
4216 _exit(EXIT_FAILURE);
4220 if (!strv_isempty(arg_setenv)) {
4223 n = strv_env_merge(2, envp, arg_setenv);
4226 _exit(EXIT_FAILURE);
4231 env_use = (char**) envp;
4233 /* Let the parent know that we are ready and
4234 * wait until the parent is ready with the
4236 (void) barrier_place_and_sync(&barrier); /* #5 */
4242 /* Automatically search for the init system */
4244 l = 1 + argc - optind;
4245 a = newa(char*, l + 1);
4246 memcpy(a + 1, argv + optind, l * sizeof(char*));
4248 a[0] = (char*) "/usr/lib/systemd/systemd";
4249 execve(a[0], a, env_use);
4251 a[0] = (char*) "/lib/systemd/systemd";
4252 execve(a[0], a, env_use);
4254 a[0] = (char*) "/sbin/init";
4255 execve(a[0], a, env_use);
4256 } else if (argc > optind)
4257 execvpe(argv[optind], argv + optind, env_use);
4259 chdir(home ? home : "/root");
4260 execle("/bin/bash", "-bash", NULL, env_use);
4261 execle("/bin/sh", "-sh", NULL, env_use);
4264 log_error_errno(errno, "execv() failed: %m");
4265 _exit(EXIT_FAILURE);
4268 barrier_set_role(&barrier, BARRIER_PARENT);
4272 kmsg_socket_pair[1] = safe_close(kmsg_socket_pair[1]);
4273 rtnl_socket_pair[1] = safe_close(rtnl_socket_pair[1]);
4275 (void) barrier_place(&barrier); /* #1 */
4277 /* Wait for the most basic Child-setup to be done,
4278 * before we add hardware to it, and place it in a
4280 if (barrier_sync(&barrier)) { /* #1 */
4283 r = move_network_interfaces(pid);
4287 r = setup_veth(pid, veth_name, &ifi);
4291 r = setup_bridge(veth_name, &ifi);
4295 r = setup_macvlan(pid);
4299 r = setup_ipvlan(pid);
4303 r = register_machine(pid, ifi);
4307 /* Notify the child that the parent is ready with all
4308 * its setup, and that the child can now hand over
4309 * control to the code to run inside the container. */
4310 (void) barrier_place(&barrier); /* #2 */
4313 char uid_map[strlen("/proc//uid_map") + DECIMAL_STR_MAX(uid_t) + 1], line[DECIMAL_STR_MAX(uid_t)*3+3+1];
4315 (void) barrier_place_and_sync(&barrier); /* #3 */
4317 xsprintf(uid_map, "/proc/" PID_FMT "/uid_map", pid);
4318 xsprintf(line, UID_FMT " " UID_FMT " " UID_FMT "\n", 0, arg_uid_shift, arg_uid_range);
4319 r = write_string_file(uid_map, line);
4321 log_error_errno(r, "Failed to write UID map: %m");
4325 /* We always assign the same UID and GID ranges */
4326 xsprintf(uid_map, "/proc/" PID_FMT "/gid_map", pid);
4327 r = write_string_file(uid_map, line);
4329 log_error_errno(r, "Failed to write GID map: %m");
4333 (void) barrier_place(&barrier); /* #4 */
4336 /* Block SIGCHLD here, before notifying child.
4337 * process_pty() will handle it with the other signals. */
4338 r = sigprocmask(SIG_BLOCK, &mask_chld, NULL);
4342 /* Reset signal to default */
4343 r = default_signals(SIGCHLD, -1);
4347 /* Let the child know that we are ready and wait until the child is completely ready as well. */
4348 if (barrier_place_and_sync(&barrier)) { /* #5 */
4349 _cleanup_event_unref_ sd_event *event = NULL;
4350 _cleanup_(pty_forward_freep) PTYForward *forward = NULL;
4351 _cleanup_rtnl_unref_ sd_rtnl *rtnl = NULL;
4356 "STATUS=Container running.\n"
4357 "X_NSPAWN_LEADER_PID=" PID_FMT, pid);
4359 r = sd_event_new(&event);
4361 log_error_errno(r, "Failed to get default event source: %m");
4365 if (arg_kill_signal > 0) {
4366 /* Try to kill the init system on SIGINT or SIGTERM */
4367 sd_event_add_signal(event, NULL, SIGINT, on_orderly_shutdown, UINT32_TO_PTR(pid));
4368 sd_event_add_signal(event, NULL, SIGTERM, on_orderly_shutdown, UINT32_TO_PTR(pid));
4370 /* Immediately exit */
4371 sd_event_add_signal(event, NULL, SIGINT, NULL, NULL);
4372 sd_event_add_signal(event, NULL, SIGTERM, NULL, NULL);
4375 /* simply exit on sigchld */
4376 sd_event_add_signal(event, NULL, SIGCHLD, NULL, NULL);
4378 if (arg_expose_ports) {
4379 r = watch_rtnl(event, rtnl_socket_pair[0], &exposed, &rtnl);
4383 (void) expose_ports(rtnl, &exposed);
4386 rtnl_socket_pair[0] = safe_close(rtnl_socket_pair[0]);
4388 r = pty_forward_new(event, master, true, !interactive, &forward);
4390 log_error_errno(r, "Failed to create PTY forwarder: %m");
4394 r = sd_event_loop(event);
4396 log_error_errno(r, "Failed to run event loop: %m");
4400 pty_forward_get_last_char(forward, &last_char);
4402 forward = pty_forward_free(forward);
4404 if (!arg_quiet && last_char != '\n')
4407 /* Kill if it is not dead yet anyway */
4408 terminate_machine(pid);
4412 /* Normally redundant, but better safe than sorry */
4415 r = wait_for_container(pid, &container_status);
4419 /* We failed to wait for the container, or the
4420 * container exited abnormally */
4422 else if (r > 0 || container_status == CONTAINER_TERMINATED){
4423 /* The container exited with a non-zero
4424 * status, or with zero status and no reboot
4430 /* CONTAINER_REBOOTED, loop again */
4432 if (arg_keep_unit) {
4433 /* Special handling if we are running as a
4434 * service: instead of simply restarting the
4435 * machine we want to restart the entire
4436 * service, so let's inform systemd about this
4437 * with the special exit code 133. The service
4438 * file uses RestartForceExitStatus=133 so
4439 * that this results in a full nspawn
4440 * restart. This is necessary since we might
4441 * have cgroup parameters set we want to have
4448 flush_ports(&exposed);
4454 "STATUS=Terminating...");
4456 loop_remove(loop_nr, &image_fd);
4461 if (remove_subvol && arg_directory) {
4464 k = btrfs_subvol_remove(arg_directory);
4466 log_warning_errno(k, "Cannot remove subvolume '%s', ignoring: %m", arg_directory);
4472 p = strjoina("/run/systemd/nspawn/propagate/", arg_machine);
4473 (void) rm_rf(p, false, true, false);
4476 free(arg_directory);
4481 strv_free(arg_setenv);
4482 strv_free(arg_network_interfaces);
4483 strv_free(arg_network_macvlan);
4484 strv_free(arg_network_ipvlan);
4485 strv_free(arg_bind);
4486 strv_free(arg_bind_ro);
4487 strv_free(arg_tmpfs);
4489 flush_ports(&exposed);
4491 while (arg_expose_ports) {
4492 ExposePort *p = arg_expose_ports;
4493 LIST_REMOVE(ports, arg_expose_ports, p);
4497 return r < 0 ? EXIT_FAILURE : ret;