1 /*-*- Mode: C; c-basic-offset: 8; indent-tabs-mode: nil -*-*/
4 This file is part of systemd.
6 Copyright 2010 Lennart Poettering
8 systemd is free software; you can redistribute it and/or modify it
9 under the terms of the GNU Lesser General Public License as published by
10 the Free Software Foundation; either version 2.1 of the License, or
11 (at your option) any later version.
13 systemd is distributed in the hope that it will be useful, but
14 WITHOUT ANY WARRANTY; without even the implied warranty of
15 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
16 Lesser General Public License for more details.
18 You should have received a copy of the GNU Lesser General Public License
19 along with systemd; If not, see <http://www.gnu.org/licenses/>.
25 #include <sys/types.h>
26 #include <sys/mount.h>
31 #include <sys/prctl.h>
35 #include <sys/socket.h>
36 #include <linux/netlink.h>
38 #include <linux/veth.h>
39 #include <sys/personality.h>
40 #include <linux/loop.h>
44 #include <selinux/selinux.h>
52 #include <blkid/blkid.h>
55 #include "sd-daemon.h"
64 #include "cgroup-util.h"
66 #include "path-util.h"
67 #include "loopback-setup.h"
68 #include "dev-setup.h"
73 #include "bus-error.h"
76 #include "rtnl-util.h"
77 #include "udev-util.h"
78 #include "blkid-util.h"
80 #include "siphash24.h"
82 #include "base-filesystem.h"
84 #include "event-util.h"
85 #include "capability.h"
87 #include "btrfs-util.h"
88 #include "machine-image.h"
90 #include "in-addr-util.h"
92 #include "local-addresses.h"
95 #include "seccomp-util.h"
/* Helper types for the command-line state below. Only the opening lines of
 * each declaration are present in this listing. */
/* One --port= mapping. parse_argv() fills in protocol, host_port and
 * container_port and links entries into arg_expose_ports through the "ports"
 * list fields. */
98 typedef struct ExposePort {
101 uint16_t container_port;
102 LIST_FIELDS(struct ExposePort, ports);
105 typedef enum ContainerStatus {
106 CONTAINER_TERMINATED,
/* --link-journal= mode; parse_argv() assigns LINK_AUTO, LINK_NO, LINK_GUEST
 * and LINK_HOST. */
110 typedef enum LinkJournal {
/* --volatile= mode; parse_argv() assigns VOLATILE_NO, VOLATILE_YES and
 * VOLATILE_STATE. */
117 typedef enum Volatile {
/* Command-line state. Each arg_* variable holds the value of one option and
 * is written by parse_argv(); the initializers below are the defaults used
 * when the option is absent. */
123 static char *arg_directory = NULL;
124 static char *arg_template = NULL;
125 static char *arg_user = NULL;
126 static sd_id128_t arg_uuid = {};
127 static char *arg_machine = NULL;
/* These three are assigned straight from optarg in parse_argv() (no copy),
 * hence the const pointers. */
128 static const char *arg_selinux_context = NULL;
129 static const char *arg_selinux_apifs_context = NULL;
130 static const char *arg_slice = NULL;
131 static bool arg_private_network = false;
132 static bool arg_read_only = false;
133 static bool arg_boot = false;
134 static bool arg_ephemeral = false;
135 static LinkJournal arg_link_journal = LINK_AUTO;
136 static bool arg_link_journal_try = false;
/* Default capability bitmask, one bit per CAP_* number. parse_argv() ORs in
 * the --capability= bits (and CAP_NET_ADMIN when private networking is on)
 * and then clears the --drop-capability= bits. */
137 static uint64_t arg_retain =
138 (1ULL << CAP_CHOWN) |
139 (1ULL << CAP_DAC_OVERRIDE) |
140 (1ULL << CAP_DAC_READ_SEARCH) |
141 (1ULL << CAP_FOWNER) |
142 (1ULL << CAP_FSETID) |
143 (1ULL << CAP_IPC_OWNER) |
145 (1ULL << CAP_LEASE) |
146 (1ULL << CAP_LINUX_IMMUTABLE) |
147 (1ULL << CAP_NET_BIND_SERVICE) |
148 (1ULL << CAP_NET_BROADCAST) |
149 (1ULL << CAP_NET_RAW) |
150 (1ULL << CAP_SETGID) |
151 (1ULL << CAP_SETFCAP) |
152 (1ULL << CAP_SETPCAP) |
153 (1ULL << CAP_SETUID) |
154 (1ULL << CAP_SYS_ADMIN) |
155 (1ULL << CAP_SYS_CHROOT) |
156 (1ULL << CAP_SYS_NICE) |
157 (1ULL << CAP_SYS_PTRACE) |
158 (1ULL << CAP_SYS_TTY_CONFIG) |
159 (1ULL << CAP_SYS_RESOURCE) |
160 (1ULL << CAP_SYS_BOOT) |
161 (1ULL << CAP_AUDIT_WRITE) |
162 (1ULL << CAP_AUDIT_CONTROL) |
/* arg_bind, arg_bind_ro and arg_tmpfs are string vectors filled pairwise by
 * parse_argv() and consumed pairwise (STRV_FOREACH_PAIR) by the mount
 * helpers. */
164 static char **arg_bind = NULL;
165 static char **arg_bind_ro = NULL;
166 static char **arg_tmpfs = NULL;
167 static char **arg_setenv = NULL;
168 static bool arg_quiet = false;
169 static bool arg_share_system = false;
/* Defaults to true; parse_argv() forces it to false when --share-system is
 * given. */
170 static bool arg_register = true;
171 static bool arg_keep_unit = false;
172 static char **arg_network_interfaces = NULL;
173 static char **arg_network_macvlan = NULL;
174 static char **arg_network_ipvlan = NULL;
175 static bool arg_network_veth = false;
176 static const char *arg_network_bridge = NULL;
/* 0xffffffffLU is also the value parse_argv() treats as "unknown or
 * unsupported" when it comes back from personality_from_string(). */
177 static unsigned long arg_personality = 0xffffffffLU;
178 static char *arg_image = NULL;
179 static Volatile arg_volatile = VOLATILE_NO;
/* List of --port= mappings; parse_argv() adds entries with LIST_PREPEND. */
180 static ExposePort *arg_expose_ports = NULL;
181 static char **arg_property = NULL;
/* UID_INVALID means no shift was supplied; helpers in this file only chown or
 * add uid=/gid= mount options when arg_userns is set and the shift is not
 * UID_INVALID. */
182 static uid_t arg_uid_shift = UID_INVALID, arg_uid_range = 0x10000U;
183 static bool arg_userns = false;
/* parse_argv() replaces a value <= 0 with SIGRTMIN+3 when --boot is used. */
184 static int arg_kill_signal = 0;
/* Print the usage summary with printf(). The single %s in the text is
 * replaced by program_invocation_short_name; everything else is a fixed
 * description of the options handled in parse_argv(). */
186 static void help(void) {
187 printf("%s [OPTIONS...] [PATH] [ARGUMENTS...]\n\n"
188 "Spawn a minimal namespace container for debugging, testing and building.\n\n"
189 " -h --help Show this help\n"
190 " --version Print version string\n"
191 " -q --quiet Do not show status information\n"
192 " -D --directory=PATH Root directory for the container\n"
193 " --template=PATH Initialize root directory from template directory,\n"
195 " -x --ephemeral Run container with snapshot of root directory, and\n"
196 " remove it after exit\n"
197 " -i --image=PATH File system device or disk image for the container\n"
198 " -b --boot Boot up full system (i.e. invoke init)\n"
199 " -u --user=USER Run the command under specified user or uid\n"
200 " -M --machine=NAME Set the machine name for the container\n"
201 " --uuid=UUID Set a specific machine UUID for the container\n"
202 " -S --slice=SLICE Place the container in the specified slice\n"
203 " --property=NAME=VALUE Set scope unit property\n"
204 " --private-network Disable network in container\n"
205 " --network-interface=INTERFACE\n"
206 " Assign an existing network interface to the\n"
208 " --network-macvlan=INTERFACE\n"
209 " Create a macvlan network interface based on an\n"
210 " existing network interface to the container\n"
211 " --network-ipvlan=INTERFACE\n"
212 " Create a ipvlan network interface based on an\n"
213 " existing network interface to the container\n"
214 " -n --network-veth Add a virtual ethernet connection between host\n"
216 " --network-bridge=INTERFACE\n"
217 " Add a virtual ethernet connection between host\n"
218 " and container and add it to an existing bridge on\n"
220 " --private-users[=UIDBASE[:NUIDS]]\n"
221 " Run within user namespace\n"
222 " -p --port=[PROTOCOL:]HOSTPORT[:CONTAINERPORT]\n"
223 " Expose a container IP port on the host\n"
224 " -Z --selinux-context=SECLABEL\n"
225 " Set the SELinux security context to be used by\n"
226 " processes in the container\n"
227 " -L --selinux-apifs-context=SECLABEL\n"
228 " Set the SELinux security context to be used by\n"
229 " API/tmpfs file systems in the container\n"
230 " --capability=CAP In addition to the default, retain specified\n"
232 " --drop-capability=CAP Drop the specified capability from the default set\n"
233 " --kill-signal=SIGNAL Select signal to use for shutting down PID 1\n"
234 " --link-journal=MODE Link up guest journal, one of no, auto, guest, host,\n"
235 " try-guest, try-host\n"
236 " -j Equivalent to --link-journal=try-guest\n"
237 " --read-only Mount the root directory read-only\n"
238 " --bind=PATH[:PATH] Bind mount a file or directory from the host into\n"
240 " --bind-ro=PATH[:PATH] Similar, but creates a read-only bind mount\n"
241 " --tmpfs=PATH:[OPTIONS] Mount an empty tmpfs to the specified directory\n"
242 " --setenv=NAME=VALUE Pass an environment variable to PID 1\n"
243 " --share-system Share system namespaces with host\n"
244 " --register=BOOLEAN Register container as machine\n"
245 " --keep-unit Do not register a scope for the machine, reuse\n"
246 " the service unit nspawn is running in\n"
247 " --volatile[=MODE] Run the system in volatile mode\n"
248 , program_invocation_short_name);
/* Store a cleaned-up copy of path in *b.
 *
 * The visible code calls canonicalize_file_name() and
 * path_make_absolute_cwd() on path (presumably the second is a fallback when
 * the first fails -- the branch itself is not shown here) and assigns the
 * path_kill_slashes() result to *b.
 *
 * Returns an int that parse_argv() passes to log_error_errno() on failure. */
251 static int set_sanitized_path(char **b, const char *path) {
257 p = canonicalize_file_name(path);
262 p = path_make_absolute_cwd(path);
268 *b = path_kill_slashes(p);
/* Parse the command line into the arg_* globals.
 *
 * Options are read with getopt_long() using the short-option string and the
 * long-option table below; each case stores its value in the matching arg_*
 * variable. After the loop, conflicting option combinations are rejected and
 * the final capability mask and the default kill signal are computed.
 *
 * Returns an int. The error paths visible here log a message and, where a
 * return is shown, return the result of log_error_errno(); the remaining
 * return statements are not visible in this listing. */
272 static int parse_argv(int argc, char *argv[]) {
289 ARG_NETWORK_INTERFACE,
/* Long options. Entries whose last field is a character share their case
 * label with the corresponding short option; ARG_* values have no short
 * form. */
301 static const struct option options[] = {
302 { "help", no_argument, NULL, 'h' },
303 { "version", no_argument, NULL, ARG_VERSION },
304 { "directory", required_argument, NULL, 'D' },
305 { "template", required_argument, NULL, ARG_TEMPLATE },
306 { "ephemeral", no_argument, NULL, 'x' },
307 { "user", required_argument, NULL, 'u' },
308 { "private-network", no_argument, NULL, ARG_PRIVATE_NETWORK },
309 { "boot", no_argument, NULL, 'b' },
310 { "uuid", required_argument, NULL, ARG_UUID },
311 { "read-only", no_argument, NULL, ARG_READ_ONLY },
312 { "capability", required_argument, NULL, ARG_CAPABILITY },
313 { "drop-capability", required_argument, NULL, ARG_DROP_CAPABILITY },
314 { "link-journal", required_argument, NULL, ARG_LINK_JOURNAL },
315 { "bind", required_argument, NULL, ARG_BIND },
316 { "bind-ro", required_argument, NULL, ARG_BIND_RO },
317 { "tmpfs", required_argument, NULL, ARG_TMPFS },
318 { "machine", required_argument, NULL, 'M' },
319 { "slice", required_argument, NULL, 'S' },
320 { "setenv", required_argument, NULL, ARG_SETENV },
321 { "selinux-context", required_argument, NULL, 'Z' },
322 { "selinux-apifs-context", required_argument, NULL, 'L' },
323 { "quiet", no_argument, NULL, 'q' },
324 { "share-system", no_argument, NULL, ARG_SHARE_SYSTEM },
325 { "register", required_argument, NULL, ARG_REGISTER },
326 { "keep-unit", no_argument, NULL, ARG_KEEP_UNIT },
327 { "network-interface", required_argument, NULL, ARG_NETWORK_INTERFACE },
328 { "network-macvlan", required_argument, NULL, ARG_NETWORK_MACVLAN },
329 { "network-ipvlan", required_argument, NULL, ARG_NETWORK_IPVLAN },
330 { "network-veth", no_argument, NULL, 'n' },
331 { "network-bridge", required_argument, NULL, ARG_NETWORK_BRIDGE },
332 { "personality", required_argument, NULL, ARG_PERSONALITY },
333 { "image", required_argument, NULL, 'i' },
334 { "volatile", optional_argument, NULL, ARG_VOLATILE },
335 { "port", required_argument, NULL, 'p' },
336 { "property", required_argument, NULL, ARG_PROPERTY },
337 { "private-users", optional_argument, NULL, ARG_PRIVATE_USERS },
338 { "kill-signal", required_argument, NULL, ARG_KILL_SIGNAL },
/* Bits collected from --capability= (plus) and --drop-capability= (minus);
 * they are folded into arg_retain after all options have been read. */
343 uint64_t plus = 0, minus = 0;
/* Per the getopt documentation, a leading '+' in the option string stops
 * option processing at the first non-option argument, which matches the
 * "[OPTIONS...] [PATH] [ARGUMENTS...]" usage printed by help(). */
348 while ((c = getopt_long(argc, argv, "+hD:u:bL:M:jS:Z:qi:xp:n", options, NULL)) >= 0)
357 puts(PACKAGE_STRING);
358 puts(SYSTEMD_FEATURES);
/* The three path options are all normalised through set_sanitized_path(). */
362 r = set_sanitized_path(&arg_directory, optarg);
364 return log_error_errno(r, "Invalid root directory: %m");
369 r = set_sanitized_path(&arg_template, optarg);
371 return log_error_errno(r, "Invalid template directory: %m");
376 r = set_sanitized_path(&arg_image, optarg);
378 return log_error_errno(r, "Invalid image path: %m");
383 arg_ephemeral = true;
388 arg_user = strdup(optarg);
394 case ARG_NETWORK_BRIDGE:
395 arg_network_bridge = optarg;
/* The network options below switch on private networking as well. */
400 arg_network_veth = true;
401 arg_private_network = true;
404 case ARG_NETWORK_INTERFACE:
405 if (strv_extend(&arg_network_interfaces, optarg) < 0)
408 arg_private_network = true;
411 case ARG_NETWORK_MACVLAN:
412 if (strv_extend(&arg_network_macvlan, optarg) < 0)
415 arg_private_network = true;
418 case ARG_NETWORK_IPVLAN:
419 if (strv_extend(&arg_network_ipvlan, optarg) < 0)
424 case ARG_PRIVATE_NETWORK:
425 arg_private_network = true;
433 r = sd_id128_from_string(optarg, &arg_uuid);
435 log_error("Invalid UUID: %s", optarg);
/* Machine name: an empty argument is handled separately from an invalid one;
 * a valid name replaces any previous value via free_and_strdup(). */
445 if (isempty(optarg)) {
449 if (!machine_name_is_valid(optarg)) {
450 log_error("Invalid machine name: %s", optarg);
454 r = free_and_strdup(&arg_machine, optarg);
462 arg_selinux_context = optarg;
466 arg_selinux_apifs_context = optarg;
470 arg_read_only = true;
/* --capability= and --drop-capability= share this code: the argument is a
 * comma-separated list, "all" selects every bit, and any other word is
 * resolved with capability_from_name(). Which mask is updated depends on
 * the option (c). */
474 case ARG_DROP_CAPABILITY: {
475 const char *state, *word;
478 FOREACH_WORD_SEPARATOR(word, length, optarg, ",", state) {
479 _cleanup_free_ char *t;
481 t = strndup(word, length);
485 if (streq(t, "all")) {
486 if (c == ARG_CAPABILITY)
487 plus = (uint64_t) -1;
489 minus = (uint64_t) -1;
493 cap = capability_from_name(t);
495 log_error("Failed to parse capability %s.", t);
499 if (c == ARG_CAPABILITY)
500 plus |= 1ULL << (uint64_t) cap;
502 minus |= 1ULL << (uint64_t) cap;
/* Same settings as --link-journal=try-guest below. */
510 arg_link_journal = LINK_GUEST;
511 arg_link_journal_try = true;
/* Each mode sets both the link target and the "try" flag, so a later option
 * fully overrides an earlier one. */
514 case ARG_LINK_JOURNAL:
515 if (streq(optarg, "auto")) {
516 arg_link_journal = LINK_AUTO;
517 arg_link_journal_try = false;
518 } else if (streq(optarg, "no")) {
519 arg_link_journal = LINK_NO;
520 arg_link_journal_try = false;
521 } else if (streq(optarg, "guest")) {
522 arg_link_journal = LINK_GUEST;
523 arg_link_journal_try = false;
524 } else if (streq(optarg, "host")) {
525 arg_link_journal = LINK_HOST;
526 arg_link_journal_try = false;
527 } else if (streq(optarg, "try-guest")) {
528 arg_link_journal = LINK_GUEST;
529 arg_link_journal_try = true;
530 } else if (streq(optarg, "try-host")) {
531 arg_link_journal = LINK_HOST;
532 arg_link_journal_try = true;
534 log_error("Failed to parse link journal mode %s", optarg);
/* --bind= / --bind-ro=: the argument is split at the first ':' into a and b;
 * both must be absolute paths and are appended, in that order, to the
 * selected vector. */
542 _cleanup_free_ char *a = NULL, *b = NULL;
546 x = c == ARG_BIND ? &arg_bind : &arg_bind_ro;
548 e = strchr(optarg, ':');
550 a = strndup(optarg, e - optarg);
560 if (!path_is_absolute(a) || !path_is_absolute(b)) {
561 log_error("Invalid bind mount specification: %s", optarg);
565 r = strv_extend(x, a);
569 r = strv_extend(x, b);
/* --tmpfs=: a is the absolute mount path and b the mount options, with
 * "mode=0755" as the option string set on the line shown below. Both are
 * pushed onto arg_tmpfs as a pair. */
577 _cleanup_free_ char *a = NULL, *b = NULL;
580 e = strchr(optarg, ':');
582 a = strndup(optarg, e - optarg);
586 b = strdup("mode=0755");
592 if (!path_is_absolute(a)) {
593 log_error("Invalid tmpfs specification: %s", optarg);
597 r = strv_push(&arg_tmpfs, a);
603 r = strv_push(&arg_tmpfs, b);
/* --setenv=: validated, then merged into a new vector by strv_env_set();
 * the old arg_setenv vector is freed. */
615 if (!env_assignment_is_valid(optarg)) {
616 log_error("Environment variable assignment '%s' is not valid.", optarg);
620 n = strv_env_set(arg_setenv, optarg);
624 strv_free(arg_setenv);
633 case ARG_SHARE_SYSTEM:
634 arg_share_system = true;
638 r = parse_boolean(optarg);
640 log_error("Failed to parse --register= argument: %s", optarg);
648 arg_keep_unit = true;
651 case ARG_PERSONALITY:
653 arg_personality = personality_from_string(optarg);
654 if (arg_personality == 0xffffffffLU) {
655 log_error("Unknown or unsupported personality '%s'.", optarg);
/* --volatile takes an optional argument: a boolean or the word "state". */
664 arg_volatile = VOLATILE_YES;
666 r = parse_boolean(optarg);
668 if (streq(optarg, "state"))
669 arg_volatile = VOLATILE_STATE;
671 log_error("Failed to parse --volatile= argument: %s", optarg);
675 arg_volatile = r ? VOLATILE_YES : VOLATILE_NO;
/* --port=: an optional "tcp:" or "udp:" prefix selects the protocol (TCP is
 * also set on the prefix-less path). If the remainder contains ':', the
 * part before it is the host port and the part after it the container
 * port; otherwise one number is used for both. Zero is rejected for
 * either port. */
681 const char *split, *e;
682 uint16_t container_port, host_port;
686 if ((e = startswith(optarg, "tcp:")))
687 protocol = IPPROTO_TCP;
688 else if ((e = startswith(optarg, "udp:")))
689 protocol = IPPROTO_UDP;
692 protocol = IPPROTO_TCP;
695 split = strchr(e, ':');
697 char v[split - e + 1];
699 memcpy(v, e, split - e);
702 r = safe_atou16(v, &host_port);
703 if (r < 0 || host_port <= 0) {
704 log_error("Failed to parse host port: %s", optarg);
708 r = safe_atou16(split + 1, &container_port);
710 r = safe_atou16(e, &container_port);
711 host_port = container_port;
/* NOTE(review): this check is on container_port, yet the message below
 * says "host port" -- looks like a copy-and-paste slip; confirm. */
714 if (r < 0 || container_port <= 0) {
715 log_error("Failed to parse host port: %s", optarg);
/* A host port may be exposed only once per protocol. */
719 LIST_FOREACH(ports, p, arg_expose_ports) {
720 if (p->protocol == protocol && p->host_port == host_port) {
721 log_error("Duplicate port specification: %s", optarg);
726 p = new(ExposePort, 1);
730 p->protocol = protocol;
731 p->host_port = host_port;
732 p->container_port = container_port;
734 LIST_PREPEND(ports, arg_expose_ports, p);
740 if (strv_extend(&arg_property, optarg) < 0)
/* --private-users[=UIDBASE[:NUIDS]]: the part before ':' is the UID shift
 * and the part after it the range length, which must be non-zero. */
745 case ARG_PRIVATE_USERS:
747 _cleanup_free_ char *buffer = NULL;
748 const char *range, *shift;
750 range = strchr(optarg, ':');
752 buffer = strndup(optarg, range - optarg);
758 if (safe_atou32(range, &arg_uid_range) < 0 || arg_uid_range <= 0) {
759 log_error("Failed to parse UID range: %s", range);
765 if (parse_uid(shift, &arg_uid_shift) < 0) {
766 log_error("Failed to parse UID: %s", optarg);
774 case ARG_KILL_SIGNAL:
775 arg_kill_signal = signal_from_string_try_harder(optarg);
776 if (arg_kill_signal < 0) {
777 log_error("Cannot parse signal: %s", optarg);
787 assert_not_reached("Unhandled option");
/* Post-processing: derive implied settings, then reject combinations that
 * cannot work together. */
790 if (arg_share_system)
791 arg_register = false;
793 if (arg_boot && arg_share_system) {
794 log_error("--boot and --share-system may not be combined.");
798 if (arg_keep_unit && cg_pid_get_owner_uid(0, NULL) >= 0) {
799 log_error("--keep-unit may not be used when invoked from a user session.");
803 if (arg_directory && arg_image) {
804 log_error("--directory= and --image= may not be combined.");
808 if (arg_template && arg_image) {
809 log_error("--template= and --image= may not be combined.");
813 if (arg_template && !(arg_directory || arg_machine)) {
814 log_error("--template= needs --directory= or --machine=.");
818 if (arg_ephemeral && arg_template) {
819 log_error("--ephemeral and --template= may not be combined.");
823 if (arg_ephemeral && arg_image) {
824 log_error("--ephemeral and --image= may not be combined.");
828 if (arg_ephemeral && !IN_SET(arg_link_journal, LINK_NO, LINK_AUTO)) {
829 log_error("--ephemeral and --link-journal= may not be combined.");
833 if (arg_volatile != VOLATILE_NO && arg_read_only) {
834 log_error("Cannot combine --read-only with --volatile. Note that --volatile already implies a read-only base hierarchy.");
838 if (arg_expose_ports && !arg_private_network) {
839 log_error("Cannot use --port= without private networking.");
/* Final capability set: defaults plus requested additions, plus
 * CAP_NET_ADMIN under private networking, minus everything dropped. The
 * drop mask is applied last, so it wins over the additions. */
843 arg_retain = (arg_retain | plus | (arg_private_network ? 1ULL << CAP_NET_ADMIN : 0)) & ~minus;
/* With --boot and no usable --kill-signal=, fall back to SIGRTMIN+3. */
845 if (arg_boot && arg_kill_signal <= 0)
846 arg_kill_signal = SIGRTMIN+3;
/* Mount the file systems listed in mount_table below the directory dest.
 *
 * Each table row gives the mount source, the target path relative to dest,
 * the file system type, the option string, the mount flags and a boolean.
 * That boolean is read as mount_table[k].fatal: when it is set, mkdir or
 * mount failures are logged as errors, otherwise as warnings. */
851 static int mount_all(const char *dest) {
853 typedef struct MountPoint {
862 static const MountPoint mount_table[] = {
863 { "proc", "/proc", "proc", NULL, MS_NOSUID|MS_NOEXEC|MS_NODEV, true },
864 { "/proc/sys", "/proc/sys", NULL, NULL, MS_BIND, true }, /* Bind mount first */
865 { NULL, "/proc/sys", NULL, NULL, MS_BIND|MS_RDONLY|MS_REMOUNT, true }, /* Then, make it r/o */
866 { "sysfs", "/sys", "sysfs", NULL, MS_RDONLY|MS_NOSUID|MS_NOEXEC|MS_NODEV, true },
867 { "tmpfs", "/dev", "tmpfs", "mode=755", MS_NOSUID|MS_STRICTATIME, true },
868 { "devpts", "/dev/pts", "devpts","newinstance,ptmxmode=0666,mode=620,gid=" STRINGIFY(TTY_GID), MS_NOSUID|MS_NOEXEC, true },
869 { "tmpfs", "/dev/shm", "tmpfs", "mode=1777", MS_NOSUID|MS_NODEV|MS_STRICTATIME, true },
870 { "tmpfs", "/run", "tmpfs", "mode=755", MS_NOSUID|MS_NODEV|MS_STRICTATIME, true },
871 { "tmpfs", "/tmp", "tmpfs", "mode=1777", MS_STRICTATIME, true },
873 { "/sys/fs/selinux", "/sys/fs/selinux", NULL, NULL, MS_BIND, false }, /* Bind mount first */
874 { NULL, "/sys/fs/selinux", NULL, NULL, MS_BIND|MS_RDONLY|MS_REMOUNT, false }, /* Then, make it r/o */
881 for (k = 0; k < ELEMENTSOF(mount_table); k++) {
882 _cleanup_free_ char *where = NULL, *options = NULL;
/* Absolute target path of this entry inside the container tree. */
886 where = strjoin(dest, "/", mount_table[k].where, NULL);
890 t = path_is_mount_point(where, true);
892 log_error_errno(t, "Failed to detect whether %s is a mount point: %m", where);
/* Entries with a source are skipped when the target is already a mount
 * point; rows whose source is NULL (the remount rows) are not. */
901 if (mount_table[k].what && t > 0)
904 t = mkdir_p(where, 0755);
906 if (mount_table[k].fatal) {
907 log_error_errno(t, "Failed to create directory %s: %m", where);
912 log_warning_errno(t, "Failed to create directory %s: %m", where);
/* For tmpfs and devpts sources, append the SELinux context requested with
 * --selinux-apifs-context= to the table's option string. */
918 if (arg_selinux_apifs_context &&
919 (streq_ptr(mount_table[k].what, "tmpfs") || streq_ptr(mount_table[k].what, "devpts"))) {
920 options = strjoin(mount_table[k].options, ",context=\"", arg_selinux_apifs_context, "\"", NULL);
927 o = mount_table[k].options;
/* Under user namespaces with an explicit shift, tmpfs mounts additionally
 * get uid=/gid= options set to the shifted root UID. */
929 if (arg_userns && arg_uid_shift != UID_INVALID && streq_ptr(mount_table[k].type, "tmpfs")) {
930 char *uid_options = NULL;
/* NOTE(review): the asprintf() return values are not checked on the two
 * lines below; a NULL check on uid_options may exist on lines not shown
 * here -- confirm. */
933 asprintf(&uid_options, "%s,uid=" UID_FMT ",gid=" UID_FMT, o, arg_uid_shift, arg_uid_shift);
935 asprintf(&uid_options, "uid=" UID_FMT ",gid=" UID_FMT, arg_uid_shift, arg_uid_shift);
/* Stored in options as well, which is declared _cleanup_free_ above. */
940 o = options = uid_options;
943 if (mount(mount_table[k].what,
946 mount_table[k].flags,
949 if (mount_table[k].fatal) {
950 log_error_errno(errno, "mount(%s) failed: %m", where);
955 log_warning_errno(errno, "mount(%s) failed: %m", where);
/* Bind mount every (source, target) pair in the string vector l below dest.
 *
 * For each pair the source *x is stat()ed and the target path is built as
 * dest + *y. If the target exists, mounting a directory on a non-directory
 * or the reverse is rejected. If the target does not exist (ENOENT), its
 * parent directories are created and then the mount point itself. The source
 * is then mounted with MS_BIND and bind_remount_recursive(where, true) is
 * called (presumably only when ro is set -- the condition is not visible in
 * this listing).
 *
 * NOTE(review): after "r = mkdir_label(where, 0755)" the code tests
 * "errno != EEXIST" but reports r as the error, whereas mount_tmpfs() in this
 * file tests "r != -EEXIST" for the same helper. Confirm which convention
 * mkdir_label() follows.
 *
 * NOTE(review): the "Failed to bind mount" messages in the stat/mkdir branch
 * are emitted before any mount() call is made; the wording may mislead. */
962 static int mount_binds(const char *dest, char **l, bool ro) {
965 STRV_FOREACH_PAIR(x, y, l) {
966 _cleanup_free_ char *where = NULL;
967 struct stat source_st, dest_st;
970 if (stat(*x, &source_st) < 0)
971 return log_error_errno(errno, "Failed to stat %s: %m", *x);
973 where = strappend(dest, *y);
977 r = stat(where, &dest_st);
979 if (S_ISDIR(source_st.st_mode) && !S_ISDIR(dest_st.st_mode)) {
980 log_error("Cannot bind mount directory %s on file %s.", *x, where);
983 if (!S_ISDIR(source_st.st_mode) && S_ISDIR(dest_st.st_mode)) {
984 log_error("Cannot bind mount file %s on directory %s.", *x, where);
987 } else if (errno == ENOENT) {
988 r = mkdir_parents_label(where, 0755);
990 return log_error_errno(r, "Failed to bind mount %s: %m", *x);
992 log_error_errno(errno, "Failed to bind mount %s: %m", *x);
996 /* Create the mount point. Any non-directory file can be
997 * mounted on any non-directory file (regular, fifo, socket,
1000 if (S_ISDIR(source_st.st_mode)) {
1001 r = mkdir_label(where, 0755);
1002 if (r < 0 && errno != EEXIST)
1003 return log_error_errno(r, "Failed to create mount point %s: %m", where);
1007 return log_error_errno(r, "Failed to create mount point %s: %m", where);
1010 if (mount(*x, where, "bind", MS_BIND, NULL) < 0)
1011 return log_error_errno(errno, "mount(%s) failed: %m", where);
1014 r = bind_remount_recursive(where, true);
1016 return log_error_errno(r, "Read-Only bind mount failed: %m");
/* Mount one cgroup hierarchy at <dest>/sys/fs/cgroup/<hierarchy>.
 *
 * controller is passed as the data argument of the "cgroup" mount. The
 * target is first checked with path_is_mount_point() (how a positive result
 * is handled is not visible in this listing). A second mount() call remounts
 * the target with MS_BIND|MS_REMOUNT|MS_RDONLY, presumably only when
 * read_only is set -- the condition is not visible here.
 *
 * Returns the result of log_error_errno() on the failure paths shown. */
1023 static int mount_cgroup_hierarchy(const char *dest, const char *controller, const char *hierarchy, bool read_only) {
1027 to = strjoina(dest, "/sys/fs/cgroup/", hierarchy);
1029 r = path_is_mount_point(to, false);
1031 return log_error_errno(r, "Failed to determine if %s is mounted already: %m", to);
1037 /* The superblock mount options of the mount point need to be
1038 * identical to the host's, and hence writable... */
1039 if (mount("cgroup", to, "cgroup", MS_NOSUID|MS_NOEXEC|MS_NODEV, controller) < 0)
1040 return log_error_errno(errno, "Failed to mount to %s: %m", to);
1042 /* ... hence let's only make the bind mount read-only, not the
1045 if (mount(NULL, to, NULL, MS_BIND|MS_REMOUNT|MS_NOSUID|MS_NOEXEC|MS_NODEV|MS_RDONLY, NULL) < 0)
1046 return log_error_errno(errno, "Failed to remount %s read-only: %m", to);
/* Build the /sys/fs/cgroup tree below dest.
 *
 * Steps visible here: collect the kernel's controllers into a set, look up
 * our own cgroup path, mount a tmpfs on <dest>/sys/fs/cgroup, mount every
 * controller hierarchy read-only (following the host's symlinks for combined
 * hierarchies), mount the "name=systemd" hierarchy writable, bind mount our
 * own cgroup inside it, and finally remount both the systemd hierarchy root
 * and the tmpfs read-only. */
1051 static int mount_cgroup(const char *dest) {
1052 _cleanup_set_free_free_ Set *controllers = NULL;
1053 _cleanup_free_ char *own_cgroup_path = NULL;
1054 const char *cgroup_root, *systemd_root, *systemd_own;
1057 controllers = set_new(&string_hash_ops);
1061 r = cg_kernel_controllers(controllers);
1063 return log_error_errno(r, "Failed to determine cgroup controllers: %m");
1065 r = cg_pid_get_path(NULL, 0, &own_cgroup_path);
1067 return log_error_errno(r, "Failed to determine our own cgroup path: %m");
1069 cgroup_root = strjoina(dest, "/sys/fs/cgroup");
1070 if (mount("tmpfs", cgroup_root, "tmpfs", MS_NOSUID|MS_NOEXEC|MS_NODEV|MS_STRICTATIME, "mode=755") < 0)
1071 return log_error_errno(errno, "Failed to mount tmpfs to /sys/fs/cgroup: %m");
/* Each controller is taken out of the set in turn; the _cleanup_free_
 * declaration frees it when it goes out of scope. */
1074 _cleanup_free_ char *controller = NULL, *origin = NULL, *combined = NULL;
1076 controller = set_steal_first(controllers);
/* Inspect the host's entry for this controller to see whether it is a
 * symlink to a combined hierarchy. */
1080 origin = strappend("/sys/fs/cgroup/", controller);
1084 r = readlink_malloc(origin, &combined);
1086 /* Not a symbolic link, but directly a single cgroup hierarchy */
1088 r = mount_cgroup_hierarchy(dest, controller, controller, true);
1093 return log_error_errno(r, "Failed to read link %s: %m", origin);
1095 _cleanup_free_ char *target = NULL;
1097 target = strjoin(dest, "/sys/fs/cgroup/", controller, NULL);
1101 /* A symbolic link, a combination of controllers in one hierarchy */
1103 if (!filename_is_valid(combined)) {
1104 log_warning("Ignoring invalid combined hierarchy %s.", combined);
1108 r = mount_cgroup_hierarchy(dest, combined, combined, true);
/* Recreate the host's symlink inside the container tree. */
1112 if (symlink(combined, target) < 0)
1113 return log_error_errno(errno, "Failed to create symlink for combined hierarchy: %m");
/* The systemd hierarchy is mounted with read_only=false at this point. */
1117 r = mount_cgroup_hierarchy(dest, "name=systemd,xattr", "systemd", false);
1121 /* Make our own cgroup a (writable) bind mount */
1122 systemd_own = strjoina(dest, "/sys/fs/cgroup/systemd", own_cgroup_path);
/* NOTE(review): the message below prints own_cgroup_path, not the full
 * mount target systemd_own -- confirm this is intended. */
1123 if (mount(systemd_own, systemd_own, NULL, MS_BIND, NULL) < 0)
1124 return log_error_errno(errno, "Failed to turn %s into a bind mount: %m", own_cgroup_path);
1126 /* And then remount the systemd cgroup root read-only */
1127 systemd_root = strjoina(dest, "/sys/fs/cgroup/systemd");
1128 if (mount(NULL, systemd_root, NULL, MS_BIND|MS_REMOUNT|MS_NOSUID|MS_NOEXEC|MS_NODEV|MS_RDONLY, NULL) < 0)
1129 return log_error_errno(errno, "Failed to mount cgroup root read-only: %m");
/* Finally make the tmpfs that holds the hierarchy directories read-only. */
1131 if (mount(NULL, cgroup_root, NULL, MS_REMOUNT|MS_NOSUID|MS_NOEXEC|MS_NODEV|MS_STRICTATIME|MS_RDONLY, "mode=755") < 0)
1132 return log_error_errno(errno, "Failed to remount %s read-only: %m", cgroup_root);
/* Mount a tmpfs for every (path, options) pair in arg_tmpfs.
 *
 * The mount point is dest + path. It is created with mkdir_label(), and an
 * existing directory (-EEXIST) is not treated as an error. The second string
 * of the pair is passed as the tmpfs mount options. */
1137 static int mount_tmpfs(const char *dest) {
1140 STRV_FOREACH_PAIR(i, o, arg_tmpfs) {
1141 _cleanup_free_ char *where = NULL;
1144 where = strappend(dest, *i);
1148 r = mkdir_label(where, 0755);
1149 if (r < 0 && r != -EEXIST)
1150 return log_error_errno(r, "Creating mount point for tmpfs %s failed: %m", where);
1152 if (mount("tmpfs", where, "tmpfs", MS_NODEV|MS_STRICTATIME, *o) < 0)
1153 return log_error_errno(errno, "tmpfs mount to %s failed: %m", where);
/* Point the container's /etc/localtime at the same zone the host uses.
 *
 * The host's /etc/localtime symlink is read and must lead into
 * /usr/share/zoneinfo/ (relative "../usr/..." or absolute form); z is the
 * zone name that follows that prefix. If the container's link already names
 * the same zone nothing is changed. Otherwise the zone file must exist in
 * the container before a new relative symlink is created. The situations
 * reported with log_warning() below are described there as "not updating
 * container timezone". */
1159 static int setup_timezone(const char *dest) {
1160 _cleanup_free_ char *where = NULL, *p = NULL, *q = NULL, *check = NULL, *what = NULL;
1166 /* Fix the timezone, if possible */
1167 r = readlink_malloc("/etc/localtime", &p);
1169 log_warning("/etc/localtime is not a symlink, not updating container timezone.");
1173 z = path_startswith(p, "../usr/share/zoneinfo/");
1175 z = path_startswith(p, "/usr/share/zoneinfo/");
1177 log_warning("/etc/localtime does not point into /usr/share/zoneinfo/, not updating container timezone.");
1181 where = strappend(dest, "/etc/localtime");
/* Same prefix matching, this time for the container's current link. */
1185 r = readlink_malloc(where, &q);
1187 y = path_startswith(q, "../usr/share/zoneinfo/");
1189 y = path_startswith(q, "/usr/share/zoneinfo/");
1191 /* Already pointing to the right place? Then do nothing .. */
1192 if (y && streq(y, z))
/* Only link to a zone file that actually exists inside the container. */
1196 check = strjoin(dest, "/usr/share/zoneinfo/", z, NULL);
1200 if (access(check, F_OK) < 0) {
1201 log_warning("Timezone %s does not exist in container, not updating container timezone.", z);
1205 what = strappend("../usr/share/zoneinfo/", z);
1209 r = mkdir_parents(where, 0755);
1211 log_error_errno(r, "Failed to create directory for timezone info %s in container: %m", where);
/* The call that sets r here is not visible in this listing; judging by the
 * message it removes the old file, and ENOENT is tolerated. */
1217 if (r < 0 && errno != ENOENT) {
1218 log_error_errno(errno, "Failed to remove existing timezone info %s in container: %m", where);
1223 if (symlink(what, where) < 0) {
1224 log_error_errno(errno, "Failed to correct timezone of container: %m");
/* Copy the host's /etc/resolv.conf into the container tree at dest.
 *
 * The function tests arg_private_network before doing anything (presumably
 * returning early -- the statement is not visible in this listing). Failures
 * to create the parent directory or to copy the file are logged as
 * warnings. */
1231 static int setup_resolv_conf(const char *dest) {
1232 _cleanup_free_ char *where = NULL;
1237 if (arg_private_network)
1240 /* Fix resolv.conf, if possible */
1241 where = strappend(dest, "/etc/resolv.conf");
1245 /* We don't really care about the result of this. If it
1246 * fails, it fails, but meh... */
1247 r = mkdir_parents(where, 0755);
1249 log_warning_errno(r, "Failed to create parent directory for resolv.conf %s: %m", where);
/* O_TRUNC replaces existing content; O_NOFOLLOW is passed as well, and the
 * file mode is 0644. */
1254 r = copy_file("/etc/resolv.conf", where, O_TRUNC|O_NOFOLLOW, 0644, 0);
1256 log_warning_errno(r, "Failed to copy /etc/resolv.conf to %s: %m", where);
/* Implement --volatile=state for the tree at directory.
 *
 * Only acts when arg_volatile is VOLATILE_STATE: the whole tree is remounted
 * read-only and a fresh tmpfs is mounted on <directory>/var. */
1264 static int setup_volatile_state(const char *directory) {
1270 if (arg_volatile != VOLATILE_STATE)
1273 /* --volatile=state means we simply overmount /var
1274 with a tmpfs, and the rest read-only. */
1276 r = bind_remount_recursive(directory, true);
1278 return log_error_errno(r, "Failed to remount %s read-only: %m", directory);
1280 p = strjoina(directory, "/var");
/* NOTE(review): the path built above is p (directory + "/var"), but the
 * message below prints directory -- confirm which one is meant. */
1282 if (r < 0 && errno != EEXIST)
1283 return log_error_errno(errno, "Failed to create %s: %m", directory);
1285 if (mount("tmpfs", p, "tmpfs", MS_STRICTATIME, "mode=755") < 0)
1286 return log_error_errno(errno, "Failed to mount tmpfs to /var: %m");
/* Implement --volatile=yes for the tree at directory.
 *
 * Only acts when arg_volatile is VOLATILE_YES. A tmpfs is mounted on a fresh
 * temporary directory, the original <directory>/usr is bind mounted into it
 * and made read-only, and the result is then moved over directory with
 * MS_MOVE. tmpfs_mounted and bind_mounted record how far setup got; the
 * code that uses them is not visible in this listing. */
1291 static int setup_volatile(const char *directory) {
1292 bool tmpfs_mounted = false, bind_mounted = false;
1293 char template[] = "/tmp/nspawn-volatile-XXXXXX";
1299 if (arg_volatile != VOLATILE_YES)
1302 /* --volatile=yes means we mount a tmpfs to the root dir, and
1303 the original /usr to use inside it, and that read-only. */
1305 if (!mkdtemp(template))
1306 return log_error_errno(errno, "Failed to create temporary directory: %m");
1308 if (mount("tmpfs", template, "tmpfs", MS_STRICTATIME, "mode=755") < 0) {
1309 log_error_errno(errno, "Failed to mount tmpfs for root directory: %m");
1314 tmpfs_mounted = true;
/* f is the real /usr of the container tree, t its place in the new root. */
1316 f = strjoina(directory, "/usr");
1317 t = strjoina(template, "/usr");
1320 if (r < 0 && errno != EEXIST) {
1321 log_error_errno(errno, "Failed to create %s: %m", t);
1326 if (mount(f, t, "bind", MS_BIND|MS_REC, NULL) < 0) {
1327 log_error_errno(errno, "Failed to create /usr bind mount: %m");
1332 bind_mounted = true;
1334 r = bind_remount_recursive(t, true);
1336 log_error_errno(r, "Failed to remount %s read-only: %m", t);
/* Move the assembled tmpfs root over the original directory. */
1340 if (mount(template, directory, NULL, MS_MOVE, NULL) < 0) {
1341 log_error_errno(errno, "Failed to move root mount: %m");
/* Format a 128-bit ID in the 8-4-4-4-12 hexadecimal UUID layout.
 *
 * The format string below produces 32 hex digits plus 4 dashes, i.e. 36
 * characters, which matches the 37-byte buffer s once a terminating NUL is
 * counted. Returns char*; setup_boot_id() ignores the return value and reads
 * the buffer it passed in (the return statement is not visible here). */
1359 static char* id128_format_as_uuid(sd_id128_t id, char s[37]) {
1362 "%02x%02x%02x%02x-%02x%02x-%02x%02x-%02x%02x-%02x%02x%02x%02x%02x%02x",
1363 SD_ID128_FORMAT_VAL(id));
/* Give the container its own boot ID.
 *
 * The function tests arg_share_system before doing anything (presumably
 * returning early -- the statement is not visible in this listing). A random
 * ID is written in UUID form to a file under <dest>/dev, and that file is
 * bind mounted over <dest>/proc/sys/kernel/random/boot_id and then remounted
 * read-only. Failure of the read-only remount is logged as a warning. */
1368 static int setup_boot_id(const char *dest) {
1369 _cleanup_free_ char *from = NULL, *to = NULL;
1370 sd_id128_t rnd = {};
1376 if (arg_share_system)
1379 /* Generate a new randomized boot ID, so that each boot-up of
1380 * the container gets a new one */
1382 from = strappend(dest, "/dev/proc-sys-kernel-random-boot-id");
1383 to = strappend(dest, "/proc/sys/kernel/random/boot_id");
1387 r = sd_id128_randomize(&rnd);
1389 return log_error_errno(r, "Failed to generate random boot id: %m");
1391 id128_format_as_uuid(rnd, as_uuid);
1393 r = write_string_file(from, as_uuid);
1395 return log_error_errno(r, "Failed to write boot id: %m");
/* First the plain bind mount; if that works, remount it read-only. */
1397 if (mount(from, to, "bind", MS_BIND, NULL) < 0) {
1398 log_error_errno(errno, "Failed to bind mount boot id: %m");
1400 } else if (mount(from, to, "bind", MS_BIND|MS_REMOUNT|MS_RDONLY, NULL))
1401 log_warning_errno(errno, "Failed to make boot id read-only: %m");
/* Recreate a fixed list of host device nodes below <dest>/dev.
 *
 * For every name in the NUL-separated devnodes list, /dev/<name> is
 * stat()ed. A stat failure returns an error here unless errno is ENOENT. An
 * entry that is neither a character nor a block device is reported as an
 * error. Otherwise a node with the same mode and device number is created
 * with mknod(). With user namespaces and an explicit UID shift the new node
 * is chowned to the shifted UID/GID. */
1407 static int copy_devnodes(const char *dest) {
1409 static const char devnodes[] =
1420 _cleanup_umask_ mode_t u;
1426 NULSTR_FOREACH(d, devnodes) {
1427 _cleanup_free_ char *from = NULL, *to = NULL;
1430 from = strappend("/dev/", d);
1431 to = strjoin(dest, "/dev/", d, NULL);
1435 if (stat(from, &st) < 0) {
1437 if (errno != ENOENT)
1438 return log_error_errno(errno, "Failed to stat %s: %m", from);
1440 } else if (!S_ISCHR(st.st_mode) && !S_ISBLK(st.st_mode)) {
1442 log_error("%s is not a char or block device, cannot copy", from);
/* NOTE(review): mode 0775 here, while the other mkdir calls in this file
 * use 0755 -- confirm whether group-writable is intended. */
1446 r = mkdir_parents(to, 0775);
1448 log_error_errno(r, "Failed to create parent directory of %s: %m", to);
1452 if (mknod(to, st.st_mode, st.st_rdev) < 0)
1453 return log_error_errno(errno, "mknod(%s) failed: %m", to);
1455 if (arg_userns && arg_uid_shift != UID_INVALID)
1456 if (lchown(to, arg_uid_shift, arg_uid_shift) < 0)
1457 return log_error_errno(errno, "chown() of device node %s failed: %m", to);
/* Create <dest>/dev/ptmx as a relative symlink to "pts/ptmx". With user
 * namespaces and an explicit UID shift the symlink itself is chowned
 * (lchown) to the shifted UID/GID. */
1464 static int setup_ptmx(const char *dest) {
1465 _cleanup_free_ char *p = NULL;
1467 p = strappend(dest, "/dev/ptmx");
1471 if (symlink("pts/ptmx", p) < 0)
1472 return log_error_errno(errno, "Failed to create /dev/ptmx symlink: %m");
1474 if (arg_userns && arg_uid_shift != UID_INVALID)
1475 if (lchown(p, arg_uid_shift, arg_uid_shift) < 0)
1476 return log_error_errno(errno, "lchown() of symlink %s failed: %m", p);
/* Make the TTY at path console available as <dest>/dev/console.
 *
 * The TTY is set to mode 0600 with owner and group 0. A placeholder device
 * node is created at <dest>/dev/console, reusing /dev/null's file type and
 * device number with permission bits 0600, and the TTY is then bind mounted
 * on top of it. */
1481 static int setup_dev_console(const char *dest, const char *console) {
1482 _cleanup_umask_ mode_t u;
/* /dev/null supplies the type and device number for the placeholder node. */
1492 if (stat("/dev/null", &st) < 0)
1493 return log_error_errno(errno, "Failed to stat /dev/null: %m");
1495 r = chmod_and_chown(console, 0600, 0, 0);
1497 return log_error_errno(r, "Failed to correct access mode for TTY: %m");
1499 /* We need to bind mount the right tty to /dev/console since
1500 * ptys can only exist on pts file systems. To have something
1501 * to bind mount things on we create a device node first, and
1502 * use /dev/null for that since the cgroups device policy
1503 * allows us to create that freely, while we cannot create
1504 * /dev/console. (Note that the major minor doesn't actually
1505 * matter here, since we mount it over anyway). */
1507 to = strjoina(dest, "/dev/console");
1508 if (mknod(to, (st.st_mode & ~07777) | 0600, st.st_rdev) < 0)
1509 return log_error_errno(errno, "mknod() for /dev/console failed: %m");
1511 if (mount(console, to, "bind", MS_BIND, NULL) < 0)
1512 return log_error_errno(errno, "Bind mount for /dev/console failed: %m");
/* Provide /proc/kmsg inside the container as a FIFO. A FIFO is created at
 * <dest>/dev/kmsg, bind mounted onto <dest>/proc/kmsg, then opened; the
 * resulting fd is sent over kmsg_socket as SCM_RIGHTS ancillary data so that
 * it stays referenced outside of this function. */
1517 static int setup_kmsg(const char *dest, int kmsg_socket) {
1518 _cleanup_free_ char *from = NULL, *to = NULL;
1519 _cleanup_umask_ mode_t u;
1522 struct cmsghdr cmsghdr;
1523 uint8_t buf[CMSG_SPACE(sizeof(int))];
1525 struct msghdr mh = {
1526 .msg_control = &control,
1527 .msg_controllen = sizeof(control),
1529 struct cmsghdr *cmsg;
1532 assert(kmsg_socket >= 0);
1536 /* We create the kmsg FIFO as /dev/kmsg, but immediately
1537 * delete it after bind mounting it to /proc/kmsg. While FIFOs
1538 * on the reading side behave very similar to /proc/kmsg,
1539 * their writing side behaves differently from /dev/kmsg in
1540 * that writing blocks when nothing is reading. In order to
1541 * avoid any problems with containers deadlocking due to this
1542 * we simply make /dev/kmsg unavailable to the container. */
1543 if (asprintf(&from, "%s/dev/kmsg", dest) < 0 ||
1544 asprintf(&to, "%s/proc/kmsg", dest) < 0)
1547 if (mkfifo(from, 0600) < 0)
1548 return log_error_errno(errno, "mkfifo() for /dev/kmsg failed: %m");
1550 r = chmod_and_chown(from, 0600, 0, 0);
1552 return log_error_errno(r, "Failed to correct access mode for /dev/kmsg: %m");
1554 if (mount(from, to, "bind", MS_BIND, NULL) < 0)
1555 return log_error_errno(errno, "Bind mount for /proc/kmsg failed: %m");
/* O_NDELAY keeps the open() of the FIFO from blocking. */
1557 fd = open(from, O_RDWR|O_NDELAY|O_CLOEXEC);
1559 return log_error_errno(errno, "Failed to open fifo: %m");
/* Build a single SCM_RIGHTS control message carrying exactly one fd. */
1561 cmsg = CMSG_FIRSTHDR(&mh);
1562 cmsg->cmsg_level = SOL_SOCKET;
1563 cmsg->cmsg_type = SCM_RIGHTS;
1564 cmsg->cmsg_len = CMSG_LEN(sizeof(int));
1565 memcpy(CMSG_DATA(cmsg), &fd, sizeof(int));
1567 mh.msg_controllen = cmsg->cmsg_len;
1569 /* Store away the fd in the socket, so that it stays open as
1570 * long as we run the child */
1571 k = sendmsg(kmsg_socket, &mh, MSG_NOSIGNAL);
1575 return log_error_errno(errno, "Failed to send FIFO fd: %m");
1577 /* And now make the FIFO unavailable as /dev/kmsg... */
/* Allocate a non-blocking NETLINK_ROUTE socket and pass it over send_fd as
 * SCM_RIGHTS ancillary data. The function first tests arg_expose_ports;
 * presumably it is a no-op when no ports are to be exposed (the body of that
 * check is not visible here -- confirm). */
1582 static int send_rtnl(int send_fd) {
1584 struct cmsghdr cmsghdr;
1585 uint8_t buf[CMSG_SPACE(sizeof(int))];
1587 struct msghdr mh = {
1588 .msg_control = &control,
1589 .msg_controllen = sizeof(control),
1591 struct cmsghdr *cmsg;
1592 _cleanup_close_ int fd = -1;
1595 assert(send_fd >= 0);
1597 if (!arg_expose_ports)
1600 fd = socket(PF_NETLINK, SOCK_RAW|SOCK_CLOEXEC|SOCK_NONBLOCK, NETLINK_ROUTE);
1602 return log_error_errno(errno, "failed to allocate container netlink: %m");
/* Build a single SCM_RIGHTS control message carrying exactly one fd. */
1604 cmsg = CMSG_FIRSTHDR(&mh);
1605 cmsg->cmsg_level = SOL_SOCKET;
1606 cmsg->cmsg_type = SCM_RIGHTS;
1607 cmsg->cmsg_len = CMSG_LEN(sizeof(int));
1608 memcpy(CMSG_DATA(cmsg), &fd, sizeof(int));
1610 mh.msg_controllen = cmsg->cmsg_len;
1612 /* Store away the fd in the socket, so that it stays open as
1613 * long as we run the child */
1614 k = sendmsg(send_fd, &mh, MSG_NOSIGNAL);
1616 return log_error_errno(errno, "Failed to send netlink fd: %m");
/* Undo the port exposure tied to the address stored in *exposed. For every
 * entry of arg_expose_ports, fw_add_local_dnat() is called with false as its
 * first argument (presumably meaning "remove" -- confirm against its
 * definition); failures are only logged as warnings. Afterwards *exposed is
 * reset to IN_ADDR_NULL. Only AF_INET is handled. */
1621 static int flush_ports(union in_addr_union *exposed) {
1623 int r, af = AF_INET;
1627 if (!arg_expose_ports)
/* A null address means there is nothing currently exposed. */
1630 if (in_addr_is_null(af, exposed))
1633 log_debug("Lost IP address.");
1635 LIST_FOREACH(ports, p, arg_expose_ports) {
1636 r = fw_add_local_dnat(false,
1647 log_warning_errno(r, "Failed to modify firewall: %m");
1650 *exposed = IN_ADDR_NULL;
/* Re-evaluate which container address the exposed ports should point to.
 * Local addresses are enumerated through rtnl and only the first entry
 * (addresses[0]) is considered, subject to the family/scope test below. The
 * function may fall back to flush_ports(); otherwise, unless the address
 * equals *exposed, fw_add_local_dnat(true, ...) is called for every entry of
 * arg_expose_ports (passing the previous address when it is non-null) and
 * *exposed is updated to the new address. Firewall failures are only logged
 * as warnings. Only AF_INET is handled. */
1654 static int expose_ports(sd_rtnl *rtnl, union in_addr_union *exposed) {
1655 _cleanup_free_ struct local_address *addresses = NULL;
1656 _cleanup_free_ char *pretty = NULL;
1657 union in_addr_union new_exposed;
1660 int af = AF_INET, r;
1664 /* Invoked each time an address is added or removed inside the
1667 if (!arg_expose_ports)
1670 r = local_addresses(rtnl, 0, af, &addresses);
1672 return log_error_errno(r, "Failed to enumerate local addresses: %m");
1675 addresses[0].family == af &&
1676 addresses[0].scope < RT_SCOPE_LINK;
1679 return flush_ports(exposed);
1681 new_exposed = addresses[0].address;
1682 if (in_addr_equal(af, exposed, &new_exposed))
1685 in_addr_to_string(af, &new_exposed, &pretty);
1686 log_debug("New container IP is %s.", strna(pretty));
1688 LIST_FOREACH(ports, p, arg_expose_ports) {
1690 r = fw_add_local_dnat(true,
1699 in_addr_is_null(af, exposed) ? NULL : exposed);
1701 log_warning_errno(r, "Failed to modify firewall: %m");
1704 *exposed = new_exposed;
/* rtnl match callback (registered in watch_rtnl() for RTM_NEWADDR and
 * RTM_DELADDR). userdata is the union in_addr_union tracking the currently
 * exposed address; the work is delegated to expose_ports(), whose return
 * value is not used here. */
1708 static int on_address_change(sd_rtnl *rtnl, sd_rtnl_message *m, void *userdata) {
1709 union in_addr_union *exposed = userdata;
1715 expose_ports(rtnl, exposed);
/* Receive the netlink socket sent by send_rtnl() over recv_fd (as SCM_RIGHTS
 * ancillary data), wrap it in an sd_rtnl object subscribed to
 * RTNLGRP_IPV4_IFADDR, register on_address_change() for RTM_NEWADDR and
 * RTM_DELADDR with exposed as userdata, and attach the object to event.
 * Every failure is logged through log_error_errno() and its result returned. */
1719 static int watch_rtnl(sd_event *event, int recv_fd, union in_addr_union *exposed, sd_rtnl **ret) {
1721 struct cmsghdr cmsghdr;
1722 uint8_t buf[CMSG_SPACE(sizeof(int))];
1724 struct msghdr mh = {
1725 .msg_control = &control,
1726 .msg_controllen = sizeof(control),
1728 struct cmsghdr *cmsg;
1729 _cleanup_rtnl_unref_ sd_rtnl *rtnl = NULL;
1734 assert(recv_fd >= 0);
1737 if (!arg_expose_ports)
1740 k = recvmsg(recv_fd, &mh, MSG_NOSIGNAL);
1742 return log_error_errno(errno, "Failed to recv netlink fd: %m");
/* The peer is expected to have sent exactly one fd in one SCM_RIGHTS
 * control message; anything else trips the assertions. */
1744 cmsg = CMSG_FIRSTHDR(&mh);
1745 assert(cmsg->cmsg_level == SOL_SOCKET);
1746 assert(cmsg->cmsg_type == SCM_RIGHTS);
1747 assert(cmsg->cmsg_len == CMSG_LEN(sizeof(int)));
1748 memcpy(&fd, CMSG_DATA(cmsg), sizeof(int));
1750 r = sd_rtnl_open_fd(&rtnl, fd, 1, RTNLGRP_IPV4_IFADDR);
1753 return log_error_errno(r, "Failed to create rtnl object: %m");
1756 r = sd_rtnl_add_match(rtnl, RTM_NEWADDR, on_address_change, exposed);
1758 return log_error_errno(r, "Failed to subscribe to RTM_NEWADDR messages: %m");
1760 r = sd_rtnl_add_match(rtnl, RTM_DELADDR, on_address_change, exposed);
1762 return log_error_errno(r, "Failed to subscribe to RTM_DELADDR messages: %m");
1764 r = sd_rtnl_attach_event(rtnl, event, 0);
1766 return log_error_errno(r, "Failed to add to event loop: %m");
/* Set the hostname to the machine name (arg_machine) using
 * sethostname_idempotent(). The function first tests arg_share_system;
 * presumably nothing is changed when the system is shared with the host
 * (the body of that check is not visible here -- confirm). */
1774 static int setup_hostname(void) {
1776 if (arg_share_system)
1779 if (sethostname_idempotent(arg_machine) < 0)
/* Link the container's journal directory with the host's, according to
 * arg_link_journal (LINK_NO, LINK_AUTO, LINK_GUEST, LINK_HOST) and
 * arg_link_journal_try.
 *
 * The machine ID is read from <directory>/etc/machine-id and validated; if it
 * equals the host's ID linking is refused. The two paths involved are
 * p = /var/log/journal/<id> on the host and q = the same path below
 * directory. Depending on the mode, p becomes a symlink to q (LINK_GUEST) or
 * p is bind mounted onto q (see the final mount() call).
 *
 * mkdir_p() and readlink_and_make_absolute() report failures through their
 * negative return value, so those errors are logged from r and not from
 * errno; errno is only used directly after raw libc calls. */
1785 static int setup_journal(const char *directory) {
1786 sd_id128_t machine_id, this_id;
1787 _cleanup_free_ char *p = NULL, *b = NULL, *q = NULL, *d = NULL;
1791 /* Don't link journals in ephemeral mode */
1795 p = strappend(directory, "/etc/machine-id");
1799 r = read_one_line_file(p, &b);
1800 if (r == -ENOENT && arg_link_journal == LINK_AUTO)
1803 return log_error_errno(r, "Failed to read machine ID from %s: %m", p);
1806 if (isempty(id) && arg_link_journal == LINK_AUTO)
1809 /* Verify validity */
1810 r = sd_id128_from_string(id, &machine_id);
1812 return log_error_errno(r, "Failed to parse machine ID from %s: %m", p);
1814 r = sd_id128_get_machine(&this_id);
1816 return log_error_errno(r, "Failed to retrieve machine ID: %m");
1818 if (sd_id128_equal(machine_id, this_id)) {
1819 log_full(arg_link_journal == LINK_AUTO ? LOG_WARNING : LOG_ERR,
1820 "Host and machine ids are equal (%s): refusing to link journals", id);
1821 if (arg_link_journal == LINK_AUTO)
1826 if (arg_link_journal == LINK_NO)
/* From here on p is the host-side journal path, q the container-side one. */
1830 p = strappend("/var/log/journal/", id);
1831 q = strjoin(directory, "/var/log/journal/", id, NULL);
1835 if (path_is_mount_point(p, false) > 0) {
1836 if (arg_link_journal != LINK_AUTO) {
1837 log_error("%s: already a mount point, refusing to use for journal", p);
1844 if (path_is_mount_point(q, false) > 0) {
1845 if (arg_link_journal != LINK_AUTO) {
1846 log_error("%s: already a mount point, refusing to use for journal", q);
1853 r = readlink_and_make_absolute(p, &d);
1855 if ((arg_link_journal == LINK_GUEST ||
1856 arg_link_journal == LINK_AUTO) &&
1859 r = mkdir_p(q, 0755);
1861 log_warning_errno(r, "Failed to create directory %s: %m", q);
1866 return log_error_errno(errno, "Failed to remove symlink %s: %m", p);
1867 } else if (r == -EINVAL) {
1869 if (arg_link_journal == LINK_GUEST &&
1872 if (errno == ENOTDIR) {
1873 log_error("%s already exists and is neither a symlink nor a directory", p);
1876 log_error_errno(errno, "Failed to remove %s: %m", p);
1880 } else if (r != -ENOENT) {
1881 log_error_errno(r, "readlink(%s) failed: %m", p);
1885 if (arg_link_journal == LINK_GUEST) {
1887 if (symlink(q, p) < 0) {
1888 if (arg_link_journal_try) {
1889 log_debug_errno(errno, "Failed to symlink %s to %s, skipping journal setup: %m", q, p);
1892 log_error_errno(errno, "Failed to symlink %s to %s: %m", q, p);
1897 r = mkdir_p(q, 0755);
1899 log_warning_errno(r, "Failed to create directory %s: %m", q);
1903 if (arg_link_journal == LINK_HOST) {
1904 /* don't create parents here -- if the host doesn't have
1905 * permanent journal set up, don't force it here */
1908 if (arg_link_journal_try) {
1909 log_debug_errno(errno, "Failed to create %s, skipping journal setup: %m", p);
1912 log_error_errno(errno, "Failed to create %s: %m", p);
1917 } else if (access(p, F_OK) < 0)
1920 if (dir_is_empty(q) == 0)
1921 log_warning("%s is not empty, proceeding anyway.", q);
1923 r = mkdir_p(q, 0755);
1925 log_error_errno(r, "Failed to create %s: %m", q);
1929 if (mount(p, q, "bind", MS_BIND, NULL) < 0)
1930 return log_error_errno(errno, "Failed to bind mount journal from host into guest: %m");
/* Drop capabilities from the bounding set: the complement of arg_retain is
 * handed to capability_bounding_set_drop(), and its result is returned. */
1935 static int drop_capabilities(void) {
1936 return capability_bounding_set_drop(~arg_retain, false);
/* Register the container with org.freedesktop.machine1 over the system bus.
 *
 * With arg_keep_unit set, the RegisterMachineWithNetwork method is called
 * directly. The remaining code builds a CreateMachineWithNetwork call by
 * hand: after the fixed arguments an array of (sv) unit properties is
 * appended, consisting of Slice (only if arg_slice is non-empty),
 * DevicePolicy=strict, a DeviceAllow list, and one entry per string in
 * arg_property. In both variants the network interface array holds
 * local_ifindex only when it is > 0. A failing registration is logged with
 * the bus error message. */
1939 static int register_machine(pid_t pid, int local_ifindex) {
1940 _cleanup_bus_error_free_ sd_bus_error error = SD_BUS_ERROR_NULL;
1941 _cleanup_bus_close_unref_ sd_bus *bus = NULL;
1947 r = sd_bus_default_system(&bus);
1949 return log_error_errno(r, "Failed to open system bus: %m");
1951 if (arg_keep_unit) {
1952 r = sd_bus_call_method(
1954 "org.freedesktop.machine1",
1955 "/org/freedesktop/machine1",
1956 "org.freedesktop.machine1.Manager",
1957 "RegisterMachineWithNetwork",
1962 SD_BUS_MESSAGE_APPEND_ID128(arg_uuid),
1966 strempty(arg_directory),
1967 local_ifindex > 0 ? 1 : 0, local_ifindex);
1969 _cleanup_bus_message_unref_ sd_bus_message *m = NULL;
1972 r = sd_bus_message_new_method_call(
1975 "org.freedesktop.machine1",
1976 "/org/freedesktop/machine1",
1977 "org.freedesktop.machine1.Manager",
1978 "CreateMachineWithNetwork");
1980 return bus_log_create_error(r);
1982 r = sd_bus_message_append(
1986 SD_BUS_MESSAGE_APPEND_ID128(arg_uuid),
1990 strempty(arg_directory),
1991 local_ifindex > 0 ? 1 : 0, local_ifindex);
1993 return bus_log_create_error(r);
/* Open the array of unit properties; it is closed again after the
 * arg_property loop. */
1995 r = sd_bus_message_open_container(m, 'a', "(sv)");
1997 return bus_log_create_error(r);
1999 if (!isempty(arg_slice)) {
2000 r = sd_bus_message_append(m, "(sv)", "Slice", "s", arg_slice);
2002 return bus_log_create_error(r);
2005 r = sd_bus_message_append(m, "(sv)", "DevicePolicy", "s", "strict");
2007 return bus_log_create_error(r);
2009 r = sd_bus_message_append(m, "(sv)", "DeviceAllow", "a(ss)", 9,
2010 /* Allow the container to
2011 * access and create the API
2012 * device nodes, so that
2013 * PrivateDevices= in the
2014 * container can work
2019 "/dev/random", "rwm",
2020 "/dev/urandom", "rwm",
2022 "/dev/net/tun", "rwm",
2023 /* Allow the container
2024 * access to ptys. However,
2026 * container to ever create
2027 * these device nodes. */
2028 "/dev/pts/ptmx", "rw",
2031 return log_error_errno(r, "Failed to add device whitelist: %m");
2033 STRV_FOREACH(i, arg_property) {
2034 r = sd_bus_message_open_container(m, 'r', "sv");
2036 return bus_log_create_error(r);
2038 r = bus_append_unit_property_assignment(m, *i);
2042 r = sd_bus_message_close_container(m);
2044 return bus_log_create_error(r);
2047 r = sd_bus_message_close_container(m);
2049 return bus_log_create_error(r);
2051 r = sd_bus_call(bus, m, 0, &error, NULL);
2055 log_error("Failed to register machine: %s", bus_error_message(&error, r));
/* Ask org.freedesktop.machine1 to terminate the machine belonging to pid.
 * A first call on the Manager interface yields the machine's object path
 * (read from the reply as "o"); a second call is then made on the
 * org.freedesktop.machine1.Machine interface. Failures of either bus call are
 * only logged at debug level. */
2062 static int terminate_machine(pid_t pid) {
2063 _cleanup_bus_error_free_ sd_bus_error error = SD_BUS_ERROR_NULL;
2064 _cleanup_bus_message_unref_ sd_bus_message *reply = NULL;
2065 _cleanup_bus_close_unref_ sd_bus *bus = NULL;
2072 r = sd_bus_default_system(&bus);
2074 return log_error_errno(r, "Failed to open system bus: %m");
2076 r = sd_bus_call_method(
2078 "org.freedesktop.machine1",
2079 "/org/freedesktop/machine1",
2080 "org.freedesktop.machine1.Manager",
2087 /* Note that the machine might already have been
2088 * cleaned up automatically, hence don't consider it a
2089 * failure if we cannot get the machine object. */
2090 log_debug("Failed to get machine: %s", bus_error_message(&error, r));
2094 r = sd_bus_message_read(reply, "o", &path);
2096 return bus_log_parse_error(r);
2098 r = sd_bus_call_method(
2100 "org.freedesktop.machine1",
2102 "org.freedesktop.machine1.Machine",
2108 log_debug("Failed to terminate machine: %s", bus_error_message(&error, r));
/* Reset the audit login UID of this process by writing 4294967295 (i.e.
 * (uint32_t) -1) to /proc/self/loginuid, unless it already has that value.
 * The function first tests arg_share_system; presumably nothing is done when
 * the system is shared with the host (body of that check not visible here --
 * confirm). If the write fails, an explanatory message is logged that
 * includes the error string. */
2115 static int reset_audit_loginuid(void) {
2116 _cleanup_free_ char *p = NULL;
2119 if (arg_share_system)
2122 r = read_one_line_file("/proc/self/loginuid", &p);
2126 return log_error_errno(r, "Failed to read /proc/self/loginuid: %m");
2128 /* Already reset? */
2129 if (streq(p, "4294967295"))
2132 r = write_string_file("/proc/self/loginuid", "4294967295");
2134 log_error("Failed to reset audit login UID. This probably means that your kernel is too\n"
2135 "old and you have audit enabled. Note that the auditing subsystem is known to\n"
2136 "be incompatible with containers on old kernels. Please make sure to upgrade\n"
2137 "your kernel or to turn off auditing with 'audit=0' on the kernel command line before\n"
2138 "using systemd-nspawn. Sleeping for 5s... (%s)\n", strerror(-r));
/* Fixed 128 bit keys passed as hash_key to generate_mac(): one each for the
 * host side and the container side of the veth link, and one for macvlan
 * interfaces. */
2146 #define HOST_HASH_KEY SD_ID128_MAKE(1a,37,6f,c7,46,ec,45,0b,ad,a3,d5,31,06,60,5d,b1)
2147 #define CONTAINER_HASH_KEY SD_ID128_MAKE(c3,c4,f9,19,b5,57,b2,1c,e6,cf,14,27,03,9c,ee,a2)
2148 #define MACVLAN_HASH_KEY SD_ID128_MAKE(00,13,6d,bc,66,83,44,81,bb,0c,f9,51,1f,24,a6,6f)
/* Derive a predictable MAC address and store it in *mac. The hashed input is
 * the host's machine ID followed by the container name (arg_machine) and,
 * where the memcpy() below applies, the index idx; it is hashed with
 * siphash24() under hash_key. The first ETH_ALEN bytes of the hash become the
 * address, with the multicast bit cleared and the locally-administered bit
 * set. */
2150 static int generate_mac(struct ether_addr *mac, sd_id128_t hash_key, uint64_t idx) {
2156 l = strlen(arg_machine);
2157 sz = sizeof(sd_id128_t) + l;
2163 /* fetch some persistent data unique to the host */
2164 r = sd_id128_get_machine((sd_id128_t*) v);
2168 /* combine with some data unique (on this host) to this
2169 * container instance */
2170 i = mempcpy(v + sizeof(sd_id128_t), arg_machine, l);
2173 memcpy(i, &idx, sizeof(idx));
2176 /* Let's hash the host machine ID plus the container name. We
2177 * use a fixed, but originally randomly created hash key here. */
2178 siphash24(result, v, sz, hash_key.bytes);
/* Compile-time guarantee that the hash output is large enough for a MAC. */
2180 assert_cc(ETH_ALEN <= sizeof(result));
2181 memcpy(mac->ether_addr_octet, result, ETH_ALEN);
2183 /* see eth_random_addr in the kernel */
2184 mac->ether_addr_octet[0] &= 0xfe; /* clear multicast bit */
2185 mac->ether_addr_octet[0] |= 0x02; /* set local assignment bit (IEEE802) */
/* Create a veth pair for the container via rtnetlink (RTM_NEWLINK). The host
 * side is named "ve-<machine>", or "vb-<machine>" when arg_network_bridge is
 * set, and that name is written to iface_name; the peer is named "host0" and
 * is created in the network namespace of pid (IFLA_NET_NS_PID). Both ends get
 * MAC addresses from generate_mac(). Afterwards the host-side interface index
 * is resolved with if_nametoindex(); presumably it is handed back through
 * *ifi (that assignment is not visible here -- confirm). */
2190 static int setup_veth(pid_t pid, char iface_name[IFNAMSIZ], int *ifi) {
2191 _cleanup_rtnl_message_unref_ sd_rtnl_message *m = NULL;
2192 _cleanup_rtnl_unref_ sd_rtnl *rtnl = NULL;
2193 struct ether_addr mac_host, mac_container;
2196 if (!arg_private_network)
2199 if (!arg_network_veth)
2202 /* Use two different interface name prefixes depending whether
2203 * we are in bridge mode or not. */
/* NOTE(review): the size passed is IFNAMSIZ - 1, so snprintf() writes at most
 * IFNAMSIZ - 2 characters plus the terminating NUL; longer names are cut. */
2204 snprintf(iface_name, IFNAMSIZ - 1, "%s-%s",
2205 arg_network_bridge ? "vb" : "ve", arg_machine);
2207 r = generate_mac(&mac_container, CONTAINER_HASH_KEY, 0);
2209 return log_error_errno(r, "Failed to generate predictable MAC address for container side: %m");
2211 r = generate_mac(&mac_host, HOST_HASH_KEY, 0);
2213 return log_error_errno(r, "Failed to generate predictable MAC address for host side: %m");
2215 r = sd_rtnl_open(&rtnl, 0);
2217 return log_error_errno(r, "Failed to connect to netlink: %m");
2219 r = sd_rtnl_message_new_link(rtnl, &m, RTM_NEWLINK, 0);
2221 return log_error_errno(r, "Failed to allocate netlink message: %m");
2223 r = sd_rtnl_message_append_string(m, IFLA_IFNAME, iface_name);
2225 return log_error_errno(r, "Failed to add netlink interface name: %m");
2227 r = sd_rtnl_message_append_ether_addr(m, IFLA_ADDRESS, &mac_host);
2229 return log_error_errno(r, "Failed to add netlink MAC address: %m");
/* Three nested containers: IFLA_LINKINFO > IFLA_INFO_DATA("veth") >
 * VETH_INFO_PEER; the attributes appended inside describe the peer. */
2231 r = sd_rtnl_message_open_container(m, IFLA_LINKINFO);
2233 return log_error_errno(r, "Failed to open netlink container: %m");
2235 r = sd_rtnl_message_open_container_union(m, IFLA_INFO_DATA, "veth");
2237 return log_error_errno(r, "Failed to open netlink container: %m");
2239 r = sd_rtnl_message_open_container(m, VETH_INFO_PEER);
2241 return log_error_errno(r, "Failed to open netlink container: %m");
2243 r = sd_rtnl_message_append_string(m, IFLA_IFNAME, "host0");
2245 return log_error_errno(r, "Failed to add netlink interface name: %m");
2247 r = sd_rtnl_message_append_ether_addr(m, IFLA_ADDRESS, &mac_container);
2249 return log_error_errno(r, "Failed to add netlink MAC address: %m");
2251 r = sd_rtnl_message_append_u32(m, IFLA_NET_NS_PID, pid);
2253 return log_error_errno(r, "Failed to add netlink namespace field: %m");
2255 r = sd_rtnl_message_close_container(m);
2257 return log_error_errno(r, "Failed to close netlink container: %m");
2259 r = sd_rtnl_message_close_container(m);
2261 return log_error_errno(r, "Failed to close netlink container: %m");
2263 r = sd_rtnl_message_close_container(m);
2265 return log_error_errno(r, "Failed to close netlink container: %m");
2267 r = sd_rtnl_call(rtnl, m, 0, NULL);
2269 return log_error_errno(r, "Failed to add new veth interfaces: %m");
2271 i = (int) if_nametoindex(iface_name);
2273 return log_error_errno(errno, "Failed to resolve interface %s: %m", iface_name);
/* Attach the host-side veth interface veth_name to the bridge named by
 * arg_network_bridge. The bridge's index is resolved with if_nametoindex()
 * and an RTM_SETLINK message is sent that sets IFF_UP, names the interface
 * (IFLA_IFNAME) and sets its master (IFLA_MASTER) to that index. The function
 * tests arg_private_network, arg_network_veth and arg_network_bridge first. */
2280 static int setup_bridge(const char veth_name[], int *ifi) {
2281 _cleanup_rtnl_message_unref_ sd_rtnl_message *m = NULL;
2282 _cleanup_rtnl_unref_ sd_rtnl *rtnl = NULL;
2285 if (!arg_private_network)
2288 if (!arg_network_veth)
2291 if (!arg_network_bridge)
2294 bridge = (int) if_nametoindex(arg_network_bridge);
2296 return log_error_errno(errno, "Failed to resolve interface %s: %m", arg_network_bridge);
2300 r = sd_rtnl_open(&rtnl, 0);
2302 return log_error_errno(r, "Failed to connect to netlink: %m");
2304 r = sd_rtnl_message_new_link(rtnl, &m, RTM_SETLINK, 0);
2306 return log_error_errno(r, "Failed to allocate netlink message: %m");
2308 r = sd_rtnl_message_link_set_flags(m, IFF_UP, IFF_UP);
2310 return log_error_errno(r, "Failed to set IFF_UP flag: %m");
2312 r = sd_rtnl_message_append_string(m, IFLA_IFNAME, veth_name);
2314 return log_error_errno(r, "Failed to add netlink interface name field: %m");
2316 r = sd_rtnl_message_append_u32(m, IFLA_MASTER, bridge);
2318 return log_error_errno(r, "Failed to add netlink master field: %m");
2320 r = sd_rtnl_call(rtnl, m, 0, NULL);
2322 return log_error_errno(r, "Failed to add veth interface to bridge: %m");
/* Resolve the network interface called name to its index and check through
 * udev that the device has been initialized. The udev device is looked up by
 * the device id "n<ifindex>". Callers in this file use the return value as
 * the interface index. */
2327 static int parse_interface(struct udev *udev, const char *name) {
2328 _cleanup_udev_device_unref_ struct udev_device *d = NULL;
/* Room for the 'n' prefix, the decimal digits of an int and the NUL. */
2329 char ifi_str[2 + DECIMAL_STR_MAX(int)];
2332 ifi = (int) if_nametoindex(name);
2334 return log_error_errno(errno, "Failed to resolve interface %s: %m", name);
2336 sprintf(ifi_str, "n%i", ifi);
2337 d = udev_device_new_from_device_id(udev, ifi_str);
2339 return log_error_errno(errno, "Failed to get udev device for interface %s: %m", name);
2341 if (udev_device_get_is_initialized(d) <= 0) {
2342 log_error("Network interface %s is not initialized yet.", name);
/* Move every interface listed in arg_network_interfaces into the network
 * namespace of pid. Each name is resolved with parse_interface() and an
 * RTM_SETLINK message carrying IFLA_NET_NS_PID is sent for it. The function
 * tests arg_private_network and whether the list is empty first. */
2349 static int move_network_interfaces(pid_t pid) {
2350 _cleanup_udev_unref_ struct udev *udev = NULL;
2351 _cleanup_rtnl_unref_ sd_rtnl *rtnl = NULL;
2355 if (!arg_private_network)
2358 if (strv_isempty(arg_network_interfaces))
2361 r = sd_rtnl_open(&rtnl, 0);
2363 return log_error_errno(r, "Failed to connect to netlink: %m");
2367 log_error("Failed to connect to udev.");
2371 STRV_FOREACH(i, arg_network_interfaces) {
2372 _cleanup_rtnl_message_unref_ sd_rtnl_message *m = NULL;
2375 ifi = parse_interface(udev, *i);
2379 r = sd_rtnl_message_new_link(rtnl, &m, RTM_SETLINK, ifi);
2381 return log_error_errno(r, "Failed to allocate netlink message: %m");
2383 r = sd_rtnl_message_append_u32(m, IFLA_NET_NS_PID, pid);
2385 return log_error_errno(r, "Failed to append namespace PID to netlink message: %m");
2387 r = sd_rtnl_call(rtnl, m, 0, NULL);
2389 return log_error_errno(r, "Failed to move interface %s to namespace: %m", *i);
/* Create one macvlan interface per entry of arg_network_macvlan, inside the
 * network namespace of pid. Each new link is stacked on the named host
 * interface (IFLA_LINK), is called "mv-<name>" shortened to IFNAMSIZ-1
 * characters, uses MACVLAN_MODE_BRIDGE and gets a MAC address from
 * generate_mac() with MACVLAN_HASH_KEY and a running index. */
2395 static int setup_macvlan(pid_t pid) {
2396 _cleanup_udev_unref_ struct udev *udev = NULL;
2397 _cleanup_rtnl_unref_ sd_rtnl *rtnl = NULL;
2402 if (!arg_private_network)
2405 if (strv_isempty(arg_network_macvlan))
2408 r = sd_rtnl_open(&rtnl, 0);
2410 return log_error_errno(r, "Failed to connect to netlink: %m");
2414 log_error("Failed to connect to udev.");
2418 STRV_FOREACH(i, arg_network_macvlan) {
2419 _cleanup_rtnl_message_unref_ sd_rtnl_message *m = NULL;
2420 _cleanup_free_ char *n = NULL;
2421 struct ether_addr mac;
2424 ifi = parse_interface(udev, *i);
/* idx is post-incremented, so every interface hashes a different index. */
2428 r = generate_mac(&mac, MACVLAN_HASH_KEY, idx++);
2430 return log_error_errno(r, "Failed to create MACVLAN MAC address: %m");
2432 r = sd_rtnl_message_new_link(rtnl, &m, RTM_NEWLINK, 0);
2434 return log_error_errno(r, "Failed to allocate netlink message: %m");
2436 r = sd_rtnl_message_append_u32(m, IFLA_LINK, ifi);
2438 return log_error_errno(r, "Failed to add netlink interface index: %m");
2440 n = strappend("mv-", *i);
2444 strshorten(n, IFNAMSIZ-1);
2446 r = sd_rtnl_message_append_string(m, IFLA_IFNAME, n);
2448 return log_error_errno(r, "Failed to add netlink interface name: %m");
2450 r = sd_rtnl_message_append_ether_addr(m, IFLA_ADDRESS, &mac);
2452 return log_error_errno(r, "Failed to add netlink MAC address: %m");
2454 r = sd_rtnl_message_append_u32(m, IFLA_NET_NS_PID, pid);
2456 return log_error_errno(r, "Failed to add netlink namespace field: %m");
2458 r = sd_rtnl_message_open_container(m, IFLA_LINKINFO);
2460 return log_error_errno(r, "Failed to open netlink container: %m");
2462 r = sd_rtnl_message_open_container_union(m, IFLA_INFO_DATA, "macvlan");
2464 return log_error_errno(r, "Failed to open netlink container: %m");
2466 r = sd_rtnl_message_append_u32(m, IFLA_MACVLAN_MODE, MACVLAN_MODE_BRIDGE);
2468 return log_error_errno(r, "Failed to append macvlan mode: %m");
2470 r = sd_rtnl_message_close_container(m);
2472 return log_error_errno(r, "Failed to close netlink container: %m");
2474 r = sd_rtnl_message_close_container(m);
2476 return log_error_errno(r, "Failed to close netlink container: %m");
2478 r = sd_rtnl_call(rtnl, m, 0, NULL);
2480 return log_error_errno(r, "Failed to add new macvlan interfaces: %m");
/* Create one ipvlan interface per entry of arg_network_ipvlan, inside the
 * network namespace of pid. Each new link is stacked on the named host
 * interface (IFLA_LINK), is called "iv-<name>" shortened to IFNAMSIZ-1
 * characters and uses IPVLAN_MODE_L2. Unlike setup_macvlan(), no MAC address
 * is appended to the message. */
2486 static int setup_ipvlan(pid_t pid) {
2487 _cleanup_udev_unref_ struct udev *udev = NULL;
2488 _cleanup_rtnl_unref_ sd_rtnl *rtnl = NULL;
2492 if (!arg_private_network)
2495 if (strv_isempty(arg_network_ipvlan))
2498 r = sd_rtnl_open(&rtnl, 0);
2500 return log_error_errno(r, "Failed to connect to netlink: %m");
2504 log_error("Failed to connect to udev.");
2508 STRV_FOREACH(i, arg_network_ipvlan) {
2509 _cleanup_rtnl_message_unref_ sd_rtnl_message *m = NULL;
2510 _cleanup_free_ char *n = NULL;
2513 ifi = parse_interface(udev, *i);
2517 r = sd_rtnl_message_new_link(rtnl, &m, RTM_NEWLINK, 0);
2519 return log_error_errno(r, "Failed to allocate netlink message: %m");
2521 r = sd_rtnl_message_append_u32(m, IFLA_LINK, ifi);
2523 return log_error_errno(r, "Failed to add netlink interface index: %m");
2525 n = strappend("iv-", *i);
2529 strshorten(n, IFNAMSIZ-1);
2531 r = sd_rtnl_message_append_string(m, IFLA_IFNAME, n);
2533 return log_error_errno(r, "Failed to add netlink interface name: %m");
2535 r = sd_rtnl_message_append_u32(m, IFLA_NET_NS_PID, pid);
2537 return log_error_errno(r, "Failed to add netlink namespace field: %m");
2539 r = sd_rtnl_message_open_container(m, IFLA_LINKINFO);
2541 return log_error_errno(r, "Failed to open netlink container: %m");
2543 r = sd_rtnl_message_open_container_union(m, IFLA_INFO_DATA, "ipvlan");
2545 return log_error_errno(r, "Failed to open netlink container: %m");
2547 r = sd_rtnl_message_append_u16(m, IFLA_IPVLAN_MODE, IPVLAN_MODE_L2);
2549 return log_error_errno(r, "Failed to add ipvlan mode: %m");
2551 r = sd_rtnl_message_close_container(m);
2553 return log_error_errno(r, "Failed to close netlink container: %m");
2555 r = sd_rtnl_message_close_container(m);
2557 return log_error_errno(r, "Failed to close netlink container: %m");
2559 r = sd_rtnl_call(rtnl, m, 0, NULL);
2561 return log_error_errno(r, "Failed to add new ipvlan interfaces: %m");
/* Build and load the seccomp filter for the container. The filter allows
 * everything by default (SCMP_ACT_ALLOW) and adds rules that make the
 * syscalls in blacklist[] fail with EPERM, the module-loading syscalls in
 * kmod_blacklist[] fail with EPERM unless CAP_SYS_MODULE is in arg_retain,
 * and the creation of NETLINK_AUDIT sockets fail with EAFNOSUPPORT. The
 * SCMP_FLTATR_CTL_NNP attribute is set to 0 before loading, and the filter
 * context is released afterwards. */
2567 static int setup_seccomp(void) {
2570 static const int blacklist[] = {
2571 SCMP_SYS(kexec_load),
2572 SCMP_SYS(open_by_handle_at),
2579 static const int kmod_blacklist[] = {
2580 SCMP_SYS(init_module),
2581 SCMP_SYS(finit_module),
2582 SCMP_SYS(delete_module),
2585 scmp_filter_ctx seccomp;
2589 seccomp = seccomp_init(SCMP_ACT_ALLOW);
2593 r = seccomp_add_secondary_archs(seccomp);
2595 log_error_errno(r, "Failed to add secondary archs to seccomp filter: %m");
2599 for (i = 0; i < ELEMENTSOF(blacklist); i++) {
2600 r = seccomp_rule_add(seccomp, SCMP_ACT_ERRNO(EPERM), blacklist[i], 0);
2602 continue; /* unknown syscall */
2604 log_error_errno(r, "Failed to block syscall: %m");
2609 /* If the CAP_SYS_MODULE capability is not requested then
2610 * we'll block the kmod syscalls too */
2611 if (!(arg_retain & (1ULL << CAP_SYS_MODULE))) {
2612 for (i = 0; i < ELEMENTSOF(kmod_blacklist); i++) {
2613 r = seccomp_rule_add(seccomp, SCMP_ACT_ERRNO(EPERM), kmod_blacklist[i], 0);
2615 continue; /* unknown syscall */
2617 log_error_errno(r, "Failed to block syscall: %m");
2624 Audit is broken in containers, much of the userspace audit
2625 hookup will fail if running inside a container. We don't
2626 care and just turn off creation of audit sockets.
2628 This will make socket(AF_NETLINK, *, NETLINK_AUDIT) fail
2629 with EAFNOSUPPORT which audit userspace uses as indication
2630 that audit is disabled in the kernel.
2633 r = seccomp_rule_add(
2635 SCMP_ACT_ERRNO(EAFNOSUPPORT),
2638 SCMP_A0(SCMP_CMP_EQ, AF_NETLINK),
2639 SCMP_A2(SCMP_CMP_EQ, NETLINK_AUDIT));
2641 log_error_errno(r, "Failed to add audit seccomp rule: %m");
2645 r = seccomp_attr_set(seccomp, SCMP_FLTATR_CTL_NNP, 0);
2647 log_error_errno(r, "Failed to unset NO_NEW_PRIVS: %m");
2651 r = seccomp_load(seccomp);
2653 log_error_errno(r, "Failed to install seccomp audit filter: %m");
2656 seccomp_release(seccomp);
/* Set up the mount propagation directory for the container: the host
 * directory /run/systemd/nspawn/propagate/<machine> is created and bind
 * mounted onto <root>/run/systemd/nspawn/incoming, and that mount is then
 * remounted read-only. Directory creation is best-effort (results ignored);
 * mount failures are logged together with the errno reason. */
2664 static int setup_propagate(const char *root) {
2667 (void) mkdir_p("/run/systemd/nspawn/", 0755);
2668 (void) mkdir_p("/run/systemd/nspawn/propagate", 0600);
2669 p = strjoina("/run/systemd/nspawn/propagate/", arg_machine);
2670 (void) mkdir_p(p, 0600);
2672 q = strjoina(root, "/run/systemd/nspawn/incoming");
2673 (void) mkdir_parents(q, 0755);
2676 if (mount(p, q, NULL, MS_BIND, NULL) < 0)
2677 return log_error_errno(errno, "Failed to install propagation bind mount: %m");
/* A bind mount only becomes read-only through a separate remount. */
2679 if (mount(NULL, q, NULL, MS_BIND|MS_REMOUNT|MS_RDONLY, NULL) < 0)
2680 return log_error_errno(errno, "Failed to make propagation mount read-only: %m");
/* Open the disk image arg_image and make it available as a block device.
 * If arg_image already is a block device its own path is used (see the
 * strdup() below). If it is a regular file, a free loop device is obtained
 * from /dev/loop-control, the image fd is attached with LOOP_SET_FD, and the
 * loop device is configured with LO_FLAGS_AUTOCLEAR|LO_FLAGS_PARTSCAN (plus
 * LO_FLAGS_READ_ONLY where the code below adds it); its path is returned in
 * *device_path. Anything else is rejected. The image and the loop device are
 * opened read-only when arg_read_only is set. */
2685 static int setup_image(char **device_path, int *loop_nr) {
2686 struct loop_info64 info = {
2687 .lo_flags = LO_FLAGS_AUTOCLEAR|LO_FLAGS_PARTSCAN
2689 _cleanup_close_ int fd = -1, control = -1, loop = -1;
2690 _cleanup_free_ char* loopdev = NULL;
2694 assert(device_path);
2698 fd = open(arg_image, O_CLOEXEC|(arg_read_only ? O_RDONLY : O_RDWR)|O_NONBLOCK|O_NOCTTY);
2700 return log_error_errno(errno, "Failed to open %s: %m", arg_image);
2702 if (fstat(fd, &st) < 0)
2703 return log_error_errno(errno, "Failed to stat %s: %m", arg_image);
2705 if (S_ISBLK(st.st_mode)) {
2708 p = strdup(arg_image);
2722 if (!S_ISREG(st.st_mode)) {
/* No call has failed here, so errno carries no meaning: log without %m. */
2723 log_error("%s is not a regular file or block device", arg_image);
2727 control = open("/dev/loop-control", O_RDWR|O_CLOEXEC|O_NOCTTY|O_NONBLOCK);
2729 return log_error_errno(errno, "Failed to open /dev/loop-control: %m");
2731 nr = ioctl(control, LOOP_CTL_GET_FREE);
2733 return log_error_errno(errno, "Failed to allocate loop device: %m");
2735 if (asprintf(&loopdev, "/dev/loop%i", nr) < 0)
2738 loop = open(loopdev, O_CLOEXEC|(arg_read_only ? O_RDONLY : O_RDWR)|O_NONBLOCK|O_NOCTTY);
2740 return log_error_errno(errno, "Failed to open loop device %s: %m", loopdev);
2742 if (ioctl(loop, LOOP_SET_FD, fd) < 0)
2743 return log_error_errno(errno, "Failed to set loopback file descriptor on %s: %m", loopdev);
2746 info.lo_flags |= LO_FLAGS_READ_ONLY;
2748 if (ioctl(loop, LOOP_SET_STATUS64, &info) < 0)
2749 return log_error_errno(errno, "Failed to set loopback settings on %s: %m", loopdev);
2751 *device_path = loopdev;
/* Explanatory text about the partition table layouts accepted for a disk
 * image; appended (by string literal concatenation) to the error messages
 * logged in dissect_image(). */
2762 #define PARTITION_TABLE_BLURB \
2763 "Note that the disk image needs to either contain only a single MBR partition of\n" \
2764 "type 0x83 that is marked bootable, or a single GPT partition of type " \
2765 "0FC63DAF-8483-4772-8E79-3D69D8477DE4 or follow\n" \
2766 " http://www.freedesktop.org/wiki/Specifications/DiscoverablePartitionsSpec/\n" \
2767 "to be bootable with systemd-nspawn."
2769 static int dissect_image(
2771 char **root_device, bool *root_device_rw,
2772 char **home_device, bool *home_device_rw,
2773 char **srv_device, bool *srv_device_rw,
2777 int home_nr = -1, srv_nr = -1;
2778 #ifdef GPT_ROOT_NATIVE
2781 #ifdef GPT_ROOT_SECONDARY
2782 int secondary_root_nr = -1;
2784 _cleanup_free_ char *home = NULL, *root = NULL, *secondary_root = NULL, *srv = NULL, *generic = NULL;
2785 _cleanup_udev_enumerate_unref_ struct udev_enumerate *e = NULL;
2786 _cleanup_udev_device_unref_ struct udev_device *d = NULL;
2787 _cleanup_blkid_free_probe_ blkid_probe b = NULL;
2788 _cleanup_udev_unref_ struct udev *udev = NULL;
2789 struct udev_list_entry *first, *item;
2790 bool home_rw = true, root_rw = true, secondary_root_rw = true, srv_rw = true, generic_rw = true;
2791 bool is_gpt, is_mbr, multiple_generic = false;
2792 const char *pttype = NULL;
2799 assert(root_device);
2800 assert(home_device);
2805 b = blkid_new_probe();
2810 r = blkid_probe_set_device(b, fd, 0, 0);
2815 log_error_errno(errno, "Failed to set device on blkid probe: %m");
2819 blkid_probe_enable_partitions(b, 1);
2820 blkid_probe_set_partitions_flags(b, BLKID_PARTS_ENTRY_DETAILS);
2823 r = blkid_do_safeprobe(b);
2824 if (r == -2 || r == 1) {
2825 log_error("Failed to identify any partition table on\n"
2827 PARTITION_TABLE_BLURB, arg_image);
2829 } else if (r != 0) {
2832 log_error_errno(errno, "Failed to probe: %m");
2836 blkid_probe_lookup_value(b, "PTTYPE", &pttype, NULL);
2838 is_gpt = streq_ptr(pttype, "gpt");
2839 is_mbr = streq_ptr(pttype, "dos");
2841 if (!is_gpt && !is_mbr) {
2842 log_error("No GPT or MBR partition table discovered on\n"
2844 PARTITION_TABLE_BLURB, arg_image);
2849 pl = blkid_probe_get_partitions(b);
2854 log_error("Failed to list partitions of %s", arg_image);
2862 if (fstat(fd, &st) < 0)
2863 return log_error_errno(errno, "Failed to stat block device: %m");
2865 d = udev_device_new_from_devnum(udev, 'b', st.st_rdev);
2873 log_error("Kernel partitions never appeared.");
2877 e = udev_enumerate_new(udev);
2881 r = udev_enumerate_add_match_parent(e, d);
2885 r = udev_enumerate_scan_devices(e);
2887 return log_error_errno(r, "Failed to scan for partition devices of %s: %m", arg_image);
2889 /* Count the partitions enumerated by the kernel */
2891 first = udev_enumerate_get_list_entry(e);
2892 udev_list_entry_foreach(item, first)
2895 /* Count the partitions enumerated by blkid */
2896 m = blkid_partlist_numof_partitions(pl);
2900 log_error("blkid and kernel partition list do not match.");
2906 /* The kernel has probed fewer partitions than
2907 * blkid? Maybe the kernel prober is still
2908 * running or it got EBUSY because udev
2909 * already opened the device. Let's reprobe
2910 * the device, which is a synchronous call
2911 * that waits until probing is complete. */
2913 for (j = 0; j < 20; j++) {
2915 r = ioctl(fd, BLKRRPART, 0);
2918 if (r >= 0 || r != -EBUSY)
2921 /* If something else has the device
2922 * open, such as an udev rule, the
2923 * ioctl will return EBUSY. Since
2924 * there's no way to wait until it
2925 * isn't busy anymore, let's just wait
2926 * a bit, and try again.
2928 * This is really something they
2929 * should fix in the kernel! */
2931 usleep(50 * USEC_PER_MSEC);
2935 return log_error_errno(r, "Failed to reread partition table: %m");
2938 e = udev_enumerate_unref(e);
2941 first = udev_enumerate_get_list_entry(e);
2942 udev_list_entry_foreach(item, first) {
2943 _cleanup_udev_device_unref_ struct udev_device *q;
2945 unsigned long long flags;
2951 q = udev_device_new_from_syspath(udev, udev_list_entry_get_name(item));
2956 log_error_errno(errno, "Failed to get partition device of %s: %m", arg_image);
2960 qn = udev_device_get_devnum(q);
2964 if (st.st_rdev == qn)
2967 node = udev_device_get_devnode(q);
2971 pp = blkid_partlist_devno_to_partition(pl, qn);
2975 flags = blkid_partition_get_flags(pp);
2977 nr = blkid_partition_get_partno(pp);
2985 if (flags & GPT_FLAG_NO_AUTO)
2988 stype = blkid_partition_get_type_string(pp);
2992 if (sd_id128_from_string(stype, &type_id) < 0)
2995 if (sd_id128_equal(type_id, GPT_HOME)) {
2997 if (home && nr >= home_nr)
3001 home_rw = !(flags & GPT_FLAG_READ_ONLY);
3003 r = free_and_strdup(&home, node);
3007 } else if (sd_id128_equal(type_id, GPT_SRV)) {
3009 if (srv && nr >= srv_nr)
3013 srv_rw = !(flags & GPT_FLAG_READ_ONLY);
3015 r = free_and_strdup(&srv, node);
3019 #ifdef GPT_ROOT_NATIVE
3020 else if (sd_id128_equal(type_id, GPT_ROOT_NATIVE)) {
3022 if (root && nr >= root_nr)
3026 root_rw = !(flags & GPT_FLAG_READ_ONLY);
3028 r = free_and_strdup(&root, node);
3033 #ifdef GPT_ROOT_SECONDARY
3034 else if (sd_id128_equal(type_id, GPT_ROOT_SECONDARY)) {
3036 if (secondary_root && nr >= secondary_root_nr)
3039 secondary_root_nr = nr;
3040 secondary_root_rw = !(flags & GPT_FLAG_READ_ONLY);
3042 r = free_and_strdup(&secondary_root, node);
3047 else if (sd_id128_equal(type_id, GPT_LINUX_GENERIC)) {
3050 multiple_generic = true;
3052 generic_rw = !(flags & GPT_FLAG_READ_ONLY);
3054 r = free_and_strdup(&generic, node);
3060 } else if (is_mbr) {
3063 if (flags != 0x80) /* Bootable flag */
3066 type = blkid_partition_get_type(pp);
3067 if (type != 0x83) /* Linux partition */
3071 multiple_generic = true;
3075 r = free_and_strdup(&root, node);
3083 *root_device = root;
3086 *root_device_rw = root_rw;
3088 } else if (secondary_root) {
3089 *root_device = secondary_root;
3090 secondary_root = NULL;
3092 *root_device_rw = secondary_root_rw;
3094 } else if (generic) {
3096 /* There were no partitions with precise meanings
3097 * around, but we found generic partitions. In this
3098 * case, if there's only one, we can go ahead and boot
3099 * it, otherwise we bail out, because we really cannot
3100 * make any sense of it. */
3102 if (multiple_generic) {
3103 log_error("Identified multiple bootable Linux partitions on\n"
3105 PARTITION_TABLE_BLURB, arg_image);
3109 *root_device = generic;
3112 *root_device_rw = generic_rw;
3115 log_error("Failed to identify root partition in disk image\n"
3117 PARTITION_TABLE_BLURB, arg_image);
3122 *home_device = home;
3125 *home_device_rw = home_rw;
3132 *srv_device_rw = srv_rw;
3137 log_error("--image= is not supported, compiled without blkid support.");
/* Probe the file system on the device node `what` with libblkid and mount it
 * at a path built from `where` and `directory`.
 *
 * what:      device node to probe and mount
 * where:     base path of the mount target
 * directory: path joined onto `where` to form the mount target
 * rw:        when false the mount is made with MS_RDONLY
 *
 * The mount always uses MS_NODEV. Devices whose detected type is crypto_LUKS
 * are rejected. Every failure is logged here; a failing mount() is reported
 * through log_error_errno() and handed back to the caller.
 * NOTE(review): the remaining return statements and the preprocessor guards
 * are not visible in this excerpt; the final log_error() looks like the
 * branch used by builds without blkid support -- confirm. */
3142 static int mount_device(const char *what, const char *where, const char *directory, bool rw) {
3144 _cleanup_blkid_free_probe_ blkid_probe b = NULL;
3145 const char *fstype, *p;
3155 p = strjoina(where, directory);
3160 b = blkid_new_probe_from_filename(what);
3164 log_error_errno(errno, "Failed to allocate prober for %s: %m", what);
     /* Only superblock probing is enabled, restricted to the TYPE value. */
3168 blkid_probe_enable_superblocks(b, 1);
3169 blkid_probe_set_superblocks_flags(b, BLKID_SUBLKS_TYPE);
3172 r = blkid_do_safeprobe(b);
     /* A result of -1 or 1 is reported as an undeterminable file system
      * type; any other non-zero result is reported as a probe failure. */
3173 if (r == -1 || r == 1) {
3174 log_error("Cannot determine file system type of %s", what);
3176 } else if (r != 0) {
3179 log_error_errno(errno, "Failed to probe %s: %m", what);
3184 if (blkid_probe_lookup_value(b, "TYPE", &fstype, NULL) < 0) {
3187 log_error("Failed to determine file system type of %s", what);
     /* Encrypted LUKS containers cannot be mounted directly and are refused. */
3191 if (streq(fstype, "crypto_LUKS")) {
3192 log_error("nspawn currently does not support LUKS disk images.");
     /* The detected type string is passed to mount() as the file system type. */
3196 if (mount(what, p, fstype, MS_NODEV|(rw ? 0 : MS_RDONLY), NULL) < 0)
3197 return log_error_errno(errno, "Failed to mount %s: %m", what);
3201 log_error("--image= is not supported, compiled without blkid support.");
/* Mount the partitions found in a disk image below arg_directory by calling
 * mount_device() once per device: the root device at the top of the tree, the
 * home device at /home and the server data device at /srv. Each device has
 * its own read-write flag.
 *
 * A failing mount_device() call is logged with the name of the affected
 * directory and its result is returned through log_error_errno().
 * NOTE(review): part of the parameter list and any checks that skip absent
 * devices are on lines not visible in this excerpt. */
3206 static int mount_devices(
3208 const char *root_device, bool root_device_rw,
3209 const char *home_device, bool home_device_rw,
3210 const char *srv_device, bool srv_device_rw) {
3216 r = mount_device(root_device, arg_directory, NULL, root_device_rw);
3218 return log_error_errno(r, "Failed to mount root directory: %m");
3222 r = mount_device(home_device, arg_directory, "/home", home_device_rw);
3224 return log_error_errno(r, "Failed to mount home directory: %m");
3228 r = mount_device(srv_device, arg_directory, "/srv", srv_device_rw);
3230 return log_error_errno(r, "Failed to mount server data directory: %m");
/* Tear down the loop device with number `nr`.
 *
 * nr:       loop device number passed to LOOP_CTL_REMOVE
 * image_fd: optional pointer to the open loop device descriptor; when it is
 *           given and holds a valid descriptor, LOOP_CLR_FD is issued on it
 *           and the descriptor is closed
 *
 * Nothing is returned: failures of the ioctl() calls and of opening
 * /dev/loop-control are only logged. */
3236 static void loop_remove(int nr, int *image_fd) {
3237 _cleanup_close_ int control = -1;
3243 if (image_fd && *image_fd >= 0) {
3244 r = ioctl(*image_fd, LOOP_CLR_FD);
3246 log_debug_errno(errno, "Failed to close loop image: %m");
     /* The result of safe_close() is written back into the caller variable
      * (presumably an invalid descriptor value, so that later cleanup in the
      * caller does not close it again -- confirm). */
3247 *image_fd = safe_close(*image_fd);
3250 control = open("/dev/loop-control", O_RDWR|O_CLOEXEC|O_NOCTTY|O_NONBLOCK);
3252 log_warning_errno(errno, "Failed to open /dev/loop-control: %m");
3256 r = ioctl(control, LOOP_CTL_REMOVE, nr);
3258 log_debug_errno(errno, "Failed to remove loop %d: %m", nr);
/* Fork a child process that runs getent for `database` and `key`, with its
 * standard output connected to a pipe.
 *
 * In the child, stdout becomes the write end of the pipe, stdin and stderr are
 * redirected to /dev/null, signal handlers are reset and all remaining
 * descriptors are closed. getent is then executed with an empty environment,
 * trying /usr/bin/getent first and /bin/getent second; the child exits with
 * EXIT_FAILURE when any step fails.
 *
 * In the parent the write end of the pipe is closed.
 * NOTE(review): the return statement and the store to `rpid` are on lines not
 * visible in this excerpt; the callers in this file use the result as a
 * readable descriptor and wait on the PID they passed in. */
3261 static int spawn_getent(const char *database, const char *key, pid_t *rpid) {
3269 if (pipe2(pipe_fds, O_CLOEXEC) < 0)
3270 return log_error_errno(errno, "Failed to allocate pipe: %m");
3274 return log_error_errno(errno, "Failed to fork getent child: %m");
3275 else if (pid == 0) {
     /* Child side: a single NULL entry serves as the empty environment. */
3277 char *empty_env = NULL;
3279 if (dup3(pipe_fds[1], STDOUT_FILENO, 0) < 0)
3280 _exit(EXIT_FAILURE);
     /* The original pipe descriptors are closed only when they are not
      * themselves one of the three standard descriptors. */
3282 if (pipe_fds[0] > 2)
3283 safe_close(pipe_fds[0]);
3284 if (pipe_fds[1] > 2)
3285 safe_close(pipe_fds[1]);
3287 nullfd = open("/dev/null", O_RDWR);
3289 _exit(EXIT_FAILURE);
3291 if (dup3(nullfd, STDIN_FILENO, 0) < 0)
3292 _exit(EXIT_FAILURE);
3294 if (dup3(nullfd, STDERR_FILENO, 0) < 0)
3295 _exit(EXIT_FAILURE);
3300 reset_all_signal_handlers();
3301 close_all_fds(NULL, 0);
     /* The second execle() is reached only when the first one failed. */
3303 execle("/usr/bin/getent", "getent", database, key, NULL, &empty_env);
3304 execle("/bin/getent", "getent", database, key, NULL, &empty_env);
3305 _exit(EXIT_FAILURE);
     /* Parent side: the write end is not needed here. */
3308 pipe_fds[1] = safe_close(pipe_fds[1]);
/* Switch the calling process to the user named by arg_user.
 *
 * When arg_user is unset, or is root or 0, the supplementary groups are
 * cleared and the real, effective and saved group and user IDs are all set
 * to 0.
 *
 * Otherwise the account is resolved by running getent twice through
 * spawn_getent(): first for the passwd database, whose single output line is
 * split at the colons and parsed into UID and GID, then for initgroups, whose
 * whitespace separated words after the user name are parsed as group IDs.
 * The home directory is then created, ownership of stdin, stdout and stderr
 * is handed to the user, and finally the supplementary groups, the GID and
 * the UID are applied, in that order.
 *
 * _home: NOTE(review): presumably receives the resolved home directory on
 *        success; the assignment is on a line not visible in this excerpt.
 *
 * Every failure is logged. The setresgid() and setresuid() failure messages
 * name the call that actually failed. */
3315 static int change_uid_gid(char **_home) {
3316 char line[LINE_MAX], *x, *u, *g, *h;
3317 const char *word, *state;
3318 _cleanup_free_ uid_t *uids = NULL;
3319 _cleanup_free_ char *home = NULL;
3320 _cleanup_fclose_ FILE *f = NULL;
3321 _cleanup_close_ int fd = -1;
3322 unsigned n_uids = 0;
3331 if (!arg_user || streq(arg_user, "root") || streq(arg_user, "0")) {
3332 /* Reset everything fully to 0, just in case */
3334 if (setgroups(0, NULL) < 0)
3335 return log_error_errno(errno, "setgroups() failed: %m");
3337 if (setresgid(0, 0, 0) < 0)
3338 return log_error_errno(errno, "setresgid() failed: %m");
3340 if (setresuid(0, 0, 0) < 0)
3341 return log_error_errno(errno, "setresuid() failed: %m");
3347 /* First, get user credentials */
3348 fd = spawn_getent("passwd", arg_user, &pid);
3352 f = fdopen(fd, "r");
3357 if (!fgets(line, sizeof(line), f)) {
3360 log_error("Failed to resolve user %s.", arg_user);
3364 log_error_errno(errno, "Failed to read from getent: %m");
3370 wait_for_terminate_and_warn("getent passwd", pid, true);
     /* Walk the colon separated passwd fields one by one. */
3372 x = strchr(line, ':');
3374 log_error("/etc/passwd entry has invalid user field.");
3378 u = strchr(x+1, ':');
3380 log_error("/etc/passwd entry has invalid password field.");
3387 log_error("/etc/passwd entry has invalid UID field.");
3395 log_error("/etc/passwd entry has invalid GID field.");
3400 h = strchr(x+1, ':');
3402 log_error("/etc/passwd entry has invalid GECOS field.");
3409 log_error("/etc/passwd entry has invalid home directory field.");
3415 r = parse_uid(u, &uid);
3417 log_error("Failed to parse UID of user.");
3421 r = parse_gid(g, &gid);
3423 log_error("Failed to parse GID of user.");
3431 /* Second, get group memberships */
3432 fd = spawn_getent("initgroups", arg_user, &pid);
3437 f = fdopen(fd, "r");
3442 if (!fgets(line, sizeof(line), f)) {
3444 log_error("Failed to resolve user %s.", arg_user);
3448 log_error_errno(errno, "Failed to read from getent: %m");
3454 wait_for_terminate_and_warn("getent initgroups", pid, true);
3456 /* Skip over the username and subsequent separator whitespace */
3458 x += strcspn(x, WHITESPACE);
3459 x += strspn(x, WHITESPACE);
3461 FOREACH_WORD(word, l, x, state) {
     /* Grow the ID array by one slot before parsing the next word into it. */
3467 if (!GREEDY_REALLOC(uids, sz, n_uids+1))
3470 r = parse_uid(c, &uids[n_uids++]);
3472 log_error("Failed to parse group data from getent.");
3477 r = mkdir_parents(home, 0775);
3479 return log_error_errno(r, "Failed to make home root directory: %m");
     /* An already existing home directory is not treated as an error. */
3481 r = mkdir_safe(home, 0755, uid, gid);
3482 if (r < 0 && r != -EEXIST)
3483 return log_error_errno(r, "Failed to make home directory: %m");
     /* Best effort: the results of handing over the standard descriptors
      * are ignored. */
3485 (void) fchown(STDIN_FILENO, uid, gid);
3486 (void) fchown(STDOUT_FILENO, uid, gid);
3487 (void) fchown(STDERR_FILENO, uid, gid);
     /* Groups first, then GID, then UID. */
3489 if (setgroups(n_uids, uids) < 0)
3490 return log_error_errno(errno, "Failed to set auxiliary groups: %m");
3492 if (setresgid(gid, gid, gid) < 0)
3493 return log_error_errno(errno, "setresgid() failed: %m");
3495 if (setresuid(uid, uid, uid) < 0)
3496 return log_error_errno(errno, "setresuid() failed: %m");
3508 * < 0 : wait_for_terminate() failed to get the state of the
3509 * container, the container was terminated by a signal, or
3510 * failed for an unknown reason. No change is made to the
3511 * container argument.
3512 * > 0 : The program executed in the container terminated with an
3513 * error. The exit code of the program executed in the
3514 * container is returned. The container argument has been set
3515 * to CONTAINER_TERMINATED.
3516 * 0 : The container is being rebooted, has been shut down or exited
3517 * successfully. The container argument has been set to either
3518 * CONTAINER_TERMINATED or CONTAINER_REBOOTED.
3520 * That is, success is indicated by a return value of zero, and an
3521 * error is indicated by a non-zero value.
/* Wait for the container process `pid` to terminate, log how it ended and
 * record the outcome in `*container`. The return value contract is spelled
 * out in the comment that precedes this function. */
3523 static int wait_for_container(pid_t pid, ContainerStatus *container) {
3527 r = wait_for_terminate(pid, &status);
3529 return log_warning_errno(r, "Failed to wait for container: %m");
3531 switch (status.si_code) {
     /* Exit status path: zero and non-zero are logged differently (at debug
      * level when arg_quiet is set), the container is marked as terminated
      * and the status itself is returned. */
3534 if (status.si_status == 0) {
3535 log_full(arg_quiet ? LOG_DEBUG : LOG_INFO, "Container %s exited successfully.", arg_machine);
3538 log_full(arg_quiet ? LOG_DEBUG : LOG_INFO, "Container %s failed with error code %i.", arg_machine, status.si_status);
3540 *container = CONTAINER_TERMINATED;
3541 return status.si_status;
     /* Signal path: SIGINT is reported as a shutdown and SIGHUP as a reboot
      * request; other signals reach the error message further down. */
3544 if (status.si_status == SIGINT) {
3546 log_full(arg_quiet ? LOG_DEBUG : LOG_INFO, "Container %s has been shut down.", arg_machine);
3547 *container = CONTAINER_TERMINATED;
3550 } else if (status.si_status == SIGHUP) {
3552 log_full(arg_quiet ? LOG_DEBUG : LOG_INFO, "Container %s is being rebooted.", arg_machine);
3553 *container = CONTAINER_REBOOTED;
3557 /* CLD_KILLED fallthrough */
3560 log_error("Container %s terminated by signal %s.", arg_machine, signal_to_string(status.si_status));
3564 log_error("Container %s failed due to unknown reason.", arg_machine);
/* Signal handler that intentionally does nothing. main() installs it for
 * SIGCHLD so that the signal interrupts blocking calls in the parent, as
 * explained in the comment next to the sigaction() call there. */
3571 static void nop_handler(int sig) {}
/* sd-event signal callback that main() registers for SIGINT and SIGTERM when
 * arg_kill_signal is set. It sends arg_kill_signal to the PID stored in
 * `userdata`; when that succeeds it logs a hint and clears the stored PID
 * from the event source. sd_event_exit() ends the event loop with code 0.
 * NOTE(review): the branches connecting these steps are on lines not visible
 * in this excerpt, so the exact condition under which the loop is left
 * cannot be confirmed here. */
3573 static int on_orderly_shutdown(sd_event_source *s, const struct signalfd_siginfo *si, void *userdata) {
     /* The PID travels through the userdata pointer as an integer. */
3576 pid = PTR_TO_UINT32(userdata);
3578 if (kill(pid, arg_kill_signal) >= 0) {
3579 log_info("Trying to halt container. Send SIGTERM again to trigger immediate termination.");
3580 sd_event_source_set_userdata(s, NULL);
3585 sd_event_exit(sd_event_source_get_event(s), 0);
/* Fill in whichever of arg_image, arg_directory and arg_machine were not
 * supplied on the command line.
 *
 * When neither an image nor a directory is set, the image belonging to
 * arg_machine is looked up with image_find(): a raw image is stored in
 * arg_image, any other type in arg_directory, and a read-only image turns on
 * arg_read_only. Another branch falls back to the current working directory.
 *
 * A missing machine name is taken from the host name when the directory is
 * the root directory, otherwise from the base name of the image or directory;
 * it is then cleaned up and validated. Ephemeral machines get a random 64 bit
 * hexadecimal suffix appended to the name.
 *
 * Failures are logged. The message for a machine without an image carries no
 * errno text, because no errno value is available at that point. */
3589 static int determine_names(void) {
3592 if (!arg_image && !arg_directory) {
3594 _cleanup_(image_unrefp) Image *i = NULL;
3596 r = image_find(arg_machine, &i);
3598 return log_error_errno(r, "Failed to find image for machine '%s': %m", arg_machine);
     /* No %m here: log_error() is not given an error code on this path. */
3600 log_error("No image for machine '%s'.", arg_machine);
3604 if (i->type == IMAGE_RAW)
3605 r = set_sanitized_path(&arg_image, i->path);
3607 r = set_sanitized_path(&arg_directory, i->path);
3609 return log_error_errno(r, "Invalid image directory: %m");
     /* A read-only image forces read-only operation. */
3611 arg_read_only = arg_read_only || i->read_only;
3613 arg_directory = get_current_dir_name();
3615 if (!arg_directory && !arg_machine) {
3616 log_error("Failed to determine path, please use -D or -i.");
3622 if (arg_directory && path_equal(arg_directory, "/"))
3623 arg_machine = gethostname_malloc();
3625 arg_machine = strdup(basename(arg_image ?: arg_directory));
3630 hostname_cleanup(arg_machine, false);
3631 if (!machine_name_is_valid(arg_machine)) {
3632 log_error("Failed to determine machine name automatically, please use -M.");
3636 if (arg_ephemeral) {
3639 /* Add a random suffix when this is an
3640 * ephemeral machine, so that we can run many
3641 * instances at once without manually having
3642 * to specify -M each time. */
3644 if (asprintf(&b, "%s-%016" PRIx64, arg_machine, random_u64()) < 0)
/* Work out the UID/GID base and range used for user namespacing.
 *
 * When no base was configured (arg_uid_shift is UID_INVALID), the owner of
 * arg_directory is read with stat(): the upper 16 bits of its UID become the
 * base, the GID must have the same upper 16 bits, and the range is set to
 * 0x10000 IDs. A base that leaves no room for the range below the largest
 * uid_t value is rejected. The resulting base and range are logged.
 * NOTE(review): guards that may precede these steps, and the return values,
 * are on lines not visible in this excerpt. */
3655 static int determine_uid_shift(void) {
3661 if (arg_uid_shift == UID_INVALID) {
3664 r = stat(arg_directory, &st);
3666 return log_error_errno(errno, "Failed to determine UID base of %s: %m", arg_directory);
     /* Keep only the upper 16 bits of the owner UID as the base. */
3668 arg_uid_shift = st.st_uid & UINT32_C(0xffff0000);
3670 if (arg_uid_shift != (st.st_gid & UINT32_C(0xffff0000))) {
3671 log_error("UID and GID base of %s don't match.", arg_directory);
3675 arg_uid_range = UINT32_C(0x10000);
     /* Overflow check: base plus range must not exceed the largest uid_t. */
3678 if (arg_uid_shift > (uid_t) -1 - arg_uid_range) {
3679 log_error("UID base too high for UID range.");
3683 log_info("Using user namespaces with base " UID_FMT " and range " UID_FMT ".", arg_uid_shift, arg_uid_range);
3687 int main(int argc, char *argv[]) {
3689 _cleanup_free_ char *device_path = NULL, *root_device = NULL, *home_device = NULL, *srv_device = NULL, *console = NULL;
3690 bool root_device_rw = true, home_device_rw = true, srv_device_rw = true;
3691 _cleanup_close_ int master = -1, image_fd = -1;
3692 _cleanup_fdset_free_ FDSet *fds = NULL;
3693 int r, n_fd_passed, loop_nr = -1;
3694 char veth_name[IFNAMSIZ];
3695 bool secondary = false, remove_subvol = false;
3696 sigset_t mask, mask_chld;
3698 int ret = EXIT_SUCCESS;
3699 union in_addr_union exposed = {};
3700 _cleanup_release_lock_file_ LockFile tree_global_lock = LOCK_FILE_INIT, tree_local_lock = LOCK_FILE_INIT;
3703 log_parse_environment();
3706 r = parse_argv(argc, argv);
3710 r = determine_names();
3714 if (geteuid() != 0) {
3715 log_error("Need to be root.");
3720 if (sd_booted() <= 0) {
3721 log_error("Not running on a systemd system.");
3727 n_fd_passed = sd_listen_fds(false);
3728 if (n_fd_passed > 0) {
3729 r = fdset_new_listen_fds(&fds, false);
3731 log_error_errno(r, "Failed to collect file descriptors: %m");
3735 fdset_close_others(fds);
3738 if (arg_directory) {
3741 if (path_equal(arg_directory, "/") && !arg_ephemeral) {
3742 log_error("Spawning container on root directory is not supported. Consider using --ephemeral.");
3747 if (arg_ephemeral) {
3750 /* If the specified path is a mount point we
3751 * generate the new snapshot immediately
3752 * inside it under a random name. However if
3753 * the specified path is not a mount point we
3754 * create the new snapshot in the parent
3755 * directory, just next to it. */
3756 r = path_is_mount_point(arg_directory, false);
3758 log_error_errno(r, "Failed to determine whether directory %s is mount point: %m", arg_directory);
3762 r = tempfn_random_child(arg_directory, &np);
3764 r = tempfn_random(arg_directory, &np);
3766 log_error_errno(r, "Failed to generate name for snapshot: %m");
3770 r = image_path_lock(np, (arg_read_only ? LOCK_SH : LOCK_EX) | LOCK_NB, &tree_global_lock, &tree_local_lock);
3772 log_error_errno(r, "Failed to lock %s: %m", np);
3776 r = btrfs_subvol_snapshot(arg_directory, np, arg_read_only, true);
3779 log_error_errno(r, "Failed to create snapshot %s from %s: %m", np, arg_directory);
3783 free(arg_directory);
3786 remove_subvol = true;
3789 r = image_path_lock(arg_directory, (arg_read_only ? LOCK_SH : LOCK_EX) | LOCK_NB, &tree_global_lock, &tree_local_lock);
3791 log_error_errno(r, "Directory tree %s is currently busy.", arg_directory);
3795 log_error_errno(r, "Failed to lock %s: %m", arg_directory);
3800 r = btrfs_subvol_snapshot(arg_template, arg_directory, arg_read_only, true);
3803 log_info("Directory %s already exists, not populating from template %s.", arg_directory, arg_template);
3805 log_error_errno(r, "Couldn't create snapshot %s from %s: %m", arg_directory, arg_template);
3809 log_info("Populated %s from template %s.", arg_directory, arg_template);
3815 if (path_is_os_tree(arg_directory) <= 0) {
3816 log_error("Directory %s doesn't look like an OS root directory (os-release file is missing). Refusing.", arg_directory);
3823 p = strjoina(arg_directory,
3824 argc > optind && path_is_absolute(argv[optind]) ? argv[optind] : "/usr/bin/");
3825 if (access(p, F_OK) < 0) {
3826 log_error("Directory %s lacks the binary to execute or doesn't look like a binary tree. Refusing.", arg_directory);
3833 char template[] = "/tmp/nspawn-root-XXXXXX";
3836 assert(!arg_template);
3838 r = image_path_lock(arg_image, (arg_read_only ? LOCK_SH : LOCK_EX) | LOCK_NB, &tree_global_lock, &tree_local_lock);
3840 r = log_error_errno(r, "Disk image %s is currently busy.", arg_image);
3844 r = log_error_errno(r, "Failed to create image lock: %m");
3848 if (!mkdtemp(template)) {
3849 log_error_errno(errno, "Failed to create temporary directory: %m");
3854 arg_directory = strdup(template);
3855 if (!arg_directory) {
3860 image_fd = setup_image(&device_path, &loop_nr);
3866 r = dissect_image(image_fd,
3867 &root_device, &root_device_rw,
3868 &home_device, &home_device_rw,
3869 &srv_device, &srv_device_rw,
3875 r = determine_uid_shift();
3879 interactive = isatty(STDIN_FILENO) > 0 && isatty(STDOUT_FILENO) > 0;
3881 master = posix_openpt(O_RDWR|O_NOCTTY|O_CLOEXEC|O_NDELAY);
3883 r = log_error_errno(errno, "Failed to acquire pseudo tty: %m");
3887 r = ptsname_malloc(master, &console);
3889 r = log_error_errno(r, "Failed to determine tty name: %m");
3893 if (unlockpt(master) < 0) {
3894 r = log_error_errno(errno, "Failed to unlock tty: %m");
3899 log_info("Spawning container %s on %s.\nPress ^] three times within 1s to kill container.",
3900 arg_machine, arg_image ?: arg_directory);
3902 assert_se(sigemptyset(&mask) == 0);
3903 sigset_add_many(&mask, SIGCHLD, SIGWINCH, SIGTERM, SIGINT, -1);
3904 assert_se(sigprocmask(SIG_BLOCK, &mask, NULL) == 0);
3906 assert_se(sigemptyset(&mask_chld) == 0);
3907 assert_se(sigaddset(&mask_chld, SIGCHLD) == 0);
3910 _cleanup_close_pair_ int kmsg_socket_pair[2] = { -1, -1 }, rtnl_socket_pair[2] = { -1, -1 };
3911 ContainerStatus container_status;
3912 _cleanup_(barrier_destroy) Barrier barrier = BARRIER_NULL;
3913 struct sigaction sa = {
3914 .sa_handler = nop_handler,
3915 .sa_flags = SA_NOCLDSTOP,
3918 r = barrier_create(&barrier);
3920 log_error_errno(r, "Cannot initialize IPC barrier: %m");
3924 if (socketpair(AF_UNIX, SOCK_DGRAM|SOCK_CLOEXEC, 0, kmsg_socket_pair) < 0) {
3925 r = log_error_errno(errno, "Failed to create kmsg socket pair: %m");
3929 if (socketpair(AF_UNIX, SOCK_DGRAM|SOCK_CLOEXEC, 0, rtnl_socket_pair) < 0) {
3930 r = log_error_errno(errno, "Failed to create rtnl socket pair: %m");
3934 /* Child can be killed before execv(), so handle SIGCHLD
3935 * in order to interrupt parent's blocking calls and
3936 * give it a chance to call wait() and terminate. */
3937 r = sigprocmask(SIG_UNBLOCK, &mask_chld, NULL);
3939 r = log_error_errno(errno, "Failed to change the signal mask: %m");
3943 r = sigaction(SIGCHLD, &sa, NULL);
3945 r = log_error_errno(errno, "Failed to install SIGCHLD handler: %m");
3949 pid = raw_clone(SIGCHLD|CLONE_NEWNS|
3950 (arg_share_system ? 0 : CLONE_NEWIPC|CLONE_NEWPID|CLONE_NEWUTS)|
3951 (arg_private_network ? CLONE_NEWNET : 0), NULL);
3953 if (errno == EINVAL)
3954 r = log_error_errno(errno, "clone() failed, do you have namespace support enabled in your kernel? (You need UTS, IPC, PID and NET namespacing built in): %m");
3956 r = log_error_errno(errno, "clone() failed: %m");
3963 _cleanup_free_ char *home = NULL;
3965 const char *envp[] = {
3966 "PATH=" DEFAULT_PATH_SPLIT_USR,
3967 "container=systemd-nspawn", /* LXC sets container=lxc, so follow the scheme here */
3972 NULL, /* container_uuid */
3973 NULL, /* LISTEN_FDS */
3974 NULL, /* LISTEN_PID */
3979 barrier_set_role(&barrier, BARRIER_CHILD);
3981 envp[n_env] = strv_find_prefix(environ, "TERM=");
3985 master = safe_close(master);
3987 kmsg_socket_pair[0] = safe_close(kmsg_socket_pair[0]);
3988 rtnl_socket_pair[0] = safe_close(rtnl_socket_pair[0]);
3990 reset_all_signal_handlers();
3991 reset_signal_mask();
3994 close_nointr(STDIN_FILENO);
3995 close_nointr(STDOUT_FILENO);
3996 close_nointr(STDERR_FILENO);
3998 r = open_terminal(console, O_RDWR);
3999 if (r != STDIN_FILENO) {
4005 log_error_errno(r, "Failed to open console: %m");
4006 _exit(EXIT_FAILURE);
4009 if (dup2(STDIN_FILENO, STDOUT_FILENO) != STDOUT_FILENO ||
4010 dup2(STDIN_FILENO, STDERR_FILENO) != STDERR_FILENO) {
4011 log_error_errno(errno, "Failed to duplicate console: %m");
4012 _exit(EXIT_FAILURE);
4017 log_error_errno(errno, "setsid() failed: %m");
4018 _exit(EXIT_FAILURE);
4021 if (reset_audit_loginuid() < 0)
4022 _exit(EXIT_FAILURE);
4024 if (prctl(PR_SET_PDEATHSIG, SIGKILL) < 0) {
4025 log_error_errno(errno, "PR_SET_PDEATHSIG failed: %m");
4026 _exit(EXIT_FAILURE);
4029 if (arg_private_network)
4032 /* Mark everything as slave, so that we still
4033 * receive mounts from the real root, but don't
4034 * propagate mounts to the real root. */
4035 if (mount(NULL, "/", NULL, MS_SLAVE|MS_REC, NULL) < 0) {
4036 log_error_errno(errno, "MS_SLAVE|MS_REC failed: %m");
4037 _exit(EXIT_FAILURE);
4040 if (mount_devices(arg_directory,
4041 root_device, root_device_rw,
4042 home_device, home_device_rw,
4043 srv_device, srv_device_rw) < 0)
4044 _exit(EXIT_FAILURE);
4046 /* Turn directory into bind mount */
4047 if (mount(arg_directory, arg_directory, "bind", MS_BIND|MS_REC, NULL) < 0) {
4048 log_error_errno(errno, "Failed to make bind mount: %m");
4049 _exit(EXIT_FAILURE);
4052 r = setup_volatile(arg_directory);
4054 _exit(EXIT_FAILURE);
4056 if (setup_volatile_state(arg_directory) < 0)
4057 _exit(EXIT_FAILURE);
4059 r = base_filesystem_create(arg_directory);
4061 _exit(EXIT_FAILURE);
4063 if (arg_read_only) {
4064 r = bind_remount_recursive(arg_directory, true);
4066 log_error_errno(r, "Failed to make tree read-only: %m");
4067 _exit(EXIT_FAILURE);
4071 if (mount_all(arg_directory) < 0)
4072 _exit(EXIT_FAILURE);
4074 if (copy_devnodes(arg_directory) < 0)
4075 _exit(EXIT_FAILURE);
4077 if (setup_ptmx(arg_directory) < 0)
4078 _exit(EXIT_FAILURE);
4080 dev_setup(arg_directory);
4082 if (setup_propagate(arg_directory) < 0)
4083 _exit(EXIT_FAILURE);
4085 if (setup_seccomp() < 0)
4086 _exit(EXIT_FAILURE);
4088 if (setup_dev_console(arg_directory, console) < 0)
4089 _exit(EXIT_FAILURE);
4091 if (setup_kmsg(arg_directory, kmsg_socket_pair[1]) < 0)
4092 _exit(EXIT_FAILURE);
4093 kmsg_socket_pair[1] = safe_close(kmsg_socket_pair[1]);
4095 if (send_rtnl(rtnl_socket_pair[1]) < 0)
4096 _exit(EXIT_FAILURE);
4097 rtnl_socket_pair[1] = safe_close(rtnl_socket_pair[1]);
4099 /* Tell the parent that we are ready, and that
4100 * it can cgroupify us so that we lack access
4101 * to certain devices and resources. */
4102 (void) barrier_place(&barrier); /* #1 */
4104 if (setup_boot_id(arg_directory) < 0)
4105 _exit(EXIT_FAILURE);
4107 if (setup_timezone(arg_directory) < 0)
4108 _exit(EXIT_FAILURE);
4110 if (setup_resolv_conf(arg_directory) < 0)
4111 _exit(EXIT_FAILURE);
4113 if (setup_journal(arg_directory) < 0)
4114 _exit(EXIT_FAILURE);
4116 if (mount_binds(arg_directory, arg_bind, false) < 0)
4117 _exit(EXIT_FAILURE);
4119 if (mount_binds(arg_directory, arg_bind_ro, true) < 0)
4120 _exit(EXIT_FAILURE);
4122 if (mount_tmpfs(arg_directory) < 0)
4123 _exit(EXIT_FAILURE);
4125 /* Wait until we are cgroup-ified, so that we
4126 * can mount the right cgroup path writable */
4127 (void) barrier_place_and_sync(&barrier); /* #2 */
4129 if (mount_cgroup(arg_directory) < 0)
4130 _exit(EXIT_FAILURE);
4132 if (chdir(arg_directory) < 0) {
4133 log_error_errno(errno, "chdir(%s) failed: %m", arg_directory);
4134 _exit(EXIT_FAILURE);
4137 if (mount(arg_directory, "/", NULL, MS_MOVE, NULL) < 0) {
4138 log_error_errno(errno, "mount(MS_MOVE) failed: %m");
4139 _exit(EXIT_FAILURE);
4142 if (chroot(".") < 0) {
4143 log_error_errno(errno, "chroot() failed: %m");
4144 _exit(EXIT_FAILURE);
4147 if (chdir("/") < 0) {
4148 log_error_errno(errno, "chdir() failed: %m");
4149 _exit(EXIT_FAILURE);
4153 if (unshare(CLONE_NEWUSER) < 0) {
4154 log_error_errno(errno, "unshare(CLONE_NEWUSER) failed: %m");
4155 _exit(EXIT_FAILURE);
4158 /* Tell the parent, that it now can
4159 * write the UID map. */
4160 (void) barrier_place(&barrier); /* #3 */
4162 /* Wait until the parent wrote the UID
4164 (void) barrier_place_and_sync(&barrier); /* #4 */
4169 if (drop_capabilities() < 0) {
4170 log_error_errno(errno, "drop_capabilities() failed: %m");
4171 _exit(EXIT_FAILURE);
4176 if (arg_personality != 0xffffffffLU) {
4177 if (personality(arg_personality) < 0) {
4178 log_error_errno(errno, "personality() failed: %m");
4179 _exit(EXIT_FAILURE);
4181 } else if (secondary) {
4182 if (personality(PER_LINUX32) < 0) {
4183 log_error_errno(errno, "personality() failed: %m");
4184 _exit(EXIT_FAILURE);
4189 if (arg_selinux_context)
4190 if (setexeccon((security_context_t) arg_selinux_context) < 0) {
4191 log_error_errno(errno, "setexeccon(\"%s\") failed: %m", arg_selinux_context);
4192 _exit(EXIT_FAILURE);
4196 r = change_uid_gid(&home);
4198 _exit(EXIT_FAILURE);
4200 if ((asprintf((char**)(envp + n_env++), "HOME=%s", home ? home: "/root") < 0) ||
4201 (asprintf((char**)(envp + n_env++), "USER=%s", arg_user ? arg_user : "root") < 0) ||
4202 (asprintf((char**)(envp + n_env++), "LOGNAME=%s", arg_user ? arg_user : "root") < 0)) {
4204 _exit(EXIT_FAILURE);
4207 if (!sd_id128_equal(arg_uuid, SD_ID128_NULL)) {
4210 if (asprintf((char**)(envp + n_env++), "container_uuid=%s", id128_format_as_uuid(arg_uuid, as_uuid)) < 0) {
4212 _exit(EXIT_FAILURE);
4216 if (fdset_size(fds) > 0) {
4217 r = fdset_cloexec(fds, false);
4219 log_error_errno(r, "Failed to unset O_CLOEXEC for file descriptors.");
4220 _exit(EXIT_FAILURE);
4223 if ((asprintf((char **)(envp + n_env++), "LISTEN_FDS=%u", n_fd_passed) < 0) ||
4224 (asprintf((char **)(envp + n_env++), "LISTEN_PID=1") < 0)) {
4226 _exit(EXIT_FAILURE);
4230 if (!strv_isempty(arg_setenv)) {
4233 n = strv_env_merge(2, envp, arg_setenv);
4236 _exit(EXIT_FAILURE);
4241 env_use = (char**) envp;
4243 /* Let the parent know that we are ready and
4244 * wait until the parent is ready with the
4246 (void) barrier_place_and_sync(&barrier); /* #5 */
4252 /* Automatically search for the init system */
4254 l = 1 + argc - optind;
4255 a = newa(char*, l + 1);
4256 memcpy(a + 1, argv + optind, l * sizeof(char*));
4258 a[0] = (char*) "/usr/lib/systemd/systemd";
4259 execve(a[0], a, env_use);
4261 a[0] = (char*) "/lib/systemd/systemd";
4262 execve(a[0], a, env_use);
4264 a[0] = (char*) "/sbin/init";
4265 execve(a[0], a, env_use);
4266 } else if (argc > optind)
4267 execvpe(argv[optind], argv + optind, env_use);
4269 chdir(home ? home : "/root");
4270 execle("/bin/bash", "-bash", NULL, env_use);
4271 execle("/bin/sh", "-sh", NULL, env_use);
4274 log_error_errno(errno, "execv() failed: %m");
4275 _exit(EXIT_FAILURE);
4278 barrier_set_role(&barrier, BARRIER_PARENT);
4282 kmsg_socket_pair[1] = safe_close(kmsg_socket_pair[1]);
4283 rtnl_socket_pair[1] = safe_close(rtnl_socket_pair[1]);
4285 (void) barrier_place(&barrier); /* #1 */
4287 /* Wait for the most basic Child-setup to be done,
4288 * before we add hardware to it, and place it in a
4290 if (barrier_sync(&barrier)) { /* #1 */
4293 r = move_network_interfaces(pid);
4297 r = setup_veth(pid, veth_name, &ifi);
4301 r = setup_bridge(veth_name, &ifi);
4305 r = setup_macvlan(pid);
4309 r = setup_ipvlan(pid);
4313 r = register_machine(pid, ifi);
4317 /* Notify the child that the parent is ready with all
4318 * its setup, and that the child can now hand over
4319 * control to the code to run inside the container. */
4320 (void) barrier_place(&barrier); /* #2 */
4323 char uid_map[strlen("/proc//uid_map") + DECIMAL_STR_MAX(uid_t) + 1], line[DECIMAL_STR_MAX(uid_t)*3+3+1];
4325 (void) barrier_place_and_sync(&barrier); /* #3 */
4327 xsprintf(uid_map, "/proc/" PID_FMT "/uid_map", pid);
4328 xsprintf(line, UID_FMT " " UID_FMT " " UID_FMT "\n", 0, arg_uid_shift, arg_uid_range);
4329 r = write_string_file(uid_map, line);
4331 log_error_errno(r, "Failed to write UID map: %m");
4335 /* We always assign the same UID and GID ranges */
4336 xsprintf(uid_map, "/proc/" PID_FMT "/gid_map", pid);
4337 r = write_string_file(uid_map, line);
4339 log_error_errno(r, "Failed to write GID map: %m");
4343 (void) barrier_place(&barrier); /* #4 */
4346 /* Block SIGCHLD here, before notifying child.
4347 * process_pty() will handle it with the other signals. */
4348 r = sigprocmask(SIG_BLOCK, &mask_chld, NULL);
4352 /* Reset signal to default */
4353 r = default_signals(SIGCHLD, -1);
4357 /* Let the child know that we are ready and wait that the child is completely ready now. */
4358 if (barrier_place_and_sync(&barrier)) { /* #5 */
4359 _cleanup_event_unref_ sd_event *event = NULL;
4360 _cleanup_(pty_forward_freep) PTYForward *forward = NULL;
4361 _cleanup_rtnl_unref_ sd_rtnl *rtnl = NULL;
4366 "STATUS=Container running.\n"
4367 "X_NSPAWN_LEADER_PID=" PID_FMT, pid);
4369 r = sd_event_new(&event);
4371 log_error_errno(r, "Failed to get default event source: %m");
4375 if (arg_kill_signal > 0) {
4376 /* Try to kill the init system on SIGINT or SIGTERM */
4377 sd_event_add_signal(event, NULL, SIGINT, on_orderly_shutdown, UINT32_TO_PTR(pid));
4378 sd_event_add_signal(event, NULL, SIGTERM, on_orderly_shutdown, UINT32_TO_PTR(pid));
4380 /* Immediately exit */
4381 sd_event_add_signal(event, NULL, SIGINT, NULL, NULL);
4382 sd_event_add_signal(event, NULL, SIGTERM, NULL, NULL);
4385 /* simply exit on sigchld */
4386 sd_event_add_signal(event, NULL, SIGCHLD, NULL, NULL);
4388 if (arg_expose_ports) {
4389 r = watch_rtnl(event, rtnl_socket_pair[0], &exposed, &rtnl);
4393 (void) expose_ports(rtnl, &exposed);
4396 rtnl_socket_pair[0] = safe_close(rtnl_socket_pair[0]);
4398 r = pty_forward_new(event, master, true, !interactive, &forward);
4400 log_error_errno(r, "Failed to create PTY forwarder: %m");
4404 r = sd_event_loop(event);
4406 log_error_errno(r, "Failed to run event loop: %m");
4410 pty_forward_get_last_char(forward, &last_char);
4412 forward = pty_forward_free(forward);
4414 if (!arg_quiet && last_char != '\n')
4417 /* Kill if it is not dead yet anyway */
4418 terminate_machine(pid);
4422 /* Normally redundant, but better safe than sorry */
4425 r = wait_for_container(pid, &container_status);
4429 /* We failed to wait for the container, or the
4430 * container exited abnormally */
4432 else if (r > 0 || container_status == CONTAINER_TERMINATED){
4433 /* The container exited with a non-zero
4434 * status, or with zero status and no reboot
4440 /* CONTAINER_REBOOTED, loop again */
4442 if (arg_keep_unit) {
4443 /* Special handling if we are running as a
4444 * service: instead of simply restarting the
4445 * machine we want to restart the entire
4446 * service, so let's inform systemd about this
4447 * with the special exit code 133. The service
4448 * file uses RestartForceExitStatus=133 so
4449 * that this results in a full nspawn
4450 * restart. This is necessary since we might
4451 * have cgroup parameters set we want to have
4458 flush_ports(&exposed);
4464 "STATUS=Terminating...");
4466 loop_remove(loop_nr, &image_fd);
4471 if (remove_subvol && arg_directory) {
4474 k = btrfs_subvol_remove(arg_directory);
4476 log_warning_errno(k, "Cannot remove subvolume '%s', ignoring: %m", arg_directory);
4482 p = strjoina("/run/systemd/nspawn/propagate/", arg_machine);
4483 (void) rm_rf(p, false, true, false);
4486 free(arg_directory);
4491 strv_free(arg_setenv);
4492 strv_free(arg_network_interfaces);
4493 strv_free(arg_network_macvlan);
4494 strv_free(arg_network_ipvlan);
4495 strv_free(arg_bind);
4496 strv_free(arg_bind_ro);
4497 strv_free(arg_tmpfs);
4499 flush_ports(&exposed);
4501 while (arg_expose_ports) {
4502 ExposePort *p = arg_expose_ports;
4503 LIST_REMOVE(ports, arg_expose_ports, p);
4507 return r < 0 ? EXIT_FAILURE : ret;