1 /*-*- Mode: C; c-basic-offset: 8; indent-tabs-mode: nil -*-*/
4 This file is part of systemd.
6 Copyright 2010 Lennart Poettering
8 systemd is free software; you can redistribute it and/or modify it
9 under the terms of the GNU Lesser General Public License as published by
10 the Free Software Foundation; either version 2.1 of the License, or
11 (at your option) any later version.
13 systemd is distributed in the hope that it will be useful, but
14 WITHOUT ANY WARRANTY; without even the implied warranty of
15 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
16 Lesser General Public License for more details.
18 You should have received a copy of the GNU Lesser General Public License
19 along with systemd; If not, see <http://www.gnu.org/licenses/>.
25 #include <sys/types.h>
26 #include <sys/mount.h>
31 #include <sys/prctl.h>
35 #include <sys/socket.h>
36 #include <linux/netlink.h>
38 #include <linux/veth.h>
39 #include <sys/personality.h>
40 #include <linux/loop.h>
44 #include <selinux/selinux.h>
52 #include <blkid/blkid.h>
55 #include "sd-daemon.h"
64 #include "cgroup-util.h"
66 #include "path-util.h"
67 #include "loopback-setup.h"
68 #include "dev-setup.h"
73 #include "bus-error.h"
76 #include "rtnl-util.h"
77 #include "udev-util.h"
78 #include "blkid-util.h"
80 #include "siphash24.h"
82 #include "base-filesystem.h"
84 #include "event-util.h"
85 #include "capability.h"
87 #include "btrfs-util.h"
88 #include "machine-image.h"
90 #include "in-addr-util.h"
92 #include "local-addresses.h"
95 #include "seccomp-util.h"
/* One --port= mapping. parse_argv() fills in protocol, host_port and
 * container_port and links each entry into the arg_expose_ports list via
 * the 'ports' list fields. */
98 typedef struct ExposePort {
101 uint16_t container_port;
102 LIST_FIELDS(struct ExposePort, ports);
/* Status of the container; only CONTAINER_TERMINATED is visible in this
 * excerpt. */
105 typedef enum ContainerStatus {
106 CONTAINER_TERMINATED,
/* Journal linking mode selected by --link-journal= / -j. parse_argv()
 * assigns LINK_AUTO, LINK_NO, LINK_GUEST or LINK_HOST. */
110 typedef enum LinkJournal {
/* Volatile mode selected by --volatile=. parse_argv() assigns VOLATILE_NO,
 * VOLATILE_YES or VOLATILE_STATE. */
117 typedef enum Volatile {
/* Command-line configuration. The initializers below are the defaults;
 * parse_argv() overwrites them according to the options it is given. */
123 static char *arg_directory = NULL;
124 static char *arg_template = NULL;
125 static char *arg_user = NULL;
126 static sd_id128_t arg_uuid = {};
127 static char *arg_machine = NULL;
128 static const char *arg_selinux_context = NULL;
129 static const char *arg_selinux_apifs_context = NULL;
130 static const char *arg_slice = NULL;
131 static bool arg_private_network = false;
132 static bool arg_read_only = false;
133 static bool arg_boot = false;
134 static bool arg_ephemeral = false;
135 static LinkJournal arg_link_journal = LINK_AUTO;
136 static bool arg_link_journal_try = false;
/* Capability bitmask, one bit per CAP_* number. At the end of parse_argv()
 * the --capability= bits (and CAP_NET_ADMIN when private networking is
 * enabled) are ORed in and the --drop-capability= bits are cleared. */
137 static uint64_t arg_retain =
138 (1ULL << CAP_CHOWN) |
139 (1ULL << CAP_DAC_OVERRIDE) |
140 (1ULL << CAP_DAC_READ_SEARCH) |
141 (1ULL << CAP_FOWNER) |
142 (1ULL << CAP_FSETID) |
143 (1ULL << CAP_IPC_OWNER) |
145 (1ULL << CAP_LEASE) |
146 (1ULL << CAP_LINUX_IMMUTABLE) |
147 (1ULL << CAP_NET_BIND_SERVICE) |
148 (1ULL << CAP_NET_BROADCAST) |
149 (1ULL << CAP_NET_RAW) |
150 (1ULL << CAP_SETGID) |
151 (1ULL << CAP_SETFCAP) |
152 (1ULL << CAP_SETPCAP) |
153 (1ULL << CAP_SETUID) |
154 (1ULL << CAP_SYS_ADMIN) |
155 (1ULL << CAP_SYS_CHROOT) |
156 (1ULL << CAP_SYS_NICE) |
157 (1ULL << CAP_SYS_PTRACE) |
158 (1ULL << CAP_SYS_TTY_CONFIG) |
159 (1ULL << CAP_SYS_RESOURCE) |
160 (1ULL << CAP_SYS_BOOT) |
161 (1ULL << CAP_AUDIT_WRITE) |
162 (1ULL << CAP_AUDIT_CONTROL) |
/* arg_bind, arg_bind_ro and arg_tmpfs are filled pairwise by parse_argv()
 * (two consecutive entries per option) and iterated with STRV_FOREACH_PAIR. */
164 static char **arg_bind = NULL;
165 static char **arg_bind_ro = NULL;
166 static char **arg_tmpfs = NULL;
167 static char **arg_setenv = NULL;
168 static bool arg_quiet = false;
169 static bool arg_share_system = false;
170 static bool arg_register = true;
171 static bool arg_keep_unit = false;
172 static char **arg_network_interfaces = NULL;
173 static char **arg_network_macvlan = NULL;
174 static char **arg_network_ipvlan = NULL;
175 static bool arg_network_veth = false;
176 static const char *arg_network_bridge = NULL;
/* 0xffffffffLU is also the value parse_argv() treats as a failed
 * personality_from_string() lookup. */
177 static unsigned long arg_personality = 0xffffffffLU;
178 static char *arg_image = NULL;
179 static Volatile arg_volatile = VOLATILE_NO;
180 static ExposePort *arg_expose_ports = NULL;
181 static char **arg_property = NULL;
/* Users of arg_uid_shift compare it against UID_INVALID before applying it. */
182 static uid_t arg_uid_shift = UID_INVALID, arg_uid_range = 0x10000U;
183 static bool arg_userns = false;
/* Prints the usage text for the supported options with printf(); the
 * leading %s is filled with program_invocation_short_name. */
185 static void help(void) {
186 printf("%s [OPTIONS...] [PATH] [ARGUMENTS...]\n\n"
187 "Spawn a minimal namespace container for debugging, testing and building.\n\n"
188 " -h --help Show this help\n"
189 " --version Print version string\n"
190 " -q --quiet Do not show status information\n"
191 " -D --directory=PATH Root directory for the container\n"
192 " --template=PATH Initialize root directory from template directory,\n"
194 " -x --ephemeral Run container with snapshot of root directory, and\n"
195 " remove it after exit\n"
196 " -i --image=PATH File system device or disk image for the container\n"
197 " -b --boot Boot up full system (i.e. invoke init)\n"
198 " -u --user=USER Run the command under specified user or uid\n"
199 " -M --machine=NAME Set the machine name for the container\n"
200 " --uuid=UUID Set a specific machine UUID for the container\n"
201 " -S --slice=SLICE Place the container in the specified slice\n"
202 " --property=NAME=VALUE Set scope unit property\n"
203 " --private-network Disable network in container\n"
204 " --network-interface=INTERFACE\n"
205 " Assign an existing network interface to the\n"
207 " --network-macvlan=INTERFACE\n"
208 " Create a macvlan network interface based on an\n"
209 " existing network interface to the container\n"
210 " --network-ipvlan=INTERFACE\n"
211 " Create a ipvlan network interface based on an\n"
212 " existing network interface to the container\n"
213 " -n --network-veth Add a virtual ethernet connection between host\n"
215 " --network-bridge=INTERFACE\n"
216 " Add a virtual ethernet connection between host\n"
217 " and container and add it to an existing bridge on\n"
219 " --private-users[=UIDBASE[:NUIDS]]\n"
220 " Run within user namespace\n"
221 " -p --port=[PROTOCOL:]HOSTPORT[:CONTAINERPORT]\n"
222 " Expose a container IP port on the host\n"
223 " -Z --selinux-context=SECLABEL\n"
224 " Set the SELinux security context to be used by\n"
225 " processes in the container\n"
226 " -L --selinux-apifs-context=SECLABEL\n"
227 " Set the SELinux security context to be used by\n"
228 " API/tmpfs file systems in the container\n"
229 " --capability=CAP In addition to the default, retain specified\n"
231 " --drop-capability=CAP Drop the specified capability from the default set\n"
232 " --link-journal=MODE Link up guest journal, one of no, auto, guest, host,\n"
233 " try-guest, try-host\n"
234 " -j Equivalent to --link-journal=try-guest\n"
235 " --read-only Mount the root directory read-only\n"
236 " --bind=PATH[:PATH] Bind mount a file or directory from the host into\n"
238 " --bind-ro=PATH[:PATH] Similar, but creates a read-only bind mount\n"
239 " --tmpfs=PATH:[OPTIONS] Mount an empty tmpfs to the specified directory\n"
240 " --setenv=NAME=VALUE Pass an environment variable to PID 1\n"
241 " --share-system Share system namespaces with host\n"
242 " --register=BOOLEAN Register container as machine\n"
243 " --keep-unit Do not register a scope for the machine, reuse\n"
244 " the service unit nspawn is running in\n"
245 " --volatile[=MODE] Run the system in volatile mode\n"
246 , program_invocation_short_name);
/* Stores a cleaned-up form of 'path' in *b.
 *
 * From the lines visible here: the path is resolved with
 * canonicalize_file_name(); path_make_absolute_cwd() is also called
 * (presumably as a fallback when resolution fails -- the branch condition
 * is not visible in this excerpt); the result is run through
 * path_kill_slashes() and assigned to *b.
 *
 * Callers in parse_argv() pass the return value to log_error_errno() on
 * failure. */
249 static int set_sanitized_path(char **b, const char *path) {
255 p = canonicalize_file_name(path);
260 p = path_make_absolute_cwd(path);
266 *b = path_kill_slashes(p);
/* Parses the command line with getopt_long() and stores the result in the
 * arg_* globals.
 *
 * After the option loop it rejects option combinations that cannot be used
 * together and computes the final capability mask in arg_retain.
 * The leading '+' in the short-option string makes getopt stop at the first
 * non-option argument. */
270 static int parse_argv(int argc, char *argv[]) {
287 ARG_NETWORK_INTERFACE,
298 static const struct option options[] = {
299 { "help", no_argument, NULL, 'h' },
300 { "version", no_argument, NULL, ARG_VERSION },
301 { "directory", required_argument, NULL, 'D' },
302 { "template", required_argument, NULL, ARG_TEMPLATE },
303 { "ephemeral", no_argument, NULL, 'x' },
304 { "user", required_argument, NULL, 'u' },
305 { "private-network", no_argument, NULL, ARG_PRIVATE_NETWORK },
306 { "boot", no_argument, NULL, 'b' },
307 { "uuid", required_argument, NULL, ARG_UUID },
308 { "read-only", no_argument, NULL, ARG_READ_ONLY },
309 { "capability", required_argument, NULL, ARG_CAPABILITY },
310 { "drop-capability", required_argument, NULL, ARG_DROP_CAPABILITY },
311 { "link-journal", required_argument, NULL, ARG_LINK_JOURNAL },
312 { "bind", required_argument, NULL, ARG_BIND },
313 { "bind-ro", required_argument, NULL, ARG_BIND_RO },
314 { "tmpfs", required_argument, NULL, ARG_TMPFS },
315 { "machine", required_argument, NULL, 'M' },
316 { "slice", required_argument, NULL, 'S' },
317 { "setenv", required_argument, NULL, ARG_SETENV },
318 { "selinux-context", required_argument, NULL, 'Z' },
319 { "selinux-apifs-context", required_argument, NULL, 'L' },
320 { "quiet", no_argument, NULL, 'q' },
321 { "share-system", no_argument, NULL, ARG_SHARE_SYSTEM },
322 { "register", required_argument, NULL, ARG_REGISTER },
323 { "keep-unit", no_argument, NULL, ARG_KEEP_UNIT },
324 { "network-interface", required_argument, NULL, ARG_NETWORK_INTERFACE },
325 { "network-macvlan", required_argument, NULL, ARG_NETWORK_MACVLAN },
326 { "network-ipvlan", required_argument, NULL, ARG_NETWORK_IPVLAN },
327 { "network-veth", no_argument, NULL, 'n' },
328 { "network-bridge", required_argument, NULL, ARG_NETWORK_BRIDGE },
329 { "personality", required_argument, NULL, ARG_PERSONALITY },
330 { "image", required_argument, NULL, 'i' },
331 { "volatile", optional_argument, NULL, ARG_VOLATILE },
332 { "port", required_argument, NULL, 'p' },
333 { "property", required_argument, NULL, ARG_PROPERTY },
334 { "private-users", optional_argument, NULL, ARG_PRIVATE_USERS },
/* Capability bits requested via --capability= (plus) and
 * --drop-capability= (minus); applied to arg_retain after the loop. */
339 uint64_t plus = 0, minus = 0;
344 while ((c = getopt_long(argc, argv, "+hD:u:bL:M:jS:Z:qi:xp:n", options, NULL)) >= 0)
353 puts(PACKAGE_STRING);
354 puts(SYSTEMD_FEATURES);
358 r = set_sanitized_path(&arg_directory, optarg);
360 return log_error_errno(r, "Invalid root directory: %m");
365 r = set_sanitized_path(&arg_template, optarg);
367 return log_error_errno(r, "Invalid template directory: %m");
372 r = set_sanitized_path(&arg_image, optarg);
374 return log_error_errno(r, "Invalid image path: %m");
379 arg_ephemeral = true;
384 arg_user = strdup(optarg);
/* The network options below also switch on arg_private_network. */
390 case ARG_NETWORK_BRIDGE:
391 arg_network_bridge = optarg;
396 arg_network_veth = true;
397 arg_private_network = true;
400 case ARG_NETWORK_INTERFACE:
401 if (strv_extend(&arg_network_interfaces, optarg) < 0)
404 arg_private_network = true;
407 case ARG_NETWORK_MACVLAN:
408 if (strv_extend(&arg_network_macvlan, optarg) < 0)
411 arg_private_network = true;
414 case ARG_NETWORK_IPVLAN:
415 if (strv_extend(&arg_network_ipvlan, optarg) < 0)
420 case ARG_PRIVATE_NETWORK:
421 arg_private_network = true;
429 r = sd_id128_from_string(optarg, &arg_uuid);
431 log_error("Invalid UUID: %s", optarg);
441 if (isempty(optarg)) {
445 if (!machine_name_is_valid(optarg)) {
446 log_error("Invalid machine name: %s", optarg);
450 r = free_and_strdup(&arg_machine, optarg);
/* The SELinux contexts point directly at optarg (no copy is made). */
458 arg_selinux_context = optarg;
462 arg_selinux_apifs_context = optarg;
466 arg_read_only = true;
/* Comma-separated capability list; "all" selects every bit. The bits go
 * into 'plus' when parsing --capability= and into 'minus' otherwise. */
470 case ARG_DROP_CAPABILITY: {
471 const char *state, *word;
474 FOREACH_WORD_SEPARATOR(word, length, optarg, ",", state) {
475 _cleanup_free_ char *t;
477 t = strndup(word, length);
481 if (streq(t, "all")) {
482 if (c == ARG_CAPABILITY)
483 plus = (uint64_t) -1;
485 minus = (uint64_t) -1;
489 cap = capability_from_name(t);
491 log_error("Failed to parse capability %s.", t);
495 if (c == ARG_CAPABILITY)
496 plus |= 1ULL << (uint64_t) cap;
498 minus |= 1ULL << (uint64_t) cap;
506 arg_link_journal = LINK_GUEST;
507 arg_link_journal_try = true;
/* The "try-" variants select the same mode as their plain counterpart
 * but additionally set arg_link_journal_try. */
510 case ARG_LINK_JOURNAL:
511 if (streq(optarg, "auto")) {
512 arg_link_journal = LINK_AUTO;
513 arg_link_journal_try = false;
514 } else if (streq(optarg, "no")) {
515 arg_link_journal = LINK_NO;
516 arg_link_journal_try = false;
517 } else if (streq(optarg, "guest")) {
518 arg_link_journal = LINK_GUEST;
519 arg_link_journal_try = false;
520 } else if (streq(optarg, "host")) {
521 arg_link_journal = LINK_HOST;
522 arg_link_journal_try = false;
523 } else if (streq(optarg, "try-guest")) {
524 arg_link_journal = LINK_GUEST;
525 arg_link_journal_try = true;
526 } else if (streq(optarg, "try-host")) {
527 arg_link_journal = LINK_HOST;
528 arg_link_journal_try = true;
530 log_error("Failed to parse link journal mode %s", optarg);
/* --bind= / --bind-ro=: the argument is split at the first ':'; both
 * parts must be absolute paths and are appended as a pair to the
 * matching string list. */
538 _cleanup_free_ char *a = NULL, *b = NULL;
542 x = c == ARG_BIND ? &arg_bind : &arg_bind_ro;
544 e = strchr(optarg, ':');
546 a = strndup(optarg, e - optarg);
556 if (!path_is_absolute(a) || !path_is_absolute(b)) {
557 log_error("Invalid bind mount specification: %s", optarg);
561 r = strv_extend(x, a);
565 r = strv_extend(x, b);
/* --tmpfs=: path and mount options are pushed as a pair onto arg_tmpfs;
 * "mode=0755" is the options string used by one of the branches (the
 * branch condition is not visible in this excerpt). */
573 _cleanup_free_ char *a = NULL, *b = NULL;
576 e = strchr(optarg, ':');
578 a = strndup(optarg, e - optarg);
582 b = strdup("mode=0755");
588 if (!path_is_absolute(a)) {
589 log_error("Invalid tmpfs specification: %s", optarg);
593 r = strv_push(&arg_tmpfs, a);
599 r = strv_push(&arg_tmpfs, b);
611 if (!env_assignment_is_valid(optarg)) {
612 log_error("Environment variable assignment '%s' is not valid.", optarg);
616 n = strv_env_set(arg_setenv, optarg);
620 strv_free(arg_setenv);
629 case ARG_SHARE_SYSTEM:
630 arg_share_system = true;
634 r = parse_boolean(optarg);
636 log_error("Failed to parse --register= argument: %s", optarg);
644 arg_keep_unit = true;
647 case ARG_PERSONALITY:
649 arg_personality = personality_from_string(optarg);
650 if (arg_personality == 0xffffffffLU) {
651 log_error("Unknown or unsupported personality '%s'.", optarg);
/* --volatile[=MODE]: accepts a boolean or the literal "state". */
660 arg_volatile = VOLATILE_YES;
662 r = parse_boolean(optarg);
664 if (streq(optarg, "state"))
665 arg_volatile = VOLATILE_STATE;
667 log_error("Failed to parse --volatile= argument: %s", optarg);
671 arg_volatile = r ? VOLATILE_YES : VOLATILE_NO;
/* --port=: optional "tcp:" / "udp:" prefix followed by
 * HOSTPORT[:CONTAINERPORT]. When only one number is given it is used
 * for both the host and the container port. */
677 const char *split, *e;
678 uint16_t container_port, host_port;
682 if ((e = startswith(optarg, "tcp:")))
683 protocol = IPPROTO_TCP;
684 else if ((e = startswith(optarg, "udp:")))
685 protocol = IPPROTO_UDP;
688 protocol = IPPROTO_TCP;
691 split = strchr(e, ':');
693 char v[split - e + 1];
695 memcpy(v, e, split - e);
698 r = safe_atou16(v, &host_port);
699 if (r < 0 || host_port <= 0) {
700 log_error("Failed to parse host port: %s", optarg);
704 r = safe_atou16(split + 1, &container_port);
706 r = safe_atou16(e, &container_port);
707 host_port = container_port;
/* NOTE(review): this check is on container_port, but the message below
 * says "host port" -- looks like a copy of the message above; confirm
 * and reword to "container port" if so. */
710 if (r < 0 || container_port <= 0) {
711 log_error("Failed to parse host port: %s", optarg);
/* A (protocol, host port) pair may only be exposed once. */
715 LIST_FOREACH(ports, p, arg_expose_ports) {
716 if (p->protocol == protocol && p->host_port == host_port) {
717 log_error("Duplicate port specification: %s", optarg);
722 p = new(ExposePort, 1);
726 p->protocol = protocol;
727 p->host_port = host_port;
728 p->container_port = container_port;
730 LIST_PREPEND(ports, arg_expose_ports, p);
736 if (strv_extend(&arg_property, optarg) < 0)
/* --private-users[=UIDBASE[:NUIDS]]: the part before ':' is parsed as
 * the UID shift, the part after it as the UID range (must be > 0). */
741 case ARG_PRIVATE_USERS:
743 _cleanup_free_ char *buffer = NULL;
744 const char *range, *shift;
746 range = strchr(optarg, ':');
748 buffer = strndup(optarg, range - optarg);
754 if (safe_atou32(range, &arg_uid_range) < 0 || arg_uid_range <= 0) {
755 log_error("Failed to parse UID range: %s", range);
761 if (parse_uid(shift, &arg_uid_shift) < 0) {
762 log_error("Failed to parse UID: %s", optarg);
774 assert_not_reached("Unhandled option");
/* --share-system turns machine registration off. */
777 if (arg_share_system)
778 arg_register = false;
/* Reject option combinations that cannot be used together. */
780 if (arg_boot && arg_share_system) {
781 log_error("--boot and --share-system may not be combined.");
785 if (arg_keep_unit && cg_pid_get_owner_uid(0, NULL) >= 0) {
786 log_error("--keep-unit may not be used when invoked from a user session.");
790 if (arg_directory && arg_image) {
791 log_error("--directory= and --image= may not be combined.");
795 if (arg_template && arg_image) {
796 log_error("--template= and --image= may not be combined.");
800 if (arg_template && !(arg_directory || arg_machine)) {
801 log_error("--template= needs --directory= or --machine=.");
805 if (arg_ephemeral && arg_template) {
806 log_error("--ephemeral and --template= may not be combined.");
810 if (arg_ephemeral && arg_image) {
811 log_error("--ephemeral and --image= may not be combined.");
815 if (arg_ephemeral && !IN_SET(arg_link_journal, LINK_NO, LINK_AUTO)) {
816 log_error("--ephemeral and --link-journal= may not be combined.");
820 if (arg_volatile != VOLATILE_NO && arg_read_only) {
821 log_error("Cannot combine --read-only with --volatile. Note that --volatile already implies a read-only base hierarchy.");
825 if (arg_expose_ports && !arg_private_network) {
826 log_error("Cannot use --port= without private networking.");
/* Final capability set: the defaults plus --capability=, plus
 * CAP_NET_ADMIN when a private network is used, minus
 * --drop-capability=. */
830 arg_retain = (arg_retain | plus | (arg_private_network ? 1ULL << CAP_NET_ADMIN : 0)) & ~minus;
/* Mounts the file systems listed in mount_table below 'dest'.
 *
 * For every table entry the target path is <dest>/<where>. The target is
 * probed with path_is_mount_point(), created with mkdir_p() and then
 * mounted with the entry's type, flags and options. Failures are logged as
 * errors for entries whose 'fatal' flag is set and as warnings otherwise. */
835 static int mount_all(const char *dest) {
837 typedef struct MountPoint {
846 static const MountPoint mount_table[] = {
847 { "proc", "/proc", "proc", NULL, MS_NOSUID|MS_NOEXEC|MS_NODEV, true },
848 { "/proc/sys", "/proc/sys", NULL, NULL, MS_BIND, true }, /* Bind mount first */
849 { NULL, "/proc/sys", NULL, NULL, MS_BIND|MS_RDONLY|MS_REMOUNT, true }, /* Then, make it r/o */
850 { "sysfs", "/sys", "sysfs", NULL, MS_RDONLY|MS_NOSUID|MS_NOEXEC|MS_NODEV, true },
851 { "tmpfs", "/dev", "tmpfs", "mode=755", MS_NOSUID|MS_STRICTATIME, true },
852 { "devpts", "/dev/pts", "devpts","newinstance,ptmxmode=0666,mode=620,gid=" STRINGIFY(TTY_GID), MS_NOSUID|MS_NOEXEC, true },
853 { "tmpfs", "/dev/shm", "tmpfs", "mode=1777", MS_NOSUID|MS_NODEV|MS_STRICTATIME, true },
854 { "tmpfs", "/run", "tmpfs", "mode=755", MS_NOSUID|MS_NODEV|MS_STRICTATIME, true },
855 { "tmpfs", "/tmp", "tmpfs", "mode=1777", MS_STRICTATIME, true },
857 { "/sys/fs/selinux", "/sys/fs/selinux", NULL, NULL, MS_BIND, false }, /* Bind mount first */
858 { NULL, "/sys/fs/selinux", NULL, NULL, MS_BIND|MS_RDONLY|MS_REMOUNT, false }, /* Then, make it r/o */
865 for (k = 0; k < ELEMENTSOF(mount_table); k++) {
866 _cleanup_free_ char *where = NULL, *options = NULL;
870 where = strjoin(dest, "/", mount_table[k].where, NULL);
874 t = path_is_mount_point(where, true);
876 log_error_errno(t, "Failed to detect whether %s is a mount point: %m", where);
884 /* Skip this entry if it is not a remount. */
885 if (mount_table[k].what && t > 0)
888 t = mkdir_p(where, 0755);
890 if (mount_table[k].fatal) {
891 log_error_errno(t, "Failed to create directory %s: %m", where);
896 log_warning_errno(t, "Failed to create directory %s: %m", where);
/* For tmpfs and devpts entries, append a context="..." option when
 * --selinux-apifs-context= was given. */
902 if (arg_selinux_apifs_context &&
903 (streq_ptr(mount_table[k].what, "tmpfs") || streq_ptr(mount_table[k].what, "devpts"))) {
904 options = strjoin(mount_table[k].options, ",context=\"", arg_selinux_apifs_context, "\"", NULL);
911 o = mount_table[k].options;
/* With user namespaces and a known UID shift, tmpfs mounts additionally
 * get uid=/gid= options set to that shift. */
913 if (arg_userns && arg_uid_shift != UID_INVALID && streq_ptr(mount_table[k].type, "tmpfs")) {
914 char *uid_options = NULL;
917 asprintf(&uid_options, "%s,uid=" UID_FMT ",gid=" UID_FMT, o, arg_uid_shift, arg_uid_shift);
919 asprintf(&uid_options, "uid=" UID_FMT ",gid=" UID_FMT, arg_uid_shift, arg_uid_shift);
924 o = options = uid_options;
927 if (mount(mount_table[k].what,
930 mount_table[k].flags,
933 if (mount_table[k].fatal) {
934 log_error_errno(errno, "mount(%s) failed: %m", where);
939 log_warning_errno(errno, "mount(%s) failed: %m", where);
/* Bind mounts every (source, destination) pair of the string list 'l' into
 * the container tree rooted at 'dest'.
 *
 * For each pair the source is stat()ed, the target path is dest + the
 * destination, and an existing target must match the source in being (or
 * not being) a directory. A missing target gets its parent directories
 * created; the mount point itself is then created and the source is bind
 * mounted onto it. bind_remount_recursive() is used to make the mount
 * read-only (presumably only when 'ro' is set -- the condition is not
 * visible in this excerpt).
 *
 * Fix: mkdir_label() reports failure through its return value (compare
 * mount_tmpfs(), which checks the same helper against -EEXIST), so the
 * "already exists" test must look at r rather than at errno, which is not
 * guaranteed to still describe this call. */
946 static int mount_binds(const char *dest, char **l, bool ro) {
949 STRV_FOREACH_PAIR(x, y, l) {
950 _cleanup_free_ char *where = NULL;
951 struct stat source_st, dest_st;
954 if (stat(*x, &source_st) < 0)
955 return log_error_errno(errno, "Failed to stat %s: %m", *x);
957 where = strappend(dest, *y);
961 r = stat(where, &dest_st);
963 if (S_ISDIR(source_st.st_mode) && !S_ISDIR(dest_st.st_mode)) {
964 log_error("Cannot bind mount directory %s on file %s.", *x, where);
967 if (!S_ISDIR(source_st.st_mode) && S_ISDIR(dest_st.st_mode)) {
968 log_error("Cannot bind mount file %s on directory %s.", *x, where);
971 } else if (errno == ENOENT) {
972 r = mkdir_parents_label(where, 0755);
974 return log_error_errno(r, "Failed to bind mount %s: %m", *x);
976 log_error_errno(errno, "Failed to bind mount %s: %m", *x);
980 /* Create the mount point. Any non-directory file can be
981 * mounted on any non-directory file (regular, fifo, socket,
984 if (S_ISDIR(source_st.st_mode)) {
985 r = mkdir_label(where, 0755);
986 if (r < 0 && r != -EEXIST)
987 return log_error_errno(r, "Failed to create mount point %s: %m", where);
991 return log_error_errno(r, "Failed to create mount point %s: %m", where);
994 if (mount(*x, where, "bind", MS_BIND, NULL) < 0)
995 return log_error_errno(errno, "mount(%s) failed: %m", where);
998 r = bind_remount_recursive(where, true);
1000 return log_error_errno(r, "Read-Only bind mount failed: %m");
/* Mounts one cgroup hierarchy at <dest>/sys/fs/cgroup/<hierarchy>.
 *
 * 'controller' is passed as the mount data of the "cgroup" file system.
 * The target is first probed with path_is_mount_point() (what happens when
 * it is already mounted is not visible in this excerpt). A second mount()
 * call remounts the bind mount read-only (presumably only when 'read_only'
 * is set -- the condition is not visible here). */
1007 static int mount_cgroup_hierarchy(const char *dest, const char *controller, const char *hierarchy, bool read_only) {
1011 to = strjoina(dest, "/sys/fs/cgroup/", hierarchy);
1013 r = path_is_mount_point(to, false);
1015 return log_error_errno(r, "Failed to determine if %s is mounted already: %m", to);
1021 /* The superblock mount options of the mount point need to be
1022 * identical to the host's, and hence writable... */
1023 if (mount("cgroup", to, "cgroup", MS_NOSUID|MS_NOEXEC|MS_NODEV, controller) < 0)
1024 return log_error_errno(errno, "Failed to mount to %s: %m", to);
1026 /* ... hence let's only make the bind mount read-only, not the
1029 if (mount(NULL, to, NULL, MS_BIND|MS_REMOUNT|MS_NOSUID|MS_NOEXEC|MS_NODEV|MS_RDONLY, NULL) < 0)
1030 return log_error_errno(errno, "Failed to remount %s read-only: %m", to);
/* Builds the cgroup tree for the container below <dest>/sys/fs/cgroup.
 *
 * Steps visible here: collect the kernel's controllers and our own cgroup
 * path; mount a tmpfs on <dest>/sys/fs/cgroup; for each controller, mount
 * its hierarchy read-only -- either directly, or, when the host's
 * /sys/fs/cgroup/<controller> is a symlink, by mounting the combined
 * hierarchy and recreating the symlink; mount the "systemd" hierarchy
 * writable; bind mount our own cgroup on top of itself; remount the
 * systemd hierarchy root and finally the tmpfs read-only. */
1035 static int mount_cgroup(const char *dest) {
1036 _cleanup_set_free_free_ Set *controllers = NULL;
1037 _cleanup_free_ char *own_cgroup_path = NULL;
1038 const char *cgroup_root, *systemd_root, *systemd_own;
1041 controllers = set_new(&string_hash_ops);
1045 r = cg_kernel_controllers(controllers);
1047 return log_error_errno(r, "Failed to determine cgroup controllers: %m");
1049 r = cg_pid_get_path(NULL, 0, &own_cgroup_path);
1051 return log_error_errno(r, "Failed to determine our own cgroup path: %m");
1053 cgroup_root = strjoina(dest, "/sys/fs/cgroup");
1054 if (mount("tmpfs", cgroup_root, "tmpfs", MS_NOSUID|MS_NOEXEC|MS_NODEV|MS_STRICTATIME, "mode=755") < 0)
1055 return log_error_errno(errno, "Failed to mount tmpfs to /sys/fs/cgroup: %m");
1058 _cleanup_free_ char *controller = NULL, *origin = NULL, *combined = NULL;
1060 controller = set_steal_first(controllers);
1064 origin = strappend("/sys/fs/cgroup/", controller);
1068 r = readlink_malloc(origin, &combined);
1070 /* Not a symbolic link, but directly a single cgroup hierarchy */
1072 r = mount_cgroup_hierarchy(dest, controller, controller, true);
1077 return log_error_errno(r, "Failed to read link %s: %m", origin);
1079 _cleanup_free_ char *target = NULL;
1081 target = strjoin(dest, "/sys/fs/cgroup/", controller, NULL);
1085 /* A symbolic link, a combination of controllers in one hierarchy */
1087 if (!filename_is_valid(combined)) {
1088 log_warning("Ignoring invalid combined hierarchy %s.", combined);
1092 r = mount_cgroup_hierarchy(dest, combined, combined, true);
1096 if (symlink(combined, target) < 0)
1097 return log_error_errno(errno, "Failed to create symlink for combined hierarchy: %m");
1101 r = mount_cgroup_hierarchy(dest, "name=systemd,xattr", "systemd", false);
1105 /* Make our own cgroup a (writable) bind mount */
1106 systemd_own = strjoina(dest, "/sys/fs/cgroup/systemd", own_cgroup_path);
/* NOTE(review): the message below prints own_cgroup_path, not systemd_own
 * (the path actually handed to mount()) -- confirm which one is wanted. */
1107 if (mount(systemd_own, systemd_own, NULL, MS_BIND, NULL) < 0)
1108 return log_error_errno(errno, "Failed to turn %s into a bind mount: %m", own_cgroup_path);
1110 /* And then remount the systemd cgroup root read-only */
1111 systemd_root = strjoina(dest, "/sys/fs/cgroup/systemd");
1112 if (mount(NULL, systemd_root, NULL, MS_BIND|MS_REMOUNT|MS_NOSUID|MS_NOEXEC|MS_NODEV|MS_RDONLY, NULL) < 0)
1113 return log_error_errno(errno, "Failed to mount cgroup root read-only: %m");
1115 if (mount(NULL, cgroup_root, NULL, MS_REMOUNT|MS_NOSUID|MS_NOEXEC|MS_NODEV|MS_STRICTATIME|MS_RDONLY, "mode=755") < 0)
1116 return log_error_errno(errno, "Failed to remount %s read-only: %m", cgroup_root);
/* Mounts a tmpfs for every (path, options) pair in arg_tmpfs.
 *
 * The mount point is dest + path; it is created with mkdir_label(), where
 * an already existing directory (-EEXIST) is not an error. The second
 * element of the pair is passed as the tmpfs mount options. */
1121 static int mount_tmpfs(const char *dest) {
1124 STRV_FOREACH_PAIR(i, o, arg_tmpfs) {
1125 _cleanup_free_ char *where = NULL;
1128 where = strappend(dest, *i);
1132 r = mkdir_label(where, 0755);
1133 if (r < 0 && r != -EEXIST)
1134 return log_error_errno(r, "Creating mount point for tmpfs %s failed: %m", where);
1136 if (mount("tmpfs", where, "tmpfs", MS_NODEV|MS_STRICTATIME, *o) < 0)
1137 return log_error_errno(errno, "tmpfs mount to %s failed: %m", where);
/* Makes <dest>/etc/localtime point at the same zone as the host's
 * /etc/localtime.
 *
 * The host link must point into /usr/share/zoneinfo/ (relative
 * "../usr/share/zoneinfo/" or absolute form); the part after that prefix
 * is the zone name. Nothing is changed when the container's link already
 * names the same zone, or when the zone file does not exist below
 * <dest>/usr/share/zoneinfo/. Otherwise the container's link is replaced
 * by a symlink to "../usr/share/zoneinfo/<zone>". Problems on the host
 * side are only logged as warnings. */
1143 static int setup_timezone(const char *dest) {
1144 _cleanup_free_ char *where = NULL, *p = NULL, *q = NULL, *check = NULL, *what = NULL;
1150 /* Fix the timezone, if possible */
1151 r = readlink_malloc("/etc/localtime", &p);
1153 log_warning("/etc/localtime is not a symlink, not updating container timezone.");
1157 z = path_startswith(p, "../usr/share/zoneinfo/");
1159 z = path_startswith(p, "/usr/share/zoneinfo/");
1161 log_warning("/etc/localtime does not point into /usr/share/zoneinfo/, not updating container timezone.");
1165 where = strappend(dest, "/etc/localtime");
1169 r = readlink_malloc(where, &q);
1171 y = path_startswith(q, "../usr/share/zoneinfo/");
1173 y = path_startswith(q, "/usr/share/zoneinfo/");
1175 /* Already pointing to the right place? Then do nothing .. */
1176 if (y && streq(y, z))
1180 check = strjoin(dest, "/usr/share/zoneinfo/", z, NULL);
1184 if (access(check, F_OK) < 0) {
1185 log_warning("Timezone %s does not exist in container, not updating container timezone.", z);
1189 what = strappend("../usr/share/zoneinfo/", z);
1193 r = mkdir_parents(where, 0755);
1195 log_error_errno(r, "Failed to create directory for timezone info %s in container: %m", where);
1201 if (r < 0 && errno != ENOENT) {
1202 log_error_errno(errno, "Failed to remove existing timezone info %s in container: %m", where);
1207 if (symlink(what, where) < 0) {
1208 log_error_errno(errno, "Failed to correct timezone of container: %m");
/* Copies the host's /etc/resolv.conf to <dest>/etc/resolv.conf.
 *
 * arg_private_network is checked first (the branch body is not visible in
 * this excerpt; presumably the copy is skipped in that case). Failures to
 * create the parent directory or to copy the file are only logged as
 * warnings. */
1215 static int setup_resolv_conf(const char *dest) {
1216 _cleanup_free_ char *where = NULL;
1221 if (arg_private_network)
1224 /* Fix resolv.conf, if possible */
1225 where = strappend(dest, "/etc/resolv.conf");
1229 /* We don't really care for the results of this really. If it
1230 * fails, it fails, but meh... */
1231 r = mkdir_parents(where, 0755);
1233 log_warning_errno(r, "Failed to create parent directory for resolv.conf %s: %m", where);
1238 r = copy_file("/etc/resolv.conf", where, O_TRUNC|O_NOFOLLOW, 0644, 0);
1240 log_warning_errno(r, "Failed to copy /etc/resolv.conf to %s: %m", where);
/* Implements --volatile=state: remounts 'directory' read-only
 * (recursively) and mounts a fresh tmpfs on <directory>/var. Only acts
 * when arg_volatile is VOLATILE_STATE. */
1248 static int setup_volatile_state(const char *directory) {
1254 if (arg_volatile != VOLATILE_STATE)
1257 /* --volatile=state means we simply overmount /var
1258 with a tmpfs, and the rest read-only. */
1260 r = bind_remount_recursive(directory, true);
1262 return log_error_errno(r, "Failed to remount %s read-only: %m", directory);
1264 p = strjoina(directory, "/var");
/* NOTE(review): the path being prepared at this point appears to be p
 * (<directory>/var), but the message below prints 'directory' -- confirm. */
1266 if (r < 0 && errno != EEXIST)
1267 return log_error_errno(errno, "Failed to create %s: %m", directory);
1269 if (mount("tmpfs", p, "tmpfs", MS_STRICTATIME, "mode=755") < 0)
1270 return log_error_errno(errno, "Failed to mount tmpfs to /var: %m");
/* Implements --volatile=yes. Only acts when arg_volatile is VOLATILE_YES.
 *
 * A tmpfs is mounted on a temporary directory created from the
 * /tmp/nspawn-volatile-XXXXXX template, <directory>/usr is bind mounted
 * (recursively) into it and remounted read-only, and the whole tree is
 * then moved onto 'directory' with MS_MOVE. tmpfs_mounted and bind_mounted
 * record how far setup got (the cleanup code that would use them is not
 * visible in this excerpt). */
1275 static int setup_volatile(const char *directory) {
1276 bool tmpfs_mounted = false, bind_mounted = false;
1277 char template[] = "/tmp/nspawn-volatile-XXXXXX";
1283 if (arg_volatile != VOLATILE_YES)
1286 /* --volatile=yes means we mount a tmpfs to the root dir, and
1287 the original /usr to use inside it, and that read-only. */
1289 if (!mkdtemp(template))
1290 return log_error_errno(errno, "Failed to create temporary directory: %m");
1292 if (mount("tmpfs", template, "tmpfs", MS_STRICTATIME, "mode=755") < 0) {
1293 log_error_errno(errno, "Failed to mount tmpfs for root directory: %m");
1298 tmpfs_mounted = true;
1300 f = strjoina(directory, "/usr");
1301 t = strjoina(template, "/usr");
1304 if (r < 0 && errno != EEXIST) {
1305 log_error_errno(errno, "Failed to create %s: %m", t);
1310 if (mount(f, t, "bind", MS_BIND|MS_REC, NULL) < 0) {
1311 log_error_errno(errno, "Failed to create /usr bind mount: %m");
1316 bind_mounted = true;
1318 r = bind_remount_recursive(t, true);
1320 log_error_errno(r, "Failed to remount %s read-only: %m", t);
1324 if (mount(template, directory, NULL, MS_MOVE, NULL) < 0) {
1325 log_error_errno(errno, "Failed to move root mount: %m");
/* Formats 'id' as lower-case hex in the 8-4-4-4-12 UUID layout.
 * 's' is declared as a 37-byte buffer (36 characters plus the terminating
 * NUL). Presumably returns s -- the return statement is not visible in
 * this excerpt. */
1343 static char* id128_format_as_uuid(sd_id128_t id, char s[37]) {
1346 "%02x%02x%02x%02x-%02x%02x-%02x%02x-%02x%02x-%02x%02x%02x%02x%02x%02x",
1347 SD_ID128_FORMAT_VAL(id));
/* Gives the container its own random boot ID.
 *
 * A random 128-bit ID is formatted as a UUID, written to
 * <dest>/dev/proc-sys-kernel-random-boot-id and bind mounted over
 * <dest>/proc/sys/kernel/random/boot_id. Failing to make that bind mount
 * read-only is only logged as a warning. arg_share_system is checked first
 * (the branch body is not visible in this excerpt). */
1352 static int setup_boot_id(const char *dest) {
1353 _cleanup_free_ char *from = NULL, *to = NULL;
1354 sd_id128_t rnd = {};
1360 if (arg_share_system)
1363 /* Generate a new randomized boot ID, so that each boot-up of
1364 * the container gets a new one */
1366 from = strappend(dest, "/dev/proc-sys-kernel-random-boot-id");
1367 to = strappend(dest, "/proc/sys/kernel/random/boot_id");
1371 r = sd_id128_randomize(&rnd);
1373 return log_error_errno(r, "Failed to generate random boot id: %m");
1375 id128_format_as_uuid(rnd, as_uuid);
1377 r = write_string_file(from, as_uuid);
1379 return log_error_errno(r, "Failed to write boot id: %m");
1381 if (mount(from, to, "bind", MS_BIND, NULL) < 0) {
1382 log_error_errno(errno, "Failed to bind mount boot id: %m");
1384 } else if (mount(from, to, "bind", MS_BIND|MS_REMOUNT|MS_RDONLY, NULL))
1385 log_warning_errno(errno, "Failed to make boot id read-only: %m");
/* Recreates a fixed list of host device nodes below <dest>/dev.
 *
 * For each name in 'devnodes', /dev/<name> is stat()ed on the host. A
 * missing node (ENOENT) does not abort the function, while any other
 * stat() error does. A node that is neither a character nor a block device
 * is logged as an error. Otherwise the node is created with mknod() using
 * the host node's mode and device number, and chowned to arg_uid_shift
 * when user namespaces are in use and a shift is known. */
1391 static int copy_devnodes(const char *dest) {
1393 static const char devnodes[] =
1404 _cleanup_umask_ mode_t u;
1410 NULSTR_FOREACH(d, devnodes) {
1411 _cleanup_free_ char *from = NULL, *to = NULL;
1414 from = strappend("/dev/", d);
1415 to = strjoin(dest, "/dev/", d, NULL);
1419 if (stat(from, &st) < 0) {
1421 if (errno != ENOENT)
1422 return log_error_errno(errno, "Failed to stat %s: %m", from);
1424 } else if (!S_ISCHR(st.st_mode) && !S_ISBLK(st.st_mode)) {
1426 log_error("%s is not a char or block device, cannot copy", from);
1430 r = mkdir_parents(to, 0775);
1432 log_error_errno(r, "Failed to create parent directory of %s: %m", to);
1436 if (mknod(to, st.st_mode, st.st_rdev) < 0)
1437 return log_error_errno(errno, "mknod(%s) failed: %m", to);
1439 if (arg_userns && arg_uid_shift != UID_INVALID)
1440 if (lchown(to, arg_uid_shift, arg_uid_shift) < 0)
1441 return log_error_errno(errno, "chown() of device node %s failed: %m", to);
/* Creates <dest>/dev/ptmx as a symlink to "pts/ptmx" and, when user
 * namespaces are in use and a UID shift is known, lchown()s the symlink
 * itself to that shift. */
1448 static int setup_ptmx(const char *dest) {
1449 _cleanup_free_ char *p = NULL;
1451 p = strappend(dest, "/dev/ptmx");
1455 if (symlink("pts/ptmx", p) < 0)
1456 return log_error_errno(errno, "Failed to create /dev/ptmx symlink: %m");
1458 if (arg_userns && arg_uid_shift != UID_INVALID)
1459 if (lchown(p, arg_uid_shift, arg_uid_shift) < 0)
1460 return log_error_errno(errno, "lchown() of symlink %s failed: %m", p);
/* Makes the TTY 'console' available as <dest>/dev/console.
 *
 * The TTY is set to mode 0600 with owner and group 0. A placeholder device
 * node is then created at <dest>/dev/console using /dev/null's file type
 * and device number with mode 0600, and 'console' is bind mounted over
 * it. */
1465 static int setup_dev_console(const char *dest, const char *console) {
1466 _cleanup_umask_ mode_t u;
1476 if (stat("/dev/null", &st) < 0)
1477 return log_error_errno(errno, "Failed to stat /dev/null: %m");
1479 r = chmod_and_chown(console, 0600, 0, 0);
1481 return log_error_errno(r, "Failed to correct access mode for TTY: %m");
1483 /* We need to bind mount the right tty to /dev/console since
1484 * ptys can only exist on pts file systems. To have something
1485 * to bind mount things on we create a device node first, and
1486 * use /dev/null for that since the cgroups device policy
1487 * allows us to create that freely, while we cannot create
1488 * /dev/console. (Note that the major minor doesn't actually
1489 * matter here, since we mount it over anyway). */
1491 to = strjoina(dest, "/dev/console");
1492 if (mknod(to, (st.st_mode & ~07777) | 0600, st.st_rdev) < 0)
1493 return log_error_errno(errno, "mknod() for /dev/console failed: %m");
1495 if (mount(console, to, "bind", MS_BIND, NULL) < 0)
1496 return log_error_errno(errno, "Bind mount for /dev/console failed: %m");
/* Provides a /proc/kmsg for the container that is backed by a FIFO.
 *
 * A FIFO is created at <dest>/dev/kmsg with mode 0600 and owner/group 0,
 * and bind mounted onto <dest>/proc/kmsg. It is then opened and the file
 * descriptor is sent over 'kmsg_socket' as SCM_RIGHTS ancillary data, so
 * that it stays open for as long as the child runs.
 * 'kmsg_socket' must be a valid (>= 0) socket fd. */
1501 static int setup_kmsg(const char *dest, int kmsg_socket) {
1502 _cleanup_free_ char *from = NULL, *to = NULL;
1503 _cleanup_umask_ mode_t u;
1506 struct cmsghdr cmsghdr;
1507 uint8_t buf[CMSG_SPACE(sizeof(int))];
1509 struct msghdr mh = {
1510 .msg_control = &control,
1511 .msg_controllen = sizeof(control),
1513 struct cmsghdr *cmsg;
1516 assert(kmsg_socket >= 0);
1520 /* We create the kmsg FIFO as /dev/kmsg, but immediately
1521 * delete it after bind mounting it to /proc/kmsg. While FIFOs
1522 * on the reading side behave very similar to /proc/kmsg,
1523 * their writing side behaves differently from /dev/kmsg in
1524 * that writing blocks when nothing is reading. In order to
1525 * avoid any problems with containers deadlocking due to this
1526 * we simply make /dev/kmsg unavailable to the container. */
1527 if (asprintf(&from, "%s/dev/kmsg", dest) < 0 ||
1528 asprintf(&to, "%s/proc/kmsg", dest) < 0)
1531 if (mkfifo(from, 0600) < 0)
1532 return log_error_errno(errno, "mkfifo() for /dev/kmsg failed: %m");
1534 r = chmod_and_chown(from, 0600, 0, 0);
1536 return log_error_errno(r, "Failed to correct access mode for /dev/kmsg: %m");
1538 if (mount(from, to, "bind", MS_BIND, NULL) < 0)
1539 return log_error_errno(errno, "Bind mount for /proc/kmsg failed: %m");
1541 fd = open(from, O_RDWR|O_NDELAY|O_CLOEXEC);
1543 return log_error_errno(errno, "Failed to open fifo: %m");
/* Build a single SCM_RIGHTS control message carrying the FIFO fd. */
1545 cmsg = CMSG_FIRSTHDR(&mh);
1546 cmsg->cmsg_level = SOL_SOCKET;
1547 cmsg->cmsg_type = SCM_RIGHTS;
1548 cmsg->cmsg_len = CMSG_LEN(sizeof(int));
1549 memcpy(CMSG_DATA(cmsg), &fd, sizeof(int));
1551 mh.msg_controllen = cmsg->cmsg_len;
1553 /* Store away the fd in the socket, so that it stays open as
1554 * long as we run the child */
1555 k = sendmsg(kmsg_socket, &mh, MSG_NOSIGNAL);
1559 return log_error_errno(errno, "Failed to send FIFO fd: %m");
1561 /* And now make the FIFO unavailable as /dev/kmsg... */
/* Allocates a non-blocking, close-on-exec NETLINK_ROUTE socket and passes
 * its file descriptor over send_fd as an SCM_RIGHTS control message.
 * send_fd must be a valid descriptor (asserted). The function first tests
 * arg_expose_ports; the branch taken when it is unset is not visible here.
 * NOTE(review): the control union declaration, error checks and the final
 * return are not visible in this excerpt. */
1566 static int send_rtnl(int send_fd) {
1568 struct cmsghdr cmsghdr;
1569 uint8_t buf[CMSG_SPACE(sizeof(int))];
1571 struct msghdr mh = {
1572 .msg_control = &control,
1573 .msg_controllen = sizeof(control),
1575 struct cmsghdr *cmsg;
1576 _cleanup_close_ int fd = -1;
1579 assert(send_fd >= 0);
1581 if (!arg_expose_ports)
1584 fd = socket(PF_NETLINK, SOCK_RAW|SOCK_CLOEXEC|SOCK_NONBLOCK, NETLINK_ROUTE);
1586 return log_error_errno(errno, "failed to allocate container netlink: %m");
/* Build a single SCM_RIGHTS control message that carries fd. */
1588 cmsg = CMSG_FIRSTHDR(&mh);
1589 cmsg->cmsg_level = SOL_SOCKET;
1590 cmsg->cmsg_type = SCM_RIGHTS;
1591 cmsg->cmsg_len = CMSG_LEN(sizeof(int));
1592 memcpy(CMSG_DATA(cmsg), &fd, sizeof(int));
1594 mh.msg_controllen = cmsg->cmsg_len;
1596 /* Store away the fd in the socket, so that it stays open as
1597 * long as we run the child */
1598 k = sendmsg(send_fd, &mh, MSG_NOSIGNAL);
1600 return log_error_errno(errno, "Failed to send netlink fd: %m");
/* Handles the loss of the previously exposed container address. Only IPv4
 * (AF_INET) is considered. For every entry in arg_expose_ports the function
 * calls fw_add_local_dnat() with false as first argument (presumably removing
 * the rule; confirm against fw_add_local_dnat), warns if that fails, and
 * finally resets *exposed to IN_ADDR_NULL.
 * NOTE(review): the early exits taken when no ports are configured or when
 * *exposed is already null are only partly visible in this excerpt. */
1605 static int flush_ports(union in_addr_union *exposed) {
1607 int r, af = AF_INET;
1611 if (!arg_expose_ports)
1614 if (in_addr_is_null(af, exposed))
1617 log_debug("Lost IP address.");
1619 LIST_FOREACH(ports, p, arg_expose_ports) {
1620 r = fw_add_local_dnat(false,
1631 log_warning_errno(r, "Failed to modify firewall: %m");
1634 *exposed = IN_ADDR_NULL;
/* Re-evaluates which container address the configured ports are forwarded
 * to. Only IPv4 (AF_INET) is considered. The local addresses are enumerated
 * through rtnl; the first entry is accepted when its family matches and its
 * scope is below RT_SCOPE_LINK, otherwise flush_ports() is used. If the
 * accepted address equals *exposed nothing changes. Otherwise
 * fw_add_local_dnat() is called with true as first argument for every entry
 * in arg_expose_ports, passing the previous address unless it was null, and
 * *exposed is updated to the new address. Firewall failures only produce a
 * warning.
 * NOTE(review): several short lines, including the end of the in-body
 * comment, are not visible in this excerpt. */
1638 static int expose_ports(sd_rtnl *rtnl, union in_addr_union *exposed) {
1639 _cleanup_free_ struct local_address *addresses = NULL;
1640 _cleanup_free_ char *pretty = NULL;
1641 union in_addr_union new_exposed;
1644 int af = AF_INET, r;
1648 /* Invoked each time an address is added or removed inside the
1651 if (!arg_expose_ports)
1654 r = local_addresses(rtnl, 0, af, &addresses);
1656 return log_error_errno(r, "Failed to enumerate local addresses: %m");
1659 addresses[0].family == af &&
1660 addresses[0].scope < RT_SCOPE_LINK;
1663 return flush_ports(exposed);
1665 new_exposed = addresses[0].address;
1666 if (in_addr_equal(af, exposed, &new_exposed))
1669 in_addr_to_string(af, &new_exposed, &pretty);
1670 log_debug("New container IP is %s.", strna(pretty));
1672 LIST_FOREACH(ports, p, arg_expose_ports) {
1674 r = fw_add_local_dnat(true,
1683 in_addr_is_null(af, exposed) ? NULL : exposed);
1685 log_warning_errno(r, "Failed to modify firewall: %m");
1688 *exposed = new_exposed;
/* Netlink match callback. userdata is the exposed address that was passed
 * when the match was registered; the callback simply re-runs expose_ports()
 * and ignores its result.
 * NOTE(review): the return statement is not visible in this excerpt. */
1692 static int on_address_change(sd_rtnl *rtnl, sd_rtnl_message *m, void *userdata) {
1693 union in_addr_union *exposed = userdata;
1699 expose_ports(rtnl, exposed);
/* Receives a netlink socket over recv_fd (sent as an SCM_RIGHTS control
 * message), wraps it in an sd_rtnl object subscribed to RTNLGRP_IPV4_IFADDR,
 * registers on_address_change() for RTM_NEWADDR and RTM_DELADDR with exposed
 * as userdata, and attaches the object to the event loop.
 * recv_fd must be a valid descriptor (asserted). The function first tests
 * arg_expose_ports; the branch taken when it is unset is not visible here.
 * NOTE(review): the control union declaration, error checks, the hand-over
 * through ret and the final return are not visible in this excerpt. */
1703 static int watch_rtnl(sd_event *event, int recv_fd, union in_addr_union *exposed, sd_rtnl **ret) {
1705 struct cmsghdr cmsghdr;
1706 uint8_t buf[CMSG_SPACE(sizeof(int))];
1708 struct msghdr mh = {
1709 .msg_control = &control,
1710 .msg_controllen = sizeof(control),
1712 struct cmsghdr *cmsg;
1713 _cleanup_rtnl_unref_ sd_rtnl *rtnl = NULL;
1718 assert(recv_fd >= 0);
1721 if (!arg_expose_ports)
1724 k = recvmsg(recv_fd, &mh, MSG_NOSIGNAL);
1726 return log_error_errno(errno, "Failed to recv netlink fd: %m");
/* The sender is expected to have queued exactly one SCM_RIGHTS message
 * carrying a single descriptor. */
1728 cmsg = CMSG_FIRSTHDR(&mh);
1729 assert(cmsg->cmsg_level == SOL_SOCKET);
1730 assert(cmsg->cmsg_type == SCM_RIGHTS);
1731 assert(cmsg->cmsg_len == CMSG_LEN(sizeof(int)));
1732 memcpy(&fd, CMSG_DATA(cmsg), sizeof(int));
1734 r = sd_rtnl_open_fd(&rtnl, fd, 1, RTNLGRP_IPV4_IFADDR);
1737 return log_error_errno(r, "Failed to create rtnl object: %m");
1740 r = sd_rtnl_add_match(rtnl, RTM_NEWADDR, on_address_change, exposed);
1742 return log_error_errno(r, "Failed to subscribe to RTM_NEWADDR messages: %m");
1744 r = sd_rtnl_add_match(rtnl, RTM_DELADDR, on_address_change, exposed);
1746 return log_error_errno(r, "Failed to subscribe to RTM_DELADDR messages: %m");
1748 r = sd_rtnl_attach_event(rtnl, event, 0);
1750 return log_error_errno(r, "Failed to add to event loop: %m");
/* Sets the host name to arg_machine through sethostname_idempotent(). The
 * function first tests arg_share_system; the branch taken when it is set is
 * not visible here.
 * NOTE(review): the return statements are not visible in this excerpt. */
1758 static int setup_hostname(void) {
1760 if (arg_share_system)
1763 if (sethostname_idempotent(arg_machine) < 0)
/* Links the journal directory of the container with the one of the host,
 * according to arg_link_journal (LINK_NO, LINK_AUTO, LINK_GUEST, LINK_HOST).
 * The machine ID is read from <directory>/etc/machine-id and validated; if it
 * equals the ID of the host the journals are never linked. The host side path
 * is /var/log/journal/<id>, the guest side path is
 * <directory>/var/log/journal/<id>. Depending on the mode the host path is
 * made a symlink to the guest path, or the host path is bind mounted into the
 * guest.
 * Errors returned by mkdir_p() and readlink_and_make_absolute() are logged
 * from r, the value those helpers returned, instead of from errno.
 * NOTE(review): many short lines (error checks, returns, closing braces) are
 * not visible in this excerpt. */
1769 static int setup_journal(const char *directory) {
1770 sd_id128_t machine_id, this_id;
1771 _cleanup_free_ char *p = NULL, *b = NULL, *q = NULL, *d = NULL;
1775 /* Don't link journals in ephemeral mode */
1779 p = strappend(directory, "/etc/machine-id");
1783 r = read_one_line_file(p, &b);
1784 if (r == -ENOENT && arg_link_journal == LINK_AUTO)
1787 return log_error_errno(r, "Failed to read machine ID from %s: %m", p);
1790 if (isempty(id) && arg_link_journal == LINK_AUTO)
1793 /* Verify validity */
1794 r = sd_id128_from_string(id, &machine_id);
1796 return log_error_errno(r, "Failed to parse machine ID from %s: %m", p);
1798 r = sd_id128_get_machine(&this_id);
1800 return log_error_errno(r, "Failed to retrieve machine ID: %m");
1802 if (sd_id128_equal(machine_id, this_id)) {
1803 log_full(arg_link_journal == LINK_AUTO ? LOG_WARNING : LOG_ERR,
1804 "Host and machine ids are equal (%s): refusing to link journals", id);
1805 if (arg_link_journal == LINK_AUTO)
1810 if (arg_link_journal == LINK_NO)
1814 p = strappend("/var/log/journal/", id);
1815 q = strjoin(directory, "/var/log/journal/", id, NULL);
1819 if (path_is_mount_point(p, false) > 0) {
1820 if (arg_link_journal != LINK_AUTO) {
1821 log_error("%s: already a mount point, refusing to use for journal", p);
1828 if (path_is_mount_point(q, false) > 0) {
1829 if (arg_link_journal != LINK_AUTO) {
1830 log_error("%s: already a mount point, refusing to use for journal", q);
1837 r = readlink_and_make_absolute(p, &d);
1839 if ((arg_link_journal == LINK_GUEST ||
1840 arg_link_journal == LINK_AUTO) &&
1843 r = mkdir_p(q, 0755);
/* mkdir_p() hands back its error in r; errno may be stale here. */
1845 log_warning_errno(r, "Failed to create directory %s: %m", q);
1850 return log_error_errno(errno, "Failed to remove symlink %s: %m", p);
1851 } else if (r == -EINVAL) {
1853 if (arg_link_journal == LINK_GUEST &&
1856 if (errno == ENOTDIR) {
1857 log_error("%s already exists and is neither a symlink nor a directory", p);
1860 log_error_errno(errno, "Failed to remove %s: %m", p);
1864 } else if (r != -ENOENT) {
/* r still holds the result of readlink_and_make_absolute(). */
1865 log_error_errno(r, "readlink(%s) failed: %m", p);
1869 if (arg_link_journal == LINK_GUEST) {
1871 if (symlink(q, p) < 0) {
1872 if (arg_link_journal_try) {
1873 log_debug_errno(errno, "Failed to symlink %s to %s, skipping journal setup: %m", q, p);
1876 log_error_errno(errno, "Failed to symlink %s to %s: %m", q, p);
1881 r = mkdir_p(q, 0755);
1883 log_warning_errno(r, "Failed to create directory %s: %m", q);
1887 if (arg_link_journal == LINK_HOST) {
1888 /* don't create parents here -- if the host doesn't have
1889 * permanent journal set up, don't force it here */
1892 if (arg_link_journal_try) {
1893 log_debug_errno(errno, "Failed to create %s, skipping journal setup: %m", p);
1896 log_error_errno(errno, "Failed to create %s: %m", p);
1901 } else if (access(p, F_OK) < 0)
1904 if (dir_is_empty(q) == 0)
1905 log_warning("%s is not empty, proceeding anyway.", q);
1907 r = mkdir_p(q, 0755);
1909 log_error_errno(r, "Failed to create %s: %m", q);
1913 if (mount(p, q, "bind", MS_BIND, NULL) < 0)
1914 return log_error_errno(errno, "Failed to bind mount journal from host into guest: %m");
/* Drops every capability that is not set in arg_retain from the bounding
 * set by passing the complement of arg_retain to
 * capability_bounding_set_drop(), and returns the result of that call.
 * NOTE(review): the closing brace is not visible in this excerpt. */
1919 static int drop_capabilities(void) {
1920 return capability_bounding_set_drop(~arg_retain, false);
/* Registers the container with org.freedesktop.machine1 over the system bus.
 * With arg_keep_unit set the RegisterMachineWithNetwork method is called
 * directly. Otherwise a CreateMachineWithNetwork call is assembled by hand:
 * after the fixed arguments an array of (sv) properties is appended, holding
 * Slice (only if arg_slice is non-empty), DevicePolicy=strict, a DeviceAllow
 * list and every assignment from arg_property. The network interface index
 * is passed only when local_ifindex is positive.
 * NOTE(review): several argument lines, error checks and the final return
 * are not visible in this excerpt, and the first in-body comment lost its
 * closing line. */
1923 static int register_machine(pid_t pid, int local_ifindex) {
1924 _cleanup_bus_error_free_ sd_bus_error error = SD_BUS_ERROR_NULL;
1925 _cleanup_bus_close_unref_ sd_bus *bus = NULL;
1931 r = sd_bus_default_system(&bus);
1933 return log_error_errno(r, "Failed to open system bus: %m");
1935 if (arg_keep_unit) {
1936 r = sd_bus_call_method(
1938 "org.freedesktop.machine1",
1939 "/org/freedesktop/machine1",
1940 "org.freedesktop.machine1.Manager",
1941 "RegisterMachineWithNetwork",
1946 SD_BUS_MESSAGE_APPEND_ID128(arg_uuid),
1950 strempty(arg_directory),
1951 local_ifindex > 0 ? 1 : 0, local_ifindex);
1953 _cleanup_bus_message_unref_ sd_bus_message *m = NULL;
1956 r = sd_bus_message_new_method_call(
1959 "org.freedesktop.machine1",
1960 "/org/freedesktop/machine1",
1961 "org.freedesktop.machine1.Manager",
1962 "CreateMachineWithNetwork");
1964 return bus_log_create_error(r);
1966 r = sd_bus_message_append(
1970 SD_BUS_MESSAGE_APPEND_ID128(arg_uuid),
1974 strempty(arg_directory),
1975 local_ifindex > 0 ? 1 : 0, local_ifindex);
1977 return bus_log_create_error(r);
1979 r = sd_bus_message_open_container(m, 'a', "(sv)");
1981 return bus_log_create_error(r);
1983 if (!isempty(arg_slice)) {
1984 r = sd_bus_message_append(m, "(sv)", "Slice", "s", arg_slice);
1986 return bus_log_create_error(r);
1989 r = sd_bus_message_append(m, "(sv)", "DevicePolicy", "s", "strict");
1991 return bus_log_create_error(r);
1993 r = sd_bus_message_append(m, "(sv)", "DeviceAllow", "a(ss)", 9,
1994 /* Allow the container to
1995 * access and create the API
1996 * device nodes, so that
1997 * PrivateDevices= in the
1998 * container can work
2003 "/dev/random", "rwm",
2004 "/dev/urandom", "rwm",
2006 "/dev/net/tun", "rwm",
2007 /* Allow the container
2008 * access to ptys. However,
2010 * container to ever create
2011 * these device nodes. */
2012 "/dev/pts/ptmx", "rw",
2015 return log_error_errno(r, "Failed to add device whitelist: %m");
2017 STRV_FOREACH(i, arg_property) {
2018 r = sd_bus_message_open_container(m, 'r', "sv");
2020 return bus_log_create_error(r);
2022 r = bus_append_unit_property_assignment(m, *i);
2026 r = sd_bus_message_close_container(m);
2028 return bus_log_create_error(r);
2031 r = sd_bus_message_close_container(m);
2033 return bus_log_create_error(r);
2035 r = sd_bus_call(bus, m, 0, &error, NULL);
2039 log_error("Failed to register machine: %s", bus_error_message(&error, r));
/* Asks org.freedesktop.machine1 over the system bus to terminate the machine
 * that belongs to pid. A first call on the Manager interface yields the
 * object path of the machine, a second call on the Machine interface of that
 * path requests termination. Failure of either call is only logged at debug
 * level.
 * NOTE(review): the method names, arguments, error checks and returns are
 * not visible in this excerpt. */
2046 static int terminate_machine(pid_t pid) {
2047 _cleanup_bus_error_free_ sd_bus_error error = SD_BUS_ERROR_NULL;
2048 _cleanup_bus_message_unref_ sd_bus_message *reply = NULL;
2049 _cleanup_bus_close_unref_ sd_bus *bus = NULL;
2056 r = sd_bus_default_system(&bus);
2058 return log_error_errno(r, "Failed to open system bus: %m");
2060 r = sd_bus_call_method(
2062 "org.freedesktop.machine1",
2063 "/org/freedesktop/machine1",
2064 "org.freedesktop.machine1.Manager",
2071 /* Note that the machine might already have been
2072 * cleaned up automatically, hence don't consider it a
2073 * failure if we cannot get the machine object. */
2074 log_debug("Failed to get machine: %s", bus_error_message(&error, r));
2078 r = sd_bus_message_read(reply, "o", &path);
2080 return bus_log_parse_error(r);
2082 r = sd_bus_call_method(
2084 "org.freedesktop.machine1",
2086 "org.freedesktop.machine1.Machine",
2092 log_debug("Failed to terminate machine: %s", bus_error_message(&error, r));
/* Resets the audit login UID of the calling process to the unset value
 * 4294967295 by writing it to /proc/self/loginuid, unless the file already
 * holds that value. If the write fails a multi-line explanation is logged
 * together with the error string for r. The function first tests
 * arg_share_system; the branch taken when it is set is not visible here.
 * NOTE(review): error checks, the sleep announced in the message and the
 * returns are not visible in this excerpt. */
2099 static int reset_audit_loginuid(void) {
2100 _cleanup_free_ char *p = NULL;
2103 if (arg_share_system)
2106 r = read_one_line_file("/proc/self/loginuid", &p);
2110 return log_error_errno(r, "Failed to read /proc/self/loginuid: %m");
2112 /* Already reset? */
2113 if (streq(p, "4294967295"))
2116 r = write_string_file("/proc/self/loginuid", "4294967295");
2118 log_error("Failed to reset audit login UID. This probably means that your kernel is too\n"
2119 "old and you have audit enabled. Note that the auditing subsystem is known to\n"
2120 "be incompatible with containers on old kernels. Please make sure to upgrade\n"
2121 "your kernel or to turn off auditing with 'audit=0' on the kernel command line before\n"
2122 "using systemd-nspawn. Sleeping for 5s... (%s)\n", strerror(-r));
/* Fixed 128 bit keys handed to generate_mac() as hash_key. A distinct key is
 * used for the host side, the container side and MACVLAN interfaces. */
2130 #define HOST_HASH_KEY SD_ID128_MAKE(1a,37,6f,c7,46,ec,45,0b,ad,a3,d5,31,06,60,5d,b1)
2131 #define CONTAINER_HASH_KEY SD_ID128_MAKE(c3,c4,f9,19,b5,57,b2,1c,e6,cf,14,27,03,9c,ee,a2)
2132 #define MACVLAN_HASH_KEY SD_ID128_MAKE(00,13,6d,bc,66,83,44,81,bb,0c,f9,51,1f,24,a6,6f)
/* Derives a stable MAC address and stores it in *mac. The hashed input is
 * the machine ID of the host followed by the bytes of arg_machine; idx is
 * copied in after that. The buffer is hashed with siphash24() keyed by
 * hash_key, and the first ETH_ALEN bytes of the result become the address.
 * The multicast bit is then cleared and the locally administered bit set.
 * NOTE(review): the buffer allocation, the condition under which idx is
 * appended, error checks and the return are not visible in this excerpt. */
2134 static int generate_mac(struct ether_addr *mac, sd_id128_t hash_key, uint64_t idx) {
2140 l = strlen(arg_machine);
2141 sz = sizeof(sd_id128_t) + l;
2147 /* fetch some persistent data unique to the host */
2148 r = sd_id128_get_machine((sd_id128_t*) v);
2152 /* combine with some data unique (on this host) to this
2153 * container instance */
2154 i = mempcpy(v + sizeof(sd_id128_t), arg_machine, l);
2157 memcpy(i, &idx, sizeof(idx));
2160 /* Let's hash the host machine ID plus the container name. We
2161 * use a fixed, but originally randomly created hash key here. */
2162 siphash24(result, v, sz, hash_key.bytes);
2164 assert_cc(ETH_ALEN <= sizeof(result));
2165 memcpy(mac->ether_addr_octet, result, ETH_ALEN);
2167 /* see eth_random_addr in the kernel */
2168 mac->ether_addr_octet[0] &= 0xfe; /* clear multicast bit */
2169 mac->ether_addr_octet[0] |= 0x02; /* set local assignment bit (IEEE802) */
/* Creates a veth pair through rtnetlink. The host side name is written to
 * iface_name as vb-<machine> when arg_network_bridge is set and ve-<machine>
 * otherwise; the peer is named host0 and is placed into the network
 * namespace of pid via IFLA_NET_NS_PID. Both ends get MAC addresses from
 * generate_mac() with idx 0, using HOST_HASH_KEY and CONTAINER_HASH_KEY.
 * Afterwards the index of the host side interface is resolved with
 * if_nametoindex(). The function first tests arg_private_network and
 * arg_network_veth; the branches taken when they are unset are not visible.
 * NOTE(review): error checks, the store through ifi and the returns are not
 * visible in this excerpt. */
2174 static int setup_veth(pid_t pid, char iface_name[IFNAMSIZ], int *ifi) {
2175 _cleanup_rtnl_message_unref_ sd_rtnl_message *m = NULL;
2176 _cleanup_rtnl_unref_ sd_rtnl *rtnl = NULL;
2177 struct ether_addr mac_host, mac_container;
2180 if (!arg_private_network)
2183 if (!arg_network_veth)
2186 /* Use two different interface name prefixes depending whether
2187 * we are in bridge mode or not. */
2188 snprintf(iface_name, IFNAMSIZ - 1, "%s-%s",
2189 arg_network_bridge ? "vb" : "ve", arg_machine);
2191 r = generate_mac(&mac_container, CONTAINER_HASH_KEY, 0);
2193 return log_error_errno(r, "Failed to generate predictable MAC address for container side: %m");
2195 r = generate_mac(&mac_host, HOST_HASH_KEY, 0);
2197 return log_error_errno(r, "Failed to generate predictable MAC address for host side: %m");
2199 r = sd_rtnl_open(&rtnl, 0);
2201 return log_error_errno(r, "Failed to connect to netlink: %m");
2203 r = sd_rtnl_message_new_link(rtnl, &m, RTM_NEWLINK, 0);
2205 return log_error_errno(r, "Failed to allocate netlink message: %m");
/* Attributes of the host side end. */
2207 r = sd_rtnl_message_append_string(m, IFLA_IFNAME, iface_name);
2209 return log_error_errno(r, "Failed to add netlink interface name: %m");
2211 r = sd_rtnl_message_append_ether_addr(m, IFLA_ADDRESS, &mac_host);
2213 return log_error_errno(r, "Failed to add netlink MAC address: %m");
/* Three nested containers: link info, veth specific data, peer. */
2215 r = sd_rtnl_message_open_container(m, IFLA_LINKINFO);
2217 return log_error_errno(r, "Failed to open netlink container: %m");
2219 r = sd_rtnl_message_open_container_union(m, IFLA_INFO_DATA, "veth");
2221 return log_error_errno(r, "Failed to open netlink container: %m");
2223 r = sd_rtnl_message_open_container(m, VETH_INFO_PEER);
2225 return log_error_errno(r, "Failed to open netlink container: %m");
/* Attributes of the peer end that goes into the container. */
2227 r = sd_rtnl_message_append_string(m, IFLA_IFNAME, "host0");
2229 return log_error_errno(r, "Failed to add netlink interface name: %m");
2231 r = sd_rtnl_message_append_ether_addr(m, IFLA_ADDRESS, &mac_container);
2233 return log_error_errno(r, "Failed to add netlink MAC address: %m");
2235 r = sd_rtnl_message_append_u32(m, IFLA_NET_NS_PID, pid);
2237 return log_error_errno(r, "Failed to add netlink namespace field: %m");
2239 r = sd_rtnl_message_close_container(m);
2241 return log_error_errno(r, "Failed to close netlink container: %m");
2243 r = sd_rtnl_message_close_container(m);
2245 return log_error_errno(r, "Failed to close netlink container: %m");
2247 r = sd_rtnl_message_close_container(m);
2249 return log_error_errno(r, "Failed to close netlink container: %m");
2251 r = sd_rtnl_call(rtnl, m, 0, NULL);
2253 return log_error_errno(r, "Failed to add new veth interfaces: %m");
2255 i = (int) if_nametoindex(iface_name);
2257 return log_error_errno(errno, "Failed to resolve interface %s: %m", iface_name);
/* Attaches the interface named veth_name to the bridge named by
 * arg_network_bridge. The bridge index is resolved with if_nametoindex(),
 * then a single RTM_SETLINK message sets IFF_UP on the interface and sets
 * IFLA_MASTER to the bridge index. The function first tests
 * arg_private_network, arg_network_veth and arg_network_bridge; the branches
 * taken when they are unset are not visible here.
 * NOTE(review): error checks, the store through ifi and the returns are not
 * visible in this excerpt. */
2264 static int setup_bridge(const char veth_name[], int *ifi) {
2265 _cleanup_rtnl_message_unref_ sd_rtnl_message *m = NULL;
2266 _cleanup_rtnl_unref_ sd_rtnl *rtnl = NULL;
2269 if (!arg_private_network)
2272 if (!arg_network_veth)
2275 if (!arg_network_bridge)
2278 bridge = (int) if_nametoindex(arg_network_bridge);
2280 return log_error_errno(errno, "Failed to resolve interface %s: %m", arg_network_bridge);
2284 r = sd_rtnl_open(&rtnl, 0);
2286 return log_error_errno(r, "Failed to connect to netlink: %m");
2288 r = sd_rtnl_message_new_link(rtnl, &m, RTM_SETLINK, 0);
2290 return log_error_errno(r, "Failed to allocate netlink message: %m");
2292 r = sd_rtnl_message_link_set_flags(m, IFF_UP, IFF_UP);
2294 return log_error_errno(r, "Failed to set IFF_UP flag: %m");
2296 r = sd_rtnl_message_append_string(m, IFLA_IFNAME, veth_name);
2298 return log_error_errno(r, "Failed to add netlink interface name field: %m");
2300 r = sd_rtnl_message_append_u32(m, IFLA_MASTER, bridge);
2302 return log_error_errno(r, "Failed to add netlink master field: %m");
2304 r = sd_rtnl_call(rtnl, m, 0, NULL);
2306 return log_error_errno(r, "Failed to add veth interface to bridge: %m");
/* Resolves the network interface called name to its index and verifies
 * through udev that the device exists and has finished initialization. The
 * udev device is looked up by the id string made of the letter n followed by
 * the interface index. Callers in this file store the result as an interface
 * index (presumably the value of ifi is returned; confirm).
 * NOTE(review): error checks and the returns are not visible in this
 * excerpt. */
2311 static int parse_interface(struct udev *udev, const char *name) {
2312 _cleanup_udev_device_unref_ struct udev_device *d = NULL;
2313 char ifi_str[2 + DECIMAL_STR_MAX(int)];
2316 ifi = (int) if_nametoindex(name);
2318 return log_error_errno(errno, "Failed to resolve interface %s: %m", name);
2320 sprintf(ifi_str, "n%i", ifi);
2321 d = udev_device_new_from_device_id(udev, ifi_str);
2323 return log_error_errno(errno, "Failed to get udev device for interface %s: %m", name);
2325 if (udev_device_get_is_initialized(d) <= 0) {
2326 log_error("Network interface %s is not initialized yet.", name);
/* Moves every interface listed in arg_network_interfaces into the network
 * namespace of pid. Each name is resolved with parse_interface(), then an
 * RTM_SETLINK message for that index carrying IFLA_NET_NS_PID is sent. The
 * function first tests arg_private_network and whether the list is empty;
 * the branches taken in those cases are not visible here.
 * NOTE(review): creation of the udev context, error checks and the returns
 * are not visible in this excerpt. */
2333 static int move_network_interfaces(pid_t pid) {
2334 _cleanup_udev_unref_ struct udev *udev = NULL;
2335 _cleanup_rtnl_unref_ sd_rtnl *rtnl = NULL;
2339 if (!arg_private_network)
2342 if (strv_isempty(arg_network_interfaces))
2345 r = sd_rtnl_open(&rtnl, 0);
2347 return log_error_errno(r, "Failed to connect to netlink: %m");
2351 log_error("Failed to connect to udev.");
2355 STRV_FOREACH(i, arg_network_interfaces) {
2356 _cleanup_rtnl_message_unref_ sd_rtnl_message *m = NULL;
2359 ifi = parse_interface(udev, *i);
2363 r = sd_rtnl_message_new_link(rtnl, &m, RTM_SETLINK, ifi);
2365 return log_error_errno(r, "Failed to allocate netlink message: %m");
2367 r = sd_rtnl_message_append_u32(m, IFLA_NET_NS_PID, pid);
2369 return log_error_errno(r, "Failed to append namespace PID to netlink message: %m");
2371 r = sd_rtnl_call(rtnl, m, 0, NULL);
2373 return log_error_errno(r, "Failed to move interface %s to namespace: %m", *i);
/* Creates one MACVLAN interface in bridge mode for every entry of
 * arg_network_macvlan and places it into the network namespace of pid. The
 * new interface is linked to the index of the named parent interface, is
 * called mv-<parent> shortened to at most IFNAMSIZ-1 characters, and gets a
 * MAC address from generate_mac() with MACVLAN_HASH_KEY and an index that is
 * incremented per entry. The function first tests arg_private_network and
 * whether the list is empty; the branches taken then are not visible here.
 * NOTE(review): creation of the udev context, error checks and the returns
 * are not visible in this excerpt. */
2379 static int setup_macvlan(pid_t pid) {
2380 _cleanup_udev_unref_ struct udev *udev = NULL;
2381 _cleanup_rtnl_unref_ sd_rtnl *rtnl = NULL;
2386 if (!arg_private_network)
2389 if (strv_isempty(arg_network_macvlan))
2392 r = sd_rtnl_open(&rtnl, 0);
2394 return log_error_errno(r, "Failed to connect to netlink: %m");
2398 log_error("Failed to connect to udev.");
2402 STRV_FOREACH(i, arg_network_macvlan) {
2403 _cleanup_rtnl_message_unref_ sd_rtnl_message *m = NULL;
2404 _cleanup_free_ char *n = NULL;
2405 struct ether_addr mac;
2408 ifi = parse_interface(udev, *i);
2412 r = generate_mac(&mac, MACVLAN_HASH_KEY, idx++);
2414 return log_error_errno(r, "Failed to create MACVLAN MAC address: %m");
2416 r = sd_rtnl_message_new_link(rtnl, &m, RTM_NEWLINK, 0);
2418 return log_error_errno(r, "Failed to allocate netlink message: %m");
2420 r = sd_rtnl_message_append_u32(m, IFLA_LINK, ifi);
2422 return log_error_errno(r, "Failed to add netlink interface index: %m");
2424 n = strappend("mv-", *i);
/* Keep the generated name within the interface name length limit. */
2428 strshorten(n, IFNAMSIZ-1);
2430 r = sd_rtnl_message_append_string(m, IFLA_IFNAME, n);
2432 return log_error_errno(r, "Failed to add netlink interface name: %m");
2434 r = sd_rtnl_message_append_ether_addr(m, IFLA_ADDRESS, &mac);
2436 return log_error_errno(r, "Failed to add netlink MAC address: %m");
2438 r = sd_rtnl_message_append_u32(m, IFLA_NET_NS_PID, pid);
2440 return log_error_errno(r, "Failed to add netlink namespace field: %m");
2442 r = sd_rtnl_message_open_container(m, IFLA_LINKINFO);
2444 return log_error_errno(r, "Failed to open netlink container: %m");
2446 r = sd_rtnl_message_open_container_union(m, IFLA_INFO_DATA, "macvlan");
2448 return log_error_errno(r, "Failed to open netlink container: %m");
2450 r = sd_rtnl_message_append_u32(m, IFLA_MACVLAN_MODE, MACVLAN_MODE_BRIDGE);
2452 return log_error_errno(r, "Failed to append macvlan mode: %m");
2454 r = sd_rtnl_message_close_container(m);
2456 return log_error_errno(r, "Failed to close netlink container: %m");
2458 r = sd_rtnl_message_close_container(m);
2460 return log_error_errno(r, "Failed to close netlink container: %m");
2462 r = sd_rtnl_call(rtnl, m, 0, NULL);
2464 return log_error_errno(r, "Failed to add new macvlan interfaces: %m");
/* Creates one IPVLAN interface in L2 mode for every entry of
 * arg_network_ipvlan and places it into the network namespace of pid. The
 * new interface is linked to the index of the named parent interface and is
 * called iv-<parent> shortened to at most IFNAMSIZ-1 characters. No MAC
 * address is assigned here. The function first tests arg_private_network and
 * whether the list is empty; the branches taken then are not visible here.
 * NOTE(review): creation of the udev context, error checks and the returns
 * are not visible in this excerpt. */
2470 static int setup_ipvlan(pid_t pid) {
2471 _cleanup_udev_unref_ struct udev *udev = NULL;
2472 _cleanup_rtnl_unref_ sd_rtnl *rtnl = NULL;
2476 if (!arg_private_network)
2479 if (strv_isempty(arg_network_ipvlan))
2482 r = sd_rtnl_open(&rtnl, 0);
2484 return log_error_errno(r, "Failed to connect to netlink: %m");
2488 log_error("Failed to connect to udev.");
2492 STRV_FOREACH(i, arg_network_ipvlan) {
2493 _cleanup_rtnl_message_unref_ sd_rtnl_message *m = NULL;
2494 _cleanup_free_ char *n = NULL;
2497 ifi = parse_interface(udev, *i);
2501 r = sd_rtnl_message_new_link(rtnl, &m, RTM_NEWLINK, 0);
2503 return log_error_errno(r, "Failed to allocate netlink message: %m");
2505 r = sd_rtnl_message_append_u32(m, IFLA_LINK, ifi);
2507 return log_error_errno(r, "Failed to add netlink interface index: %m");
2509 n = strappend("iv-", *i);
/* Keep the generated name within the interface name length limit. */
2513 strshorten(n, IFNAMSIZ-1);
2515 r = sd_rtnl_message_append_string(m, IFLA_IFNAME, n);
2517 return log_error_errno(r, "Failed to add netlink interface name: %m");
2519 r = sd_rtnl_message_append_u32(m, IFLA_NET_NS_PID, pid);
2521 return log_error_errno(r, "Failed to add netlink namespace field: %m");
2523 r = sd_rtnl_message_open_container(m, IFLA_LINKINFO);
2525 return log_error_errno(r, "Failed to open netlink container: %m");
2527 r = sd_rtnl_message_open_container_union(m, IFLA_INFO_DATA, "ipvlan");
2529 return log_error_errno(r, "Failed to open netlink container: %m");
2531 r = sd_rtnl_message_append_u16(m, IFLA_IPVLAN_MODE, IPVLAN_MODE_L2);
2533 return log_error_errno(r, "Failed to add ipvlan mode: %m");
2535 r = sd_rtnl_message_close_container(m);
2537 return log_error_errno(r, "Failed to close netlink container: %m");
2539 r = sd_rtnl_message_close_container(m);
2541 return log_error_errno(r, "Failed to close netlink container: %m");
2543 r = sd_rtnl_call(rtnl, m, 0, NULL);
2545 return log_error_errno(r, "Failed to add new ipvlan interfaces: %m");
/* Installs a seccomp filter whose default action is to allow. Every syscall
 * in blacklist fails with EPERM; the syscalls in kmod_blacklist are blocked
 * the same way unless CAP_SYS_MODULE is set in arg_retain. A further rule
 * makes socket creation for AF_NETLINK with protocol NETLINK_AUDIT fail with
 * EAFNOSUPPORT. The NO_NEW_PRIVS filter attribute is set to 0 before the
 * filter is loaded, and the filter context is released afterwards.
 * NOTE(review): several list entries, error checks, the returns and the
 * delimiters of the block comment about audit are not visible in this
 * excerpt. */
2551 static int setup_seccomp(void) {
2554 static const int blacklist[] = {
2555 SCMP_SYS(kexec_load),
2556 SCMP_SYS(open_by_handle_at),
2563 static const int kmod_blacklist[] = {
2564 SCMP_SYS(init_module),
2565 SCMP_SYS(finit_module),
2566 SCMP_SYS(delete_module),
2569 scmp_filter_ctx seccomp;
2573 seccomp = seccomp_init(SCMP_ACT_ALLOW);
2577 r = seccomp_add_secondary_archs(seccomp);
2579 log_error_errno(r, "Failed to add secondary archs to seccomp filter: %m");
2583 for (i = 0; i < ELEMENTSOF(blacklist); i++) {
2584 r = seccomp_rule_add(seccomp, SCMP_ACT_ERRNO(EPERM), blacklist[i], 0);
2586 continue; /* unknown syscall */
2588 log_error_errno(r, "Failed to block syscall: %m");
2593 /* If the CAP_SYS_MODULE capability is not requested then
2594 * we'll block the kmod syscalls too */
2595 if (!(arg_retain & (1ULL << CAP_SYS_MODULE))) {
2596 for (i = 0; i < ELEMENTSOF(kmod_blacklist); i++) {
2597 r = seccomp_rule_add(seccomp, SCMP_ACT_ERRNO(EPERM), kmod_blacklist[i], 0);
2599 continue; /* unknown syscall */
2601 log_error_errno(r, "Failed to block syscall: %m");
2608 Audit is broken in containers, much of the userspace audit
2609 hookup will fail if running inside a container. We don't
2610 care and just turn off creation of audit sockets.
2612 This will make socket(AF_NETLINK, *, NETLINK_AUDIT) fail
2613 with EAFNOSUPPORT which audit userspace uses as indication
2614 that audit is disabled in the kernel.
2617 r = seccomp_rule_add(
2619 SCMP_ACT_ERRNO(EAFNOSUPPORT),
2622 SCMP_A0(SCMP_CMP_EQ, AF_NETLINK),
2623 SCMP_A2(SCMP_CMP_EQ, NETLINK_AUDIT));
2625 log_error_errno(r, "Failed to add audit seccomp rule: %m");
2629 r = seccomp_attr_set(seccomp, SCMP_FLTATR_CTL_NNP, 0);
2631 log_error_errno(r, "Failed to unset NO_NEW_PRIVS: %m");
2635 r = seccomp_load(seccomp);
2637 log_error_errno(r, "Failed to install seccomp audit filter: %m");
2640 seccomp_release(seccomp);
/* Prepares the directory used to propagate mounts from the host into the
 * container. The host side directory /run/systemd/nspawn/propagate/<machine>
 * is created on a best-effort basis (results deliberately ignored), then it
 * is bind mounted onto <root>/run/systemd/nspawn/incoming and that mount is
 * remounted read-only. Both mount failures are logged with the errno text
 * (via the %m conversion) and returned.
 * NOTE(review): the declarations of p and q, creation of the mount point
 * itself and the final return are not visible in this excerpt. */
2648 static int setup_propagate(const char *root) {
2651 (void) mkdir_p("/run/systemd/nspawn/", 0755);
2652 (void) mkdir_p("/run/systemd/nspawn/propagate", 0600);
2653 p = strjoina("/run/systemd/nspawn/propagate/", arg_machine);
2654 (void) mkdir_p(p, 0600);
2656 q = strjoina(root, "/run/systemd/nspawn/incoming");
2657 mkdir_parents(q, 0755);
2660 if (mount(p, q, NULL, MS_BIND, NULL) < 0)
2661 return log_error_errno(errno, "Failed to install propagation bind mount: %m");
2663 if (mount(NULL, q, NULL, MS_BIND|MS_REMOUNT|MS_RDONLY, NULL) < 0)
2664 return log_error_errno(errno, "Failed to make propagation mount read-only: %m");
/* Opens the disk image named by arg_image and makes it available as a block
 * device. If the image already is a block device its path is duplicated.
 * Otherwise it must be a regular file: a free loop device is requested from
 * /dev/loop-control, opened, bound to the image with LOOP_SET_FD and
 * configured with LO_FLAGS_AUTOCLEAR and LO_FLAGS_PARTSCAN (plus
 * LO_FLAGS_READ_ONLY on the line guarded by a condition not visible here).
 * The resulting device path is stored in *device_path. The image and the
 * loop device are opened read-only when arg_read_only is set.
 * The wrong-file-type case is reported with log_error() because no call has
 * failed at that point, so errno carries no meaningful value.
 * NOTE(review): declarations, error checks, the handling of loop_nr, the
 * ownership hand-over of the strings and the returns are not visible in
 * this excerpt. */
2669 static int setup_image(char **device_path, int *loop_nr) {
2670 struct loop_info64 info = {
2671 .lo_flags = LO_FLAGS_AUTOCLEAR|LO_FLAGS_PARTSCAN
2673 _cleanup_close_ int fd = -1, control = -1, loop = -1;
2674 _cleanup_free_ char* loopdev = NULL;
2678 assert(device_path);
2682 fd = open(arg_image, O_CLOEXEC|(arg_read_only ? O_RDONLY : O_RDWR)|O_NONBLOCK|O_NOCTTY);
2684 return log_error_errno(errno, "Failed to open %s: %m", arg_image);
2686 if (fstat(fd, &st) < 0)
2687 return log_error_errno(errno, "Failed to stat %s: %m", arg_image);
2689 if (S_ISBLK(st.st_mode)) {
2692 p = strdup(arg_image);
2706 if (!S_ISREG(st.st_mode)) {
2707 log_error("%s is not a regular file or block device.", arg_image);
2711 control = open("/dev/loop-control", O_RDWR|O_CLOEXEC|O_NOCTTY|O_NONBLOCK);
2713 return log_error_errno(errno, "Failed to open /dev/loop-control: %m");
2715 nr = ioctl(control, LOOP_CTL_GET_FREE);
2717 return log_error_errno(errno, "Failed to allocate loop device: %m");
2719 if (asprintf(&loopdev, "/dev/loop%i", nr) < 0)
2722 loop = open(loopdev, O_CLOEXEC|(arg_read_only ? O_RDONLY : O_RDWR)|O_NONBLOCK|O_NOCTTY);
2724 return log_error_errno(errno, "Failed to open loop device %s: %m", loopdev);
2726 if (ioctl(loop, LOOP_SET_FD, fd) < 0)
2727 return log_error_errno(errno, "Failed to set loopback file descriptor on %s: %m", loopdev);
2730 info.lo_flags |= LO_FLAGS_READ_ONLY;
2732 if (ioctl(loop, LOOP_SET_STATUS64, &info) < 0)
2733 return log_error_errno(errno, "Failed to set loopback settings on %s: %m", loopdev);
2735 *device_path = loopdev;
/* Explanatory text appended to error messages about unusable partition
 * tables; it lists the image layouts that are accepted. */
2746 #define PARTITION_TABLE_BLURB \
2747 "Note that the disk image needs to either contain only a single MBR partition of\n" \
2748 "type 0x83 that is marked bootable, or a single GPT partition of type " \
2749 "0FC63DAF-8483-4772-8E79-3D69D8477DE4 or follow\n" \
2750 " http://www.freedesktop.org/wiki/Specifications/DiscoverablePartitionsSpec/\n" \
2751 "to be bootable with systemd-nspawn."
2753 static int dissect_image(
2755 char **root_device, bool *root_device_rw,
2756 char **home_device, bool *home_device_rw,
2757 char **srv_device, bool *srv_device_rw,
2761 int home_nr = -1, srv_nr = -1;
2762 #ifdef GPT_ROOT_NATIVE
2765 #ifdef GPT_ROOT_SECONDARY
2766 int secondary_root_nr = -1;
2768 _cleanup_free_ char *home = NULL, *root = NULL, *secondary_root = NULL, *srv = NULL, *generic = NULL;
2769 _cleanup_udev_enumerate_unref_ struct udev_enumerate *e = NULL;
2770 _cleanup_udev_device_unref_ struct udev_device *d = NULL;
2771 _cleanup_blkid_free_probe_ blkid_probe b = NULL;
2772 _cleanup_udev_unref_ struct udev *udev = NULL;
2773 struct udev_list_entry *first, *item;
2774 bool home_rw = true, root_rw = true, secondary_root_rw = true, srv_rw = true, generic_rw = true;
2775 bool is_gpt, is_mbr, multiple_generic = false;
2776 const char *pttype = NULL;
2783 assert(root_device);
2784 assert(home_device);
2789 b = blkid_new_probe();
2794 r = blkid_probe_set_device(b, fd, 0, 0);
2799 log_error_errno(errno, "Failed to set device on blkid probe: %m");
2803 blkid_probe_enable_partitions(b, 1);
2804 blkid_probe_set_partitions_flags(b, BLKID_PARTS_ENTRY_DETAILS);
2807 r = blkid_do_safeprobe(b);
2808 if (r == -2 || r == 1) {
2809 log_error("Failed to identify any partition table on\n"
2811 PARTITION_TABLE_BLURB, arg_image);
2813 } else if (r != 0) {
2816 log_error_errno(errno, "Failed to probe: %m");
2820 blkid_probe_lookup_value(b, "PTTYPE", &pttype, NULL);
2822 is_gpt = streq_ptr(pttype, "gpt");
2823 is_mbr = streq_ptr(pttype, "dos");
2825 if (!is_gpt && !is_mbr) {
2826 log_error("No GPT or MBR partition table discovered on\n"
2828 PARTITION_TABLE_BLURB, arg_image);
2833 pl = blkid_probe_get_partitions(b);
2838 log_error("Failed to list partitions of %s", arg_image);
2846 if (fstat(fd, &st) < 0)
2847 return log_error_errno(errno, "Failed to stat block device: %m");
2849 d = udev_device_new_from_devnum(udev, 'b', st.st_rdev);
2857 log_error("Kernel partitions never appeared.");
2861 e = udev_enumerate_new(udev);
2865 r = udev_enumerate_add_match_parent(e, d);
2869 r = udev_enumerate_scan_devices(e);
2871 return log_error_errno(r, "Failed to scan for partition devices of %s: %m", arg_image);
2873 /* Count the partitions enumerated by the kernel */
2875 first = udev_enumerate_get_list_entry(e);
2876 udev_list_entry_foreach(item, first)
2879 /* Count the partitions enumerated by blkid */
2880 m = blkid_partlist_numof_partitions(pl);
2884 log_error("blkid and kernel partition list do not match.");
2890 /* The kernel has probed fewer partitions than
2891 * blkid? Maybe the kernel prober is still
2892 * running or it got EBUSY because udev
2893 * already opened the device. Let's reprobe
2894 * the device, which is a synchronous call
2895 * that waits until probing is complete. */
2897 for (j = 0; j < 20; j++) {
2899 r = ioctl(fd, BLKRRPART, 0);
2902 if (r >= 0 || r != -EBUSY)
2905 /* If something else has the device
2906 * open, such as an udev rule, the
2907 * ioctl will return EBUSY. Since
2908 * there's no way to wait until it
2909 * isn't busy anymore, let's just wait
2910 * a bit, and try again.
2912 * This is really something they
2913 * should fix in the kernel! */
2915 usleep(50 * USEC_PER_MSEC);
2919 return log_error_errno(r, "Failed to reread partition table: %m");
2922 e = udev_enumerate_unref(e);
2925 first = udev_enumerate_get_list_entry(e);
2926 udev_list_entry_foreach(item, first) {
2927 _cleanup_udev_device_unref_ struct udev_device *q;
2929 unsigned long long flags;
2935 q = udev_device_new_from_syspath(udev, udev_list_entry_get_name(item));
2940 log_error_errno(errno, "Failed to get partition device of %s: %m", arg_image);
2944 qn = udev_device_get_devnum(q);
2948 if (st.st_rdev == qn)
2951 node = udev_device_get_devnode(q);
2955 pp = blkid_partlist_devno_to_partition(pl, qn);
2959 flags = blkid_partition_get_flags(pp);
2961 nr = blkid_partition_get_partno(pp);
2969 if (flags & GPT_FLAG_NO_AUTO)
2972 stype = blkid_partition_get_type_string(pp);
2976 if (sd_id128_from_string(stype, &type_id) < 0)
2979 if (sd_id128_equal(type_id, GPT_HOME)) {
2981 if (home && nr >= home_nr)
2985 home_rw = !(flags & GPT_FLAG_READ_ONLY);
2987 r = free_and_strdup(&home, node);
2991 } else if (sd_id128_equal(type_id, GPT_SRV)) {
2993 if (srv && nr >= srv_nr)
2997 srv_rw = !(flags & GPT_FLAG_READ_ONLY);
2999 r = free_and_strdup(&srv, node);
3003 #ifdef GPT_ROOT_NATIVE
3004 else if (sd_id128_equal(type_id, GPT_ROOT_NATIVE)) {
3006 if (root && nr >= root_nr)
3010 root_rw = !(flags & GPT_FLAG_READ_ONLY);
3012 r = free_and_strdup(&root, node);
3017 #ifdef GPT_ROOT_SECONDARY
3018 else if (sd_id128_equal(type_id, GPT_ROOT_SECONDARY)) {
3020 if (secondary_root && nr >= secondary_root_nr)
3023 secondary_root_nr = nr;
3024 secondary_root_rw = !(flags & GPT_FLAG_READ_ONLY);
3026 r = free_and_strdup(&secondary_root, node);
3031 else if (sd_id128_equal(type_id, GPT_LINUX_GENERIC)) {
3034 multiple_generic = true;
3036 generic_rw = !(flags & GPT_FLAG_READ_ONLY);
3038 r = free_and_strdup(&generic, node);
3044 } else if (is_mbr) {
3047 if (flags != 0x80) /* Bootable flag */
3050 type = blkid_partition_get_type(pp);
3051 if (type != 0x83) /* Linux partition */
3055 multiple_generic = true;
3059 r = free_and_strdup(&root, node);
3067 *root_device = root;
3070 *root_device_rw = root_rw;
3072 } else if (secondary_root) {
3073 *root_device = secondary_root;
3074 secondary_root = NULL;
3076 *root_device_rw = secondary_root_rw;
3078 } else if (generic) {
3080 /* There were no partitions with precise meanings
3081 * around, but we found generic partitions. In this
3082 * case, if there's only one, we can go ahead and boot
3083 * it, otherwise we bail out, because we really cannot
3084 * make any sense of it. */
3086 if (multiple_generic) {
3087 log_error("Identified multiple bootable Linux partitions on\n"
3089 PARTITION_TABLE_BLURB, arg_image);
3093 *root_device = generic;
3096 *root_device_rw = generic_rw;
3099 log_error("Failed to identify root partition in disk image\n"
3101 PARTITION_TABLE_BLURB, arg_image);
3106 *home_device = home;
3109 *home_device_rw = home_rw;
3116 *srv_device_rw = srv_rw;
3121 log_error("--image= is not supported, compiled without blkid support.");
/* Mount the block device `what` on the path obtained by joining `where` and
 * `directory`, using the file system type libblkid reports for the device.
 * The mount is read-only unless `rw` is true. Failures are logged here.
 * NOTE(review): several lines of this function (declarations, return
 * statements, preprocessor guards) are not visible in this extract; the
 * comments below describe only the visible statements. */
3126 static int mount_device(const char *what, const char *where, const char *directory, bool rw) {
3128 _cleanup_blkid_free_probe_ blkid_probe b = NULL;
3129 const char *fstype, *p;
/* Full path of the mount point */
3139 p = strjoina(where, directory);
3144 b = blkid_new_probe_from_filename(what);
3148 log_error_errno(errno, "Failed to allocate prober for %s: %m", what);
/* Enable superblock probing and ask for the TYPE value only */
3152 blkid_probe_enable_superblocks(b, 1);
3153 blkid_probe_set_superblocks_flags(b, BLKID_SUBLKS_TYPE);
/* A result of -1 or 1 is reported as "cannot determine", any other
 * non-zero result as a probing failure */
3156 r = blkid_do_safeprobe(b);
3157 if (r == -1 || r == 1) {
3158 log_error("Cannot determine file system type of %s", what);
3160 } else if (r != 0) {
3163 log_error_errno(errno, "Failed to probe %s: %m", what);
3168 if (blkid_probe_lookup_value(b, "TYPE", &fstype, NULL) < 0) {
3171 log_error("Failed to determine file system type of %s", what);
/* LUKS containers are recognized by their blkid TYPE and refused */
3175 if (streq(fstype, "crypto_LUKS")) {
3176 log_error("nspawn currently does not support LUKS disk images.");
/* MS_NODEV is always set; MS_RDONLY is added when rw is false */
3180 if (mount(what, p, fstype, MS_NODEV|(rw ? 0 : MS_RDONLY), NULL) < 0)
3181 return log_error_errno(errno, "Failed to mount %s: %m", what);
/* Message for builds without blkid support; presumably inside an #else
 * branch that is not visible here -- confirm */
3185 log_error("--image= is not supported, compiled without blkid support.");
/* Mount the partitions of a disk image below the container root: the root
 * device on arg_directory itself, the home device on "/home" and the server
 * data device on "/srv", each with its own read-write flag. A failing mount
 * is logged and its negative error code returned.
 * NOTE(review): the lines between the calls are not visible in this
 * extract; presumably each mount is skipped when its device is unset --
 * confirm. */
3190 static int mount_devices(
3192 const char *root_device, bool root_device_rw,
3193 const char *home_device, bool home_device_rw,
3194 const char *srv_device, bool srv_device_rw) {
3200 r = mount_device(root_device, arg_directory, NULL, root_device_rw);
3202 return log_error_errno(r, "Failed to mount root directory: %m");
3206 r = mount_device(home_device, arg_directory, "/home", home_device_rw);
3208 return log_error_errno(r, "Failed to mount home directory: %m");
3212 r = mount_device(srv_device, arg_directory, "/srv", srv_device_rw);
3214 return log_error_errno(r, "Failed to mount server data directory: %m");
/* Best-effort teardown of loop device number `nr`. Nothing is reported to
 * the caller (the function returns void); failures are only logged. */
3220 static void loop_remove(int nr, int *image_fd) {
3221 _cleanup_close_ int control = -1;
/* If the caller still holds the loop device open, issue LOOP_CLR_FD on it
 * and close the descriptor, storing safe_close()'s result in *image_fd */
3227 if (image_fd && *image_fd >= 0) {
3228 r = ioctl(*image_fd, LOOP_CLR_FD);
3230 log_debug_errno(errno, "Failed to close loop image: %m");
3231 *image_fd = safe_close(*image_fd);
/* Ask the loop control device to remove the loop device itself */
3234 control = open("/dev/loop-control", O_RDWR|O_CLOEXEC|O_NOCTTY|O_NONBLOCK);
3236 log_warning_errno(errno, "Failed to open /dev/loop-control: %m");
3240 r = ioctl(control, LOOP_CTL_REMOVE, nr);
3242 log_debug_errno(errno, "Failed to remove loop %d: %m", nr);
/* Fork a child that runs "getent <database> <key>" with its standard
 * output connected to a pipe.
 * NOTE(review): the fork call, the assignment to *rpid and the return
 * statement are not visible in this extract; the callers in this file use
 * the return value as a readable file descriptor and wait on the PID --
 * confirm against the full source. */
3245 static int spawn_getent(const char *database, const char *key, pid_t *rpid) {
3253 if (pipe2(pipe_fds, O_CLOEXEC) < 0)
3254 return log_error_errno(errno, "Failed to allocate pipe: %m");
3258 return log_error_errno(errno, "Failed to fork getent child: %m");
3259 else if (pid == 0) {
/* Child: getent is executed with an empty environment */
3261 char *empty_env = NULL;
/* Standard output becomes the write end of the pipe */
3263 if (dup3(pipe_fds[1], STDOUT_FILENO, 0) < 0)
3264 _exit(EXIT_FAILURE);
/* Close the original pipe descriptors unless they are one of the
 * standard descriptors 0-2 */
3266 if (pipe_fds[0] > 2)
3267 safe_close(pipe_fds[0]);
3268 if (pipe_fds[1] > 2)
3269 safe_close(pipe_fds[1]);
/* Standard input and standard error are pointed at /dev/null */
3271 nullfd = open("/dev/null", O_RDWR);
3273 _exit(EXIT_FAILURE);
3275 if (dup3(nullfd, STDIN_FILENO, 0) < 0)
3276 _exit(EXIT_FAILURE);
3278 if (dup3(nullfd, STDERR_FILENO, 0) < 0)
3279 _exit(EXIT_FAILURE);
3284 reset_all_signal_handlers();
3285 close_all_fds(NULL, 0);
/* Try both common locations of the binary; the second execle() and the
 * _exit() are reached only if the preceding execle() failed */
3287 execle("/usr/bin/getent", "getent", database, key, NULL, &empty_env);
3288 execle("/bin/getent", "getent", database, key, NULL, &empty_env);
3289 _exit(EXIT_FAILURE);
/* Parent: the write end is not needed on this side */
3292 pipe_fds[1] = safe_close(pipe_fds[1]);
/* Switch the calling process to the credentials of arg_user. For an unset
 * user, "root" or "0" everything is reset to 0. Otherwise the passwd entry
 * and the supplementary groups are resolved through spawn_getent(), the
 * home directory is created, the standard descriptors are chown()ed to the
 * user and finally groups, GID and UID are changed.
 * NOTE(review): many lines (field splitting, error returns, the hand-over
 * of the home directory through _home) are not visible in this extract;
 * comments describe only the visible statements. */
3299 static int change_uid_gid(char **_home) {
3300 char line[LINE_MAX], *x, *u, *g, *h;
3301 const char *word, *state;
3302 _cleanup_free_ uid_t *uids = NULL;
3303 _cleanup_free_ char *home = NULL;
3304 _cleanup_fclose_ FILE *f = NULL;
3305 _cleanup_close_ int fd = -1;
3306 unsigned n_uids = 0;
3315 if (!arg_user || streq(arg_user, "root") || streq(arg_user, "0")) {
3316 /* Reset everything fully to 0, just in case */
3318 if (setgroups(0, NULL) < 0)
3319 return log_error_errno(errno, "setgroups() failed: %m");
/* NOTE(review): the two messages below name setregid()/setreuid() while
 * the calls are setresgid()/setresuid() */
3321 if (setresgid(0, 0, 0) < 0)
3322 return log_error_errno(errno, "setregid() failed: %m");
3324 if (setresuid(0, 0, 0) < 0)
3325 return log_error_errno(errno, "setreuid() failed: %m");
3331 /* First, get user credentials */
3332 fd = spawn_getent("passwd", arg_user, &pid);
3336 f = fdopen(fd, "r");
/* Only the first line of the getent output is read */
3341 if (!fgets(line, sizeof(line), f)) {
3344 log_error("Failed to resolve user %s.", arg_user);
3348 log_error_errno(errno, "Failed to read from getent: %m");
3354 wait_for_terminate_and_warn("getent passwd", pid, true);
/* Walk the ':' separated passwd fields; each error message names the
 * field whose terminating separator could not be found */
3356 x = strchr(line, ':');
3358 log_error("/etc/passwd entry has invalid user field.");
3362 u = strchr(x+1, ':');
3364 log_error("/etc/passwd entry has invalid password field.");
3371 log_error("/etc/passwd entry has invalid UID field.");
3379 log_error("/etc/passwd entry has invalid GID field.");
3384 h = strchr(x+1, ':');
3386 log_error("/etc/passwd entry has invalid GECOS field.");
3393 log_error("/etc/passwd entry has invalid home directory field.");
3399 r = parse_uid(u, &uid);
3401 log_error("Failed to parse UID of user.");
3405 r = parse_gid(g, &gid);
3407 log_error("Failed to parse GID of user.");
3415 /* Second, get group memberships */
3416 fd = spawn_getent("initgroups", arg_user, &pid);
3421 f = fdopen(fd, "r");
3426 if (!fgets(line, sizeof(line), f)) {
3428 log_error("Failed to resolve user %s.", arg_user);
3432 log_error_errno(errno, "Failed to read from getent: %m");
3438 wait_for_terminate_and_warn("getent initgroups", pid, true);
3440 /* Skip over the username and subsequent separator whitespace */
3442 x += strcspn(x, WHITESPACE);
3443 x += strspn(x, WHITESPACE);
/* Every remaining word is parsed as a numeric ID and appended to uids[],
 * which is later passed to setgroups() */
3445 FOREACH_WORD(word, l, x, state) {
3451 if (!GREEDY_REALLOC(uids, sz, n_uids+1))
3454 r = parse_uid(c, &uids[n_uids++]);
3456 log_error("Failed to parse group data from getent.");
/* Create the home directory; an already existing one (-EEXIST) is
 * accepted */
3461 r = mkdir_parents(home, 0775);
3463 return log_error_errno(r, "Failed to make home root directory: %m");
3465 r = mkdir_safe(home, 0755, uid, gid);
3466 if (r < 0 && r != -EEXIST)
3467 return log_error_errno(r, "Failed to make home directory: %m");
/* Hand the standard descriptors to the user.
 * NOTE(review): the results of these fchown() calls are ignored */
3469 fchown(STDIN_FILENO, uid, gid);
3470 fchown(STDOUT_FILENO, uid, gid);
3471 fchown(STDERR_FILENO, uid, gid);
/* Supplementary groups and GID are changed before the UID, presumably
 * because those calls need the privileges given up with the UID.
 * NOTE(review): the last two messages name setregid()/setreuid() while
 * the calls are setresgid()/setresuid() */
3473 if (setgroups(n_uids, uids) < 0)
3474 return log_error_errno(errno, "Failed to set auxiliary groups: %m");
3476 if (setresgid(gid, gid, gid) < 0)
3477 return log_error_errno(errno, "setregid() failed: %m");
3479 if (setresuid(uid, uid, uid) < 0)
3480 return log_error_errno(errno, "setreuid() failed: %m");
3492 * < 0 : wait_for_terminate() failed to get the state of the
3493 * container, the container was terminated by a signal, or
3494 * failed for an unknown reason. No change is made to the
3495 * container argument.
3496 * > 0 : The program executed in the container terminated with an
3497 * error. The exit code of the program executed in the
3498 * container is returned. The container argument has been set
3499 * to CONTAINER_TERMINATED.
3500 * 0 : The container is being rebooted, has been shut down or exited
3501 * successfully. The container argument has been set to either
3502 * CONTAINER_TERMINATED or CONTAINER_REBOOTED.
3504 * That is, success is indicated by a return value of zero, and an
3505 * error is indicated by a non-zero value.
/* Wait for the process `pid` to terminate and translate the way it ended
 * into a return value and a ContainerStatus stored in *container.
 * NOTE(review): the case labels and several return statements are not
 * visible in this extract. */
3507 static int wait_for_container(pid_t pid, ContainerStatus *container) {
3511 r = wait_for_terminate(pid, &status);
3513 return log_warning_errno(r, "Failed to wait for container: %m");
/* Dispatch on how the process ended */
3515 switch (status.si_code) {
/* Regular exit: logged at debug level when arg_quiet is set and at info
 * level otherwise; the exit status is handed back to the caller */
3518 if (status.si_status == 0) {
3519 log_full(arg_quiet ? LOG_DEBUG : LOG_INFO, "Container %s exited successfully.", arg_machine);
3522 log_full(arg_quiet ? LOG_DEBUG : LOG_INFO, "Container %s failed with error code %i.", arg_machine, status.si_status);
3524 *container = CONTAINER_TERMINATED;
3525 return status.si_status;
/* A status of SIGINT is reported as a shutdown, SIGHUP as a reboot */
3528 if (status.si_status == SIGINT) {
3530 log_full(arg_quiet ? LOG_DEBUG : LOG_INFO, "Container %s has been shut down.", arg_machine);
3531 *container = CONTAINER_TERMINATED;
3534 } else if (status.si_status == SIGHUP) {
3536 log_full(arg_quiet ? LOG_DEBUG : LOG_INFO, "Container %s is being rebooted.", arg_machine);
3537 *container = CONTAINER_REBOOTED;
3541 /* CLD_KILLED fallthrough */
3544 log_error("Container %s terminated by signal %s.", arg_machine, signal_to_string(status.si_status));
3548 log_error("Container %s failed due to unknown reason.", arg_machine);
/* Signal handler that does nothing; main() installs it for SIGCHLD so that
 * the signal interrupts blocking calls in the parent. */
3555 static void nop_handler(int sig) {}
/* Event loop signal callback, registered by main() for SIGINT and SIGTERM
 * with the container PID as userdata. It sends SIGRTMIN+3 to that PID and,
 * if that worked, clears the userdata of the event source.
 * NOTE(review): the branch structure is not fully visible here; judging by
 * the log message, a repeated signal (userdata already cleared) or a
 * failed kill() presumably reaches sd_event_exit() -- confirm. */
3557 static int on_orderly_shutdown(sd_event_source *s, const struct signalfd_siginfo *si, void *userdata) {
3560 pid = PTR_TO_UINT32(userdata);
3562 if (kill(pid, SIGRTMIN+3) >= 0) {
3563 log_info("Trying to halt container. Send SIGTERM again to trigger immediate termination.");
3564 sd_event_source_set_userdata(s, NULL);
/* Terminate the event loop with exit code 0 */
3569 sd_event_exit(sd_event_source_get_event(s), 0);
/* Fill in whichever of arg_image, arg_directory and arg_machine the command
 * line left unset, deriving each from the others.
 * NOTE(review): several conditions and return statements are not visible
 * in this extract; comments describe only the visible statements. */
3573 static int determine_names(void) {
3576 if (!arg_image && !arg_directory) {
3578 _cleanup_(image_unrefp) Image *i = NULL;
/* Look up the image by machine name */
3580 r = image_find(arg_machine, &i);
3582 return log_error_errno(r, "Failed to find image for machine '%s': %m", arg_machine);
/* NOTE(review): %m is passed to plain log_error() here, so whatever value
 * errno happens to hold is printed rather than a cause of this failure */
3584 log_error("No image for machine '%s': %m", arg_machine);
/* A raw image becomes arg_image, any other image type arg_directory */
3588 if (i->type == IMAGE_RAW)
3589 r = set_sanitized_path(&arg_image, i->path);
3591 r = set_sanitized_path(&arg_directory, i->path);
3593 return log_error_errno(r, "Invalid image directory: %m");
/* A read-only image forces read-only operation */
3595 arg_read_only = arg_read_only || i->read_only;
3597 arg_directory = get_current_dir_name();
3599 if (!arg_directory && !arg_machine) {
3600 log_error("Failed to determine path, please use -D or -i.");
/* Derive the machine name: the host name when the directory is "/",
 * otherwise the last path component of the image or directory */
3606 if (arg_directory && path_equal(arg_directory, "/"))
3607 arg_machine = gethostname_malloc();
3609 arg_machine = strdup(basename(arg_image ?: arg_directory));
3614 hostname_cleanup(arg_machine, false);
3615 if (!machine_name_is_valid(arg_machine)) {
3616 log_error("Failed to determine machine name automatically, please use -M.");
3620 if (arg_ephemeral) {
3623 /* Add a random suffix when this is an
3624 * ephemeral machine, so that we can run many
3625 * instances at once without manually having
3626 * to specify -M each time. */
3628 if (asprintf(&b, "%s-%016" PRIx64, arg_machine, random_u64()) < 0)
/* Determine the UID/GID base (arg_uid_shift) and range (arg_uid_range)
 * used for user namespacing. If no base was configured, it is derived from
 * the owner of arg_directory.
 * NOTE(review): the early exit for disabled user namespacing and the error
 * returns are not visible in this extract. */
3639 static int determine_uid_shift(void) {
3645 if (arg_uid_shift == UID_INVALID) {
3648 r = stat(arg_directory, &st);
3650 return log_error_errno(errno, "Failed to determine UID base of %s: %m", arg_directory);
/* The base is the owner UID with its low 16 bits masked off; the owner
 * GID must yield the same base */
3652 arg_uid_shift = st.st_uid & UINT32_C(0xffff0000);
3654 if (arg_uid_shift != (st.st_gid & UINT32_C(0xffff0000))) {
3655 log_error("UID and GID base of %s don't match.", arg_directory);
/* A derived base always comes with a range of 0x10000 IDs */
3659 arg_uid_range = UINT32_C(0x10000);
/* Refuse a base for which base + range would exceed (uid_t) -1 */
3662 if (arg_uid_shift > (uid_t) -1 - arg_uid_range) {
3663 log_error("UID base too high for UID range.");
3667 log_info("Using user namespaces with base " UID_FMT " and range " UID_FMT ".", arg_uid_shift, arg_uid_range);
/* Entry point. As far as this extract shows, it parses the arguments and
 * determines the names, checks for root privileges and a systemd host,
 * prepares the root tree (plain directory, snapshot or disk image),
 * allocates a pseudo TTY and then clones a child that builds the container
 * environment and executes the payload, while the parent sets up
 * networking, registers the machine, forwards the TTY and waits for the
 * container to end. Cleanup runs at the bottom.
 * NOTE(review): many lines (braces, conditions, jump targets) are not
 * visible in this extract; added comments describe only the visible
 * statements. */
3671 int main(int argc, char *argv[]) {
3673 _cleanup_free_ char *device_path = NULL, *root_device = NULL, *home_device = NULL, *srv_device = NULL, *console = NULL;
3674 bool root_device_rw = true, home_device_rw = true, srv_device_rw = true;
3675 _cleanup_close_ int master = -1, image_fd = -1;
3676 _cleanup_fdset_free_ FDSet *fds = NULL;
3677 int r, n_fd_passed, loop_nr = -1;
3678 char veth_name[IFNAMSIZ];
3679 bool secondary = false, remove_subvol = false;
3680 sigset_t mask, mask_chld;
3682 int ret = EXIT_SUCCESS;
3683 union in_addr_union exposed = {};
3684 _cleanup_release_lock_file_ LockFile tree_global_lock = LOCK_FILE_INIT, tree_local_lock = LOCK_FILE_INIT;
3687 log_parse_environment();
3690 r = parse_argv(argc, argv);
3694 r = determine_names();
3698 if (geteuid() != 0) {
3699 log_error("Need to be root.");
3704 if (sd_booted() <= 0) {
3705 log_error("Not running on a systemd system.");
/* Collect the file descriptors passed to us; their count is later
 * exported to the payload as LISTEN_FDS */
3711 n_fd_passed = sd_listen_fds(false);
3712 if (n_fd_passed > 0) {
3713 r = fdset_new_listen_fds(&fds, false);
3715 log_error_errno(r, "Failed to collect file descriptors: %m");
3719 fdset_close_others(fds);
/* Container root given as a directory tree */
3722 if (arg_directory) {
3725 if (path_equal(arg_directory, "/") && !arg_ephemeral) {
3726 log_error("Spawning container on root directory is not supported. Consider using --ephemeral.");
3731 if (arg_ephemeral) {
3734 /* If the specified path is a mount point we
3735 * generate the new snapshot immediately
3736 * inside it under a random name. However if
3737 * the specified is not a mount point we
3738 * create the new snapshot in the parent
3739 * directory, just next to it. */
3740 r = path_is_mount_point(arg_directory, false);
3742 log_error_errno(r, "Failed to determine whether directory %s is mount point: %m", arg_directory);
3746 r = tempfn_random_child(arg_directory, &np);
3748 r = tempfn_random(arg_directory, &np);
3750 log_error_errno(r, "Failed to generate name for snapshot: %m");
/* Shared lock for read-only operation, exclusive lock otherwise; never
 * blocking (LOCK_NB) */
3754 r = image_path_lock(np, (arg_read_only ? LOCK_SH : LOCK_EX) | LOCK_NB, &tree_global_lock, &tree_local_lock);
3756 log_error_errno(r, "Failed to lock %s: %m", np);
3760 r = btrfs_subvol_snapshot(arg_directory, np, arg_read_only, true);
3763 log_error_errno(r, "Failed to create snapshot %s from %s: %m", np, arg_directory);
3767 free(arg_directory);
/* Remember to remove the snapshot during cleanup */
3770 remove_subvol = true;
3773 r = image_path_lock(arg_directory, (arg_read_only ? LOCK_SH : LOCK_EX) | LOCK_NB, &tree_global_lock, &tree_local_lock);
3775 log_error_errno(r, "Directory tree %s is currently busy.", arg_directory);
3779 log_error_errno(r, "Failed to lock %s: %m", arg_directory);
3784 r = btrfs_subvol_snapshot(arg_template, arg_directory, arg_read_only, true);
3787 log_info("Directory %s already exists, not populating from template %s.", arg_directory, arg_template);
3789 log_error_errno(r, "Couldn't create snapshot %s from %s: %m", arg_directory, arg_template);
3793 log_info("Populated %s from template %s.", arg_directory, arg_template);
3799 if (path_is_os_tree(arg_directory) <= 0) {
3800 log_error("Directory %s doesn't look like an OS root directory (os-release file is missing). Refusing.", arg_directory);
/* Check inside the tree for the binary to execute if an absolute path was
 * given on the command line, for "/usr/bin/" otherwise */
3807 p = strjoina(arg_directory,
3808 argc > optind && path_is_absolute(argv[optind]) ? argv[optind] : "/usr/bin/");
3809 if (access(p, F_OK) < 0) {
3810 log_error("Directory %s lacks the binary to execute or doesn't look like a binary tree. Refusing.", arg_directory);
/* Container root given as a disk image: it gets mounted on a freshly
 * created temporary directory, which becomes arg_directory */
3817 char template[] = "/tmp/nspawn-root-XXXXXX";
3820 assert(!arg_template);
3822 r = image_path_lock(arg_image, (arg_read_only ? LOCK_SH : LOCK_EX) | LOCK_NB, &tree_global_lock, &tree_local_lock);
3824 r = log_error_errno(r, "Disk image %s is currently busy.", arg_image);
3828 r = log_error_errno(r, "Failed to create image lock: %m");
3832 if (!mkdtemp(template)) {
3833 log_error_errno(errno, "Failed to create temporary directory: %m");
3838 arg_directory = strdup(template);
3839 if (!arg_directory) {
3844 image_fd = setup_image(&device_path, &loop_nr);
3850 r = dissect_image(image_fd,
3851 &root_device, &root_device_rw,
3852 &home_device, &home_device_rw,
3853 &srv_device, &srv_device_rw,
3859 r = determine_uid_shift();
/* Interactive only if both standard input and standard output are TTYs */
3863 interactive = isatty(STDIN_FILENO) > 0 && isatty(STDOUT_FILENO) > 0;
/* Allocate the pseudo TTY whose slave side (console) is opened by the
 * child and whose master side is forwarded by the parent */
3865 master = posix_openpt(O_RDWR|O_NOCTTY|O_CLOEXEC|O_NDELAY);
3867 r = log_error_errno(errno, "Failed to acquire pseudo tty: %m");
3871 r = ptsname_malloc(master, &console);
3873 r = log_error_errno(r, "Failed to determine tty name: %m");
3877 if (unlockpt(master) < 0) {
3878 r = log_error_errno(errno, "Failed to unlock tty: %m");
3883 log_info("Spawning container %s on %s.\nPress ^] three times within 1s to kill container.",
3884 arg_machine, arg_image ?: arg_directory);
/* Block these signals; SIGINT, SIGTERM and SIGCHLD are consumed further
 * down through sd_event_add_signal() */
3886 assert_se(sigemptyset(&mask) == 0);
3887 sigset_add_many(&mask, SIGCHLD, SIGWINCH, SIGTERM, SIGINT, -1);
3888 assert_se(sigprocmask(SIG_BLOCK, &mask, NULL) == 0);
3890 assert_se(sigemptyset(&mask_chld) == 0);
3891 assert_se(sigaddset(&mask_chld, SIGCHLD) == 0);
/* NOTE(review): the statements from here on presumably form the body of a
 * loop that is re-entered when the container reboots; the loop header is
 * not visible in this extract */
3894 _cleanup_close_pair_ int kmsg_socket_pair[2] = { -1, -1 }, rtnl_socket_pair[2] = { -1, -1 };
3895 ContainerStatus container_status;
3896 _cleanup_(barrier_destroy) Barrier barrier = BARRIER_NULL;
3897 struct sigaction sa = {
3898 .sa_handler = nop_handler,
3899 .sa_flags = SA_NOCLDSTOP,
3902 r = barrier_create(&barrier);
3904 log_error_errno(r, "Cannot initialize IPC barrier: %m");
3908 if (socketpair(AF_UNIX, SOCK_DGRAM|SOCK_CLOEXEC, 0, kmsg_socket_pair) < 0) {
3909 r = log_error_errno(errno, "Failed to create kmsg socket pair: %m");
3913 if (socketpair(AF_UNIX, SOCK_DGRAM|SOCK_CLOEXEC, 0, rtnl_socket_pair) < 0) {
3914 r = log_error_errno(errno, "Failed to create rtnl socket pair: %m");
3918 /* Child can be killed before execv(), so handle SIGCHLD
3919 * in order to interrupt parent's blocking calls and
3920 * give it a chance to call wait() and terminate. */
3921 r = sigprocmask(SIG_UNBLOCK, &mask_chld, NULL);
3923 r = log_error_errno(errno, "Failed to change the signal mask: %m");
3927 r = sigaction(SIGCHLD, &sa, NULL);
3929 r = log_error_errno(errno, "Failed to install SIGCHLD handler: %m");
/* The child always gets a new mount namespace; new IPC, PID and UTS
 * namespaces unless arg_share_system is set; a new network namespace if
 * arg_private_network is set */
3933 pid = raw_clone(SIGCHLD|CLONE_NEWNS|
3934 (arg_share_system ? 0 : CLONE_NEWIPC|CLONE_NEWPID|CLONE_NEWUTS)|
3935 (arg_private_network ? CLONE_NEWNET : 0), NULL);
3937 if (errno == EINVAL)
3938 r = log_error_errno(errno, "clone() failed, do you have namespace support enabled in your kernel? (You need UTS, IPC, PID and NET namespacing built in): %m");
3940 r = log_error_errno(errno, "clone() failed: %m");
3947 _cleanup_free_ char *home = NULL;
/* Environment for the payload; the NULL slots are filled in below through
 * the n_env index */
3949 const char *envp[] = {
3950 "PATH=" DEFAULT_PATH_SPLIT_USR,
3951 "container=systemd-nspawn", /* LXC sets container=lxc, so follow the scheme here */
3956 NULL, /* container_uuid */
3957 NULL, /* LISTEN_FDS */
3958 NULL, /* LISTEN_PID */
/* Child side of the clone (the barrier role is set to BARRIER_CHILD) */
3963 barrier_set_role(&barrier, BARRIER_CHILD);
3965 envp[n_env] = strv_find_prefix(environ, "TERM=");
/* Close the ends that belong to the parent */
3969 master = safe_close(master);
3971 kmsg_socket_pair[0] = safe_close(kmsg_socket_pair[0]);
3972 rtnl_socket_pair[0] = safe_close(rtnl_socket_pair[0]);
3974 reset_all_signal_handlers();
3975 reset_signal_mask();
/* Replace the standard descriptors with the container console: after 0, 1
 * and 2 are closed, open_terminal() is expected to return STDIN_FILENO,
 * which is then duplicated onto standard output and standard error */
3978 close_nointr(STDIN_FILENO);
3979 close_nointr(STDOUT_FILENO);
3980 close_nointr(STDERR_FILENO);
3982 r = open_terminal(console, O_RDWR);
3983 if (r != STDIN_FILENO) {
3989 log_error_errno(r, "Failed to open console: %m");
3990 _exit(EXIT_FAILURE);
3993 if (dup2(STDIN_FILENO, STDOUT_FILENO) != STDOUT_FILENO ||
3994 dup2(STDIN_FILENO, STDERR_FILENO) != STDERR_FILENO) {
3995 log_error_errno(errno, "Failed to duplicate console: %m");
3996 _exit(EXIT_FAILURE);
4001 log_error_errno(errno, "setsid() failed: %m");
4002 _exit(EXIT_FAILURE);
4005 if (reset_audit_loginuid() < 0)
4006 _exit(EXIT_FAILURE);
/* Have the kernel send SIGKILL to this child when its parent dies */
4008 if (prctl(PR_SET_PDEATHSIG, SIGKILL) < 0) {
4009 log_error_errno(errno, "PR_SET_PDEATHSIG failed: %m");
4010 _exit(EXIT_FAILURE);
4013 if (arg_private_network)
4016 /* Mark everything as slave, so that we still
4017 * receive mounts from the real root, but don't
4018 * propagate mounts to the real root. */
4019 if (mount(NULL, "/", NULL, MS_SLAVE|MS_REC, NULL) < 0) {
4020 log_error_errno(errno, "MS_SLAVE|MS_REC failed: %m");
4021 _exit(EXIT_FAILURE);
4024 if (mount_devices(arg_directory,
4025 root_device, root_device_rw,
4026 home_device, home_device_rw,
4027 srv_device, srv_device_rw) < 0)
4028 _exit(EXIT_FAILURE);
4030 /* Turn directory into bind mount */
4031 if (mount(arg_directory, arg_directory, "bind", MS_BIND|MS_REC, NULL) < 0) {
4032 log_error_errno(errno, "Failed to make bind mount: %m");
4033 _exit(EXIT_FAILURE);
4036 r = setup_volatile(arg_directory);
4038 _exit(EXIT_FAILURE);
4040 if (setup_volatile_state(arg_directory) < 0)
4041 _exit(EXIT_FAILURE);
4043 r = base_filesystem_create(arg_directory);
4045 _exit(EXIT_FAILURE);
4047 if (arg_read_only) {
4048 r = bind_remount_recursive(arg_directory, true);
4050 log_error_errno(r, "Failed to make tree read-only: %m");
4051 _exit(EXIT_FAILURE);
4055 if (mount_all(arg_directory) < 0)
4056 _exit(EXIT_FAILURE);
4058 if (copy_devnodes(arg_directory) < 0)
4059 _exit(EXIT_FAILURE);
4061 if (setup_ptmx(arg_directory) < 0)
4062 _exit(EXIT_FAILURE);
4064 dev_setup(arg_directory);
4066 if (setup_propagate(arg_directory) < 0)
4067 _exit(EXIT_FAILURE);
4069 if (setup_seccomp() < 0)
4070 _exit(EXIT_FAILURE);
4072 if (setup_dev_console(arg_directory, console) < 0)
4073 _exit(EXIT_FAILURE);
4075 if (setup_kmsg(arg_directory, kmsg_socket_pair[1]) < 0)
4076 _exit(EXIT_FAILURE);
4077 kmsg_socket_pair[1] = safe_close(kmsg_socket_pair[1]);
4079 if (send_rtnl(rtnl_socket_pair[1]) < 0)
4080 _exit(EXIT_FAILURE);
4081 rtnl_socket_pair[1] = safe_close(rtnl_socket_pair[1]);
4083 /* Tell the parent that we are ready, and that
4084 * it can cgroupify us so that we lack access
4085 * to certain devices and resources. */
4086 (void) barrier_place(&barrier); /* #1 */
4088 if (setup_boot_id(arg_directory) < 0)
4089 _exit(EXIT_FAILURE);
4091 if (setup_timezone(arg_directory) < 0)
4092 _exit(EXIT_FAILURE);
4094 if (setup_resolv_conf(arg_directory) < 0)
4095 _exit(EXIT_FAILURE);
4097 if (setup_journal(arg_directory) < 0)
4098 _exit(EXIT_FAILURE);
4100 if (mount_binds(arg_directory, arg_bind, false) < 0)
4101 _exit(EXIT_FAILURE);
4103 if (mount_binds(arg_directory, arg_bind_ro, true) < 0)
4104 _exit(EXIT_FAILURE);
4106 if (mount_tmpfs(arg_directory) < 0)
4107 _exit(EXIT_FAILURE);
4109 /* Wait until we are cgroup-ified, so that we
4110 * can mount the right cgroup path writable */
4111 (void) barrier_place_and_sync(&barrier); /* #2 */
4113 if (mount_cgroup(arg_directory) < 0)
4114 _exit(EXIT_FAILURE);
/* Switch the root: enter the tree, move its mount onto "/", then chroot
 * into it and reset the working directory */
4116 if (chdir(arg_directory) < 0) {
4117 log_error_errno(errno, "chdir(%s) failed: %m", arg_directory);
4118 _exit(EXIT_FAILURE);
4121 if (mount(arg_directory, "/", NULL, MS_MOVE, NULL) < 0) {
4122 log_error_errno(errno, "mount(MS_MOVE) failed: %m");
4123 _exit(EXIT_FAILURE);
4126 if (chroot(".") < 0) {
4127 log_error_errno(errno, "chroot() failed: %m");
4128 _exit(EXIT_FAILURE);
4131 if (chdir("/") < 0) {
4132 log_error_errno(errno, "chdir() failed: %m");
4133 _exit(EXIT_FAILURE);
4137 if (unshare(CLONE_NEWUSER) < 0) {
4138 log_error_errno(errno, "unshare(CLONE_NEWUSER) failed: %m");
4139 _exit(EXIT_FAILURE);
4142 /* Tell the parent, that it now can
4143 * write the UID map. */
4144 (void) barrier_place(&barrier); /* #3 */
4146 /* Wait until the parent wrote the UID
4148 (void) barrier_place_and_sync(&barrier); /* #4 */
4153 if (drop_capabilities() < 0) {
4154 log_error_errno(errno, "drop_capabilities() failed: %m");
4155 _exit(EXIT_FAILURE);
/* An explicitly configured personality is applied as is; otherwise
 * PER_LINUX32 is selected when `secondary` is set */
4160 if (arg_personality != 0xffffffffLU) {
4161 if (personality(arg_personality) < 0) {
4162 log_error_errno(errno, "personality() failed: %m");
4163 _exit(EXIT_FAILURE);
4165 } else if (secondary) {
4166 if (personality(PER_LINUX32) < 0) {
4167 log_error_errno(errno, "personality() failed: %m");
4168 _exit(EXIT_FAILURE);
4173 if (arg_selinux_context)
4174 if (setexeccon((security_context_t) arg_selinux_context) < 0) {
4175 log_error_errno(errno, "setexeccon(\"%s\") failed: %m", arg_selinux_context);
4176 _exit(EXIT_FAILURE);
4180 r = change_uid_gid(&home);
4182 _exit(EXIT_FAILURE);
/* Fill the next free envp slots; HOME falls back to "/root", USER and
 * LOGNAME fall back to "root" */
4184 if ((asprintf((char**)(envp + n_env++), "HOME=%s", home ? home: "/root") < 0) ||
4185 (asprintf((char**)(envp + n_env++), "USER=%s", arg_user ? arg_user : "root") < 0) ||
4186 (asprintf((char**)(envp + n_env++), "LOGNAME=%s", arg_user ? arg_user : "root") < 0)) {
4188 _exit(EXIT_FAILURE);
4191 if (!sd_id128_equal(arg_uuid, SD_ID128_NULL)) {
4194 if (asprintf((char**)(envp + n_env++), "container_uuid=%s", id128_format_as_uuid(arg_uuid, as_uuid)) < 0) {
4196 _exit(EXIT_FAILURE);
/* Passed file descriptors must survive the exec, so their close-on-exec
 * flag is cleared and the payload is told about them.
 * NOTE(review): the fdset_cloexec() error message has no %m, so the error
 * code in r is not printed */
4200 if (fdset_size(fds) > 0) {
4201 r = fdset_cloexec(fds, false);
4203 log_error_errno(r, "Failed to unset O_CLOEXEC for file descriptors.");
4204 _exit(EXIT_FAILURE);
4207 if ((asprintf((char **)(envp + n_env++), "LISTEN_FDS=%u", n_fd_passed) < 0) ||
4208 (asprintf((char **)(envp + n_env++), "LISTEN_PID=1") < 0)) {
4210 _exit(EXIT_FAILURE);
4214 if (!strv_isempty(arg_setenv)) {
4217 n = strv_env_merge(2, envp, arg_setenv);
4220 _exit(EXIT_FAILURE);
4225 env_use = (char**) envp;
4227 /* Let the parent know that we are ready and
4228 * wait until the parent is ready with the
4230 (void) barrier_place_and_sync(&barrier); /* #5 */
4236 /* Automatically search for the init system */
/* Build the argument vector: slot 0 is reserved for the init path, the
 * remaining command line arguments follow; l counts argv[argc] as well, so
 * the terminating NULL pointer is copied along */
4238 l = 1 + argc - optind;
4239 a = newa(char*, l + 1);
4240 memcpy(a + 1, argv + optind, l * sizeof(char*));
/* Each execve() returns only on failure, so the candidates are tried in
 * order */
4242 a[0] = (char*) "/usr/lib/systemd/systemd";
4243 execve(a[0], a, env_use);
4245 a[0] = (char*) "/lib/systemd/systemd";
4246 execve(a[0], a, env_use);
4248 a[0] = (char*) "/sbin/init";
4249 execve(a[0], a, env_use);
4250 } else if (argc > optind)
4251 execvpe(argv[optind], argv + optind, env_use);
/* No command given: start a shell in the home directory, /bin/bash first
 * and /bin/sh as fallback.
 * NOTE(review): the result of chdir() is ignored */
4253 chdir(home ? home : "/root");
4254 execle("/bin/bash", "-bash", NULL, env_use);
4255 execle("/bin/sh", "-sh", NULL, env_use);
4258 log_error_errno(errno, "execv() failed: %m");
4259 _exit(EXIT_FAILURE);
/* Parent side of the clone (the barrier role is set to BARRIER_PARENT) */
4262 barrier_set_role(&barrier, BARRIER_PARENT);
4266 kmsg_socket_pair[1] = safe_close(kmsg_socket_pair[1]);
4267 rtnl_socket_pair[1] = safe_close(rtnl_socket_pair[1]);
4269 (void) barrier_place(&barrier); /* #1 */
4271 /* Wait for the most basic Child-setup to be done,
4272 * before we add hardware to it, and place it in a
4274 if (barrier_sync(&barrier)) { /* #1 */
4277 r = move_network_interfaces(pid);
4281 r = setup_veth(pid, veth_name, &ifi);
4285 r = setup_bridge(veth_name, &ifi);
4289 r = setup_macvlan(pid);
4293 r = setup_ipvlan(pid);
4297 r = register_machine(pid, ifi);
4301 /* Notify the child that the parent is ready with all
4302 * its setup, and that the child can now hand over
4303 * control to the code to run inside the container. */
4304 (void) barrier_place(&barrier); /* #2 */
/* Write the ID mappings of the child: one line "0 <arg_uid_shift>
 * <arg_uid_range>", used for both uid_map and gid_map */
4307 char uid_map[strlen("/proc//uid_map") + DECIMAL_STR_MAX(uid_t) + 1], line[DECIMAL_STR_MAX(uid_t)*3+3+1];
4309 (void) barrier_place_and_sync(&barrier); /* #3 */
4311 xsprintf(uid_map, "/proc/" PID_FMT "/uid_map", pid);
4312 xsprintf(line, UID_FMT " " UID_FMT " " UID_FMT "\n", 0, arg_uid_shift, arg_uid_range);
4313 r = write_string_file(uid_map, line);
4315 log_error_errno(r, "Failed to write UID map: %m");
4319 /* We always assign the same UID and GID ranges */
4320 xsprintf(uid_map, "/proc/" PID_FMT "/gid_map", pid);
4321 r = write_string_file(uid_map, line);
4323 log_error_errno(r, "Failed to write GID map: %m");
4327 (void) barrier_place(&barrier); /* #4 */
4330 /* Block SIGCHLD here, before notifying child.
4331 * process_pty() will handle it with the other signals. */
4332 r = sigprocmask(SIG_BLOCK, &mask_chld, NULL);
4336 /* Reset signal to default */
4337 r = default_signals(SIGCHLD, -1);
4341 /* Let the child know that we are ready and wait that the child is completely ready now. */
4342 if (barrier_place_and_sync(&barrier)) { /* #5 */
4343 _cleanup_event_unref_ sd_event *event = NULL;
4344 _cleanup_(pty_forward_freep) PTYForward *forward = NULL;
4345 _cleanup_rtnl_unref_ sd_rtnl *rtnl = NULL;
4350 "STATUS=Container running.\n"
4351 "X_NSPAWN_LEADER_PID=" PID_FMT, pid);
4353 r = sd_event_new(&event);
4355 log_error_errno(r, "Failed to get default event source: %m");
/* NOTE(review): two alternative registrations for SIGINT/SIGTERM follow;
 * the condition selecting between them is not visible in this extract */
4360 /* Try to kill the init system on SIGINT or SIGTERM */
4361 sd_event_add_signal(event, NULL, SIGINT, on_orderly_shutdown, UINT32_TO_PTR(pid));
4362 sd_event_add_signal(event, NULL, SIGTERM, on_orderly_shutdown, UINT32_TO_PTR(pid));
4364 /* Immediately exit */
4365 sd_event_add_signal(event, NULL, SIGINT, NULL, NULL);
4366 sd_event_add_signal(event, NULL, SIGTERM, NULL, NULL);
4369 /* simply exit on sigchld */
4370 sd_event_add_signal(event, NULL, SIGCHLD, NULL, NULL);
4372 if (arg_expose_ports) {
4373 r = watch_rtnl(event, rtnl_socket_pair[0], &exposed, &rtnl);
4377 (void) expose_ports(rtnl, &exposed);
4380 rtnl_socket_pair[0] = safe_close(rtnl_socket_pair[0]);
/* Forward data between our terminal and the container's pseudo TTY until
 * the event loop exits */
4382 r = pty_forward_new(event, master, true, !interactive, &forward);
4384 log_error_errno(r, "Failed to create PTY forwarder: %m");
4388 r = sd_event_loop(event);
4390 log_error_errno(r, "Failed to run event loop: %m");
4394 pty_forward_get_last_char(forward, &last_char);
4396 forward = pty_forward_free(forward);
4398 if (!arg_quiet && last_char != '\n')
4401 /* Kill if it is not dead yet anyway */
4402 terminate_machine(pid);
4406 /* Normally redundant, but better safe than sorry */
4409 r = wait_for_container(pid, &container_status);
4413 /* We failed to wait for the container, or the
4414 * container exited abnormally */
4416 else if (r > 0 || container_status == CONTAINER_TERMINATED){
4417 /* The container exited with a non-zero
4418 * status, or with zero status and no reboot
4424 /* CONTAINER_REBOOTED, loop again */
4426 if (arg_keep_unit) {
4427 /* Special handling if we are running as a
4428 * service: instead of simply restarting the
4429 * machine we want to restart the entire
4430 * service, so let's inform systemd about this
4431 * with the special exit code 133. The service
4432 * file uses RestartForceExitStatus=133 so
4433 * that this results in a full nspawn
4434 * restart. This is necessary since we might
4435 * have cgroup parameters set we want to have
4442 flush_ports(&exposed);
4448 "STATUS=Terminating...");
4450 loop_remove(loop_nr, &image_fd);
4455 if (remove_subvol && arg_directory) {
4458 k = btrfs_subvol_remove(arg_directory);
4460 log_warning_errno(k, "Cannot remove subvolume '%s', ignoring: %m", arg_directory);
4466 p = strjoina("/run/systemd/nspawn/propagate/", arg_machine);
4467 (void) rm_rf(p, false, true, false);
4470 free(arg_directory);
4475 strv_free(arg_setenv);
4476 strv_free(arg_network_interfaces);
4477 strv_free(arg_network_macvlan);
4478 strv_free(arg_network_ipvlan);
4479 strv_free(arg_bind);
4480 strv_free(arg_bind_ro);
4481 strv_free(arg_tmpfs);
4483 flush_ports(&exposed);
4485 while (arg_expose_ports) {
4486 ExposePort *p = arg_expose_ports;
4487 LIST_REMOVE(ports, arg_expose_ports, p);
4491 return r < 0 ? EXIT_FAILURE : ret;