1 /*-*- Mode: C; c-basic-offset: 8; indent-tabs-mode: nil -*-*/
4 This file is part of systemd.
6 Copyright 2010 Lennart Poettering
8 systemd is free software; you can redistribute it and/or modify it
9 under the terms of the GNU Lesser General Public License as published by
10 the Free Software Foundation; either version 2.1 of the License, or
11 (at your option) any later version.
13 systemd is distributed in the hope that it will be useful, but
14 WITHOUT ANY WARRANTY; without even the implied warranty of
15 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
16 Lesser General Public License for more details.
18 You should have received a copy of the GNU Lesser General Public License
19 along with systemd; If not, see <http://www.gnu.org/licenses/>.
25 #include <sys/types.h>
26 #include <sys/syscall.h>
27 #include <sys/mount.h>
33 #include <sys/prctl.h>
36 #include <sys/signalfd.h>
40 #include <sys/socket.h>
41 #include <linux/netlink.h>
43 #include <linux/veth.h>
44 #include <sys/personality.h>
45 #include <linux/loop.h>
50 #include <selinux/selinux.h>
58 #include <blkid/blkid.h>
61 #include "sd-daemon.h"
71 #include "cgroup-util.h"
73 #include "path-util.h"
74 #include "loopback-setup.h"
75 #include "dev-setup.h"
80 #include "bus-error.h"
82 #include "bus-kernel.h"
85 #include "rtnl-util.h"
86 #include "udev-util.h"
87 #include "blkid-util.h"
89 #include "siphash24.h"
91 #include "base-filesystem.h"
93 #include "event-util.h"
94 #include "capability.h"
96 #include "btrfs-util.h"
97 #include "machine-image.h"
99 #include "in-addr-util.h"
101 #include "local-addresses.h"
104 #include "seccomp-util.h"
/* One --port= mapping. parse_argv() fills in protocol, host_port and
 * container_port and chains the entries through the "ports" list field. */
107 typedef struct ExposePort {
110 uint16_t container_port;
111 LIST_FIELDS(struct ExposePort, ports);
/* Container exit state; only CONTAINER_TERMINATED is visible in this excerpt. */
114 typedef enum ContainerStatus {
115 CONTAINER_TERMINATED,
/* Journal linking mode selected by --link-journal= / -j. LINK_NO, LINK_AUTO,
 * LINK_GUEST and LINK_HOST are the values used in this file. */
119 typedef enum LinkJournal {
/* Mode selected by --volatile=. VOLATILE_NO, VOLATILE_YES and VOLATILE_STATE
 * are the values used in this file. */
126 typedef enum Volatile {
/* Command-line settings. These file-scope variables hold the defaults and are
 * overwritten by parse_argv(). */
132 static char *arg_directory = NULL;
133 static char *arg_template = NULL;
134 static char *arg_user = NULL;
135 static sd_id128_t arg_uuid = {};
136 static char *arg_machine = NULL;
137 static const char *arg_selinux_context = NULL;
138 static const char *arg_selinux_apifs_context = NULL;
139 static const char *arg_slice = NULL;
140 static bool arg_private_network = false;
141 static bool arg_read_only = false;
142 static bool arg_boot = false;
143 static bool arg_ephemeral = false;
144 static LinkJournal arg_link_journal = LINK_AUTO;
145 static bool arg_link_journal_try = false;
/* Default capability mask. parse_argv() ORs in --capability= requests (and
 * CAP_NET_ADMIN when private networking is on) and masks out
 * --drop-capability= requests. */
146 static uint64_t arg_retain =
147 (1ULL << CAP_CHOWN) |
148 (1ULL << CAP_DAC_OVERRIDE) |
149 (1ULL << CAP_DAC_READ_SEARCH) |
150 (1ULL << CAP_FOWNER) |
151 (1ULL << CAP_FSETID) |
152 (1ULL << CAP_IPC_OWNER) |
154 (1ULL << CAP_LEASE) |
155 (1ULL << CAP_LINUX_IMMUTABLE) |
156 (1ULL << CAP_NET_BIND_SERVICE) |
157 (1ULL << CAP_NET_BROADCAST) |
158 (1ULL << CAP_NET_RAW) |
159 (1ULL << CAP_SETGID) |
160 (1ULL << CAP_SETFCAP) |
161 (1ULL << CAP_SETPCAP) |
162 (1ULL << CAP_SETUID) |
163 (1ULL << CAP_SYS_ADMIN) |
164 (1ULL << CAP_SYS_CHROOT) |
165 (1ULL << CAP_SYS_NICE) |
166 (1ULL << CAP_SYS_PTRACE) |
167 (1ULL << CAP_SYS_TTY_CONFIG) |
168 (1ULL << CAP_SYS_RESOURCE) |
169 (1ULL << CAP_SYS_BOOT) |
170 (1ULL << CAP_AUDIT_WRITE) |
171 (1ULL << CAP_AUDIT_CONTROL) |
/* arg_bind and arg_bind_ro hold (source, destination) pairs; arg_tmpfs holds
 * (path, mount options) pairs. See the strv_extend()/strv_push() calls in
 * parse_argv(). */
173 static char **arg_bind = NULL;
174 static char **arg_bind_ro = NULL;
175 static char **arg_tmpfs = NULL;
176 static char **arg_setenv = NULL;
177 static bool arg_quiet = false;
178 static bool arg_share_system = false;
179 static bool arg_register = true;
180 static bool arg_keep_unit = false;
181 static char **arg_network_interfaces = NULL;
182 static char **arg_network_macvlan = NULL;
183 static char **arg_network_ipvlan = NULL;
184 static bool arg_network_veth = false;
185 static const char *arg_network_bridge = NULL;
/* 0xffffffffLU is the value parse_argv() rejects as "unknown or unsupported";
 * presumably it also means "no personality requested" -- TODO confirm. */
186 static unsigned long arg_personality = 0xffffffffLU;
187 static char *arg_image = NULL;
188 static Volatile arg_volatile = VOLATILE_NO;
189 static ExposePort *arg_expose_ports = NULL;
/* Print the command-line usage summary to standard output. The single %s in
 * the format string is filled with program_invocation_short_name. */
191 static void help(void) {
192 printf("%s [OPTIONS...] [PATH] [ARGUMENTS...]\n\n"
193 "Spawn a minimal namespace container for debugging, testing and building.\n\n"
194 " -h --help Show this help\n"
195 " --version Print version string\n"
196 " -q --quiet Do not show status information\n"
197 " -D --directory=PATH Root directory for the container\n"
198 " --template=PATH Initialize root directory from template directory,\n"
200 " -x --ephemeral Run container with snapshot of root directory, and\n"
201 " remove it after exit\n"
202 " -i --image=PATH File system device or disk image for the container\n"
203 " -b --boot Boot up full system (i.e. invoke init)\n"
204 " -u --user=USER Run the command under specified user or uid\n"
205 " -M --machine=NAME Set the machine name for the container\n"
206 " --uuid=UUID Set a specific machine UUID for the container\n"
207 " -S --slice=SLICE Place the container in the specified slice\n"
208 " --private-network Disable network in container\n"
209 " --network-interface=INTERFACE\n"
210 " Assign an existing network interface to the\n"
212 " --network-macvlan=INTERFACE\n"
213 " Create a macvlan network interface based on an\n"
214 " existing network interface to the container\n"
215 " --network-ipvlan=INTERFACE\n"
216 " Create a ipvlan network interface based on an\n"
217 " existing network interface to the container\n"
218 " -n --network-veth Add a virtual ethernet connection between host\n"
220 " --network-bridge=INTERFACE\n"
221 " Add a virtual ethernet connection between host\n"
222 " and container and add it to an existing bridge on\n"
224 " -p --port=[PROTOCOL:]HOSTPORT[:CONTAINERPORT]\n"
225 " Expose a container IP port on the host\n"
226 " -Z --selinux-context=SECLABEL\n"
227 " Set the SELinux security context to be used by\n"
228 " processes in the container\n"
229 " -L --selinux-apifs-context=SECLABEL\n"
230 " Set the SELinux security context to be used by\n"
231 " API/tmpfs file systems in the container\n"
232 " --capability=CAP In addition to the default, retain specified\n"
234 " --drop-capability=CAP Drop the specified capability from the default set\n"
235 " --link-journal=MODE Link up guest journal, one of no, auto, guest, host,\n"
236 " try-guest, try-host\n"
237 " -j Equivalent to --link-journal=try-guest\n"
238 " --read-only Mount the root directory read-only\n"
239 " --bind=PATH[:PATH] Bind mount a file or directory from the host into\n"
241 " --bind-ro=PATH[:PATH] Similar, but creates a read-only bind mount\n"
242 " --tmpfs=PATH:[OPTIONS] Mount an empty tmpfs to the specified directory\n"
243 " --setenv=NAME=VALUE Pass an environment variable to PID 1\n"
244 " --share-system Share system namespaces with host\n"
245 " --register=BOOLEAN Register container as machine\n"
246 " --keep-unit Do not register a scope for the machine, reuse\n"
247 " the service unit nspawn is running in\n"
248 " --volatile[=MODE] Run the system in volatile mode\n"
249 , program_invocation_short_name);
/* Store a cleaned-up form of path in *b. The path is passed through
 * canonicalize_file_name(); path_make_absolute_cwd() is also applied,
 * presumably as a fallback when canonicalization fails (the branch condition
 * is not visible in this excerpt). The result is run through
 * path_kill_slashes() before being stored. Callers log the return value as an
 * errno-style error code. */
252 static int set_sanitized_path(char **b, const char *path) {
258 p = canonicalize_file_name(path);
263 p = path_make_absolute_cwd(path);
269 *b = path_kill_slashes(p);
/* Parse the command line with getopt_long() and store the results in the
 * file-scope arg_* variables. Afterwards, reject option combinations that
 * cannot be used together and compute the final arg_retain capability mask.
 * Invalid input is reported through log_error()/log_error_errno(). */
273 static int parse_argv(int argc, char *argv[]) {
290 ARG_NETWORK_INTERFACE,
299 static const struct option options[] = {
300 { "help", no_argument, NULL, 'h' },
301 { "version", no_argument, NULL, ARG_VERSION },
302 { "directory", required_argument, NULL, 'D' },
303 { "template", required_argument, NULL, ARG_TEMPLATE },
304 { "ephemeral", no_argument, NULL, 'x' },
305 { "user", required_argument, NULL, 'u' },
306 { "private-network", no_argument, NULL, ARG_PRIVATE_NETWORK },
307 { "boot", no_argument, NULL, 'b' },
308 { "uuid", required_argument, NULL, ARG_UUID },
309 { "read-only", no_argument, NULL, ARG_READ_ONLY },
310 { "capability", required_argument, NULL, ARG_CAPABILITY },
311 { "drop-capability", required_argument, NULL, ARG_DROP_CAPABILITY },
312 { "link-journal", required_argument, NULL, ARG_LINK_JOURNAL },
313 { "bind", required_argument, NULL, ARG_BIND },
314 { "bind-ro", required_argument, NULL, ARG_BIND_RO },
315 { "tmpfs", required_argument, NULL, ARG_TMPFS },
316 { "machine", required_argument, NULL, 'M' },
317 { "slice", required_argument, NULL, 'S' },
318 { "setenv", required_argument, NULL, ARG_SETENV },
319 { "selinux-context", required_argument, NULL, 'Z' },
320 { "selinux-apifs-context", required_argument, NULL, 'L' },
321 { "quiet", no_argument, NULL, 'q' },
322 { "share-system", no_argument, NULL, ARG_SHARE_SYSTEM },
323 { "register", required_argument, NULL, ARG_REGISTER },
324 { "keep-unit", no_argument, NULL, ARG_KEEP_UNIT },
325 { "network-interface", required_argument, NULL, ARG_NETWORK_INTERFACE },
326 { "network-macvlan", required_argument, NULL, ARG_NETWORK_MACVLAN },
327 { "network-ipvlan", required_argument, NULL, ARG_NETWORK_IPVLAN },
328 { "network-veth", no_argument, NULL, 'n' },
329 { "network-bridge", required_argument, NULL, ARG_NETWORK_BRIDGE },
330 { "personality", required_argument, NULL, ARG_PERSONALITY },
331 { "image", required_argument, NULL, 'i' },
332 { "volatile", optional_argument, NULL, ARG_VOLATILE },
333 { "port", required_argument, NULL, 'p' },
/* Capability bits requested with --capability= (plus) and --drop-capability=
 * (minus); they are applied to arg_retain once all options are parsed. */
338 uint64_t plus = 0, minus = 0;
343 while ((c = getopt_long(argc, argv, "+hD:u:bL:M:jS:Z:qi:xp:n", options, NULL)) >= 0)
352 puts(PACKAGE_STRING);
353 puts(SYSTEMD_FEATURES);
357 r = set_sanitized_path(&arg_directory, optarg);
359 return log_error_errno(r, "Invalid root directory: %m");
364 r = set_sanitized_path(&arg_template, optarg);
366 return log_error_errno(r, "Invalid template directory: %m");
371 r = set_sanitized_path(&arg_image, optarg);
373 return log_error_errno(r, "Invalid image path: %m");
378 arg_ephemeral = true;
383 arg_user = strdup(optarg);
389 case ARG_NETWORK_BRIDGE:
390 arg_network_bridge = optarg;
395 arg_network_veth = true;
396 arg_private_network = true;
399 case ARG_NETWORK_INTERFACE:
400 if (strv_extend(&arg_network_interfaces, optarg) < 0)
403 arg_private_network = true;
406 case ARG_NETWORK_MACVLAN:
407 if (strv_extend(&arg_network_macvlan, optarg) < 0)
410 arg_private_network = true;
413 case ARG_NETWORK_IPVLAN:
414 if (strv_extend(&arg_network_ipvlan, optarg) < 0)
419 case ARG_PRIVATE_NETWORK:
420 arg_private_network = true;
428 r = sd_id128_from_string(optarg, &arg_uuid);
430 log_error("Invalid UUID: %s", optarg);
440 if (isempty(optarg)) {
444 if (!machine_name_is_valid(optarg)) {
445 log_error("Invalid machine name: %s", optarg);
449 r = free_and_strdup(&arg_machine, optarg);
457 arg_selinux_context = optarg;
461 arg_selinux_apifs_context = optarg;
465 arg_read_only = true;
469 case ARG_DROP_CAPABILITY: {
470 const char *state, *word;
/* The argument is a comma-separated list of capability names; "all" selects
 * every capability. */
473 FOREACH_WORD_SEPARATOR(word, length, optarg, ",", state) {
474 _cleanup_free_ char *t;
476 t = strndup(word, length);
480 if (streq(t, "all")) {
481 if (c == ARG_CAPABILITY)
482 plus = (uint64_t) -1;
484 minus = (uint64_t) -1;
488 cap = capability_from_name(t);
490 log_error("Failed to parse capability %s.", t);
494 if (c == ARG_CAPABILITY)
495 plus |= 1ULL << (uint64_t) cap;
497 minus |= 1ULL << (uint64_t) cap;
505 arg_link_journal = LINK_GUEST;
506 arg_link_journal_try = true;
509 case ARG_LINK_JOURNAL:
510 if (streq(optarg, "auto")) {
511 arg_link_journal = LINK_AUTO;
512 arg_link_journal_try = false;
513 } else if (streq(optarg, "no")) {
514 arg_link_journal = LINK_NO;
515 arg_link_journal_try = false;
516 } else if (streq(optarg, "guest")) {
517 arg_link_journal = LINK_GUEST;
518 arg_link_journal_try = false;
519 } else if (streq(optarg, "host")) {
520 arg_link_journal = LINK_HOST;
521 arg_link_journal_try = false;
522 } else if (streq(optarg, "try-guest")) {
523 arg_link_journal = LINK_GUEST;
524 arg_link_journal_try = true;
525 } else if (streq(optarg, "try-host")) {
526 arg_link_journal = LINK_HOST;
527 arg_link_journal_try = true;
529 log_error("Failed to parse link journal mode %s", optarg);
537 _cleanup_free_ char *a = NULL, *b = NULL;
/* --bind= and --bind-ro= share this code; only the target list differs. */
541 x = c == ARG_BIND ? &arg_bind : &arg_bind_ro;
543 e = strchr(optarg, ':');
545 a = strndup(optarg, e - optarg);
555 if (!path_is_absolute(a) || !path_is_absolute(b)) {
556 log_error("Invalid bind mount specification: %s", optarg);
560 r = strv_extend(x, a);
564 r = strv_extend(x, b);
572 _cleanup_free_ char *a = NULL, *b = NULL;
575 e = strchr(optarg, ':');
577 a = strndup(optarg, e - optarg);
581 b = strdup("mode=0755");
587 if (!path_is_absolute(a)) {
588 log_error("Invalid tmpfs specification: %s", optarg);
592 r = strv_push(&arg_tmpfs, a);
598 r = strv_push(&arg_tmpfs, b);
610 if (!env_assignment_is_valid(optarg)) {
611 log_error("Environment variable assignment '%s' is not valid.", optarg);
615 n = strv_env_set(arg_setenv, optarg);
619 strv_free(arg_setenv);
628 case ARG_SHARE_SYSTEM:
629 arg_share_system = true;
633 r = parse_boolean(optarg);
635 log_error("Failed to parse --register= argument: %s", optarg);
643 arg_keep_unit = true;
646 case ARG_PERSONALITY:
648 arg_personality = personality_from_string(optarg);
649 if (arg_personality == 0xffffffffLU) {
650 log_error("Unknown or unsupported personality '%s'.", optarg);
659 arg_volatile = VOLATILE_YES;
661 r = parse_boolean(optarg);
663 if (streq(optarg, "state"))
664 arg_volatile = VOLATILE_STATE;
666 log_error("Failed to parse --volatile= argument: %s", optarg);
670 arg_volatile = r ? VOLATILE_YES : VOLATILE_NO;
676 const char *split, *e;
677 uint16_t container_port, host_port;
/* --port=[PROTOCOL:]HOSTPORT[:CONTAINERPORT]; an optional "tcp:" or "udp:"
 * prefix selects the protocol. */
681 if ((e = startswith(optarg, "tcp:")))
682 protocol = IPPROTO_TCP;
683 else if ((e = startswith(optarg, "udp:")))
684 protocol = IPPROTO_UDP;
687 protocol = IPPROTO_TCP;
690 split = strchr(e, ':');
692 char v[split - e + 1];
694 memcpy(v, e, split - e);
697 r = safe_atou16(v, &host_port);
698 if (r < 0 || host_port <= 0) {
699 log_error("Failed to parse host port: %s", optarg);
703 r = safe_atou16(split + 1, &container_port);
705 r = safe_atou16(e, &container_port);
706 host_port = container_port;
/* This check validates the container port (which doubles as the host port in
 * the single-port form), so the message names the container port. */
709 if (r < 0 || container_port <= 0) {
710 log_error("Failed to parse container port: %s", optarg);
/* Each (protocol, host port) combination may be exposed only once. */
714 LIST_FOREACH(ports, p, arg_expose_ports) {
715 if (p->protocol == protocol && p->host_port == host_port) {
716 log_error("Duplicate port specification: %s", optarg);
721 p = new(ExposePort, 1);
725 p->protocol = protocol;
726 p->host_port = host_port;
727 p->container_port = container_port;
729 LIST_PREPEND(ports, arg_expose_ports, p);
738 assert_not_reached("Unhandled option");
/* Cross-option validation starts here. */
741 if (arg_share_system)
742 arg_register = false;
744 if (arg_boot && arg_share_system) {
745 log_error("--boot and --share-system may not be combined.");
749 if (arg_keep_unit && cg_pid_get_owner_uid(0, NULL) >= 0) {
750 log_error("--keep-unit may not be used when invoked from a user session.");
754 if (arg_directory && arg_image) {
755 log_error("--directory= and --image= may not be combined.");
759 if (arg_template && arg_image) {
760 log_error("--template= and --image= may not be combined.");
764 if (arg_template && !(arg_directory || arg_machine)) {
765 log_error("--template= needs --directory= or --machine=.");
769 if (arg_ephemeral && arg_template) {
770 log_error("--ephemeral and --template= may not be combined.");
774 if (arg_ephemeral && arg_image) {
775 log_error("--ephemeral and --image= may not be combined.");
779 if (arg_ephemeral && !IN_SET(arg_link_journal, LINK_NO, LINK_AUTO)) {
780 log_error("--ephemeral and --link-journal= may not be combined.");
784 if (arg_volatile != VOLATILE_NO && arg_read_only) {
785 log_error("Cannot combine --read-only with --volatile. Note that --volatile already implies a read-only base hierarchy.");
789 if (arg_expose_ports && !arg_private_network) {
790 log_error("Cannot use --port= without private networking.");
/* Final capability set: defaults plus requested additions (and CAP_NET_ADMIN
 * with private networking), minus everything explicitly dropped. */
794 arg_retain = (arg_retain | plus | (arg_private_network ? 1ULL << CAP_NET_ADMIN : 0)) & ~minus;
/* Walk the static mount table and mount each entry below dest. For entries
 * whose "fatal" flag is set a failure is logged as an error, for the others
 * only as a warning. When arg_selinux_apifs_context is set, entries whose
 * source is "tmpfs" or "devpts" get a context="..." mount option appended. */
799 static int mount_all(const char *dest) {
801 typedef struct MountPoint {
810 static const MountPoint mount_table[] = {
811 { "proc", "/proc", "proc", NULL, MS_NOSUID|MS_NOEXEC|MS_NODEV, true },
812 { "/proc/sys", "/proc/sys", NULL, NULL, MS_BIND, true }, /* Bind mount first */
813 { NULL, "/proc/sys", NULL, NULL, MS_BIND|MS_RDONLY|MS_REMOUNT, true }, /* Then, make it r/o */
814 { "sysfs", "/sys", "sysfs", NULL, MS_RDONLY|MS_NOSUID|MS_NOEXEC|MS_NODEV, true },
815 { "tmpfs", "/dev", "tmpfs", "mode=755", MS_NOSUID|MS_STRICTATIME, true },
816 { "devpts", "/dev/pts", "devpts","newinstance,ptmxmode=0666,mode=620,gid=" STRINGIFY(TTY_GID), MS_NOSUID|MS_NOEXEC, true },
817 { "tmpfs", "/dev/shm", "tmpfs", "mode=1777", MS_NOSUID|MS_NODEV|MS_STRICTATIME, true },
818 { "tmpfs", "/run", "tmpfs", "mode=755", MS_NOSUID|MS_NODEV|MS_STRICTATIME, true },
820 { "/sys/fs/selinux", "/sys/fs/selinux", NULL, NULL, MS_BIND, false }, /* Bind mount first */
821 { NULL, "/sys/fs/selinux", NULL, NULL, MS_BIND|MS_RDONLY|MS_REMOUNT, false }, /* Then, make it r/o */
828 for (k = 0; k < ELEMENTSOF(mount_table); k++) {
829 _cleanup_free_ char *where = NULL;
831 _cleanup_free_ char *options = NULL;
836 where = strjoin(dest, "/", mount_table[k].where, NULL);
840 t = path_is_mount_point(where, true);
842 log_error_errno(t, "Failed to detect whether %s is a mount point: %m", where);
850 /* Skip this entry if it is not a remount. */
/* Remount entries have a NULL source ("what"), so they are still processed
 * when the target is already a mount point. */
851 if (mount_table[k].what && t > 0)
854 t = mkdir_p(where, 0755);
856 if (mount_table[k].fatal) {
857 log_error_errno(t, "Failed to create directory %s: %m", where);
862 log_warning_errno(t, "Failed to create directory %s: %m", where);
868 if (arg_selinux_apifs_context &&
869 (streq_ptr(mount_table[k].what, "tmpfs") || streq_ptr(mount_table[k].what, "devpts"))) {
870 options = strjoin(mount_table[k].options, ",context=\"", arg_selinux_apifs_context, "\"", NULL);
877 o = mount_table[k].options;
880 if (mount(mount_table[k].what,
883 mount_table[k].flags,
886 if (mount_table[k].fatal) {
887 log_error_errno(errno, "mount(%s) failed: %m", where);
892 log_warning_errno(errno, "mount(%s) failed: %m", where);
/* Bind mount every (source, destination) pair of l below dest. If the
 * destination does not exist yet, a mount point matching the source's file
 * type is created first; an existing destination must have the same file
 * type as the source. The read-only remount through bind_remount_recursive()
 * is presumably applied only when ro is set (the condition is not visible in
 * this excerpt). Failures are logged and returned to the caller. */
899 static int mount_binds(const char *dest, char **l, bool ro) {
902 STRV_FOREACH_PAIR(x, y, l) {
903 _cleanup_free_ char *where = NULL;
904 struct stat source_st, dest_st;
907 if (stat(*x, &source_st) < 0)
908 return log_error_errno(errno, "Failed to stat %s: %m", *x);
910 where = strappend(dest, *y);
914 r = stat(where, &dest_st);
916 if ((source_st.st_mode & S_IFMT) != (dest_st.st_mode & S_IFMT)) {
917 log_error("The file types of %s and %s do not match. Refusing bind mount", *x, where);
920 } else if (errno == ENOENT) {
921 r = mkdir_parents_label(where, 0755);
923 return log_error_errno(r, "Failed to bind mount %s: %m", *x);
925 log_error_errno(errno, "Failed to bind mount %s: %m", *x);
929 /* Create the mount point, but be conservative -- refuse to create block
930 * and char devices. */
931 if (S_ISDIR(source_st.st_mode)) {
932 r = mkdir_label(where, 0755);
/* The result of mkdir_label() is logged below as an errno-style code held in
 * r, so test r itself (as mount_tmpfs() does) rather than the global errno. */
933 if (r < 0 && r != -EEXIST)
934 return log_error_errno(r, "Failed to create mount point %s: %m", where);
935 } else if (S_ISFIFO(source_st.st_mode)) {
936 r = mkfifo(where, 0644);
937 if (r < 0 && errno != EEXIST)
938 return log_error_errno(errno, "Failed to create mount point %s: %m", where);
939 } else if (S_ISSOCK(source_st.st_mode)) {
940 r = mknod(where, 0644 | S_IFSOCK, 0);
941 if (r < 0 && errno != EEXIST)
942 return log_error_errno(errno, "Failed to create mount point %s: %m", where);
943 } else if (S_ISREG(source_st.st_mode)) {
946 return log_error_errno(r, "Failed to create mount point %s: %m", where);
948 log_error("Refusing to create mountpoint for file: %s", *x);
952 if (mount(*x, where, "bind", MS_BIND, NULL) < 0)
953 return log_error_errno(errno, "mount(%s) failed: %m", where);
956 r = bind_remount_recursive(where, true);
958 return log_error_errno(r, "Read-Only bind mount failed: %m");
/* Mount the cgroup hierarchy for controller at dest/sys/fs/cgroup/<hierarchy>,
 * adding MS_RDONLY when read_only is set. path_is_mount_point() is consulted
 * first, presumably to skip hierarchies that are already mounted (that branch
 * is not visible in this excerpt). */
965 static int mount_cgroup_hierarchy(const char *dest, const char *controller, const char *hierarchy, bool read_only) {
969 to = strappenda(dest, "/sys/fs/cgroup/", hierarchy);
971 r = path_is_mount_point(to, false);
973 return log_error_errno(r, "Failed to determine if %s is mounted already: %m", to);
979 if (mount("cgroup", to, "cgroup", MS_NOSUID|MS_NOEXEC|MS_NODEV|(read_only ? MS_RDONLY : 0), controller) < 0)
980 return log_error_errno(errno, "Failed to mount to %s: %m", to);
/* Build the cgroup tree below dest/sys/fs/cgroup:
 *  - mount a tmpfs as the base,
 *  - mount every kernel controller read-only; where the host has a symlink for
 *    a controller, mount the combined hierarchy it points to and recreate the
 *    symlink,
 *  - mount the name=systemd hierarchy writable, bind mount our own cgroup on
 *    top of itself, then remount the systemd hierarchy root read-only,
 *  - finally remount the tmpfs base read-only. */
985 static int mount_cgroup(const char *dest) {
986 _cleanup_set_free_free_ Set *controllers = NULL;
987 _cleanup_free_ char *own_cgroup_path = NULL;
988 const char *cgroup_root, *systemd_root, *systemd_own;
991 controllers = set_new(&string_hash_ops);
995 r = cg_kernel_controllers(controllers);
997 return log_error_errno(r, "Failed to determine cgroup controllers: %m");
999 r = cg_pid_get_path(NULL, 0, &own_cgroup_path);
1001 return log_error_errno(r, "Failed to determine our own cgroup path: %m");
1003 cgroup_root = strappenda(dest, "/sys/fs/cgroup");
1004 if (mount("tmpfs", cgroup_root, "tmpfs", MS_NOSUID|MS_NOEXEC|MS_NODEV|MS_STRICTATIME, "mode=755") < 0)
1005 return log_error_errno(errno, "Failed to mount tmpfs to /sys/fs/cgroup: %m");
1008 _cleanup_free_ char *controller = NULL, *origin = NULL, *combined = NULL;
/* Controllers are taken out of the set one at a time. */
1010 controller = set_steal_first(controllers);
/* origin is the host's path for this controller; readlink_malloc() tells
 * whether it is a symlink to a combined hierarchy. */
1014 origin = strappend("/sys/fs/cgroup/", controller);
1018 r = readlink_malloc(origin, &combined);
1020 /* Not a symbolic link, but directly a single cgroup hierarchy */
1022 r = mount_cgroup_hierarchy(dest, controller, controller, true);
1027 return log_error_errno(r, "Failed to read link %s: %m", origin);
1029 _cleanup_free_ char *target = NULL;
1031 target = strjoin(dest, "/sys/fs/cgroup/", controller, NULL);
1035 /* A symbolic link, a combination of controllers in one hierarchy */
1037 if (!filename_is_valid(combined)) {
1038 log_warning("Ignoring invalid combined hierarchy %s.", combined);
1042 r = mount_cgroup_hierarchy(dest, combined, combined, true);
1046 if (symlink(combined, target) < 0)
1047 return log_error_errno(errno, "Failed to create symlink for combined hierarchy: %m");
1051 r = mount_cgroup_hierarchy(dest, "name=systemd", "systemd", false);
1055 /* Make our own cgroup a (writable) bind mount */
1056 systemd_own = strappenda(dest, "/sys/fs/cgroup/systemd", own_cgroup_path);
1057 if (mount(systemd_own, systemd_own, NULL, MS_BIND, NULL) < 0)
1058 return log_error_errno(errno, "Failed to turn %s into a bind mount: %m", own_cgroup_path);
1060 /* And then remount the systemd cgroup root read-only */
1061 systemd_root = strappenda(dest, "/sys/fs/cgroup/systemd");
1062 if (mount(NULL, systemd_root, NULL, MS_BIND|MS_REMOUNT|MS_NOSUID|MS_NOEXEC|MS_NODEV|MS_RDONLY, NULL) < 0)
1063 return log_error_errno(errno, "Failed to mount cgroup root read-only: %m");
/* Finally make the tmpfs base itself read-only. */
1065 if (mount(NULL, cgroup_root, NULL, MS_REMOUNT|MS_NOSUID|MS_NOEXEC|MS_NODEV|MS_STRICTATIME|MS_RDONLY, "mode=755") < 0)
1066 return log_error_errno(errno, "Failed to remount %s read-only: %m", cgroup_root);
/* Mount a tmpfs for every (path, mount options) pair in arg_tmpfs. The mount
 * point dest+path is created with mkdir_label(); an already existing
 * directory (-EEXIST) is accepted. Failures are logged and returned. */
1071 static int mount_tmpfs(const char *dest) {
1074 STRV_FOREACH_PAIR(i, o, arg_tmpfs) {
1075 _cleanup_free_ char *where = NULL;
1078 where = strappend(dest, *i);
1082 r = mkdir_label(where, 0755);
1083 if (r < 0 && r != -EEXIST)
1084 return log_error_errno(r, "Creating mount point for tmpfs %s failed: %m", where);
1086 if (mount("tmpfs", where, "tmpfs", MS_NODEV|MS_STRICTATIME, *o) < 0)
1087 return log_error_errno(errno, "tmpfs mount to %s failed: %m", where);
/* Make dest/etc/localtime a symlink to the zone that the host's /etc/localtime
 * points to. A warning is logged, and the container is left untouched, when
 * the host file is not a symlink, does not point into /usr/share/zoneinfo/, or
 * names a zone that does not exist below dest. Nothing is changed when the
 * container link already names the same zone. */
1093 static int setup_timezone(const char *dest) {
1094 _cleanup_free_ char *where = NULL, *p = NULL, *q = NULL, *check = NULL, *what = NULL;
1100 /* Fix the timezone, if possible */
1101 r = readlink_malloc("/etc/localtime", &p);
1103 log_warning("/etc/localtime is not a symlink, not updating container timezone.");
/* z is the zone name on the host; both relative and absolute link targets are
 * tried. */
1107 z = path_startswith(p, "../usr/share/zoneinfo/");
1109 z = path_startswith(p, "/usr/share/zoneinfo/");
1111 log_warning("/etc/localtime does not point into /usr/share/zoneinfo/, not updating container timezone.");
1115 where = strappend(dest, "/etc/localtime");
/* y is the zone name the container currently links to, if any. */
1119 r = readlink_malloc(where, &q);
1121 y = path_startswith(q, "../usr/share/zoneinfo/");
1123 y = path_startswith(q, "/usr/share/zoneinfo/");
1125 /* Already pointing to the right place? Then do nothing .. */
1126 if (y && streq(y, z))
1130 check = strjoin(dest, "/usr/share/zoneinfo/", z, NULL);
1134 if (access(check, F_OK) < 0) {
1135 log_warning("Timezone %s does not exist in container, not updating container timezone.", z);
/* The new link target is always written in the relative form. */
1139 what = strappend("../usr/share/zoneinfo/", z);
1143 r = mkdir_parents(where, 0755);
1145 log_error_errno(r, "Failed to create directory for timezone info %s in container: %m", where);
1151 if (r < 0 && errno != ENOENT) {
1152 log_error_errno(errno, "Failed to remove existing timezone info %s in container: %m", where);
1157 if (symlink(what, where) < 0) {
1158 log_error_errno(errno, "Failed to correct timezone of container: %m");
/* Copy the host's /etc/resolv.conf to dest/etc/resolv.conf. The
 * arg_private_network flag is checked first; presumably the copy is skipped
 * in that case (the branch body is not visible in this excerpt). Failures are
 * only logged as warnings. */
1165 static int setup_resolv_conf(const char *dest) {
1166 _cleanup_free_ char *where = NULL;
1171 if (arg_private_network)
1174 /* Fix resolv.conf, if possible */
1175 where = strappend(dest, "/etc/resolv.conf");
1179 /* This is best effort: if any of the following steps fail we
1180 * only log a warning. */
1181 r = mkdir_parents(where, 0755);
1183 log_warning_errno(r, "Failed to create parent directory for resolv.conf %s: %m", where);
1188 r = copy_file("/etc/resolv.conf", where, O_TRUNC|O_NOFOLLOW, 0644, 0);
1190 log_warning_errno(r, "Failed to copy /etc/resolv.conf to %s: %m", where);
/* Implement --volatile=state: remount directory read-only (recursively) and
 * mount a fresh tmpfs on directory/var. The function only acts when
 * arg_volatile is VOLATILE_STATE. Failures are logged and returned. */
1198 static int setup_volatile_state(const char *directory) {
1204 if (arg_volatile != VOLATILE_STATE)
1207 /* --volatile=state means we simply overmount /var
1208 with a tmpfs, and the rest read-only. */
1210 r = bind_remount_recursive(directory, true);
1212 return log_error_errno(r, "Failed to remount %s read-only: %m", directory);
1214 p = strappenda(directory, "/var");
/* Report the path whose creation actually failed (p, i.e. directory/var),
 * not the container root. */
1216 if (r < 0 && errno != EEXIST)
1217 return log_error_errno(errno, "Failed to create %s: %m", p);
1219 if (mount("tmpfs", p, "tmpfs", MS_STRICTATIME, "mode=755") < 0)
1220 return log_error_errno(errno, "Failed to mount tmpfs to /var: %m");
/* Implement --volatile=yes: create a temporary directory, mount a tmpfs on it,
 * bind mount directory/usr into it read-only, and finally move the tmpfs over
 * directory with MS_MOVE. The function only acts when arg_volatile is
 * VOLATILE_YES. tmpfs_mounted and bind_mounted record how far setup got,
 * presumably for cleanup on failure (the cleanup code is not visible in this
 * excerpt). */
1225 static int setup_volatile(const char *directory) {
1226 bool tmpfs_mounted = false, bind_mounted = false;
1227 char template[] = "/tmp/nspawn-volatile-XXXXXX";
1233 if (arg_volatile != VOLATILE_YES)
1236 /* --volatile=yes means we mount a tmpfs to the root dir, and
1237 the original /usr to use inside it, and that read-only. */
1239 if (!mkdtemp(template))
1240 return log_error_errno(errno, "Failed to create temporary directory: %m");
1242 if (mount("tmpfs", template, "tmpfs", MS_STRICTATIME, "mode=755") < 0) {
1243 log_error_errno(errno, "Failed to mount tmpfs for root directory: %m");
1248 tmpfs_mounted = true;
/* f is the original /usr, t the place it is bind mounted to inside the new
 * tmpfs root. */
1250 f = strappenda(directory, "/usr");
1251 t = strappenda(template, "/usr");
1254 if (r < 0 && errno != EEXIST) {
1255 log_error_errno(errno, "Failed to create %s: %m", t);
1260 if (mount(f, t, "bind", MS_BIND|MS_REC, NULL) < 0) {
1261 log_error_errno(errno, "Failed to create /usr bind mount: %m");
1266 bind_mounted = true;
1268 r = bind_remount_recursive(t, true);
1270 log_error_errno(r, "Failed to remount %s read-only: %m", t);
/* Replace the container root with the prepared tmpfs. */
1274 if (mount(template, directory, NULL, MS_MOVE, NULL) < 0) {
1275 log_error_errno(errno, "Failed to move root mount: %m");
/* Format id as lowercase hexadecimal in the 8-4-4-4-12 UUID layout. The
 * destination s is declared as a 37 byte buffer, which fits the 36 characters
 * plus the terminating NUL. The returned pointer is presumably s itself (the
 * return statement is not visible in this excerpt). */
1293 static char* id128_format_as_uuid(sd_id128_t id, char s[37]) {
1296 "%02x%02x%02x%02x-%02x%02x-%02x%02x-%02x%02x-%02x%02x%02x%02x%02x%02x",
1297 SD_ID128_FORMAT_VAL(id));
/* Give the container its own boot ID: write a freshly randomized ID, formatted
 * as a UUID, to a file below dest/dev and bind mount that file over
 * dest/proc/sys/kernel/random/boot_id, then try to make the mount read-only.
 * A failure of the read-only remount is only logged as a warning. The
 * arg_share_system flag is checked first; presumably the function does
 * nothing in that case (the branch body is not visible in this excerpt). */
1302 static int setup_boot_id(const char *dest) {
1303 _cleanup_free_ char *from = NULL, *to = NULL;
1304 sd_id128_t rnd = {};
1310 if (arg_share_system)
1313 /* Generate a new randomized boot ID, so that each boot-up of
1314 * the container gets a new one */
1316 from = strappend(dest, "/dev/proc-sys-kernel-random-boot-id");
1317 to = strappend(dest, "/proc/sys/kernel/random/boot_id");
1321 r = sd_id128_randomize(&rnd);
1323 return log_error_errno(r, "Failed to generate random boot id: %m");
1325 id128_format_as_uuid(rnd, as_uuid);
1327 r = write_string_file(from, as_uuid);
1329 return log_error_errno(r, "Failed to write boot id: %m");
1331 if (mount(from, to, "bind", MS_BIND, NULL) < 0) {
1332 log_error_errno(errno, "Failed to bind mount boot id: %m");
1334 } else if (mount(from, to, "bind", MS_BIND|MS_REMOUNT|MS_RDONLY, NULL))
1335 log_warning_errno(errno, "Failed to make boot id read-only: %m");
/* Recreate a fixed list of host device nodes below dest/dev. For each name in
 * devnodes the host node /dev/<name> is stat()ed and a node with the same mode
 * and device number is created with mknod() at dest/dev/<name>. A stat()
 * failure other than ENOENT is logged and returned. Nodes that are neither
 * character nor block devices are reported as an error. */
1341 static int copy_devnodes(const char *dest) {
1343 static const char devnodes[] =
1354 _cleanup_umask_ mode_t u;
1360 NULSTR_FOREACH(d, devnodes) {
1361 _cleanup_free_ char *from = NULL, *to = NULL;
1364 from = strappend("/dev/", d);
1365 to = strjoin(dest, "/dev/", d, NULL);
1369 if (stat(from, &st) < 0) {
1371 if (errno != ENOENT)
1372 return log_error_errno(errno, "Failed to stat %s: %m", from);
1374 } else if (!S_ISCHR(st.st_mode) && !S_ISBLK(st.st_mode)) {
1376 log_error("%s is not a char or block device, cannot copy", from);
1380 r = mkdir_parents(to, 0775);
1382 log_error_errno(r, "Failed to create parent directory of %s: %m", to);
1386 if (mknod(to, st.st_mode, st.st_rdev) < 0)
1387 return log_error_errno(errno, "mknod(%s) failed: %m", to);
/* Create dest/dev/ptmx as a symlink to "pts/ptmx". A symlink() failure is
 * logged and returned as an error. */
1394 static int setup_ptmx(const char *dest) {
1395 _cleanup_free_ char *p = NULL;
1397 p = strappend(dest, "/dev/ptmx");
1401 if (symlink("pts/ptmx", p) < 0)
1402 return log_error_errno(errno, "Failed to create /dev/ptmx symlink: %m");
/* Make the TTY named by console available as dest/dev/console. The TTY's
 * access mode and ownership are set to 0600 root:root, a placeholder device
 * node modelled on /dev/null is created at dest/dev/console, and the TTY is
 * bind mounted over it. Failures are logged and returned. */
1407 static int setup_dev_console(const char *dest, const char *console) {
1408 _cleanup_umask_ mode_t u;
1418 if (stat("/dev/null", &st) < 0)
1419 return log_error_errno(errno, "Failed to stat /dev/null: %m");
1421 r = chmod_and_chown(console, 0600, 0, 0);
1423 return log_error_errno(r, "Failed to correct access mode for TTY: %m");
1425 /* We need to bind mount the right tty to /dev/console since
1426 * ptys can only exist on pts file systems. To have something
1427 * to bind mount things on we create a device node first, and
1428 * use /dev/null for that since the cgroups device policy
1429 * allows us to create that freely, while we cannot create
1430 * /dev/console. (Note that the major minor doesn't actually
1431 * matter here, since we mount it over anyway). */
1433 to = strappenda(dest, "/dev/console");
/* Keep the file type bits of /dev/null but force the permissions to 0600. */
1434 if (mknod(to, (st.st_mode & ~07777) | 0600, st.st_rdev) < 0)
1435 return log_error_errno(errno, "mknod() for /dev/console failed: %m");
1437 if (mount(console, to, "bind", MS_BIND, NULL) < 0)
1438 return log_error_errno(errno, "Bind mount for /dev/console failed: %m");
/* Provide a fake /proc/kmsg inside the container: create a FIFO at
 * dest/dev/kmsg, bind mount it to dest/proc/kmsg, open it and pass the open
 * file descriptor over kmsg_socket as an SCM_RIGHTS control message.
 * kmsg_socket must be a valid descriptor (asserted). Failures are logged and
 * returned. */
1443 static int setup_kmsg(const char *dest, int kmsg_socket) {
1444 _cleanup_free_ char *from = NULL, *to = NULL;
1445 _cleanup_umask_ mode_t u;
1448 struct cmsghdr cmsghdr;
1449 uint8_t buf[CMSG_SPACE(sizeof(int))];
1451 struct msghdr mh = {
1452 .msg_control = &control,
1453 .msg_controllen = sizeof(control),
1455 struct cmsghdr *cmsg;
1458 assert(kmsg_socket >= 0);
1462 /* We create the kmsg FIFO as /dev/kmsg, but immediately
1463 * delete it after bind mounting it to /proc/kmsg. While FIFOs
1464 * on the reading side behave very similar to /proc/kmsg,
1465 * their writing side behaves differently from /dev/kmsg in
1466 * that writing blocks when nothing is reading. In order to
1467 * avoid any problems with containers deadlocking due to this
1468 * we simply make /dev/kmsg unavailable to the container. */
1469 if (asprintf(&from, "%s/dev/kmsg", dest) < 0 ||
1470 asprintf(&to, "%s/proc/kmsg", dest) < 0)
1473 if (mkfifo(from, 0600) < 0)
1474 return log_error_errno(errno, "mkfifo() for /dev/kmsg failed: %m");
1476 r = chmod_and_chown(from, 0600, 0, 0);
1478 return log_error_errno(r, "Failed to correct access mode for /dev/kmsg: %m");
1480 if (mount(from, to, "bind", MS_BIND, NULL) < 0)
1481 return log_error_errno(errno, "Bind mount for /proc/kmsg failed: %m");
1483 fd = open(from, O_RDWR|O_NDELAY|O_CLOEXEC);
1485 return log_error_errno(errno, "Failed to open fifo: %m");
/* Build a single SCM_RIGHTS control message carrying fd. */
1487 cmsg = CMSG_FIRSTHDR(&mh);
1488 cmsg->cmsg_level = SOL_SOCKET;
1489 cmsg->cmsg_type = SCM_RIGHTS;
1490 cmsg->cmsg_len = CMSG_LEN(sizeof(int));
1491 memcpy(CMSG_DATA(cmsg), &fd, sizeof(int));
1493 mh.msg_controllen = cmsg->cmsg_len;
1495 /* Store away the fd in the socket, so that it stays open as
1496 * long as we run the child */
1497 k = sendmsg(kmsg_socket, &mh, MSG_NOSIGNAL);
1501 return log_error_errno(errno, "Failed to send FIFO fd: %m");
1503 /* And now make the FIFO unavailable as /dev/kmsg... */
/* Allocates a non-blocking, close-on-exec NETLINK_ROUTE socket and passes it
 * over send_fd as an SCM_RIGHTS control message. arg_expose_ports is checked
 * before any socket is created. Visible failure paths log through
 * log_error_errno() and return its result. */
1508 static int send_rtnl(int send_fd) {
/* Control-message storage sized for exactly one int (the fd to pass). */
1510 struct cmsghdr cmsghdr;
1511 uint8_t buf[CMSG_SPACE(sizeof(int))];
1513 struct msghdr mh = {
1514 .msg_control = &control,
1515 .msg_controllen = sizeof(control),
1517 struct cmsghdr *cmsg;
1518 _cleanup_close_ int fd = -1;
1521 assert(send_fd >= 0);
1523 if (!arg_expose_ports)
1526 fd = socket(PF_NETLINK, SOCK_RAW|SOCK_CLOEXEC|SOCK_NONBLOCK, NETLINK_ROUTE);
1528 return log_error_errno(errno, "failed to allocate container netlink: %m");
/* Fill in a single SCM_RIGHTS header whose payload is the netlink fd. */
1530 cmsg = CMSG_FIRSTHDR(&mh);
1531 cmsg->cmsg_level = SOL_SOCKET;
1532 cmsg->cmsg_type = SCM_RIGHTS;
1533 cmsg->cmsg_len = CMSG_LEN(sizeof(int));
1534 memcpy(CMSG_DATA(cmsg), &fd, sizeof(int));
/* Trim the advertised control length to the one header that was filled in. */
1536 mh.msg_controllen = cmsg->cmsg_len;
1538 /* Store away the fd in the socket, so that it stays open as
1539 * long as we run the child */
1540 k = sendmsg(send_fd, &mh, MSG_NOSIGNAL);
1542 return log_error_errno(errno, "Failed to send netlink fd: %m");
/* Called when the previously exposed address is gone: for every entry in
 * arg_expose_ports it calls fw_add_local_dnat() with false as first argument
 * (presumably withdrawing the rule that expose_ports() installed with true)
 * and then resets *exposed to IN_ADDR_NULL. Firewall failures are only
 * logged as warnings. arg_expose_ports and a null *exposed are both checked
 * before any rule is touched. */
1547 static int flush_ports(union in_addr_union *exposed) {
1549 int r, af = AF_INET;
1553 if (!arg_expose_ports)
1556 if (in_addr_is_null(af, exposed))
1559 log_debug("Lost IP address.");
1561 LIST_FOREACH(ports, p, arg_expose_ports) {
1562 r = fw_add_local_dnat(false,
1573 log_warning_errno(r, "Failed to modify firewall: %m");
/* Remember that nothing is exposed any more. */
1576 *exposed = IN_ADDR_NULL;
/* Re-evaluates which address the exposed ports should be forwarded to.
 *
 * AF_INET addresses are enumerated through local_addresses() on rtnl; the
 * first entry qualifies only if its family is AF_INET and its scope is below
 * RT_SCOPE_LINK. The visible `return flush_ports(exposed)` presumably covers
 * the case where no entry qualifies. The in_addr_equal() test against
 * *exposed precedes the firewall loop, so an unchanged address presumably
 * ends the function early. Otherwise fw_add_local_dnat() is called with true
 * for each entry of arg_expose_ports, receiving the previous address (or NULL
 * when that was the null address), and *exposed is set to the new address.
 * Firewall failures are only logged as warnings. */
1580 static int expose_ports(sd_rtnl *rtnl, union in_addr_union *exposed) {
1581 _cleanup_free_ struct local_address *addresses = NULL;
1582 _cleanup_free_ char *pretty = NULL;
1583 union in_addr_union new_exposed;
1586 int af = AF_INET, r;
1590 /* Invoked each time an address is added or removed inside the
1593 if (!arg_expose_ports)
1596 r = local_addresses(rtnl, 0, af, &addresses);
1598 return log_error_errno(r, "Failed to enumerate local addresses: %m");
1601 addresses[0].family == af &&
1602 addresses[0].scope < RT_SCOPE_LINK;
1605 return flush_ports(exposed);
1607 new_exposed = addresses[0].address;
1608 if (in_addr_equal(af, exposed, &new_exposed))
1611 in_addr_to_string(af, &new_exposed, &pretty);
1612 log_debug("New container IP is %s.", strna(pretty));
1614 LIST_FOREACH(ports, p, arg_expose_ports) {
1616 r = fw_add_local_dnat(true,
1625 in_addr_is_null(af, exposed) ? NULL : exposed);
1627 log_warning_errno(r, "Failed to modify firewall: %m");
1630 *exposed = new_exposed;
/* Netlink match callback; watch_rtnl() registers it for both RTM_NEWADDR and
 * RTM_DELADDR with the exposed-address slot as userdata. It re-runs
 * expose_ports(); the result of that call is not propagated. */
1634 static int on_address_change(sd_rtnl *rtnl, sd_rtnl_message *m, void *userdata) {
1635 union in_addr_union *exposed = userdata;
1641 expose_ports(rtnl, exposed);
/* Receives a netlink fd from recv_fd (as an SCM_RIGHTS control message),
 * wraps it in an sd_rtnl object joined to RTNLGRP_IPV4_IFADDR, registers
 * on_address_change() for RTM_NEWADDR and RTM_DELADDR with exposed as
 * userdata, and attaches the object to event. arg_expose_ports is checked
 * before anything is received. Visible failure paths log through
 * log_error_errno() and return its result. */
1645 static int watch_rtnl(sd_event *event, int recv_fd, union in_addr_union *exposed, sd_rtnl **ret) {
/* Control-message storage sized for exactly one int (the fd to receive). */
1647 struct cmsghdr cmsghdr;
1648 uint8_t buf[CMSG_SPACE(sizeof(int))];
1650 struct msghdr mh = {
1651 .msg_control = &control,
1652 .msg_controllen = sizeof(control),
1654 struct cmsghdr *cmsg;
1655 _cleanup_rtnl_unref_ sd_rtnl *rtnl = NULL;
1660 assert(recv_fd >= 0);
1663 if (!arg_expose_ports)
1666 k = recvmsg(recv_fd, &mh, MSG_NOSIGNAL);
1668 return log_error_errno(errno, "Failed to recv netlink fd: %m");
/* The message must carry exactly one SCM_RIGHTS header holding one int; any
 * other shape trips an assert instead of producing an error return.
 * NOTE(review): cmsg itself is dereferenced without a visible NULL check. */
1670 cmsg = CMSG_FIRSTHDR(&mh);
1671 assert(cmsg->cmsg_level == SOL_SOCKET);
1672 assert(cmsg->cmsg_type == SCM_RIGHTS);
1673 assert(cmsg->cmsg_len == CMSG_LEN(sizeof(int)));
1674 memcpy(&fd, CMSG_DATA(cmsg), sizeof(int));
1676 r = sd_rtnl_open_fd(&rtnl, fd, 1, RTNLGRP_IPV4_IFADDR);
1679 return log_error_errno(r, "Failed to create rtnl object: %m");
/* The same callback handles both address additions and removals. */
1682 r = sd_rtnl_add_match(rtnl, RTM_NEWADDR, on_address_change, exposed);
1684 return log_error_errno(r, "Failed to subscribe to RTM_NEWADDR messages: %m");
1686 r = sd_rtnl_add_match(rtnl, RTM_DELADDR, on_address_change, exposed);
1688 return log_error_errno(r, "Failed to subscribe to RTM_DELADDR messages: %m");
1690 r = sd_rtnl_attach_event(rtnl, event, 0);
/* NOTE(review): the message below reads "even loop"; "event loop" is
 * presumably what was meant. */
1692 return log_error_errno(r, "Failed to add to even loop: %m");
/* Sets the hostname to arg_machine through sethostname_idempotent().
 * arg_share_system is checked first. */
1700 static int setup_hostname(void) {
1702 if (arg_share_system)
1705 if (sethostname_idempotent(arg_machine) < 0)
/* Connects the journal directory /var/log/journal/<machine-id> of the host
 * with the one below the container root `directory`, as selected by
 * arg_link_journal (LINK_AUTO, LINK_NO, LINK_GUEST, LINK_HOST) and relaxed by
 * arg_link_journal_try.
 *
 * The machine ID is read from <directory>/etc/machine-id, validated with
 * sd_id128_from_string(), and compared with the host's own ID; identical IDs
 * are refused. In the visible code LINK_GUEST makes the host path a symlink
 * to the container's directory, and the function ends by bind mounting the
 * host path onto the container path. */
1711 static int setup_journal(const char *directory) {
1712 sd_id128_t machine_id, this_id;
1713 _cleanup_free_ char *p = NULL, *b = NULL, *q = NULL, *d = NULL;
1717 /* Don't link journals in ephemeral mode */
1721 p = strappend(directory, "/etc/machine-id");
1725 r = read_one_line_file(p, &b);
/* A missing machine-id file is special-cased in LINK_AUTO mode. */
1726 if (r == -ENOENT && arg_link_journal == LINK_AUTO)
1729 return log_error_errno(r, "Failed to read machine ID from %s: %m", p);
/* So is an empty ID. */
1732 if (isempty(id) && arg_link_journal == LINK_AUTO)
1735 /* Verify validity */
1736 r = sd_id128_from_string(id, &machine_id);
1738 return log_error_errno(r, "Failed to parse machine ID from %s: %m", p);
1740 r = sd_id128_get_machine(&this_id);
1742 return log_error_errno(r, "Failed to retrieve machine ID: %m");
/* Same ID on both sides: logged as a warning in LINK_AUTO mode, as an error
 * otherwise. */
1744 if (sd_id128_equal(machine_id, this_id)) {
1745 log_full(arg_link_journal == LINK_AUTO ? LOG_WARNING : LOG_ERR,
1746 "Host and machine ids are equal (%s): refusing to link journals", id);
1747 if (arg_link_journal == LINK_AUTO)
1752 if (arg_link_journal == LINK_NO)
/* From here on p is the host-side journal directory and q is the same path
 * below the container root. */
1756 p = strappend("/var/log/journal/", id);
1757 q = strjoin(directory, "/var/log/journal/", id, NULL);
/* Neither side may already be a mount point; outside LINK_AUTO that is
 * reported as an error. */
1761 if (path_is_mount_point(p, false) > 0) {
1762 if (arg_link_journal != LINK_AUTO) {
1763 log_error("%s: already a mount point, refusing to use for journal", p);
1770 if (path_is_mount_point(q, false) > 0) {
1771 if (arg_link_journal != LINK_AUTO) {
1772 log_error("%s: already a mount point, refusing to use for journal", q);
/* Inspect what currently sits at the host path; r is later compared against
 * -EINVAL and -ENOENT, so it carries a negative errno-style code. */
1779 r = readlink_and_make_absolute(p, &d);
1781 if ((arg_link_journal == LINK_GUEST ||
1782 arg_link_journal == LINK_AUTO) &&
1785 r = mkdir_p(q, 0755);
/* NOTE(review): the mkdir_p() result is in r, yet errno is what gets logged
 * here - confirm errno is still meaningful at this point. */
1787 log_warning_errno(errno, "Failed to create directory %s: %m", q);
1792 return log_error_errno(errno, "Failed to remove symlink %s: %m", p);
1793 } else if (r == -EINVAL) {
1795 if (arg_link_journal == LINK_GUEST &&
1798 if (errno == ENOTDIR) {
1799 log_error("%s already exists and is neither a symlink nor a directory", p);
1802 log_error_errno(errno, "Failed to remove %s: %m", p);
1806 } else if (r != -ENOENT) {
/* NOTE(review): the readlink failure code is in r; errno is logged instead -
 * confirm which one is intended. */
1807 log_error_errno(errno, "readlink(%s) failed: %m", p);
1811 if (arg_link_journal == LINK_GUEST) {
/* Host path p becomes a symlink pointing at the container's directory q. */
1813 if (symlink(q, p) < 0) {
1814 if (arg_link_journal_try) {
1815 log_debug_errno(errno, "Failed to symlink %s to %s, skipping journal setup: %m", q, p);
1818 log_error_errno(errno, "Failed to symlink %s to %s: %m", q, p);
1823 r = mkdir_p(q, 0755);
/* NOTE(review): same r-versus-errno question as above. */
1825 log_warning_errno(errno, "Failed to create directory %s: %m", q);
1829 if (arg_link_journal == LINK_HOST) {
1830 /* don't create parents here -- if the host doesn't have
1831 * permanent journal set up, don't force it here */
1834 if (arg_link_journal_try) {
1835 log_debug_errno(errno, "Failed to create %s, skipping journal setup: %m", p);
1838 log_error_errno(errno, "Failed to create %s: %m", p);
1843 } else if (access(p, F_OK) < 0)
/* A non-empty container-side directory only produces a warning. */
1846 if (dir_is_empty(q) == 0)
1847 log_warning("%s is not empty, proceeding anyway.", q);
1849 r = mkdir_p(q, 0755);
/* NOTE(review): same r-versus-errno question as above. */
1851 log_error_errno(errno, "Failed to create %s: %m", q);
/* Make the host-side journal directory visible inside the container. */
1855 if (mount(p, q, "bind", MS_BIND, NULL) < 0)
1856 return log_error_errno(errno, "Failed to bind mount journal from host into guest: %m");
/* Passes the complement of arg_retain to capability_bounding_set_drop() and
 * returns its result; presumably this leaves only the capabilities listed in
 * arg_retain in the bounding set. */
1861 static int drop_capabilities(void) {
1862 return capability_bounding_set_drop(~arg_retain, false);
/* Registers the container with org.freedesktop.machine1 over the system bus.
 *
 * With arg_keep_unit set, RegisterMachineWithNetwork is invoked directly.
 * Otherwise a CreateMachineWithNetwork message is assembled by hand so that
 * an array of (sv) properties can be appended: Slice (only when arg_slice is
 * non-empty), DevicePolicy=strict, and a DeviceAllow list. Both variants pass
 * arg_uuid, arg_directory (empty string if NULL) and local_ifindex; the count
 * preceding local_ifindex is 1 only when the index is positive.
 *
 * Message-construction failures log through log_error_errno() and return its
 * result; a failed bus call is logged with the bus error message. */
1865 static int register_machine(pid_t pid, int local_ifindex) {
1866 _cleanup_bus_error_free_ sd_bus_error error = SD_BUS_ERROR_NULL;
1867 _cleanup_bus_close_unref_ sd_bus *bus = NULL;
1873 r = sd_bus_default_system(&bus);
1875 return log_error_errno(r, "Failed to open system bus: %m");
1877 if (arg_keep_unit) {
1878 r = sd_bus_call_method(
1880 "org.freedesktop.machine1",
1881 "/org/freedesktop/machine1",
1882 "org.freedesktop.machine1.Manager",
1883 "RegisterMachineWithNetwork",
1888 SD_BUS_MESSAGE_APPEND_ID128(arg_uuid),
1892 strempty(arg_directory),
1893 local_ifindex > 0 ? 1 : 0, local_ifindex);
1895 _cleanup_bus_message_unref_ sd_bus_message *m = NULL;
1897 r = sd_bus_message_new_method_call(
1900 "org.freedesktop.machine1",
1901 "/org/freedesktop/machine1",
1902 "org.freedesktop.machine1.Manager",
1903 "CreateMachineWithNetwork");
1905 return log_error_errno(r, "Failed to create message: %m");
1907 r = sd_bus_message_append(
1911 SD_BUS_MESSAGE_APPEND_ID128(arg_uuid),
1915 strempty(arg_directory),
1916 local_ifindex > 0 ? 1 : 0, local_ifindex);
1918 return log_error_errno(r, "Failed to append message arguments: %m");
/* Everything up to the matching close_container call is one a(sv) property
 * array. */
1920 r = sd_bus_message_open_container(m, 'a', "(sv)");
1922 return log_error_errno(r, "Failed to open container: %m");
1924 if (!isempty(arg_slice)) {
1925 r = sd_bus_message_append(m, "(sv)", "Slice", "s", arg_slice);
1927 return log_error_errno(r, "Failed to append slice: %m");
1930 r = sd_bus_message_append(m, "(sv)", "DevicePolicy", "s", "strict");
1932 return log_error_errno(r, "Failed to add device policy: %m");
/* The literal 9 is the number of (path, permissions) pairs that follow in
 * this call. */
1934 r = sd_bus_message_append(m, "(sv)", "DeviceAllow", "a(ss)", 9,
1935 /* Allow the container to
1936 * access and create the API
1937 * device nodes, so that
1938 * PrivateDevices= in the
1939 * container can work
1944 "/dev/random", "rwm",
1945 "/dev/urandom", "rwm",
1947 "/dev/net/tun", "rwm",
1948 /* Allow the container
1949 * access to ptys. However,
1951 * container to ever create
1952 * these device nodes. */
1953 "/dev/pts/ptmx", "rw",
1956 return log_error_errno(r, "Failed to add device whitelist: %m");
1958 r = sd_bus_message_close_container(m);
1960 return log_error_errno(r, "Failed to close container: %m");
1962 r = sd_bus_call(bus, m, 0, &error, NULL);
1966 log_error("Failed to register machine: %s", bus_error_message(&error, r));
/* Asks org.freedesktop.machine1 to terminate a machine in two bus calls: a
 * Manager method whose reply carries the machine's object path ("o"), then a
 * method on the org.freedesktop.machine1.Machine interface. A failure of
 * either call is only logged at debug level; failing to open the system bus
 * or to parse the reply is logged as an error and returned. */
1973 static int terminate_machine(pid_t pid) {
1974 _cleanup_bus_error_free_ sd_bus_error error = SD_BUS_ERROR_NULL;
1975 _cleanup_bus_message_unref_ sd_bus_message *reply = NULL;
1976 _cleanup_bus_close_unref_ sd_bus *bus = NULL;
1983 r = sd_bus_default_system(&bus);
1985 return log_error_errno(r, "Failed to open system bus: %m");
/* First call: look up the machine object. */
1987 r = sd_bus_call_method(
1989 "org.freedesktop.machine1",
1990 "/org/freedesktop/machine1",
1991 "org.freedesktop.machine1.Manager",
1998 /* Note that the machine might already have been
1999 * cleaned up automatically, hence don't consider it a
2000 * failure if we cannot get the machine object. */
2001 log_debug("Failed to get machine: %s", bus_error_message(&error, r));
2005 r = sd_bus_message_read(reply, "o", &path);
2007 return bus_log_parse_error(r);
/* Second call: act on the machine object itself. */
2009 r = sd_bus_call_method(
2011 "org.freedesktop.machine1",
2013 "org.freedesktop.machine1.Machine",
2019 log_debug("Failed to terminate machine: %s", bus_error_message(&error, r));
/* Writes 4294967295 (the largest 32-bit unsigned value) to
 * /proc/self/loginuid unless the file already contains exactly that string.
 * arg_share_system is checked first. If the write fails, a multi-line
 * explanation about kernel audit support is logged together with
 * strerror(-r). */
2026 static int reset_audit_loginuid(void) {
2027 _cleanup_free_ char *p = NULL;
2030 if (arg_share_system)
2033 r = read_one_line_file("/proc/self/loginuid", &p);
2037 return log_error_errno(r, "Failed to read /proc/self/loginuid: %m");
2039 /* Already reset? */
2040 if (streq(p, "4294967295"))
2043 r = write_string_file("/proc/self/loginuid", "4294967295");
/* NOTE(review): the text below reads "or to off auditing"; "or to turn off
 * auditing" is presumably what was meant. */
2045 log_error("Failed to reset audit login UID. This probably means that your kernel is too\n"
2046 "old and you have audit enabled. Note that the auditing subsystem is known to\n"
2047 "be incompatible with containers on old kernels. Please make sure to upgrade\n"
2048 "your kernel or to off auditing with 'audit=0' on the kernel command line before\n"
2049 "using systemd-nspawn. Sleeping for 5s... (%s)\n", strerror(-r));
/* Fixed 128-bit keys handed to generate_mac() as its siphash24 key: one for
 * the host end of the veth pair, one for the container end, and one for
 * macvlan interfaces. Distinct keys give distinct MAC addresses for the same
 * machine name. */
2057 #define HOST_HASH_KEY SD_ID128_MAKE(1a,37,6f,c7,46,ec,45,0b,ad,a3,d5,31,06,60,5d,b1)
2058 #define CONTAINER_HASH_KEY SD_ID128_MAKE(c3,c4,f9,19,b5,57,b2,1c,e6,cf,14,27,03,9c,ee,a2)
2059 #define MACVLAN_HASH_KEY SD_ID128_MAKE(00,13,6d,bc,66,83,44,81,bb,0c,f9,51,1f,24,a6,6f)
/* Derives a reproducible MAC address into *mac.
 *
 * The hash input is the host machine ID followed by the bytes of arg_machine
 * (idx is copied in after the name where the code appends it). That buffer
 * is hashed with siphash24 under hash_key, and the first ETH_ALEN bytes of
 * the result become the address. The multicast bit is then cleared and the
 * locally-administered bit set. */
2061 static int generate_mac(struct ether_addr *mac, sd_id128_t hash_key, uint64_t idx) {
2067 l = strlen(arg_machine);
2068 sz = sizeof(sd_id128_t) + l;
2074 /* fetch some persistent data unique to the host */
2075 r = sd_id128_get_machine((sd_id128_t*) v);
2079 /* combine with some data unique (on this host) to this
2080 * container instance */
2081 i = mempcpy(v + sizeof(sd_id128_t), arg_machine, l);
/* i points just past the copied machine name, so idx lands right behind it. */
2084 memcpy(i, &idx, sizeof(idx));
2087 /* Let's hash the host machine ID plus the container name. We
2088 * use a fixed, but originally randomly created hash key here. */
2089 siphash24(result, v, sz, hash_key.bytes);
/* Compile-time guarantee that the hash output is wide enough for a MAC. */
2091 assert_cc(ETH_ALEN <= sizeof(result));
2092 memcpy(mac->ether_addr_octet, result, ETH_ALEN);
2094 /* see eth_random_addr in the kernel */
2095 mac->ether_addr_octet[0] &= 0xfe; /* clear multicast bit */
2096 mac->ether_addr_octet[0] |= 0x02; /* set local assignment bit (IEEE802) */
/* Creates a veth pair through rtnetlink.
 *
 * The host end is named "vb-<machine>" when arg_network_bridge is set and
 * "ve-<machine>" otherwise; that name is written to iface_name. The peer is
 * named "host0" and is created in the network namespace of pid. Both ends
 * receive a MAC from generate_mac() under different hash keys. After the
 * call succeeds the host end's index is looked up with if_nametoindex().
 * arg_private_network and arg_network_veth are checked first. Visible
 * failure paths log through log_error_errno() and return its result. */
2101 static int setup_veth(pid_t pid, char iface_name[IFNAMSIZ], int *ifi) {
2102 _cleanup_rtnl_message_unref_ sd_rtnl_message *m = NULL;
2103 _cleanup_rtnl_unref_ sd_rtnl *rtnl = NULL;
2104 struct ether_addr mac_host, mac_container;
2107 if (!arg_private_network)
2110 if (!arg_network_veth)
2113 /* Use two different interface name prefixes depending whether
2114 * we are in bridge mode or not. */
/* The size passed is IFNAMSIZ - 1, so an over-long machine name is silently
 * truncated by snprintf(). */
2115 snprintf(iface_name, IFNAMSIZ - 1, "%s-%s",
2116 arg_network_bridge ? "vb" : "ve", arg_machine);
2118 r = generate_mac(&mac_container, CONTAINER_HASH_KEY, 0);
2120 return log_error_errno(r, "Failed to generate predictable MAC address for container side: %m");
2122 r = generate_mac(&mac_host, HOST_HASH_KEY, 0);
2124 return log_error_errno(r, "Failed to generate predictable MAC address for host side: %m");
2126 r = sd_rtnl_open(&rtnl, 0);
2128 return log_error_errno(r, "Failed to connect to netlink: %m");
/* Top-level attributes describe the host end of the pair. */
2130 r = sd_rtnl_message_new_link(rtnl, &m, RTM_NEWLINK, 0);
2132 return log_error_errno(r, "Failed to allocate netlink message: %m");
2134 r = sd_rtnl_message_append_string(m, IFLA_IFNAME, iface_name);
2136 return log_error_errno(r, "Failed to add netlink interface name: %m");
2138 r = sd_rtnl_message_append_ether_addr(m, IFLA_ADDRESS, &mac_host);
2140 return log_error_errno(r, "Failed to add netlink MAC address: %m");
/* Three nested containers: IFLA_LINKINFO > IFLA_INFO_DATA("veth") >
 * VETH_INFO_PEER. The attributes inside the innermost one describe the
 * container end. */
2142 r = sd_rtnl_message_open_container(m, IFLA_LINKINFO);
2144 return log_error_errno(r, "Failed to open netlink container: %m");
2146 r = sd_rtnl_message_open_container_union(m, IFLA_INFO_DATA, "veth");
2148 return log_error_errno(r, "Failed to open netlink container: %m");
2150 r = sd_rtnl_message_open_container(m, VETH_INFO_PEER);
2152 return log_error_errno(r, "Failed to open netlink container: %m");
2154 r = sd_rtnl_message_append_string(m, IFLA_IFNAME, "host0");
2156 return log_error_errno(r, "Failed to add netlink interface name: %m");
2158 r = sd_rtnl_message_append_ether_addr(m, IFLA_ADDRESS, &mac_container);
2160 return log_error_errno(r, "Failed to add netlink MAC address: %m");
2162 r = sd_rtnl_message_append_u32(m, IFLA_NET_NS_PID, pid);
2164 return log_error_errno(r, "Failed to add netlink namespace field: %m");
/* Close the three containers opened above, innermost first. */
2166 r = sd_rtnl_message_close_container(m);
2168 return log_error_errno(r, "Failed to close netlink container: %m");
2170 r = sd_rtnl_message_close_container(m);
2172 return log_error_errno(r, "Failed to close netlink container: %m");
2174 r = sd_rtnl_message_close_container(m);
2176 return log_error_errno(r, "Failed to close netlink container: %m");
2178 r = sd_rtnl_call(rtnl, m, 0, NULL);
2180 return log_error_errno(r, "Failed to add new veth interfaces: %m");
2182 i = (int) if_nametoindex(iface_name);
2184 return log_error_errno(errno, "Failed to resolve interface %s: %m", iface_name);
/* Enslaves the interface veth_name to the bridge named by
 * arg_network_bridge: one RTM_SETLINK message sets IFF_UP on the interface
 * and its IFLA_MASTER to the bridge's index. arg_private_network,
 * arg_network_veth and arg_network_bridge are all checked before any work is
 * done. Visible failure paths log through log_error_errno() and return its
 * result. */
2191 static int setup_bridge(const char veth_name[], int *ifi) {
2192 _cleanup_rtnl_message_unref_ sd_rtnl_message *m = NULL;
2193 _cleanup_rtnl_unref_ sd_rtnl *rtnl = NULL;
2196 if (!arg_private_network)
2199 if (!arg_network_veth)
2202 if (!arg_network_bridge)
2205 bridge = (int) if_nametoindex(arg_network_bridge);
2207 return log_error_errno(errno, "Failed to resolve interface %s: %m", arg_network_bridge);
2211 r = sd_rtnl_open(&rtnl, 0);
2213 return log_error_errno(r, "Failed to connect to netlink: %m");
/* The link is addressed by name (IFLA_IFNAME below); the index argument here
 * is 0. */
2215 r = sd_rtnl_message_new_link(rtnl, &m, RTM_SETLINK, 0);
2217 return log_error_errno(r, "Failed to allocate netlink message: %m");
2219 r = sd_rtnl_message_link_set_flags(m, IFF_UP, IFF_UP);
2221 return log_error_errno(r, "Failed to set IFF_UP flag: %m");
2223 r = sd_rtnl_message_append_string(m, IFLA_IFNAME, veth_name);
2225 return log_error_errno(r, "Failed to add netlink interface name field: %m");
2227 r = sd_rtnl_message_append_u32(m, IFLA_MASTER, bridge);
2229 return log_error_errno(r, "Failed to add netlink master field: %m");
2231 r = sd_rtnl_call(rtnl, m, 0, NULL);
2233 return log_error_errno(r, "Failed to add veth interface to bridge: %m");
/* Resolves the interface `name` to its index and verifies through udev that
 * the corresponding device (looked up by the id "n<ifindex>") has finished
 * initialization. Callers in this file store the result as the interface
 * index. */
2238 static int parse_interface(struct udev *udev, const char *name) {
2239 _cleanup_udev_device_unref_ struct udev_device *d = NULL;
/* Sized for the leading 'n', the decimal index and the terminator. */
2240 char ifi_str[2 + DECIMAL_STR_MAX(int)];
2243 ifi = (int) if_nametoindex(name);
2245 return log_error_errno(errno, "Failed to resolve interface %s: %m", name);
2247 sprintf(ifi_str, "n%i", ifi);
2248 d = udev_device_new_from_device_id(udev, ifi_str);
2250 return log_error_errno(errno, "Failed to get udev device for interface %s: %m", name);
2252 if (udev_device_get_is_initialized(d) <= 0) {
2253 log_error("Network interface %s is not initialized yet.", name);
/* Moves every interface listed in arg_network_interfaces into the network
 * namespace of pid: each one is resolved with parse_interface() and then sent
 * an RTM_SETLINK message carrying IFLA_NET_NS_PID. arg_private_network and an
 * empty interface list are checked first. The first netlink failure is
 * logged and returned. */
2260 static int move_network_interfaces(pid_t pid) {
2261 _cleanup_udev_unref_ struct udev *udev = NULL;
2262 _cleanup_rtnl_unref_ sd_rtnl *rtnl = NULL;
2266 if (!arg_private_network)
2269 if (strv_isempty(arg_network_interfaces))
2272 r = sd_rtnl_open(&rtnl, 0);
2274 return log_error_errno(r, "Failed to connect to netlink: %m");
2278 log_error("Failed to connect to udev.");
2282 STRV_FOREACH(i, arg_network_interfaces) {
2283 _cleanup_rtnl_message_unref_ sd_rtnl_message *m = NULL;
2286 ifi = parse_interface(udev, *i);
/* Unlike setup_bridge(), the link is addressed by index here. */
2290 r = sd_rtnl_message_new_link(rtnl, &m, RTM_SETLINK, ifi);
2292 return log_error_errno(r, "Failed to allocate netlink message: %m");
2294 r = sd_rtnl_message_append_u32(m, IFLA_NET_NS_PID, pid);
2296 return log_error_errno(r, "Failed to append namespace PID to netlink message: %m");
2298 r = sd_rtnl_call(rtnl, m, 0, NULL);
2300 return log_error_errno(r, "Failed to move interface %s to namespace: %m", *i);
/* For every interface in arg_network_macvlan, creates a macvlan link on top
 * of it inside the network namespace of pid. The new link is named
 * "mv-<iface>" (shortened to IFNAMSIZ-1 characters), runs in
 * MACVLAN_MODE_BRIDGE, and gets a MAC from generate_mac() with a running
 * index so that each link's address differs. arg_private_network and an
 * empty list are checked first. The first failure is logged and returned. */
2306 static int setup_macvlan(pid_t pid) {
2307 _cleanup_udev_unref_ struct udev *udev = NULL;
2308 _cleanup_rtnl_unref_ sd_rtnl *rtnl = NULL;
2313 if (!arg_private_network)
2316 if (strv_isempty(arg_network_macvlan))
2319 r = sd_rtnl_open(&rtnl, 0);
2321 return log_error_errno(r, "Failed to connect to netlink: %m");
2325 log_error("Failed to connect to udev.");
2329 STRV_FOREACH(i, arg_network_macvlan) {
2330 _cleanup_rtnl_message_unref_ sd_rtnl_message *m = NULL;
2331 _cleanup_free_ char *n = NULL;
2332 struct ether_addr mac;
2335 ifi = parse_interface(udev, *i);
/* idx advances once per interface, feeding a different value into the hash. */
2339 r = generate_mac(&mac, MACVLAN_HASH_KEY, idx++);
2341 return log_error_errno(r, "Failed to create MACVLAN MAC address: %m");
2343 r = sd_rtnl_message_new_link(rtnl, &m, RTM_NEWLINK, 0);
2345 return log_error_errno(r, "Failed to allocate netlink message: %m");
/* IFLA_LINK names the parent interface the macvlan sits on. */
2347 r = sd_rtnl_message_append_u32(m, IFLA_LINK, ifi);
2349 return log_error_errno(r, "Failed to add netlink interface index: %m");
2351 n = strappend("mv-", *i);
2355 strshorten(n, IFNAMSIZ-1);
2357 r = sd_rtnl_message_append_string(m, IFLA_IFNAME, n);
2359 return log_error_errno(r, "Failed to add netlink interface name: %m");
2361 r = sd_rtnl_message_append_ether_addr(m, IFLA_ADDRESS, &mac);
2363 return log_error_errno(r, "Failed to add netlink MAC address: %m");
2365 r = sd_rtnl_message_append_u32(m, IFLA_NET_NS_PID, pid);
2367 return log_error_errno(r, "Failed to add netlink namespace field: %m");
/* Two nested containers: IFLA_LINKINFO > IFLA_INFO_DATA("macvlan"). */
2369 r = sd_rtnl_message_open_container(m, IFLA_LINKINFO);
2371 return log_error_errno(r, "Failed to open netlink container: %m");
2373 r = sd_rtnl_message_open_container_union(m, IFLA_INFO_DATA, "macvlan");
2375 return log_error_errno(r, "Failed to open netlink container: %m");
2377 r = sd_rtnl_message_append_u32(m, IFLA_MACVLAN_MODE, MACVLAN_MODE_BRIDGE);
2379 return log_error_errno(r, "Failed to append macvlan mode: %m");
2381 r = sd_rtnl_message_close_container(m);
2383 return log_error_errno(r, "Failed to close netlink container: %m");
2385 r = sd_rtnl_message_close_container(m);
2387 return log_error_errno(r, "Failed to close netlink container: %m");
2389 r = sd_rtnl_call(rtnl, m, 0, NULL);
2391 return log_error_errno(r, "Failed to add new macvlan interfaces: %m");
/* For every interface in arg_network_ipvlan, creates an ipvlan link on top of
 * it inside the network namespace of pid. The new link is named "iv-<iface>"
 * (shortened to IFNAMSIZ-1 characters) and runs in IPVLAN_MODE_L2. No MAC
 * address is set in the visible code, in contrast to setup_macvlan().
 * arg_private_network and an empty list are checked first. The first failure
 * is logged and returned. */
2397 static int setup_ipvlan(pid_t pid) {
2398 _cleanup_udev_unref_ struct udev *udev = NULL;
2399 _cleanup_rtnl_unref_ sd_rtnl *rtnl = NULL;
2403 if (!arg_private_network)
2406 if (strv_isempty(arg_network_ipvlan))
2409 r = sd_rtnl_open(&rtnl, 0);
2411 return log_error_errno(r, "Failed to connect to netlink: %m");
2415 log_error("Failed to connect to udev.");
2419 STRV_FOREACH(i, arg_network_ipvlan) {
2420 _cleanup_rtnl_message_unref_ sd_rtnl_message *m = NULL;
2421 _cleanup_free_ char *n = NULL;
2424 ifi = parse_interface(udev, *i);
2428 r = sd_rtnl_message_new_link(rtnl, &m, RTM_NEWLINK, 0);
2430 return log_error_errno(r, "Failed to allocate netlink message: %m");
/* IFLA_LINK names the parent interface the ipvlan sits on. */
2432 r = sd_rtnl_message_append_u32(m, IFLA_LINK, ifi);
2434 return log_error_errno(r, "Failed to add netlink interface index: %m");
2436 n = strappend("iv-", *i);
2440 strshorten(n, IFNAMSIZ-1);
2442 r = sd_rtnl_message_append_string(m, IFLA_IFNAME, n);
2444 return log_error_errno(r, "Failed to add netlink interface name: %m");
2446 r = sd_rtnl_message_append_u32(m, IFLA_NET_NS_PID, pid);
2448 return log_error_errno(r, "Failed to add netlink namespace field: %m");
/* Two nested containers: IFLA_LINKINFO > IFLA_INFO_DATA("ipvlan"). */
2450 r = sd_rtnl_message_open_container(m, IFLA_LINKINFO);
2452 return log_error_errno(r, "Failed to open netlink container: %m");
2454 r = sd_rtnl_message_open_container_union(m, IFLA_INFO_DATA, "ipvlan");
2456 return log_error_errno(r, "Failed to open netlink container: %m");
/* The mode is appended as a 16-bit attribute here, whereas setup_macvlan()
 * appends its mode as 32-bit. */
2458 r = sd_rtnl_message_append_u16(m, IFLA_IPVLAN_MODE, IPVLAN_MODE_L2);
2460 return log_error_errno(r, "Failed to add ipvlan mode: %m");
2462 r = sd_rtnl_message_close_container(m);
2464 return log_error_errno(r, "Failed to close netlink container: %m");
2466 r = sd_rtnl_message_close_container(m);
2468 return log_error_errno(r, "Failed to close netlink container: %m");
2470 r = sd_rtnl_call(rtnl, m, 0, NULL);
2472 return log_error_errno(r, "Failed to add new ipvlan interfaces: %m");
/* Builds and loads a seccomp filter whose default action is SCMP_ACT_ALLOW.
 * Two kinds of rules are added on top: every syscall in blacklist[] fails
 * with EPERM, and a rule matching arg0 == AF_NETLINK and
 * arg2 == NETLINK_AUDIT fails with EAFNOSUPPORT. SCMP_FLTATR_CTL_NNP is set
 * to 0 before loading, so the filter does not switch on NO_NEW_PRIVS. The
 * filter context is released with seccomp_release(). */
2478 static int setup_seccomp(void) {
/* Syscalls the container must not use (kexec, module loading,
 * open_by_handle_at). */
2481 static const int blacklist[] = {
2482 SCMP_SYS(kexec_load),
2483 SCMP_SYS(open_by_handle_at),
2484 SCMP_SYS(init_module),
2485 SCMP_SYS(finit_module),
2486 SCMP_SYS(delete_module),
2493 scmp_filter_ctx seccomp;
2497 seccomp = seccomp_init(SCMP_ACT_ALLOW);
2501 r = seccomp_add_secondary_archs(seccomp);
2503 log_error_errno(r, "Failed to add secondary archs to seccomp filter: %m");
2507 for (i = 0; i < ELEMENTSOF(blacklist); i++) {
2508 r = seccomp_rule_add(seccomp, SCMP_ACT_ERRNO(EPERM), blacklist[i], 0);
2510 continue; /* unknown syscall */
2512 log_error_errno(r, "Failed to block syscall: %m");
2518 Audit is broken in containers, much of the userspace audit
2519 hookup will fail if running inside a container. We don't
2520 care and just turn off creation of audit sockets.
2522 This will make socket(AF_NETLINK, *, NETLINK_AUDIT) fail
2523 with EAFNOSUPPORT which audit userspace uses as indication
2524 that audit is disabled in the kernel.
2527 r = seccomp_rule_add(
2529 SCMP_ACT_ERRNO(EAFNOSUPPORT),
2532 SCMP_A0(SCMP_CMP_EQ, AF_NETLINK),
2533 SCMP_A2(SCMP_CMP_EQ, NETLINK_AUDIT));
2535 log_error_errno(r, "Failed to add audit seccomp rule: %m");
2539 r = seccomp_attr_set(seccomp, SCMP_FLTATR_CTL_NNP, 0);
2541 log_error_errno(r, "Failed to unset NO_NEW_PRIVS: %m");
2545 r = seccomp_load(seccomp);
2547 log_error_errno(r, "Failed to install seccomp audit filter: %m");
2550 seccomp_release(seccomp);
/* Creates the host directory /run/systemd/nspawn/propagate/<arg_machine> and
 * bind mounts it read-only at <root>/run/systemd/nspawn/incoming. Directory
 * creation results on the host side are deliberately ignored (void casts);
 * only the two mount() calls can make the function fail. */
2558 static int setup_propagate(const char *root) {
/* NOTE(review): the propagate directories are created with mode 0600, which
 * leaves no search (x) bit on a directory - confirm this is intended. */
2561 (void) mkdir_p("/run/systemd/nspawn/", 0755);
2562 (void) mkdir_p("/run/systemd/nspawn/propagate", 0600);
2563 p = strappenda("/run/systemd/nspawn/propagate/", arg_machine);
2564 (void) mkdir_p(p, 0600);
2566 q = strappenda(root, "/run/systemd/nspawn/incoming");
2567 mkdir_parents(q, 0755);
/* NOTE(review): both messages below pass errno but contain no %m, so the
 * error text is presumably not shown - confirm. */
2570 if (mount(p, q, NULL, MS_BIND, NULL) < 0)
2571 return log_error_errno(errno, "Failed to install propagation bind mount.");
/* A bind mount is made read-only with a second, remounting call. */
2573 if (mount(NULL, q, NULL, MS_BIND|MS_REMOUNT|MS_RDONLY, NULL) < 0)
2574 return log_error_errno(errno, "Failed to make propagation mount read-only");
/* Turns arg_image into a block device path returned through *device_path.
 *
 * arg_image is opened read-only or read-write according to arg_read_only. A
 * block device is used as is (its path is duplicated). A regular file is
 * attached to a free loop device obtained from /dev/loop-control, using
 * LOOP_SET_FD followed by LOOP_SET_STATUS64 with
 * LO_FLAGS_AUTOCLEAR|LO_FLAGS_PARTSCAN (LO_FLAGS_READ_ONLY may be added).
 * Anything else is rejected. Visible failure paths log through
 * log_error_errno(). */
2579 static int setup_image(char **device_path, int *loop_nr) {
2580 struct loop_info64 info = {
2581 .lo_flags = LO_FLAGS_AUTOCLEAR|LO_FLAGS_PARTSCAN
2583 _cleanup_close_ int fd = -1, control = -1, loop = -1;
2584 _cleanup_free_ char* loopdev = NULL;
2588 assert(device_path);
2592 fd = open(arg_image, O_CLOEXEC|(arg_read_only ? O_RDONLY : O_RDWR)|O_NONBLOCK|O_NOCTTY);
2594 return log_error_errno(errno, "Failed to open %s: %m", arg_image);
2596 if (fstat(fd, &st) < 0)
2597 return log_error_errno(errno, "Failed to stat %s: %m", arg_image);
2599 if (S_ISBLK(st.st_mode)) {
2602 p = strdup(arg_image);
2616 if (!S_ISREG(st.st_mode)) {
/* NOTE(review): this branch follows a file-type test, not a failed call, so
 * errno (and the %m text) presumably carries no relevant error here -
 * confirm. */
2617 log_error_errno(errno, "%s is not a regular file or block device: %m", arg_image);
2621 control = open("/dev/loop-control", O_RDWR|O_CLOEXEC|O_NOCTTY|O_NONBLOCK);
2623 return log_error_errno(errno, "Failed to open /dev/loop-control: %m");
/* The ioctl result is the number of a free loop device. */
2625 nr = ioctl(control, LOOP_CTL_GET_FREE);
2627 return log_error_errno(errno, "Failed to allocate loop device: %m");
2629 if (asprintf(&loopdev, "/dev/loop%i", nr) < 0)
2632 loop = open(loopdev, O_CLOEXEC|(arg_read_only ? O_RDONLY : O_RDWR)|O_NONBLOCK|O_NOCTTY);
2634 return log_error_errno(errno, "Failed to open loop device %s: %m", loopdev);
2636 if (ioctl(loop, LOOP_SET_FD, fd) < 0)
2637 return log_error_errno(errno, "Failed to set loopback file descriptor on %s: %m", loopdev);
2640 info.lo_flags |= LO_FLAGS_READ_ONLY;
2642 if (ioctl(loop, LOOP_SET_STATUS64, &info) < 0)
2643 return log_error_errno(errno, "Failed to set loopback settings on %s: %m", loopdev);
2645 *device_path = loopdev;
/* Explanatory text appended to the partition-table error messages of
 * dissect_image(). It is spliced into those messages by string-literal
 * concatenation, so it must remain a plain sequence of literals. Adjacent
 * literals are joined verbatim: any literal that ends mid-sentence needs its
 * own trailing space or newline. */
#define PARTITION_TABLE_BLURB \
        "Note that the disk image needs to either contain only a single MBR partition of\n" \
        "type 0x83 that is marked bootable, or a single GPT partition of type " \
        "0FC63DAF-8483-4772-8E79-3D69D8477DE4 or follow\n" \
        " http://www.freedesktop.org/wiki/Specifications/DiscoverablePartitionsSpec/\n" \
        "to be bootable with systemd-nspawn."
/* Inspects the partition table of the image and reports which partition
 * device nodes to use for the root, /home and /srv file systems, together
 * with a read-write flag for each, through the output parameters.
 *
 * blkid identifies the partition table (only "gpt" and "dos" are accepted)
 * and lists the partitions; udev enumerates the partition devices the kernel
 * created. For each partition device:
 *   - on GPT, partitions flagged GPT_FLAG_NO_AUTO are not considered; the
 *     others are matched by type UUID against GPT_HOME, GPT_SRV,
 *     GPT_ROOT_NATIVE, GPT_ROOT_SECONDARY (the latter two only if defined at
 *     build time) and GPT_LINUX_GENERIC; GPT_FLAG_READ_ONLY clears the
 *     read-write flag;
 *   - on MBR, only a partition with flags 0x80 and type 0x83 is considered.
 *
 * The root device is chosen in this order: native root, secondary root, a
 * generic Linux partition (refused if more than one was seen). Ownership of
 * the chosen device strings is handed to the caller through the output
 * pointers. */
2663 static int dissect_image(
2665 char **root_device, bool *root_device_rw,
2666 char **home_device, bool *home_device_rw,
2667 char **srv_device, bool *srv_device_rw,
2671 int home_nr = -1, srv_nr = -1;
2672 #ifdef GPT_ROOT_NATIVE
2675 #ifdef GPT_ROOT_SECONDARY
2676 int secondary_root_nr = -1;
2678 _cleanup_free_ char *home = NULL, *root = NULL, *secondary_root = NULL, *srv = NULL, *generic = NULL;
2679 _cleanup_udev_enumerate_unref_ struct udev_enumerate *e = NULL;
2680 _cleanup_udev_device_unref_ struct udev_device *d = NULL;
2681 _cleanup_blkid_free_probe_ blkid_probe b = NULL;
2682 _cleanup_udev_unref_ struct udev *udev = NULL;
2683 struct udev_list_entry *first, *item;
2684 bool home_rw = true, root_rw = true, secondary_root_rw = true, srv_rw = true, generic_rw = true;
2685 bool is_gpt, is_mbr, multiple_generic = false;
2686 const char *pttype = NULL;
2693 assert(root_device);
2694 assert(home_device);
/* Step 1: let blkid probe the partition table of the device behind fd. */
2699 b = blkid_new_probe();
2704 r = blkid_probe_set_device(b, fd, 0, 0);
2709 log_error_errno(errno, "Failed to set device on blkid probe: %m");
2713 blkid_probe_enable_partitions(b, 1);
2714 blkid_probe_set_partitions_flags(b, BLKID_PARTS_ENTRY_DETAILS);
2717 r = blkid_do_safeprobe(b);
/* -2 and 1 are treated as "no usable partition table"; any other non-zero
 * result is a probing failure. */
2718 if (r == -2 || r == 1) {
2719 log_error("Failed to identify any partition table on\n"
2721 PARTITION_TABLE_BLURB, arg_image);
2723 } else if (r != 0) {
2726 log_error_errno(errno, "Failed to probe: %m");
2730 blkid_probe_lookup_value(b, "PTTYPE", &pttype, NULL);
2732 is_gpt = streq_ptr(pttype, "gpt");
2733 is_mbr = streq_ptr(pttype, "dos");
2735 if (!is_gpt && !is_mbr) {
2736 log_error("No GPT or MBR partition table discovered on\n"
2738 PARTITION_TABLE_BLURB, arg_image);
2743 pl = blkid_probe_get_partitions(b);
2748 log_error("Failed to list partitions of %s", arg_image);
/* Step 2: find the same device in udev so its partition devices can be
 * enumerated. */
2756 if (fstat(fd, &st) < 0)
2757 return log_error_errno(errno, "Failed to stat block device: %m");
2759 d = udev_device_new_from_devnum(udev, 'b', st.st_rdev);
2767 log_error("Kernel partitions never appeared.");
2771 e = udev_enumerate_new(udev);
2775 r = udev_enumerate_add_match_parent(e, d);
2779 r = udev_enumerate_scan_devices(e);
2781 return log_error_errno(r, "Failed to scan for partition devices of %s: %m", arg_image);
2783 /* Count the partitions enumerated by the kernel */
2785 first = udev_enumerate_get_list_entry(e);
2786 udev_list_entry_foreach(item, first)
2789 /* Count the partitions enumerated by blkid */
2790 m = blkid_partlist_numof_partitions(pl);
2794 log_error("blkid and kernel partition list do not match.");
2800 /* The kernel has probed fewer partitions than
2801 * blkid? Maybe the kernel prober is still
2802 * running or it got EBUSY because udev
2803 * already opened the device. Let's reprobe
2804 * the device, which is a synchronous call
2805 * that waits until probing is complete. */
/* Up to 20 attempts, 50ms apart (see the usleep below). */
2807 for (j = 0; j < 20; j++) {
2809 r = ioctl(fd, BLKRRPART, 0);
/* NOTE(review): since -EBUSY is negative, "r >= 0 ||" is already implied by
 * "r != -EBUSY"; the test is equivalent to r != -EBUSY alone. */
2812 if (r >= 0 || r != -EBUSY)
2815 /* If something else has the device
2816 * open, such as an udev rule, the
2817 * ioctl will return EBUSY. Since
2818 * there's no way to wait until it
2819 * isn't busy anymore, let's just wait
2820 * a bit, and try again.
2822 * This is really something they
2823 * should fix in the kernel! */
2825 usleep(50 * USEC_PER_MSEC);
2829 return log_error_errno(r, "Failed to reread partition table: %m");
2832 e = udev_enumerate_unref(e);
/* Step 3: classify every partition device. */
2835 first = udev_enumerate_get_list_entry(e);
2836 udev_list_entry_foreach(item, first) {
2837 _cleanup_udev_device_unref_ struct udev_device *q;
2839 unsigned long long flags;
2845 q = udev_device_new_from_syspath(udev, udev_list_entry_get_name(item));
2850 log_error_errno(errno, "Failed to get partition device of %s: %m", arg_image);
2854 qn = udev_device_get_devnum(q);
/* The enumeration matched on the parent, so the whole-disk device itself can
 * show up as well; it is recognised by its device number. */
2858 if (st.st_rdev == qn)
2861 node = udev_device_get_devnode(q);
/* Map the kernel's partition device back to blkid's partition entry. */
2865 pp = blkid_partlist_devno_to_partition(pl, qn);
2869 flags = blkid_partition_get_flags(pp);
2871 nr = blkid_partition_get_partno(pp);
2879 if (flags & GPT_FLAG_NO_AUTO)
2882 stype = blkid_partition_get_type_string(pp);
2886 if (sd_id128_from_string(stype, &type_id) < 0)
2889 if (sd_id128_equal(type_id, GPT_HOME)) {
/* When several partitions share a type, a candidate with a partition number
 * not lower than the one already recorded is not taken (same pattern for the
 * srv, root and secondary-root branches below). */
2891 if (home && nr >= home_nr)
2895 home_rw = !(flags & GPT_FLAG_READ_ONLY);
2897 r = free_and_strdup(&home, node);
2901 } else if (sd_id128_equal(type_id, GPT_SRV)) {
2903 if (srv && nr >= srv_nr)
2907 srv_rw = !(flags & GPT_FLAG_READ_ONLY);
2909 r = free_and_strdup(&srv, node);
2913 #ifdef GPT_ROOT_NATIVE
2914 else if (sd_id128_equal(type_id, GPT_ROOT_NATIVE)) {
2916 if (root && nr >= root_nr)
2920 root_rw = !(flags & GPT_FLAG_READ_ONLY);
2922 r = free_and_strdup(&root, node);
2927 #ifdef GPT_ROOT_SECONDARY
2928 else if (sd_id128_equal(type_id, GPT_ROOT_SECONDARY)) {
2930 if (secondary_root && nr >= secondary_root_nr)
2933 secondary_root_nr = nr;
2934 secondary_root_rw = !(flags & GPT_FLAG_READ_ONLY);
2936 r = free_and_strdup(&secondary_root, node);
2941 else if (sd_id128_equal(type_id, GPT_LINUX_GENERIC)) {
2944 multiple_generic = true;
2946 generic_rw = !(flags & GPT_FLAG_READ_ONLY);
2948 r = free_and_strdup(&generic, node);
2954 } else if (is_mbr) {
2957 if (flags != 0x80) /* Bootable flag */
2960 type = blkid_partition_get_type(pp);
2961 if (type != 0x83) /* Linux partition */
2965 multiple_generic = true;
2969 r = free_and_strdup(&root, node);
/* Step 4: hand the results to the caller, root first. */
2977 *root_device = root;
2980 *root_device_rw = root_rw;
2982 } else if (secondary_root) {
2983 *root_device = secondary_root;
/* Cleared so the _cleanup_free_ on the local does not free the string that
 * was just handed out. */
2984 secondary_root = NULL;
2986 *root_device_rw = secondary_root_rw;
2988 } else if (generic) {
2990 /* There were no partitions with precise meanings
2991 * around, but we found generic partitions. In this
2992 * case, if there's only one, we can go ahead and boot
2993 * it, otherwise we bail out, because we really cannot
2994 * make any sense of it. */
2996 if (multiple_generic) {
2997 log_error("Identified multiple bootable Linux partitions on\n"
2999 PARTITION_TABLE_BLURB, arg_image);
3003 *root_device = generic;
3006 *root_device_rw = generic_rw;
3009 log_error("Failed to identify root partition in disk image\n"
3011 PARTITION_TABLE_BLURB, arg_image);
3016 *home_device = home;
3019 *home_device_rw = home_rw;
3026 *srv_device_rw = srv_rw;
/* Per its message, the line below belongs to builds without blkid support. */
3031 log_error("--image= is not supported, compiled without blkid support.");
/* Mount the block device `what` on the path obtained by appending
 * `directory` to `where`; the mount is read-only unless `rw` is true.
 *
 * The file system type is taken from the TYPE value libblkid reports
 * for the device. A device whose type is crypto_LUKS is rejected with
 * an error message. When mount(2) fails the function returns whatever
 * log_error_errno() returns for the current errno; the remaining
 * return statements are not visible in this listing.
 *
 * NOTE(review): mount_devices() passes NULL as `directory` for the
 * root device; how that is handled is not visible here. */
3036 static int mount_device(const char *what, const char *where, const char *directory, bool rw) {
3038 _cleanup_blkid_free_probe_ blkid_probe b = NULL;
3039 const char *fstype, *p;
/* Target mount point: `where` followed by `directory`. */
3049 p = strappenda(where, directory);
3054 b = blkid_new_probe_from_filename(what);
3058 log_error_errno(errno, "Failed to allocate prober for %s: %m", what);
/* Only superblock probing is enabled, and only the TYPE value is requested. */
3062 blkid_probe_enable_superblocks(b, 1);
3063 blkid_probe_set_superblocks_flags(b, BLKID_SUBLKS_TYPE);
/* NOTE(review): libblkid documents 1 as nothing detected, -2 as an
 * ambivalent result and -1 as an error. This code treats -1 (not -2)
 * as an undeterminable type and every other non-zero value as a probe
 * failure reported through errno. Confirm the -1 is intended. */
3066 r = blkid_do_safeprobe(b);
3067 if (r == -1 || r == 1) {
3068 log_error("Cannot determine file system type of %s", what);
3070 } else if (r != 0) {
3073 log_error_errno(errno, "Failed to probe %s: %m", what);
3078 if (blkid_probe_lookup_value(b, "TYPE", &fstype, NULL) < 0) {
3081 log_error("Failed to determine file system type of %s", what);
3085 if (streq(fstype, "crypto_LUKS")) {
3086 log_error("nspawn currently does not support LUKS disk images.");
/* Device nodes on the mounted file system are always disabled (MS_NODEV). */
3090 if (mount(what, p, fstype, MS_NODEV|(rw ? 0 : MS_RDONLY), NULL) < 0)
3091 return log_error_errno(errno, "Failed to mount %s: %m", what);
/* NOTE(review): presumably the fallback branch for builds without
 * blkid; the preprocessor conditionals are not visible here. */
3095 log_error("--image= is not supported, compiled without blkid support.");
/* Mount the partitions of a dissected image into the container tree:
 * the root device on arg_directory itself, the home device on /home
 * below it and the srv device on /srv below it. Each *_rw flag selects
 * a writable or read-only mount for its device.
 *
 * Every visible mount_device() call targets the global arg_directory.
 *
 * NOTE(review): presumably each mount is attempted only when the
 * corresponding device string is set, and 0 is returned on success;
 * neither the guards nor the final return are visible here. */
3100 static int mount_devices(
3102 const char *root_device, bool root_device_rw,
3103 const char *home_device, bool home_device_rw,
3104 const char *srv_device, bool srv_device_rw) {
3110 r = mount_device(root_device, arg_directory, NULL, root_device_rw);
3112 return log_error_errno(r, "Failed to mount root directory: %m");
3116 r = mount_device(home_device, arg_directory, "/home", home_device_rw);
3118 return log_error_errno(r, "Failed to mount home directory: %m");
3122 r = mount_device(srv_device, arg_directory, "/srv", srv_device_rw);
3124 return log_error_errno(r, "Failed to mount server data directory: %m");
/* Tear down the loop device with number `nr`.
 *
 * If `image_fd` points at an open descriptor, LOOP_CLR_FD is issued on
 * it and the descriptor is closed; the result of safe_close() is
 * written back through the pointer. Afterwards /dev/loop-control is
 * opened and LOOP_CTL_REMOVE is issued for `nr`.
 *
 * The function returns nothing; failures are only logged (debug level
 * for the ioctls, warning level when the control device cannot be
 * opened).
 *
 * NOTE(review): main() initialises loop_nr to -1; presumably an early
 * return for a negative `nr` exists among the lines not visible here. */
3130 static void loop_remove(int nr, int *image_fd) {
3131 _cleanup_close_ int control = -1;
3137 if (image_fd && *image_fd >= 0) {
3138 r = ioctl(*image_fd, LOOP_CLR_FD);
3140 log_debug_errno(errno, "Failed to close loop image: %m");
3141 *image_fd = safe_close(*image_fd);
3144 control = open("/dev/loop-control", O_RDWR|O_CLOEXEC|O_NOCTTY|O_NONBLOCK);
3146 log_warning_errno(errno, "Failed to open /dev/loop-control: %m");
3150 r = ioctl(control, LOOP_CTL_REMOVE, nr);
3152 log_debug_errno(errno, "Failed to remove loop %d: %m", nr);
/* Run getent for `database` and `key` in a child process whose
 * standard output is connected to a pipe.
 *
 * In the child, stdout becomes the write end of the pipe while stdin
 * and stderr are pointed at /dev/null. Signal handlers are reset, all
 * other descriptors are closed and getent is executed with an empty
 * environment, trying /usr/bin/getent first and /bin/getent second.
 * The child exits with EXIT_FAILURE if any step fails.
 *
 * In the parent the write end of the pipe is closed.
 *
 * NOTE(review): the return statement and the store to `rpid` are not
 * visible here. The callers in this file fdopen() the return value for
 * reading and wait on the pid, so presumably the read end of the pipe
 * is returned and *rpid receives the child pid. */
3155 static int spawn_getent(const char *database, const char *key, pid_t *rpid) {
3163 if (pipe2(pipe_fds, O_CLOEXEC) < 0)
3164 return log_error_errno(errno, "Failed to allocate pipe: %m");
3168 return log_error_errno(errno, "Failed to fork getent child: %m");
3169 else if (pid == 0) {
/* Serves as the environment block of the child: a vector holding
 * only the terminating NULL. */
3171 char *empty_env = NULL;
3173 if (dup3(pipe_fds[1], STDOUT_FILENO, 0) < 0)
3174 _exit(EXIT_FAILURE);
/* Drop the original pipe descriptors unless they already occupy one
 * of the three standard slots. */
3176 if (pipe_fds[0] > 2)
3177 safe_close(pipe_fds[0]);
3178 if (pipe_fds[1] > 2)
3179 safe_close(pipe_fds[1]);
3181 nullfd = open("/dev/null", O_RDWR);
3183 _exit(EXIT_FAILURE);
3185 if (dup3(nullfd, STDIN_FILENO, 0) < 0)
3186 _exit(EXIT_FAILURE);
3188 if (dup3(nullfd, STDERR_FILENO, 0) < 0)
3189 _exit(EXIT_FAILURE);
3194 reset_all_signal_handlers();
3195 close_all_fds(NULL, 0);
/* The second execle() is reached only if the first one failed. */
3197 execle("/usr/bin/getent", "getent", database, key, NULL, &empty_env);
3198 execle("/bin/getent", "getent", database, key, NULL, &empty_env);
3199 _exit(EXIT_FAILURE);
/* Parent: the write end is no longer needed. */
3202 pipe_fds[1] = safe_close(pipe_fds[1]);
/* Switch the calling process to the credentials of arg_user.
 *
 * When arg_user is unset, or is root or 0, the supplementary groups
 * are cleared and all group and user IDs are set to 0.
 *
 * Otherwise the user is resolved by running getent (see
 * spawn_getent()) twice: the passwd database yields UID, GID and home
 * directory, the initgroups database yields the supplementary groups.
 * The home directory is then created, the three standard descriptors
 * are chowned to the user, and setgroups(), setresgid() and
 * setresuid() are applied in that order.
 *
 * NOTE(review): how `_home` is filled in, and the return values other
 * than the visible log_error_errno() results, are not visible in this
 * listing. */
3209 static int change_uid_gid(char **_home) {
3210 char line[LINE_MAX], *x, *u, *g, *h;
3211 const char *word, *state;
/* Holds the supplementary group IDs, although the element type is uid_t. */
3212 _cleanup_free_ uid_t *uids = NULL;
3213 _cleanup_free_ char *home = NULL;
3214 _cleanup_fclose_ FILE *f = NULL;
3215 _cleanup_close_ int fd = -1;
3216 unsigned n_uids = 0;
3225 if (!arg_user || streq(arg_user, "root") || streq(arg_user, "0")) {
3226 /* Reset everything fully to 0, just in case */
3228 if (setgroups(0, NULL) < 0)
3229 return log_error_errno(errno, "setgroups() failed: %m");
/* NOTE(review): the two messages below name setregid and setreuid,
 * while the calls are setresgid() and setresuid(). */
3231 if (setresgid(0, 0, 0) < 0)
3232 return log_error_errno(errno, "setregid() failed: %m");
3234 if (setresuid(0, 0, 0) < 0)
3235 return log_error_errno(errno, "setreuid() failed: %m");
3241 /* First, get user credentials */
3242 fd = spawn_getent("passwd", arg_user, &pid);
3246 f = fdopen(fd, "r");
/* Only the first line of getent output is consumed. */
3251 if (!fgets(line, sizeof(line), f)) {
3254 log_error("Failed to resolve user %s.", arg_user);
3258 log_error_errno(errno, "Failed to read from getent: %m");
3264 wait_for_terminate_and_warn("getent passwd", pid, true);
/* Walk the colon-separated passwd fields. Judging by the error
 * messages the order is user, password, UID, GID, GECOS, home
 * directory; several of the strchr() steps are not visible here. */
3266 x = strchr(line, ':');
3268 log_error("/etc/passwd entry has invalid user field.");
3272 u = strchr(x+1, ':');
3274 log_error("/etc/passwd entry has invalid password field.");
3281 log_error("/etc/passwd entry has invalid UID field.");
3289 log_error("/etc/passwd entry has invalid GID field.");
3294 h = strchr(x+1, ':');
3296 log_error("/etc/passwd entry has invalid GECOS field.");
3303 log_error("/etc/passwd entry has invalid home directory field.");
3309 r = parse_uid(u, &uid);
3311 log_error("Failed to parse UID of user.");
3315 r = parse_gid(g, &gid);
3317 log_error("Failed to parse GID of user.");
3325 /* Second, get group memberships */
3326 fd = spawn_getent("initgroups", arg_user, &pid);
3331 f = fdopen(fd, "r");
3336 if (!fgets(line, sizeof(line), f)) {
3338 log_error("Failed to resolve user %s.", arg_user);
3342 log_error_errno(errno, "Failed to read from getent: %m");
3348 wait_for_terminate_and_warn("getent initgroups", pid, true);
3350 /* Skip over the username and subsequent separator whitespace */
3352 x += strcspn(x, WHITESPACE);
3353 x += strspn(x, WHITESPACE);
/* Each remaining word is parsed as a numeric ID and appended to uids. */
3355 FOREACH_WORD(word, l, x, state) {
3361 if (!GREEDY_REALLOC(uids, sz, n_uids+1))
3364 r = parse_uid(c, &uids[n_uids++]);
3366 log_error("Failed to parse group data from getent.");
3371 r = mkdir_parents(home, 0775);
3373 return log_error_errno(r, "Failed to make home root directory: %m");
/* An already existing home directory is not treated as an error. */
3375 r = mkdir_safe(home, 0755, uid, gid);
3376 if (r < 0 && r != -EEXIST)
3377 return log_error_errno(r, "Failed to make home directory: %m");
/* The results of these three fchown() calls are not checked. */
3379 fchown(STDIN_FILENO, uid, gid);
3380 fchown(STDOUT_FILENO, uid, gid);
3381 fchown(STDERR_FILENO, uid, gid);
/* Groups first, then GID, then UID. */
3383 if (setgroups(n_uids, uids) < 0)
3384 return log_error_errno(errno, "Failed to set auxiliary groups: %m");
3386 if (setresgid(gid, gid, gid) < 0)
3387 return log_error_errno(errno, "setregid() failed: %m");
3389 if (setresuid(uid, uid, uid) < 0)
3390 return log_error_errno(errno, "setreuid() failed: %m");
3402 * < 0 : wait_for_terminate() failed to get the state of the
3403 * container, the container was terminated by a signal, or
3404 * failed for an unknown reason. No change is made to the
3405 * container argument.
3406 * > 0 : The program executed in the container terminated with an
3407 * error. The exit code of the program executed in the
3408 * container is returned. The container argument has been set
3409 * to CONTAINER_TERMINATED.
3410 * 0 : The container is being rebooted, has been shut down or exited
3411 * successfully. The container argument has been set to either
3412 * CONTAINER_TERMINATED or CONTAINER_REBOOTED.
3414 * That is, success is indicated by a return value of zero, and an
3415 * error is indicated by a non-zero value.
/* Wait for the container leader `pid` to terminate and classify how it
 * ended, logging the outcome and storing the resulting state through
 * `container`.
 *
 * The visible branches distinguish: exit with status 0, exit with a
 * non-zero status (that status is returned), termination by SIGINT
 * (reported as shut down), termination by SIGHUP (reported as being
 * rebooted, CONTAINER_REBOOTED), termination by another signal, and an
 * unknown reason.
 *
 * NOTE(review): the case labels of the switch and several return
 * statements are not visible in this listing. */
3417 static int wait_for_container(pid_t pid, ContainerStatus *container) {
3421 r = wait_for_terminate(pid, &status);
3423 return log_warning_errno(r, "Failed to wait for container: %m");
3425 switch (status.si_code) {
/* Exit path: both messages are logged at debug level when arg_quiet
 * is set and at info level otherwise. */
3428 if (status.si_status == 0) {
3429 log_full(arg_quiet ? LOG_DEBUG : LOG_INFO, "Container %s exited successfully.", arg_machine);
3432 log_full(arg_quiet ? LOG_DEBUG : LOG_INFO, "Container %s failed with error code %i.", arg_machine, status.si_status);
3434 *container = CONTAINER_TERMINATED;
3435 return status.si_status;
/* Signal path: SIGINT and SIGHUP are interpreted as shutdown and
 * reboot respectively. */
3438 if (status.si_status == SIGINT) {
3440 log_full(arg_quiet ? LOG_DEBUG : LOG_INFO, "Container %s has been shut down.", arg_machine);
3441 *container = CONTAINER_TERMINATED;
3444 } else if (status.si_status == SIGHUP) {
3446 log_full(arg_quiet ? LOG_DEBUG : LOG_INFO, "Container %s is being rebooted.", arg_machine);
3447 *container = CONTAINER_REBOOTED;
3451 /* CLD_KILLED fallthrough */
3454 log_error("Container %s terminated by signal %s.", arg_machine, signal_to_string(status.si_status));
3458 log_error("Container %s failed due to unknown reason.", arg_machine);
/* Signal handler that does nothing. main() installs it for SIGCHLD
 * through sigaction(); according to the comment there, the purpose is
 * to let the signal interrupt blocking calls in the parent. */
3465 static void nop_handler(int sig) {}
/* sd-event signal callback that asks the container to halt.
 *
 * `userdata` carries the pid of the container leader packed into a
 * pointer (main() registers this callback with UINT32_TO_PTR(pid) for
 * SIGINT and SIGTERM). SIGRTMIN+3 is sent to that pid; on success a
 * hint is logged and the userdata of the event source is cleared. The
 * event loop of the source is asked to exit with code 0 via
 * sd_event_exit().
 *
 * NOTE(review): presumably the kill is attempted only for a non-zero
 * pid and returns early on success, so that a second signal (with the
 * userdata cleared) reaches sd_event_exit(). The guard and the return
 * statements are not visible in this listing. */
3467 static int on_orderly_shutdown(sd_event_source *s, const struct signalfd_siginfo *si, void *userdata) {
3470 pid = PTR_TO_UINT32(userdata);
3472 if (kill(pid, SIGRTMIN+3) >= 0) {
3473 log_info("Trying to halt container. Send SIGTERM again to trigger immediate termination.");
3474 sd_event_source_set_userdata(s, NULL);
3479 sd_event_exit(sd_event_source_get_event(s), 0);
/* Derive whichever of arg_image, arg_directory and arg_machine were
 * not supplied from the ones that were.
 *
 * When neither an image nor a directory is set, the machine name is
 * looked up with image_find(): a raw image fills in arg_image, any
 * other type fills in arg_directory, and the read-only flag of the
 * image is merged into arg_read_only. The current working directory is
 * used as arg_directory in a branch whose condition is not visible.
 *
 * The machine name is taken from the host name when the directory is
 * the root directory, otherwise from the last path component of the
 * image or directory. It is then cleaned up and validated. For
 * ephemeral machines a random 64 bit hex suffix is appended.
 *
 * NOTE(review): the conditions guarding several of these steps, and
 * the return statements, are not visible in this listing. */
3483 static int determine_names(void) {
3486 if (!arg_image && !arg_directory) {
3488 _cleanup_(image_unrefp) Image *i = NULL;
3490 r = image_find(arg_machine, &i);
3492 return log_error_errno(r, "Failed to find image for machine '%s': %m", arg_machine);
/* NOTE(review): this message contains %m but is logged through
 * log_error() without an explicit error code; confirm that errno is
 * meaningful at this point. */
3494 log_error("No image for machine '%s': %m", arg_machine);
3498 if (i->type == IMAGE_RAW)
3499 r = set_sanitized_path(&arg_image, i->path);
3501 r = set_sanitized_path(&arg_directory, i->path);
3503 return log_error_errno(r, "Invalid image directory: %m");
/* A read-only image forces a read-only container. */
3505 arg_read_only = arg_read_only || i->read_only;
3507 arg_directory = get_current_dir_name();
3509 if (!arg_directory && !arg_machine) {
3510 log_error("Failed to determine path, please use -D or -i.");
3516 if (arg_directory && path_equal(arg_directory, "/"))
3517 arg_machine = gethostname_malloc();
3519 arg_machine = strdup(basename(arg_image ?: arg_directory));
3524 hostname_cleanup(arg_machine, false);
3525 if (!machine_name_is_valid(arg_machine)) {
3526 log_error("Failed to determine machine name automatically, please use -M.");
3530 if (arg_ephemeral) {
3533 /* Add a random suffix when this is an
3534 * ephemeral machine, so that we can run many
3535 * instances at once without manually having
3536 * to specify -M each time. */
3538 if (asprintf(&b, "%s-%016" PRIx64, arg_machine, random_u64()) < 0)
/* Program entry point.
 *
 * As far as the visible lines show, main() parses the command line,
 * completes the names with determine_names(), insists on running as
 * root on a systemd-booted host, and prepares the container root in one
 * of two ways: a directory tree (optionally an ephemeral or template
 * btrfs snapshot) or a disk image attached below a temporary directory.
 * It then allocates a pseudo TTY and clones a child into new
 * namespaces. The child builds the container file system, drops
 * privileges and executes the payload; the parent sets up networking,
 * registers the machine, forwards the TTY and waits for the container
 * to end. The final statement returns EXIT_FAILURE when r is negative
 * and ret otherwise.
 *
 * NOTE(review): many lines (error checks, braces, jumps, loop headers,
 * some comment terminators) are not visible in this listing; comments
 * added here describe only what is shown. */
3549 int main(int argc, char *argv[]) {
3551 _cleanup_free_ char *device_path = NULL, *root_device = NULL, *home_device = NULL, *srv_device = NULL, *console = NULL;
3552 bool root_device_rw = true, home_device_rw = true, srv_device_rw = true;
3553 _cleanup_close_ int master = -1, image_fd = -1;
3554 _cleanup_fdset_free_ FDSet *fds = NULL;
3555 int r, n_fd_passed, loop_nr = -1;
3556 char veth_name[IFNAMSIZ];
3557 bool secondary = false, remove_subvol = false;
3558 sigset_t mask, mask_chld;
3560 int ret = EXIT_SUCCESS;
3561 union in_addr_union exposed = {};
3562 _cleanup_release_lock_file_ LockFile tree_global_lock = LOCK_FILE_INIT, tree_local_lock = LOCK_FILE_INIT;
3564 log_parse_environment();
3567 r = parse_argv(argc, argv);
3571 r = determine_names();
3575 if (geteuid() != 0) {
3576 log_error("Need to be root.");
3581 if (sd_booted() <= 0) {
3582 log_error("Not running on a systemd system.");
/* Collect the file descriptors passed in by socket activation; they
 * are announced to the payload through LISTEN_FDS further down. */
3588 n_fd_passed = sd_listen_fds(false);
3589 if (n_fd_passed > 0) {
3590 r = fdset_new_listen_fds(&fds, false);
3592 log_error_errno(r, "Failed to collect file descriptors: %m");
3596 fdset_close_others(fds);
/* Directory mode: the container root is an existing directory tree. */
3599 if (arg_directory) {
3602 if (path_equal(arg_directory, "/") && !arg_ephemeral) {
3603 log_error("Spawning container on root directory is not supported. Consider using --ephemeral.");
3608 if (arg_ephemeral) {
3609 _cleanup_release_lock_file_ LockFile original_lock = LOCK_FILE_INIT;
3612 /* If the specified path is a mount point we
3613 * generate the new snapshot immediately
3614 * inside it under a random name. However if
3615 * the specified is not a mount point we
3616 * create the new snapshot in the parent
3617 * directory, just next to it. */
3618 r = path_is_mount_point(arg_directory, false);
3620 log_error_errno(r, "Failed to determine whether directory %s is mount point: %m", arg_directory);
3624 r = tempfn_random_child(arg_directory, &np);
3626 r = tempfn_random(arg_directory, &np);
3628 log_error_errno(r, "Failed to generate name for snapshot: %m");
/* Shared lock for read-only use, exclusive lock otherwise; never blocks. */
3632 r = image_path_lock(np, (arg_read_only ? LOCK_SH : LOCK_EX) | LOCK_NB, &tree_global_lock, &tree_local_lock);
3634 log_error_errno(r, "Failed to lock %s: %m", np);
3638 r = btrfs_subvol_snapshot(arg_directory, np, arg_read_only, true);
3641 log_error_errno(r, "Failed to create snapshot %s from %s: %m", np, arg_directory);
3645 free(arg_directory);
/* Remember that the snapshot has to be deleted again on exit. */
3648 remove_subvol = true;
3651 r = image_path_lock(arg_directory, (arg_read_only ? LOCK_SH : LOCK_EX) | LOCK_NB, &tree_global_lock, &tree_local_lock);
3653 log_error_errno(r, "Directory tree %s is currently busy.", arg_directory);
3657 log_error_errno(r, "Failed to lock %s: %m", arg_directory);
/* Populate the directory from the template as a btrfs snapshot. */
3662 r = btrfs_subvol_snapshot(arg_template, arg_directory, arg_read_only, true);
3665 log_info("Directory %s already exists, not populating from template %s.", arg_directory, arg_template);
3667 log_error_errno(r, "Couldn't create snapshot %s from %s: %m", arg_directory, arg_template);
3671 log_info("Populated %s from template %s.", arg_directory, arg_template);
/* Sanity checks on the tree before anything is started in it. */
3677 if (path_is_os_tree(arg_directory) <= 0) {
3678 log_error("Directory %s doesn't look like an OS root directory (os-release file is missing). Refusing.", arg_directory);
3685 p = strappenda(arg_directory,
3686 argc > optind && path_is_absolute(argv[optind]) ? argv[optind] : "/usr/bin/");
3687 if (access(p, F_OK) < 0) {
3688 log_error("Directory %s lacks the binary to execute or doesn't look like a binary tree. Refusing.", arg_directory);
/* Image mode: arg_directory becomes a fresh temporary directory;
 * setup_image() yields image_fd, device_path and loop_nr, and
 * dissect_image() identifies the root, home and srv devices. */
3695 char template[] = "/tmp/nspawn-root-XXXXXX";
3698 assert(!arg_template);
3700 r = image_path_lock(arg_image, (arg_read_only ? LOCK_SH : LOCK_EX) | LOCK_NB, &tree_global_lock, &tree_local_lock);
3702 r = log_error_errno(r, "Disk image %s is currently busy.", arg_image);
3706 r = log_error_errno(r, "Failed to create image lock: %m");
3710 if (!mkdtemp(template)) {
3711 log_error_errno(errno, "Failed to create temporary directory: %m");
3716 arg_directory = strdup(template);
3717 if (!arg_directory) {
3722 image_fd = setup_image(&device_path, &loop_nr);
3728 r = dissect_image(image_fd,
3729 &root_device, &root_device_rw,
3730 &home_device, &home_device_rw,
3731 &srv_device, &srv_device_rw,
/* Allocate the pseudo TTY that serves as the container console. */
3737 master = posix_openpt(O_RDWR|O_NOCTTY|O_CLOEXEC|O_NDELAY);
3739 r = log_error_errno(errno, "Failed to acquire pseudo tty: %m");
3743 r = ptsname_malloc(master, &console);
3745 r = log_error_errno(r, "Failed to determine tty name: %m");
3750 log_info("Spawning container %s on %s.\nPress ^] three times within 1s to kill container.",
3751 arg_machine, arg_image ?: arg_directory);
3753 if (unlockpt(master) < 0) {
3754 r = log_error_errno(errno, "Failed to unlock tty: %m");
/* Block the signals that are later consumed through the event loop;
 * mask_chld is kept separately so SIGCHLD can be toggled on its own. */
3758 assert_se(sigemptyset(&mask) == 0);
3759 sigset_add_many(&mask, SIGCHLD, SIGWINCH, SIGTERM, SIGINT, -1);
3760 assert_se(sigprocmask(SIG_BLOCK, &mask, NULL) == 0);
3762 assert_se(sigemptyset(&mask_chld) == 0);
3763 assert_se(sigaddset(&mask_chld, SIGCHLD) == 0);
/* State for one run of the container. NOTE(review): a later comment
 * mentions looping again after a reboot, so presumably this part sits
 * inside a loop whose header is not visible here. */
3766 _cleanup_close_pair_ int kmsg_socket_pair[2] = { -1, -1 }, rtnl_socket_pair[2] = { -1, -1 };
3767 ContainerStatus container_status;
3768 _cleanup_(barrier_destroy) Barrier barrier = BARRIER_NULL;
3769 struct sigaction sa = {
3770 .sa_handler = nop_handler,
3771 .sa_flags = SA_NOCLDSTOP,
3774 r = barrier_create(&barrier);
3776 log_error_errno(r, "Cannot initialize IPC barrier: %m");
3780 if (socketpair(AF_UNIX, SOCK_DGRAM|SOCK_CLOEXEC, 0, kmsg_socket_pair) < 0) {
3781 r = log_error_errno(errno, "Failed to create kmsg socket pair: %m");
3785 if (socketpair(AF_UNIX, SOCK_DGRAM|SOCK_CLOEXEC, 0, rtnl_socket_pair) < 0) {
3786 r = log_error_errno(errno, "Failed to create rtnl socket pair: %m");
3790 /* Child can be killed before execv(), so handle SIGCHLD
3791 * in order to interrupt parent's blocking calls and
3792 * give it a chance to call wait() and terminate. */
3793 r = sigprocmask(SIG_UNBLOCK, &mask_chld, NULL);
3795 r = log_error_errno(errno, "Failed to change the signal mask: %m");
3799 r = sigaction(SIGCHLD, &sa, NULL);
3801 r = log_error_errno(errno, "Failed to install SIGCHLD handler: %m");
/* A new mount namespace is always requested; IPC, PID and UTS
 * namespaces unless arg_share_system is set; a network namespace only
 * with arg_private_network. */
3805 pid = raw_clone(SIGCHLD|CLONE_NEWNS|
3806 (arg_share_system ? 0 : CLONE_NEWIPC|CLONE_NEWPID|CLONE_NEWUTS)|
3807 (arg_private_network ? CLONE_NEWNET : 0), NULL);
3809 if (errno == EINVAL)
3810 r = log_error_errno(errno, "clone() failed, do you have namespace support enabled in your kernel? (You need UTS, IPC, PID and NET namespacing built in): %m");
3812 r = log_error_errno(errno, "clone() failed: %m");
/* Child side (the barrier role is set to BARRIER_CHILD below); the
 * branch condition itself is not visible here. */
3819 _cleanup_free_ char *home = NULL;
3821 const char *envp[] = {
3822 "PATH=" DEFAULT_PATH_SPLIT_USR,
3823 "container=systemd-nspawn", /* LXC sets container=lxc, so follow the scheme here */
3828 NULL, /* container_uuid */
3829 NULL, /* LISTEN_FDS */
3830 NULL, /* LISTEN_PID */
3835 barrier_set_role(&barrier, BARRIER_CHILD);
3837 envp[n_env] = strv_find_prefix(environ, "TERM=");
3841 master = safe_close(master);
/* Close the inherited standard descriptors so that the console opened
 * below is expected to land on descriptor 0. */
3843 close_nointr(STDIN_FILENO);
3844 close_nointr(STDOUT_FILENO);
3845 close_nointr(STDERR_FILENO);
3847 kmsg_socket_pair[0] = safe_close(kmsg_socket_pair[0]);
3848 rtnl_socket_pair[0] = safe_close(rtnl_socket_pair[0]);
3850 reset_all_signal_handlers();
3851 reset_signal_mask();
3853 r = open_terminal(console, O_RDWR);
3854 if (r != STDIN_FILENO) {
3860 log_error_errno(r, "Failed to open console: %m");
3861 _exit(EXIT_FAILURE);
3864 if (dup2(STDIN_FILENO, STDOUT_FILENO) != STDOUT_FILENO ||
3865 dup2(STDIN_FILENO, STDERR_FILENO) != STDERR_FILENO) {
3866 log_error_errno(errno, "Failed to duplicate console: %m");
3867 _exit(EXIT_FAILURE);
3871 log_error_errno(errno, "setsid() failed: %m");
3872 _exit(EXIT_FAILURE);
3875 if (reset_audit_loginuid() < 0)
3876 _exit(EXIT_FAILURE);
/* Have the kernel send SIGKILL to the child when the parent dies. */
3878 if (prctl(PR_SET_PDEATHSIG, SIGKILL) < 0) {
3879 log_error_errno(errno, "PR_SET_PDEATHSIG failed: %m");
3880 _exit(EXIT_FAILURE);
3883 /* Mark everything as slave, so that we still
3884 * receive mounts from the real root, but don't
3885 * propagate mounts to the real root. */
3886 if (mount(NULL, "/", NULL, MS_SLAVE|MS_REC, NULL) < 0) {
3887 log_error_errno(errno, "MS_SLAVE|MS_REC failed: %m");
3888 _exit(EXIT_FAILURE);
3891 if (mount_devices(arg_directory,
3892 root_device, root_device_rw,
3893 home_device, home_device_rw,
3894 srv_device, srv_device_rw) < 0)
3895 _exit(EXIT_FAILURE);
3897 /* Turn directory into bind mount */
3898 if (mount(arg_directory, arg_directory, "bind", MS_BIND|MS_REC, NULL) < 0) {
3899 log_error_errno(errno, "Failed to make bind mount: %m");
3900 _exit(EXIT_FAILURE);
3903 r = setup_volatile(arg_directory);
3905 _exit(EXIT_FAILURE);
3907 if (setup_volatile_state(arg_directory) < 0)
3908 _exit(EXIT_FAILURE);
3910 r = base_filesystem_create(arg_directory);
3912 _exit(EXIT_FAILURE);
3914 if (arg_read_only) {
3915 r = bind_remount_recursive(arg_directory, true);
3917 log_error_errno(r, "Failed to make tree read-only: %m");
3918 _exit(EXIT_FAILURE);
3922 if (mount_all(arg_directory) < 0)
3923 _exit(EXIT_FAILURE);
3925 if (copy_devnodes(arg_directory) < 0)
3926 _exit(EXIT_FAILURE);
3928 if (setup_ptmx(arg_directory) < 0)
3929 _exit(EXIT_FAILURE);
3931 dev_setup(arg_directory);
3933 if (setup_propagate(arg_directory) < 0)
3934 _exit(EXIT_FAILURE);
3936 if (setup_seccomp() < 0)
3937 _exit(EXIT_FAILURE);
3939 if (setup_dev_console(arg_directory, console) < 0)
3940 _exit(EXIT_FAILURE);
3942 if (setup_kmsg(arg_directory, kmsg_socket_pair[1]) < 0)
3943 _exit(EXIT_FAILURE);
3944 kmsg_socket_pair[1] = safe_close(kmsg_socket_pair[1]);
3946 if (send_rtnl(rtnl_socket_pair[1]) < 0)
3947 _exit(EXIT_FAILURE);
3948 rtnl_socket_pair[1] = safe_close(rtnl_socket_pair[1]);
3950 /* Tell the parent that we are ready, and that
3951 * it can cgroupify us so that we lack access
3952 * to certain devices and resources. */
3953 (void) barrier_place(&barrier);
3955 if (setup_boot_id(arg_directory) < 0)
3956 _exit(EXIT_FAILURE);
3958 if (setup_timezone(arg_directory) < 0)
3959 _exit(EXIT_FAILURE);
3961 if (setup_resolv_conf(arg_directory) < 0)
3962 _exit(EXIT_FAILURE);
3964 if (setup_journal(arg_directory) < 0)
3965 _exit(EXIT_FAILURE);
3967 if (mount_binds(arg_directory, arg_bind, false) < 0)
3968 _exit(EXIT_FAILURE);
3970 if (mount_binds(arg_directory, arg_bind_ro, true) < 0)
3971 _exit(EXIT_FAILURE);
3973 if (mount_tmpfs(arg_directory) < 0)
3974 _exit(EXIT_FAILURE);
3976 /* Wait until we are cgroup-ified, so that we
3977 * can mount the right cgroup path writable */
3978 (void) barrier_sync_next(&barrier);
3980 if (mount_cgroup(arg_directory) < 0)
3981 _exit(EXIT_FAILURE);
/* Make arg_directory the new root: enter it, move its mount onto the
 * root directory, chroot into it and reset the working directory. */
3983 if (chdir(arg_directory) < 0) {
3984 log_error_errno(errno, "chdir(%s) failed: %m", arg_directory);
3985 _exit(EXIT_FAILURE);
3988 if (mount(arg_directory, "/", NULL, MS_MOVE, NULL) < 0) {
3989 log_error_errno(errno, "mount(MS_MOVE) failed: %m");
3990 _exit(EXIT_FAILURE);
3993 if (chroot(".") < 0) {
3994 log_error_errno(errno, "chroot() failed: %m");
3995 _exit(EXIT_FAILURE);
3998 if (chdir("/") < 0) {
3999 log_error_errno(errno, "chdir() failed: %m");
4000 _exit(EXIT_FAILURE);
4005 if (arg_private_network)
4008 if (drop_capabilities() < 0) {
4009 log_error_errno(errno, "drop_capabilities() failed: %m");
4010 _exit(EXIT_FAILURE);
4013 r = change_uid_gid(&home);
4015 _exit(EXIT_FAILURE);
/* Fill the remaining environment slots of envp in order. */
4017 if ((asprintf((char**)(envp + n_env++), "HOME=%s", home ? home: "/root") < 0) ||
4018 (asprintf((char**)(envp + n_env++), "USER=%s", arg_user ? arg_user : "root") < 0) ||
4019 (asprintf((char**)(envp + n_env++), "LOGNAME=%s", arg_user ? arg_user : "root") < 0)) {
4021 _exit(EXIT_FAILURE);
4024 if (!sd_id128_equal(arg_uuid, SD_ID128_NULL)) {
4027 if (asprintf((char**)(envp + n_env++), "container_uuid=%s", id128_format_as_uuid(arg_uuid, as_uuid)) < 0) {
4029 _exit(EXIT_FAILURE);
/* Hand socket-activation descriptors on to the payload, which is
 * addressed as PID 1 in LISTEN_PID. */
4033 if (fdset_size(fds) > 0) {
4034 r = fdset_cloexec(fds, false);
4036 log_error_errno(r, "Failed to unset O_CLOEXEC for file descriptors.");
4037 _exit(EXIT_FAILURE);
4040 if ((asprintf((char **)(envp + n_env++), "LISTEN_FDS=%u", n_fd_passed) < 0) ||
4041 (asprintf((char **)(envp + n_env++), "LISTEN_PID=1") < 0)) {
4043 _exit(EXIT_FAILURE);
/* An explicitly requested personality wins; otherwise PER_LINUX32 is
 * selected when `secondary` is set. */
4049 if (arg_personality != 0xffffffffLU) {
4050 if (personality(arg_personality) < 0) {
4051 log_error_errno(errno, "personality() failed: %m");
4052 _exit(EXIT_FAILURE);
4054 } else if (secondary) {
4055 if (personality(PER_LINUX32) < 0) {
4056 log_error_errno(errno, "personality() failed: %m");
4057 _exit(EXIT_FAILURE);
4062 if (arg_selinux_context)
4063 if (setexeccon((security_context_t) arg_selinux_context) < 0) {
4064 log_error_errno(errno, "setexeccon(\"%s\") failed: %m", arg_selinux_context);
4065 _exit(EXIT_FAILURE);
4069 if (!strv_isempty(arg_setenv)) {
4072 n = strv_env_merge(2, envp, arg_setenv);
4075 _exit(EXIT_FAILURE);
4080 env_use = (char**) envp;
4082 /* Wait until the parent is ready with the setup, too... */
4083 if (!barrier_place_and_sync(&barrier))
4084 _exit(EXIT_FAILURE);
4090 /* Automatically search for the init system */
4092 l = 1 + argc - optind;
4093 a = newa(char*, l + 1);
4094 memcpy(a + 1, argv + optind, l * sizeof(char*));
/* Each execve() below is reached only if the previous one failed. */
4096 a[0] = (char*) "/usr/lib/systemd/systemd";
4097 execve(a[0], a, env_use);
4099 a[0] = (char*) "/lib/systemd/systemd";
4100 execve(a[0], a, env_use);
4102 a[0] = (char*) "/sbin/init";
4103 execve(a[0], a, env_use);
4104 } else if (argc > optind)
4105 execvpe(argv[optind], argv + optind, env_use);
4107 chdir(home ? home : "/root");
4108 execle("/bin/bash", "-bash", NULL, env_use);
4109 execle("/bin/sh", "-sh", NULL, env_use);
4112 log_error_errno(errno, "execv() failed: %m");
4113 _exit(EXIT_FAILURE);
/* Parent side. */
4116 barrier_set_role(&barrier, BARRIER_PARENT);
4120 kmsg_socket_pair[1] = safe_close(kmsg_socket_pair[1]);
4121 rtnl_socket_pair[1] = safe_close(rtnl_socket_pair[1]);
4123 /* Wait for the most basic Child-setup to be done,
4124 * before we add hardware to it, and place it in a
4126 if (barrier_sync_next(&barrier)) {
4129 r = move_network_interfaces(pid);
4133 r = setup_veth(pid, veth_name, &ifi);
4137 r = setup_bridge(veth_name, &ifi);
4141 r = setup_macvlan(pid);
4145 r = setup_ipvlan(pid);
4149 r = register_machine(pid, ifi);
4153 /* Block SIGCHLD here, before notifying child.
4154 * process_pty() will handle it with the other signals. */
4155 r = sigprocmask(SIG_BLOCK, &mask_chld, NULL);
4159 /* Reset signal to default */
4160 r = default_signals(SIGCHLD, -1);
4164 /* Notify the child that the parent is ready with all
4165 * its setup, and that the child can now hand over
4166 * control to the code to run inside the container. */
4167 (void) barrier_place(&barrier);
4169 /* And wait that the child is completely ready now. */
4170 if (barrier_place_and_sync(&barrier)) {
4171 _cleanup_event_unref_ sd_event *event = NULL;
4172 _cleanup_(pty_forward_freep) PTYForward *forward = NULL;
4173 _cleanup_rtnl_unref_ sd_rtnl *rtnl = NULL;
4178 "STATUS=Container running.\n"
4179 "X_NSPAWN_LEADER_PID=" PID_FMT, pid);
4181 r = sd_event_new(&event);
4183 log_error_errno(r, "Failed to get default event source: %m");
/* NOTE(review): SIGINT and SIGTERM are registered twice below, once
 * with on_orderly_shutdown and once without a callback; presumably
 * these are alternatives selected by a condition not visible here. */
4188 /* Try to kill the init system on SIGINT or SIGTERM */
4189 sd_event_add_signal(event, NULL, SIGINT, on_orderly_shutdown, UINT32_TO_PTR(pid));
4190 sd_event_add_signal(event, NULL, SIGTERM, on_orderly_shutdown, UINT32_TO_PTR(pid));
4192 /* Immediately exit */
4193 sd_event_add_signal(event, NULL, SIGINT, NULL, NULL);
4194 sd_event_add_signal(event, NULL, SIGTERM, NULL, NULL);
4197 /* simply exit on sigchld */
4198 sd_event_add_signal(event, NULL, SIGCHLD, NULL, NULL);
4200 if (arg_expose_ports) {
4201 r = watch_rtnl(event, rtnl_socket_pair[0], &exposed, &rtnl);
4205 (void) expose_ports(rtnl, &exposed);
4208 rtnl_socket_pair[0] = safe_close(rtnl_socket_pair[0]);
/* Relay data between our terminal and the PTY master until the event
 * loop finishes. */
4210 r = pty_forward_new(event, master, true, &forward);
4212 log_error_errno(r, "Failed to create PTY forwarder: %m");
4216 r = sd_event_loop(event);
4218 log_error_errno(r, "Failed to run event loop: %m");
4222 pty_forward_get_last_char(forward, &last_char);
4224 forward = pty_forward_free(forward);
4226 if (!arg_quiet && last_char != '\n')
4229 /* Kill if it is not dead yet anyway */
4230 terminate_machine(pid);
4234 /* Normally redundant, but better safe than sorry */
4237 r = wait_for_container(pid, &container_status);
4241 /* We failed to wait for the container, or the
4242 * container exited abnormally */
4244 else if (r > 0 || container_status == CONTAINER_TERMINATED){
4245 /* The container exited with a non-zero
4246 * status, or with zero status and no reboot
4252 /* CONTAINER_REBOOTED, loop again */
4254 if (arg_keep_unit) {
4255 /* Special handling if we are running as a
4256 * service: instead of simply restarting the
4257 * machine we want to restart the entire
4258 * service, so let's inform systemd about this
4259 * with the special exit code 133. The service
4260 * file uses RestartForceExitStatus=133 so
4261 * that this results in a full nspawn
4262 * restart. This is necessary since we might
4263 * have cgroup parameters set we want to have
4270 flush_ports(&exposed);
4276 "STATUS=Terminating...");
4278 loop_remove(loop_nr, &image_fd);
4283 if (remove_subvol && arg_directory) {
4286 k = btrfs_subvol_remove(arg_directory);
4288 log_warning_errno(k, "Cannot remove subvolume '%s', ignoring: %m", arg_directory);
4294 p = strappenda("/run/systemd/nspawn/propagate/", arg_machine);
4295 (void) rm_rf(p, false, true, false);
4298 free(arg_directory);
4303 strv_free(arg_setenv);
4304 strv_free(arg_network_interfaces);
4305 strv_free(arg_network_macvlan);
4306 strv_free(arg_network_ipvlan);
4307 strv_free(arg_bind);
4308 strv_free(arg_bind_ro);
4309 strv_free(arg_tmpfs);
4311 flush_ports(&exposed);
4313 while (arg_expose_ports) {
4314 ExposePort *p = arg_expose_ports;
4315 LIST_REMOVE(ports, arg_expose_ports, p);
4319 return r < 0 ? EXIT_FAILURE : ret;