1 /*-*- Mode: C; c-basic-offset: 8; indent-tabs-mode: nil -*-*/
4 This file is part of systemd.
6 Copyright 2010 Lennart Poettering
8 systemd is free software; you can redistribute it and/or modify it
9 under the terms of the GNU Lesser General Public License as published by
10 the Free Software Foundation; either version 2.1 of the License, or
11 (at your option) any later version.
13 systemd is distributed in the hope that it will be useful, but
14 WITHOUT ANY WARRANTY; without even the implied warranty of
15 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
16 Lesser General Public License for more details.
18 You should have received a copy of the GNU Lesser General Public License
19 along with systemd; If not, see <http://www.gnu.org/licenses/>.
25 #include <sys/types.h>
26 #include <sys/syscall.h>
27 #include <sys/mount.h>
33 #include <sys/prctl.h>
36 #include <sys/signalfd.h>
40 #include <sys/socket.h>
41 #include <linux/netlink.h>
43 #include <linux/veth.h>
44 #include <sys/personality.h>
45 #include <linux/loop.h>
48 #include <selinux/selinux.h>
56 #include <blkid/blkid.h>
59 #include "sd-daemon.h"
69 #include "cgroup-util.h"
71 #include "path-util.h"
72 #include "loopback-setup.h"
73 #include "dev-setup.h"
78 #include "bus-error.h"
80 #include "bus-kernel.h"
83 #include "rtnl-util.h"
84 #include "udev-util.h"
85 #include "blkid-util.h"
87 #include "siphash24.h"
89 #include "base-filesystem.h"
91 #include "event-util.h"
92 #include "capability.h"
94 #include "btrfs-util.h"
95 #include "machine-image.h"
97 #include "in-addr-util.h"
99 #include "local-addresses.h"
102 #include "seccomp-util.h"
/* One exposed-port entry. Entries are chained through the "ports" list
 * linkage declared with LIST_FIELDS; parse_argv() fills in protocol,
 * host_port and container_port for each --port= option. */
105 typedef struct ExposePort {
108 uint16_t container_port;
109 LIST_FIELDS(struct ExposePort, ports);
/* Container state values; CONTAINER_TERMINATED is the first enumerator. */
112 typedef enum ContainerStatus {
113 CONTAINER_TERMINATED,
/* Journal linking mode; the selected value is kept in arg_link_journal. */
117 typedef enum LinkJournal {
/* Volatile mode; the selected value is kept in arg_volatile. */
124 typedef enum Volatile {
/* Command-line settings. These file-scope variables hold the defaults and
 * are overwritten by parse_argv() as options are processed. */
130 static char *arg_directory = NULL;
131 static char *arg_template = NULL;
132 static char *arg_user = NULL;
133 static sd_id128_t arg_uuid = {};
134 static char *arg_machine = NULL;
135 static const char *arg_selinux_context = NULL;
136 static const char *arg_selinux_apifs_context = NULL;
137 static const char *arg_slice = NULL;
138 static bool arg_private_network = false;
139 static bool arg_read_only = false;
140 static bool arg_boot = false;
141 static bool arg_ephemeral = false;
142 static LinkJournal arg_link_journal = LINK_AUTO;
143 static bool arg_link_journal_try = false;
/* Capability bit mask retained by default. parse_argv() ORs in the
 * --capability= set (and CAP_NET_ADMIN when private networking is enabled)
 * and then clears the --drop-capability= set. */
144 static uint64_t arg_retain =
145 (1ULL << CAP_CHOWN) |
146 (1ULL << CAP_DAC_OVERRIDE) |
147 (1ULL << CAP_DAC_READ_SEARCH) |
148 (1ULL << CAP_FOWNER) |
149 (1ULL << CAP_FSETID) |
150 (1ULL << CAP_IPC_OWNER) |
152 (1ULL << CAP_LEASE) |
153 (1ULL << CAP_LINUX_IMMUTABLE) |
154 (1ULL << CAP_NET_BIND_SERVICE) |
155 (1ULL << CAP_NET_BROADCAST) |
156 (1ULL << CAP_NET_RAW) |
157 (1ULL << CAP_SETGID) |
158 (1ULL << CAP_SETFCAP) |
159 (1ULL << CAP_SETPCAP) |
160 (1ULL << CAP_SETUID) |
161 (1ULL << CAP_SYS_ADMIN) |
162 (1ULL << CAP_SYS_CHROOT) |
163 (1ULL << CAP_SYS_NICE) |
164 (1ULL << CAP_SYS_PTRACE) |
165 (1ULL << CAP_SYS_TTY_CONFIG) |
166 (1ULL << CAP_SYS_RESOURCE) |
167 (1ULL << CAP_SYS_BOOT) |
168 (1ULL << CAP_AUDIT_WRITE) |
169 (1ULL << CAP_AUDIT_CONTROL) |
/* arg_bind and arg_bind_ro hold source/destination path pairs and arg_tmpfs
 * holds path/options pairs; each is stored as consecutive string-vector
 * entries. */
171 static char **arg_bind = NULL;
172 static char **arg_bind_ro = NULL;
173 static char **arg_tmpfs = NULL;
174 static char **arg_setenv = NULL;
175 static bool arg_quiet = false;
176 static bool arg_share_system = false;
177 static bool arg_register = true;
178 static bool arg_keep_unit = false;
179 static char **arg_network_interfaces = NULL;
180 static char **arg_network_macvlan = NULL;
181 static bool arg_network_veth = false;
182 static const char *arg_network_bridge = NULL;
/* 0xffffffffLU is also the value parse_argv() treats as "unknown or
 * unsupported" when returned by personality_from_string(); as a default it
 * presumably means "not set" -- TODO confirm against the code that applies it. */
183 static unsigned long arg_personality = 0xffffffffLU;
184 static char *arg_image = NULL;
185 static Volatile arg_volatile = VOLATILE_NO;
/* Head of the list of --port= entries. */
186 static ExposePort *arg_expose_ports = NULL;
/* Prints the command-line usage summary to standard output, substituting
 * program_invocation_short_name for the leading %s of the format string. */
188 static void help(void) {
189 printf("%s [OPTIONS...] [PATH] [ARGUMENTS...]\n\n"
190 "Spawn a minimal namespace container for debugging, testing and building.\n\n"
191 " -h --help Show this help\n"
192 " --version Print version string\n"
193 " -q --quiet Do not show status information\n"
194 " -D --directory=PATH Root directory for the container\n"
195 " --template=PATH Initialize root directory from template directory,\n"
197 " -x --ephemeral Run container with snapshot of root directory, and\n"
198 " remove it after exit\n"
199 " -i --image=PATH File system device or disk image for the container\n"
200 " -b --boot Boot up full system (i.e. invoke init)\n"
201 " -u --user=USER Run the command under specified user or uid\n"
202 " -M --machine=NAME Set the machine name for the container\n"
203 " --uuid=UUID Set a specific machine UUID for the container\n"
204 " -S --slice=SLICE Place the container in the specified slice\n"
205 " --private-network Disable network in container\n"
206 " --network-interface=INTERFACE\n"
207 " Assign an existing network interface to the\n"
209 " --network-macvlan=INTERFACE\n"
210 " Create a macvlan network interface based on an\n"
211 " existing network interface to the container\n"
212 " --network-veth Add a virtual ethernet connection between host\n"
214 " --network-bridge=INTERFACE\n"
215 " Add a virtual ethernet connection between host\n"
216 " and container and add it to an existing bridge on\n"
218 " -p --port=[PROTOCOL:]HOSTPORT[:CONTAINERPORT]\n"
219 " Expose a container IP port on the host\n"
220 " -Z --selinux-context=SECLABEL\n"
221 " Set the SELinux security context to be used by\n"
222 " processes in the container\n"
223 " -L --selinux-apifs-context=SECLABEL\n"
224 " Set the SELinux security context to be used by\n"
225 " API/tmpfs file systems in the container\n"
226 " --capability=CAP In addition to the default, retain specified\n"
228 " --drop-capability=CAP Drop the specified capability from the default set\n"
229 " --link-journal=MODE Link up guest journal, one of no, auto, guest, host,\n"
230 " try-guest, try-host\n"
231 " -j Equivalent to --link-journal=try-guest\n"
232 " --read-only Mount the root directory read-only\n"
233 " --bind=PATH[:PATH] Bind mount a file or directory from the host into\n"
235 " --bind-ro=PATH[:PATH] Similar, but creates a read-only bind mount\n"
236 " --tmpfs=PATH:[OPTIONS] Mount an empty tmpfs to the specified directory\n"
237 " --setenv=NAME=VALUE Pass an environment variable to PID 1\n"
238 " --share-system Share system namespaces with host\n"
239 " --register=BOOLEAN Register container as machine\n"
240 " --keep-unit Do not register a scope for the machine, reuse\n"
241 " the service unit nspawn is running in\n"
242 " --volatile[=MODE] Run the system in volatile mode\n"
243 , program_invocation_short_name);
/* Stores a cleaned-up form of "path" in *b. The path is resolved with
 * canonicalize_file_name(); path_make_absolute_cwd() is also called on it
 * (presumably as the fallback when canonicalization fails -- TODO confirm),
 * and the result is passed through path_kill_slashes() before being
 * assigned to *b. */
246 static int set_sanitized_path(char **b, const char *path) {
252 p = canonicalize_file_name(path);
257 p = path_make_absolute_cwd(path);
263 *b = path_kill_slashes(p);
/* Parses the command line with getopt_long() and stores the results in the
 * file-scope arg_* variables. After all options are consumed it rejects
 * option combinations that cannot be used together and computes the final
 * arg_retain capability mask from the defaults plus the --capability= and
 * --drop-capability= sets. Invalid input is reported through log_error()
 * or log_error_errno(). */
267 static int parse_argv(int argc, char *argv[]) {
284 ARG_NETWORK_INTERFACE,
293 static const struct option options[] = {
294 { "help", no_argument, NULL, 'h' },
295 { "version", no_argument, NULL, ARG_VERSION },
296 { "directory", required_argument, NULL, 'D' },
297 { "template", required_argument, NULL, ARG_TEMPLATE },
298 { "ephemeral", no_argument, NULL, 'x' },
299 { "user", required_argument, NULL, 'u' },
300 { "private-network", no_argument, NULL, ARG_PRIVATE_NETWORK },
301 { "boot", no_argument, NULL, 'b' },
302 { "uuid", required_argument, NULL, ARG_UUID },
303 { "read-only", no_argument, NULL, ARG_READ_ONLY },
304 { "capability", required_argument, NULL, ARG_CAPABILITY },
305 { "drop-capability", required_argument, NULL, ARG_DROP_CAPABILITY },
306 { "link-journal", required_argument, NULL, ARG_LINK_JOURNAL },
307 { "bind", required_argument, NULL, ARG_BIND },
308 { "bind-ro", required_argument, NULL, ARG_BIND_RO },
309 { "tmpfs", required_argument, NULL, ARG_TMPFS },
310 { "machine", required_argument, NULL, 'M' },
311 { "slice", required_argument, NULL, 'S' },
312 { "setenv", required_argument, NULL, ARG_SETENV },
313 { "selinux-context", required_argument, NULL, 'Z' },
314 { "selinux-apifs-context", required_argument, NULL, 'L' },
315 { "quiet", no_argument, NULL, 'q' },
316 { "share-system", no_argument, NULL, ARG_SHARE_SYSTEM },
317 { "register", required_argument, NULL, ARG_REGISTER },
318 { "keep-unit", no_argument, NULL, ARG_KEEP_UNIT },
319 { "network-interface", required_argument, NULL, ARG_NETWORK_INTERFACE },
320 { "network-macvlan", required_argument, NULL, ARG_NETWORK_MACVLAN },
321 { "network-veth", no_argument, NULL, ARG_NETWORK_VETH },
322 { "network-bridge", required_argument, NULL, ARG_NETWORK_BRIDGE },
323 { "personality", required_argument, NULL, ARG_PERSONALITY },
324 { "image", required_argument, NULL, 'i' },
325 { "volatile", optional_argument, NULL, ARG_VOLATILE },
326 { "port", required_argument, NULL, 'p' },
/* Capability bits collected from --capability= (plus) and
 * --drop-capability= (minus); applied to arg_retain at the end. */
331 uint64_t plus = 0, minus = 0;
336 while ((c = getopt_long(argc, argv, "+hD:u:bL:M:jS:Z:qi:xp:", options, NULL)) >= 0)
345 puts(PACKAGE_STRING);
346 puts(SYSTEMD_FEATURES);
350 r = set_sanitized_path(&arg_directory, optarg);
352 return log_error_errno(r, "Invalid root directory: %m");
357 r = set_sanitized_path(&arg_template, optarg);
359 return log_error_errno(r, "Invalid template directory: %m");
364 r = set_sanitized_path(&arg_image, optarg);
366 return log_error_errno(r, "Invalid image path: %m");
371 arg_ephemeral = true;
376 arg_user = strdup(optarg);
382 case ARG_NETWORK_BRIDGE:
383 arg_network_bridge = optarg;
387 case ARG_NETWORK_VETH:
388 arg_network_veth = true;
389 arg_private_network = true;
392 case ARG_NETWORK_INTERFACE:
393 if (strv_extend(&arg_network_interfaces, optarg) < 0)
396 arg_private_network = true;
399 case ARG_NETWORK_MACVLAN:
400 if (strv_extend(&arg_network_macvlan, optarg) < 0)
405 case ARG_PRIVATE_NETWORK:
406 arg_private_network = true;
414 r = sd_id128_from_string(optarg, &arg_uuid);
416 log_error("Invalid UUID: %s", optarg);
426 if (isempty(optarg)) {
430 if (!machine_name_is_valid(optarg)) {
431 log_error("Invalid machine name: %s", optarg);
435 r = free_and_strdup(&arg_machine, optarg);
443 arg_selinux_context = optarg;
447 arg_selinux_apifs_context = optarg;
451 arg_read_only = true;
/* --capability= and --drop-capability= share this code: the argument is
 * a comma-separated list of capability names, with "all" selecting every
 * bit of the respective mask. */
455 case ARG_DROP_CAPABILITY: {
456 const char *state, *word;
459 FOREACH_WORD_SEPARATOR(word, length, optarg, ",", state) {
460 _cleanup_free_ char *t;
462 t = strndup(word, length);
466 if (streq(t, "all")) {
467 if (c == ARG_CAPABILITY)
468 plus = (uint64_t) -1;
470 minus = (uint64_t) -1;
474 cap = capability_from_name(t);
476 log_error("Failed to parse capability %s.", t);
480 if (c == ARG_CAPABILITY)
481 plus |= 1ULL << (uint64_t) cap;
483 minus |= 1ULL << (uint64_t) cap;
491 arg_link_journal = LINK_GUEST;
492 arg_link_journal_try = true;
/* The try-* modes select the same LinkJournal value as their plain
 * counterpart and additionally set arg_link_journal_try. */
495 case ARG_LINK_JOURNAL:
496 if (streq(optarg, "auto")) {
497 arg_link_journal = LINK_AUTO;
498 arg_link_journal_try = false;
499 } else if (streq(optarg, "no")) {
500 arg_link_journal = LINK_NO;
501 arg_link_journal_try = false;
502 } else if (streq(optarg, "guest")) {
503 arg_link_journal = LINK_GUEST;
504 arg_link_journal_try = false;
505 } else if (streq(optarg, "host")) {
506 arg_link_journal = LINK_HOST;
507 arg_link_journal_try = false;
508 } else if (streq(optarg, "try-guest")) {
509 arg_link_journal = LINK_GUEST;
510 arg_link_journal_try = true;
511 } else if (streq(optarg, "try-host")) {
512 arg_link_journal = LINK_HOST;
513 arg_link_journal_try = true;
515 log_error("Failed to parse link journal mode %s", optarg);
/* Bind mounts: the argument is split at the first ':' into two paths,
 * both of which must be absolute, and they are appended as a pair to
 * arg_bind or arg_bind_ro. */
523 _cleanup_free_ char *a = NULL, *b = NULL;
527 x = c == ARG_BIND ? &arg_bind : &arg_bind_ro;
529 e = strchr(optarg, ':');
531 a = strndup(optarg, e - optarg);
541 if (!path_is_absolute(a) || !path_is_absolute(b)) {
542 log_error("Invalid bind mount specification: %s", optarg);
546 r = strv_extend(x, a);
550 r = strv_extend(x, b);
/* tmpfs mounts: path and mount options are pushed to arg_tmpfs as a
 * pair; "mode=0755" is the option string used as a default. */
558 _cleanup_free_ char *a = NULL, *b = NULL;
561 e = strchr(optarg, ':');
563 a = strndup(optarg, e - optarg);
567 b = strdup("mode=0755");
573 if (!path_is_absolute(a)) {
574 log_error("Invalid tmpfs specification: %s", optarg);
578 r = strv_push(&arg_tmpfs, a);
584 r = strv_push(&arg_tmpfs, b);
596 if (!env_assignment_is_valid(optarg)) {
597 log_error("Environment variable assignment '%s' is not valid.", optarg);
601 n = strv_env_set(arg_setenv, optarg);
605 strv_free(arg_setenv);
614 case ARG_SHARE_SYSTEM:
615 arg_share_system = true;
619 r = parse_boolean(optarg);
621 log_error("Failed to parse --register= argument: %s", optarg);
629 arg_keep_unit = true;
632 case ARG_PERSONALITY:
634 arg_personality = personality_from_string(optarg);
635 if (arg_personality == 0xffffffffLU) {
636 log_error("Unknown or unsupported personality '%s'.", optarg);
/* --volatile takes an optional argument: a boolean, or the literal
 * "state". */
645 arg_volatile = VOLATILE_YES;
647 r = parse_boolean(optarg);
649 if (streq(optarg, "state"))
650 arg_volatile = VOLATILE_STATE;
652 log_error("Failed to parse --volatile= argument: %s", optarg);
656 arg_volatile = r ? VOLATILE_YES : VOLATILE_NO;
/* --port=[PROTOCOL:]HOSTPORT[:CONTAINERPORT]. An optional "tcp:" or
 * "udp:" prefix selects the protocol. With a single port number the
 * same value is used for host and container. */
662 const char *split, *e;
663 uint16_t container_port, host_port;
667 if ((e = startswith(optarg, "tcp:")))
668 protocol = IPPROTO_TCP;
669 else if ((e = startswith(optarg, "udp:")))
670 protocol = IPPROTO_UDP;
673 protocol = IPPROTO_TCP;
676 split = strchr(e, ':');
/* Copy the host-port substring so it can be parsed on its own. */
678 char v[split - e + 1];
680 memcpy(v, e, split - e);
683 r = safe_atou16(v, &host_port);
684 if (r < 0 || host_port <= 0) {
685 log_error("Failed to parse host port: %s", optarg);
689 r = safe_atou16(split + 1, &container_port);
691 r = safe_atou16(e, &container_port);
692 host_port = container_port;
/* This check covers the container port, so say so in the message. */
695 if (r < 0 || container_port <= 0) {
696 log_error("Failed to parse container port: %s", optarg);
/* A host port may be forwarded only once per protocol. */
700 LIST_FOREACH(ports, p, arg_expose_ports) {
701 if (p->protocol == protocol && p->host_port == host_port) {
702 log_error("Duplicate port specification: %s", optarg);
707 p = new(ExposePort, 1);
711 p->protocol = protocol;
712 p->host_port = host_port;
713 p->container_port = container_port;
715 LIST_PREPEND(ports, arg_expose_ports, p);
724 assert_not_reached("Unhandled option");
/* Sharing the system namespaces turns machine registration off. */
727 if (arg_share_system)
728 arg_register = false;
730 if (arg_boot && arg_share_system) {
731 log_error("--boot and --share-system may not be combined.");
735 if (arg_keep_unit && cg_pid_get_owner_uid(0, NULL) >= 0) {
736 log_error("--keep-unit may not be used when invoked from a user session.");
740 if (arg_directory && arg_image) {
741 log_error("--directory= and --image= may not be combined.");
745 if (arg_template && arg_image) {
746 log_error("--template= and --image= may not be combined.");
750 if (arg_template && !(arg_directory || arg_machine)) {
751 log_error("--template= needs --directory= or --machine=.");
755 if (arg_ephemeral && arg_template) {
756 log_error("--ephemeral and --template= may not be combined.");
760 if (arg_ephemeral && arg_image) {
761 log_error("--ephemeral and --image= may not be combined.");
765 if (arg_ephemeral && !IN_SET(arg_link_journal, LINK_NO, LINK_AUTO)) {
766 log_error("--ephemeral and --link-journal= may not be combined.");
770 if (arg_volatile != VOLATILE_NO && arg_read_only) {
771 log_error("Cannot combine --read-only with --volatile. Note that --volatile already implies a read-only base hierarchy.");
775 if (arg_expose_ports && !arg_private_network) {
776 log_error("Cannot use --port= without private networking.");
/* Final capability set: defaults plus requested additions (and
 * CAP_NET_ADMIN with private networking), minus requested drops. */
780 arg_retain = (arg_retain | plus | (arg_private_network ? 1ULL << CAP_NET_ADMIN : 0)) & ~minus;
/* Mounts the file systems listed in mount_table below "dest". For each
 * entry the target path is built from dest and the entry's location, checked
 * with path_is_mount_point(), created with mkdir_p(), and then mounted with
 * mount(). Failures on entries whose last field is true are logged as
 * errors; failures on the others are logged as warnings. */
785 static int mount_all(const char *dest) {
787 typedef struct MountPoint {
796 static const MountPoint mount_table[] = {
797 { "proc", "/proc", "proc", NULL, MS_NOSUID|MS_NOEXEC|MS_NODEV, true },
798 { "/proc/sys", "/proc/sys", NULL, NULL, MS_BIND, true }, /* Bind mount first */
799 { NULL, "/proc/sys", NULL, NULL, MS_BIND|MS_RDONLY|MS_REMOUNT, true }, /* Then, make it r/o */
800 { "sysfs", "/sys", "sysfs", NULL, MS_RDONLY|MS_NOSUID|MS_NOEXEC|MS_NODEV, true },
801 { "tmpfs", "/dev", "tmpfs", "mode=755", MS_NOSUID|MS_STRICTATIME, true },
802 { "devpts", "/dev/pts", "devpts","newinstance,ptmxmode=0666,mode=620,gid=" STRINGIFY(TTY_GID), MS_NOSUID|MS_NOEXEC, true },
803 { "tmpfs", "/dev/shm", "tmpfs", "mode=1777", MS_NOSUID|MS_NODEV|MS_STRICTATIME, true },
804 { "tmpfs", "/run", "tmpfs", "mode=755", MS_NOSUID|MS_NODEV|MS_STRICTATIME, true },
806 { "/sys/fs/selinux", "/sys/fs/selinux", NULL, NULL, MS_BIND, false }, /* Bind mount first */
807 { NULL, "/sys/fs/selinux", NULL, NULL, MS_BIND|MS_RDONLY|MS_REMOUNT, false }, /* Then, make it r/o */
814 for (k = 0; k < ELEMENTSOF(mount_table); k++) {
815 _cleanup_free_ char *where = NULL;
817 _cleanup_free_ char *options = NULL;
822 where = strjoin(dest, "/", mount_table[k].where, NULL);
826 t = path_is_mount_point(where, true);
828 log_error_errno(t, "Failed to detect whether %s is a mount point: %m", where);
836 /* Skip this entry if it is not a remount. */
837 if (mount_table[k].what && t > 0)
840 t = mkdir_p(where, 0755);
842 if (mount_table[k].fatal) {
843 log_error_errno(t, "Failed to create directory %s: %m", where);
848 log_warning_errno(t, "Failed to create directory %s: %m", where);
/* With an SELinux API file system context configured, tmpfs and devpts
 * mounts get a context="..." option appended to their option string. */
854 if (arg_selinux_apifs_context &&
855 (streq_ptr(mount_table[k].what, "tmpfs") || streq_ptr(mount_table[k].what, "devpts"))) {
856 options = strjoin(mount_table[k].options, ",context=\"", arg_selinux_apifs_context, "\"", NULL);
863 o = mount_table[k].options;
866 if (mount(mount_table[k].what,
869 mount_table[k].flags,
872 if (mount_table[k].fatal) {
873 log_error_errno(errno, "mount(%s) failed: %m", where);
878 log_warning_errno(errno, "mount(%s) failed: %m", where);
/* Bind mounts each (source, destination) pair of the string vector "l" into
 * the tree below "dest".
 *
 * For every pair the source is stat()ed and the destination path is formed
 * by appending the second element to dest. If the destination exists its
 * file type must match the source's; if it does not exist, its parent
 * directories are created and a mount point of the matching kind (directory,
 * FIFO, socket or regular file) is made. Other source types are refused.
 * After the bind mount, bind_remount_recursive() is used for the read-only
 * case (see "ro"). Failures are logged. */
885 static int mount_binds(const char *dest, char **l, bool ro) {
888 STRV_FOREACH_PAIR(x, y, l) {
889 _cleanup_free_ char *where = NULL;
890 struct stat source_st, dest_st;
893 if (stat(*x, &source_st) < 0)
894 return log_error_errno(errno, "Failed to stat %s: %m", *x);
896 where = strappend(dest, *y);
900 r = stat(where, &dest_st);
902 if ((source_st.st_mode & S_IFMT) != (dest_st.st_mode & S_IFMT)) {
903 log_error("The file types of %s and %s do not match. Refusing bind mount", *x, where);
906 } else if (errno == ENOENT) {
907 r = mkdir_parents_label(where, 0755);
909 return log_error_errno(r, "Failed to bind mount %s: %m", *x);
911 log_error_errno(errno, "Failed to bind mount %s: %m", *x);
915 /* Create the mount point, but be conservative -- refuse to create block
916 * and char devices. */
917 if (S_ISDIR(source_st.st_mode)) {
918 r = mkdir_label(where, 0755);
/* mkdir_label() reports its error in the return value (it is passed to
 * log_error_errno() below), so test r rather than errno, as
 * mount_tmpfs() does. */
919 if (r < 0 && r != -EEXIST)
920 return log_error_errno(r, "Failed to create mount point %s: %m", where);
921 } else if (S_ISFIFO(source_st.st_mode)) {
922 r = mkfifo(where, 0644);
923 if (r < 0 && errno != EEXIST)
924 return log_error_errno(errno, "Failed to create mount point %s: %m", where);
925 } else if (S_ISSOCK(source_st.st_mode)) {
926 r = mknod(where, 0644 | S_IFSOCK, 0);
927 if (r < 0 && errno != EEXIST)
928 return log_error_errno(errno, "Failed to create mount point %s: %m", where);
929 } else if (S_ISREG(source_st.st_mode)) {
932 return log_error_errno(r, "Failed to create mount point %s: %m", where);
934 log_error("Refusing to create mountpoint for file: %s", *x);
938 if (mount(*x, where, "bind", MS_BIND, NULL) < 0)
939 return log_error_errno(errno, "mount(%s) failed: %m", where);
942 r = bind_remount_recursive(where, true);
944 return log_error_errno(r, "Read-Only bind mount failed: %m");
/* Mounts a cgroup file system for "controller" at
 * <dest>/sys/fs/cgroup/<hierarchy>, adding MS_RDONLY when read_only is set.
 * The target is first checked with path_is_mount_point(); the handling of an
 * already-mounted target is not visible here (presumably it is skipped --
 * TODO confirm). Failures are logged and returned as error codes. */
951 static int mount_cgroup_hierarchy(const char *dest, const char *controller, const char *hierarchy, bool read_only) {
955 to = strappenda(dest, "/sys/fs/cgroup/", hierarchy);
957 r = path_is_mount_point(to, false);
959 return log_error_errno(r, "Failed to determine if %s is mounted already: %m", to);
965 if (mount("cgroup", to, "cgroup", MS_NOSUID|MS_NOEXEC|MS_NODEV|(read_only ? MS_RDONLY : 0), controller) < 0)
966 return log_error_errno(errno, "Failed to mount to %s: %m", to);
/* Sets up the cgroup tree below <dest>/sys/fs/cgroup.
 *
 * A tmpfs is mounted at the cgroup root, then every controller reported by
 * cg_kernel_controllers() is handled: a controller whose host path
 * /sys/fs/cgroup/<controller> is not a symlink is mounted read-only as its
 * own hierarchy; a symlinked controller has its combined hierarchy mounted
 * read-only and a symlink created for it. The "name=systemd" hierarchy is
 * mounted writable, our own cgroup inside it is turned into a bind mount,
 * and then the systemd hierarchy root and the tmpfs are remounted
 * read-only. Failures are logged and returned as error codes. */
971 static int mount_cgroup(const char *dest) {
972 _cleanup_set_free_free_ Set *controllers = NULL;
973 _cleanup_free_ char *own_cgroup_path = NULL;
974 const char *cgroup_root, *systemd_root, *systemd_own;
977 controllers = set_new(&string_hash_ops);
981 r = cg_kernel_controllers(controllers);
983 return log_error_errno(r, "Failed to determine cgroup controllers: %m");
985 r = cg_pid_get_path(NULL, 0, &own_cgroup_path);
987 return log_error_errno(r, "Failed to determine our own cgroup path: %m");
989 cgroup_root = strappenda(dest, "/sys/fs/cgroup");
990 if (mount("tmpfs", cgroup_root, "tmpfs", MS_NOSUID|MS_NOEXEC|MS_NODEV|MS_STRICTATIME, "mode=755") < 0)
991 return log_error_errno(errno, "Failed to mount tmpfs to /sys/fs/cgroup: %m");
994 _cleanup_free_ char *controller = NULL, *origin = NULL, *combined = NULL;
996 controller = set_steal_first(controllers);
1000 origin = strappend("/sys/fs/cgroup/", controller);
1004 r = readlink_malloc(origin, &combined);
1006 /* Not a symbolic link, but directly a single cgroup hierarchy */
1008 r = mount_cgroup_hierarchy(dest, controller, controller, true);
1013 return log_error_errno(r, "Failed to read link %s: %m", origin);
1015 _cleanup_free_ char *target = NULL;
1017 target = strjoin(dest, "/sys/fs/cgroup/", controller, NULL);
1021 /* A symbolic link, a combination of controllers in one hierarchy */
1023 if (!filename_is_valid(combined)) {
1024 log_warning("Ignoring invalid combined hierarchy %s.", combined);
1028 r = mount_cgroup_hierarchy(dest, combined, combined, true);
1032 if (symlink(combined, target) < 0)
1033 return log_error_errno(errno, "Failed to create symlink for combined hierarchy: %m");
1037 r = mount_cgroup_hierarchy(dest, "name=systemd", "systemd", false);
1041 /* Make our own cgroup a (writable) bind mount */
1042 systemd_own = strappenda(dest, "/sys/fs/cgroup/systemd", own_cgroup_path);
1043 if (mount(systemd_own, systemd_own, NULL, MS_BIND, NULL) < 0)
1044 return log_error_errno(errno, "Failed to turn %s into a bind mount: %m", own_cgroup_path);
1046 /* And then remount the systemd cgroup root read-only */
1047 systemd_root = strappenda(dest, "/sys/fs/cgroup/systemd");
1048 if (mount(NULL, systemd_root, NULL, MS_BIND|MS_REMOUNT|MS_NOSUID|MS_NOEXEC|MS_NODEV|MS_RDONLY, NULL) < 0)
1049 return log_error_errno(errno, "Failed to mount cgroup root read-only: %m");
/* Finally make the tmpfs holding the hierarchies read-only as well. */
1051 if (mount(NULL, cgroup_root, NULL, MS_REMOUNT|MS_NOSUID|MS_NOEXEC|MS_NODEV|MS_STRICTATIME|MS_RDONLY, "mode=755") < 0)
1052 return log_error_errno(errno, "Failed to remount %s read-only: %m", cgroup_root);
/* Mounts a tmpfs for every (path, options) pair in arg_tmpfs. The mount
 * point is created below "dest" with mkdir_label() (an already existing
 * directory is not treated as an error) and the tmpfs is mounted there with
 * the pair's option string. Failures are logged and returned. */
1057 static int mount_tmpfs(const char *dest) {
1060 STRV_FOREACH_PAIR(i, o, arg_tmpfs) {
1061 _cleanup_free_ char *where = NULL;
1064 where = strappend(dest, *i);
1068 r = mkdir_label(where, 0755);
1069 if (r < 0 && r != -EEXIST)
1070 return log_error_errno(r, "Creating mount point for tmpfs %s failed: %m", where);
1072 if (mount("tmpfs", where, "tmpfs", MS_NODEV|MS_STRICTATIME, *o) < 0)
1073 return log_error_errno(errno, "tmpfs mount to %s failed: %m", where);
/* Points <dest>/etc/localtime at the same zone as the host's /etc/localtime.
 *
 * The host link target is read and must lead into /usr/share/zoneinfo/
 * (relative or absolute form); the remainder is the zone name. If the
 * container's link already names that zone nothing needs to change. The zone
 * file must exist below <dest>/usr/share/zoneinfo/; then the parent
 * directory is created, the existing entry is removed, and a relative
 * symlink "../usr/share/zoneinfo/<zone>" is created. Problems are logged as
 * warnings or errors. */
1079 static int setup_timezone(const char *dest) {
1080 _cleanup_free_ char *where = NULL, *p = NULL, *q = NULL, *check = NULL, *what = NULL;
1086 /* Fix the timezone, if possible */
1087 r = readlink_malloc("/etc/localtime", &p);
1089 log_warning("/etc/localtime is not a symlink, not updating container timezone.");
1093 z = path_startswith(p, "../usr/share/zoneinfo/");
1095 z = path_startswith(p, "/usr/share/zoneinfo/");
1097 log_warning("/etc/localtime does not point into /usr/share/zoneinfo/, not updating container timezone.");
1101 where = strappend(dest, "/etc/localtime");
1105 r = readlink_malloc(where, &q);
1107 y = path_startswith(q, "../usr/share/zoneinfo/");
1109 y = path_startswith(q, "/usr/share/zoneinfo/");
1111 /* Already pointing to the right place? Then do nothing .. */
1112 if (y && streq(y, z))
1116 check = strjoin(dest, "/usr/share/zoneinfo/", z, NULL);
1120 if (access(check, F_OK) < 0) {
1121 log_warning("Timezone %s does not exist in container, not updating container timezone.", z);
1125 what = strappend("../usr/share/zoneinfo/", z);
1129 r = mkdir_parents(where, 0755);
1131 log_error_errno(r, "Failed to create directory for timezone info %s in container: %m", where);
1137 if (r < 0 && errno != ENOENT) {
1138 log_error_errno(errno, "Failed to remove existing timezone info %s in container: %m", where);
1143 if (symlink(what, where) < 0) {
1144 log_error_errno(errno, "Failed to correct timezone of container: %m");
/* Copies the host's /etc/resolv.conf to <dest>/etc/resolv.conf. The function
 * tests arg_private_network first (the branch body is not visible here;
 * presumably the copy is skipped in that case -- TODO confirm). Failures to
 * create the parent directory or to copy the file are only logged as
 * warnings. */
1151 static int setup_resolv_conf(const char *dest) {
1152 _cleanup_free_ char *where = NULL;
1157 if (arg_private_network)
1160 /* Fix resolv.conf, if possible */
1161 where = strappend(dest, "/etc/resolv.conf");
1165 /* We don't really care for the results of this really. If it
1166 * fails, it fails, but meh... */
1167 r = mkdir_parents(where, 0755);
1169 log_warning_errno(r, "Failed to create parent directory for resolv.conf %s: %m", where);
1174 r = copy_file("/etc/resolv.conf", where, O_TRUNC|O_NOFOLLOW, 0644, 0);
1176 log_warning_errno(r, "Failed to copy /etc/resolv.conf to %s: %m", where);
/* Implements --volatile=state. Only acts when arg_volatile is
 * VOLATILE_STATE: the tree at "directory" is remounted read-only with
 * bind_remount_recursive(), <directory>/var is created if needed, and a
 * tmpfs is mounted over it. Failures are logged and returned as error
 * codes. */
1184 static int setup_volatile_state(const char *directory) {
1190 if (arg_volatile != VOLATILE_STATE)
1193 /* --volatile=state means we simply overmount /var
1194 with a tmpfs, and the rest read-only. */
1196 r = bind_remount_recursive(directory, true);
1198 return log_error_errno(r, "Failed to remount %s read-only: %m", directory);
1200 p = strappenda(directory, "/var");
/* Report the path that could not be created (p), not the root directory. */
1202 if (r < 0 && errno != EEXIST)
1203 return log_error_errno(errno, "Failed to create %s: %m", p);
1205 if (mount("tmpfs", p, "tmpfs", MS_STRICTATIME, "mode=755") < 0)
1206 return log_error_errno(errno, "Failed to mount tmpfs to /var: %m");
/* Implements --volatile=yes. Only acts when arg_volatile is VOLATILE_YES: a
 * tmpfs is mounted on a fresh temporary directory, <directory>/usr is bind
 * mounted (recursively) to <tmpdir>/usr and remounted read-only, and the
 * temporary tree is then moved onto "directory" with MS_MOVE. The
 * tmpfs_mounted and bind_mounted flags record how far setup got
 * (presumably for cleanup on failure, which is not visible here). */
1211 static int setup_volatile(const char *directory) {
1212 bool tmpfs_mounted = false, bind_mounted = false;
1213 char template[] = "/tmp/nspawn-volatile-XXXXXX";
1219 if (arg_volatile != VOLATILE_YES)
1222 /* --volatile=yes means we mount a tmpfs to the root dir, and
1223 the original /usr to use inside it, and that read-only. */
1225 if (!mkdtemp(template))
1226 return log_error_errno(errno, "Failed to create temporary directory: %m");
1228 if (mount("tmpfs", template, "tmpfs", MS_STRICTATIME, "mode=755") < 0) {
1229 log_error_errno(errno, "Failed to mount tmpfs for root directory: %m");
1234 tmpfs_mounted = true;
/* f is the source /usr, t its location inside the new tmpfs root. */
1236 f = strappenda(directory, "/usr");
1237 t = strappenda(template, "/usr");
1240 if (r < 0 && errno != EEXIST) {
1241 log_error_errno(errno, "Failed to create %s: %m", t);
1246 if (mount(f, t, "bind", MS_BIND|MS_REC, NULL) < 0) {
1247 log_error_errno(errno, "Failed to create /usr bind mount: %m");
1252 bind_mounted = true;
1254 r = bind_remount_recursive(t, true);
1256 log_error_errno(r, "Failed to remount %s read-only: %m", t);
1260 if (mount(template, directory, NULL, MS_MOVE, NULL) < 0) {
1261 log_error_errno(errno, "Failed to move root mount: %m");
/* Formats "id" with the hex-digit layout 8-4-4-4-12 (dash-separated, as
 * given by the format string) for the caller-supplied buffer "s", which is
 * declared with room for 37 characters. */
1279 static char* id128_format_as_uuid(sd_id128_t id, char s[37]) {
1282 "%02x%02x%02x%02x-%02x%02x-%02x%02x-%02x%02x-%02x%02x%02x%02x%02x%02x",
1283 SD_ID128_FORMAT_VAL(id));
/* Gives the container its own boot ID. A random 128-bit ID is generated,
 * formatted as a UUID string, written to
 * <dest>/dev/proc-sys-kernel-random-boot-id, and that file is bind mounted
 * over <dest>/proc/sys/kernel/random/boot_id. The bind mount is then
 * remounted read-only; a failure of that remount is only a warning. The
 * function tests arg_share_system first (branch body not visible here). */
1288 static int setup_boot_id(const char *dest) {
1289 _cleanup_free_ char *from = NULL, *to = NULL;
1290 sd_id128_t rnd = {};
1296 if (arg_share_system)
1299 /* Generate a new randomized boot ID, so that each boot-up of
1300 * the container gets a new one */
1302 from = strappend(dest, "/dev/proc-sys-kernel-random-boot-id");
1303 to = strappend(dest, "/proc/sys/kernel/random/boot_id");
1307 r = sd_id128_randomize(&rnd);
1309 return log_error_errno(r, "Failed to generate random boot id: %m");
1311 id128_format_as_uuid(rnd, as_uuid);
1313 r = write_string_file(from, as_uuid);
1315 return log_error_errno(r, "Failed to write boot id: %m");
1317 if (mount(from, to, "bind", MS_BIND, NULL) < 0) {
1318 log_error_errno(errno, "Failed to bind mount boot id: %m");
1320 } else if (mount(from, to, "bind", MS_BIND|MS_REMOUNT|MS_RDONLY, NULL))
1321 log_warning_errno(errno, "Failed to make boot id read-only: %m");
/* Recreates the device nodes named in "devnodes" below <dest>/dev. Each
 * host node /dev/<name> is stat()ed; a stat() failure other than ENOENT is
 * returned as an error, and a node that is neither a character nor a block
 * device is reported as an error. Otherwise the parent directory is created
 * and mknod() is called with the host node's mode and device number. */
1327 static int copy_devnodes(const char *dest) {
1329 static const char devnodes[] =
1340 _cleanup_umask_ mode_t u;
1346 NULSTR_FOREACH(d, devnodes) {
1347 _cleanup_free_ char *from = NULL, *to = NULL;
1350 from = strappend("/dev/", d);
1351 to = strjoin(dest, "/dev/", d, NULL);
1355 if (stat(from, &st) < 0) {
1357 if (errno != ENOENT)
1358 return log_error_errno(errno, "Failed to stat %s: %m", from);
1360 } else if (!S_ISCHR(st.st_mode) && !S_ISBLK(st.st_mode)) {
1362 log_error("%s is not a char or block device, cannot copy", from);
1366 r = mkdir_parents(to, 0775);
1368 log_error_errno(r, "Failed to create parent directory of %s: %m", to);
1372 if (mknod(to, st.st_mode, st.st_rdev) < 0)
1373 return log_error_errno(errno, "mknod(%s) failed: %m", to);
/* Creates <dest>/dev/ptmx as a symlink to "pts/ptmx". A symlink() failure
 * is logged and returned as an error code. */
1380 static int setup_ptmx(const char *dest) {
1381 _cleanup_free_ char *p = NULL;
1383 p = strappend(dest, "/dev/ptmx");
1387 if (symlink("pts/ptmx", p) < 0)
1388 return log_error_errno(errno, "Failed to create /dev/ptmx symlink: %m");
/* Makes the TTY "console" available as <dest>/dev/console. The TTY's access
 * mode and ownership are set to 0600, 0:0; a placeholder device node is
 * created at <dest>/dev/console using /dev/null's type and device number;
 * and the TTY is bind mounted over it. Failures are logged and returned as
 * error codes. */
1393 static int setup_dev_console(const char *dest, const char *console) {
1394 _cleanup_umask_ mode_t u;
1404 if (stat("/dev/null", &st) < 0)
1405 return log_error_errno(errno, "Failed to stat /dev/null: %m");
1407 r = chmod_and_chown(console, 0600, 0, 0);
1409 return log_error_errno(r, "Failed to correct access mode for TTY: %m");
1411 /* We need to bind mount the right tty to /dev/console since
1412 * ptys can only exist on pts file systems. To have something
1413 * to bind mount things on we create a device node first, and
1414 * use /dev/null for that since the cgroups device policy
1415 * allows us to create that freely, while we cannot create
1416 * /dev/console. (Note that the major minor doesn't actually
1417 * matter here, since we mount it over anyway). */
1419 to = strappenda(dest, "/dev/console");
/* Keep /dev/null's file type bits but force the permissions to 0600. */
1420 if (mknod(to, (st.st_mode & ~07777) | 0600, st.st_rdev) < 0)
1421 return log_error_errno(errno, "mknod() for /dev/console failed: %m");
1423 if (mount(console, to, "bind", MS_BIND, NULL) < 0)
1424 return log_error_errno(errno, "Bind mount for /dev/console failed: %m");
/* Provides <dest>/proc/kmsg as a FIFO. The FIFO is created at
 * <dest>/dev/kmsg with mode 0600 and owner 0:0, bind mounted to
 * <dest>/proc/kmsg, opened, and the open file descriptor is sent over
 * "kmsg_socket" as SCM_RIGHTS ancillary data. Failures are logged and
 * returned as error codes. */
1429 static int setup_kmsg(const char *dest, int kmsg_socket) {
1430 _cleanup_free_ char *from = NULL, *to = NULL;
1431 _cleanup_umask_ mode_t u;
/* Control-message buffer sized for one file descriptor. */
1434 struct cmsghdr cmsghdr;
1435 uint8_t buf[CMSG_SPACE(sizeof(int))];
1437 struct msghdr mh = {
1438 .msg_control = &control,
1439 .msg_controllen = sizeof(control),
1441 struct cmsghdr *cmsg;
1444 assert(kmsg_socket >= 0);
1448 /* We create the kmsg FIFO as /dev/kmsg, but immediately
1449 * delete it after bind mounting it to /proc/kmsg. While FIFOs
1450 * on the reading side behave very similar to /proc/kmsg,
1451 * their writing side behaves differently from /dev/kmsg in
1452 * that writing blocks when nothing is reading. In order to
1453 * avoid any problems with containers deadlocking due to this
1454 * we simply make /dev/kmsg unavailable to the container. */
1455 if (asprintf(&from, "%s/dev/kmsg", dest) < 0 ||
1456 asprintf(&to, "%s/proc/kmsg", dest) < 0)
1459 if (mkfifo(from, 0600) < 0)
1460 return log_error_errno(errno, "mkfifo() for /dev/kmsg failed: %m");
1462 r = chmod_and_chown(from, 0600, 0, 0);
1464 return log_error_errno(r, "Failed to correct access mode for /dev/kmsg: %m");
1466 if (mount(from, to, "bind", MS_BIND, NULL) < 0)
1467 return log_error_errno(errno, "Bind mount for /proc/kmsg failed: %m");
1469 fd = open(from, O_RDWR|O_NDELAY|O_CLOEXEC);
1471 return log_error_errno(errno, "Failed to open fifo: %m");
/* Pack the FIFO fd into a single SCM_RIGHTS control message. */
1473 cmsg = CMSG_FIRSTHDR(&mh);
1474 cmsg->cmsg_level = SOL_SOCKET;
1475 cmsg->cmsg_type = SCM_RIGHTS;
1476 cmsg->cmsg_len = CMSG_LEN(sizeof(int));
1477 memcpy(CMSG_DATA(cmsg), &fd, sizeof(int));
1479 mh.msg_controllen = cmsg->cmsg_len;
1481 /* Store away the fd in the socket, so that it stays open as
1482 * long as we run the child */
1483 k = sendmsg(kmsg_socket, &mh, MSG_NOSIGNAL);
1487 return log_error_errno(errno, "Failed to send FIFO fd: %m");
1489 /* And now make the FIFO unavailable as /dev/kmsg... */
/* Opens a NETLINK_ROUTE socket and sends its file descriptor over "send_fd"
 * as SCM_RIGHTS ancillary data. The function tests arg_expose_ports first
 * (the branch body is not visible here; presumably nothing is sent when no
 * ports are exposed -- TODO confirm). Failures are logged and returned as
 * error codes. */
1494 static int send_rtnl(int send_fd) {
/* Control-message buffer sized for one file descriptor. */
1496 struct cmsghdr cmsghdr;
1497 uint8_t buf[CMSG_SPACE(sizeof(int))];
1499 struct msghdr mh = {
1500 .msg_control = &control,
1501 .msg_controllen = sizeof(control),
1503 struct cmsghdr *cmsg;
1504 _cleanup_close_ int fd = -1;
1507 assert(send_fd >= 0);
1509 if (!arg_expose_ports)
1512 fd = socket(PF_NETLINK, SOCK_RAW|SOCK_CLOEXEC|SOCK_NONBLOCK, NETLINK_ROUTE);
1514 return log_error_errno(errno, "failed to allocate container netlink: %m");
/* Pack the netlink fd into a single SCM_RIGHTS control message. */
1516 cmsg = CMSG_FIRSTHDR(&mh);
1517 cmsg->cmsg_level = SOL_SOCKET;
1518 cmsg->cmsg_type = SCM_RIGHTS;
1519 cmsg->cmsg_len = CMSG_LEN(sizeof(int));
1520 memcpy(CMSG_DATA(cmsg), &fd, sizeof(int));
1522 mh.msg_controllen = cmsg->cmsg_len;
1524 /* Store away the fd in the socket, so that it stays open as
1525 * long as we run the child */
1526 k = sendmsg(send_fd, &mh, MSG_NOSIGNAL);
1528 return log_error_errno(errno, "Failed to send netlink fd: %m");
/* Drops the firewall state tied to the address stored in *exposed.
 *
 * For every entry in arg_expose_ports, fw_add_local_dnat() is called with
 * `false` as its first argument (presumably "remove the rule" -- confirm
 * against fw_add_local_dnat()); a failure is only logged as a warning.
 * Afterwards *exposed is reset to IN_ADDR_NULL. Only AF_INET is handled.
 *
 * The statements under the two leading checks are not visible in this
 * view (presumably early returns when there is nothing to flush). */
1533 static int flush_ports(union in_addr_union *exposed) {
1535 int r, af = AF_INET;
1539 if (!arg_expose_ports)
1542 if (in_addr_is_null(af, exposed))
1545 log_debug("Lost IP address.");
1547 LIST_FOREACH(ports, p, arg_expose_ports) {
1548 r = fw_add_local_dnat(false,
1559 log_warning_errno(r, "Failed to modify firewall: %m");
/* Record that no address is exposed any more. */
1562 *exposed = IN_ADDR_NULL;
/* Re-evaluates which IPv4 address the exposed ports should point at.
 *
 * rtnl:    netlink connection handed to local_addresses().
 * exposed: in/out; the address the firewall rules currently refer to.
 *
 * Steps visible here: enumerate addresses with local_addresses(); test
 * addresses[0] for family AF_INET and scope < RT_SCOPE_LINK (the start of
 * that expression is not visible); call flush_ports() on one path
 * (presumably when the test fails -- TODO confirm); otherwise take
 * addresses[0].address as the new address, compare it with *exposed via
 * in_addr_equal(), log it, and call fw_add_local_dnat(true, ...) for each
 * entry of arg_expose_ports, passing the previous address (or NULL when
 * *exposed is the null address) as the last argument. Firewall failures
 * are logged as warnings only. Finally *exposed is updated.
 *
 * The result of in_addr_to_string() is not checked; strna() is applied to
 * `pretty` before it is logged. */
1566 static int expose_ports(sd_rtnl *rtnl, union in_addr_union *exposed) {
1567 _cleanup_free_ struct local_address *addresses = NULL;
1568 _cleanup_free_ char *pretty = NULL;
1569 union in_addr_union new_exposed;
1572 int af = AF_INET, r;
1576 /* Invoked each time an address is added or removed inside the
1579 if (!arg_expose_ports)
1582 r = local_addresses(rtnl, 0, af, &addresses);
1584 return log_error_errno(r, "Failed to enumerate local addresses: %m");
1587 addresses[0].family == af &&
1588 addresses[0].scope < RT_SCOPE_LINK;
1591 return flush_ports(exposed);
1593 new_exposed = addresses[0].address;
1594 if (in_addr_equal(af, exposed, &new_exposed))
1597 in_addr_to_string(af, &new_exposed, &pretty);
1598 log_debug("New container IP is %s.", strna(pretty));
1600 LIST_FOREACH(ports, p, arg_expose_ports) {
1602 r = fw_add_local_dnat(true,
1611 in_addr_is_null(af, exposed) ? NULL : exposed);
1613 log_warning_errno(r, "Failed to modify firewall: %m");
1616 *exposed = new_exposed;
/* Address-change callback. userdata is the union in_addr_union tracking
 * the currently exposed address; it is forwarded to expose_ports()
 * together with rtnl. The result of expose_ports() is not checked, and
 * the message m is not used on the lines visible here. */
1620 static int on_address_change(sd_rtnl *rtnl, sd_rtnl_message *m, void *userdata) {
1621 union in_addr_union *exposed = userdata;
1627 expose_ports(rtnl, exposed);
/* Receives a netlink socket over recv_fd (transferred as SCM_RIGHTS),
 * wraps it in an sd_rtnl object subscribed to RTNLGRP_IPV4_IFADDR,
 * registers on_address_change() for RTM_NEWADDR and RTM_DELADDR with
 * `exposed` as userdata, and attaches the object to `event`.
 *
 * event:   event loop to attach the rtnl object to.
 * recv_fd: socket the descriptor arrives on; asserted to be >= 0.
 * exposed: passed through to the address-change callback.
 * ret:     output for the rtnl object (the assignment is not visible in
 *          this view).
 *
 * The work is guarded by a check on arg_expose_ports whose body is not
 * visible here (presumably an early return -- TODO confirm). Each failing
 * step is reported through log_error_errno(). */
1631 static int watch_rtnl(sd_event *event, int recv_fd, union in_addr_union *exposed, sd_rtnl **ret) {
1633 struct cmsghdr cmsghdr;
1634 uint8_t buf[CMSG_SPACE(sizeof(int))];
1636 struct msghdr mh = {
1637 .msg_control = &control,
1638 .msg_controllen = sizeof(control),
1640 struct cmsghdr *cmsg;
1641 _cleanup_rtnl_unref_ sd_rtnl *rtnl = NULL;
1646 assert(recv_fd >= 0);
1649 if (!arg_expose_ports)
1652 k = recvmsg(recv_fd, &mh, MSG_NOSIGNAL);
1654 return log_error_errno(errno, "Failed to recv netlink fd: %m");
/* Validate that exactly one int-sized SCM_RIGHTS message was received. */
1656 cmsg = CMSG_FIRSTHDR(&mh);
1657 assert(cmsg->cmsg_level == SOL_SOCKET);
1658 assert(cmsg->cmsg_type == SCM_RIGHTS);
/* Must be a comparison: an assignment here would always pass and would
 * overwrite the length that was actually received. */
1659 assert(cmsg->cmsg_len == CMSG_LEN(sizeof(int)));
1660 memcpy(&fd, CMSG_DATA(cmsg), sizeof(int));
1662 r = sd_rtnl_open_fd(&rtnl, fd, 1, RTNLGRP_IPV4_IFADDR);
1665 return log_error_errno(r, "Failed to create rtnl object: %m");
1668 r = sd_rtnl_add_match(rtnl, RTM_NEWADDR, on_address_change, exposed);
1670 return log_error_errno(r, "Failed to subscribe to RTM_NEWADDR messages: %m");
1672 r = sd_rtnl_add_match(rtnl, RTM_DELADDR, on_address_change, exposed);
1674 return log_error_errno(r, "Failed to subscribe to RTM_DELADDR messages: %m");
1676 r = sd_rtnl_attach_event(rtnl, event, 0);
1678 return log_error_errno(r, "Failed to add to event loop: %m");
/* Sets the hostname to arg_machine through sethostname_idempotent().
 * The call is preceded by a check on arg_share_system; the statements
 * under that check and under the failure test are not visible in this
 * view (presumably an early return and an error return respectively --
 * TODO confirm). */
1686 static int setup_hostname(void) {
1688 if (arg_share_system)
1691 if (sethostname_idempotent(arg_machine) < 0)
/* Links the journal directory of the container rooted at `directory`
 * with the host's /var/log/journal/<machine-id>, according to
 * arg_link_journal (LINK_NO, LINK_AUTO, LINK_GUEST, LINK_HOST) and
 * arg_link_journal_try.
 *
 * Visible steps: read and validate the machine ID from
 * <directory>/etc/machine-id; refuse when it equals the host's ID; build
 * p (host path) and q (path inside the container tree); refuse or skip
 * when either is already a mount point; inspect p with
 * readlink_and_make_absolute(); for LINK_GUEST create a symlink p -> q;
 * for LINK_HOST create p and bind mount it onto q.
 *
 * Error reporting: calls whose result is stored in `r` (mkdir_p(),
 * readlink_and_make_absolute()) are logged with `r`, because errno is not
 * guaranteed to describe their failure; direct syscalls (symlink(),
 * mount(), ...) are logged with errno. That mkdir_p() returns a negative
 * errno-style code is assumed from how readlink_and_make_absolute() and
 * the other helpers in this function are checked -- confirm in mkdir.h. */
1697 static int setup_journal(const char *directory) {
1698 sd_id128_t machine_id, this_id;
1699 _cleanup_free_ char *p = NULL, *b = NULL, *q = NULL, *d = NULL;
1703 /* Don't link journals in ephemeral mode */
1707 p = strappend(directory, "/etc/machine-id");
1711 r = read_one_line_file(p, &b);
1712 if (r == -ENOENT && arg_link_journal == LINK_AUTO)
1715 return log_error_errno(r, "Failed to read machine ID from %s: %m", p);
1718 if (isempty(id) && arg_link_journal == LINK_AUTO)
1721 /* Verify validity */
1722 r = sd_id128_from_string(id, &machine_id);
1724 return log_error_errno(r, "Failed to parse machine ID from %s: %m", p);
1726 r = sd_id128_get_machine(&this_id);
1728 return log_error_errno(r, "Failed to retrieve machine ID: %m");
1730 if (sd_id128_equal(machine_id, this_id)) {
1731 log_full(arg_link_journal == LINK_AUTO ? LOG_WARNING : LOG_ERR,
1732 "Host and machine ids are equal (%s): refusing to link journals", id);
1733 if (arg_link_journal == LINK_AUTO)
1738 if (arg_link_journal == LINK_NO)
/* p: journal directory on the host; q: the same path inside the
 * container tree. */
1742 p = strappend("/var/log/journal/", id);
1743 q = strjoin(directory, "/var/log/journal/", id, NULL);
1747 if (path_is_mount_point(p, false) > 0) {
1748 if (arg_link_journal != LINK_AUTO) {
1749 log_error("%s: already a mount point, refusing to use for journal", p);
1756 if (path_is_mount_point(q, false) > 0) {
1757 if (arg_link_journal != LINK_AUTO) {
1758 log_error("%s: already a mount point, refusing to use for journal", q);
1765 r = readlink_and_make_absolute(p, &d);
1767 if ((arg_link_journal == LINK_GUEST ||
1768 arg_link_journal == LINK_AUTO) &&
1771 r = mkdir_p(q, 0755);
/* Log the code returned by mkdir_p(), not errno. */
1773 log_warning_errno(r, "Failed to create directory %s: %m", q);
1778 return log_error_errno(errno, "Failed to remove symlink %s: %m", p);
1779 } else if (r == -EINVAL) {
1781 if (arg_link_journal == LINK_GUEST &&
1784 if (errno == ENOTDIR) {
1785 log_error("%s already exists and is neither a symlink nor a directory", p);
1788 log_error_errno(errno, "Failed to remove %s: %m", p);
1792 } else if (r != -ENOENT) {
/* r still holds the readlink_and_make_absolute() result here. */
1793 log_error_errno(r, "readlink(%s) failed: %m", p);
1797 if (arg_link_journal == LINK_GUEST) {
1799 if (symlink(q, p) < 0) {
1800 if (arg_link_journal_try) {
1801 log_debug_errno(errno, "Failed to symlink %s to %s, skipping journal setup: %m", q, p);
1804 log_error_errno(errno, "Failed to symlink %s to %s: %m", q, p);
1809 r = mkdir_p(q, 0755);
1811 log_warning_errno(r, "Failed to create directory %s: %m", q);
1815 if (arg_link_journal == LINK_HOST) {
1816 /* don't create parents here -- if the host doesn't have
1817 * permanent journal set up, don't force it here */
1820 if (arg_link_journal_try) {
1821 log_debug_errno(errno, "Failed to create %s, skipping journal setup: %m", p);
1824 log_error_errno(errno, "Failed to create %s: %m", p);
1829 } else if (access(p, F_OK) < 0)
1832 if (dir_is_empty(q) == 0)
1833 log_warning("%s is not empty, proceeding anyway.", q);
1835 r = mkdir_p(q, 0755);
1837 log_error_errno(r, "Failed to create %s: %m", q);
1841 if (mount(p, q, "bind", MS_BIND, NULL) < 0)
1842 return log_error_errno(errno, "Failed to bind mount journal from host into guest: %m");
/* Passes the bitwise complement of arg_retain to
 * capability_bounding_set_drop() and returns its result (presumably this
 * drops everything outside arg_retain from the bounding set -- confirm
 * against capability.h). */
1847 static int drop_capabilities(void) {
1848 return capability_bounding_set_drop(~arg_retain, false);
/* Registers the container with org.freedesktop.machine1 over the system
 * bus.
 *
 * pid:           container process (its use is on lines not visible here).
 * local_ifindex: host-side interface index; when > 0 it is passed as a
 *                count of 1 followed by the index, otherwise with count 0.
 *
 * With arg_keep_unit set, RegisterMachineWithNetwork is called directly.
 * Otherwise a CreateMachineWithNetwork message is built by hand so that
 * unit properties can be appended in an a(sv) container: Slice (only when
 * arg_slice is non-empty), DevicePolicy=strict and a DeviceAllow list.
 * Failures are logged; a failed call is reported with the bus error
 * message. */
1851 static int register_machine(pid_t pid, int local_ifindex) {
1852 _cleanup_bus_error_free_ sd_bus_error error = SD_BUS_ERROR_NULL;
1853 _cleanup_bus_close_unref_ sd_bus *bus = NULL;
1859 r = sd_bus_default_system(&bus);
1861 return log_error_errno(r, "Failed to open system bus: %m");
1863 if (arg_keep_unit) {
1864 r = sd_bus_call_method(
1866 "org.freedesktop.machine1",
1867 "/org/freedesktop/machine1",
1868 "org.freedesktop.machine1.Manager",
1869 "RegisterMachineWithNetwork",
1874 SD_BUS_MESSAGE_APPEND_ID128(arg_uuid),
1878 strempty(arg_directory),
1879 local_ifindex > 0 ? 1 : 0, local_ifindex);
1881 _cleanup_bus_message_unref_ sd_bus_message *m = NULL;
1883 r = sd_bus_message_new_method_call(
1886 "org.freedesktop.machine1",
1887 "/org/freedesktop/machine1",
1888 "org.freedesktop.machine1.Manager",
1889 "CreateMachineWithNetwork");
1891 return log_error_errno(r, "Failed to create message: %m");
1893 r = sd_bus_message_append(
1897 SD_BUS_MESSAGE_APPEND_ID128(arg_uuid),
1901 strempty(arg_directory),
1902 local_ifindex > 0 ? 1 : 0, local_ifindex);
1904 return log_error_errno(r, "Failed to append message arguments: %m");
/* Unit properties follow as an array of (name, variant) pairs. */
1906 r = sd_bus_message_open_container(m, 'a', "(sv)");
1908 return log_error_errno(r, "Failed to open container: %m");
1910 if (!isempty(arg_slice)) {
1911 r = sd_bus_message_append(m, "(sv)", "Slice", "s", arg_slice);
1913 return log_error_errno(r, "Failed to append slice: %m");
1916 r = sd_bus_message_append(m, "(sv)", "DevicePolicy", "s", "strict");
1918 return log_error_errno(r, "Failed to add device policy: %m");
1920 r = sd_bus_message_append(m, "(sv)", "DeviceAllow", "a(ss)", 9,
1921 /* Allow the container to
1922 * access and create the API
1923 * device nodes, so that
1924 * PrivateDevices= in the
1925 * container can work
1930 "/dev/random", "rwm",
1931 "/dev/urandom", "rwm",
1933 "/dev/net/tun", "rwm",
1934 /* Allow the container
1935 * access to ptys. However,
1937 * container to ever create
1938 * these device nodes. */
1939 "/dev/pts/ptmx", "rw",
1942 return log_error_errno(r, "Failed to add device whitelist: %m");
1944 r = sd_bus_message_close_container(m);
1946 return log_error_errno(r, "Failed to close container: %m");
1948 r = sd_bus_call(bus, m, 0, &error, NULL);
1952 log_error("Failed to register machine: %s", bus_error_message(&error, r));
/* Asks org.freedesktop.machine1 to terminate the machine belonging to
 * `pid`: first a Manager method is called to obtain the machine's object
 * path (read from the reply as type "o"), then a method on the
 * org.freedesktop.machine1.Machine interface is called. The method names
 * and arguments are on lines not visible in this view.
 *
 * Both bus call failures are logged at debug level only; per the inline
 * comment, the machine may already have been cleaned up. Failure to open
 * the bus or to parse the reply is logged as an error and returned. */
1959 static int terminate_machine(pid_t pid) {
1960 _cleanup_bus_error_free_ sd_bus_error error = SD_BUS_ERROR_NULL;
1961 _cleanup_bus_message_unref_ sd_bus_message *reply = NULL;
1962 _cleanup_bus_close_unref_ sd_bus *bus = NULL;
1969 r = sd_bus_default_system(&bus);
1971 return log_error_errno(r, "Failed to open system bus: %m");
1973 r = sd_bus_call_method(
1975 "org.freedesktop.machine1",
1976 "/org/freedesktop/machine1",
1977 "org.freedesktop.machine1.Manager",
1984 /* Note that the machine might already have been
1985 * cleaned up automatically, hence don't consider it a
1986 * failure if we cannot get the machine object. */
1987 log_debug("Failed to get machine: %s", bus_error_message(&error, r));
1991 r = sd_bus_message_read(reply, "o", &path);
1993 return bus_log_parse_error(r);
1995 r = sd_bus_call_method(
1997 "org.freedesktop.machine1",
1999 "org.freedesktop.machine1.Machine",
2005 log_debug("Failed to terminate machine: %s", bus_error_message(&error, r));
/* Resets the audit login UID of the current process by writing
 * "4294967295" (the unset value, (uid_t) -1 as decimal) to
 * /proc/self/loginuid, unless the file already contains that value.
 *
 * The work is guarded by a check on arg_share_system whose body is not
 * visible in this view. A read failure is logged and returned (some
 * handling between the read and that return is not visible). A write
 * failure produces the long explanatory message below, which includes
 * strerror(-r). */
2012 static int reset_audit_loginuid(void) {
2013 _cleanup_free_ char *p = NULL;
2016 if (arg_share_system)
2019 r = read_one_line_file("/proc/self/loginuid", &p);
2023 return log_error_errno(r, "Failed to read /proc/self/loginuid: %m");
2025 /* Already reset? */
2026 if (streq(p, "4294967295"))
2029 r = write_string_file("/proc/self/loginuid", "4294967295");
2031 log_error("Failed to reset audit login UID. This probably means that your kernel is too\n"
2032 "old and you have audit enabled. Note that the auditing subsystem is known to\n"
2033 "be incompatible with containers on old kernels. Please make sure to upgrade\n"
2034 "your kernel or to turn off auditing with 'audit=0' on the kernel command line before\n"
2035 "using systemd-nspawn. Sleeping for 5s... (%s)\n", strerror(-r));
/* Fixed 128-bit keys passed as hash_key to generate_mac(): one each for
 * the host side of the veth pair, the container side, and macvlan
 * interfaces, so that each kind gets a different address. */
2043 #define HOST_HASH_KEY SD_ID128_MAKE(1a,37,6f,c7,46,ec,45,0b,ad,a3,d5,31,06,60,5d,b1)
2044 #define CONTAINER_HASH_KEY SD_ID128_MAKE(c3,c4,f9,19,b5,57,b2,1c,e6,cf,14,27,03,9c,ee,a2)
2045 #define MACVLAN_HASH_KEY SD_ID128_MAKE(00,13,6d,bc,66,83,44,81,bb,0c,f9,51,1f,24,a6,6f)
/* Derives a MAC address that is stable for a given host, container name
 * and index.
 *
 * mac:      output address.
 * hash_key: siphash24 key; its bytes select the address "family".
 * idx:      extra discriminator copied into the hashed buffer (the
 *           condition under which it is appended is not visible in this
 *           view).
 *
 * The hashed buffer v holds the host machine ID followed by arg_machine
 * (and idx). The first ETH_ALEN bytes of the siphash24 result become the
 * address, with the multicast bit cleared and the locally-administered
 * bit set. The allocation of v and the error handling after
 * sd_id128_get_machine() are on lines not visible here. */
2047 static int generate_mac(struct ether_addr *mac, sd_id128_t hash_key, uint64_t idx) {
2053 l = strlen(arg_machine);
2054 sz = sizeof(sd_id128_t) + l;
2060 /* fetch some persistent data unique to the host */
2061 r = sd_id128_get_machine((sd_id128_t*) v);
2065 /* combine with some data unique (on this host) to this
2066 * container instance */
2067 i = mempcpy(v + sizeof(sd_id128_t), arg_machine, l);
2070 memcpy(i, &idx, sizeof(idx));
2073 /* Let's hash the host machine ID plus the container name. We
2074 * use a fixed, but originally randomly created hash key here. */
2075 siphash24(result, v, sz, hash_key.bytes);
/* Compile-time guarantee that the hash is large enough for a MAC. */
2077 assert_cc(ETH_ALEN <= sizeof(result));
2078 memcpy(mac->ether_addr_octet, result, ETH_ALEN);
2080 /* see eth_random_addr in the kernel */
2081 mac->ether_addr_octet[0] &= 0xfe; /* clear multicast bit */
2082 mac->ether_addr_octet[0] |= 0x02; /* set local assignment bit (IEEE802) */
/* Creates a veth pair through rtnetlink: the host side is named
 * "ve-<machine>" (or "vb-<machine>" when arg_network_bridge is set), the
 * peer is named "host0" and is created in the network namespace of `pid`.
 *
 * pid:        process whose network namespace receives the peer.
 * iface_name: output buffer (IFNAMSIZ bytes) for the host-side name.
 * ifi:        output for the host-side interface index (the assignment is
 *             not visible in this view).
 *
 * Guarded by checks on arg_private_network and arg_network_veth whose
 * bodies are not visible here. Both MAC addresses come from
 * generate_mac() with different hash keys. Every failing step is
 * reported through log_error_errno(). */
2087 static int setup_veth(pid_t pid, char iface_name[IFNAMSIZ], int *ifi) {
2088 _cleanup_rtnl_message_unref_ sd_rtnl_message *m = NULL;
2089 _cleanup_rtnl_unref_ sd_rtnl *rtnl = NULL;
2090 struct ether_addr mac_host, mac_container;
2093 if (!arg_private_network)
2096 if (!arg_network_veth)
2099 /* Use two different interface name prefixes depending whether
2100 * we are in bridge mode or not. */
/* snprintf() is given IFNAMSIZ - 1 as size, so the name is cut to at most
 * IFNAMSIZ - 2 characters plus the terminating NUL. */
2101 snprintf(iface_name, IFNAMSIZ - 1, "%s-%s",
2102 arg_network_bridge ? "vb" : "ve", arg_machine);
2104 r = generate_mac(&mac_container, CONTAINER_HASH_KEY, 0);
2106 return log_error_errno(r, "Failed to generate predictable MAC address for container side: %m");
2108 r = generate_mac(&mac_host, HOST_HASH_KEY, 0);
2110 return log_error_errno(r, "Failed to generate predictable MAC address for host side: %m");
2112 r = sd_rtnl_open(&rtnl, 0);
2114 return log_error_errno(r, "Failed to connect to netlink: %m");
2116 r = sd_rtnl_message_new_link(rtnl, &m, RTM_NEWLINK, 0);
2118 return log_error_errno(r, "Failed to allocate netlink message: %m");
/* Host-side attributes. */
2120 r = sd_rtnl_message_append_string(m, IFLA_IFNAME, iface_name);
2122 return log_error_errno(r, "Failed to add netlink interface name: %m");
2124 r = sd_rtnl_message_append_ether_addr(m, IFLA_ADDRESS, &mac_host);
2126 return log_error_errno(r, "Failed to add netlink MAC address: %m");
/* Nested containers: IFLA_LINKINFO > IFLA_INFO_DATA("veth") >
 * VETH_INFO_PEER, which holds the peer's attributes. */
2128 r = sd_rtnl_message_open_container(m, IFLA_LINKINFO);
2130 return log_error_errno(r, "Failed to open netlink container: %m");
2132 r = sd_rtnl_message_open_container_union(m, IFLA_INFO_DATA, "veth");
2134 return log_error_errno(r, "Failed to open netlink container: %m");
2136 r = sd_rtnl_message_open_container(m, VETH_INFO_PEER);
2138 return log_error_errno(r, "Failed to open netlink container: %m");
2140 r = sd_rtnl_message_append_string(m, IFLA_IFNAME, "host0");
2142 return log_error_errno(r, "Failed to add netlink interface name: %m");
2144 r = sd_rtnl_message_append_ether_addr(m, IFLA_ADDRESS, &mac_container);
2146 return log_error_errno(r, "Failed to add netlink MAC address: %m");
2148 r = sd_rtnl_message_append_u32(m, IFLA_NET_NS_PID, pid);
2150 return log_error_errno(r, "Failed to add netlink namespace field: %m");
/* Close the three containers opened above, innermost first. */
2152 r = sd_rtnl_message_close_container(m);
2154 return log_error_errno(r, "Failed to close netlink container: %m");
2156 r = sd_rtnl_message_close_container(m);
2158 return log_error_errno(r, "Failed to close netlink container: %m");
2160 r = sd_rtnl_message_close_container(m);
2162 return log_error_errno(r, "Failed to close netlink container: %m");
2164 r = sd_rtnl_call(rtnl, m, 0, NULL);
2166 return log_error_errno(r, "Failed to add new veth interfaces: %m");
/* Look up the index of the freshly created host-side interface. */
2168 i = (int) if_nametoindex(iface_name);
2170 return log_error_errno(errno, "Failed to resolve interface %s: %m", iface_name);
/* Brings the host-side veth interface up and enslaves it to the bridge
 * named by arg_network_bridge, using one RTM_SETLINK request that sets
 * IFF_UP, names the interface via IFLA_IFNAME and sets IFLA_MASTER to
 * the bridge's index.
 *
 * veth_name: host-side interface name.
 * ifi:       output parameter; its assignment is not visible in this view.
 *
 * Guarded by checks on arg_private_network, arg_network_veth and
 * arg_network_bridge whose bodies are not visible here. Every failing
 * step is reported through log_error_errno(). */
2177 static int setup_bridge(const char veth_name[], int *ifi) {
2178 _cleanup_rtnl_message_unref_ sd_rtnl_message *m = NULL;
2179 _cleanup_rtnl_unref_ sd_rtnl *rtnl = NULL;
2182 if (!arg_private_network)
2185 if (!arg_network_veth)
2188 if (!arg_network_bridge)
2191 bridge = (int) if_nametoindex(arg_network_bridge);
2193 return log_error_errno(errno, "Failed to resolve interface %s: %m", arg_network_bridge);
2197 r = sd_rtnl_open(&rtnl, 0);
2199 return log_error_errno(r, "Failed to connect to netlink: %m");
2201 r = sd_rtnl_message_new_link(rtnl, &m, RTM_SETLINK, 0);
2203 return log_error_errno(r, "Failed to allocate netlink message: %m");
2205 r = sd_rtnl_message_link_set_flags(m, IFF_UP, IFF_UP);
2207 return log_error_errno(r, "Failed to set IFF_UP flag: %m");
2209 r = sd_rtnl_message_append_string(m, IFLA_IFNAME, veth_name);
2211 return log_error_errno(r, "Failed to add netlink interface name field: %m");
2213 r = sd_rtnl_message_append_u32(m, IFLA_MASTER, bridge);
2215 return log_error_errno(r, "Failed to add netlink master field: %m");
2217 r = sd_rtnl_call(rtnl, m, 0, NULL);
2219 return log_error_errno(r, "Failed to add veth interface to bridge: %m");
/* Resolves interface `name` to its index and verifies through udev that
 * the device has been initialized.
 *
 * udev: udev context used for the device lookup.
 * name: network interface name.
 *
 * The udev device is looked up by the device id "n<ifindex>". Resolution
 * and lookup failures are logged with errno; an uninitialized device is
 * logged as an error. The return statements are not visible in this
 * view (callers use the result as an interface index). */
2224 static int parse_interface(struct udev *udev, const char *name) {
2225 _cleanup_udev_device_unref_ struct udev_device *d = NULL;
/* Room for 'n', the decimal digits of an int, and the terminator. */
2226 char ifi_str[2 + DECIMAL_STR_MAX(int)];
2229 ifi = (int) if_nametoindex(name);
2231 return log_error_errno(errno, "Failed to resolve interface %s: %m", name);
2233 sprintf(ifi_str, "n%i", ifi);
2234 d = udev_device_new_from_device_id(udev, ifi_str);
2236 return log_error_errno(errno, "Failed to get udev device for interface %s: %m", name);
2238 if (udev_device_get_is_initialized(d) <= 0) {
2239 log_error("Network interface %s is not initialized yet.", name);
/* Moves every interface listed in arg_network_interfaces into the
 * network namespace of `pid`, by sending one RTM_SETLINK request per
 * interface with IFLA_NET_NS_PID set.
 *
 * Guarded by checks on arg_private_network and on an empty interface
 * list whose bodies are not visible in this view. Each name is resolved
 * with parse_interface(); the creation of the udev context and the
 * handling of a failed parse_interface() are on lines not visible here.
 * Netlink failures are reported through log_error_errno(). */
2246 static int move_network_interfaces(pid_t pid) {
2247 _cleanup_udev_unref_ struct udev *udev = NULL;
2248 _cleanup_rtnl_unref_ sd_rtnl *rtnl = NULL;
2252 if (!arg_private_network)
2255 if (strv_isempty(arg_network_interfaces))
2258 r = sd_rtnl_open(&rtnl, 0);
2260 return log_error_errno(r, "Failed to connect to netlink: %m");
2264 log_error("Failed to connect to udev.");
2268 STRV_FOREACH(i, arg_network_interfaces) {
2269 _cleanup_rtnl_message_unref_ sd_rtnl_message *m = NULL;
2272 ifi = parse_interface(udev, *i);
2276 r = sd_rtnl_message_new_link(rtnl, &m, RTM_SETLINK, ifi);
2278 return log_error_errno(r, "Failed to allocate netlink message: %m");
2280 r = sd_rtnl_message_append_u32(m, IFLA_NET_NS_PID, pid);
2282 return log_error_errno(r, "Failed to append namespace PID to netlink message: %m");
2284 r = sd_rtnl_call(rtnl, m, 0, NULL);
2286 return log_error_errno(r, "Failed to move interface %s to namespace: %m", *i);
/* Creates one macvlan interface (bridge mode) per entry of
 * arg_network_macvlan, on top of the named host interface, directly in
 * the network namespace of `pid`.
 *
 * Each new interface is named "mv-<host interface>" shortened to
 * IFNAMSIZ-1 characters, and gets a MAC from generate_mac() keyed with
 * MACVLAN_HASH_KEY and a running index idx.
 *
 * Guarded by checks on arg_private_network and on an empty macvlan list
 * whose bodies are not visible in this view; the creation of the udev
 * context and the handling of a failed parse_interface() or strappend()
 * are likewise not visible. Netlink failures are reported through
 * log_error_errno(). */
2292 static int setup_macvlan(pid_t pid) {
2293 _cleanup_udev_unref_ struct udev *udev = NULL;
2294 _cleanup_rtnl_unref_ sd_rtnl *rtnl = NULL;
2299 if (!arg_private_network)
2302 if (strv_isempty(arg_network_macvlan))
2305 r = sd_rtnl_open(&rtnl, 0);
2307 return log_error_errno(r, "Failed to connect to netlink: %m");
2311 log_error("Failed to connect to udev.");
2315 STRV_FOREACH(i, arg_network_macvlan) {
2316 _cleanup_rtnl_message_unref_ sd_rtnl_message *m = NULL;
2317 _cleanup_free_ char *n = NULL;
2318 struct ether_addr mac;
2321 ifi = parse_interface(udev, *i);
/* idx is advanced on every iteration, so each interface hashes to a
 * different address. */
2325 r = generate_mac(&mac, MACVLAN_HASH_KEY, idx++);
2327 return log_error_errno(r, "Failed to create MACVLAN MAC address: %m");
2329 r = sd_rtnl_message_new_link(rtnl, &m, RTM_NEWLINK, 0);
2331 return log_error_errno(r, "Failed to allocate netlink message: %m");
/* IFLA_LINK names the lower (host) device by index. */
2333 r = sd_rtnl_message_append_u32(m, IFLA_LINK, ifi);
2335 return log_error_errno(r, "Failed to add netlink interface index: %m");
2337 n = strappend("mv-", *i);
2341 strshorten(n, IFNAMSIZ-1);
2343 r = sd_rtnl_message_append_string(m, IFLA_IFNAME, n);
2345 return log_error_errno(r, "Failed to add netlink interface name: %m");
2347 r = sd_rtnl_message_append_ether_addr(m, IFLA_ADDRESS, &mac);
2349 return log_error_errno(r, "Failed to add netlink MAC address: %m");
2351 r = sd_rtnl_message_append_u32(m, IFLA_NET_NS_PID, pid);
2353 return log_error_errno(r, "Failed to add netlink namespace field: %m");
2355 r = sd_rtnl_message_open_container(m, IFLA_LINKINFO);
2357 return log_error_errno(r, "Failed to open netlink container: %m");
2359 r = sd_rtnl_message_open_container_union(m, IFLA_INFO_DATA, "macvlan");
2361 return log_error_errno(r, "Failed to open netlink container: %m");
2363 r = sd_rtnl_message_append_u32(m, IFLA_MACVLAN_MODE, MACVLAN_MODE_BRIDGE);
2365 return log_error_errno(r, "Failed to append macvlan mode: %m");
2367 r = sd_rtnl_message_close_container(m);
2369 return log_error_errno(r, "Failed to close netlink container: %m");
2371 r = sd_rtnl_message_close_container(m);
2373 return log_error_errno(r, "Failed to close netlink container: %m");
2375 r = sd_rtnl_call(rtnl, m, 0, NULL);
2377 return log_error_errno(r, "Failed to add new macvlan interfaces: %m");
/* Installs a seccomp filter for the container payload.
 *
 * The filter allows everything by default (SCMP_ACT_ALLOW) and then:
 *  - makes each syscall in `blacklist` fail with EPERM (a rule-add result
 *    that means "unknown syscall" is skipped; the exact test is not
 *    visible in this view);
 *  - makes socket(AF_NETLINK, any, NETLINK_AUDIT) fail with EAFNOSUPPORT;
 *  - sets SCMP_FLTATR_CTL_NNP to 0 before loading the filter.
 *
 * Failures are logged with log_error_errno(); seccomp_release() is called
 * on the context at the end. The surrounding preprocessor conditionals
 * and the jump targets are on lines not visible here. */
2383 static int setup_seccomp(void) {
2386 static const int blacklist[] = {
2387 SCMP_SYS(kexec_load),
2388 SCMP_SYS(open_by_handle_at),
2389 SCMP_SYS(init_module),
2390 SCMP_SYS(finit_module),
2391 SCMP_SYS(delete_module),
2398 scmp_filter_ctx seccomp;
2402 seccomp = seccomp_init(SCMP_ACT_ALLOW);
2406 r = seccomp_add_secondary_archs(seccomp);
2408 log_error_errno(r, "Failed to add secondary archs to seccomp filter: %m");
2412 for (i = 0; i < ELEMENTSOF(blacklist); i++) {
2413 r = seccomp_rule_add(seccomp, SCMP_ACT_ERRNO(EPERM), blacklist[i], 0);
2415 continue; /* unknown syscall */
2417 log_error_errno(r, "Failed to block syscall: %m");
2423 Audit is broken in containers, much of the userspace audit
2424 hookup will fail if running inside a container. We don't
2425 care and just turn off creation of audit sockets.
2427 This will make socket(AF_NETLINK, *, NETLINK_AUDIT) fail
2428 with EAFNOSUPPORT which audit userspace uses as indication
2429 that audit is disabled in the kernel.
2432 r = seccomp_rule_add(
2434 SCMP_ACT_ERRNO(EAFNOSUPPORT),
2437 SCMP_A0(SCMP_CMP_EQ, AF_NETLINK),
2438 SCMP_A2(SCMP_CMP_EQ, NETLINK_AUDIT));
2440 log_error_errno(r, "Failed to add audit seccomp rule: %m");
2444 r = seccomp_attr_set(seccomp, SCMP_FLTATR_CTL_NNP, 0);
2446 log_error_errno(r, "Failed to unset NO_NEW_PRIVS: %m");
2450 r = seccomp_load(seccomp);
2452 log_error_errno(r, "Failed to install seccomp audit filter: %m");
2455 seccomp_release(seccomp);
/* Prepares the mount propagation directory for the container.
 *
 * root: root directory of the container tree.
 *
 * Creates /run/systemd/nspawn/propagate/<machine> on the host
 * (best-effort, results ignored), bind mounts it onto
 * <root>/run/systemd/nspawn/incoming and remounts that bind mount
 * read-only. Both mount failures are logged with errno; the messages
 * carry %m so that the error text is actually shown. */
2463 static int setup_propagate(const char *root) {
2466 (void) mkdir_p("/run/systemd/nspawn/", 0755);
2467 (void) mkdir_p("/run/systemd/nspawn/propagate", 0600);
2468 p = strappenda("/run/systemd/nspawn/propagate/", arg_machine);
2469 (void) mkdir_p(p, 0600);
2471 q = strappenda(root, "/run/systemd/nspawn/incoming");
2472 mkdir_parents(q, 0755);
2475 if (mount(p, q, NULL, MS_BIND, NULL) < 0)
2476 return log_error_errno(errno, "Failed to install propagation bind mount: %m");
/* A bind mount only becomes read-only through a separate remount. */
2478 if (mount(NULL, q, NULL, MS_BIND|MS_REMOUNT|MS_RDONLY, NULL) < 0)
2479 return log_error_errno(errno, "Failed to make propagation mount read-only: %m");
/* Opens arg_image and makes it available as a block device.
 *
 * device_path: output; path of the block device to use.
 * loop_nr:     output; number of the allocated loop device (the
 *              assignment is not visible in this view).
 *
 * If arg_image is already a block device its path is duplicated. If it
 * is a regular file, a free loop device is obtained from
 * /dev/loop-control, attached with LOOP_SET_FD and configured with
 * LO_FLAGS_AUTOCLEAR|LO_FLAGS_PARTSCAN (plus LO_FLAGS_READ_ONLY on a path
 * whose condition is not visible here, presumably arg_read_only). Any
 * other file type is rejected; since no call failed at that point, the
 * message is logged without an errno. */
2484 static int setup_image(char **device_path, int *loop_nr) {
2485 struct loop_info64 info = {
2486 .lo_flags = LO_FLAGS_AUTOCLEAR|LO_FLAGS_PARTSCAN
2488 _cleanup_close_ int fd = -1, control = -1, loop = -1;
2489 _cleanup_free_ char* loopdev = NULL;
2493 assert(device_path);
2497 fd = open(arg_image, O_CLOEXEC|(arg_read_only ? O_RDONLY : O_RDWR)|O_NONBLOCK|O_NOCTTY);
2499 return log_error_errno(errno, "Failed to open %s: %m", arg_image);
2501 if (fstat(fd, &st) < 0)
2502 return log_error_errno(errno, "Failed to stat %s: %m", arg_image);
2504 if (S_ISBLK(st.st_mode)) {
2507 p = strdup(arg_image);
2521 if (!S_ISREG(st.st_mode)) {
/* errno is not meaningful here: fstat() succeeded, the file is simply of
 * the wrong type. */
2522 log_error("%s is not a regular file or block device.", arg_image);
2526 control = open("/dev/loop-control", O_RDWR|O_CLOEXEC|O_NOCTTY|O_NONBLOCK);
2528 return log_error_errno(errno, "Failed to open /dev/loop-control: %m");
2530 nr = ioctl(control, LOOP_CTL_GET_FREE);
2532 return log_error_errno(errno, "Failed to allocate loop device: %m");
2534 if (asprintf(&loopdev, "/dev/loop%i", nr) < 0)
2537 loop = open(loopdev, O_CLOEXEC|(arg_read_only ? O_RDONLY : O_RDWR)|O_NONBLOCK|O_NOCTTY);
2539 return log_error_errno(errno, "Failed to open loop device %s: %m", loopdev);
2541 if (ioctl(loop, LOOP_SET_FD, fd) < 0)
2542 return log_error_errno(errno, "Failed to set loopback file descriptor on %s: %m", loopdev);
2545 info.lo_flags |= LO_FLAGS_READ_ONLY;
2547 if (ioctl(loop, LOOP_SET_STATUS64, &info) < 0)
2548 return log_error_errno(errno, "Failed to set loopback settings on %s: %m", loopdev);
2550 *device_path = loopdev;
/* Finds the root, /home and /srv partitions of a GPT disk image.
 *
 * The image (a parameter on a line not visible in this view; used below
 * as `fd`) is probed with libblkid and must carry a GPT partition table.
 * The partition block devices are enumerated through udev as children of
 * the image's device; the whole-disk node itself (same devnum) and
 * partitions flagged GPT_FLAG_NO_AUTO are filtered out (the skipping
 * statements are not visible here). Partitions are classified by GPT type
 * UUID: GPT_HOME, GPT_SRV, and, when defined at build time,
 * GPT_ROOT_NATIVE and GPT_ROOT_SECONDARY.
 *
 * For each type a candidate is tested with `nr >= <type>_nr` against the
 * one already recorded (presumably so that the lowest partition number
 * wins -- TODO confirm). The *_rw outputs are the negation of
 * GPT_FLAG_READ_ONLY.
 *
 * Outputs: device node paths and rw flags for root, home and srv. The
 * native root is preferred; the secondary root is used otherwise. When
 * neither is found, an error pointing at the Discoverable Partitions
 * Specification is logged.
 *
 * Without blkid support only an error message is logged (final branch). */
2561 static int dissect_image(
2563 char **root_device, bool *root_device_rw,
2564 char **home_device, bool *home_device_rw,
2565 char **srv_device, bool *srv_device_rw,
2569 int home_nr = -1, srv_nr = -1;
2570 #ifdef GPT_ROOT_NATIVE
2573 #ifdef GPT_ROOT_SECONDARY
2574 int secondary_root_nr = -1;
2577 _cleanup_free_ char *home = NULL, *root = NULL, *secondary_root = NULL, *srv = NULL;
2578 _cleanup_udev_enumerate_unref_ struct udev_enumerate *e = NULL;
2579 _cleanup_udev_device_unref_ struct udev_device *d = NULL;
2580 _cleanup_blkid_free_probe_ blkid_probe b = NULL;
2581 _cleanup_udev_unref_ struct udev *udev = NULL;
2582 struct udev_list_entry *first, *item;
2583 bool home_rw = true, root_rw = true, secondary_root_rw = true, srv_rw = true;
2584 const char *pttype = NULL;
2590 assert(root_device);
2591 assert(home_device);
2596 b = blkid_new_probe();
2601 r = blkid_probe_set_device(b, fd, 0, 0);
2606 log_error_errno(errno, "Failed to set device on blkid probe: %m");
/* Ask libblkid for partition table details rather than superblocks. */
2610 blkid_probe_enable_partitions(b, 1);
2611 blkid_probe_set_partitions_flags(b, BLKID_PARTS_ENTRY_DETAILS);
2614 r = blkid_do_safeprobe(b);
/* -2: ambivalent result, 1: nothing detected. */
2615 if (r == -2 || r == 1) {
2616 log_error("Failed to identify any partition table on %s.\n"
2617 "Note that the disk image needs to follow http://www.freedesktop.org/wiki/Specifications/DiscoverablePartitionsSpec/ to be supported by systemd-nspawn.", arg_image);
2619 } else if (r != 0) {
2622 log_error_errno(errno, "Failed to probe: %m");
2626 blkid_probe_lookup_value(b, "PTTYPE", &pttype, NULL);
2627 if (!streq_ptr(pttype, "gpt")) {
2628 log_error("Image %s does not carry a GUID Partition Table.\n"
2629 "Note that the disk image needs to follow http://www.freedesktop.org/wiki/Specifications/DiscoverablePartitionsSpec/ to be supported by systemd-nspawn.", arg_image);
2634 pl = blkid_probe_get_partitions(b);
2639 log_error("Failed to list partitions of %s", arg_image);
2647 if (fstat(fd, &st) < 0)
2648 return log_error_errno(errno, "Failed to stat block device: %m");
/* Enumerate the udev devices whose parent is the image's block device. */
2650 d = udev_device_new_from_devnum(udev, 'b', st.st_rdev);
2654 e = udev_enumerate_new(udev);
2658 r = udev_enumerate_add_match_parent(e, d);
2662 r = udev_enumerate_scan_devices(e);
2664 return log_error_errno(r, "Failed to scan for partition devices of %s: %m", arg_image);
2666 first = udev_enumerate_get_list_entry(e);
2667 udev_list_entry_foreach(item, first) {
2668 _cleanup_udev_device_unref_ struct udev_device *q;
2669 const char *stype, *node;
2670 unsigned long long flags;
2677 q = udev_device_new_from_syspath(udev, udev_list_entry_get_name(item));
2682 log_error_errno(errno, "Failed to get partition device of %s: %m", arg_image);
2686 qn = udev_device_get_devnum(q);
/* Same devnum as the image itself: this is the whole-disk node. */
2690 if (st.st_rdev == qn)
2693 node = udev_device_get_devnode(q);
/* Map the udev device back to its libblkid partition entry. */
2697 pp = blkid_partlist_devno_to_partition(pl, qn);
2701 flags = blkid_partition_get_flags(pp);
2702 if (flags & GPT_FLAG_NO_AUTO)
2705 nr = blkid_partition_get_partno(pp);
2709 stype = blkid_partition_get_type_string(pp);
2713 if (sd_id128_from_string(stype, &type_id) < 0)
2716 if (sd_id128_equal(type_id, GPT_HOME)) {
2718 if (home && nr >= home_nr)
2722 home_rw = !(flags & GPT_FLAG_READ_ONLY);
2725 home = strdup(node);
2728 } else if (sd_id128_equal(type_id, GPT_SRV)) {
2730 if (srv && nr >= srv_nr)
2734 srv_rw = !(flags & GPT_FLAG_READ_ONLY);
2741 #ifdef GPT_ROOT_NATIVE
2742 else if (sd_id128_equal(type_id, GPT_ROOT_NATIVE)) {
2744 if (root && nr >= root_nr)
2748 root_rw = !(flags & GPT_FLAG_READ_ONLY);
2751 root = strdup(node);
2756 #ifdef GPT_ROOT_SECONDARY
2757 else if (sd_id128_equal(type_id, GPT_ROOT_SECONDARY)) {
2759 if (secondary_root && nr >= secondary_root_nr)
2762 secondary_root_nr = nr;
2763 secondary_root_rw = !(flags & GPT_FLAG_READ_ONLY);
2766 free(secondary_root);
2767 secondary_root = strdup(node);
2768 if (!secondary_root)
2774 if (!root && !secondary_root) {
2775 log_error("Failed to identify root partition in disk image %s.\n"
2776 "Note that the disk image needs to follow http://www.freedesktop.org/wiki/Specifications/DiscoverablePartitionsSpec/ to be supported by systemd-nspawn.", arg_image);
/* Hand the chosen strings to the caller; the local pointer is reset
 * (visible for secondary_root) so the cleanup handler does not free what
 * was handed out. */
2781 *root_device = root;
2784 *root_device_rw = root_rw;
2786 } else if (secondary_root) {
2787 *root_device = secondary_root;
2788 secondary_root = NULL;
2790 *root_device_rw = secondary_root_rw;
2795 *home_device = home;
2798 *home_device_rw = home_rw;
2805 *srv_device_rw = srv_rw;
2810 log_error("--image= is not supported, compiled without blkid support.");
/* Mounts block device `what` at <where><directory>, after detecting its
 * file system type with libblkid.
 *
 * what:      block device node to mount.
 * where:     base directory.
 * directory: path appended to `where` (handling of a NULL directory is on
 *            lines not visible in this view).
 * rw:        mount read-write when true, otherwise with MS_RDONLY.
 *
 * blkid_do_safeprobe() returns 0 on success, 1 when nothing was detected,
 * -2 when the result is ambivalent and -1 on error. The two "cannot tell"
 * results (1 and -2) produce the "Cannot determine file system type"
 * message, matching the test used in dissect_image(); a real error (-1)
 * falls through to the errno-based "Failed to probe" branch.
 *
 * LUKS volumes are rejected. The mount always uses MS_NODEV. Without
 * blkid support only an error message is logged (final branch). */
2815 static int mount_device(const char *what, const char *where, const char *directory, bool rw) {
2817 _cleanup_blkid_free_probe_ blkid_probe b = NULL;
2818 const char *fstype, *p;
2828 p = strappenda(where, directory);
2833 b = blkid_new_probe_from_filename(what);
2837 log_error_errno(errno, "Failed to allocate prober for %s: %m", what);
/* Only the file system TYPE is needed from the superblock probe. */
2841 blkid_probe_enable_superblocks(b, 1);
2842 blkid_probe_set_superblocks_flags(b, BLKID_SUBLKS_TYPE);
2845 r = blkid_do_safeprobe(b);
2846 if (r == -2 || r == 1) {
2847 log_error("Cannot determine file system type of %s", what);
2849 } else if (r != 0) {
2852 log_error_errno(errno, "Failed to probe %s: %m", what);
2857 if (blkid_probe_lookup_value(b, "TYPE", &fstype, NULL) < 0) {
2860 log_error("Failed to determine file system type of %s", what);
2864 if (streq(fstype, "crypto_LUKS")) {
2865 log_error("nspawn currently does not support LUKS disk images.");
2869 if (mount(what, p, fstype, MS_NODEV|(rw ? 0 : MS_RDONLY), NULL) < 0)
2870 return log_error_errno(errno, "Failed to mount %s: %m", what);
2874 log_error("--image= is not supported, compiled without blkid support.");
/* Mounts the dissected partitions below arg_directory: the root device
 * at arg_directory itself, the home device at /home and the srv device
 * at /srv, each with its own read-write flag.
 *
 * The conditions guarding each call are not visible in this view
 * (presumably each device is mounted only when its pointer is set --
 * TODO confirm). A failure of mount_device() is logged again here with
 * the returned code. */
2879 static int mount_devices(
2881 const char *root_device, bool root_device_rw,
2882 const char *home_device, bool home_device_rw,
2883 const char *srv_device, bool srv_device_rw) {
2889 r = mount_device(root_device, arg_directory, NULL, root_device_rw);
2891 return log_error_errno(r, "Failed to mount root directory: %m");
2895 r = mount_device(home_device, arg_directory, "/home", home_device_rw);
2897 return log_error_errno(r, "Failed to mount home directory: %m");
2901 r = mount_device(srv_device, arg_directory, "/srv", srv_device_rw);
2903 return log_error_errno(r, "Failed to mount server data directory: %m");
/* Best-effort teardown of loop device number `nr`.
 *
 * nr:       loop device number (the handling of an invalid number is on
 *           lines not visible in this view).
 * image_fd: optional in/out; when it points to an open descriptor,
 *           LOOP_CLR_FD is issued on it and the descriptor is closed,
 *           with *image_fd updated to safe_close()'s return value.
 *
 * Afterwards the device is removed with LOOP_CTL_REMOVE on
 * /dev/loop-control. Every failure is logged as a warning only; the
 * function returns nothing. */
2909 static void loop_remove(int nr, int *image_fd) {
2910 _cleanup_close_ int control = -1;
2916 if (image_fd && *image_fd >= 0) {
2917 r = ioctl(*image_fd, LOOP_CLR_FD);
2919 log_warning_errno(errno, "Failed to close loop image: %m");
2920 *image_fd = safe_close(*image_fd);
2923 control = open("/dev/loop-control", O_RDWR|O_CLOEXEC|O_NOCTTY|O_NONBLOCK);
2925 log_warning_errno(errno, "Failed to open /dev/loop-control: %m");
2929 r = ioctl(control, LOOP_CTL_REMOVE, nr);
2931 log_warning_errno(errno, "Failed to remove loop %d: %m", nr);
/* Forks a child that runs `getent <database> <key>` with its standard
 * output connected to a pipe.
 *
 * database, key: arguments passed to getent.
 * rpid:          output for the child's PID (the assignment is not
 *                visible in this view).
 *
 * In the child: stdout becomes the pipe's write end, stdin and stderr
 * are redirected to /dev/null, signal handlers are reset, all other
 * descriptors are closed, and getent is executed with an empty
 * environment, trying /usr/bin/getent first and /bin/getent second. Any
 * failure in the child ends it with _exit(EXIT_FAILURE).
 *
 * In the parent: the write end is closed. The return statement is not
 * visible here (callers use the result as a readable descriptor). */
2934 static int spawn_getent(const char *database, const char *key, pid_t *rpid) {
2942 if (pipe2(pipe_fds, O_CLOEXEC) < 0)
2943 return log_error_errno(errno, "Failed to allocate pipe: %m");
2947 return log_error_errno(errno, "Failed to fork getent child: %m");
2948 else if (pid == 0) {
2950 char *empty_env = NULL;
2952 if (dup3(pipe_fds[1], STDOUT_FILENO, 0) < 0)
2953 _exit(EXIT_FAILURE);
/* Close the original pipe ends unless they already are stdio fds. */
2955 if (pipe_fds[0] > 2)
2956 safe_close(pipe_fds[0]);
2957 if (pipe_fds[1] > 2)
2958 safe_close(pipe_fds[1]);
2960 nullfd = open("/dev/null", O_RDWR);
2962 _exit(EXIT_FAILURE);
2964 if (dup3(nullfd, STDIN_FILENO, 0) < 0)
2965 _exit(EXIT_FAILURE);
2967 if (dup3(nullfd, STDERR_FILENO, 0) < 0)
2968 _exit(EXIT_FAILURE);
2973 reset_all_signal_handlers();
2974 close_all_fds(NULL, 0);
/* The second execle() is only reached when the first one failed. */
2976 execle("/usr/bin/getent", "getent", database, key, NULL, &empty_env);
2977 execle("/bin/getent", "getent", database, key, NULL, &empty_env);
2978 _exit(EXIT_FAILURE);
2981 pipe_fds[1] = safe_close(pipe_fds[1]);
/* Switches the calling process to the user named by arg_user. When arg_user
 * is unset, "root" or "0", credentials are simply reset to 0. Otherwise the
 * user's passwd entry and supplementary groups are resolved by running getent
 * (see spawn_getent()), the home directory is created if needed, the standard
 * streams are chowned to the user, and groups, GID and UID are changed in
 * that order.
 * main() passes &home and later uses it for HOME= and as working directory.
 * NOTE(review): how *_home is filled in, and most error returns, are not
 * visible in this listing -- confirm against the full source. */
2988 static int change_uid_gid(char **_home) {
2989 char line[LINE_MAX], *x, *u, *g, *h;
2990 const char *word, *state;
2991 _cleanup_free_ uid_t *uids = NULL;
2992 _cleanup_free_ char *home = NULL;
2993 _cleanup_fclose_ FILE *f = NULL;
2994 _cleanup_close_ int fd = -1;
2995 unsigned n_uids = 0;
3004 if (!arg_user || streq(arg_user, "root") || streq(arg_user, "0")) {
3005 /* Reset everything fully to 0, just in case */
3007 if (setgroups(0, NULL) < 0)
3008 return log_error_errno(errno, "setgroups() failed: %m");
/* NOTE(review): the two messages below name setregid()/setreuid(), but the
 * calls being checked are setresgid()/setresuid(). */
3010 if (setresgid(0, 0, 0) < 0)
3011 return log_error_errno(errno, "setregid() failed: %m");
3013 if (setresuid(0, 0, 0) < 0)
3014 return log_error_errno(errno, "setreuid() failed: %m");
3020 /* First, get user credentials */
3021 fd = spawn_getent("passwd", arg_user, &pid);
3025 f = fdopen(fd, "r");
/* Only a single line of getent output is read. */
3030 if (!fgets(line, sizeof(line), f)) {
3033 log_error("Failed to resolve user %s.", arg_user);
3037 log_error_errno(errno, "Failed to read from getent: %m");
3043 wait_for_terminate_and_warn("getent passwd", pid, true);
/* Walk the colon-separated passwd fields; each error message names the
 * field that could not be located. */
3045 x = strchr(line, ':');
3047 log_error("/etc/passwd entry has invalid user field.");
3051 u = strchr(x+1, ':');
3053 log_error("/etc/passwd entry has invalid password field.");
3060 log_error("/etc/passwd entry has invalid UID field.");
3068 log_error("/etc/passwd entry has invalid GID field.");
3073 h = strchr(x+1, ':');
3075 log_error("/etc/passwd entry has invalid GECOS field.");
3082 log_error("/etc/passwd entry has invalid home directory field.");
/* Convert the textual UID and GID fields to numeric values. */
3088 r = parse_uid(u, &uid);
3090 log_error("Failed to parse UID of user.");
3094 r = parse_gid(g, &gid);
3096 log_error("Failed to parse GID of user.");
3104 /* Second, get group memberships */
3105 fd = spawn_getent("initgroups", arg_user, &pid);
3110 f = fdopen(fd, "r");
3115 if (!fgets(line, sizeof(line), f)) {
3117 log_error("Failed to resolve user %s.", arg_user);
3121 log_error_errno(errno, "Failed to read from getent: %m");
3127 wait_for_terminate_and_warn("getent initgroups", pid, true);
3129 /* Skip over the username and subsequent separator whitespace */
3131 x += strcspn(x, WHITESPACE);
3132 x += strspn(x, WHITESPACE);
/* Each remaining word is parsed as a numeric ID and appended to uids, which
 * is handed to setgroups() below (so despite its name the array holds the
 * supplementary group IDs). */
3134 FOREACH_WORD(word, l, x, state) {
3140 if (!GREEDY_REALLOC(uids, sz, n_uids+1))
3143 r = parse_uid(c, &uids[n_uids++]);
3145 log_error("Failed to parse group data from getent.");
/* Create the home directory (and its parents); an already existing home
 * directory is tolerated. */
3150 r = mkdir_parents(home, 0775);
3152 return log_error_errno(r, "Failed to make home root directory: %m");
3154 r = mkdir_safe(home, 0755, uid, gid);
3155 if (r < 0 && r != -EEXIST)
3156 return log_error_errno(r, "Failed to make home directory: %m");
/* Hand the standard streams to the target user; the results of fchown() are
 * not checked. */
3158 fchown(STDIN_FILENO, uid, gid);
3159 fchown(STDOUT_FILENO, uid, gid);
3160 fchown(STDERR_FILENO, uid, gid);
/* Change credentials: supplementary groups first, then GID, then UID.
 * NOTE(review): as above, the setresgid()/setresuid() failure messages name
 * setregid()/setreuid(). */
3162 if (setgroups(n_uids, uids) < 0)
3163 return log_error_errno(errno, "Failed to set auxiliary groups: %m");
3165 if (setresgid(gid, gid, gid) < 0)
3166 return log_error_errno(errno, "setregid() failed: %m");
3168 if (setresuid(uid, uid, uid) < 0)
3169 return log_error_errno(errno, "setreuid() failed: %m");
3181 * < 0 : wait_for_terminate() failed to get the state of the
3182 * container, the container was terminated by a signal, or
3183 * failed for an unknown reason. No change is made to the
3184 * container argument.
3185 * > 0 : The program executed in the container terminated with an
3186 * error. The exit code of the program executed in the
3187 * container is returned. The container argument has been set
3188 * to CONTAINER_TERMINATED.
3189 * 0 : The container is being rebooted, has been shut down or exited
3190 * successfully. The container argument has been set to either
3191 * CONTAINER_TERMINATED or CONTAINER_REBOOTED.
3193 * That is, success is indicated by a return value of zero, and an
3194 * error is indicated by a non-zero value.
/* Waits for the container process and translates its siginfo into the
 * return-value contract described in the comment preceding this function,
 * updating *container where that contract says so.
 * NOTE(review): the case labels and several return statements are not visible
 * in this listing; the comments below describe only the visible statements. */
3196 static int wait_for_container(pid_t pid, ContainerStatus *container) {
3200 r = wait_for_terminate(pid, &status);
3202 return log_warning_errno(r, "Failed to wait for container: %m");
3204 switch (status.si_code) {
/* Here si_status is the exit status of the payload; the outcome is logged at
 * debug level when arg_quiet is set, at info level otherwise. */
3207 if (status.si_status == 0) {
3208 log_full(arg_quiet ? LOG_DEBUG : LOG_INFO, "Container %s exited successfully.", arg_machine);
3211 log_full(arg_quiet ? LOG_DEBUG : LOG_INFO, "Container %s failed with error code %i.", arg_machine, status.si_status);
/* The exit status itself is handed back to the caller. */
3213 *container = CONTAINER_TERMINATED;
3214 return status.si_status;
/* Here si_status is a signal number: SIGINT is reported as an orderly
 * shutdown, SIGHUP as a reboot request. */
3217 if (status.si_status == SIGINT) {
3219 log_full(arg_quiet ? LOG_DEBUG : LOG_INFO, "Container %s has been shut down.", arg_machine);
3220 *container = CONTAINER_TERMINATED;
3223 } else if (status.si_status == SIGHUP) {
3225 log_full(arg_quiet ? LOG_DEBUG : LOG_INFO, "Container %s is being rebooted.", arg_machine);
3226 *container = CONTAINER_REBOOTED;
3230 /* CLD_KILLED fallthrough */
/* Any other terminating signal is logged as an error. */
3233 log_error("Container %s terminated by signal %s.", arg_machine, signal_to_string(status.si_status));
3237 log_error("Container %s failed due to unknown reason.", arg_machine);
/* Signal handler that intentionally does nothing. main() installs it for
 * SIGCHLD so that the signal interrupts blocking calls instead of being
 * ignored. */
static void nop_handler(int sig) {
        (void) sig;
}
/* sd-event signal callback; main() registers it for SIGINT and SIGTERM with
 * the container PID packed into userdata. It sends SIGRTMIN+3 to that PID
 * and, when the signal could be delivered, logs a hint and clears the
 * userdata of the event source. The other visible path asks the event loop
 * to exit with code 0.
 * NOTE(review): guards and return statements are not visible in this listing;
 * presumably a repeated signal finds no PID in userdata and exits the event
 * loop -- confirm against the full source. */
3246 static int on_orderly_shutdown(sd_event_source *s, const struct signalfd_siginfo *si, void *userdata) {
3249 pid = PTR_TO_UINT32(userdata);
3251 if (kill(pid, SIGRTMIN+3) >= 0) {
3252 log_info("Trying to halt container. Send SIGTERM again to trigger immediate termination.");
3253 sd_event_source_set_userdata(s, NULL);
3258 sd_event_exit(sd_event_source_get_event(s), 0);
/* Fills in whichever of arg_image, arg_directory and arg_machine were not
 * given explicitly: an image or directory can be looked up from the machine
 * name, the current working directory is another visible source for
 * arg_directory, and a machine name can be derived from the host name or from
 * the last path component of the image/directory. For ephemeral containers a
 * random suffix is appended to the machine name.
 * NOTE(review): several conditions and return statements are not visible in
 * this listing -- confirm the exact branch structure against the full
 * source. */
3262 static int determine_names(void) {
3265 if (!arg_image && !arg_directory) {
3267 _cleanup_(image_unrefp) Image *i = NULL;
3269 r = image_find(arg_machine, &i);
3271 return log_error_errno(r, "Failed to find image for machine '%s': %m", arg_machine);
/* NOTE(review): this message uses %m but log_error() is given no error code,
 * so the text printed for %m is likely unrelated to the lookup -- confirm. */
3273 log_error("No image for machine '%s': %m", arg_machine);
/* GPT images are used as disk image, every other image type as directory. */
3277 if (i->type == IMAGE_GPT)
3278 r = set_sanitized_path(&arg_image, i->path);
3280 r = set_sanitized_path(&arg_directory, i->path);
3282 return log_error_errno(r, "Invalid image directory: %m");
/* A read-only image forces read-only operation. */
3284 arg_read_only = arg_read_only || i->read_only;
3286 arg_directory = get_current_dir_name();
3288 if (!arg_directory && !arg_machine) {
3289 log_error("Failed to determine path, please use -D or -i.");
/* Derive a machine name: the host name when the directory is "/", otherwise
 * the last path component of the image or directory. */
3295 if (arg_directory && path_equal(arg_directory, "/"))
3296 arg_machine = gethostname_malloc();
3298 arg_machine = strdup(basename(arg_image ?: arg_directory));
3303 hostname_cleanup(arg_machine, false);
3304 if (!machine_name_is_valid(arg_machine)) {
3305 log_error("Failed to determine machine name automatically, please use -M.");
3309 if (arg_ephemeral) {
3312 /* Add a random suffix when this is an
3313 * ephemeral machine, so that we can run many
3314 * instances at once without manually having
3315 * to specify -M each time. */
3317 if (asprintf(&b, "%s-%016" PRIx64, arg_machine, random_u64()) < 0)
/* Entry point. From what is visible in this listing: parses arguments and
 * determines the machine/directory names, requires root and a systemd-booted
 * host, prepares the root tree (template or ephemeral btrfs snapshot, or a
 * disk image handled via a temporary directory), allocates a pty, and then
 * clones a child in new namespaces that sets up the container file system
 * and executes the payload, while the parent registers the machine, forwards
 * the pty and waits for the container to exit. The process exit status is
 * EXIT_FAILURE when r is negative at the end, otherwise ret.
 * NOTE(review): many lines (conditions, labels, closing braces) are absent
 * from this listing, so the added comments describe only the visible calls. */
3328 int main(int argc, char *argv[]) {
3330 _cleanup_free_ char *device_path = NULL, *root_device = NULL, *home_device = NULL, *srv_device = NULL, *console = NULL;
3331 bool root_device_rw = true, home_device_rw = true, srv_device_rw = true;
3332 _cleanup_close_ int master = -1, image_fd = -1;
3333 _cleanup_fdset_free_ FDSet *fds = NULL;
3334 int r, n_fd_passed, loop_nr = -1;
3335 char veth_name[IFNAMSIZ];
3336 bool secondary = false, remove_subvol = false;
3337 sigset_t mask, mask_chld;
3339 int ret = EXIT_SUCCESS;
3340 union in_addr_union exposed = {};
3342 log_parse_environment();
3345 r = parse_argv(argc, argv);
3349 r = determine_names();
3353 if (geteuid() != 0) {
3354 log_error("Need to be root.");
3359 if (sd_booted() <= 0) {
3360 log_error("Not running on a systemd system.");
/* Pick up file descriptors reported by sd_listen_fds(); they are announced
 * to the payload further down through LISTEN_FDS and LISTEN_PID. */
3366 n_fd_passed = sd_listen_fds(false);
3367 if (n_fd_passed > 0) {
3368 r = fdset_new_listen_fds(&fds, false);
3370 log_error_errno(r, "Failed to collect file descriptors: %m");
3374 fdset_close_others(fds);
3377 if (arg_directory) {
3380 if (path_equal(arg_directory, "/") && !arg_ephemeral) {
3381 log_error("Spawning container on root directory is not supported. Consider using --ephemeral.");
/* Populate the directory as a btrfs snapshot of the template. */
3387 r = btrfs_subvol_snapshot(arg_template, arg_directory, arg_read_only, true);
3390 log_info("Directory %s already exists, not populating from template %s.", arg_directory, arg_template);
/* NOTE(review): "snapshort" in the message below looks like a typo for
 * "snapshot". */
3392 log_error_errno(r, "Couldn't create snapshort %s from %s: %m", arg_directory, arg_template);
3396 log_info("Populated %s from template %s.", arg_directory, arg_template);
3399 } else if (arg_ephemeral) {
3402 /* If the specified path is a mount point we
3403 * generate the new snapshot immediately
3404 * inside it under a random name. However if
3405 * the specified is not a mount point we
3406 * create the new snapshot in the parent
3407 * directory, just next to it. */
3408 r = path_is_mount_point(arg_directory, false);
3410 log_error_errno(r, "Failed to determine whether directory %s is mount point: %m", arg_directory);
3414 r = tempfn_random_child(arg_directory, &np);
3416 r = tempfn_random(arg_directory, &np);
3418 log_error_errno(r, "Failed to generate name for snapshot: %m");
3422 r = btrfs_subvol_snapshot(arg_directory, np, arg_read_only, true);
3425 log_error_errno(r, "Failed to create snapshot %s from %s: %m", np, arg_directory);
3429 free(arg_directory);
/* Remember that the snapshot has to be removed again during cleanup. */
3432 remove_subvol = true;
/* Sanity checks on the tree before it is used as container root. */
3436 if (path_is_os_tree(arg_directory) <= 0) {
3437 log_error("Directory %s doesn't look like an OS root directory (os-release file is missing). Refusing.", arg_directory);
3444 p = strappenda(arg_directory,
3445 argc > optind && path_is_absolute(argv[optind]) ? argv[optind] : "/usr/bin/");
3446 if (access(p, F_OK) < 0) {
3447 log_error("Directory %s lacks the binary to execute or doesn't look like a binary tree. Refusing.", arg_directory);
/* Disk image case: a fresh temporary directory becomes arg_directory, and
 * the image is set up and dissected into root/home/srv devices. */
3454 char template[] = "/tmp/nspawn-root-XXXXXX";
3457 assert(!arg_template);
3459 if (!mkdtemp(template)) {
3460 log_error_errno(errno, "Failed to create temporary directory: %m");
3465 arg_directory = strdup(template);
3466 if (!arg_directory) {
3471 image_fd = setup_image(&device_path, &loop_nr);
3477 r = dissect_image(image_fd,
3478 &root_device, &root_device_rw,
3479 &home_device, &home_device_rw,
3480 &srv_device, &srv_device_rw,
/* Allocate the pseudo terminal that serves as the container's console. */
3486 master = posix_openpt(O_RDWR|O_NOCTTY|O_CLOEXEC|O_NDELAY);
3488 r = log_error_errno(errno, "Failed to acquire pseudo tty: %m");
3492 r = ptsname_malloc(master, &console);
3494 r = log_error_errno(r, "Failed to determine tty name: %m");
3499 log_info("Spawning container %s on %s.\nPress ^] three times within 1s to kill container.",
3500 arg_machine, arg_image ?: arg_directory);
3502 if (unlockpt(master) < 0) {
3503 r = log_error_errno(errno, "Failed to unlock tty: %m");
/* Block the signals that are handled through the event loop later on;
 * mask_chld is a separate set containing only SIGCHLD so that it can be
 * unblocked and re-blocked independently. */
3507 assert_se(sigemptyset(&mask) == 0);
3508 sigset_add_many(&mask, SIGCHLD, SIGWINCH, SIGTERM, SIGINT, -1);
3509 assert_se(sigprocmask(SIG_BLOCK, &mask, NULL) == 0);
3511 assert_se(sigemptyset(&mask_chld) == 0);
3512 assert_se(sigaddset(&mask_chld, SIGCHLD) == 0);
3515 _cleanup_close_pair_ int kmsg_socket_pair[2] = { -1, -1 }, rtnl_socket_pair[2] = { -1, -1 };
3516 ContainerStatus container_status;
3517 _cleanup_(barrier_destroy) Barrier barrier = BARRIER_NULL;
3518 struct sigaction sa = {
3519 .sa_handler = nop_handler,
3520 .sa_flags = SA_NOCLDSTOP,
3523 r = barrier_create(&barrier);
3525 log_error_errno(r, "Cannot initialize IPC barrier: %m");
3529 if (socketpair(AF_UNIX, SOCK_DGRAM|SOCK_CLOEXEC, 0, kmsg_socket_pair) < 0) {
3530 r = log_error_errno(errno, "Failed to create kmsg socket pair: %m");
3534 if (socketpair(AF_UNIX, SOCK_DGRAM|SOCK_CLOEXEC, 0, rtnl_socket_pair) < 0) {
3535 r = log_error_errno(errno, "Failed to create rtnl socket pair: %m");
3539 /* Child can be killed before execv(), so handle SIGCHLD
3540 * in order to interrupt parent's blocking calls and
3541 * give it a chance to call wait() and terminate. */
3542 r = sigprocmask(SIG_UNBLOCK, &mask_chld, NULL);
3544 r = log_error_errno(errno, "Failed to change the signal mask: %m");
3548 r = sigaction(SIGCHLD, &sa, NULL);
3550 r = log_error_errno(errno, "Failed to install SIGCHLD handler: %m");
/* Create the container process: always a new mount namespace, new
 * IPC/PID/UTS namespaces unless arg_share_system is set, and a new network
 * namespace when arg_private_network is set. */
3554 pid = raw_clone(SIGCHLD|CLONE_NEWNS|
3555 (arg_share_system ? 0 : CLONE_NEWIPC|CLONE_NEWPID|CLONE_NEWUTS)|
3556 (arg_private_network ? CLONE_NEWNET : 0), NULL);
3558 if (errno == EINVAL)
3559 r = log_error_errno(errno, "clone() failed, do you have namespace support enabled in your kernel? (You need UTS, IPC, PID and NET namespacing built in): %m");
3561 r = log_error_errno(errno, "clone() failed: %m");
3568 _cleanup_free_ char *home = NULL;
/* Environment for the payload; the trailing NULL slots are filled in below
 * by advancing n_env. */
3570 const char *envp[] = {
3571 "PATH=" DEFAULT_PATH_SPLIT_USR,
3572 "container=systemd-nspawn", /* LXC sets container=lxc, so follow the scheme here */
3577 NULL, /* container_uuid */
3578 NULL, /* LISTEN_FDS */
3579 NULL, /* LISTEN_PID */
/* From here on the code runs in the child (it takes the BARRIER_CHILD role). */
3584 barrier_set_role(&barrier, BARRIER_CHILD);
3586 envp[n_env] = strv_find_prefix(environ, "TERM=");
3590 master = safe_close(master);
/* Close the inherited stdio descriptors so that the console opened below
 * lands on descriptor 0; it is then duplicated onto stdout and stderr. */
3592 close_nointr(STDIN_FILENO);
3593 close_nointr(STDOUT_FILENO);
3594 close_nointr(STDERR_FILENO);
3596 kmsg_socket_pair[0] = safe_close(kmsg_socket_pair[0]);
3597 rtnl_socket_pair[0] = safe_close(rtnl_socket_pair[0]);
3599 reset_all_signal_handlers();
3600 reset_signal_mask();
3602 r = open_terminal(console, O_RDWR);
3603 if (r != STDIN_FILENO) {
3609 log_error_errno(r, "Failed to open console: %m");
3610 _exit(EXIT_FAILURE);
3613 if (dup2(STDIN_FILENO, STDOUT_FILENO) != STDOUT_FILENO ||
3614 dup2(STDIN_FILENO, STDERR_FILENO) != STDERR_FILENO) {
3615 log_error_errno(errno, "Failed to duplicate console: %m");
3616 _exit(EXIT_FAILURE);
3620 log_error_errno(errno, "setsid() failed: %m");
3621 _exit(EXIT_FAILURE);
3624 if (reset_audit_loginuid() < 0)
3625 _exit(EXIT_FAILURE);
/* Have the kernel send SIGKILL to this process when its parent dies. */
3627 if (prctl(PR_SET_PDEATHSIG, SIGKILL) < 0) {
3628 log_error_errno(errno, "PR_SET_PDEATHSIG failed: %m");
3629 _exit(EXIT_FAILURE);
3632 /* Mark everything as slave, so that we still
3633 * receive mounts from the real root, but don't
3634 * propagate mounts to the real root. */
3635 if (mount(NULL, "/", NULL, MS_SLAVE|MS_REC, NULL) < 0) {
3636 log_error_errno(errno, "MS_SLAVE|MS_REC failed: %m");
3637 _exit(EXIT_FAILURE);
3640 if (mount_devices(arg_directory,
3641 root_device, root_device_rw,
3642 home_device, home_device_rw,
3643 srv_device, srv_device_rw) < 0)
3644 _exit(EXIT_FAILURE);
3646 /* Turn directory into bind mount */
3647 if (mount(arg_directory, arg_directory, "bind", MS_BIND|MS_REC, NULL) < 0) {
3648 log_error_errno(errno, "Failed to make bind mount: %m");
3649 _exit(EXIT_FAILURE);
3652 r = setup_volatile(arg_directory);
3654 _exit(EXIT_FAILURE);
3656 if (setup_volatile_state(arg_directory) < 0)
3657 _exit(EXIT_FAILURE);
3659 r = base_filesystem_create(arg_directory);
3661 _exit(EXIT_FAILURE);
3663 if (arg_read_only) {
3664 r = bind_remount_recursive(arg_directory, true);
3666 log_error_errno(r, "Failed to make tree read-only: %m");
3667 _exit(EXIT_FAILURE);
3671 if (mount_all(arg_directory) < 0)
3672 _exit(EXIT_FAILURE);
3674 if (copy_devnodes(arg_directory) < 0)
3675 _exit(EXIT_FAILURE);
3677 if (setup_ptmx(arg_directory) < 0)
3678 _exit(EXIT_FAILURE);
3680 dev_setup(arg_directory);
3682 if (setup_propagate(arg_directory) < 0)
3683 _exit(EXIT_FAILURE);
3685 if (setup_seccomp() < 0)
3686 _exit(EXIT_FAILURE);
3688 if (setup_dev_console(arg_directory, console) < 0)
3689 _exit(EXIT_FAILURE);
3691 if (setup_kmsg(arg_directory, kmsg_socket_pair[1]) < 0)
3692 _exit(EXIT_FAILURE);
3693 kmsg_socket_pair[1] = safe_close(kmsg_socket_pair[1]);
3695 if (send_rtnl(rtnl_socket_pair[1]) < 0)
3696 _exit(EXIT_FAILURE);
3697 rtnl_socket_pair[1] = safe_close(rtnl_socket_pair[1]);
3699 /* Tell the parent that we are ready, and that
3700 * it can cgroupify us so that we lack access
3701 * to certain devices and resources. */
3702 (void) barrier_place(&barrier);
3704 if (setup_boot_id(arg_directory) < 0)
3705 _exit(EXIT_FAILURE);
3707 if (setup_timezone(arg_directory) < 0)
3708 _exit(EXIT_FAILURE);
3710 if (setup_resolv_conf(arg_directory) < 0)
3711 _exit(EXIT_FAILURE);
3713 if (setup_journal(arg_directory) < 0)
3714 _exit(EXIT_FAILURE);
3716 if (mount_binds(arg_directory, arg_bind, false) < 0)
3717 _exit(EXIT_FAILURE);
3719 if (mount_binds(arg_directory, arg_bind_ro, true) < 0)
3720 _exit(EXIT_FAILURE);
3722 if (mount_tmpfs(arg_directory) < 0)
3723 _exit(EXIT_FAILURE);
3725 /* Wait until we are cgroup-ified, so that we
3726 * can mount the right cgroup path writable */
3727 (void) barrier_sync_next(&barrier);
3729 if (mount_cgroup(arg_directory) < 0)
3730 _exit(EXIT_FAILURE);
/* Switch the root: enter the tree, move its mount onto "/", then chroot into
 * it and reset the working directory. */
3732 if (chdir(arg_directory) < 0) {
3733 log_error_errno(errno, "chdir(%s) failed: %m", arg_directory);
3734 _exit(EXIT_FAILURE);
3737 if (mount(arg_directory, "/", NULL, MS_MOVE, NULL) < 0) {
3738 log_error_errno(errno, "mount(MS_MOVE) failed: %m");
3739 _exit(EXIT_FAILURE);
3742 if (chroot(".") < 0) {
3743 log_error_errno(errno, "chroot() failed: %m");
3744 _exit(EXIT_FAILURE);
3747 if (chdir("/") < 0) {
3748 log_error_errno(errno, "chdir() failed: %m");
3749 _exit(EXIT_FAILURE);
3754 if (arg_private_network)
3757 if (drop_capabilities() < 0) {
3758 log_error_errno(errno, "drop_capabilities() failed: %m");
3759 _exit(EXIT_FAILURE);
3762 r = change_uid_gid(&home);
3764 _exit(EXIT_FAILURE);
/* Fill the reserved envp slots; "/root" and "root" are the fallbacks when no
 * home directory or user was determined. */
3766 if ((asprintf((char**)(envp + n_env++), "HOME=%s", home ? home: "/root") < 0) ||
3767 (asprintf((char**)(envp + n_env++), "USER=%s", arg_user ? arg_user : "root") < 0) ||
3768 (asprintf((char**)(envp + n_env++), "LOGNAME=%s", arg_user ? arg_user : "root") < 0)) {
3770 _exit(EXIT_FAILURE);
3773 if (!sd_id128_equal(arg_uuid, SD_ID128_NULL)) {
3776 if (asprintf((char**)(envp + n_env++), "container_uuid=%s", id128_format_as_uuid(arg_uuid, as_uuid)) < 0) {
3778 _exit(EXIT_FAILURE);
/* Passed-in descriptors must survive the exec, hence O_CLOEXEC is cleared;
 * LISTEN_PID is set to 1 for the payload. */
3782 if (fdset_size(fds) > 0) {
3783 r = fdset_cloexec(fds, false);
3785 log_error_errno(r, "Failed to unset O_CLOEXEC for file descriptors.");
3786 _exit(EXIT_FAILURE);
3789 if ((asprintf((char **)(envp + n_env++), "LISTEN_FDS=%u", n_fd_passed) < 0) ||
3790 (asprintf((char **)(envp + n_env++), "LISTEN_PID=1") < 0)) {
3792 _exit(EXIT_FAILURE);
3798 if (arg_personality != 0xffffffffLU) {
3799 if (personality(arg_personality) < 0) {
3800 log_error_errno(errno, "personality() failed: %m");
3801 _exit(EXIT_FAILURE);
3803 } else if (secondary) {
3804 if (personality(PER_LINUX32) < 0) {
3805 log_error_errno(errno, "personality() failed: %m");
3806 _exit(EXIT_FAILURE);
3811 if (arg_selinux_context)
3812 if (setexeccon((security_context_t) arg_selinux_context) < 0) {
3813 log_error_errno(errno, "setexeccon(\"%s\") failed: %m", arg_selinux_context);
3814 _exit(EXIT_FAILURE);
3818 if (!strv_isempty(arg_setenv)) {
3821 n = strv_env_merge(2, envp, arg_setenv);
3824 _exit(EXIT_FAILURE);
3829 env_use = (char**) envp;
3831 /* Wait until the parent is ready with the setup, too... */
3832 if (!barrier_place_and_sync(&barrier))
3833 _exit(EXIT_FAILURE);
3839 /* Automatically search for the init system */
/* a[] gets one leading slot for the init binary path, followed by the
 * remaining command line arguments. */
3841 l = 1 + argc - optind;
3842 a = newa(char*, l + 1);
3843 memcpy(a + 1, argv + optind, l * sizeof(char*));
3845 a[0] = (char*) "/usr/lib/systemd/systemd";
3846 execve(a[0], a, env_use);
3848 a[0] = (char*) "/lib/systemd/systemd";
3849 execve(a[0], a, env_use);
3851 a[0] = (char*) "/sbin/init";
3852 execve(a[0], a, env_use);
3853 } else if (argc > optind)
3854 execvpe(argv[optind], argv + optind, env_use);
/* Shell fallback: try bash, then sh, from the user's home directory. */
3856 chdir(home ? home : "/root");
3857 execle("/bin/bash", "-bash", NULL, env_use);
3858 execle("/bin/sh", "-sh", NULL, env_use);
/* Only reached when every exec attempt above failed. */
3861 log_error_errno(errno, "execv() failed: %m");
3862 _exit(EXIT_FAILURE);
/* From here on the code runs in the parent (it takes the BARRIER_PARENT
 * role); the child's ends of the socket pairs are closed. */
3865 barrier_set_role(&barrier, BARRIER_PARENT);
3869 kmsg_socket_pair[1] = safe_close(kmsg_socket_pair[1]);
3870 rtnl_socket_pair[1] = safe_close(rtnl_socket_pair[1]);
3872 /* Wait for the most basic Child-setup to be done,
3873 * before we add hardware to it, and place it in a
3875 if (barrier_sync_next(&barrier)) {
3878 r = move_network_interfaces(pid);
3882 r = setup_veth(pid, veth_name, &ifi);
3886 r = setup_bridge(veth_name, &ifi);
3890 r = setup_macvlan(pid);
3894 r = register_machine(pid, ifi);
3898 /* Block SIGCHLD here, before notifying child.
3899 * process_pty() will handle it with the other signals. */
3900 r = sigprocmask(SIG_BLOCK, &mask_chld, NULL);
3904 /* Reset signal to default */
3905 r = default_signals(SIGCHLD, -1);
3909 /* Notify the child that the parent is ready with all
3910 * its setup, and that the child can now hand over
3911 * control to the code to run inside the container. */
3912 (void) barrier_place(&barrier);
3914 /* And wait that the child is completely ready now. */
3915 if (barrier_place_and_sync(&barrier)) {
3916 _cleanup_event_unref_ sd_event *event = NULL;
3917 _cleanup_(pty_forward_freep) PTYForward *forward = NULL;
3918 _cleanup_rtnl_unref_ sd_rtnl *rtnl = NULL;
3923 "STATUS=Container running.");
3925 r = sd_event_new(&event);
3927 log_error_errno(r, "Failed to get default event source: %m");
3932 /* Try to kill the init system on SIGINT or SIGTERM */
3933 sd_event_add_signal(event, NULL, SIGINT, on_orderly_shutdown, UINT32_TO_PTR(pid));
3934 sd_event_add_signal(event, NULL, SIGTERM, on_orderly_shutdown, UINT32_TO_PTR(pid));
3936 /* Immediately exit */
3937 sd_event_add_signal(event, NULL, SIGINT, NULL, NULL);
3938 sd_event_add_signal(event, NULL, SIGTERM, NULL, NULL);
3941 /* simply exit on sigchld */
3942 sd_event_add_signal(event, NULL, SIGCHLD, NULL, NULL);
3944 if (arg_expose_ports) {
3945 r = watch_rtnl(event, rtnl_socket_pair[0], &exposed, &rtnl);
3949 (void) expose_ports(rtnl, &exposed);
3952 rtnl_socket_pair[0] = safe_close(rtnl_socket_pair[0]);
/* Relay data between our terminal and the container's pty until the event
 * loop finishes. */
3954 r = pty_forward_new(event, master, true, &forward);
3956 log_error_errno(r, "Failed to create PTY forwarder: %m");
3960 r = sd_event_loop(event);
3962 log_error_errno(r, "Failed to run event loop: %m");
3966 pty_forward_get_last_char(forward, &last_char);
3968 forward = pty_forward_free(forward);
3970 if (!arg_quiet && last_char != '\n')
3973 /* Kill if it is not dead yet anyway */
3974 terminate_machine(pid);
3978 /* Normally redundant, but better safe than sorry */
3981 r = wait_for_container(pid, &container_status);
3985 /* We failed to wait for the container, or the
3986 * container exited abnormally */
3988 else if (r > 0 || container_status == CONTAINER_TERMINATED){
3989 /* The container exited with a non-zero
3990 * status, or with zero status and no reboot
3996 /* CONTAINER_REBOOTED, loop again */
3998 if (arg_keep_unit) {
3999 /* Special handling if we are running as a
4000 * service: instead of simply restarting the
4001 * machine we want to restart the entire
4002 * service, so let's inform systemd about this
4003 * with the special exit code 133. The service
4004 * file uses RestartForceExitStatus=133 so
4005 * that this results in a full nspawn
4006 * restart. This is necessary since we might
4007 * have cgroup parameters set we want to have
4014 flush_ports(&exposed);
4020 "STATUS=Terminating...");
4022 loop_remove(loop_nr, &image_fd);
4027 if (remove_subvol && arg_directory) {
4030 k = btrfs_subvol_remove(arg_directory);
4032 log_warning_errno(k, "Cannot remove subvolume '%s', ignoring: %m", arg_directory);
4038 p = strappenda("/run/systemd/nspawn/propagate", arg_machine);
4039 (void) rm_rf(p, false, true, false);
4042 free(arg_directory);
4047 strv_free(arg_setenv);
4048 strv_free(arg_network_interfaces);
4049 strv_free(arg_network_macvlan);
4050 strv_free(arg_bind);
4051 strv_free(arg_bind_ro);
4052 strv_free(arg_tmpfs);
4054 flush_ports(&exposed);
4056 while (arg_expose_ports) {
4057 ExposePort *p = arg_expose_ports;
4058 LIST_REMOVE(ports, arg_expose_ports, p);
4062 return r < 0 ? EXIT_FAILURE : ret;