1 /*-*- Mode: C; c-basic-offset: 8; indent-tabs-mode: nil -*-*/
4 This file is part of systemd.
6 Copyright 2010 Lennart Poettering
8 systemd is free software; you can redistribute it and/or modify it
9 under the terms of the GNU Lesser General Public License as published by
10 the Free Software Foundation; either version 2.1 of the License, or
11 (at your option) any later version.
13 systemd is distributed in the hope that it will be useful, but
14 WITHOUT ANY WARRANTY; without even the implied warranty of
15 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
16 Lesser General Public License for more details.
18 You should have received a copy of the GNU Lesser General Public License
19 along with systemd; If not, see <http://www.gnu.org/licenses/>.
25 #include <sys/types.h>
26 #include <sys/syscall.h>
27 #include <sys/mount.h>
33 #include <sys/prctl.h>
36 #include <sys/signalfd.h>
40 #include <sys/socket.h>
41 #include <linux/netlink.h>
43 #include <linux/veth.h>
44 #include <sys/personality.h>
45 #include <linux/loop.h>
50 #include <selinux/selinux.h>
58 #include <blkid/blkid.h>
61 #include "sd-daemon.h"
71 #include "cgroup-util.h"
73 #include "path-util.h"
74 #include "loopback-setup.h"
75 #include "dev-setup.h"
80 #include "bus-error.h"
82 #include "bus-kernel.h"
85 #include "rtnl-util.h"
86 #include "udev-util.h"
87 #include "blkid-util.h"
89 #include "siphash24.h"
91 #include "base-filesystem.h"
93 #include "event-util.h"
94 #include "capability.h"
96 #include "btrfs-util.h"
97 #include "machine-image.h"
99 #include "in-addr-util.h"
101 #include "local-addresses.h"
104 #include "seccomp-util.h"
/* One --port= mapping. Entries are chained through the "ports" list field
 * (parse_argv() prepends them to arg_expose_ports with LIST_PREPEND).
 * NOTE(review): this listing shows only part of the struct; parse_argv()
 * also assigns ->protocol and ->host_port, whose declarations are not
 * visible here. */
107 typedef struct ExposePort {
110 uint16_t container_port;
111 LIST_FIELDS(struct ExposePort, ports);
/* Enumerations used by the option globals and parse_argv(). Only the opening
 * lines are present in this listing; the remaining enumerators (for example
 * the LINK_* and VOLATILE_* values referenced elsewhere in the file) are not
 * shown here. */
114 typedef enum ContainerStatus {
115 CONTAINER_TERMINATED,
119 typedef enum LinkJournal {
126 typedef enum Volatile {
/* Command-line configuration. These file-scope variables hold the defaults
 * and are overwritten by parse_argv() as options are processed. */
132 static char *arg_directory = NULL;
133 static char *arg_template = NULL;
134 static char *arg_user = NULL;
135 static sd_id128_t arg_uuid = {};
136 static char *arg_machine = NULL;
137 static const char *arg_selinux_context = NULL;
138 static const char *arg_selinux_apifs_context = NULL;
139 static const char *arg_slice = NULL;
140 static bool arg_private_network = false;
141 static bool arg_read_only = false;
142 static bool arg_boot = false;
143 static bool arg_ephemeral = false;
144 static LinkJournal arg_link_journal = LINK_AUTO;
145 static bool arg_link_journal_try = false;
/* Default capability bitmask (one bit per CAP_* value). parse_argv() ORs in
 * the --capability= bits, adds CAP_NET_ADMIN when private networking is on,
 * and finally masks out the --drop-capability= bits.
 * NOTE(review): the line terminating this initializer is not part of this
 * listing (the last visible term still ends in '|'). */
146 static uint64_t arg_retain =
147 (1ULL << CAP_CHOWN) |
148 (1ULL << CAP_DAC_OVERRIDE) |
149 (1ULL << CAP_DAC_READ_SEARCH) |
150 (1ULL << CAP_FOWNER) |
151 (1ULL << CAP_FSETID) |
152 (1ULL << CAP_IPC_OWNER) |
154 (1ULL << CAP_LEASE) |
155 (1ULL << CAP_LINUX_IMMUTABLE) |
156 (1ULL << CAP_NET_BIND_SERVICE) |
157 (1ULL << CAP_NET_BROADCAST) |
158 (1ULL << CAP_NET_RAW) |
159 (1ULL << CAP_SETGID) |
160 (1ULL << CAP_SETFCAP) |
161 (1ULL << CAP_SETPCAP) |
162 (1ULL << CAP_SETUID) |
163 (1ULL << CAP_SYS_ADMIN) |
164 (1ULL << CAP_SYS_CHROOT) |
165 (1ULL << CAP_SYS_NICE) |
166 (1ULL << CAP_SYS_PTRACE) |
167 (1ULL << CAP_SYS_TTY_CONFIG) |
168 (1ULL << CAP_SYS_RESOURCE) |
169 (1ULL << CAP_SYS_BOOT) |
170 (1ULL << CAP_AUDIT_WRITE) |
171 (1ULL << CAP_AUDIT_CONTROL) |
/* arg_bind, arg_bind_ro and arg_tmpfs are string vectors that parse_argv()
 * fills two entries at a time; they are consumed pairwise by mount_binds()
 * and mount_tmpfs(). */
173 static char **arg_bind = NULL;
174 static char **arg_bind_ro = NULL;
175 static char **arg_tmpfs = NULL;
176 static char **arg_setenv = NULL;
177 static bool arg_quiet = false;
178 static bool arg_share_system = false;
179 static bool arg_register = true;
180 static bool arg_keep_unit = false;
181 static char **arg_network_interfaces = NULL;
182 static char **arg_network_macvlan = NULL;
183 static bool arg_network_veth = false;
184 static const char *arg_network_bridge = NULL;
/* 0xffffffffLU is the same value parse_argv() treats as "unknown or
 * unsupported" when returned by personality_from_string(); presumably it
 * doubles as "no personality requested" here -- TODO confirm. */
185 static unsigned long arg_personality = 0xffffffffLU;
186 static char *arg_image = NULL;
187 static Volatile arg_volatile = VOLATILE_NO;
188 static ExposePort *arg_expose_ports = NULL;
/* Prints the usage summary with printf(); the leading %s is filled with
 * program_invocation_short_name.
 * NOTE(review): several continuation lines of the help text are absent from
 * this listing, so some option descriptions below read as cut short. */
190 static void help(void) {
191 printf("%s [OPTIONS...] [PATH] [ARGUMENTS...]\n\n"
192 "Spawn a minimal namespace container for debugging, testing and building.\n\n"
193 " -h --help Show this help\n"
194 " --version Print version string\n"
195 " -q --quiet Do not show status information\n"
196 " -D --directory=PATH Root directory for the container\n"
197 " --template=PATH Initialize root directory from template directory,\n"
199 " -x --ephemeral Run container with snapshot of root directory, and\n"
200 " remove it after exit\n"
201 " -i --image=PATH File system device or disk image for the container\n"
202 " -b --boot Boot up full system (i.e. invoke init)\n"
203 " -u --user=USER Run the command under specified user or uid\n"
204 " -M --machine=NAME Set the machine name for the container\n"
205 " --uuid=UUID Set a specific machine UUID for the container\n"
206 " -S --slice=SLICE Place the container in the specified slice\n"
207 " --private-network Disable network in container\n"
208 " --network-interface=INTERFACE\n"
209 " Assign an existing network interface to the\n"
211 " --network-macvlan=INTERFACE\n"
212 " Create a macvlan network interface based on an\n"
213 " existing network interface to the container\n"
214 " -n --network-veth Add a virtual ethernet connection between host\n"
216 " --network-bridge=INTERFACE\n"
217 " Add a virtual ethernet connection between host\n"
218 " and container and add it to an existing bridge on\n"
220 " -p --port=[PROTOCOL:]HOSTPORT[:CONTAINERPORT]\n"
221 " Expose a container IP port on the host\n"
222 " -Z --selinux-context=SECLABEL\n"
223 " Set the SELinux security context to be used by\n"
224 " processes in the container\n"
225 " -L --selinux-apifs-context=SECLABEL\n"
226 " Set the SELinux security context to be used by\n"
227 " API/tmpfs file systems in the container\n"
228 " --capability=CAP In addition to the default, retain specified\n"
230 " --drop-capability=CAP Drop the specified capability from the default set\n"
231 " --link-journal=MODE Link up guest journal, one of no, auto, guest, host,\n"
232 " try-guest, try-host\n"
233 " -j Equivalent to --link-journal=try-guest\n"
234 " --read-only Mount the root directory read-only\n"
235 " --bind=PATH[:PATH] Bind mount a file or directory from the host into\n"
237 " --bind-ro=PATH[:PATH] Similar, but creates a read-only bind mount\n"
238 " --tmpfs=PATH:[OPTIONS] Mount an empty tmpfs to the specified directory\n"
239 " --setenv=NAME=VALUE Pass an environment variable to PID 1\n"
240 " --share-system Share system namespaces with host\n"
241 " --register=BOOLEAN Register container as machine\n"
242 " --keep-unit Do not register a scope for the machine, reuse\n"
243 " the service unit nspawn is running in\n"
244 " --volatile[=MODE] Run the system in volatile mode\n"
245 , program_invocation_short_name);
/* Stores a cleaned-up copy of "path" in *b.
 *
 * The visible steps are: canonicalize_file_name(path), then
 * path_make_absolute_cwd(path), and finally path_kill_slashes() on the
 * result, which is what ends up in *b.
 * NOTE(review): the conditions between these calls are not in this listing;
 * presumably path_make_absolute_cwd() is the fallback when canonicalization
 * fails -- confirm against the full source. parse_argv() treats a negative
 * return value as an error. */
248 static int set_sanitized_path(char **b, const char *path) {
254 p = canonicalize_file_name(path);
259 p = path_make_absolute_cwd(path);
265 *b = path_kill_slashes(p);
/* Parses the command line with getopt_long() and stores the results in the
 * arg_* globals. After the option loop it rejects option combinations that
 * may not be used together and computes the final arg_retain capability
 * mask.
 *
 * NOTE(review): this listing omits most "case" labels, error returns and
 * closing braces, so the comments below describe only what the visible
 * lines establish. */
269 static int parse_argv(int argc, char *argv[]) {
286 ARG_NETWORK_INTERFACE,
/* Long options; short-option equivalents use the character as the value,
 * the rest use ARG_* constants. */
294 static const struct option options[] = {
295 { "help", no_argument, NULL, 'h' },
296 { "version", no_argument, NULL, ARG_VERSION },
297 { "directory", required_argument, NULL, 'D' },
298 { "template", required_argument, NULL, ARG_TEMPLATE },
299 { "ephemeral", no_argument, NULL, 'x' },
300 { "user", required_argument, NULL, 'u' },
301 { "private-network", no_argument, NULL, ARG_PRIVATE_NETWORK },
302 { "boot", no_argument, NULL, 'b' },
303 { "uuid", required_argument, NULL, ARG_UUID },
304 { "read-only", no_argument, NULL, ARG_READ_ONLY },
305 { "capability", required_argument, NULL, ARG_CAPABILITY },
306 { "drop-capability", required_argument, NULL, ARG_DROP_CAPABILITY },
307 { "link-journal", required_argument, NULL, ARG_LINK_JOURNAL },
308 { "bind", required_argument, NULL, ARG_BIND },
309 { "bind-ro", required_argument, NULL, ARG_BIND_RO },
310 { "tmpfs", required_argument, NULL, ARG_TMPFS },
311 { "machine", required_argument, NULL, 'M' },
312 { "slice", required_argument, NULL, 'S' },
313 { "setenv", required_argument, NULL, ARG_SETENV },
314 { "selinux-context", required_argument, NULL, 'Z' },
315 { "selinux-apifs-context", required_argument, NULL, 'L' },
316 { "quiet", no_argument, NULL, 'q' },
317 { "share-system", no_argument, NULL, ARG_SHARE_SYSTEM },
318 { "register", required_argument, NULL, ARG_REGISTER },
319 { "keep-unit", no_argument, NULL, ARG_KEEP_UNIT },
320 { "network-interface", required_argument, NULL, ARG_NETWORK_INTERFACE },
321 { "network-macvlan", required_argument, NULL, ARG_NETWORK_MACVLAN },
322 { "network-veth", no_argument, NULL, 'n' },
323 { "network-bridge", required_argument, NULL, ARG_NETWORK_BRIDGE },
324 { "personality", required_argument, NULL, ARG_PERSONALITY },
325 { "image", required_argument, NULL, 'i' },
326 { "volatile", optional_argument, NULL, ARG_VOLATILE },
327 { "port", required_argument, NULL, 'p' },
/* Capability bits collected from --capability= (plus) and
 * --drop-capability= (minus); merged into arg_retain at the end. */
332 uint64_t plus = 0, minus = 0;
337 while ((c = getopt_long(argc, argv, "+hD:u:bL:M:jS:Z:qi:xp:n", options, NULL)) >= 0)
346 puts(PACKAGE_STRING);
347 puts(SYSTEMD_FEATURES);
351 r = set_sanitized_path(&arg_directory, optarg);
353 return log_error_errno(r, "Invalid root directory: %m");
358 r = set_sanitized_path(&arg_template, optarg);
360 return log_error_errno(r, "Invalid template directory: %m");
365 r = set_sanitized_path(&arg_image, optarg);
367 return log_error_errno(r, "Invalid image path: %m");
372 arg_ephemeral = true;
377 arg_user = strdup(optarg);
383 case ARG_NETWORK_BRIDGE:
384 arg_network_bridge = optarg;
389 arg_network_veth = true;
390 arg_private_network = true;
393 case ARG_NETWORK_INTERFACE:
394 if (strv_extend(&arg_network_interfaces, optarg) < 0)
397 arg_private_network = true;
400 case ARG_NETWORK_MACVLAN:
401 if (strv_extend(&arg_network_macvlan, optarg) < 0)
406 case ARG_PRIVATE_NETWORK:
407 arg_private_network = true;
415 r = sd_id128_from_string(optarg, &arg_uuid);
417 log_error("Invalid UUID: %s", optarg);
427 if (isempty(optarg)) {
431 if (!machine_name_is_valid(optarg)) {
432 log_error("Invalid machine name: %s", optarg);
436 r = free_and_strdup(&arg_machine, optarg);
444 arg_selinux_context = optarg;
448 arg_selinux_apifs_context = optarg;
452 arg_read_only = true;
/* --capability= and --drop-capability= share this code: the argument is a
 * comma-separated list of capability names, with "all" setting every bit. */
456 case ARG_DROP_CAPABILITY: {
457 const char *state, *word;
460 FOREACH_WORD_SEPARATOR(word, length, optarg, ",", state) {
461 _cleanup_free_ char *t;
463 t = strndup(word, length);
467 if (streq(t, "all")) {
468 if (c == ARG_CAPABILITY)
469 plus = (uint64_t) -1;
471 minus = (uint64_t) -1;
475 cap = capability_from_name(t);
477 log_error("Failed to parse capability %s.", t);
481 if (c == ARG_CAPABILITY)
482 plus |= 1ULL << (uint64_t) cap;
484 minus |= 1ULL << (uint64_t) cap;
492 arg_link_journal = LINK_GUEST;
493 arg_link_journal_try = true;
/* --link-journal=MODE: the "try-" variants select the same LINK_* value as
 * their plain counterparts but set arg_link_journal_try. */
496 case ARG_LINK_JOURNAL:
497 if (streq(optarg, "auto")) {
498 arg_link_journal = LINK_AUTO;
499 arg_link_journal_try = false;
500 } else if (streq(optarg, "no")) {
501 arg_link_journal = LINK_NO;
502 arg_link_journal_try = false;
503 } else if (streq(optarg, "guest")) {
504 arg_link_journal = LINK_GUEST;
505 arg_link_journal_try = false;
506 } else if (streq(optarg, "host")) {
507 arg_link_journal = LINK_HOST;
508 arg_link_journal_try = false;
509 } else if (streq(optarg, "try-guest")) {
510 arg_link_journal = LINK_GUEST;
511 arg_link_journal_try = true;
512 } else if (streq(optarg, "try-host")) {
513 arg_link_journal = LINK_HOST;
514 arg_link_journal_try = true;
516 log_error("Failed to parse link journal mode %s", optarg);
/* --bind= / --bind-ro=: the argument is split at the first ':' into two
 * paths, both of which must be absolute, and both are appended to the
 * selected vector. */
524 _cleanup_free_ char *a = NULL, *b = NULL;
528 x = c == ARG_BIND ? &arg_bind : &arg_bind_ro;
530 e = strchr(optarg, ':');
532 a = strndup(optarg, e - optarg);
542 if (!path_is_absolute(a) || !path_is_absolute(b)) {
543 log_error("Invalid bind mount specification: %s", optarg);
547 r = strv_extend(x, a);
551 r = strv_extend(x, b);
/* --tmpfs=: path and mount options are pushed to arg_tmpfs as a pair;
 * "mode=0755" is the option string used on one of the branches (presumably
 * when no ':' is given -- the condition is not shown). */
559 _cleanup_free_ char *a = NULL, *b = NULL;
562 e = strchr(optarg, ':');
564 a = strndup(optarg, e - optarg);
568 b = strdup("mode=0755");
574 if (!path_is_absolute(a)) {
575 log_error("Invalid tmpfs specification: %s", optarg);
579 r = strv_push(&arg_tmpfs, a);
585 r = strv_push(&arg_tmpfs, b);
597 if (!env_assignment_is_valid(optarg)) {
598 log_error("Environment variable assignment '%s' is not valid.", optarg);
602 n = strv_env_set(arg_setenv, optarg);
606 strv_free(arg_setenv);
615 case ARG_SHARE_SYSTEM:
616 arg_share_system = true;
620 r = parse_boolean(optarg);
622 log_error("Failed to parse --register= argument: %s", optarg);
630 arg_keep_unit = true;
633 case ARG_PERSONALITY:
635 arg_personality = personality_from_string(optarg);
636 if (arg_personality == 0xffffffffLU) {
637 log_error("Unknown or unsupported personality '%s'.", optarg);
/* --volatile[=MODE]: accepts a boolean or the literal "state". */
646 arg_volatile = VOLATILE_YES;
648 r = parse_boolean(optarg);
650 if (streq(optarg, "state"))
651 arg_volatile = VOLATILE_STATE;
653 log_error("Failed to parse --volatile= argument: %s", optarg);
657 arg_volatile = r ? VOLATILE_YES : VOLATILE_NO;
/* --port=[PROTOCOL:]HOSTPORT[:CONTAINERPORT]: an optional "tcp:" or "udp:"
 * prefix selects the protocol; with a single port number the host port is
 * set equal to the container port. */
663 const char *split, *e;
664 uint16_t container_port, host_port;
668 if ((e = startswith(optarg, "tcp:")))
669 protocol = IPPROTO_TCP;
670 else if ((e = startswith(optarg, "udp:")))
671 protocol = IPPROTO_UDP;
674 protocol = IPPROTO_TCP;
677 split = strchr(e, ':');
679 char v[split - e + 1];
681 memcpy(v, e, split - e);
684 r = safe_atou16(v, &host_port);
685 if (r < 0 || host_port <= 0) {
686 log_error("Failed to parse host port: %s", optarg);
690 r = safe_atou16(split + 1, &container_port);
692 r = safe_atou16(e, &container_port);
693 host_port = container_port;
/* NOTE(review): this check is on container_port, yet the message says
 * "host port" (identical to the host_port check above). Looks like a
 * copy-and-paste slip in the log text -- confirm and correct. */
696 if (r < 0 || container_port <= 0) {
697 log_error("Failed to parse host port: %s", optarg);
/* A (protocol, host port) pair may be given only once. */
701 LIST_FOREACH(ports, p, arg_expose_ports) {
702 if (p->protocol == protocol && p->host_port == host_port) {
703 log_error("Duplicate port specification: %s", optarg);
708 p = new(ExposePort, 1);
712 p->protocol = protocol;
713 p->host_port = host_port;
714 p->container_port = container_port;
716 LIST_PREPEND(ports, arg_expose_ports, p);
725 assert_not_reached("Unhandled option");
/* Post-processing: --share-system turns machine registration off. */
728 if (arg_share_system)
729 arg_register = false;
/* Combinations that are refused; each logs the reason given in its
 * message. */
731 if (arg_boot && arg_share_system) {
732 log_error("--boot and --share-system may not be combined.");
736 if (arg_keep_unit && cg_pid_get_owner_uid(0, NULL) >= 0) {
737 log_error("--keep-unit may not be used when invoked from a user session.");
741 if (arg_directory && arg_image) {
742 log_error("--directory= and --image= may not be combined.");
746 if (arg_template && arg_image) {
747 log_error("--template= and --image= may not be combined.");
751 if (arg_template && !(arg_directory || arg_machine)) {
752 log_error("--template= needs --directory= or --machine=.");
756 if (arg_ephemeral && arg_template) {
757 log_error("--ephemeral and --template= may not be combined.");
761 if (arg_ephemeral && arg_image) {
762 log_error("--ephemeral and --image= may not be combined.");
766 if (arg_ephemeral && !IN_SET(arg_link_journal, LINK_NO, LINK_AUTO)) {
767 log_error("--ephemeral and --link-journal= may not be combined.");
771 if (arg_volatile != VOLATILE_NO && arg_read_only) {
772 log_error("Cannot combine --read-only with --volatile. Note that --volatile already implies a read-only base hierarchy.");
776 if (arg_expose_ports && !arg_private_network) {
777 log_error("Cannot use --port= without private networking.");
/* Final capability set: defaults plus requested additions, plus
 * CAP_NET_ADMIN with private networking; the drop mask is applied last, so
 * a dropped capability is removed even if it was also added. */
781 arg_retain = (arg_retain | plus | (arg_private_network ? 1ULL << CAP_NET_ADMIN : 0)) & ~minus;
/* Mounts the file systems listed in mount_table below the container root
 * "dest".
 *
 * For each entry the target path is built from dest and the entry's
 * "where", checked with path_is_mount_point(), created with mkdir_p(), and
 * then passed to mount(). When arg_selinux_apifs_context is set, tmpfs and
 * devpts entries get a context="..." option appended. Failures are logged
 * as errors for entries whose "fatal" flag is set and as warnings
 * otherwise. */
786 static int mount_all(const char *dest) {
788 typedef struct MountPoint {
/* Columns as used below: what, where, (presumably) file system type,
 * options, mount flags, fatal. The struct members themselves are not in
 * this listing. */
797 static const MountPoint mount_table[] = {
798 { "proc", "/proc", "proc", NULL, MS_NOSUID|MS_NOEXEC|MS_NODEV, true },
799 { "/proc/sys", "/proc/sys", NULL, NULL, MS_BIND, true }, /* Bind mount first */
800 { NULL, "/proc/sys", NULL, NULL, MS_BIND|MS_RDONLY|MS_REMOUNT, true }, /* Then, make it r/o */
801 { "sysfs", "/sys", "sysfs", NULL, MS_RDONLY|MS_NOSUID|MS_NOEXEC|MS_NODEV, true },
802 { "tmpfs", "/dev", "tmpfs", "mode=755", MS_NOSUID|MS_STRICTATIME, true },
803 { "devpts", "/dev/pts", "devpts","newinstance,ptmxmode=0666,mode=620,gid=" STRINGIFY(TTY_GID), MS_NOSUID|MS_NOEXEC, true },
804 { "tmpfs", "/dev/shm", "tmpfs", "mode=1777", MS_NOSUID|MS_NODEV|MS_STRICTATIME, true },
805 { "tmpfs", "/run", "tmpfs", "mode=755", MS_NOSUID|MS_NODEV|MS_STRICTATIME, true },
807 { "/sys/fs/selinux", "/sys/fs/selinux", NULL, NULL, MS_BIND, false }, /* Bind mount first */
808 { NULL, "/sys/fs/selinux", NULL, NULL, MS_BIND|MS_RDONLY|MS_REMOUNT, false }, /* Then, make it r/o */
815 for (k = 0; k < ELEMENTSOF(mount_table); k++) {
816 _cleanup_free_ char *where = NULL;
818 _cleanup_free_ char *options = NULL;
823 where = strjoin(dest, "/", mount_table[k].where, NULL);
827 t = path_is_mount_point(where, true);
829 log_error_errno(t, "Failed to detect whether %s is a mount point: %m", where);
/* Remount entries are the ones with what == NULL, so an already-mounted
 * target is only skipped for entries that have a source. */
837 /* Skip this entry if it is not a remount. */
838 if (mount_table[k].what && t > 0)
841 t = mkdir_p(where, 0755);
843 if (mount_table[k].fatal) {
844 log_error_errno(t, "Failed to create directory %s: %m", where);
849 log_warning_errno(t, "Failed to create directory %s: %m", where);
855 if (arg_selinux_apifs_context &&
856 (streq_ptr(mount_table[k].what, "tmpfs") || streq_ptr(mount_table[k].what, "devpts"))) {
857 options = strjoin(mount_table[k].options, ",context=\"", arg_selinux_apifs_context, "\"", NULL);
864 o = mount_table[k].options;
867 if (mount(mount_table[k].what,
870 mount_table[k].flags,
873 if (mount_table[k].fatal) {
874 log_error_errno(errno, "mount(%s) failed: %m", where);
879 log_warning_errno(errno, "mount(%s) failed: %m", where);
/* Bind mounts each (source, destination) pair of the string vector "l"
 * below the container root "dest".
 *
 * The source is stat()ed first. If the destination exists, its file type
 * must match the source's; if it does not exist, its parent directories
 * and a mount point of the matching kind (directory, FIFO, socket, or
 * regular file) are created. Block and character devices are refused. The
 * "ro" flag requests a read-only remount via bind_remount_recursive()
 * (NOTE(review): the "if (ro)" guard itself is not in this listing). */
886 static int mount_binds(const char *dest, char **l, bool ro) {
889 STRV_FOREACH_PAIR(x, y, l) {
890 _cleanup_free_ char *where = NULL;
891 struct stat source_st, dest_st;
894 if (stat(*x, &source_st) < 0)
895 return log_error_errno(errno, "Failed to stat %s: %m", *x);
897 where = strappend(dest, *y);
901 r = stat(where, &dest_st);
903 if ((source_st.st_mode & S_IFMT) != (dest_st.st_mode & S_IFMT)) {
904 log_error("The file types of %s and %s do not match. Refusing bind mount", *x, where);
907 } else if (errno == ENOENT) {
908 r = mkdir_parents_label(where, 0755);
910 return log_error_errno(r, "Failed to bind mount %s: %m", *x);
912 log_error_errno(errno, "Failed to bind mount %s: %m", *x);
916 /* Create the mount point, but be conservative -- refuse to create block
917 * and char devices. */
918 if (S_ISDIR(source_st.st_mode)) {
/* NOTE(review): the error is logged from r, but the EEXIST test reads
 * errno. mount_tmpfs() checks "r != -EEXIST" after the same mkdir_label()
 * call; confirm which convention mkdir_label() follows and align. */
919 r = mkdir_label(where, 0755);
920 if (r < 0 && errno != EEXIST)
921 return log_error_errno(r, "Failed to create mount point %s: %m", where);
922 } else if (S_ISFIFO(source_st.st_mode)) {
923 r = mkfifo(where, 0644);
924 if (r < 0 && errno != EEXIST)
925 return log_error_errno(errno, "Failed to create mount point %s: %m", where);
926 } else if (S_ISSOCK(source_st.st_mode)) {
927 r = mknod(where, 0644 | S_IFSOCK, 0);
928 if (r < 0 && errno != EEXIST)
929 return log_error_errno(errno, "Failed to create mount point %s: %m", where);
930 } else if (S_ISREG(source_st.st_mode)) {
933 return log_error_errno(r, "Failed to create mount point %s: %m", where);
935 log_error("Refusing to create mountpoint for file: %s", *x);
939 if (mount(*x, where, "bind", MS_BIND, NULL) < 0)
940 return log_error_errno(errno, "mount(%s) failed: %m", where);
943 r = bind_remount_recursive(where, true);
945 return log_error_errno(r, "Read-Only bind mount failed: %m");
/* Mounts one cgroup hierarchy at <dest>/sys/fs/cgroup/<hierarchy>.
 *
 * "controller" is passed as the mount data string, and "read_only" adds
 * MS_RDONLY to the mount flags. The target is first checked with
 * path_is_mount_point(); what happens when it already is one is not
 * visible in this listing (presumably the mount is skipped). */
952 static int mount_cgroup_hierarchy(const char *dest, const char *controller, const char *hierarchy, bool read_only) {
956 to = strappenda(dest, "/sys/fs/cgroup/", hierarchy);
958 r = path_is_mount_point(to, false);
960 return log_error_errno(r, "Failed to determine if %s is mounted already: %m", to);
966 if (mount("cgroup", to, "cgroup", MS_NOSUID|MS_NOEXEC|MS_NODEV|(read_only ? MS_RDONLY : 0), controller) < 0)
967 return log_error_errno(errno, "Failed to mount to %s: %m", to);
/* Builds the /sys/fs/cgroup tree inside the container root "dest".
 *
 * Visible sequence: query the kernel's controllers and our own cgroup
 * path, mount a tmpfs on <dest>/sys/fs/cgroup, mount every controller
 * read-only (following host-side symlinks for combined hierarchies and
 * recreating the symlink in the container), mount the name=systemd
 * hierarchy writable, bind mount our own cgroup on top of itself, and
 * finally remount the systemd hierarchy root and the tmpfs read-only. */
972 static int mount_cgroup(const char *dest) {
973 _cleanup_set_free_free_ Set *controllers = NULL;
974 _cleanup_free_ char *own_cgroup_path = NULL;
975 const char *cgroup_root, *systemd_root, *systemd_own;
978 controllers = set_new(&string_hash_ops);
982 r = cg_kernel_controllers(controllers);
984 return log_error_errno(r, "Failed to determine cgroup controllers: %m");
986 r = cg_pid_get_path(NULL, 0, &own_cgroup_path);
988 return log_error_errno(r, "Failed to determine our own cgroup path: %m");
990 cgroup_root = strappenda(dest, "/sys/fs/cgroup");
991 if (mount("tmpfs", cgroup_root, "tmpfs", MS_NOSUID|MS_NOEXEC|MS_NODEV|MS_STRICTATIME, "mode=755") < 0)
992 return log_error_errno(errno, "Failed to mount tmpfs to /sys/fs/cgroup: %m");
/* Controllers are taken out of the set one at a time; the loop statement
 * itself is not part of this listing. */
995 _cleanup_free_ char *controller = NULL, *origin = NULL, *combined = NULL;
997 controller = set_steal_first(controllers);
1001 origin = strappend("/sys/fs/cgroup/", controller);
/* The host's entry for the controller is inspected with readlink to tell
 * a plain hierarchy from a symlink to a combined one. */
1005 r = readlink_malloc(origin, &combined);
1007 /* Not a symbolic link, but directly a single cgroup hierarchy */
1009 r = mount_cgroup_hierarchy(dest, controller, controller, true);
1014 return log_error_errno(r, "Failed to read link %s: %m", origin);
1016 _cleanup_free_ char *target = NULL;
1018 target = strjoin(dest, "/sys/fs/cgroup/", controller, NULL);
1022 /* A symbolic link, a combination of controllers in one hierarchy */
1024 if (!filename_is_valid(combined)) {
1025 log_warning("Ignoring invalid combined hierarchy %s.", combined);
1029 r = mount_cgroup_hierarchy(dest, combined, combined, true);
1033 if (symlink(combined, target) < 0)
1034 return log_error_errno(errno, "Failed to create symlink for combined hierarchy: %m");
1038 r = mount_cgroup_hierarchy(dest, "name=systemd", "systemd", false);
1042 /* Make our own cgroup a (writable) bind mount */
1043 systemd_own = strappenda(dest, "/sys/fs/cgroup/systemd", own_cgroup_path);
1044 if (mount(systemd_own, systemd_own, NULL, MS_BIND, NULL) < 0)
1045 return log_error_errno(errno, "Failed to turn %s into a bind mount: %m", own_cgroup_path);
1047 /* And then remount the systemd cgroup root read-only */
1048 systemd_root = strappenda(dest, "/sys/fs/cgroup/systemd");
1049 if (mount(NULL, systemd_root, NULL, MS_BIND|MS_REMOUNT|MS_NOSUID|MS_NOEXEC|MS_NODEV|MS_RDONLY, NULL) < 0)
1050 return log_error_errno(errno, "Failed to mount cgroup root read-only: %m");
/* Last step: the tmpfs holding all hierarchies becomes read-only too. */
1052 if (mount(NULL, cgroup_root, NULL, MS_REMOUNT|MS_NOSUID|MS_NOEXEC|MS_NODEV|MS_STRICTATIME|MS_RDONLY, "mode=755") < 0)
1053 return log_error_errno(errno, "Failed to remount %s read-only: %m", cgroup_root);
/* Mounts a tmpfs for every (path, options) pair in arg_tmpfs, below the
 * container root "dest".
 *
 * The mount point is created with mkdir_label(); an already existing
 * directory (-EEXIST) is accepted. The second element of each pair is
 * passed to mount() as the option string. */
1058 static int mount_tmpfs(const char *dest) {
1061 STRV_FOREACH_PAIR(i, o, arg_tmpfs) {
1062 _cleanup_free_ char *where = NULL;
1065 where = strappend(dest, *i);
1069 r = mkdir_label(where, 0755);
1070 if (r < 0 && r != -EEXIST)
1071 return log_error_errno(r, "Creating mount point for tmpfs %s failed: %m", where);
1073 if (mount("tmpfs", where, "tmpfs", MS_NODEV|MS_STRICTATIME, *o) < 0)
1074 return log_error_errno(errno, "tmpfs mount to %s failed: %m", where);
/* Makes <dest>/etc/localtime point at the same zone as the host's
 * /etc/localtime.
 *
 * The host symlink must lead into /usr/share/zoneinfo/ (relative
 * "../usr/share/zoneinfo/" or absolute); the remainder of the link target
 * is the zone name "z". Nothing is changed when the container's link
 * already names the same zone or when the zone file does not exist inside
 * the container. Otherwise a relative symlink
 * "../usr/share/zoneinfo/<zone>" is created, after an existing entry has
 * been removed (the removal call itself is not in this listing). */
1080 static int setup_timezone(const char *dest) {
1081 _cleanup_free_ char *where = NULL, *p = NULL, *q = NULL, *check = NULL, *what = NULL;
1087 /* Fix the timezone, if possible */
1088 r = readlink_malloc("/etc/localtime", &p);
1090 log_warning("/etc/localtime is not a symlink, not updating container timezone.");
1094 z = path_startswith(p, "../usr/share/zoneinfo/");
1096 z = path_startswith(p, "/usr/share/zoneinfo/");
1098 log_warning("/etc/localtime does not point into /usr/share/zoneinfo/, not updating container timezone.");
1102 where = strappend(dest, "/etc/localtime");
/* Same prefix matching for the container's current link, to see whether
 * it already names zone z. */
1106 r = readlink_malloc(where, &q);
1108 y = path_startswith(q, "../usr/share/zoneinfo/");
1110 y = path_startswith(q, "/usr/share/zoneinfo/");
1112 /* Already pointing to the right place? Then do nothing .. */
1113 if (y && streq(y, z))
1117 check = strjoin(dest, "/usr/share/zoneinfo/", z, NULL);
1121 if (access(check, F_OK) < 0) {
1122 log_warning("Timezone %s does not exist in container, not updating container timezone.", z);
1126 what = strappend("../usr/share/zoneinfo/", z);
1130 r = mkdir_parents(where, 0755);
1132 log_error_errno(r, "Failed to create directory for timezone info %s in container: %m", where);
1138 if (r < 0 && errno != ENOENT) {
1139 log_error_errno(errno, "Failed to remove existing timezone info %s in container: %m", where);
1144 if (symlink(what, where) < 0) {
1145 log_error_errno(errno, "Failed to correct timezone of container: %m");
/* Copies the host's /etc/resolv.conf to <dest>/etc/resolv.conf.
 *
 * arg_private_network is tested first (presumably to skip the copy; the
 * statement under the "if" is not in this listing). Failures to create the
 * parent directory or to copy the file are logged as warnings only. */
1152 static int setup_resolv_conf(const char *dest) {
1153 _cleanup_free_ char *where = NULL;
1158 if (arg_private_network)
1161 /* Fix resolv.conf, if possible */
1162 where = strappend(dest, "/etc/resolv.conf");
1166 /* This is best effort: we do not care much about the result. If it
1167 * fails, it fails, and we only warn about it. */
1168 r = mkdir_parents(where, 0755);
1170 log_warning_errno(r, "Failed to create parent directory for resolv.conf %s: %m", where);
1175 r = copy_file("/etc/resolv.conf", where, O_TRUNC|O_NOFOLLOW, 0644, 0);
1177 log_warning_errno(r, "Failed to copy /etc/resolv.conf to %s: %m", where);
/* Implements --volatile=state for the tree at "directory": the tree is
 * remounted read-only with bind_remount_recursive() and a fresh tmpfs is
 * mounted on <directory>/var. Only acts when arg_volatile is
 * VOLATILE_STATE. */
1185 static int setup_volatile_state(const char *directory) {
1191 if (arg_volatile != VOLATILE_STATE)
1194 /* --volatile=state means we simply overmount /var
1195 with a tmpfs, and the rest read-only. */
1197 r = bind_remount_recursive(directory, true);
1199 return log_error_errno(r, "Failed to remount %s read-only: %m", directory);
1201 p = strappenda(directory, "/var");
/* NOTE(review): the call that sets r here is not in this listing
 * (presumably a mkdir of p). The message below prints "directory" rather
 * than p, which looks like the wrong path for this error -- confirm. */
1203 if (r < 0 && errno != EEXIST)
1204 return log_error_errno(errno, "Failed to create %s: %m", directory);
1206 if (mount("tmpfs", p, "tmpfs", MS_STRICTATIME, "mode=755") < 0)
1207 return log_error_errno(errno, "Failed to mount tmpfs to /var: %m");
/* Implements --volatile=yes for the tree at "directory".
 *
 * A tmpfs is mounted on a temporary directory created with mkdtemp(), the
 * original <directory>/usr is bind mounted (recursively) into it and
 * remounted read-only, and the whole tmpfs is then moved over "directory"
 * with MS_MOVE. tmpfs_mounted and bind_mounted record how far setup got
 * (presumably for cleanup on failure; that code is not in this listing).
 * Only acts when arg_volatile is VOLATILE_YES. */
1212 static int setup_volatile(const char *directory) {
1213 bool tmpfs_mounted = false, bind_mounted = false;
1214 char template[] = "/tmp/nspawn-volatile-XXXXXX";
1220 if (arg_volatile != VOLATILE_YES)
1223 /* --volatile=yes means we mount a tmpfs to the root dir, and
1224 the original /usr to use inside it, and that read-only. */
1226 if (!mkdtemp(template))
1227 return log_error_errno(errno, "Failed to create temporary directory: %m");
1229 if (mount("tmpfs", template, "tmpfs", MS_STRICTATIME, "mode=755") < 0) {
1230 log_error_errno(errno, "Failed to mount tmpfs for root directory: %m");
1235 tmpfs_mounted = true;
/* f is the original /usr, t is its place inside the new tmpfs root. */
1237 f = strappenda(directory, "/usr");
1238 t = strappenda(template, "/usr");
1241 if (r < 0 && errno != EEXIST) {
1242 log_error_errno(errno, "Failed to create %s: %m", t);
1247 if (mount(f, t, "bind", MS_BIND|MS_REC, NULL) < 0) {
1248 log_error_errno(errno, "Failed to create /usr bind mount: %m");
1253 bind_mounted = true;
1255 r = bind_remount_recursive(t, true);
1257 log_error_errno(r, "Failed to remount %s read-only: %m", t);
1261 if (mount(template, directory, NULL, MS_MOVE, NULL) < 0) {
1262 log_error_errno(errno, "Failed to move root mount: %m");
/* Formats the 128-bit "id" into "s" using the dashed UUID layout
 * 8-4-4-4-12 lower-case hex digits. That is 36 characters, so the 37-byte
 * buffer leaves room for the terminating NUL. The formatting call and the
 * return statement are not in this listing (setup_boot_id() uses the
 * buffer it passed in afterwards). */
1280 static char* id128_format_as_uuid(sd_id128_t id, char s[37]) {
1283 "%02x%02x%02x%02x-%02x%02x-%02x%02x-%02x%02x-%02x%02x%02x%02x%02x%02x",
1284 SD_ID128_FORMAT_VAL(id));
/* Gives the container its own boot ID.
 *
 * A random 128-bit ID is formatted as a UUID, written to the helper file
 * <dest>/dev/proc-sys-kernel-random-boot-id, and that file is bind mounted
 * over <dest>/proc/sys/kernel/random/boot_id. Making the bind mount
 * read-only is attempted afterwards; failure there is only a warning.
 * arg_share_system is tested first (presumably to skip all of this; the
 * statement under the "if" is not in this listing). */
1289 static int setup_boot_id(const char *dest) {
1290 _cleanup_free_ char *from = NULL, *to = NULL;
1291 sd_id128_t rnd = {};
1297 if (arg_share_system)
1300 /* Generate a new randomized boot ID, so that each boot-up of
1301 * the container gets a new one */
1303 from = strappend(dest, "/dev/proc-sys-kernel-random-boot-id");
1304 to = strappend(dest, "/proc/sys/kernel/random/boot_id");
1308 r = sd_id128_randomize(&rnd);
1310 return log_error_errno(r, "Failed to generate random boot id: %m");
1312 id128_format_as_uuid(rnd, as_uuid);
1314 r = write_string_file(from, as_uuid);
1316 return log_error_errno(r, "Failed to write boot id: %m");
1318 if (mount(from, to, "bind", MS_BIND, NULL) < 0) {
1319 log_error_errno(errno, "Failed to bind mount boot id: %m");
1321 } else if (mount(from, to, "bind", MS_BIND|MS_REMOUNT|MS_RDONLY, NULL))
1322 log_warning_errno(errno, "Failed to make boot id read-only: %m");
/* Recreates a fixed list of host device nodes below <dest>/dev.
 *
 * For each name in the NUL-separated "devnodes" list (its contents are not
 * in this listing) the host node /dev/<name> is stat()ed. A missing node
 * (ENOENT) is tolerated, a node that is neither a character nor a block
 * device is rejected, and otherwise an identical node (same mode and
 * device number) is created in the container with mknod(). */
1328 static int copy_devnodes(const char *dest) {
1330 static const char devnodes[] =
1341 _cleanup_umask_ mode_t u;
1347 NULSTR_FOREACH(d, devnodes) {
1348 _cleanup_free_ char *from = NULL, *to = NULL;
1351 from = strappend("/dev/", d);
1352 to = strjoin(dest, "/dev/", d, NULL);
1356 if (stat(from, &st) < 0) {
1358 if (errno != ENOENT)
1359 return log_error_errno(errno, "Failed to stat %s: %m", from);
1361 } else if (!S_ISCHR(st.st_mode) && !S_ISBLK(st.st_mode)) {
1363 log_error("%s is not a char or block device, cannot copy", from);
/* NOTE(review): mode 0775 here differs from the 0755 used for parent
 * directories elsewhere in this file -- confirm it is intentional. */
1367 r = mkdir_parents(to, 0775);
1369 log_error_errno(r, "Failed to create parent directory of %s: %m", to);
1373 if (mknod(to, st.st_mode, st.st_rdev) < 0)
1374 return log_error_errno(errno, "mknod(%s) failed: %m", to);
/* Creates <dest>/dev/ptmx as a relative symlink to "pts/ptmx". Logs and
 * returns an error if symlink() fails. */
1381 static int setup_ptmx(const char *dest) {
1382 _cleanup_free_ char *p = NULL;
1384 p = strappend(dest, "/dev/ptmx");
1388 if (symlink("pts/ptmx", p) < 0)
1389 return log_error_errno(errno, "Failed to create /dev/ptmx symlink: %m");
/* Makes the TTY "console" available as <dest>/dev/console.
 *
 * The TTY's mode and ownership are set to 0600 root:root, a placeholder
 * device node is created at <dest>/dev/console using /dev/null's type and
 * device number, and the TTY is bind mounted over it. */
1394 static int setup_dev_console(const char *dest, const char *console) {
1395 _cleanup_umask_ mode_t u;
1405 if (stat("/dev/null", &st) < 0)
1406 return log_error_errno(errno, "Failed to stat /dev/null: %m");
1408 r = chmod_and_chown(console, 0600, 0, 0);
1410 return log_error_errno(r, "Failed to correct access mode for TTY: %m");
1412 /* We need to bind mount the right tty to /dev/console since
1413 * ptys can only exist on pts file systems. To have something
1414 * to bind mount things on we create a device node first, and
1415 * use /dev/null for that since the cgroups device policy
1416 * allows us to create that freely, while we cannot create
1417 * /dev/console. (Note that the major minor doesn't actually
1418 * matter here, since we mount it over anyway). */
1420 to = strappenda(dest, "/dev/console");
1421 if (mknod(to, (st.st_mode & ~07777) | 0600, st.st_rdev) < 0)
1422 return log_error_errno(errno, "mknod() for /dev/console failed: %m");
1424 if (mount(console, to, "bind", MS_BIND, NULL) < 0)
1425 return log_error_errno(errno, "Bind mount for /dev/console failed: %m");
/* Provides the container with a /proc/kmsg backed by a FIFO.
 *
 * A FIFO is created at <dest>/dev/kmsg, bind mounted onto
 * <dest>/proc/kmsg, opened, and the resulting file descriptor is sent over
 * "kmsg_socket" as an SCM_RIGHTS control message so that it stays open
 * while the child runs. As the existing comments explain, the FIFO is then
 * made unavailable as /dev/kmsg (that final step is not in this listing). */
1430 static int setup_kmsg(const char *dest, int kmsg_socket) {
1431 _cleanup_free_ char *from = NULL, *to = NULL;
1432 _cleanup_umask_ mode_t u;
/* Control-message buffer sized for exactly one file descriptor. */
1435 struct cmsghdr cmsghdr;
1436 uint8_t buf[CMSG_SPACE(sizeof(int))];
1438 struct msghdr mh = {
1439 .msg_control = &control,
1440 .msg_controllen = sizeof(control),
1442 struct cmsghdr *cmsg;
1445 assert(kmsg_socket >= 0);
1449 /* We create the kmsg FIFO as /dev/kmsg, but immediately
1450 * delete it after bind mounting it to /proc/kmsg. While FIFOs
1451 * on the reading side behave very similar to /proc/kmsg,
1452 * their writing side behaves differently from /dev/kmsg in
1453 * that writing blocks when nothing is reading. In order to
1454 * avoid any problems with containers deadlocking due to this
1455 * we simply make /dev/kmsg unavailable to the container. */
1456 if (asprintf(&from, "%s/dev/kmsg", dest) < 0 ||
1457 asprintf(&to, "%s/proc/kmsg", dest) < 0)
1460 if (mkfifo(from, 0600) < 0)
1461 return log_error_errno(errno, "mkfifo() for /dev/kmsg failed: %m");
1463 r = chmod_and_chown(from, 0600, 0, 0);
1465 return log_error_errno(r, "Failed to correct access mode for /dev/kmsg: %m");
1467 if (mount(from, to, "bind", MS_BIND, NULL) < 0)
1468 return log_error_errno(errno, "Bind mount for /proc/kmsg failed: %m");
1470 fd = open(from, O_RDWR|O_NDELAY|O_CLOEXEC);
1472 return log_error_errno(errno, "Failed to open fifo: %m");
/* Fill in a single SCM_RIGHTS message carrying fd. */
1474 cmsg = CMSG_FIRSTHDR(&mh);
1475 cmsg->cmsg_level = SOL_SOCKET;
1476 cmsg->cmsg_type = SCM_RIGHTS;
1477 cmsg->cmsg_len = CMSG_LEN(sizeof(int));
1478 memcpy(CMSG_DATA(cmsg), &fd, sizeof(int));
1480 mh.msg_controllen = cmsg->cmsg_len;
1482 /* Store away the fd in the socket, so that it stays open as
1483 * long as we run the child */
1484 k = sendmsg(kmsg_socket, &mh, MSG_NOSIGNAL);
1488 return log_error_errno(errno, "Failed to send FIFO fd: %m");
1490 /* And now make the FIFO unavailable as /dev/kmsg... */
/* Opens a NETLINK_ROUTE socket and passes its file descriptor over
 * "send_fd" as an SCM_RIGHTS control message.
 *
 * arg_expose_ports is tested first (presumably nothing is sent when no
 * --port= option was given; the statement under the "if" is not in this
 * listing). The local fd is declared _cleanup_close_. */
1495 static int send_rtnl(int send_fd) {
/* Control-message buffer sized for exactly one file descriptor. */
1497 struct cmsghdr cmsghdr;
1498 uint8_t buf[CMSG_SPACE(sizeof(int))];
1500 struct msghdr mh = {
1501 .msg_control = &control,
1502 .msg_controllen = sizeof(control),
1504 struct cmsghdr *cmsg;
1505 _cleanup_close_ int fd = -1;
1508 assert(send_fd >= 0);
1510 if (!arg_expose_ports)
1513 fd = socket(PF_NETLINK, SOCK_RAW|SOCK_CLOEXEC|SOCK_NONBLOCK, NETLINK_ROUTE);
1515 return log_error_errno(errno, "failed to allocate container netlink: %m");
1517 cmsg = CMSG_FIRSTHDR(&mh);
1518 cmsg->cmsg_level = SOL_SOCKET;
1519 cmsg->cmsg_type = SCM_RIGHTS;
1520 cmsg->cmsg_len = CMSG_LEN(sizeof(int));
1521 memcpy(CMSG_DATA(cmsg), &fd, sizeof(int));
1523 mh.msg_controllen = cmsg->cmsg_len;
1525 /* Store away the fd in the socket, so that it stays open as
1526 * long as we run the child */
1527 k = sendmsg(send_fd, &mh, MSG_NOSIGNAL);
1529 return log_error_errno(errno, "Failed to send netlink fd: %m");
/* Withdraws the firewall entries for the exposed ports: logs
 * "Lost IP address.", calls fw_add_local_dnat() with a first argument of
 * false for every entry of arg_expose_ports, and finally resets *exposed to
 * IN_ADDR_NULL. A firewall failure is only logged as a warning.
 *
 * NOTE(review): the remaining fw_add_local_dnat() arguments and the bodies
 * of the two guards (no ports configured / *exposed already null) are not
 * visible in this listing; both presumably return early. */
1534 static int flush_ports(union in_addr_union *exposed) {
1536 int r, af = AF_INET;
1540 if (!arg_expose_ports)
1543 if (in_addr_is_null(af, exposed))
1546 log_debug("Lost IP address.");
1548 LIST_FOREACH(ports, p, arg_expose_ports) {
1549 r = fw_add_local_dnat(false,
1560 log_warning_errno(r, "Failed to modify firewall: %m");
1563 *exposed = IN_ADDR_NULL;
/* Re-evaluates which address the exposed ports should be forwarded to.
 * Enumerates AF_INET addresses with local_addresses() and looks only at
 * addresses[0] (family must match and scope must be below RT_SCOPE_LINK).
 * If that address differs from *exposed, fw_add_local_dnat(true, ...) is
 * called for each arg_expose_ports entry, passing the previous address
 * unless it was null, and *exposed is updated. One visible path delegates to
 * flush_ports() instead; firewall failures are only logged as warnings.
 *
 * NOTE(review): the condition guarding the flush_ports() call and the
 * returns after the guards are not visible in this listing -- presumably
 * flush happens when no usable address is found; confirm. */
1567 static int expose_ports(sd_rtnl *rtnl, union in_addr_union *exposed) {
1568 _cleanup_free_ struct local_address *addresses = NULL;
1569 _cleanup_free_ char *pretty = NULL;
1570 union in_addr_union new_exposed;
1573 int af = AF_INET, r;
1577 /* Invoked each time an address is added or removed inside the
1580 if (!arg_expose_ports)
1583 r = local_addresses(rtnl, 0, af, &addresses);
1585 return log_error_errno(r, "Failed to enumerate local addresses: %m");
1588 addresses[0].family == af &&
1589 addresses[0].scope < RT_SCOPE_LINK;
1592 return flush_ports(exposed);
1594 new_exposed = addresses[0].address;
1595 if (in_addr_equal(af, exposed, &new_exposed))
1598 in_addr_to_string(af, &new_exposed, &pretty);
1599 log_debug("New container IP is %s.", strna(pretty));
1601 LIST_FOREACH(ports, p, arg_expose_ports) {
1603 r = fw_add_local_dnat(true,
1612 in_addr_is_null(af, exposed) ? NULL : exposed);
1614 log_warning_errno(r, "Failed to modify firewall: %m");
1617 *exposed = new_exposed;
/* rtnl match callback (installed by watch_rtnl() for RTM_NEWADDR and
 * RTM_DELADDR). userdata is the "exposed" address state; the message m is
 * not used in the visible lines. The result of expose_ports() is ignored.
 * NOTE(review): the return statement is not visible in this listing. */
1621 static int on_address_change(sd_rtnl *rtnl, sd_rtnl_message *m, void *userdata) {
1622 union in_addr_union *exposed = userdata;
1628 expose_ports(rtnl, exposed);
/* Receives a file descriptor from recv_fd (SCM_RIGHTS ancillary data), wraps
 * it in an sd_rtnl object subscribed to RTNLGRP_IPV4_IFADDR, registers
 * on_address_change() for RTM_NEWADDR and RTM_DELADDR with "exposed" as
 * userdata, and attaches the object to the given event loop.
 *
 * NOTE(review): the CMSG_FIRSTHDR() result is dereferenced inside assert()
 * with no NULL check visible here, and the shape of the control message is
 * enforced only by assertions. The "even loop" wording in the last log
 * message looks like a typo for "event loop". How the object is handed back
 * through *ret is not visible in this listing. */
1632 static int watch_rtnl(sd_event *event, int recv_fd, union in_addr_union *exposed, sd_rtnl **ret) {
1634 struct cmsghdr cmsghdr;
1635 uint8_t buf[CMSG_SPACE(sizeof(int))];
1637 struct msghdr mh = {
1638 .msg_control = &control,
1639 .msg_controllen = sizeof(control),
1641 struct cmsghdr *cmsg;
1642 _cleanup_rtnl_unref_ sd_rtnl *rtnl = NULL;
1647 assert(recv_fd >= 0);
1650 if (!arg_expose_ports)
1653 k = recvmsg(recv_fd, &mh, MSG_NOSIGNAL);
1655 return log_error_errno(errno, "Failed to recv netlink fd: %m");
/* Expect exactly one SCM_RIGHTS message carrying a single int fd. */
1657 cmsg = CMSG_FIRSTHDR(&mh);
1658 assert(cmsg->cmsg_level == SOL_SOCKET);
1659 assert(cmsg->cmsg_type == SCM_RIGHTS);
1660 assert(cmsg->cmsg_len == CMSG_LEN(sizeof(int)));
1661 memcpy(&fd, CMSG_DATA(cmsg), sizeof(int));
1663 r = sd_rtnl_open_fd(&rtnl, fd, 1, RTNLGRP_IPV4_IFADDR);
1666 return log_error_errno(r, "Failed to create rtnl object: %m");
1669 r = sd_rtnl_add_match(rtnl, RTM_NEWADDR, on_address_change, exposed);
1671 return log_error_errno(r, "Failed to subscribe to RTM_NEWADDR messages: %m");
1673 r = sd_rtnl_add_match(rtnl, RTM_DELADDR, on_address_change, exposed);
1675 return log_error_errno(r, "Failed to subscribe to RTM_DELADDR messages: %m");
1677 r = sd_rtnl_attach_event(rtnl, event, 0);
1679 return log_error_errno(r, "Failed to add to even loop: %m");
/* Sets the hostname to arg_machine via sethostname_idempotent().
 * NOTE(review): the statements executed when arg_share_system is set or
 * when the call fails are not visible in this listing (presumably an early
 * "return 0" and an error return respectively). */
1687 static int setup_hostname(void) {
1689 if (arg_share_system)
1692 if (sethostname_idempotent(arg_machine) < 0)
/* Links the container journal directory with the host according to
 * arg_link_journal (LINK_NO, LINK_AUTO, LINK_GUEST, LINK_HOST).
 *
 * Reads <directory>/etc/machine-id, validates it as a 128-bit ID and refuses
 * to link when it equals the host machine ID. It then works on two paths:
 *   p = /var/log/journal/<id>              (no directory prefix)
 *   q = <directory>/var/log/journal/<id>   (inside the container tree)
 * Existing mount points on either path are rejected unless the mode is
 * LINK_AUTO. For LINK_GUEST p is made a symlink to q; for LINK_HOST p is
 * bind mounted onto q.
 *
 * NOTE(review): many short lines (bare "if (r < 0)", returns, braces, the
 * declaration of "id" and "r") are missing from this listing, so the exact
 * branch structure cannot be confirmed here.
 * NOTE(review): several log calls below pass errno right after mkdir_p() or
 * readlink_and_make_absolute(), whose result was stored in r. Other call
 * sites in this function treat r as a negative errno-style code, so the
 * logged error may be stale -- confirm and consider logging r instead. */
1698 static int setup_journal(const char *directory) {
1699 sd_id128_t machine_id, this_id;
1700 _cleanup_free_ char *p = NULL, *b = NULL, *q = NULL, *d = NULL;
1704 /* Don't link journals in ephemeral mode */
1708 p = strappend(directory, "/etc/machine-id");
1712 r = read_one_line_file(p, &b);
1713 if (r == -ENOENT && arg_link_journal == LINK_AUTO)
1716 return log_error_errno(r, "Failed to read machine ID from %s: %m", p);
1719 if (isempty(id) && arg_link_journal == LINK_AUTO)
1722 /* Verify validity */
1723 r = sd_id128_from_string(id, &machine_id);
1725 return log_error_errno(r, "Failed to parse machine ID from %s: %m", p);
1727 r = sd_id128_get_machine(&this_id);
1729 return log_error_errno(r, "Failed to retrieve machine ID: %m");
/* Same ID on both sides: warn in LINK_AUTO mode, error otherwise. */
1731 if (sd_id128_equal(machine_id, this_id)) {
1732 log_full(arg_link_journal == LINK_AUTO ? LOG_WARNING : LOG_ERR,
1733 "Host and machine ids are equal (%s): refusing to link journals", id);
1734 if (arg_link_journal == LINK_AUTO)
1739 if (arg_link_journal == LINK_NO)
/* From here on p is re-used for the journal path without the directory
 * prefix and q for the path inside the container tree. */
1743 p = strappend("/var/log/journal/", id);
1744 q = strjoin(directory, "/var/log/journal/", id, NULL);
1748 if (path_is_mount_point(p, false) > 0) {
1749 if (arg_link_journal != LINK_AUTO) {
1750 log_error("%s: already a mount point, refusing to use for journal", p);
1757 if (path_is_mount_point(q, false) > 0) {
1758 if (arg_link_journal != LINK_AUTO) {
1759 log_error("%s: already a mount point, refusing to use for journal", q);
/* Inspect what currently sits at p: a symlink (r >= 0 presumably),
 * something that is not a symlink (-EINVAL), or nothing (-ENOENT). */
1766 r = readlink_and_make_absolute(p, &d);
1768 if ((arg_link_journal == LINK_GUEST ||
1769 arg_link_journal == LINK_AUTO) &&
1772 r = mkdir_p(q, 0755);
1774 log_warning_errno(errno, "Failed to create directory %s: %m", q);
1779 return log_error_errno(errno, "Failed to remove symlink %s: %m", p);
1780 } else if (r == -EINVAL) {
1782 if (arg_link_journal == LINK_GUEST &&
1785 if (errno == ENOTDIR) {
1786 log_error("%s already exists and is neither a symlink nor a directory", p);
1789 log_error_errno(errno, "Failed to remove %s: %m", p);
1793 } else if (r != -ENOENT) {
1794 log_error_errno(errno, "readlink(%s) failed: %m", p);
1798 if (arg_link_journal == LINK_GUEST) {
1800 if (symlink(q, p) < 0) {
/* With arg_link_journal_try a symlink failure is only a debug message. */
1801 if (arg_link_journal_try) {
1802 log_debug_errno(errno, "Failed to symlink %s to %s, skipping journal setup: %m", q, p);
1805 log_error_errno(errno, "Failed to symlink %s to %s: %m", q, p);
1810 r = mkdir_p(q, 0755);
1812 log_warning_errno(errno, "Failed to create directory %s: %m", q);
1816 if (arg_link_journal == LINK_HOST) {
1817 /* don't create parents here -- if the host doesn't have
1818 * permanent journal set up, don't force it here */
1821 if (arg_link_journal_try) {
1822 log_debug_errno(errno, "Failed to create %s, skipping journal setup: %m", p);
1825 log_error_errno(errno, "Failed to create %s: %m", p);
1830 } else if (access(p, F_OK) < 0)
1833 if (dir_is_empty(q) == 0)
1834 log_warning("%s is not empty, proceeding anyway.", q);
1836 r = mkdir_p(q, 0755);
1838 log_error_errno(errno, "Failed to create %s: %m", q);
1842 if (mount(p, q, "bind", MS_BIND, NULL) < 0)
1843 return log_error_errno(errno, "Failed to bind mount journal from host into guest: %m");
/* Calls capability_bounding_set_drop() with the complement of arg_retain,
 * i.e. every capability bit NOT set in arg_retain is requested to be
 * dropped, and returns that call's result unchanged. */
1848 static int drop_capabilities(void) {
1849 return capability_bounding_set_drop(~arg_retain, false);
/* Registers the container with org.freedesktop.machine1 over the system bus.
 *
 * With arg_keep_unit set it calls RegisterMachineWithNetwork directly.
 * Otherwise it builds a CreateMachineWithNetwork message and appends an
 * array of (sv) properties: "Slice" (only when arg_slice is non-empty),
 * "DevicePolicy" = "strict", and a "DeviceAllow" list announced with 9
 * entries. Both variants pass arg_uuid, arg_directory (empty string if
 * NULL) and, when local_ifindex > 0, a one-element interface index array.
 *
 * NOTE(review): the bus/signature arguments, several DeviceAllow entries and
 * the error checks are not visible in this listing; only five of the nine
 * announced DeviceAllow pairs can be seen here. */
1852 static int register_machine(pid_t pid, int local_ifindex) {
1853 _cleanup_bus_error_free_ sd_bus_error error = SD_BUS_ERROR_NULL;
1854 _cleanup_bus_close_unref_ sd_bus *bus = NULL;
1860 r = sd_bus_default_system(&bus);
1862 return log_error_errno(r, "Failed to open system bus: %m");
1864 if (arg_keep_unit) {
1865 r = sd_bus_call_method(
1867 "org.freedesktop.machine1",
1868 "/org/freedesktop/machine1",
1869 "org.freedesktop.machine1.Manager",
1870 "RegisterMachineWithNetwork",
1875 SD_BUS_MESSAGE_APPEND_ID128(arg_uuid),
1879 strempty(arg_directory),
1880 local_ifindex > 0 ? 1 : 0, local_ifindex);
1882 _cleanup_bus_message_unref_ sd_bus_message *m = NULL;
1884 r = sd_bus_message_new_method_call(
1887 "org.freedesktop.machine1",
1888 "/org/freedesktop/machine1",
1889 "org.freedesktop.machine1.Manager",
1890 "CreateMachineWithNetwork");
1892 return log_error_errno(r, "Failed to create message: %m");
1894 r = sd_bus_message_append(
1898 SD_BUS_MESSAGE_APPEND_ID128(arg_uuid),
1902 strempty(arg_directory),
1903 local_ifindex > 0 ? 1 : 0, local_ifindex);
1905 return log_error_errno(r, "Failed to append message arguments: %m");
1907 r = sd_bus_message_open_container(m, 'a', "(sv)");
1909 return log_error_errno(r, "Failed to open container: %m");
1911 if (!isempty(arg_slice)) {
1912 r = sd_bus_message_append(m, "(sv)", "Slice", "s", arg_slice);
1914 return log_error_errno(r, "Failed to append slice: %m");
1917 r = sd_bus_message_append(m, "(sv)", "DevicePolicy", "s", "strict");
1919 return log_error_errno(r, "Failed to add device policy: %m");
1921 r = sd_bus_message_append(m, "(sv)", "DeviceAllow", "a(ss)", 9,
1922 /* Allow the container to
1923 * access and create the API
1924 * device nodes, so that
1925 * PrivateDevices= in the
1926 * container can work
1931 "/dev/random", "rwm",
1932 "/dev/urandom", "rwm",
1934 "/dev/net/tun", "rwm",
1935 /* Allow the container
1936 * access to ptys. However,
1938 * container to ever create
1939 * these device nodes. */
1940 "/dev/pts/ptmx", "rw",
1943 return log_error_errno(r, "Failed to add device whitelist: %m");
1945 r = sd_bus_message_close_container(m);
1947 return log_error_errno(r, "Failed to close container: %m");
1949 r = sd_bus_call(bus, m, 0, &error, NULL);
1953 log_error("Failed to register machine: %s", bus_error_message(&error, r));
/* Asks org.freedesktop.machine1 to terminate the machine: a first call on
 * the Manager interface yields an object path (read with signature "o"),
 * and a second call is then made on the org.freedesktop.machine1.Machine
 * interface. Failures of either call are logged at debug level only, since
 * the machine may already be gone (see the inline comment).
 *
 * NOTE(review): the method names, their arguments and the declaration of
 * "path" and "r" are not visible in this listing. */
1960 static int terminate_machine(pid_t pid) {
1961 _cleanup_bus_error_free_ sd_bus_error error = SD_BUS_ERROR_NULL;
1962 _cleanup_bus_message_unref_ sd_bus_message *reply = NULL;
1963 _cleanup_bus_close_unref_ sd_bus *bus = NULL;
1970 r = sd_bus_default_system(&bus);
1972 return log_error_errno(r, "Failed to open system bus: %m");
1974 r = sd_bus_call_method(
1976 "org.freedesktop.machine1",
1977 "/org/freedesktop/machine1",
1978 "org.freedesktop.machine1.Manager",
1985 /* Note that the machine might already have been
1986 * cleaned up automatically, hence don't consider it a
1987 * failure if we cannot get the machine object. */
1988 log_debug("Failed to get machine: %s", bus_error_message(&error, r));
1992 r = sd_bus_message_read(reply, "o", &path);
1994 return bus_log_parse_error(r);
1996 r = sd_bus_call_method(
1998 "org.freedesktop.machine1",
2000 "org.freedesktop.machine1.Machine",
2006 log_debug("Failed to terminate machine: %s", bus_error_message(&error, r));
/* Resets the audit login UID of the current process by writing
 * "4294967295" to /proc/self/loginuid, unless the file already holds that
 * value. A write failure produces a long explanatory log message.
 *
 * NOTE(review): "or to off auditing" in the message reads as if a word
 * ("turn") is missing. The message announces a 5s sleep; the sleep call and
 * the early returns are not visible in this listing. */
2013 static int reset_audit_loginuid(void) {
2014 _cleanup_free_ char *p = NULL;
2017 if (arg_share_system)
2020 r = read_one_line_file("/proc/self/loginuid", &p);
2024 return log_error_errno(r, "Failed to read /proc/self/loginuid: %m");
2026 /* Already reset? */
2027 if (streq(p, "4294967295"))
2030 r = write_string_file("/proc/self/loginuid", "4294967295");
2032 log_error("Failed to reset audit login UID. This probably means that your kernel is too\n"
2033 "old and you have audit enabled. Note that the auditing subsystem is known to\n"
2034 "be incompatible with containers on old kernels. Please make sure to upgrade\n"
2035 "your kernel or to off auditing with 'audit=0' on the kernel command line before\n"
2036 "using systemd-nspawn. Sleeping for 5s... (%s)\n", strerror(-r));
/* Fixed 128-bit keys handed to generate_mac() as hash_key, one per kind of
 * interface: host and container side of the veth pair (setup_veth) and
 * macvlan interfaces (setup_macvlan). generate_mac() feeds the key bytes to
 * siphash24(), so changing a value changes every MAC address derived with
 * it. */
2044 #define HOST_HASH_KEY SD_ID128_MAKE(1a,37,6f,c7,46,ec,45,0b,ad,a3,d5,31,06,60,5d,b1)
2045 #define CONTAINER_HASH_KEY SD_ID128_MAKE(c3,c4,f9,19,b5,57,b2,1c,e6,cf,14,27,03,9c,ee,a2)
2046 #define MACVLAN_HASH_KEY SD_ID128_MAKE(00,13,6d,bc,66,83,44,81,bb,0c,f9,51,1f,24,a6,6f)
/* Derives a stable MAC address for this container.
 *
 * A buffer v is filled with the host machine ID followed by the bytes of
 * arg_machine (and idx, copied after the name), hashed with siphash24()
 * under hash_key, and the first ETH_ALEN bytes of the result become the
 * address. The multicast bit is then cleared and the locally-administered
 * bit set.
 *
 * NOTE(review): the allocation of v, the declaration of result/l/sz/i and
 * the condition under which idx is appended are not visible in this
 * listing; sz as computed on the visible line does not include idx. */
2048 static int generate_mac(struct ether_addr *mac, sd_id128_t hash_key, uint64_t idx) {
2054 l = strlen(arg_machine);
2055 sz = sizeof(sd_id128_t) + l;
2061 /* fetch some persistent data unique to the host */
2062 r = sd_id128_get_machine((sd_id128_t*) v);
2066 /* combine with some data unique (on this host) to this
2067 * container instance */
2068 i = mempcpy(v + sizeof(sd_id128_t), arg_machine, l);
2071 memcpy(i, &idx, sizeof(idx));
2074 /* Let's hash the host machine ID plus the container name. We
2075 * use a fixed, but originally randomly created hash key here. */
2076 siphash24(result, v, sz, hash_key.bytes);
2078 assert_cc(ETH_ALEN <= sizeof(result));
2079 memcpy(mac->ether_addr_octet, result, ETH_ALEN);
2081 /* see eth_random_addr in the kernel */
2082 mac->ether_addr_octet[0] &= 0xfe; /* clear multicast bit */
2083 mac->ether_addr_octet[0] |= 0x02; /* set local assignment bit (IEEE802) */
/* Creates a veth pair through rtnetlink (RTM_NEWLINK).
 *
 * The local end is named "vb-<machine>" when arg_network_bridge is set and
 * "ve-<machine>" otherwise (formatted with a size limit of IFNAMSIZ - 1 and
 * written into iface_name); the peer is named "host0" and is placed into the
 * network namespace of pid via IFLA_NET_NS_PID. Both ends get MAC addresses
 * from generate_mac() with HOST_HASH_KEY / CONTAINER_HASH_KEY. Afterwards
 * the local interface name is resolved with if_nametoindex().
 *
 * NOTE(review): the early returns for !arg_private_network and
 * !arg_network_veth, the error checks and the store to *ifi are not visible
 * in this listing. */
2088 static int setup_veth(pid_t pid, char iface_name[IFNAMSIZ], int *ifi) {
2089 _cleanup_rtnl_message_unref_ sd_rtnl_message *m = NULL;
2090 _cleanup_rtnl_unref_ sd_rtnl *rtnl = NULL;
2091 struct ether_addr mac_host, mac_container;
2094 if (!arg_private_network)
2097 if (!arg_network_veth)
2100 /* Use two different interface name prefixes depending whether
2101 * we are in bridge mode or not. */
2102 snprintf(iface_name, IFNAMSIZ - 1, "%s-%s",
2103 arg_network_bridge ? "vb" : "ve", arg_machine);
2105 r = generate_mac(&mac_container, CONTAINER_HASH_KEY, 0);
2107 return log_error_errno(r, "Failed to generate predictable MAC address for container side: %m");
2109 r = generate_mac(&mac_host, HOST_HASH_KEY, 0);
2111 return log_error_errno(r, "Failed to generate predictable MAC address for host side: %m");
2113 r = sd_rtnl_open(&rtnl, 0);
2115 return log_error_errno(r, "Failed to connect to netlink: %m");
2117 r = sd_rtnl_message_new_link(rtnl, &m, RTM_NEWLINK, 0);
2119 return log_error_errno(r, "Failed to allocate netlink message: %m");
/* Attributes of the local end of the pair. */
2121 r = sd_rtnl_message_append_string(m, IFLA_IFNAME, iface_name);
2123 return log_error_errno(r, "Failed to add netlink interface name: %m");
2125 r = sd_rtnl_message_append_ether_addr(m, IFLA_ADDRESS, &mac_host);
2127 return log_error_errno(r, "Failed to add netlink MAC address: %m");
/* Nested containers: IFLA_LINKINFO > IFLA_INFO_DATA("veth") > VETH_INFO_PEER. */
2129 r = sd_rtnl_message_open_container(m, IFLA_LINKINFO);
2131 return log_error_errno(r, "Failed to open netlink container: %m");
2133 r = sd_rtnl_message_open_container_union(m, IFLA_INFO_DATA, "veth");
2135 return log_error_errno(r, "Failed to open netlink container: %m");
2137 r = sd_rtnl_message_open_container(m, VETH_INFO_PEER);
2139 return log_error_errno(r, "Failed to open netlink container: %m");
/* Attributes of the peer end, which goes into the namespace of pid. */
2141 r = sd_rtnl_message_append_string(m, IFLA_IFNAME, "host0");
2143 return log_error_errno(r, "Failed to add netlink interface name: %m");
2145 r = sd_rtnl_message_append_ether_addr(m, IFLA_ADDRESS, &mac_container);
2147 return log_error_errno(r, "Failed to add netlink MAC address: %m");
2149 r = sd_rtnl_message_append_u32(m, IFLA_NET_NS_PID, pid);
2151 return log_error_errno(r, "Failed to add netlink namespace field: %m");
/* Close the three containers opened above. */
2153 r = sd_rtnl_message_close_container(m);
2155 return log_error_errno(r, "Failed to close netlink container: %m");
2157 r = sd_rtnl_message_close_container(m);
2159 return log_error_errno(r, "Failed to close netlink container: %m");
2161 r = sd_rtnl_message_close_container(m);
2163 return log_error_errno(r, "Failed to close netlink container: %m");
2165 r = sd_rtnl_call(rtnl, m, 0, NULL);
2167 return log_error_errno(r, "Failed to add new veth interfaces: %m");
2169 i = (int) if_nametoindex(iface_name);
2171 return log_error_errno(errno, "Failed to resolve interface %s: %m", iface_name);
/* Attaches the interface veth_name to the bridge named by
 * arg_network_bridge: resolves the bridge to an index with
 * if_nametoindex() and sends an RTM_SETLINK message that sets IFF_UP,
 * names the interface (IFLA_IFNAME) and sets IFLA_MASTER to that index.
 *
 * NOTE(review): the bodies of the three guards, the error checks and the
 * store to *ifi are not visible in this listing. */
2178 static int setup_bridge(const char veth_name[], int *ifi) {
2179 _cleanup_rtnl_message_unref_ sd_rtnl_message *m = NULL;
2180 _cleanup_rtnl_unref_ sd_rtnl *rtnl = NULL;
2183 if (!arg_private_network)
2186 if (!arg_network_veth)
2189 if (!arg_network_bridge)
2192 bridge = (int) if_nametoindex(arg_network_bridge);
2194 return log_error_errno(errno, "Failed to resolve interface %s: %m", arg_network_bridge);
2198 r = sd_rtnl_open(&rtnl, 0);
2200 return log_error_errno(r, "Failed to connect to netlink: %m");
2202 r = sd_rtnl_message_new_link(rtnl, &m, RTM_SETLINK, 0);
2204 return log_error_errno(r, "Failed to allocate netlink message: %m");
2206 r = sd_rtnl_message_link_set_flags(m, IFF_UP, IFF_UP);
2208 return log_error_errno(r, "Failed to set IFF_UP flag: %m");
2210 r = sd_rtnl_message_append_string(m, IFLA_IFNAME, veth_name);
2212 return log_error_errno(r, "Failed to add netlink interface name field: %m");
2214 r = sd_rtnl_message_append_u32(m, IFLA_MASTER, bridge);
2216 return log_error_errno(r, "Failed to add netlink master field: %m");
2218 r = sd_rtnl_call(rtnl, m, 0, NULL);
2220 return log_error_errno(r, "Failed to add veth interface to bridge: %m");
/* Resolves a network interface name to its index and checks with udev that
 * the interface is initialized. The udev device is looked up by the device
 * id "n<ifindex>".
 *
 * NOTE(review): the return statements are not visible in this listing;
 * callers assign the result to a variable named ifi, so it presumably
 * returns the interface index on success -- confirm. */
2225 static int parse_interface(struct udev *udev, const char *name) {
2226 _cleanup_udev_device_unref_ struct udev_device *d = NULL;
2227 char ifi_str[2 + DECIMAL_STR_MAX(int)];
2230 ifi = (int) if_nametoindex(name);
2232 return log_error_errno(errno, "Failed to resolve interface %s: %m", name);
2234 sprintf(ifi_str, "n%i", ifi);
2235 d = udev_device_new_from_device_id(udev, ifi_str);
2237 return log_error_errno(errno, "Failed to get udev device for interface %s: %m", name);
2239 if (udev_device_get_is_initialized(d) <= 0) {
2240 log_error("Network interface %s is not initialized yet.", name);
/* Moves every interface listed in arg_network_interfaces into the network
 * namespace of pid: each name is resolved with parse_interface() and an
 * RTM_SETLINK message carrying IFLA_NET_NS_PID is sent for its index.
 *
 * NOTE(review): the creation of the udev context, the guard bodies and the
 * error checks are not visible in this listing. */
2247 static int move_network_interfaces(pid_t pid) {
2248 _cleanup_udev_unref_ struct udev *udev = NULL;
2249 _cleanup_rtnl_unref_ sd_rtnl *rtnl = NULL;
2253 if (!arg_private_network)
2256 if (strv_isempty(arg_network_interfaces))
2259 r = sd_rtnl_open(&rtnl, 0);
2261 return log_error_errno(r, "Failed to connect to netlink: %m");
2265 log_error("Failed to connect to udev.");
2269 STRV_FOREACH(i, arg_network_interfaces) {
2270 _cleanup_rtnl_message_unref_ sd_rtnl_message *m = NULL;
2273 ifi = parse_interface(udev, *i);
2277 r = sd_rtnl_message_new_link(rtnl, &m, RTM_SETLINK, ifi);
2279 return log_error_errno(r, "Failed to allocate netlink message: %m");
2281 r = sd_rtnl_message_append_u32(m, IFLA_NET_NS_PID, pid);
2283 return log_error_errno(r, "Failed to append namespace PID to netlink message: %m");
2285 r = sd_rtnl_call(rtnl, m, 0, NULL);
2287 return log_error_errno(r, "Failed to move interface %s to namespace: %m", *i);
/* For every interface in arg_network_macvlan, creates a macvlan link on top
 * of it (IFLA_LINK = parent index) in MACVLAN_MODE_BRIDGE and places it in
 * the network namespace of pid. The new link is named "mv-<parent>",
 * shortened with strshorten() to IFNAMSIZ-1, and gets a MAC address from
 * generate_mac() with MACVLAN_HASH_KEY and a counter that is incremented
 * per interface.
 *
 * NOTE(review): the creation of the udev context, the declaration of idx,
 * the guard bodies and the error checks are not visible in this listing. */
2293 static int setup_macvlan(pid_t pid) {
2294 _cleanup_udev_unref_ struct udev *udev = NULL;
2295 _cleanup_rtnl_unref_ sd_rtnl *rtnl = NULL;
2300 if (!arg_private_network)
2303 if (strv_isempty(arg_network_macvlan))
2306 r = sd_rtnl_open(&rtnl, 0);
2308 return log_error_errno(r, "Failed to connect to netlink: %m");
2312 log_error("Failed to connect to udev.");
2316 STRV_FOREACH(i, arg_network_macvlan) {
2317 _cleanup_rtnl_message_unref_ sd_rtnl_message *m = NULL;
2318 _cleanup_free_ char *n = NULL;
2319 struct ether_addr mac;
2322 ifi = parse_interface(udev, *i);
2326 r = generate_mac(&mac, MACVLAN_HASH_KEY, idx++);
2328 return log_error_errno(r, "Failed to create MACVLAN MAC address: %m");
2330 r = sd_rtnl_message_new_link(rtnl, &m, RTM_NEWLINK, 0);
2332 return log_error_errno(r, "Failed to allocate netlink message: %m");
2334 r = sd_rtnl_message_append_u32(m, IFLA_LINK, ifi);
2336 return log_error_errno(r, "Failed to add netlink interface index: %m");
/* Derive the new link name from the parent name and cap its length. */
2338 n = strappend("mv-", *i);
2342 strshorten(n, IFNAMSIZ-1);
2344 r = sd_rtnl_message_append_string(m, IFLA_IFNAME, n);
2346 return log_error_errno(r, "Failed to add netlink interface name: %m");
2348 r = sd_rtnl_message_append_ether_addr(m, IFLA_ADDRESS, &mac);
2350 return log_error_errno(r, "Failed to add netlink MAC address: %m");
2352 r = sd_rtnl_message_append_u32(m, IFLA_NET_NS_PID, pid);
2354 return log_error_errno(r, "Failed to add netlink namespace field: %m");
2356 r = sd_rtnl_message_open_container(m, IFLA_LINKINFO);
2358 return log_error_errno(r, "Failed to open netlink container: %m");
2360 r = sd_rtnl_message_open_container_union(m, IFLA_INFO_DATA, "macvlan");
2362 return log_error_errno(r, "Failed to open netlink container: %m");
2364 r = sd_rtnl_message_append_u32(m, IFLA_MACVLAN_MODE, MACVLAN_MODE_BRIDGE);
2366 return log_error_errno(r, "Failed to append macvlan mode: %m");
2368 r = sd_rtnl_message_close_container(m);
2370 return log_error_errno(r, "Failed to close netlink container: %m");
2372 r = sd_rtnl_message_close_container(m);
2374 return log_error_errno(r, "Failed to close netlink container: %m");
2376 r = sd_rtnl_call(rtnl, m, 0, NULL);
2378 return log_error_errno(r, "Failed to add new macvlan interfaces: %m");
/* Installs a seccomp filter whose default action is SCMP_ACT_ALLOW.
 *
 * Every syscall in the blacklist table gets a rule returning EPERM; an
 * extra rule returns EAFNOSUPPORT when argument 0 equals AF_NETLINK and
 * argument 2 equals NETLINK_AUDIT (the text further down explains this as
 * disabling audit sockets). SCMP_FLTATR_CTL_NNP is set to 0 before the
 * filter is loaded, and the filter context is released at the end.
 *
 * NOTE(review): this listing has lost the comment delimiters around the
 * "Audit is broken..." explanation, several blacklist entries, the syscall
 * argument of the audit rule, the error-handling jumps and any conditional
 * compilation around the function body. */
2384 static int setup_seccomp(void) {
2387 static const int blacklist[] = {
2388 SCMP_SYS(kexec_load),
2389 SCMP_SYS(open_by_handle_at),
2390 SCMP_SYS(init_module),
2391 SCMP_SYS(finit_module),
2392 SCMP_SYS(delete_module),
2399 scmp_filter_ctx seccomp;
2403 seccomp = seccomp_init(SCMP_ACT_ALLOW);
2407 r = seccomp_add_secondary_archs(seccomp);
2409 log_error_errno(r, "Failed to add secondary archs to seccomp filter: %m");
2413 for (i = 0; i < ELEMENTSOF(blacklist); i++) {
2414 r = seccomp_rule_add(seccomp, SCMP_ACT_ERRNO(EPERM), blacklist[i], 0);
2416 continue; /* unknown syscall */
2418 log_error_errno(r, "Failed to block syscall: %m");
2424 Audit is broken in containers, much of the userspace audit
2425 hookup will fail if running inside a container. We don't
2426 care and just turn off creation of audit sockets.
2428 This will make socket(AF_NETLINK, *, NETLINK_AUDIT) fail
2429 with EAFNOSUPPORT which audit userspace uses as indication
2430 that audit is disabled in the kernel.
2433 r = seccomp_rule_add(
2435 SCMP_ACT_ERRNO(EAFNOSUPPORT),
2438 SCMP_A0(SCMP_CMP_EQ, AF_NETLINK),
2439 SCMP_A2(SCMP_CMP_EQ, NETLINK_AUDIT));
2441 log_error_errno(r, "Failed to add audit seccomp rule: %m");
2445 r = seccomp_attr_set(seccomp, SCMP_FLTATR_CTL_NNP, 0);
2447 log_error_errno(r, "Failed to unset NO_NEW_PRIVS: %m");
2451 r = seccomp_load(seccomp);
2453 log_error_errno(r, "Failed to install seccomp audit filter: %m");
2456 seccomp_release(seccomp);
/* Prepares the mount propagation directory for this container: creates
 * /run/systemd/nspawn/propagate/<arg_machine> (errors ignored), bind mounts
 * it onto <root>/run/systemd/nspawn/incoming and then remounts that bind
 * mount read-only.
 *
 * NOTE(review): the propagate directories are created with mode 0600, i.e.
 * without any search (x) bit -- confirm this is intended. Both error
 * messages pass errno but contain no %m, so the error text is presumably
 * not shown. The creation of q itself is not visible in this listing. */
2464 static int setup_propagate(const char *root) {
2467 (void) mkdir_p("/run/systemd/nspawn/", 0755);
2468 (void) mkdir_p("/run/systemd/nspawn/propagate", 0600);
2469 p = strappenda("/run/systemd/nspawn/propagate/", arg_machine);
2470 (void) mkdir_p(p, 0600);
2472 q = strappenda(root, "/run/systemd/nspawn/incoming");
2473 mkdir_parents(q, 0755);
2476 if (mount(p, q, NULL, MS_BIND, NULL) < 0)
2477 return log_error_errno(errno, "Failed to install propagation bind mount.");
2479 if (mount(NULL, q, NULL, MS_BIND|MS_REMOUNT|MS_RDONLY, NULL) < 0)
2480 return log_error_errno(errno, "Failed to make propagation mount read-only");
/* Opens arg_image (read-only when arg_read_only is set) and makes it
 * available as a block device.
 *
 * If the image already is a block device its path is duplicated. A regular
 * file is attached to a free loop device obtained from /dev/loop-control
 * (LOOP_CTL_GET_FREE, LOOP_SET_FD, LOOP_SET_STATUS64) with the flags
 * LO_FLAGS_AUTOCLEAR|LO_FLAGS_PARTSCAN, plus LO_FLAGS_READ_ONLY on a path
 * whose condition is not visible here. *device_path receives the loop
 * device path.
 *
 * NOTE(review): the "is not a regular file or block device" message logs
 * errno with %m although no failing call is visible before it, so the
 * reported error is presumably unrelated. What is stored through loop_nr
 * and what is returned (presumably an fd) is not visible in this listing. */
2485 static int setup_image(char **device_path, int *loop_nr) {
2486 struct loop_info64 info = {
2487 .lo_flags = LO_FLAGS_AUTOCLEAR|LO_FLAGS_PARTSCAN
2489 _cleanup_close_ int fd = -1, control = -1, loop = -1;
2490 _cleanup_free_ char* loopdev = NULL;
2494 assert(device_path);
2498 fd = open(arg_image, O_CLOEXEC|(arg_read_only ? O_RDONLY : O_RDWR)|O_NONBLOCK|O_NOCTTY);
2500 return log_error_errno(errno, "Failed to open %s: %m", arg_image);
2502 if (fstat(fd, &st) < 0)
2503 return log_error_errno(errno, "Failed to stat %s: %m", arg_image);
/* Block device: use it directly, no loop device needed. */
2505 if (S_ISBLK(st.st_mode)) {
2508 p = strdup(arg_image);
2522 if (!S_ISREG(st.st_mode)) {
2523 log_error_errno(errno, "%s is not a regular file or block device: %m", arg_image);
/* Regular file: allocate a free loop device and attach the image to it. */
2527 control = open("/dev/loop-control", O_RDWR|O_CLOEXEC|O_NOCTTY|O_NONBLOCK);
2529 return log_error_errno(errno, "Failed to open /dev/loop-control: %m");
2531 nr = ioctl(control, LOOP_CTL_GET_FREE);
2533 return log_error_errno(errno, "Failed to allocate loop device: %m");
2535 if (asprintf(&loopdev, "/dev/loop%i", nr) < 0)
2538 loop = open(loopdev, O_CLOEXEC|(arg_read_only ? O_RDONLY : O_RDWR)|O_NONBLOCK|O_NOCTTY);
2540 return log_error_errno(errno, "Failed to open loop device %s: %m", loopdev);
2542 if (ioctl(loop, LOOP_SET_FD, fd) < 0)
2543 return log_error_errno(errno, "Failed to set loopback file descriptor on %s: %m", loopdev);
2546 info.lo_flags |= LO_FLAGS_READ_ONLY;
2548 if (ioctl(loop, LOOP_SET_STATUS64, &info) < 0)
2549 return log_error_errno(errno, "Failed to set loopback settings on %s: %m", loopdev);
2551 *device_path = loopdev;
/* Waits until udev reports the block device devnum as initialized.
 *
 * The device is looked up with udev_device_new_from_devnum() and its
 * initialization state queried; a udev netlink monitor filtered on the
 * "block" subsystem is set up, polled with an infinite timeout, and devices
 * are received from it.
 *
 * NOTE(review): after udev_monitor_get_fd() the error log passes r, which
 * still holds the result of an earlier call, instead of the fd value just
 * obtained -- confirm and consider logging pfd.fd. The loop structure, how
 * received devices are matched against devnum and how *ret is filled are
 * not visible in this listing. */
2562 static int wait_for_block_device(struct udev *udev, dev_t devnum, struct udev_device **ret) {
2563 _cleanup_udev_monitor_unref_ struct udev_monitor *monitor = NULL;
2570 _cleanup_udev_device_unref_ struct udev_device *d = NULL;
2571 struct pollfd pfd = {
2575 d = udev_device_new_from_devnum(udev, 'b', devnum);
2579 r = udev_device_get_is_initialized(d);
2581 return log_error_errno(r, "Failed to check if device is initialized: %m");
2587 d = udev_device_unref(d);
2590 monitor = udev_monitor_new_from_netlink(udev, "udev");
2594 r = udev_monitor_filter_add_match_subsystem_devtype(monitor, "block", NULL);
2596 return log_error_errno(r, "Failed to add block match: %m");
2598 r = udev_monitor_enable_receiving(monitor);
2600 return log_error_errno(r, "Failed to turn on monitor: %m");
2605 pfd.fd = udev_monitor_get_fd(monitor);
2607 return log_error_errno(r, "Failed to get udev monitor fd: %m");
2609 r = poll(&pfd, 1, -1);
2611 return log_error_errno(errno, "Failed to wait for device initialization: %m");
2613 d = udev_monitor_receive_device(monitor);
/* Explanatory text appended (by string-literal concatenation) to the
 * image-dissection error messages. It contains no printf conversions, so it
 * is safe to splice onto a format string. Adjacent literals are joined
 * without any separator, hence the explicit trailing space after "type". */
#define PARTITION_TABLE_BLURB \
        "Note that the disk image needs to either contain only a single MBR partition of\n" \
        "type 0x83 that is marked bootable, or a single GPT partition of type " \
        "0FC63DAF-8483-4772-8E79-3D69D8477DE4 or follow\n" \
        " http://www.freedesktop.org/wiki/Specifications/DiscoverablePartitionsSpec/\n" \
        "to be bootable with systemd-nspawn."
/* Inspects the partition table of the image behind fd and picks the block
 * devices to use as root, /home and /srv.
 *
 * libblkid is used to probe the partition table (only "gpt" and "dos" are
 * accepted) and udev to enumerate the partition devices below the image
 * device. On GPT, partitions flagged GPT_FLAG_NO_AUTO are skipped and the
 * rest is classified by type UUID (home, srv, native root, secondary root,
 * generic Linux); for each class the visible comparisons skip a candidate
 * whose partition number is not lower than the one already recorded. On
 * MBR, only partitions with flags 0x80 and type 0x83 are considered.
 * Root selection prefers root, then secondary_root, then a generic
 * partition, and fails if several generic candidates were seen. The chosen
 * device strings and their read-write flags are handed back through the
 * output parameters.
 *
 * NOTE(review): many short lines (parameter "fd", error checks, braces,
 * continue statements, NULL-ing of transferred pointers, preprocessor
 * #endif/#else lines) are missing from this listing, so ownership transfer
 * and the exact loop control flow cannot be confirmed here. */
2626 static int dissect_image(
2628 char **root_device, bool *root_device_rw,
2629 char **home_device, bool *home_device_rw,
2630 char **srv_device, bool *srv_device_rw,
2634 int home_nr = -1, srv_nr = -1;
2635 #ifdef GPT_ROOT_NATIVE
2638 #ifdef GPT_ROOT_SECONDARY
2639 int secondary_root_nr = -1;
2641 _cleanup_free_ char *home = NULL, *root = NULL, *secondary_root = NULL, *srv = NULL, *generic = NULL;
2642 _cleanup_udev_enumerate_unref_ struct udev_enumerate *e = NULL;
2643 _cleanup_udev_device_unref_ struct udev_device *d = NULL;
2644 _cleanup_blkid_free_probe_ blkid_probe b = NULL;
2645 _cleanup_udev_unref_ struct udev *udev = NULL;
2646 struct udev_list_entry *first, *item;
2647 bool home_rw = true, root_rw = true, secondary_root_rw = true, srv_rw = true, generic_rw = true;
2648 const char *pttype = NULL;
2652 bool is_gpt, is_mbr, multiple_generic = false;
2655 assert(root_device);
2656 assert(home_device);
2661 b = blkid_new_probe();
2666 r = blkid_probe_set_device(b, fd, 0, 0);
2671 log_error_errno(errno, "Failed to set device on blkid probe: %m");
2675 blkid_probe_enable_partitions(b, 1);
2676 blkid_probe_set_partitions_flags(b, BLKID_PARTS_ENTRY_DETAILS);
/* -2 and 1 are both reported as "no partition table identified". */
2679 r = blkid_do_safeprobe(b);
2680 if (r == -2 || r == 1) {
2681 log_error("Failed to identify any partition table on\n"
2683 PARTITION_TABLE_BLURB, arg_image);
2685 } else if (r != 0) {
2688 log_error_errno(errno, "Failed to probe: %m");
2692 blkid_probe_lookup_value(b, "PTTYPE", &pttype, NULL);
2694 is_gpt = streq_ptr(pttype, "gpt");
2695 is_mbr = streq_ptr(pttype, "dos");
2697 if (!is_gpt && !is_mbr) {
2698 log_error("No GPT or MBR partition table discovered on\n"
2700 PARTITION_TABLE_BLURB, arg_image);
2705 pl = blkid_probe_get_partitions(b);
2710 log_error("Failed to list partitions of %s", arg_image);
2718 if (fstat(fd, &st) < 0)
2719 return log_error_errno(errno, "Failed to stat block device: %m");
2721 r = wait_for_block_device(udev, st.st_rdev, &d);
/* Enumerate all udev devices whose parent is the image device. */
2725 e = udev_enumerate_new(udev);
2729 r = udev_enumerate_add_match_parent(e, d);
2733 r = udev_enumerate_scan_devices(e);
2735 return log_error_errno(r, "Failed to scan for partition devices of %s: %m", arg_image);
2737 first = udev_enumerate_get_list_entry(e);
2738 udev_list_entry_foreach(item, first) {
2739 _cleanup_udev_device_unref_ struct udev_device *q;
2741 unsigned long long flags;
2747 q = udev_device_new_from_syspath(udev, udev_list_entry_get_name(item));
2752 log_error_errno(errno, "Failed to get partition device of %s: %m", arg_image);
2756 qn = udev_device_get_devnum(q);
/* The enumeration also yields the whole-disk device itself. */
2760 if (st.st_rdev == qn)
2763 node = udev_device_get_devnode(q);
2767 pp = blkid_partlist_devno_to_partition(pl, qn);
2771 flags = blkid_partition_get_flags(pp);
2773 nr = blkid_partition_get_partno(pp);
2781 if (flags & GPT_FLAG_NO_AUTO)
2784 stype = blkid_partition_get_type_string(pp);
2788 if (sd_id128_from_string(stype, &type_id) < 0)
2791 if (sd_id128_equal(type_id, GPT_HOME)) {
2793 if (home && nr >= home_nr)
2797 home_rw = !(flags & GPT_FLAG_READ_ONLY);
2799 r = free_and_strdup(&home, node);
2803 } else if (sd_id128_equal(type_id, GPT_SRV)) {
2805 if (srv && nr >= srv_nr)
2809 srv_rw = !(flags & GPT_FLAG_READ_ONLY);
2811 r = free_and_strdup(&srv, node);
2815 #ifdef GPT_ROOT_NATIVE
2816 else if (sd_id128_equal(type_id, GPT_ROOT_NATIVE)) {
2818 if (root && nr >= root_nr)
2822 root_rw = !(flags & GPT_FLAG_READ_ONLY);
2824 r = free_and_strdup(&root, node);
2829 #ifdef GPT_ROOT_SECONDARY
2830 else if (sd_id128_equal(type_id, GPT_ROOT_SECONDARY)) {
2832 if (secondary_root && nr >= secondary_root_nr)
2835 secondary_root_nr = nr;
2836 secondary_root_rw = !(flags & GPT_FLAG_READ_ONLY);
2838 r = free_and_strdup(&secondary_root, node);
2843 else if (sd_id128_equal(type_id, GPT_LINUX_GENERIC)) {
2846 multiple_generic = true;
2848 generic_rw = !(flags & GPT_FLAG_READ_ONLY);
2850 r = free_and_strdup(&generic, node);
2856 } else if (is_mbr) {
2859 if (flags != 0x80) /* Bootable flag */
2862 type = blkid_partition_get_type(pp);
2863 if (type != 0x83) /* Linux partition */
2867 multiple_generic = true;
2871 r = free_and_strdup(&root, node);
/* Hand the results back: root first, then the fallbacks. */
2879 *root_device = root;
2882 *root_device_rw = root_rw;
2884 } else if (secondary_root) {
2885 *root_device = secondary_root;
2886 secondary_root = NULL;
2888 *root_device_rw = secondary_root_rw;
2890 } else if (generic) {
2892 /* There were no partitions with precise meanings
2893 * around, but we found generic partitions. In this
2894 * case, if there's only one, we can go ahead and boot
2895 * it, otherwise we bail out, because we really cannot
2896 * make any sense of it. */
2898 if (multiple_generic) {
2899 log_error("Identified multiple bootable Linux partitions on\n"
2901 PARTITION_TABLE_BLURB, arg_image);
2905 *root_device = generic;
2908 *root_device_rw = generic_rw;
2911 log_error("Failed to identify root partition in disk image\n"
2913 PARTITION_TABLE_BLURB, arg_image);
2918 *home_device = home;
2921 *home_device_rw = home_rw;
2928 *srv_device_rw = srv_rw;
2933 log_error("--image= is not supported, compiled without blkid support.");
/* Mounts the block device "what" at the path formed by concatenating
 * "where" and "directory". The file system type is detected with a libblkid
 * superblock probe; "crypto_LUKS" is rejected. The mount uses MS_NODEV and
 * additionally MS_RDONLY when rw is false.
 *
 * NOTE(review): here the "cannot determine" branch tests the safeprobe
 * result against -1 and 1, while dissect_image() tests -2 and 1. libblkid
 * documents -2 as "ambivalent result" and -1 as an error, so the -1 here
 * may be a typo for -2 -- confirm against the libblkid reference.
 * NOTE(review): mount_devices() passes NULL as directory for the root
 * device; the handling of that case, the return values and the conditional
 * compilation around the blkid code are not visible in this listing. */
2938 static int mount_device(const char *what, const char *where, const char *directory, bool rw) {
2940 _cleanup_blkid_free_probe_ blkid_probe b = NULL;
2941 const char *fstype, *p;
2951 p = strappenda(where, directory);
2956 b = blkid_new_probe_from_filename(what);
2960 log_error_errno(errno, "Failed to allocate prober for %s: %m", what);
2964 blkid_probe_enable_superblocks(b, 1);
2965 blkid_probe_set_superblocks_flags(b, BLKID_SUBLKS_TYPE);
2968 r = blkid_do_safeprobe(b);
2969 if (r == -1 || r == 1) {
2970 log_error("Cannot determine file system type of %s", what);
2972 } else if (r != 0) {
2975 log_error_errno(errno, "Failed to probe %s: %m", what);
2980 if (blkid_probe_lookup_value(b, "TYPE", &fstype, NULL) < 0) {
2983 log_error("Failed to determine file system type of %s", what);
2987 if (streq(fstype, "crypto_LUKS")) {
2988 log_error("nspawn currently does not support LUKS disk images.");
2992 if (mount(what, p, fstype, MS_NODEV|(rw ? 0 : MS_RDONLY), NULL) < 0)
2993 return log_error_errno(errno, "Failed to mount %s: %m", what);
2997 log_error("--image= is not supported, compiled without blkid support.");
/* Mounts the dissected devices below arg_directory using mount_device():
 * the root device at arg_directory itself (directory argument NULL), the
 * home device at "/home" and the srv device at "/srv", each with its own
 * read-write flag.
 *
 * NOTE(review): the conditions guarding each mount (presumably "device is
 * set"), the error checks and the final return are not visible in this
 * listing. */
3002 static int mount_devices(
3004 const char *root_device, bool root_device_rw,
3005 const char *home_device, bool home_device_rw,
3006 const char *srv_device, bool srv_device_rw) {
3012 r = mount_device(root_device, arg_directory, NULL, root_device_rw);
3014 return log_error_errno(r, "Failed to mount root directory: %m");
3018 r = mount_device(home_device, arg_directory, "/home", home_device_rw);
3020 return log_error_errno(r, "Failed to mount home directory: %m");
3024 r = mount_device(srv_device, arg_directory, "/srv", srv_device_rw);
3026 return log_error_errno(r, "Failed to mount server data directory: %m");
/* Best-effort teardown of loop device number nr. If *image_fd is a valid
 * descriptor, LOOP_CLR_FD is issued on it and it is closed (and *image_fd
 * overwritten with the result of safe_close()); then LOOP_CTL_REMOVE is
 * issued on /dev/loop-control. All failures are only logged.
 *
 * NOTE(review): LOOP_CLR_FD is issued on *image_fd, so that descriptor
 * presumably refers to the loop device rather than the backing file --
 * confirm against the caller. The early return for an invalid nr and after
 * the failed open is not visible in this listing. */
3032 static void loop_remove(int nr, int *image_fd) {
3033 _cleanup_close_ int control = -1;
3039 if (image_fd && *image_fd >= 0) {
3040 r = ioctl(*image_fd, LOOP_CLR_FD);
3042 log_debug_errno(errno, "Failed to close loop image: %m");
3043 *image_fd = safe_close(*image_fd);
3046 control = open("/dev/loop-control", O_RDWR|O_CLOEXEC|O_NOCTTY|O_NONBLOCK);
3048 log_warning_errno(errno, "Failed to open /dev/loop-control: %m");
3052 r = ioctl(control, LOOP_CTL_REMOVE, nr);
3054 log_debug_errno(errno, "Failed to remove loop %d: %m", nr);
/* Fork a child that runs "getent <database> <key>" with its stdout connected
 * to a pipe. change_uid_gid() passes the return value to fdopen(), so on
 * success it is presumably the read end of that pipe; rpid presumably
 * receives the child PID (the assignments are on elided lines - confirm).
 * NOTE(review): some lines (declarations, the fork call, returns, braces)
 * are elided from this listing. */
3057 static int spawn_getent(const char *database, const char *key, pid_t *rpid) {
3065 if (pipe2(pipe_fds, O_CLOEXEC) < 0)
3066 return log_error_errno(errno, "Failed to allocate pipe: %m");
3070 return log_error_errno(errno, "Failed to fork getent child: %m");
3071 else if (pid == 0) {
/* Child: a pointer to this NULL entry is handed to execle() as the
 * environment, i.e. getent runs with an empty environment. */
3073 char *empty_env = NULL;
/* Route stdout into the write end of the pipe. */
3075 if (dup3(pipe_fds[1], STDOUT_FILENO, 0) < 0)
3076 _exit(EXIT_FAILURE);
/* Close the original pipe descriptors unless they already are one of the
 * standard descriptors 0..2. */
3078 if (pipe_fds[0] > 2)
3079 safe_close(pipe_fds[0]);
3080 if (pipe_fds[1] > 2)
3081 safe_close(pipe_fds[1]);
/* stdin and stderr are connected to /dev/null. */
3083 nullfd = open("/dev/null", O_RDWR);
3085 _exit(EXIT_FAILURE);
3087 if (dup3(nullfd, STDIN_FILENO, 0) < 0)
3088 _exit(EXIT_FAILURE);
3090 if (dup3(nullfd, STDERR_FILENO, 0) < 0)
3091 _exit(EXIT_FAILURE);
3096 reset_all_signal_handlers();
3097 close_all_fds(NULL, 0);
/* Try /usr/bin first, then /bin; execle() only returns on failure. */
3099 execle("/usr/bin/getent", "getent", database, key, NULL, &empty_env);
3100 execle("/bin/getent", "getent", database, key, NULL, &empty_env);
3101 _exit(EXIT_FAILURE);
/* Parent: the write end is not needed here. */
3104 pipe_fds[1] = safe_close(pipe_fds[1]);
/* Switch the calling process to the credentials of arg_user. For an unset
 * user, "root" or "0" everything is reset to 0. Otherwise the passwd entry
 * and the group list are obtained by running getent (see spawn_getent()),
 * the home directory is created, the standard descriptors are chown()ed to
 * the user, and finally groups, GID and UID are set.
 * _home presumably receives the home directory path on success (the
 * assignment is on an elided line - confirm).
 * NOTE(review): many lines (declarations, conditions, returns, braces) are
 * elided from this listing; comments describe only visible statements. */
3111 static int change_uid_gid(char **_home) {
3112 char line[LINE_MAX], *x, *u, *g, *h;
3113 const char *word, *state;
3114 _cleanup_free_ uid_t *uids = NULL;
3115 _cleanup_free_ char *home = NULL;
3116 _cleanup_fclose_ FILE *f = NULL;
3117 _cleanup_close_ int fd = -1;
3118 unsigned n_uids = 0;
3127 if (!arg_user || streq(arg_user, "root") || streq(arg_user, "0")) {
3128 /* Reset everything fully to 0, just in case */
3130 if (setgroups(0, NULL) < 0)
3131 return log_error_errno(errno, "setgroups() failed: %m");
/* NOTE(review): the two messages below name setregid()/setreuid() although
 * the calls are setresgid()/setresuid(). */
3133 if (setresgid(0, 0, 0) < 0)
3134 return log_error_errno(errno, "setregid() failed: %m");
3136 if (setresuid(0, 0, 0) < 0)
3137 return log_error_errno(errno, "setreuid() failed: %m");
3143 /* First, get user credentials */
3144 fd = spawn_getent("passwd", arg_user, &pid);
3148 f = fdopen(fd, "r");
/* Only the first output line of getent is read. */
3153 if (!fgets(line, sizeof(line), f)) {
3156 log_error("Failed to resolve user %s.", arg_user);
3160 log_error_errno(errno, "Failed to read from getent: %m");
3166 wait_for_terminate_and_warn("getent passwd", pid, true);
/* Walk the ':'-separated passwd fields; the error messages name the field
 * being skipped at each step. */
3168 x = strchr(line, ':');
3170 log_error("/etc/passwd entry has invalid user field.");
3174 u = strchr(x+1, ':');
3176 log_error("/etc/passwd entry has invalid password field.");
3183 log_error("/etc/passwd entry has invalid UID field.");
3191 log_error("/etc/passwd entry has invalid GID field.");
3196 h = strchr(x+1, ':');
3198 log_error("/etc/passwd entry has invalid GECOS field.");
3205 log_error("/etc/passwd entry has invalid home directory field.");
3211 r = parse_uid(u, &uid);
3213 log_error("Failed to parse UID of user.");
3217 r = parse_gid(g, &gid);
3219 log_error("Failed to parse GID of user.");
3227 /* Second, get group memberships */
3228 fd = spawn_getent("initgroups", arg_user, &pid);
3233 f = fdopen(fd, "r");
3238 if (!fgets(line, sizeof(line), f)) {
3240 log_error("Failed to resolve user %s.", arg_user);
3244 log_error_errno(errno, "Failed to read from getent: %m");
3250 wait_for_terminate_and_warn("getent initgroups", pid, true);
3252 /* Skip over the username and subsequent separator whitespace */
3254 x += strcspn(x, WHITESPACE);
3255 x += strspn(x, WHITESPACE);
/* Each remaining word is one group ID. NOTE(review): the IDs are parsed
 * with parse_uid() into a uid_t array that is later given to setgroups(). */
3257 FOREACH_WORD(word, l, x, state) {
3263 if (!GREEDY_REALLOC(uids, sz, n_uids+1))
3266 r = parse_uid(c, &uids[n_uids++]);
3268 log_error("Failed to parse group data from getent.");
/* Create the home directory (parents with mode 0775, the directory itself
 * with mode 0755 and owned by the user); an existing directory is accepted. */
3273 r = mkdir_parents(home, 0775);
3275 return log_error_errno(r, "Failed to make home root directory: %m");
3277 r = mkdir_safe(home, 0755, uid, gid);
3278 if (r < 0 && r != -EEXIST)
3279 return log_error_errno(r, "Failed to make home directory: %m");
/* Hand the standard descriptors to the user; the results are not checked. */
3281 fchown(STDIN_FILENO, uid, gid);
3282 fchown(STDOUT_FILENO, uid, gid);
3283 fchown(STDERR_FILENO, uid, gid);
/* Supplementary groups first, then GID, then UID. */
3285 if (setgroups(n_uids, uids) < 0)
3286 return log_error_errno(errno, "Failed to set auxiliary groups: %m");
/* NOTE(review): messages name setregid()/setreuid(), calls are
 * setresgid()/setresuid(). */
3288 if (setresgid(gid, gid, gid) < 0)
3289 return log_error_errno(errno, "setregid() failed: %m");
3291 if (setresuid(uid, uid, uid) < 0)
3292 return log_error_errno(errno, "setreuid() failed: %m");
3304 * < 0 : wait_for_terminate() failed to get the state of the
3305 * container, the container was terminated by a signal, or
3306 * failed for an unknown reason. No change is made to the
3307 * container argument.
3308 * > 0 : The program executed in the container terminated with an
3309 * error. The exit code of the program executed in the
3310 * container is returned. The container argument has been set
3311 * to CONTAINER_TERMINATED.
3312 * 0 : The container is being rebooted, has been shut down or exited
3313 * successfully. The container argument has been set to either
3314 * CONTAINER_TERMINATED or CONTAINER_REBOOTED.
3316 * That is, success is indicated by a return value of zero, and an
3317 * error is indicated by a non-zero value.
3319 static int wait_for_container(pid_t pid, ContainerStatus *container) {
/* Waits for the container process and classifies how it ended; the return
 * value contract is spelled out in the comment preceding this function.
 * NOTE(review): the case labels, returns and braces of the switch below are
 * on lines elided from this listing. */
3323 r = wait_for_terminate(pid, &status);
3325 return log_warning_errno(r, "Failed to wait for container: %m");
3327 switch (status.si_code) {
/* Exit path: log success or the error code, mark the container as
 * terminated and return the exit status of the payload. */
3330 if (status.si_status == 0) {
3331 log_full(arg_quiet ? LOG_DEBUG : LOG_INFO, "Container %s exited successfully.", arg_machine);
3334 log_full(arg_quiet ? LOG_DEBUG : LOG_INFO, "Container %s failed with error code %i.", arg_machine, status.si_status);
3336 *container = CONTAINER_TERMINATED;
3337 return status.si_status;
/* Signal path: SIGINT is reported as a shutdown and SIGHUP as a reboot
 * request; other signals reach the error message further down. */
3340 if (status.si_status == SIGINT) {
3342 log_full(arg_quiet ? LOG_DEBUG : LOG_INFO, "Container %s has been shut down.", arg_machine);
3343 *container = CONTAINER_TERMINATED;
3346 } else if (status.si_status == SIGHUP) {
3348 log_full(arg_quiet ? LOG_DEBUG : LOG_INFO, "Container %s is being rebooted.", arg_machine);
3349 *container = CONTAINER_REBOOTED;
3353 /* CLD_KILLED fallthrough */
3356 log_error("Container %s terminated by signal %s.", arg_machine, signal_to_string(status.si_status));
3360 log_error("Container %s failed due to unknown reason.", arg_machine);
/* Signal handler that does nothing. main() installs it for SIGCHLD so that
 * the signal interrupts blocking calls instead of being ignored. */
3367 static void nop_handler(int sig) {}
/* sd_event signal callback, registered by main() for SIGINT and SIGTERM with
 * the container PID packed into userdata. It sends SIGRTMIN+3 to that PID to
 * request a shutdown; once that succeeded the userdata is cleared, so
 * a later invocation sees PID 0.
 * NOTE(review): declarations, the condition on the PID and the return
 * statements are on lines elided from this listing; presumably a zero PID or
 * a failed kill() leads to the sd_event_exit() call below - confirm. */
3369 static int on_orderly_shutdown(sd_event_source *s, const struct signalfd_siginfo *si, void *userdata) {
3372 pid = PTR_TO_UINT32(userdata);
3374 if (kill(pid, SIGRTMIN+3) >= 0) {
3375 log_info("Trying to halt container. Send SIGTERM again to trigger immediate termination.");
3376 sd_event_source_set_userdata(s, NULL);
/* Leave the event loop with exit code 0. */
3381 sd_event_exit(sd_event_source_get_event(s), 0);
/* Fill in whichever of arg_image, arg_directory and arg_machine were not
 * given on the command line: a machine name is resolved to an image or
 * directory through image_find(), and a missing machine name is derived
 * from the image/directory path (or the host name for "/").
 * NOTE(review): several conditions, else branches, returns and braces are on
 * lines elided from this listing; comments describe visible statements. */
3385 static int determine_names(void) {
3388 if (!arg_image && !arg_directory) {
3390 _cleanup_(image_unrefp) Image *i = NULL;
3392 r = image_find(arg_machine, &i);
3394 return log_error_errno(r, "Failed to find image for machine '%s': %m", arg_machine);
/* NOTE(review): this message uses %m with plain log_error(), so it prints
 * whatever errno happens to hold rather than a result of image_find(). */
3396 log_error("No image for machine '%s': %m", arg_machine);
/* Raw images become arg_image; the other visible assignment stores the path
 * as arg_directory (the else line is elided). */
3400 if (i->type == IMAGE_RAW)
3401 r = set_sanitized_path(&arg_image, i->path);
3403 r = set_sanitized_path(&arg_directory, i->path);
3405 return log_error_errno(r, "Invalid image directory: %m");
/* A read-only image forces read-only operation. */
3407 arg_read_only = arg_read_only || i->read_only;
3409 arg_directory = get_current_dir_name();
3411 if (!arg_directory && !arg_machine) {
3412 log_error("Failed to determine path, please use -D or -i.");
/* Derive the machine name: host name when running on "/", otherwise the
 * last path component of the image or directory. */
3418 if (arg_directory && path_equal(arg_directory, "/"))
3419 arg_machine = gethostname_malloc();
3421 arg_machine = strdup(basename(arg_image ?: arg_directory));
3426 hostname_cleanup(arg_machine, false);
3427 if (!machine_name_is_valid(arg_machine)) {
3428 log_error("Failed to determine machine name automatically, please use -M.");
3432 if (arg_ephemeral) {
3435 /* Add a random suffix when this is an
3436 * ephemeral machine, so that we can run many
3437 * instances at once without manually having
3438 * to specify -M each time. */
3440 if (asprintf(&b, "%s-%016" PRIx64, arg_machine, random_u64()) < 0)
/* Entry point. Visible stages, in order:
 *  1. parse arguments, determine names, check for root and a systemd host;
 *  2. prepare the container tree: either a directory (optionally a btrfs
 *     snapshot for ephemeral mode or a copy of a template) or a disk image
 *     that is attached and dissected into root/home/srv devices;
 *  3. allocate a pseudo terminal and block the handled signals;
 *  4. clone the payload into new namespaces; the child sets up mounts,
 *     devices, environment and credentials and finally executes init, the
 *     requested command or a shell;
 *  5. the parent configures networking and registration, runs the event
 *     loop forwarding the pty, then waits for the container;
 *  6. cleanup: loop device, ephemeral subvolume, propagation directory and
 *     the argument lists are released; the exit status is EXIT_FAILURE when
 *     r is negative and ret otherwise.
 * The child and the parent synchronize through a Barrier at several points.
 * NOTE(review): this listing omits many original lines (declarations,
 * conditions, else branches, goto/labels, braces, preprocessor guards);
 * comments describe only the statements that are visible. */
3451 int main(int argc, char *argv[]) {
3453 _cleanup_free_ char *device_path = NULL, *root_device = NULL, *home_device = NULL, *srv_device = NULL, *console = NULL;
3454 bool root_device_rw = true, home_device_rw = true, srv_device_rw = true;
3455 _cleanup_close_ int master = -1, image_fd = -1;
3456 _cleanup_fdset_free_ FDSet *fds = NULL;
3457 int r, n_fd_passed, loop_nr = -1;
3458 char veth_name[IFNAMSIZ];
3459 bool secondary = false, remove_subvol = false;
3460 sigset_t mask, mask_chld;
3462 int ret = EXIT_SUCCESS;
3463 union in_addr_union exposed = {};
3464 _cleanup_release_lock_file_ LockFile tree_global_lock = LOCK_FILE_INIT, tree_local_lock = LOCK_FILE_INIT;
3466 log_parse_environment();
3469 r = parse_argv(argc, argv);
3473 r = determine_names();
3477 if (geteuid() != 0) {
3478 log_error("Need to be root.");
3483 if (sd_booted() <= 0) {
3484 log_error("Not running on a systemd system.");
/* Collect descriptors passed in via socket activation; they are announced
 * to the payload through LISTEN_FDS/LISTEN_PID further down. */
3490 n_fd_passed = sd_listen_fds(false);
3491 if (n_fd_passed > 0) {
3492 r = fdset_new_listen_fds(&fds, false);
3494 log_error_errno(r, "Failed to collect file descriptors: %m");
3498 fdset_close_others(fds);
/* Directory-based container. */
3501 if (arg_directory) {
3504 if (path_equal(arg_directory, "/") && !arg_ephemeral) {
3505 log_error("Spawning container on root directory is not supported. Consider using --ephemeral.");
/* Ephemeral mode: run from a freshly created btrfs snapshot that is removed
 * again during cleanup (remove_subvol). */
3510 if (arg_ephemeral) {
3511 _cleanup_release_lock_file_ LockFile original_lock = LOCK_FILE_INIT;
3514 /* If the specified path is a mount point we
3515 * generate the new snapshot immediately
3516 * inside it under a random name. However if
3517 * the specified is not a mount point we
3518 * create the new snapshot in the parent
3519 * directory, just next to it. */
3520 r = path_is_mount_point(arg_directory, false);
3522 log_error_errno(r, "Failed to determine whether directory %s is mount point: %m", arg_directory);
3526 r = tempfn_random_child(arg_directory, &np);
3528 r = tempfn_random(arg_directory, &np);
3530 log_error_errno(r, "Failed to generate name for snapshot: %m");
/* Shared lock for read-only use, exclusive otherwise; never blocking. */
3534 r = image_path_lock(np, (arg_read_only ? LOCK_SH : LOCK_EX) | LOCK_NB, &tree_global_lock, &tree_local_lock);
3536 log_error_errno(r, "Failed to lock %s: %m", np);
3540 r = btrfs_subvol_snapshot(arg_directory, np, arg_read_only, true);
3543 log_error_errno(r, "Failed to create snapshot %s from %s: %m", np, arg_directory);
3547 free(arg_directory);
3550 remove_subvol = true;
/* Non-ephemeral: lock the tree itself and, when a template is given,
 * populate the directory from it. */
3553 r = image_path_lock(arg_directory, (arg_read_only ? LOCK_SH : LOCK_EX) | LOCK_NB, &tree_global_lock, &tree_local_lock);
3555 log_error_errno(r, "Directory tree %s is currently busy.", arg_directory);
3559 log_error_errno(r, "Failed to lock %s: %m", arg_directory);
3564 r = btrfs_subvol_snapshot(arg_template, arg_directory, arg_read_only, true);
3567 log_info("Directory %s already exists, not populating from template %s.", arg_directory, arg_template);
3569 log_error_errno(r, "Couldn't create snapshot %s from %s: %m", arg_directory, arg_template);
3573 log_info("Populated %s from template %s.", arg_directory, arg_template);
/* Sanity checks on the tree before anything is started in it. */
3579 if (path_is_os_tree(arg_directory) <= 0) {
3580 log_error("Directory %s doesn't look like an OS root directory (os-release file is missing). Refusing.", arg_directory);
/* Check that the binary to run (when given as absolute path) or at least
 * /usr/bin/ exists inside the tree. */
3587 p = strappenda(arg_directory,
3588 argc > optind && path_is_absolute(argv[optind]) ? argv[optind] : "/usr/bin/");
3589 if (access(p, F_OK) < 0) {
3590 log_error("Directory %s lacks the binary to execute or doesn't look like a binary tree. Refusing.", arg_directory);
/* Image-based container: a temporary directory below /tmp serves as the
 * mount point for the dissected image. */
3597 char template[] = "/tmp/nspawn-root-XXXXXX";
3600 assert(!arg_template);
3602 r = image_path_lock(arg_image, (arg_read_only ? LOCK_SH : LOCK_EX) | LOCK_NB, &tree_global_lock, &tree_local_lock);
3604 r = log_error_errno(r, "Disk image %s is currently busy.", arg_image);
3608 r = log_error_errno(r, "Failed to create image lock: %m");
3612 if (!mkdtemp(template)) {
3613 log_error_errno(errno, "Failed to create temporary directory: %m");
3618 arg_directory = strdup(template);
3619 if (!arg_directory) {
3624 image_fd = setup_image(&device_path, &loop_nr);
3630 r = dissect_image(image_fd,
3631 &root_device, &root_device_rw,
3632 &home_device, &home_device_rw,
3633 &srv_device, &srv_device_rw,
/* Allocate the pty master; its slave becomes the container console. */
3639 master = posix_openpt(O_RDWR|O_NOCTTY|O_CLOEXEC|O_NDELAY);
3641 r = log_error_errno(errno, "Failed to acquire pseudo tty: %m");
3645 r = ptsname_malloc(master, &console);
3647 r = log_error_errno(r, "Failed to determine tty name: %m");
3652 log_info("Spawning container %s on %s.\nPress ^] three times within 1s to kill container.",
3653 arg_machine, arg_image ?: arg_directory);
3655 if (unlockpt(master) < 0) {
3656 r = log_error_errno(errno, "Failed to unlock tty: %m");
/* Block the signals that are handled later on; mask_chld holds SIGCHLD
 * alone so that it can be unblocked and re-blocked separately. */
3660 assert_se(sigemptyset(&mask) == 0);
3661 sigset_add_many(&mask, SIGCHLD, SIGWINCH, SIGTERM, SIGINT, -1);
3662 assert_se(sigprocmask(SIG_BLOCK, &mask, NULL) == 0);
3664 assert_se(sigemptyset(&mask_chld) == 0);
3665 assert_se(sigaddset(&mask_chld, SIGCHLD) == 0);
/* Per-run state. NOTE(review): the enclosing loop statement is elided; the
 * "loop again" remark near the end suggests this part repeats on reboot. */
3668 _cleanup_close_pair_ int kmsg_socket_pair[2] = { -1, -1 }, rtnl_socket_pair[2] = { -1, -1 };
3669 ContainerStatus container_status;
3670 _cleanup_(barrier_destroy) Barrier barrier = BARRIER_NULL;
3671 struct sigaction sa = {
3672 .sa_handler = nop_handler,
3673 .sa_flags = SA_NOCLDSTOP,
3676 r = barrier_create(&barrier);
3678 log_error_errno(r, "Cannot initialize IPC barrier: %m");
/* Index 0 of each socket pair is kept by the parent, index 1 by the child
 * (see the safe_close() calls on both sides below). */
3682 if (socketpair(AF_UNIX, SOCK_DGRAM|SOCK_CLOEXEC, 0, kmsg_socket_pair) < 0) {
3683 r = log_error_errno(errno, "Failed to create kmsg socket pair: %m");
3687 if (socketpair(AF_UNIX, SOCK_DGRAM|SOCK_CLOEXEC, 0, rtnl_socket_pair) < 0) {
3688 r = log_error_errno(errno, "Failed to create rtnl socket pair: %m");
3692 /* Child can be killed before execv(), so handle SIGCHLD
3693 * in order to interrupt parent's blocking calls and
3694 * give it a chance to call wait() and terminate. */
3695 r = sigprocmask(SIG_UNBLOCK, &mask_chld, NULL);
3697 r = log_error_errno(errno, "Failed to change the signal mask: %m");
3701 r = sigaction(SIGCHLD, &sa, NULL);
3703 r = log_error_errno(errno, "Failed to install SIGCHLD handler: %m");
/* Always a new mount namespace; new IPC, PID and UTS namespaces unless
 * arg_share_system is set; a new network namespace for private networking. */
3707 pid = raw_clone(SIGCHLD|CLONE_NEWNS|
3708 (arg_share_system ? 0 : CLONE_NEWIPC|CLONE_NEWPID|CLONE_NEWUTS)|
3709 (arg_private_network ? CLONE_NEWNET : 0), NULL);
3711 if (errno == EINVAL)
3712 r = log_error_errno(errno, "clone() failed, do you have namespace support enabled in your kernel? (You need UTS, IPC, PID and NET namespacing built in): %m");
3714 r = log_error_errno(errno, "clone() failed: %m");
/* Child side (NOTE(review): presumably guarded by a pid == 0 check on an
 * elided line). */
3721 _cleanup_free_ char *home = NULL;
/* Environment template for the payload; the NULL slots are filled in below
 * through the running index n_env. */
3723 const char *envp[] = {
3724 "PATH=" DEFAULT_PATH_SPLIT_USR,
3725 "container=systemd-nspawn", /* LXC sets container=lxc, so follow the scheme here */
3730 NULL, /* container_uuid */
3731 NULL, /* LISTEN_FDS */
3732 NULL, /* LISTEN_PID */
3737 barrier_set_role(&barrier, BARRIER_CHILD);
/* Pass the TERM setting of the invoking environment on to the payload. */
3739 envp[n_env] = strv_find_prefix(environ, "TERM=");
3743 master = safe_close(master);
/* Close the inherited stdio so that the console opened below is expected
 * to land on descriptor 0. */
3745 close_nointr(STDIN_FILENO);
3746 close_nointr(STDOUT_FILENO);
3747 close_nointr(STDERR_FILENO);
3749 kmsg_socket_pair[0] = safe_close(kmsg_socket_pair[0]);
3750 rtnl_socket_pair[0] = safe_close(rtnl_socket_pair[0]);
3752 reset_all_signal_handlers();
3753 reset_signal_mask();
3755 r = open_terminal(console, O_RDWR);
3756 if (r != STDIN_FILENO) {
3762 log_error_errno(r, "Failed to open console: %m");
3763 _exit(EXIT_FAILURE);
/* stdout and stderr become duplicates of the console on stdin. */
3766 if (dup2(STDIN_FILENO, STDOUT_FILENO) != STDOUT_FILENO ||
3767 dup2(STDIN_FILENO, STDERR_FILENO) != STDERR_FILENO) {
3768 log_error_errno(errno, "Failed to duplicate console: %m");
3769 _exit(EXIT_FAILURE);
3773 log_error_errno(errno, "setsid() failed: %m");
3774 _exit(EXIT_FAILURE);
3777 if (reset_audit_loginuid() < 0)
3778 _exit(EXIT_FAILURE);
/* Have the kernel send SIGKILL to this process when the parent dies. */
3780 if (prctl(PR_SET_PDEATHSIG, SIGKILL) < 0) {
3781 log_error_errno(errno, "PR_SET_PDEATHSIG failed: %m");
3782 _exit(EXIT_FAILURE);
3785 /* Mark everything as slave, so that we still
3786 * receive mounts from the real root, but don't
3787 * propagate mounts to the real root. */
3788 if (mount(NULL, "/", NULL, MS_SLAVE|MS_REC, NULL) < 0) {
3789 log_error_errno(errno, "MS_SLAVE|MS_REC failed: %m");
3790 _exit(EXIT_FAILURE);
3793 if (mount_devices(arg_directory,
3794 root_device, root_device_rw,
3795 home_device, home_device_rw,
3796 srv_device, srv_device_rw) < 0)
3797 _exit(EXIT_FAILURE);
3799 /* Turn directory into bind mount */
3800 if (mount(arg_directory, arg_directory, "bind", MS_BIND|MS_REC, NULL) < 0) {
3801 log_error_errno(errno, "Failed to make bind mount: %m");
3802 _exit(EXIT_FAILURE);
3805 r = setup_volatile(arg_directory);
3807 _exit(EXIT_FAILURE);
3809 if (setup_volatile_state(arg_directory) < 0)
3810 _exit(EXIT_FAILURE);
3812 r = base_filesystem_create(arg_directory);
3814 _exit(EXIT_FAILURE);
3816 if (arg_read_only) {
3817 r = bind_remount_recursive(arg_directory, true);
3819 log_error_errno(r, "Failed to make tree read-only: %m");
3820 _exit(EXIT_FAILURE);
/* Populate the tree with API mounts, device nodes and console plumbing;
 * each helper failure terminates the child. */
3824 if (mount_all(arg_directory) < 0)
3825 _exit(EXIT_FAILURE);
3827 if (copy_devnodes(arg_directory) < 0)
3828 _exit(EXIT_FAILURE);
3830 if (setup_ptmx(arg_directory) < 0)
3831 _exit(EXIT_FAILURE);
3833 dev_setup(arg_directory);
3835 if (setup_propagate(arg_directory) < 0)
3836 _exit(EXIT_FAILURE);
3838 if (setup_seccomp() < 0)
3839 _exit(EXIT_FAILURE);
3841 if (setup_dev_console(arg_directory, console) < 0)
3842 _exit(EXIT_FAILURE);
3844 if (setup_kmsg(arg_directory, kmsg_socket_pair[1]) < 0)
3845 _exit(EXIT_FAILURE);
3846 kmsg_socket_pair[1] = safe_close(kmsg_socket_pair[1]);
3848 if (send_rtnl(rtnl_socket_pair[1]) < 0)
3849 _exit(EXIT_FAILURE);
3850 rtnl_socket_pair[1] = safe_close(rtnl_socket_pair[1]);
3852 /* Tell the parent that we are ready, and that
3853 * it can cgroupify us to that we lack access
3854 * to certain devices and resources. */
3855 (void) barrier_place(&barrier);
3857 if (setup_boot_id(arg_directory) < 0)
3858 _exit(EXIT_FAILURE);
3860 if (setup_timezone(arg_directory) < 0)
3861 _exit(EXIT_FAILURE);
3863 if (setup_resolv_conf(arg_directory) < 0)
3864 _exit(EXIT_FAILURE);
3866 if (setup_journal(arg_directory) < 0)
3867 _exit(EXIT_FAILURE);
/* Writable bind mounts first, then the read-only ones. */
3869 if (mount_binds(arg_directory, arg_bind, false) < 0)
3870 _exit(EXIT_FAILURE);
3872 if (mount_binds(arg_directory, arg_bind_ro, true) < 0)
3873 _exit(EXIT_FAILURE);
3875 if (mount_tmpfs(arg_directory) < 0)
3876 _exit(EXIT_FAILURE);
3878 /* Wait until we are cgroup-ified, so that we
3879 * can mount the right cgroup path writable */
3880 (void) barrier_sync_next(&barrier);
3882 if (mount_cgroup(arg_directory) < 0)
3883 _exit(EXIT_FAILURE);
/* Switch the root: enter the tree, move its mount onto "/", then chroot
 * into it and reset the working directory. */
3885 if (chdir(arg_directory) < 0) {
3886 log_error_errno(errno, "chdir(%s) failed: %m", arg_directory);
3887 _exit(EXIT_FAILURE);
3890 if (mount(arg_directory, "/", NULL, MS_MOVE, NULL) < 0) {
3891 log_error_errno(errno, "mount(MS_MOVE) failed: %m");
3892 _exit(EXIT_FAILURE);
3895 if (chroot(".") < 0) {
3896 log_error_errno(errno, "chroot() failed: %m");
3897 _exit(EXIT_FAILURE);
3900 if (chdir("/") < 0) {
3901 log_error_errno(errno, "chdir() failed: %m");
3902 _exit(EXIT_FAILURE);
/* NOTE(review): the statement controlled by this condition is elided. */
3907 if (arg_private_network)
3910 if (drop_capabilities() < 0) {
3911 log_error_errno(errno, "drop_capabilities() failed: %m");
3912 _exit(EXIT_FAILURE);
3915 r = change_uid_gid(&home);
3917 _exit(EXIT_FAILURE);
/* Fill the next free envp slots with HOME, USER and LOGNAME, falling back
 * to root defaults. */
3919 if ((asprintf((char**)(envp + n_env++), "HOME=%s", home ? home: "/root") < 0) ||
3920 (asprintf((char**)(envp + n_env++), "USER=%s", arg_user ? arg_user : "root") < 0) ||
3921 (asprintf((char**)(envp + n_env++), "LOGNAME=%s", arg_user ? arg_user : "root") < 0)) {
3923 _exit(EXIT_FAILURE);
3926 if (!sd_id128_equal(arg_uuid, SD_ID128_NULL)) {
3929 if (asprintf((char**)(envp + n_env++), "container_uuid=%s", id128_format_as_uuid(arg_uuid, as_uuid)) < 0) {
3931 _exit(EXIT_FAILURE);
/* Socket activation: clear O_CLOEXEC on the passed descriptors so they
 * survive the exec, and announce them with LISTEN_PID fixed to 1. */
3935 if (fdset_size(fds) > 0) {
3936 r = fdset_cloexec(fds, false);
3938 log_error_errno(r, "Failed to unset O_CLOEXEC for file descriptors.");
3939 _exit(EXIT_FAILURE);
3942 if ((asprintf((char **)(envp + n_env++), "LISTEN_FDS=%u", n_fd_passed) < 0) ||
3943 (asprintf((char **)(envp + n_env++), "LISTEN_PID=1") < 0)) {
3945 _exit(EXIT_FAILURE);
/* An explicitly requested personality wins; otherwise PER_LINUX32 is
 * selected when the "secondary" flag is set. */
3951 if (arg_personality != 0xffffffffLU) {
3952 if (personality(arg_personality) < 0) {
3953 log_error_errno(errno, "personality() failed: %m");
3954 _exit(EXIT_FAILURE);
3956 } else if (secondary) {
3957 if (personality(PER_LINUX32) < 0) {
3958 log_error_errno(errno, "personality() failed: %m");
3959 _exit(EXIT_FAILURE);
3964 if (arg_selinux_context)
3965 if (setexeccon((security_context_t) arg_selinux_context) < 0) {
3966 log_error_errno(errno, "setexeccon(\"%s\") failed: %m", arg_selinux_context);
3967 _exit(EXIT_FAILURE);
/* Merge user supplied variables over the built-in environment; without any
 * the built-in array is used directly. */
3971 if (!strv_isempty(arg_setenv)) {
3974 n = strv_env_merge(2, envp, arg_setenv);
3977 _exit(EXIT_FAILURE);
3982 env_use = (char**) envp;
3984 /* Wait until the parent is ready with the setup, too... */
3985 if (!barrier_place_and_sync(&barrier))
3986 _exit(EXIT_FAILURE);
/* NOTE(review): the condition selecting this branch is elided. The visible
 * code builds an argument vector with one leading slot for the init path
 * followed by the remaining command line arguments, then tries the known
 * init locations in order; execve() only returns on failure. */
3992 /* Automatically search for the init system */
3994 l = 1 + argc - optind;
3995 a = newa(char*, l + 1);
3996 memcpy(a + 1, argv + optind, l * sizeof(char*));
3998 a[0] = (char*) "/usr/lib/systemd/systemd";
3999 execve(a[0], a, env_use);
4001 a[0] = (char*) "/lib/systemd/systemd";
4002 execve(a[0], a, env_use);
4004 a[0] = (char*) "/sbin/init";
4005 execve(a[0], a, env_use);
4006 } else if (argc > optind)
4007 execvpe(argv[optind], argv + optind, env_use);
/* No command given: start a login shell from the home directory.
 * NOTE(review): the chdir() result is not checked. */
4009 chdir(home ? home : "/root");
4010 execle("/bin/bash", "-bash", NULL, env_use);
4011 execle("/bin/sh", "-sh", NULL, env_use);
/* Only reached when every exec attempt above failed. */
4014 log_error_errno(errno, "execv() failed: %m");
4015 _exit(EXIT_FAILURE);
/* Parent side from here on. */
4018 barrier_set_role(&barrier, BARRIER_PARENT);
4022 kmsg_socket_pair[1] = safe_close(kmsg_socket_pair[1]);
4023 rtnl_socket_pair[1] = safe_close(rtnl_socket_pair[1]);
4025 /* Wait for the most basic Child-setup to be done,
4026 * before we add hardware to it, and place it in a
4028 if (barrier_sync_next(&barrier)) {
4031 r = move_network_interfaces(pid);
4035 r = setup_veth(pid, veth_name, &ifi);
4039 r = setup_bridge(veth_name, &ifi);
4043 r = setup_macvlan(pid);
4047 r = register_machine(pid, ifi);
4051 /* Block SIGCHLD here, before notifying child.
4052 * process_pty() will handle it with the other signals. */
4053 r = sigprocmask(SIG_BLOCK, &mask_chld, NULL);
4057 /* Reset signal to default */
4058 r = default_signals(SIGCHLD, -1);
4062 /* Notify the child that the parent is ready with all
4063 * its setup, and that the child can now hand over
4064 * control to the code to run inside the container. */
4065 (void) barrier_place(&barrier);
4067 /* And wait that the child is completely ready now. */
4068 if (barrier_place_and_sync(&barrier)) {
4069 _cleanup_event_unref_ sd_event *event = NULL;
4070 _cleanup_(pty_forward_freep) PTYForward *forward = NULL;
4071 _cleanup_rtnl_unref_ sd_rtnl *rtnl = NULL;
4076 "STATUS=Container running.\n"
4077 "X_NSPAWN_LEADER_PID=" PID_FMT, pid);
4079 r = sd_event_new(&event);
4081 log_error_errno(r, "Failed to get default event source: %m");
/* NOTE(review): both pairs of SIGINT/SIGTERM registrations below are
 * visible; presumably an if/else on elided lines selects exactly one pair
 * (orderly shutdown versus immediate exit) - confirm. */
4086 /* Try to kill the init system on SIGINT or SIGTERM */
4087 sd_event_add_signal(event, NULL, SIGINT, on_orderly_shutdown, UINT32_TO_PTR(pid));
4088 sd_event_add_signal(event, NULL, SIGTERM, on_orderly_shutdown, UINT32_TO_PTR(pid));
4090 /* Immediately exit */
4091 sd_event_add_signal(event, NULL, SIGINT, NULL, NULL);
4092 sd_event_add_signal(event, NULL, SIGTERM, NULL, NULL);
4095 /* simply exit on sigchld */
4096 sd_event_add_signal(event, NULL, SIGCHLD, NULL, NULL);
4098 if (arg_expose_ports) {
4099 r = watch_rtnl(event, rtnl_socket_pair[0], &exposed, &rtnl);
4103 (void) expose_ports(rtnl, &exposed);
4106 rtnl_socket_pair[0] = safe_close(rtnl_socket_pair[0]);
/* Forward data between our terminal and the pty master until the event
 * loop exits. */
4108 r = pty_forward_new(event, master, true, &forward);
4110 log_error_errno(r, "Failed to create PTY forwarder: %m");
4114 r = sd_event_loop(event);
4116 log_error_errno(r, "Failed to run event loop: %m");
/* Remember the last forwarded character; it is compared against a newline
 * below unless quiet mode is on (the guarded statement is elided). */
4120 pty_forward_get_last_char(forward, &last_char);
4122 forward = pty_forward_free(forward);
4124 if (!arg_quiet && last_char != '\n')
4127 /* Kill if it is not dead yet anyway */
4128 terminate_machine(pid);
4132 /* Normally redundant, but better safe than sorry */
4135 r = wait_for_container(pid, &container_status);
4139 /* We failed to wait for the container, or the
4140 * container exited abnormally */
4142 else if (r > 0 || container_status == CONTAINER_TERMINATED){
4143 /* The container exited with a non-zero
4144 * status, or with zero status and no reboot
4150 /* CONTAINER_REBOOTED, loop again */
4152 if (arg_keep_unit) {
4153 /* Special handling if we are running as a
4154 * service: instead of simply restarting the
4155 * machine we want to restart the entire
4156 * service, so let's inform systemd about this
4157 * with the special exit code 133. The service
4158 * file uses RestartForceExitStatus=133 so
4159 * that this results in a full nspawn
4160 * restart. This is necessary since we might
4161 * have cgroup parameters set we want to have
4168 flush_ports(&exposed);
4174 "STATUS=Terminating...");
4176 loop_remove(loop_nr, &image_fd);
4181 if (remove_subvol && arg_directory) {
4184 k = btrfs_subvol_remove(arg_directory);
4186 log_warning_errno(k, "Cannot remove subvolume '%s', ignoring: %m", arg_directory);
4192 p = strappenda("/run/systemd/nspawn/propagate/", arg_machine);
4193 (void) rm_rf(p, false, true, false);
4196 free(arg_directory);
4201 strv_free(arg_setenv);
4202 strv_free(arg_network_interfaces);
4203 strv_free(arg_network_macvlan);
4204 strv_free(arg_bind);
4205 strv_free(arg_bind_ro);
4206 strv_free(arg_tmpfs);
4208 flush_ports(&exposed);
4210 while (arg_expose_ports) {
4211 ExposePort *p = arg_expose_ports;
4212 LIST_REMOVE(ports, arg_expose_ports, p);
4216 return r < 0 ? EXIT_FAILURE : ret;