1 /*-*- Mode: C; c-basic-offset: 8; indent-tabs-mode: nil -*-*/
4 This file is part of systemd.
6 Copyright 2010 Lennart Poettering
8 systemd is free software; you can redistribute it and/or modify it
9 under the terms of the GNU Lesser General Public License as published by
10 the Free Software Foundation; either version 2.1 of the License, or
11 (at your option) any later version.
13 systemd is distributed in the hope that it will be useful, but
14 WITHOUT ANY WARRANTY; without even the implied warranty of
15 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
16 Lesser General Public License for more details.
18 You should have received a copy of the GNU Lesser General Public License
19 along with systemd; If not, see <http://www.gnu.org/licenses/>.
25 #include <sys/types.h>
26 #include <sys/syscall.h>
27 #include <sys/mount.h>
33 #include <sys/prctl.h>
36 #include <sys/signalfd.h>
40 #include <sys/socket.h>
41 #include <linux/netlink.h>
43 #include <linux/veth.h>
44 #include <sys/personality.h>
45 #include <linux/loop.h>
50 #include <selinux/selinux.h>
58 #include <blkid/blkid.h>
61 #include "sd-daemon.h"
71 #include "cgroup-util.h"
73 #include "path-util.h"
74 #include "loopback-setup.h"
75 #include "dev-setup.h"
80 #include "bus-error.h"
82 #include "bus-kernel.h"
85 #include "rtnl-util.h"
86 #include "udev-util.h"
87 #include "blkid-util.h"
89 #include "siphash24.h"
91 #include "base-filesystem.h"
93 #include "event-util.h"
94 #include "capability.h"
96 #include "btrfs-util.h"
97 #include "machine-image.h"
99 #include "in-addr-util.h"
101 #include "local-addresses.h"
104 #include "seccomp-util.h"
/* One --port= mapping. parse_argv() allocates these, fills in the protocol,
 * host_port and container_port members and prepends each entry to the list
 * rooted at arg_expose_ports. Only container_port is visible in this excerpt;
 * the other members are referenced by parse_argv(). */
107 typedef struct ExposePort {
110 uint16_t container_port;
/* List linkage used by LIST_PREPEND()/LIST_FOREACH() under the name "ports". */
111 LIST_FIELDS(struct ExposePort, ports);
/* Container status enumeration. Only the CONTAINER_TERMINATED member is
 * visible in this excerpt; the remaining members are not shown. */
114 typedef enum ContainerStatus {
115 CONTAINER_TERMINATED,
/* Journal linking mode stored in arg_link_journal. parse_argv() assigns
 * LINK_AUTO, LINK_NO, LINK_GUEST and LINK_HOST; the member list itself is not
 * visible in this excerpt. */
119 typedef enum LinkJournal {
/* Mode stored in arg_volatile. The code below uses VOLATILE_NO, VOLATILE_YES
 * and VOLATILE_STATE; the member list itself is not visible in this excerpt. */
126 typedef enum Volatile {
/* Command-line state. Each arg_* variable holds its default here and is
 * overwritten by parse_argv() when the matching option is seen. */
132 static char *arg_directory = NULL;
133 static char *arg_template = NULL;
134 static char *arg_user = NULL;
135 static sd_id128_t arg_uuid = {};
136 static char *arg_machine = NULL;
/* The following three "const char *" settings are not copied: parse_argv()
 * assigns optarg to the two SELinux contexts directly (the assignment to
 * arg_slice is not visible in this excerpt). */
137 static const char *arg_selinux_context = NULL;
138 static const char *arg_selinux_apifs_context = NULL;
139 static const char *arg_slice = NULL;
140 static bool arg_private_network = false;
141 static bool arg_read_only = false;
142 static bool arg_boot = false;
143 static bool arg_ephemeral = false;
144 static LinkJournal arg_link_journal = LINK_AUTO;
145 static bool arg_link_journal_try = false;
/* Default capability bitmask (one bit per CAP_* number). At the end of
 * parse_argv() the --capability= bits are ORed in, CAP_NET_ADMIN is added when
 * private networking is enabled, and the --drop-capability= bits are masked
 * out. NOTE(review): the final term(s) of this initializer are not visible in
 * this excerpt (the expression ends on a trailing '|'). */
146 static uint64_t arg_retain =
147 (1ULL << CAP_CHOWN) |
148 (1ULL << CAP_DAC_OVERRIDE) |
149 (1ULL << CAP_DAC_READ_SEARCH) |
150 (1ULL << CAP_FOWNER) |
151 (1ULL << CAP_FSETID) |
152 (1ULL << CAP_IPC_OWNER) |
154 (1ULL << CAP_LEASE) |
155 (1ULL << CAP_LINUX_IMMUTABLE) |
156 (1ULL << CAP_NET_BIND_SERVICE) |
157 (1ULL << CAP_NET_BROADCAST) |
158 (1ULL << CAP_NET_RAW) |
159 (1ULL << CAP_SETGID) |
160 (1ULL << CAP_SETFCAP) |
161 (1ULL << CAP_SETPCAP) |
162 (1ULL << CAP_SETUID) |
163 (1ULL << CAP_SYS_ADMIN) |
164 (1ULL << CAP_SYS_CHROOT) |
165 (1ULL << CAP_SYS_NICE) |
166 (1ULL << CAP_SYS_PTRACE) |
167 (1ULL << CAP_SYS_TTY_CONFIG) |
168 (1ULL << CAP_SYS_RESOURCE) |
169 (1ULL << CAP_SYS_BOOT) |
170 (1ULL << CAP_AUDIT_WRITE) |
171 (1ULL << CAP_AUDIT_CONTROL) |
/* arg_bind and arg_bind_ro hold (source, destination) string pairs; arg_tmpfs
 * holds (path, mount options) pairs. They are walked with STRV_FOREACH_PAIR()
 * in mount_binds() and mount_tmpfs(). */
173 static char **arg_bind = NULL;
174 static char **arg_bind_ro = NULL;
175 static char **arg_tmpfs = NULL;
176 static char **arg_setenv = NULL;
177 static bool arg_quiet = false;
178 static bool arg_share_system = false;
/* Registration defaults to on; parse_argv() forces it off when
 * --share-system is given. */
179 static bool arg_register = true;
180 static bool arg_keep_unit = false;
181 static char **arg_network_interfaces = NULL;
182 static char **arg_network_macvlan = NULL;
183 static bool arg_network_veth = false;
184 static const char *arg_network_bridge = NULL;
/* 0xffffffffLU is the same value parse_argv() compares against to detect a
 * personality string that could not be parsed. */
185 static unsigned long arg_personality = 0xffffffffLU;
186 static char *arg_image = NULL;
187 static Volatile arg_volatile = VOLATILE_NO;
/* Head of the list of ExposePort entries built from --port=. */
188 static ExposePort *arg_expose_ports = NULL;
/* Print the usage summary to stdout. The single %s at the start of the format
 * string is filled with program_invocation_short_name.
 * NOTE(review): several continuation lines of the help text are not visible
 * in this excerpt, so some option descriptions read as truncated here. */
190 static void help(void) {
191 printf("%s [OPTIONS...] [PATH] [ARGUMENTS...]\n\n"
192 "Spawn a minimal namespace container for debugging, testing and building.\n\n"
193 " -h --help Show this help\n"
194 " --version Print version string\n"
195 " -q --quiet Do not show status information\n"
196 " -D --directory=PATH Root directory for the container\n"
197 " --template=PATH Initialize root directory from template directory,\n"
199 " -x --ephemeral Run container with snapshot of root directory, and\n"
200 " remove it after exit\n"
201 " -i --image=PATH File system device or disk image for the container\n"
202 " -b --boot Boot up full system (i.e. invoke init)\n"
203 " -u --user=USER Run the command under specified user or uid\n"
204 " -M --machine=NAME Set the machine name for the container\n"
205 " --uuid=UUID Set a specific machine UUID for the container\n"
206 " -S --slice=SLICE Place the container in the specified slice\n"
207 " --private-network Disable network in container\n"
208 " --network-interface=INTERFACE\n"
209 " Assign an existing network interface to the\n"
211 " --network-macvlan=INTERFACE\n"
212 " Create a macvlan network interface based on an\n"
213 " existing network interface to the container\n"
214 " -n --network-veth Add a virtual ethernet connection between host\n"
216 " --network-bridge=INTERFACE\n"
217 " Add a virtual ethernet connection between host\n"
218 " and container and add it to an existing bridge on\n"
220 " -p --port=[PROTOCOL:]HOSTPORT[:CONTAINERPORT]\n"
221 " Expose a container IP port on the host\n"
222 " -Z --selinux-context=SECLABEL\n"
223 " Set the SELinux security context to be used by\n"
224 " processes in the container\n"
225 " -L --selinux-apifs-context=SECLABEL\n"
226 " Set the SELinux security context to be used by\n"
227 " API/tmpfs file systems in the container\n"
228 " --capability=CAP In addition to the default, retain specified\n"
230 " --drop-capability=CAP Drop the specified capability from the default set\n"
231 " --link-journal=MODE Link up guest journal, one of no, auto, guest, host,\n"
232 " try-guest, try-host\n"
233 " -j Equivalent to --link-journal=try-guest\n"
234 " --read-only Mount the root directory read-only\n"
235 " --bind=PATH[:PATH] Bind mount a file or directory from the host into\n"
237 " --bind-ro=PATH[:PATH] Similar, but creates a read-only bind mount\n"
238 " --tmpfs=PATH:[OPTIONS] Mount an empty tmpfs to the specified directory\n"
239 " --setenv=NAME=VALUE Pass an environment variable to PID 1\n"
240 " --share-system Share system namespaces with host\n"
241 " --register=BOOLEAN Register container as machine\n"
242 " --keep-unit Do not register a scope for the machine, reuse\n"
243 " the service unit nspawn is running in\n"
244 " --volatile[=MODE] Run the system in volatile mode\n"
245 , program_invocation_short_name);
/* Store a cleaned-up form of `path` in *b.
 *
 * Visible steps: p is produced either by canonicalize_file_name(path) or by
 * path_make_absolute_cwd(path) (presumably the latter is the fallback when
 * canonicalization fails -- the connecting lines are not visible here), and
 * the result of path_kill_slashes(p) is assigned to *b.
 *
 * Callers in parse_argv() treat a negative return value as an error code. */
248 static int set_sanitized_path(char **b, const char *path) {
254 p = canonicalize_file_name(path);
259 p = path_make_absolute_cwd(path);
265 *b = path_kill_slashes(p);
/* Parse the command line into the arg_* globals and validate option
 * combinations.
 *
 * NOTE(review): this listing omits many interior lines (case labels, error
 * returns, closing braces), so the comments below describe only what the
 * visible statements do. */
269 static int parse_argv(int argc, char *argv[]) {
/* Identifier for a long-only option; the other members of this enum are not
 * visible here. */
286 ARG_NETWORK_INTERFACE,
/* Long option table; entries that also have a short form map to that
 * character, the rest map to ARG_* identifiers. */
294 static const struct option options[] = {
295 { "help", no_argument, NULL, 'h' },
296 { "version", no_argument, NULL, ARG_VERSION },
297 { "directory", required_argument, NULL, 'D' },
298 { "template", required_argument, NULL, ARG_TEMPLATE },
299 { "ephemeral", no_argument, NULL, 'x' },
300 { "user", required_argument, NULL, 'u' },
301 { "private-network", no_argument, NULL, ARG_PRIVATE_NETWORK },
302 { "boot", no_argument, NULL, 'b' },
303 { "uuid", required_argument, NULL, ARG_UUID },
304 { "read-only", no_argument, NULL, ARG_READ_ONLY },
305 { "capability", required_argument, NULL, ARG_CAPABILITY },
306 { "drop-capability", required_argument, NULL, ARG_DROP_CAPABILITY },
307 { "link-journal", required_argument, NULL, ARG_LINK_JOURNAL },
308 { "bind", required_argument, NULL, ARG_BIND },
309 { "bind-ro", required_argument, NULL, ARG_BIND_RO },
310 { "tmpfs", required_argument, NULL, ARG_TMPFS },
311 { "machine", required_argument, NULL, 'M' },
312 { "slice", required_argument, NULL, 'S' },
313 { "setenv", required_argument, NULL, ARG_SETENV },
314 { "selinux-context", required_argument, NULL, 'Z' },
315 { "selinux-apifs-context", required_argument, NULL, 'L' },
316 { "quiet", no_argument, NULL, 'q' },
317 { "share-system", no_argument, NULL, ARG_SHARE_SYSTEM },
318 { "register", required_argument, NULL, ARG_REGISTER },
319 { "keep-unit", no_argument, NULL, ARG_KEEP_UNIT },
320 { "network-interface", required_argument, NULL, ARG_NETWORK_INTERFACE },
321 { "network-macvlan", required_argument, NULL, ARG_NETWORK_MACVLAN },
322 { "network-veth", no_argument, NULL, 'n' },
323 { "network-bridge", required_argument, NULL, ARG_NETWORK_BRIDGE },
324 { "personality", required_argument, NULL, ARG_PERSONALITY },
325 { "image", required_argument, NULL, 'i' },
326 { "volatile", optional_argument, NULL, ARG_VOLATILE },
327 { "port", required_argument, NULL, 'p' },
/* Capability bits requested via --capability= (plus) and --drop-capability=
 * (minus); applied to arg_retain at the very end of this function. */
332 uint64_t plus = 0, minus = 0;
/* The leading '+' in the option string asks getopt_long() to stop at the
 * first non-option argument instead of permuting argv. */
337 while ((c = getopt_long(argc, argv, "+hD:u:bL:M:jS:Z:qi:xp:n", options, NULL)) >= 0)
346 puts(PACKAGE_STRING);
347 puts(SYSTEMD_FEATURES);
/* Path-valued options are normalized through set_sanitized_path(). */
351 r = set_sanitized_path(&arg_directory, optarg);
353 return log_error_errno(r, "Invalid root directory: %m");
358 r = set_sanitized_path(&arg_template, optarg);
360 return log_error_errno(r, "Invalid template directory: %m");
365 r = set_sanitized_path(&arg_image, optarg);
367 return log_error_errno(r, "Invalid image path: %m");
372 arg_ephemeral = true;
377 arg_user = strdup(optarg);
/* Keeps a pointer into argv; the string is not copied. */
383 case ARG_NETWORK_BRIDGE:
384 arg_network_bridge = optarg;
/* The veth and network-interface settings both switch on private networking. */
389 arg_network_veth = true;
390 arg_private_network = true;
393 case ARG_NETWORK_INTERFACE:
394 if (strv_extend(&arg_network_interfaces, optarg) < 0)
397 arg_private_network = true;
400 case ARG_NETWORK_MACVLAN:
401 if (strv_extend(&arg_network_macvlan, optarg) < 0)
406 case ARG_PRIVATE_NETWORK:
407 arg_private_network = true;
415 r = sd_id128_from_string(optarg, &arg_uuid);
417 log_error("Invalid UUID: %s", optarg);
/* Machine name: an empty argument is handled separately (that branch is not
 * visible here); otherwise the name must pass machine_name_is_valid() before
 * it is duplicated into arg_machine. */
427 if (isempty(optarg)) {
431 if (!machine_name_is_valid(optarg)) {
432 log_error("Invalid machine name: %s", optarg);
436 r = free_and_strdup(&arg_machine, optarg);
/* Both SELinux contexts keep pointers into argv; the strings are not copied. */
444 arg_selinux_context = optarg;
448 arg_selinux_apifs_context = optarg;
452 arg_read_only = true;
/* Capability lists are comma separated. The code below distinguishes the two
 * options by testing c == ARG_CAPABILITY. */
456 case ARG_DROP_CAPABILITY: {
457 const char *state, *word;
460 FOREACH_WORD_SEPARATOR(word, length, optarg, ",", state) {
461 _cleanup_free_ char *t;
463 t = strndup(word, length);
/* "all" selects every bit of the respective mask. */
467 if (streq(t, "all")) {
468 if (c == ARG_CAPABILITY)
469 plus = (uint64_t) -1;
471 minus = (uint64_t) -1;
475 cap = capability_from_name(t);
477 log_error("Failed to parse capability %s.", t);
481 if (c == ARG_CAPABILITY)
482 plus |= 1ULL << (uint64_t) cap;
484 minus |= 1ULL << (uint64_t) cap;
/* Same pair of assignments as "try-guest" below; help() documents -j as
 * equivalent to --link-journal=try-guest (the case label is not visible here). */
492 arg_link_journal = LINK_GUEST;
493 arg_link_journal_try = true;
/* The "try-" prefixed modes select the same LinkJournal value as the plain
 * mode but additionally set arg_link_journal_try. */
496 case ARG_LINK_JOURNAL:
497 if (streq(optarg, "auto")) {
498 arg_link_journal = LINK_AUTO;
499 arg_link_journal_try = false;
500 } else if (streq(optarg, "no")) {
501 arg_link_journal = LINK_NO;
502 arg_link_journal_try = false;
503 } else if (streq(optarg, "guest")) {
504 arg_link_journal = LINK_GUEST;
505 arg_link_journal_try = false;
506 } else if (streq(optarg, "host")) {
507 arg_link_journal = LINK_HOST;
508 arg_link_journal_try = false;
509 } else if (streq(optarg, "try-guest")) {
510 arg_link_journal = LINK_GUEST;
511 arg_link_journal_try = true;
512 } else if (streq(optarg, "try-host")) {
513 arg_link_journal = LINK_HOST;
514 arg_link_journal_try = true;
516 log_error("Failed to parse link journal mode %s", optarg);
/* --bind= / --bind-ro=: the argument is split at the first ':' into a and b
 * (how b is set when no ':' is present is not visible here). Both must be
 * absolute; they are appended as a pair to the vector selected by x. */
524 _cleanup_free_ char *a = NULL, *b = NULL;
528 x = c == ARG_BIND ? &arg_bind : &arg_bind_ro;
530 e = strchr(optarg, ':');
532 a = strndup(optarg, e - optarg);
542 if (!path_is_absolute(a) || !path_is_absolute(b)) {
543 log_error("Invalid bind mount specification: %s", optarg);
547 r = strv_extend(x, a);
551 r = strv_extend(x, b);
/* --tmpfs=: a is the path before ':'; b holds the mount options, with
 * "mode=0755" visible as one of its possible values. The two strings are
 * pushed as a pair onto arg_tmpfs. */
559 _cleanup_free_ char *a = NULL, *b = NULL;
562 e = strchr(optarg, ':');
564 a = strndup(optarg, e - optarg);
568 b = strdup("mode=0755");
574 if (!path_is_absolute(a)) {
575 log_error("Invalid tmpfs specification: %s", optarg);
579 r = strv_push(&arg_tmpfs, a);
585 r = strv_push(&arg_tmpfs, b);
/* --setenv=: strv_env_set() yields a new vector n; the previous arg_setenv is
 * freed (the assignment of n back to arg_setenv is not visible here). */
597 if (!env_assignment_is_valid(optarg)) {
598 log_error("Environment variable assignment '%s' is not valid.", optarg);
602 n = strv_env_set(arg_setenv, optarg);
606 strv_free(arg_setenv);
615 case ARG_SHARE_SYSTEM:
616 arg_share_system = true;
620 r = parse_boolean(optarg);
622 log_error("Failed to parse --register= argument: %s", optarg);
630 arg_keep_unit = true;
633 case ARG_PERSONALITY:
/* 0xffffffffLU is treated as the "could not parse" result. */
635 arg_personality = personality_from_string(optarg);
636 if (arg_personality == 0xffffffffLU) {
637 log_error("Unknown or unsupported personality '%s'.", optarg);
/* --volatile[=MODE]: VOLATILE_YES is assigned on one path (presumably when no
 * argument is given -- the condition is not visible). Otherwise the argument
 * is parsed as a boolean, with the literal "state" selecting VOLATILE_STATE. */
646 arg_volatile = VOLATILE_YES;
648 r = parse_boolean(optarg);
650 if (streq(optarg, "state"))
651 arg_volatile = VOLATILE_STATE;
653 log_error("Failed to parse --volatile= argument: %s", optarg);
657 arg_volatile = r ? VOLATILE_YES : VOLATILE_NO;
/* --port=[PROTOCOL:]HOSTPORT[:CONTAINERPORT]. e points past an optional
 * "tcp:"/"udp:" prefix; TCP is also assigned on a third path (presumably the
 * no-prefix default). */
663 const char *split, *e;
664 uint16_t container_port, host_port;
668 if ((e = startswith(optarg, "tcp:")))
669 protocol = IPPROTO_TCP;
670 else if ((e = startswith(optarg, "udp:")))
671 protocol = IPPROTO_UDP;
674 protocol = IPPROTO_TCP;
677 split = strchr(e, ':');
/* Copy the host-port part into a stack buffer so it can be parsed on its own
 * (the line that NUL-terminates v is not visible here). */
679 char v[split - e + 1];
681 memcpy(v, e, split - e);
684 r = safe_atou16(v, &host_port);
/* host_port is uint16_t, so "<= 0" can only be true for 0. */
685 if (r < 0 || host_port <= 0) {
686 log_error("Failed to parse host port: %s", optarg);
690 r = safe_atou16(split + 1, &container_port);
/* Single-number form: the same port is used on both sides. */
692 r = safe_atou16(e, &container_port);
693 host_port = container_port;
/* NOTE(review): this check is on container_port, but the message below says
 * "host port" -- looks like a copy/paste of the earlier message; confirm. */
696 if (r < 0 || container_port <= 0) {
697 log_error("Failed to parse host port: %s", optarg);
/* A (protocol, host_port) combination may be listed only once. */
701 LIST_FOREACH(ports, p, arg_expose_ports) {
702 if (p->protocol == protocol && p->host_port == host_port) {
703 log_error("Duplicate port specification: %s", optarg);
708 p = new(ExposePort, 1);
712 p->protocol = protocol;
713 p->host_port = host_port;
714 p->container_port = container_port;
716 LIST_PREPEND(ports, arg_expose_ports, p);
725 assert_not_reached("Unhandled option");
/* Post-processing: derive implied settings and reject option combinations
 * that cannot be used together. */
728 if (arg_share_system)
729 arg_register = false;
731 if (arg_boot && arg_share_system) {
732 log_error("--boot and --share-system may not be combined.");
736 if (arg_keep_unit && cg_pid_get_owner_uid(0, NULL) >= 0) {
737 log_error("--keep-unit may not be used when invoked from a user session.");
741 if (arg_directory && arg_image) {
742 log_error("--directory= and --image= may not be combined.");
746 if (arg_template && arg_image) {
747 log_error("--template= and --image= may not be combined.");
751 if (arg_template && !(arg_directory || arg_machine)) {
752 log_error("--template= needs --directory= or --machine=.");
756 if (arg_ephemeral && arg_template) {
757 log_error("--ephemeral and --template= may not be combined.");
761 if (arg_ephemeral && arg_image) {
762 log_error("--ephemeral and --image= may not be combined.");
766 if (arg_ephemeral && !IN_SET(arg_link_journal, LINK_NO, LINK_AUTO)) {
767 log_error("--ephemeral and --link-journal= may not be combined.");
771 if (arg_volatile != VOLATILE_NO && arg_read_only) {
772 log_error("Cannot combine --read-only with --volatile. Note that --volatile already implies a read-only base hierarchy.");
776 if (arg_expose_ports && !arg_private_network) {
777 log_error("Cannot use --port= without private networking.");
/* Final capability set: defaults plus requested additions, plus CAP_NET_ADMIN
 * when networking is private, minus everything explicitly dropped. Because
 * the mask is applied last, a drop request overrides an add request. */
781 arg_retain = (arg_retain | plus | (arg_private_network ? 1ULL << CAP_NET_ADMIN : 0)) & ~minus;
/* Mount the entries of mount_table below `dest`.
 *
 * Each entry is mounted at dest + "/" + entry.where. A failure on an entry
 * whose `fatal` member is true is logged as an error, otherwise as a warning
 * (the return/continue statements themselves are not visible in this
 * excerpt). */
786 static int mount_all(const char *dest) {
788 typedef struct MountPoint {
/* Initializer order as used below: what, where, <third member, not referenced
 * by name in the visible code>, options, flags, fatal. Entries with a NULL
 * `what` are remounts of the entry directly above them. */
797 static const MountPoint mount_table[] = {
798 { "proc", "/proc", "proc", NULL, MS_NOSUID|MS_NOEXEC|MS_NODEV, true },
799 { "/proc/sys", "/proc/sys", NULL, NULL, MS_BIND, true }, /* Bind mount first */
800 { NULL, "/proc/sys", NULL, NULL, MS_BIND|MS_RDONLY|MS_REMOUNT, true }, /* Then, make it r/o */
801 { "sysfs", "/sys", "sysfs", NULL, MS_RDONLY|MS_NOSUID|MS_NOEXEC|MS_NODEV, true },
802 { "tmpfs", "/dev", "tmpfs", "mode=755", MS_NOSUID|MS_STRICTATIME, true },
803 { "devpts", "/dev/pts", "devpts","newinstance,ptmxmode=0666,mode=620,gid=" STRINGIFY(TTY_GID), MS_NOSUID|MS_NOEXEC, true },
804 { "tmpfs", "/dev/shm", "tmpfs", "mode=1777", MS_NOSUID|MS_NODEV|MS_STRICTATIME, true },
805 { "tmpfs", "/run", "tmpfs", "mode=755", MS_NOSUID|MS_NODEV|MS_STRICTATIME, true },
807 { "/sys/fs/selinux", "/sys/fs/selinux", NULL, NULL, MS_BIND, false }, /* Bind mount first */
808 { NULL, "/sys/fs/selinux", NULL, NULL, MS_BIND|MS_RDONLY|MS_REMOUNT, false }, /* Then, make it r/o */
815 for (k = 0; k < ELEMENTSOF(mount_table); k++) {
/* Both strings are freed automatically at the end of each iteration. */
816 _cleanup_free_ char *where = NULL;
818 _cleanup_free_ char *options = NULL;
823 where = strjoin(dest, "/", mount_table[k].where, NULL);
827 t = path_is_mount_point(where, true);
829 log_error_errno(t, "Failed to detect whether %s is a mount point: %m", where);
/* Skip this entry if it is already a mount point -- unless it is a remount
 * entry (what == NULL), which must still be applied. */
837 /* Skip this entry if it is not a remount. */
838 if (mount_table[k].what && t > 0)
841 t = mkdir_p(where, 0755);
843 if (mount_table[k].fatal) {
844 log_error_errno(t, "Failed to create directory %s: %m", where);
849 log_warning_errno(t, "Failed to create directory %s: %m", where);
/* For tmpfs and devpts entries, append the SELinux API-fs context (if one was
 * given on the command line) to the mount options; otherwise the table's
 * options are used unchanged. */
855 if (arg_selinux_apifs_context &&
856 (streq_ptr(mount_table[k].what, "tmpfs") || streq_ptr(mount_table[k].what, "devpts"))) {
857 options = strjoin(mount_table[k].options, ",context=\"", arg_selinux_apifs_context, "\"", NULL);
864 o = mount_table[k].options;
867 if (mount(mount_table[k].what,
870 mount_table[k].flags,
873 if (mount_table[k].fatal) {
874 log_error_errno(errno, "mount(%s) failed: %m", where);
879 log_warning_errno(errno, "mount(%s) failed: %m", where);
/* Bind mount every (source, destination) pair of the string vector `l` below
 * `dest`. `ro` is the read-only request of the caller; the visible code ends
 * with a recursive read-only remount (presumably guarded by `ro` -- the guard
 * line is not visible in this excerpt).
 *
 * The destination is created if it does not exist yet, using the same file
 * type as the source. */
886 static int mount_binds(const char *dest, char **l, bool ro) {
889 STRV_FOREACH_PAIR(x, y, l) {
890 _cleanup_free_ char *where = NULL;
891 struct stat source_st, dest_st;
894 if (stat(*x, &source_st) < 0)
895 return log_error_errno(errno, "Failed to stat %s: %m", *x);
897 where = strappend(dest, *y);
/* If the destination already exists it must have the same file type as the
 * source. If it does not exist (ENOENT) its parent directories are created. */
901 r = stat(where, &dest_st);
903 if ((source_st.st_mode & S_IFMT) != (dest_st.st_mode & S_IFMT)) {
904 log_error("The file types of %s and %s do not match. Refusing bind mount", *x, where);
907 } else if (errno == ENOENT) {
908 r = mkdir_parents_label(where, 0755);
/* NOTE(review): this message is emitted for a mkdir_parents_label() failure,
 * not for the bind mount itself. */
910 return log_error_errno(r, "Failed to bind mount %s: %m", *x);
912 log_error_errno(errno, "Failed to bind mount %s: %m", *x);
916 /* Create the mount point, but be conservative -- refuse to create block
917 * and char devices. */
918 if (S_ISDIR(source_st.st_mode)) {
919 r = mkdir_label(where, 0755);
/* NOTE(review): this tests errno but reports r, whereas mount_tmpfs() tests
 * "r != -EEXIST" after the same mkdir_label() call. One of the two is
 * presumably wrong -- confirm mkdir_label()'s return convention. */
920 if (r < 0 && errno != EEXIST)
921 return log_error_errno(r, "Failed to create mount point %s: %m", where);
922 } else if (S_ISFIFO(source_st.st_mode)) {
923 r = mkfifo(where, 0644);
924 if (r < 0 && errno != EEXIST)
925 return log_error_errno(errno, "Failed to create mount point %s: %m", where);
926 } else if (S_ISSOCK(source_st.st_mode)) {
927 r = mknod(where, 0644 | S_IFSOCK, 0);
928 if (r < 0 && errno != EEXIST)
929 return log_error_errno(errno, "Failed to create mount point %s: %m", where);
930 } else if (S_ISREG(source_st.st_mode)) {
933 return log_error_errno(r, "Failed to create mount point %s: %m", where);
935 log_error("Refusing to create mountpoint for file: %s", *x);
939 if (mount(*x, where, "bind", MS_BIND, NULL) < 0)
940 return log_error_errno(errno, "mount(%s) failed: %m", where);
/* Turn the new bind mount (and anything below it) read-only. */
943 r = bind_remount_recursive(where, true);
945 return log_error_errno(r, "Read-Only bind mount failed: %m");
/* Mount a single cgroup hierarchy at dest + "/sys/fs/cgroup/" + hierarchy.
 *
 * `controller` is passed to mount() as the data string of a "cgroup" file
 * system; `read_only` adds MS_RDONLY. The target is first checked with
 * path_is_mount_point(); what happens when it already is one is not visible
 * in this excerpt. */
952 static int mount_cgroup_hierarchy(const char *dest, const char *controller, const char *hierarchy, bool read_only) {
956 to = strappenda(dest, "/sys/fs/cgroup/", hierarchy);
958 r = path_is_mount_point(to, false);
960 return log_error_errno(r, "Failed to determine if %s is mounted already: %m", to);
966 if (mount("cgroup", to, "cgroup", MS_NOSUID|MS_NOEXEC|MS_NODEV|(read_only ? MS_RDONLY : 0), controller) < 0)
967 return log_error_errno(errno, "Failed to mount to %s: %m", to);
/* Set up /sys/fs/cgroup inside the container tree at `dest`.
 *
 * Visible sequence: mount a tmpfs on dest/sys/fs/cgroup, mount each kernel
 * controller's hierarchy read-only, mount the "name=systemd" hierarchy
 * writable, bind mount our own cgroup onto itself, then remount the systemd
 * hierarchy root and finally the tmpfs read-only. */
972 static int mount_cgroup(const char *dest) {
973 _cleanup_set_free_free_ Set *controllers = NULL;
974 _cleanup_free_ char *own_cgroup_path = NULL;
975 const char *cgroup_root, *systemd_root, *systemd_own;
978 controllers = set_new(&string_hash_ops);
982 r = cg_kernel_controllers(controllers);
984 return log_error_errno(r, "Failed to determine cgroup controllers: %m");
986 r = cg_pid_get_path(NULL, 0, &own_cgroup_path);
988 return log_error_errno(r, "Failed to determine our own cgroup path: %m");
990 cgroup_root = strappenda(dest, "/sys/fs/cgroup");
991 if (mount("tmpfs", cgroup_root, "tmpfs", MS_NOSUID|MS_NOEXEC|MS_NODEV|MS_STRICTATIME, "mode=755") < 0)
992 return log_error_errno(errno, "Failed to mount tmpfs to /sys/fs/cgroup: %m");
/* Per-controller work: entries are taken out of the set one at a time (the
 * enclosing loop statement is not visible in this excerpt). */
995 _cleanup_free_ char *controller = NULL, *origin = NULL, *combined = NULL;
997 controller = set_steal_first(controllers);
/* The host's /sys/fs/cgroup/<controller> entry decides how to proceed: if
 * readlink_malloc() yields a target, the controller lives in a combined
 * hierarchy; otherwise it is a hierarchy of its own. */
1001 origin = strappend("/sys/fs/cgroup/", controller);
1005 r = readlink_malloc(origin, &combined);
1007 /* Not a symbolic link, but directly a single cgroup hierarchy */
1009 r = mount_cgroup_hierarchy(dest, controller, controller, true);
1014 return log_error_errno(r, "Failed to read link %s: %m", origin);
1016 _cleanup_free_ char *target = NULL;
1018 target = strjoin(dest, "/sys/fs/cgroup/", controller, NULL);
1022 /* A symbolic link, a combination of controllers in one hierarchy */
/* The link target is used as a directory name below, so it is validated with
 * filename_is_valid() first. */
1024 if (!filename_is_valid(combined)) {
1025 log_warning("Ignoring invalid combined hierarchy %s.", combined);
1029 r = mount_cgroup_hierarchy(dest, combined, combined, true);
/* Recreate the host's symlink inside the container. */
1033 if (symlink(combined, target) < 0)
1034 return log_error_errno(errno, "Failed to create symlink for combined hierarchy: %m");
1038 r = mount_cgroup_hierarchy(dest, "name=systemd", "systemd", false);
1042 /* Make our own cgroup a (writable) bind mount */
1043 systemd_own = strappenda(dest, "/sys/fs/cgroup/systemd", own_cgroup_path);
1044 if (mount(systemd_own, systemd_own, NULL, MS_BIND, NULL) < 0)
/* NOTE(review): the message prints own_cgroup_path, not the full systemd_own
 * path that was actually passed to mount(). */
1045 return log_error_errno(errno, "Failed to turn %s into a bind mount: %m", own_cgroup_path);
1047 /* And then remount the systemd cgroup root read-only */
1048 systemd_root = strappenda(dest, "/sys/fs/cgroup/systemd");
1049 if (mount(NULL, systemd_root, NULL, MS_BIND|MS_REMOUNT|MS_NOSUID|MS_NOEXEC|MS_NODEV|MS_RDONLY, NULL) < 0)
1050 return log_error_errno(errno, "Failed to mount cgroup root read-only: %m");
/* Finally make the tmpfs that holds all hierarchies read-only as well. */
1052 if (mount(NULL, cgroup_root, NULL, MS_REMOUNT|MS_NOSUID|MS_NOEXEC|MS_NODEV|MS_STRICTATIME|MS_RDONLY, "mode=755") < 0)
1053 return log_error_errno(errno, "Failed to remount %s read-only: %m", cgroup_root);
/* Mount a tmpfs for every (path, options) pair in arg_tmpfs below `dest`.
 *
 * The mount point is created with mkdir_label(); an already existing
 * directory (-EEXIST) is not an error. The pair's second string is handed to
 * mount() as the tmpfs option string. */
1058 static int mount_tmpfs(const char *dest) {
1061 STRV_FOREACH_PAIR(i, o, arg_tmpfs) {
1062 _cleanup_free_ char *where = NULL;
1065 where = strappend(dest, *i);
1069 r = mkdir_label(where, 0755);
1070 if (r < 0 && r != -EEXIST)
1071 return log_error_errno(r, "Creating mount point for tmpfs %s failed: %m", where);
1073 if (mount("tmpfs", where, "tmpfs", MS_NODEV|MS_STRICTATIME, *o) < 0)
1074 return log_error_errno(errno, "tmpfs mount to %s failed: %m", where);
/* Make dest/etc/localtime point at the same zoneinfo entry as the host's
 * /etc/localtime.
 *
 * The host link must point into /usr/share/zoneinfo/ (relative "../usr/..."
 * or absolute form). Nothing is changed if the container link already names
 * the same zone, or if the zone file does not exist inside the container. */
1080 static int setup_timezone(const char *dest) {
1081 _cleanup_free_ char *where = NULL, *p = NULL, *q = NULL, *check = NULL, *what = NULL;
1087 /* Fix the timezone, if possible */
1088 r = readlink_malloc("/etc/localtime", &p);
1090 log_warning("/etc/localtime is not a symlink, not updating container timezone.");
/* z becomes the zone name, i.e. the part of the host link target after the
 * zoneinfo prefix. */
1094 z = path_startswith(p, "../usr/share/zoneinfo/");
1096 z = path_startswith(p, "/usr/share/zoneinfo/");
1098 log_warning("/etc/localtime does not point into /usr/share/zoneinfo/, not updating container timezone.");
1102 where = strappend(dest, "/etc/localtime");
/* y is the zone name the container currently links to, if any. */
1106 r = readlink_malloc(where, &q);
1108 y = path_startswith(q, "../usr/share/zoneinfo/");
1110 y = path_startswith(q, "/usr/share/zoneinfo/");
1112 /* Already pointing to the right place? Then do nothing .. */
1113 if (y && streq(y, z))
/* Only link to zones that actually exist in the container's zoneinfo tree. */
1117 check = strjoin(dest, "/usr/share/zoneinfo/", z, NULL);
1121 if (access(check, F_OK) < 0) {
1122 log_warning("Timezone %s does not exist in container, not updating container timezone.", z);
/* The new link is always written in the relative "../usr/share/zoneinfo/"
 * form. */
1126 what = strappend("../usr/share/zoneinfo/", z);
1130 r = mkdir_parents(where, 0755);
1132 log_error_errno(r, "Failed to create directory for timezone info %s in container: %m", where);
/* Removal of the old entry (the call itself is not visible in this excerpt)
 * tolerates ENOENT. */
1138 if (r < 0 && errno != ENOENT) {
1139 log_error_errno(errno, "Failed to remove existing timezone info %s in container: %m", where);
1144 if (symlink(what, where) < 0) {
1145 log_error_errno(errno, "Failed to correct timezone of container: %m");
/* Copy the host's /etc/resolv.conf to dest/etc/resolv.conf.
 *
 * The function tests arg_private_network first (presumably returning early
 * when it is set -- the return line is not visible in this excerpt).
 * Failures of the directory creation and of the copy are only logged as
 * warnings. */
1152 static int setup_resolv_conf(const char *dest) {
1153 _cleanup_free_ char *where = NULL;
1158 if (arg_private_network)
1161 /* Fix resolv.conf, if possible */
1162 where = strappend(dest, "/etc/resolv.conf");
1166 /* We don't really care for the results of this really. If it
1167 * fails, it fails, but meh... */
1168 r = mkdir_parents(where, 0755);
1170 log_warning_errno(r, "Failed to create parent directory for resolv.conf %s: %m", where);
/* O_TRUNC replaces any previous content; O_NOFOLLOW is passed so that a
 * symlink at the destination is not followed. */
1175 r = copy_file("/etc/resolv.conf", where, O_TRUNC|O_NOFOLLOW, 0644, 0);
1177 log_warning_errno(r, "Failed to copy /etc/resolv.conf to %s: %m", where);
/* Implement --volatile=state for the tree at `directory`: remount the tree
 * read-only and mount a fresh tmpfs on directory/var.
 *
 * The function tests arg_volatile against VOLATILE_STATE first (presumably
 * returning early for other modes -- the return line is not visible). */
1185 static int setup_volatile_state(const char *directory) {
1191 if (arg_volatile != VOLATILE_STATE)
1194 /* --volatile=state means we simply overmount /var
1195 with a tmpfs, and the rest read-only. */
1197 r = bind_remount_recursive(directory, true);
1199 return log_error_errno(r, "Failed to remount %s read-only: %m", directory);
1201 p = strappenda(directory, "/var");
/* NOTE(review): the message prints `directory`, but the path being created is
 * presumably `p` (directory + "/var"); the creating call itself is not
 * visible in this excerpt -- confirm. */
1203 if (r < 0 && errno != EEXIST)
1204 return log_error_errno(errno, "Failed to create %s: %m", directory);
1206 if (mount("tmpfs", p, "tmpfs", MS_STRICTATIME, "mode=755") < 0)
1207 return log_error_errno(errno, "Failed to mount tmpfs to /var: %m");
/* Implement --volatile=yes for the tree at `directory`.
 *
 * Visible sequence: create a temporary directory, mount a tmpfs on it, bind
 * mount directory/usr to <tmp>/usr recursively, remount that bind mount
 * read-only, then move the tmpfs on top of `directory` with MS_MOVE.
 *
 * tmpfs_mounted and bind_mounted record how far the setup got (presumably so
 * a failure path can undo it -- that path is not visible in this excerpt). */
1212 static int setup_volatile(const char *directory) {
1213 bool tmpfs_mounted = false, bind_mounted = false;
/* mkdtemp() replaces the trailing XXXXXX in place. */
1214 char template[] = "/tmp/nspawn-volatile-XXXXXX";
1220 if (arg_volatile != VOLATILE_YES)
1223 /* --volatile=yes means we mount a tmpfs to the root dir, and
1224 the original /usr to use inside it, and that read-only. */
1226 if (!mkdtemp(template))
1227 return log_error_errno(errno, "Failed to create temporary directory: %m");
1229 if (mount("tmpfs", template, "tmpfs", MS_STRICTATIME, "mode=755") < 0) {
1230 log_error_errno(errno, "Failed to mount tmpfs for root directory: %m");
1235 tmpfs_mounted = true;
/* f is the source (/usr of the original tree), t is the target inside the new
 * tmpfs. */
1237 f = strappenda(directory, "/usr");
1238 t = strappenda(template, "/usr");
1241 if (r < 0 && errno != EEXIST) {
1242 log_error_errno(errno, "Failed to create %s: %m", t);
1247 if (mount(f, t, "bind", MS_BIND|MS_REC, NULL) < 0) {
1248 log_error_errno(errno, "Failed to create /usr bind mount: %m");
1253 bind_mounted = true;
1255 r = bind_remount_recursive(t, true);
1257 log_error_errno(r, "Failed to remount %s read-only: %m", t);
/* Put the prepared tmpfs in place of the original root directory. */
1261 if (mount(template, directory, NULL, MS_MOVE, NULL) < 0) {
1262 log_error_errno(errno, "Failed to move root mount: %m");
/* Format `id` into `s` as a dashed UUID string.
 *
 * The format string prints the 16 bytes as lower-case hex in 4-2-2-2-6 byte
 * groups, i.e. 36 characters; together with the terminating NUL that matches
 * the s[37] parameter. The formatting call itself and the return statement
 * are not visible in this excerpt. */
1280 static char* id128_format_as_uuid(sd_id128_t id, char s[37]) {
1283 "%02x%02x%02x%02x-%02x%02x-%02x%02x-%02x%02x-%02x%02x%02x%02x%02x%02x",
1284 SD_ID128_FORMAT_VAL(id));
/* Give the container its own random boot ID.
 *
 * A random 128-bit ID is written in UUID form to a file under dest/dev and
 * that file is bind mounted over dest/proc/sys/kernel/random/boot_id, then
 * remounted read-only. A failure of the read-only remount is only a warning.
 *
 * The function tests arg_share_system first (presumably returning early when
 * it is set -- the return line is not visible in this excerpt). */
1289 static int setup_boot_id(const char *dest) {
1290 _cleanup_free_ char *from = NULL, *to = NULL;
1291 sd_id128_t rnd = {};
1297 if (arg_share_system)
1300 /* Generate a new randomized boot ID, so that each boot-up of
1301 * the container gets a new one */
1303 from = strappend(dest, "/dev/proc-sys-kernel-random-boot-id");
1304 to = strappend(dest, "/proc/sys/kernel/random/boot_id");
1308 r = sd_id128_randomize(&rnd);
1310 return log_error_errno(r, "Failed to generate random boot id: %m");
1312 id128_format_as_uuid(rnd, as_uuid);
1314 r = write_string_file(from, as_uuid);
1316 return log_error_errno(r, "Failed to write boot id: %m");
1318 if (mount(from, to, "bind", MS_BIND, NULL) < 0) {
1319 log_error_errno(errno, "Failed to bind mount boot id: %m");
/* Any non-zero result of the remount is treated as failure here. */
1321 } else if (mount(from, to, "bind", MS_BIND|MS_REMOUNT|MS_RDONLY, NULL))
1322 log_warning_errno(errno, "Failed to make boot id read-only: %m");
/* Recreate a fixed set of host device nodes below dest/dev.
 *
 * For every name in `devnodes` (the list contents are not visible in this
 * excerpt) the host's /dev/<name> is stat()ed and a node with the same mode
 * and device number is created at dest/dev/<name>. A missing host node
 * (ENOENT) is not treated as a stat error; a host entry that is neither a
 * character nor a block device is rejected. */
1328 static int copy_devnodes(const char *dest) {
/* NUL-separated string list, iterated with NULSTR_FOREACH() below. */
1330 static const char devnodes[] =
/* Saved umask with automatic cleanup; the call that initializes u is not
 * visible in this excerpt. */
1341 _cleanup_umask_ mode_t u;
1347 NULSTR_FOREACH(d, devnodes) {
1348 _cleanup_free_ char *from = NULL, *to = NULL;
1351 from = strappend("/dev/", d);
1352 to = strjoin(dest, "/dev/", d, NULL);
1356 if (stat(from, &st) < 0) {
1358 if (errno != ENOENT)
1359 return log_error_errno(errno, "Failed to stat %s: %m", from);
1361 } else if (!S_ISCHR(st.st_mode) && !S_ISBLK(st.st_mode)) {
1363 log_error("%s is not a char or block device, cannot copy", from);
1367 r = mkdir_parents(to, 0775);
1369 log_error_errno(r, "Failed to create parent directory of %s: %m", to);
/* Same type, permissions and major/minor as the host node. */
1373 if (mknod(to, st.st_mode, st.st_rdev) < 0)
1374 return log_error_errno(errno, "mknod(%s) failed: %m", to);
/* Create dest/dev/ptmx as a relative symlink to "pts/ptmx". */
1381 static int setup_ptmx(const char *dest) {
1382 _cleanup_free_ char *p = NULL;
1384 p = strappend(dest, "/dev/ptmx");
1388 if (symlink("pts/ptmx", p) < 0)
1389 return log_error_errno(errno, "Failed to create /dev/ptmx symlink: %m");
/* Make the TTY at path `console` appear as dest/dev/console.
 *
 * The TTY is first set to mode 0600 with owner and group 0. A placeholder
 * device node is then created at dest/dev/console and `console` is bind
 * mounted over it. */
1394 static int setup_dev_console(const char *dest, const char *console) {
/* Saved umask with automatic cleanup; the call that initializes u is not
 * visible in this excerpt. */
1395 _cleanup_umask_ mode_t u;
/* /dev/null only supplies the file type and device number for the
 * placeholder node created below. */
1405 if (stat("/dev/null", &st) < 0)
1406 return log_error_errno(errno, "Failed to stat /dev/null: %m");
1408 r = chmod_and_chown(console, 0600, 0, 0);
1410 return log_error_errno(r, "Failed to correct access mode for TTY: %m");
1412 /* We need to bind mount the right tty to /dev/console since
1413 * ptys can only exist on pts file systems. To have something
1414 * to bind mount things on we create a device node first, and
1415 * use /dev/null for that since the cgroups device policy
1416 * allows us to create that freely, while we cannot create
1417 * /dev/console. (Note that the major minor doesn't actually
1418 * matter here, since we mount it over anyway). */
1420 to = strappenda(dest, "/dev/console");
/* Keep /dev/null's file type bits, but force the permissions to 0600. */
1421 if (mknod(to, (st.st_mode & ~07777) | 0600, st.st_rdev) < 0)
1422 return log_error_errno(errno, "mknod() for /dev/console failed: %m");
1424 if (mount(console, to, "bind", MS_BIND, NULL) < 0)
1425 return log_error_errno(errno, "Bind mount for /dev/console failed: %m");
/* Provide a /proc/kmsg replacement inside the container.
 *
 * A FIFO is created at dest/dev/kmsg, bind mounted onto dest/proc/kmsg and
 * opened; the open file descriptor is then sent as an SCM_RIGHTS control
 * message over `kmsg_socket`. The final step of making the FIFO unavailable
 * as /dev/kmsg is announced by the last comment, but its code is not visible
 * in this excerpt. */
1430 static int setup_kmsg(const char *dest, int kmsg_socket) {
1431 _cleanup_free_ char *from = NULL, *to = NULL;
/* Saved umask with automatic cleanup; the call that initializes u is not
 * visible in this excerpt. */
1432 _cleanup_umask_ mode_t u;
/* Control-message buffer sized for exactly one int (the file descriptor). */
1435 struct cmsghdr cmsghdr;
1436 uint8_t buf[CMSG_SPACE(sizeof(int))];
1438 struct msghdr mh = {
1439 .msg_control = &control,
1440 .msg_controllen = sizeof(control),
1442 struct cmsghdr *cmsg;
1445 assert(kmsg_socket >= 0);
1449 /* We create the kmsg FIFO as /dev/kmsg, but immediately
1450 * delete it after bind mounting it to /proc/kmsg. While FIFOs
1451 * on the reading side behave very similar to /proc/kmsg,
1452 * their writing side behaves differently from /dev/kmsg in
1453 * that writing blocks when nothing is reading. In order to
1454 * avoid any problems with containers deadlocking due to this
1455 * we simply make /dev/kmsg unavailable to the container. */
1456 if (asprintf(&from, "%s/dev/kmsg", dest) < 0 ||
1457 asprintf(&to, "%s/proc/kmsg", dest) < 0)
1460 if (mkfifo(from, 0600) < 0)
1461 return log_error_errno(errno, "mkfifo() for /dev/kmsg failed: %m");
1463 r = chmod_and_chown(from, 0600, 0, 0);
1465 return log_error_errno(r, "Failed to correct access mode for /dev/kmsg: %m");
1467 if (mount(from, to, "bind", MS_BIND, NULL) < 0)
1468 return log_error_errno(errno, "Bind mount for /proc/kmsg failed: %m");
/* O_RDWR with O_NDELAY, so that opening the FIFO does not wait for a peer. */
1470 fd = open(from, O_RDWR|O_NDELAY|O_CLOEXEC);
1472 return log_error_errno(errno, "Failed to open fifo: %m");
/* Build a single SCM_RIGHTS control message carrying fd. */
1474 cmsg = CMSG_FIRSTHDR(&mh);
1475 cmsg->cmsg_level = SOL_SOCKET;
1476 cmsg->cmsg_type = SCM_RIGHTS;
1477 cmsg->cmsg_len = CMSG_LEN(sizeof(int));
1478 memcpy(CMSG_DATA(cmsg), &fd, sizeof(int));
/* Shrink the control length to the one message actually filled in. */
1480 mh.msg_controllen = cmsg->cmsg_len;
1482 /* Store away the fd in the socket, so that it stays open as
1483 * long as we run the child */
1484 k = sendmsg(kmsg_socket, &mh, MSG_NOSIGNAL);
1488 return log_error_errno(errno, "Failed to send FIFO fd: %m");
1490 /* And now make the FIFO unavailable as /dev/kmsg... */
/* Create a NETLINK_ROUTE socket and pass it to the peer of `send_fd` as an
 * SCM_RIGHTS control message.
 *
 * The function tests arg_expose_ports first (presumably returning early when
 * no ports are exposed -- the return line is not visible in this excerpt). */
1495 static int send_rtnl(int send_fd) {
/* Control-message buffer sized for exactly one int (the file descriptor). */
1497 struct cmsghdr cmsghdr;
1498 uint8_t buf[CMSG_SPACE(sizeof(int))];
1500 struct msghdr mh = {
1501 .msg_control = &control,
1502 .msg_controllen = sizeof(control),
1504 struct cmsghdr *cmsg;
/* Declared with _cleanup_close_, so the local descriptor is closed
 * automatically when the function returns. */
1505 _cleanup_close_ int fd = -1;
1508 assert(send_fd >= 0);
1510 if (!arg_expose_ports)
1513 fd = socket(PF_NETLINK, SOCK_RAW|SOCK_CLOEXEC|SOCK_NONBLOCK, NETLINK_ROUTE);
1515 return log_error_errno(errno, "failed to allocate container netlink: %m");
/* Build a single SCM_RIGHTS control message carrying fd. */
1517 cmsg = CMSG_FIRSTHDR(&mh);
1518 cmsg->cmsg_level = SOL_SOCKET;
1519 cmsg->cmsg_type = SCM_RIGHTS;
1520 cmsg->cmsg_len = CMSG_LEN(sizeof(int));
1521 memcpy(CMSG_DATA(cmsg), &fd, sizeof(int));
/* Shrink the control length to the one message actually filled in. */
1523 mh.msg_controllen = cmsg->cmsg_len;
1525 /* Store away the fd in the socket, so that it stays open as
1526 * long as we run the child */
1527 k = sendmsg(send_fd, &mh, MSG_NOSIGNAL);
1529 return log_error_errno(errno, "Failed to send netlink fd: %m");
/* flush_ports(): withdraws the firewall DNAT entries belonging to the address
 * stored in *exposed and then resets *exposed to IN_ADDR_NULL.
 *
 * Iterates over the arg_expose_ports list and calls fw_add_local_dnat() with
 * a first argument of false for each entry; a failing call is only logged as
 * a warning. Only IPv4 is handled (af is fixed to AF_INET).
 *
 * NOTE(review): the bodies of the two leading checks are not visible in this
 * listing; presumably the function returns early when no ports are configured
 * or when *exposed is already the null address -- confirm. */
1534 static int flush_ports(union in_addr_union *exposed) {
1536 int r, af = AF_INET;
1540 if (!arg_expose_ports)
1543 if (in_addr_is_null(af, exposed))
1546 log_debug("Lost IP address.");
1548 LIST_FOREACH(ports, p, arg_expose_ports) {
1549 r = fw_add_local_dnat(false,
1560 log_warning_errno(r, "Failed to modify firewall: %m");
1563 *exposed = IN_ADDR_NULL;
/* expose_ports(): brings the firewall DNAT entries in line with the current
 * address list obtained through rtnl.
 *
 * Enumerates local addresses with local_addresses() for AF_INET and looks at
 * the first entry: it is tested for matching family and a scope below
 * RT_SCOPE_LINK. Depending on that test the function either hands over to
 * flush_ports() or continues with addresses[0].address as the new exposed
 * address. If the new address equals *exposed nothing needs to change;
 * otherwise fw_add_local_dnat() is called with a first argument of true for
 * every entry in arg_expose_ports, passing the previous address (or NULL when
 * there was none), and *exposed is updated to the new address.
 *
 * Firewall failures are logged as warnings; an enumeration failure is logged
 * as an error and returned.
 *
 * NOTE(review): the exact condition leading to flush_ports() (presumably "no
 * usable address") is partly outside this listing -- confirm. */
1567 static int expose_ports(sd_rtnl *rtnl, union in_addr_union *exposed) {
1568 _cleanup_free_ struct local_address *addresses = NULL;
1569 _cleanup_free_ char *pretty = NULL;
1570 union in_addr_union new_exposed;
1573 int af = AF_INET, r;
1577 /* Invoked each time an address is added or removed inside the
1580 if (!arg_expose_ports)
1583 r = local_addresses(rtnl, 0, af, &addresses);
1585 return log_error_errno(r, "Failed to enumerate local addresses: %m");
1588 addresses[0].family == af &&
1589 addresses[0].scope < RT_SCOPE_LINK;
1592 return flush_ports(exposed);
1594 new_exposed = addresses[0].address;
1595 if (in_addr_equal(af, exposed, &new_exposed))
1598 in_addr_to_string(af, &new_exposed, &pretty);
1599 log_debug("New container IP is %s.", strna(pretty));
1601 LIST_FOREACH(ports, p, arg_expose_ports) {
1603 r = fw_add_local_dnat(true,
1612 in_addr_is_null(af, exposed) ? NULL : exposed);
1614 log_warning_errno(r, "Failed to modify firewall: %m");
1617 *exposed = new_exposed;
/* on_address_change(): rtnl match callback (registered by watch_rtnl() for
 * RTM_NEWADDR and RTM_DELADDR). userdata carries the pointer to the currently
 * exposed address; the callback simply re-runs expose_ports() and does not
 * use its result on the visible line. */
1621 static int on_address_change(sd_rtnl *rtnl, sd_rtnl_message *m, void *userdata) {
1622 union in_addr_union *exposed = userdata;
1628 expose_ports(rtnl, exposed);
/* watch_rtnl(): receives the netlink socket sent by send_rtnl() and starts
 * watching it for address changes.
 *
 * Reads one SCM_RIGHTS control message from recv_fd, wraps the received
 * descriptor in an sd_rtnl object subscribed to RTNLGRP_IPV4_IFADDR, installs
 * on_address_change() as handler for RTM_NEWADDR and RTM_DELADDR (with
 * exposed as userdata) and attaches the object to the given event loop.
 *
 * Each failing step is logged and its error returned. The received control
 * message is checked with assert() only.
 *
 * NOTE(review): the handling of *ret and the early return for an unset
 * arg_expose_ports are not visible in this listing -- confirm against the
 * full source. */
1632 static int watch_rtnl(sd_event *event, int recv_fd, union in_addr_union *exposed, sd_rtnl **ret) {
1634 struct cmsghdr cmsghdr;
1635 uint8_t buf[CMSG_SPACE(sizeof(int))];
1637 struct msghdr mh = {
1638 .msg_control = &control,
1639 .msg_controllen = sizeof(control),
1641 struct cmsghdr *cmsg;
1642 _cleanup_rtnl_unref_ sd_rtnl *rtnl = NULL;
1647 assert(recv_fd >= 0);
1650 if (!arg_expose_ports)
1653 k = recvmsg(recv_fd, &mh, MSG_NOSIGNAL);
1655 return log_error_errno(errno, "Failed to recv netlink fd: %m");
1657 cmsg = CMSG_FIRSTHDR(&mh);
1658 assert(cmsg->cmsg_level == SOL_SOCKET);
1659 assert(cmsg->cmsg_type == SCM_RIGHTS);
1660 assert(cmsg->cmsg_len == CMSG_LEN(sizeof(int)));
1661 memcpy(&fd, CMSG_DATA(cmsg), sizeof(int));
1663 r = sd_rtnl_open_fd(&rtnl, fd, 1, RTNLGRP_IPV4_IFADDR);
1666 return log_error_errno(r, "Failed to create rtnl object: %m");
1669 r = sd_rtnl_add_match(rtnl, RTM_NEWADDR, on_address_change, exposed);
1671 return log_error_errno(r, "Failed to subscribe to RTM_NEWADDR messages: %m");
1673 r = sd_rtnl_add_match(rtnl, RTM_DELADDR, on_address_change, exposed);
1675 return log_error_errno(r, "Failed to subscribe to RTM_DELADDR messages: %m");
1677 r = sd_rtnl_attach_event(rtnl, event, 0);
1679 return log_error_errno(r, "Failed to add to event loop: %m");
/* setup_hostname(): sets the hostname to arg_machine through
 * sethostname_idempotent().
 *
 * NOTE(review): the statements following the two checks are not visible in
 * this listing; presumably the function is a no-op when arg_share_system is
 * set and reports an error when setting the name fails -- confirm. */
1687 static int setup_hostname(void) {
1689 if (arg_share_system)
1692 if (sethostname_idempotent(arg_machine) < 0)
/* setup_journal(): links the journal directory of the container (below
 * directory) with the one of the host, according to arg_link_journal.
 *
 * Steps visible here:
 *  - read and validate the container's /etc/machine-id; a missing or empty
 *    file is tolerated in LINK_AUTO mode;
 *  - refuse to link when host and container machine IDs are equal;
 *  - compute p = /var/log/journal/<id> on the host and q = the same path
 *    below directory, and refuse (or skip, in LINK_AUTO mode) when either is
 *    already a mount point;
 *  - inspect what p currently is via readlink_and_make_absolute() and remove
 *    a stale symlink or directory where the mode asks for it;
 *  - LINK_GUEST: symlink p -> q and create q;
 *  - LINK_HOST: create p (without parents), create q and bind mount p on q.
 *
 * Errors are logged; with arg_link_journal_try set, failures to create the
 * link or the host directory are only logged at debug level.
 *
 * Helpers whose result is stored in r (mkdir_p(),
 * readlink_and_make_absolute()) report failure through that return value, so
 * their log calls pass r, not errno. Plain system calls (unlink, rmdir,
 * symlink, mkdir, mount) keep using errno.
 *
 * NOTE(review): several short lines (early returns, error tests) are not
 * part of this listing. */
1698 static int setup_journal(const char *directory) {
1699 sd_id128_t machine_id, this_id;
1700 _cleanup_free_ char *p = NULL, *b = NULL, *q = NULL, *d = NULL;
1704 /* Don't link journals in ephemeral mode */
1708 p = strappend(directory, "/etc/machine-id");
1712 r = read_one_line_file(p, &b);
1713 if (r == -ENOENT && arg_link_journal == LINK_AUTO)
1716 return log_error_errno(r, "Failed to read machine ID from %s: %m", p);
1719 if (isempty(id) && arg_link_journal == LINK_AUTO)
1722 /* Verify validity */
1723 r = sd_id128_from_string(id, &machine_id);
1725 return log_error_errno(r, "Failed to parse machine ID from %s: %m", p);
1727 r = sd_id128_get_machine(&this_id);
1729 return log_error_errno(r, "Failed to retrieve machine ID: %m");
1731 if (sd_id128_equal(machine_id, this_id)) {
1732 log_full(arg_link_journal == LINK_AUTO ? LOG_WARNING : LOG_ERR,
1733 "Host and machine ids are equal (%s): refusing to link journals", id);
1734 if (arg_link_journal == LINK_AUTO)
1739 if (arg_link_journal == LINK_NO)
1743 p = strappend("/var/log/journal/", id);
1744 q = strjoin(directory, "/var/log/journal/", id, NULL);
1748 if (path_is_mount_point(p, false) > 0) {
1749 if (arg_link_journal != LINK_AUTO) {
1750 log_error("%s: already a mount point, refusing to use for journal", p);
1757 if (path_is_mount_point(q, false) > 0) {
1758 if (arg_link_journal != LINK_AUTO) {
1759 log_error("%s: already a mount point, refusing to use for journal", q);
1766 r = readlink_and_make_absolute(p, &d);
1768 if ((arg_link_journal == LINK_GUEST ||
1769 arg_link_journal == LINK_AUTO) &&
1772 r = mkdir_p(q, 0755);
/* mkdir_p() result is in r; errno may be stale here */
1774 log_warning_errno(r, "Failed to create directory %s: %m", q);
1779 return log_error_errno(errno, "Failed to remove symlink %s: %m", p);
1780 } else if (r == -EINVAL) {
1782 if (arg_link_journal == LINK_GUEST &&
1785 if (errno == ENOTDIR) {
1786 log_error("%s already exists and is neither a symlink nor a directory", p);
1789 log_error_errno(errno, "Failed to remove %s: %m", p);
1793 } else if (r != -ENOENT) {
/* r still holds the readlink_and_make_absolute() error tested above */
1794 log_error_errno(r, "readlink(%s) failed: %m", p);
1798 if (arg_link_journal == LINK_GUEST) {
1800 if (symlink(q, p) < 0) {
1801 if (arg_link_journal_try) {
1802 log_debug_errno(errno, "Failed to symlink %s to %s, skipping journal setup: %m", q, p);
1805 log_error_errno(errno, "Failed to symlink %s to %s: %m", q, p);
1810 r = mkdir_p(q, 0755);
1812 log_warning_errno(r, "Failed to create directory %s: %m", q);
1816 if (arg_link_journal == LINK_HOST) {
1817 /* don't create parents here -- if the host doesn't have
1818 * permanent journal set up, don't force it here */
1821 if (arg_link_journal_try) {
1822 log_debug_errno(errno, "Failed to create %s, skipping journal setup: %m", p);
1825 log_error_errno(errno, "Failed to create %s: %m", p);
1830 } else if (access(p, F_OK) < 0)
1833 if (dir_is_empty(q) == 0)
1834 log_warning("%s is not empty, proceeding anyway.", q);
1836 r = mkdir_p(q, 0755);
1838 log_error_errno(r, "Failed to create %s: %m", q);
1842 if (mount(p, q, "bind", MS_BIND, NULL) < 0)
1843 return log_error_errno(errno, "Failed to bind mount journal from host into guest: %m");
/* drop_capabilities(): drops from the capability bounding set everything that
 * is not in arg_retain (the complement of arg_retain is passed to
 * capability_bounding_set_drop()) and returns that helper's result. */
1848 static int drop_capabilities(void) {
1849 return capability_bounding_set_drop(~arg_retain, false);
/* register_machine(): announces the container to org.freedesktop.machine1 on
 * the system bus.
 *
 * With arg_keep_unit set, the Manager method RegisterMachineWithNetwork is
 * called directly. Otherwise a CreateMachineWithNetwork call is assembled by
 * hand so that an array of (sv) unit properties can be appended: an optional
 * "Slice" (when arg_slice is non-empty), "DevicePolicy" = "strict", and a
 * "DeviceAllow" list with nine entries covering API device nodes and ptys.
 *
 * Both variants pass arg_uuid, arg_directory (empty string when unset) and an
 * interface index array that holds local_ifindex only when it is positive.
 *
 * Every failing step is logged; a failed bus call is reported with the bus
 * error message.
 *
 * NOTE(review): several argument lines of the bus calls (including how pid
 * is passed) are not part of this listing. */
1852 static int register_machine(pid_t pid, int local_ifindex) {
1853 _cleanup_bus_error_free_ sd_bus_error error = SD_BUS_ERROR_NULL;
1854 _cleanup_bus_close_unref_ sd_bus *bus = NULL;
1860 r = sd_bus_default_system(&bus);
1862 return log_error_errno(r, "Failed to open system bus: %m");
1864 if (arg_keep_unit) {
1865 r = sd_bus_call_method(
1867 "org.freedesktop.machine1",
1868 "/org/freedesktop/machine1",
1869 "org.freedesktop.machine1.Manager",
1870 "RegisterMachineWithNetwork",
1875 SD_BUS_MESSAGE_APPEND_ID128(arg_uuid),
1879 strempty(arg_directory),
1880 local_ifindex > 0 ? 1 : 0, local_ifindex);
1882 _cleanup_bus_message_unref_ sd_bus_message *m = NULL;
1884 r = sd_bus_message_new_method_call(
1887 "org.freedesktop.machine1",
1888 "/org/freedesktop/machine1",
1889 "org.freedesktop.machine1.Manager",
1890 "CreateMachineWithNetwork");
1892 return log_error_errno(r, "Failed to create message: %m");
1894 r = sd_bus_message_append(
1898 SD_BUS_MESSAGE_APPEND_ID128(arg_uuid),
1902 strempty(arg_directory),
1903 local_ifindex > 0 ? 1 : 0, local_ifindex);
1905 return log_error_errno(r, "Failed to append message arguments: %m");
1907 r = sd_bus_message_open_container(m, 'a', "(sv)");
1909 return log_error_errno(r, "Failed to open container: %m");
1911 if (!isempty(arg_slice)) {
1912 r = sd_bus_message_append(m, "(sv)", "Slice", "s", arg_slice);
1914 return log_error_errno(r, "Failed to append slice: %m");
1917 r = sd_bus_message_append(m, "(sv)", "DevicePolicy", "s", "strict");
1919 return log_error_errno(r, "Failed to add device policy: %m");
1921 r = sd_bus_message_append(m, "(sv)", "DeviceAllow", "a(ss)", 9,
1922 /* Allow the container to
1923 * access and create the API
1924 * device nodes, so that
1925 * PrivateDevices= in the
1926 * container can work
1931 "/dev/random", "rwm",
1932 "/dev/urandom", "rwm",
1934 "/dev/net/tun", "rwm",
1935 /* Allow the container
1936 * access to ptys. However,
1938 * container to ever create
1939 * these device nodes. */
1940 "/dev/pts/ptmx", "rw",
1943 return log_error_errno(r, "Failed to add device whitelist: %m");
1945 r = sd_bus_message_close_container(m);
1947 return log_error_errno(r, "Failed to close container: %m");
1949 r = sd_bus_call(bus, m, 0, &error, NULL);
1953 log_error("Failed to register machine: %s", bus_error_message(&error, r));
/* terminate_machine(): asks org.freedesktop.machine1 to terminate the machine
 * belonging to pid.
 *
 * Two bus calls are made on the system bus: one on the Manager interface
 * whose reply carries the machine's object path (read with signature "o"),
 * and one on the Machine interface of that object. Failures of either call
 * are only logged at debug level, because the machine may already have been
 * cleaned up. Failure to open the bus or to parse the reply is logged as an
 * error and returned.
 *
 * NOTE(review): the method names and arguments of both calls are not part of
 * this listing. */
1960 static int terminate_machine(pid_t pid) {
1961 _cleanup_bus_error_free_ sd_bus_error error = SD_BUS_ERROR_NULL;
1962 _cleanup_bus_message_unref_ sd_bus_message *reply = NULL;
1963 _cleanup_bus_close_unref_ sd_bus *bus = NULL;
1970 r = sd_bus_default_system(&bus);
1972 return log_error_errno(r, "Failed to open system bus: %m");
1974 r = sd_bus_call_method(
1976 "org.freedesktop.machine1",
1977 "/org/freedesktop/machine1",
1978 "org.freedesktop.machine1.Manager",
1985 /* Note that the machine might already have been
1986 * cleaned up automatically, hence don't consider it a
1987 * failure if we cannot get the machine object. */
1988 log_debug("Failed to get machine: %s", bus_error_message(&error, r));
1992 r = sd_bus_message_read(reply, "o", &path);
1994 return bus_log_parse_error(r);
1996 r = sd_bus_call_method(
1998 "org.freedesktop.machine1",
2000 "org.freedesktop.machine1.Machine",
2006 log_debug("Failed to terminate machine: %s", bus_error_message(&error, r));
/* reset_audit_loginuid(): resets the audit login UID of the calling process
 * to the "unset" value 4294967295.
 *
 * Reads /proc/self/loginuid; when it already contains 4294967295 there is
 * nothing to do. Otherwise that value is written back. If the write fails, a
 * multi-line explanation is logged that points at old kernels with audit
 * enabled and announces a 5s sleep.
 *
 * NOTE(review): the statements following the arg_share_system check and the
 * error tests are not part of this listing; presumably the function is
 * skipped when the system is shared with the host -- confirm. */
2013 static int reset_audit_loginuid(void) {
2014 _cleanup_free_ char *p = NULL;
2017 if (arg_share_system)
2020 r = read_one_line_file("/proc/self/loginuid", &p);
2024 return log_error_errno(r, "Failed to read /proc/self/loginuid: %m");
2026 /* Already reset? */
2027 if (streq(p, "4294967295"))
2030 r = write_string_file("/proc/self/loginuid", "4294967295");
2032 log_error("Failed to reset audit login UID. This probably means that your kernel is too\n"
2033 "old and you have audit enabled. Note that the auditing subsystem is known to\n"
2034 "be incompatible with containers on old kernels. Please make sure to upgrade\n"
2035 "your kernel or to turn off auditing with 'audit=0' on the kernel command line before\n"
2036 "using systemd-nspawn. Sleeping for 5s... (%s)\n", strerror(-r));
/* Fixed 128-bit keys passed to generate_mac() as hash_key: one for the host
 * side of a veth pair, one for the container side, and one for macvlan
 * interfaces, so that each kind of interface gets a different address. */
2044 #define HOST_HASH_KEY SD_ID128_MAKE(1a,37,6f,c7,46,ec,45,0b,ad,a3,d5,31,06,60,5d,b1)
2045 #define CONTAINER_HASH_KEY SD_ID128_MAKE(c3,c4,f9,19,b5,57,b2,1c,e6,cf,14,27,03,9c,ee,a2)
2046 #define MACVLAN_HASH_KEY SD_ID128_MAKE(00,13,6d,bc,66,83,44,81,bb,0c,f9,51,1f,24,a6,6f)
/* generate_mac(): derives a reproducible MAC address for this container.
 *
 * A buffer v is filled with the host's machine ID (sd_id128_get_machine()),
 * followed by the machine name arg_machine and the bytes of idx. The buffer
 * is hashed with siphash24() under hash_key, and the first ETH_ALEN bytes of
 * the result become the address. The multicast bit of the first octet is
 * cleared and the locally-administered bit is set.
 *
 * mac      - output address
 * hash_key - key selecting the address family (see the *_HASH_KEY macros)
 * idx      - additional index mixed into the hash input
 *
 * NOTE(review): buffer allocation, the condition under which idx is appended
 * and the return statements are not part of this listing. */
2048 static int generate_mac(struct ether_addr *mac, sd_id128_t hash_key, uint64_t idx) {
2054 l = strlen(arg_machine);
2055 sz = sizeof(sd_id128_t) + l;
2061 /* fetch some persistent data unique to the host */
2062 r = sd_id128_get_machine((sd_id128_t*) v);
2066 /* combine with some data unique (on this host) to this
2067 * container instance */
2068 i = mempcpy(v + sizeof(sd_id128_t), arg_machine, l);
2071 memcpy(i, &idx, sizeof(idx));
2074 /* Let's hash the host machine ID plus the container name. We
2075 * use a fixed, but originally randomly created hash key here. */
2076 siphash24(result, v, sz, hash_key.bytes);
2078 assert_cc(ETH_ALEN <= sizeof(result));
2079 memcpy(mac->ether_addr_octet, result, ETH_ALEN);
2081 /* see eth_random_addr in the kernel */
2082 mac->ether_addr_octet[0] &= 0xfe; /* clear multicast bit */
2083 mac->ether_addr_octet[0] |= 0x02; /* set local assignment bit (IEEE802) */
/* setup_veth(): creates the veth pair that connects host and container.
 *
 * The host-side name is written into iface_name as "vb-<machine>" when
 * arg_network_bridge is set and "ve-<machine>" otherwise, limited by the
 * snprintf() size of IFNAMSIZ - 1. Both ends get addresses from
 * generate_mac() (HOST_HASH_KEY and CONTAINER_HASH_KEY). One RTM_NEWLINK
 * message of kind "veth" is sent; its VETH_INFO_PEER part names the peer
 * "host0", sets its MAC and places it in the network namespace of pid via
 * IFLA_NET_NS_PID. Afterwards the host-side interface index is looked up with
 * if_nametoindex().
 *
 * Every failing step is logged and returned.
 *
 * NOTE(review): the statements after the arg_private_network and
 * arg_network_veth checks and the assignment to *ifi are not part of this
 * listing; presumably the function returns 0 when veth networking is not
 * requested -- confirm. */
2088 static int setup_veth(pid_t pid, char iface_name[IFNAMSIZ], int *ifi) {
2089 _cleanup_rtnl_message_unref_ sd_rtnl_message *m = NULL;
2090 _cleanup_rtnl_unref_ sd_rtnl *rtnl = NULL;
2091 struct ether_addr mac_host, mac_container;
2094 if (!arg_private_network)
2097 if (!arg_network_veth)
2100 /* Use two different interface name prefixes depending whether
2101 * we are in bridge mode or not. */
2102 snprintf(iface_name, IFNAMSIZ - 1, "%s-%s",
2103 arg_network_bridge ? "vb" : "ve", arg_machine);
2105 r = generate_mac(&mac_container, CONTAINER_HASH_KEY, 0);
2107 return log_error_errno(r, "Failed to generate predictable MAC address for container side: %m");
2109 r = generate_mac(&mac_host, HOST_HASH_KEY, 0);
2111 return log_error_errno(r, "Failed to generate predictable MAC address for host side: %m");
2113 r = sd_rtnl_open(&rtnl, 0);
2115 return log_error_errno(r, "Failed to connect to netlink: %m");
2117 r = sd_rtnl_message_new_link(rtnl, &m, RTM_NEWLINK, 0);
2119 return log_error_errno(r, "Failed to allocate netlink message: %m");
2121 r = sd_rtnl_message_append_string(m, IFLA_IFNAME, iface_name);
2123 return log_error_errno(r, "Failed to add netlink interface name: %m");
2125 r = sd_rtnl_message_append_ether_addr(m, IFLA_ADDRESS, &mac_host);
2127 return log_error_errno(r, "Failed to add netlink MAC address: %m");
2129 r = sd_rtnl_message_open_container(m, IFLA_LINKINFO);
2131 return log_error_errno(r, "Failed to open netlink container: %m");
2133 r = sd_rtnl_message_open_container_union(m, IFLA_INFO_DATA, "veth");
2135 return log_error_errno(r, "Failed to open netlink container: %m");
2137 r = sd_rtnl_message_open_container(m, VETH_INFO_PEER);
2139 return log_error_errno(r, "Failed to open netlink container: %m");
2141 r = sd_rtnl_message_append_string(m, IFLA_IFNAME, "host0");
2143 return log_error_errno(r, "Failed to add netlink interface name: %m");
2145 r = sd_rtnl_message_append_ether_addr(m, IFLA_ADDRESS, &mac_container);
2147 return log_error_errno(r, "Failed to add netlink MAC address: %m");
2149 r = sd_rtnl_message_append_u32(m, IFLA_NET_NS_PID, pid);
2151 return log_error_errno(r, "Failed to add netlink namespace field: %m");
2153 r = sd_rtnl_message_close_container(m);
2155 return log_error_errno(r, "Failed to close netlink container: %m");
2157 r = sd_rtnl_message_close_container(m);
2159 return log_error_errno(r, "Failed to close netlink container: %m");
2161 r = sd_rtnl_message_close_container(m);
2163 return log_error_errno(r, "Failed to close netlink container: %m");
2165 r = sd_rtnl_call(rtnl, m, 0, NULL);
2167 return log_error_errno(r, "Failed to add new veth interfaces: %m");
2169 i = (int) if_nametoindex(iface_name);
2171 return log_error_errno(errno, "Failed to resolve interface %s: %m", iface_name);
/* setup_bridge(): attaches the host side of the veth pair to the bridge named
 * by arg_network_bridge.
 *
 * Resolves the bridge's interface index with if_nametoindex(), then sends an
 * RTM_SETLINK message for the interface veth_name that sets IFF_UP and sets
 * IFLA_MASTER to the bridge index.
 *
 * Every failing step is logged and returned.
 *
 * NOTE(review): the statements after the three leading option checks and the
 * assignment to *ifi are not part of this listing; presumably the function
 * returns 0 when no bridge is configured -- confirm. */
2178 static int setup_bridge(const char veth_name[], int *ifi) {
2179 _cleanup_rtnl_message_unref_ sd_rtnl_message *m = NULL;
2180 _cleanup_rtnl_unref_ sd_rtnl *rtnl = NULL;
2183 if (!arg_private_network)
2186 if (!arg_network_veth)
2189 if (!arg_network_bridge)
2192 bridge = (int) if_nametoindex(arg_network_bridge);
2194 return log_error_errno(errno, "Failed to resolve interface %s: %m", arg_network_bridge);
2198 r = sd_rtnl_open(&rtnl, 0);
2200 return log_error_errno(r, "Failed to connect to netlink: %m");
2202 r = sd_rtnl_message_new_link(rtnl, &m, RTM_SETLINK, 0);
2204 return log_error_errno(r, "Failed to allocate netlink message: %m");
2206 r = sd_rtnl_message_link_set_flags(m, IFF_UP, IFF_UP);
2208 return log_error_errno(r, "Failed to set IFF_UP flag: %m");
2210 r = sd_rtnl_message_append_string(m, IFLA_IFNAME, veth_name);
2212 return log_error_errno(r, "Failed to add netlink interface name field: %m");
2214 r = sd_rtnl_message_append_u32(m, IFLA_MASTER, bridge);
2216 return log_error_errno(r, "Failed to add netlink master field: %m");
2218 r = sd_rtnl_call(rtnl, m, 0, NULL);
2220 return log_error_errno(r, "Failed to add veth interface to bridge: %m");
/* parse_interface(): resolves the network interface called name to its
 * interface index and checks with udev that the device is initialized.
 *
 * The index comes from if_nametoindex(); the udev device is looked up by the
 * device id "n<ifindex>". An interface that cannot be resolved, has no udev
 * device, or is not yet initialized is reported with an error message.
 *
 * NOTE(review): the return statements are not part of this listing; callers
 * treat the result as the interface index -- confirm the error values. */
2225 static int parse_interface(struct udev *udev, const char *name) {
2226 _cleanup_udev_device_unref_ struct udev_device *d = NULL;
2227 char ifi_str[2 + DECIMAL_STR_MAX(int)];
2230 ifi = (int) if_nametoindex(name);
2232 return log_error_errno(errno, "Failed to resolve interface %s: %m", name);
2234 sprintf(ifi_str, "n%i", ifi);
2235 d = udev_device_new_from_device_id(udev, ifi_str);
2237 return log_error_errno(errno, "Failed to get udev device for interface %s: %m", name);
2239 if (udev_device_get_is_initialized(d) <= 0) {
2240 log_error("Network interface %s is not initialized yet.", name);
/* move_network_interfaces(): moves every interface listed in
 * arg_network_interfaces into the network namespace of pid.
 *
 * For each name the interface index is obtained through parse_interface(),
 * and an RTM_SETLINK message carrying IFLA_NET_NS_PID = pid is sent for that
 * index. Failures are logged and returned.
 *
 * NOTE(review): the statements after the two leading checks are not part of
 * this listing; presumably the function returns 0 when private networking is
 * off or the list is empty -- confirm. */
2247 static int move_network_interfaces(pid_t pid) {
2248 _cleanup_udev_unref_ struct udev *udev = NULL;
2249 _cleanup_rtnl_unref_ sd_rtnl *rtnl = NULL;
2253 if (!arg_private_network)
2256 if (strv_isempty(arg_network_interfaces))
2259 r = sd_rtnl_open(&rtnl, 0);
2261 return log_error_errno(r, "Failed to connect to netlink: %m");
2265 log_error("Failed to connect to udev.");
2269 STRV_FOREACH(i, arg_network_interfaces) {
2270 _cleanup_rtnl_message_unref_ sd_rtnl_message *m = NULL;
2273 ifi = parse_interface(udev, *i);
2277 r = sd_rtnl_message_new_link(rtnl, &m, RTM_SETLINK, ifi);
2279 return log_error_errno(r, "Failed to allocate netlink message: %m");
2281 r = sd_rtnl_message_append_u32(m, IFLA_NET_NS_PID, pid);
2283 return log_error_errno(r, "Failed to append namespace PID to netlink message: %m");
2285 r = sd_rtnl_call(rtnl, m, 0, NULL);
2287 return log_error_errno(r, "Failed to move interface %s to namespace: %m", *i);
/* setup_macvlan(): creates one macvlan interface per entry of
 * arg_network_macvlan inside the network namespace of pid.
 *
 * For each parent interface name: resolve its index via parse_interface(),
 * derive a MAC address with generate_mac() (MACVLAN_HASH_KEY, with an index
 * that is incremented per entry), and send an RTM_NEWLINK message of kind
 * "macvlan" in MACVLAN_MODE_BRIDGE. The new interface is linked to the parent
 * (IFLA_LINK), named "mv-<parent>" shortened to IFNAMSIZ-1 characters, and
 * placed in the namespace of pid (IFLA_NET_NS_PID).
 *
 * Failures are logged and returned.
 *
 * NOTE(review): the statements after the two leading checks are not part of
 * this listing; presumably the function returns 0 when private networking is
 * off or no macvlan is requested -- confirm. */
2293 static int setup_macvlan(pid_t pid) {
2294 _cleanup_udev_unref_ struct udev *udev = NULL;
2295 _cleanup_rtnl_unref_ sd_rtnl *rtnl = NULL;
2300 if (!arg_private_network)
2303 if (strv_isempty(arg_network_macvlan))
2306 r = sd_rtnl_open(&rtnl, 0);
2308 return log_error_errno(r, "Failed to connect to netlink: %m");
2312 log_error("Failed to connect to udev.");
2316 STRV_FOREACH(i, arg_network_macvlan) {
2317 _cleanup_rtnl_message_unref_ sd_rtnl_message *m = NULL;
2318 _cleanup_free_ char *n = NULL;
2319 struct ether_addr mac;
2322 ifi = parse_interface(udev, *i);
2326 r = generate_mac(&mac, MACVLAN_HASH_KEY, idx++);
2328 return log_error_errno(r, "Failed to create MACVLAN MAC address: %m");
2330 r = sd_rtnl_message_new_link(rtnl, &m, RTM_NEWLINK, 0);
2332 return log_error_errno(r, "Failed to allocate netlink message: %m");
2334 r = sd_rtnl_message_append_u32(m, IFLA_LINK, ifi);
2336 return log_error_errno(r, "Failed to add netlink interface index: %m");
2338 n = strappend("mv-", *i);
2342 strshorten(n, IFNAMSIZ-1);
2344 r = sd_rtnl_message_append_string(m, IFLA_IFNAME, n);
2346 return log_error_errno(r, "Failed to add netlink interface name: %m");
2348 r = sd_rtnl_message_append_ether_addr(m, IFLA_ADDRESS, &mac);
2350 return log_error_errno(r, "Failed to add netlink MAC address: %m");
2352 r = sd_rtnl_message_append_u32(m, IFLA_NET_NS_PID, pid);
2354 return log_error_errno(r, "Failed to add netlink namespace field: %m");
2356 r = sd_rtnl_message_open_container(m, IFLA_LINKINFO);
2358 return log_error_errno(r, "Failed to open netlink container: %m");
2360 r = sd_rtnl_message_open_container_union(m, IFLA_INFO_DATA, "macvlan");
2362 return log_error_errno(r, "Failed to open netlink container: %m");
2364 r = sd_rtnl_message_append_u32(m, IFLA_MACVLAN_MODE, MACVLAN_MODE_BRIDGE);
2366 return log_error_errno(r, "Failed to append macvlan mode: %m");
2368 r = sd_rtnl_message_close_container(m);
2370 return log_error_errno(r, "Failed to close netlink container: %m");
2372 r = sd_rtnl_message_close_container(m);
2374 return log_error_errno(r, "Failed to close netlink container: %m");
2376 r = sd_rtnl_call(rtnl, m, 0, NULL);
2378 return log_error_errno(r, "Failed to add new macvlan interfaces: %m");
/* setup_seccomp(): installs the seccomp filter applied to the container.
 *
 * The filter allows everything by default (SCMP_ACT_ALLOW) and then:
 *  - adds the secondary architectures via seccomp_add_secondary_archs();
 *  - makes each syscall in the blacklist table (kexec_load,
 *    open_by_handle_at, and the module loading/unloading calls) fail with
 *    EPERM;
 *  - makes creation of AF_NETLINK sockets of protocol NETLINK_AUDIT fail with
 *    EAFNOSUPPORT, so that audit userspace sees audit as disabled;
 *  - sets the SCMP_FLTATR_CTL_NNP attribute to 0 before loading, so loading
 *    the filter does not turn on NO_NEW_PRIVS.
 *
 * Failures are logged; the filter context is released with
 * seccomp_release().
 *
 * NOTE(review): the remaining blacklist entries, the error tests and any
 * build-time conditional around this code are not part of this listing. */
2384 static int setup_seccomp(void) {
2387 static const int blacklist[] = {
2388 SCMP_SYS(kexec_load),
2389 SCMP_SYS(open_by_handle_at),
2390 SCMP_SYS(init_module),
2391 SCMP_SYS(finit_module),
2392 SCMP_SYS(delete_module),
2399 scmp_filter_ctx seccomp;
2403 seccomp = seccomp_init(SCMP_ACT_ALLOW);
2407 r = seccomp_add_secondary_archs(seccomp);
2409 log_error_errno(r, "Failed to add secondary archs to seccomp filter: %m");
2413 for (i = 0; i < ELEMENTSOF(blacklist); i++) {
2414 r = seccomp_rule_add(seccomp, SCMP_ACT_ERRNO(EPERM), blacklist[i], 0);
2416 continue; /* unknown syscall */
2418 log_error_errno(r, "Failed to block syscall: %m");
2424 Audit is broken in containers, much of the userspace audit
2425 hookup will fail if running inside a container. We don't
2426 care and just turn off creation of audit sockets.
2428 This will make socket(AF_NETLINK, *, NETLINK_AUDIT) fail
2429 with EAFNOSUPPORT which audit userspace uses as indication
2430 that audit is disabled in the kernel.
2433 r = seccomp_rule_add(
2435 SCMP_ACT_ERRNO(EAFNOSUPPORT),
2438 SCMP_A0(SCMP_CMP_EQ, AF_NETLINK),
2439 SCMP_A2(SCMP_CMP_EQ, NETLINK_AUDIT));
2441 log_error_errno(r, "Failed to add audit seccomp rule: %m");
2445 r = seccomp_attr_set(seccomp, SCMP_FLTATR_CTL_NNP, 0);
2447 log_error_errno(r, "Failed to unset NO_NEW_PRIVS: %m");
2451 r = seccomp_load(seccomp);
2453 log_error_errno(r, "Failed to install seccomp audit filter: %m");
2456 seccomp_release(seccomp);
/* setup_propagate(): prepares the directory through which mounts can be
 * propagated from the host into the container.
 *
 * Creates /run/systemd/nspawn/propagate/<machine> on the host (creation
 * results are deliberately ignored), creates the parents of
 * <root>/run/systemd/nspawn/incoming, bind mounts the host directory onto
 * that path and remounts the bind mount read-only.
 *
 * Mount failures are logged together with the errno text (%m) and returned.
 *
 * NOTE(review): creation of the mount point itself and the final return are
 * not part of this listing. */
2464 static int setup_propagate(const char *root) {
2467 (void) mkdir_p("/run/systemd/nspawn/", 0755);
2468 (void) mkdir_p("/run/systemd/nspawn/propagate", 0600);
2469 p = strappenda("/run/systemd/nspawn/propagate/", arg_machine);
2470 (void) mkdir_p(p, 0600);
2472 q = strappenda(root, "/run/systemd/nspawn/incoming");
2473 mkdir_parents(q, 0755);
2476 if (mount(p, q, NULL, MS_BIND, NULL) < 0)
2477 return log_error_errno(errno, "Failed to install propagation bind mount: %m");
2479 if (mount(NULL, q, NULL, MS_BIND|MS_REMOUNT|MS_RDONLY, NULL) < 0)
2480 return log_error_errno(errno, "Failed to make propagation mount read-only: %m");
/* setup_image(): makes the disk image arg_image available as a block device.
 *
 * Opens arg_image (read-only when arg_read_only is set). If it already is a
 * block device, a copy of its path is used. Otherwise it must be a regular
 * file: a free loop device is requested from /dev/loop-control, opened, bound
 * to the image with LOOP_SET_FD and configured with LO_FLAGS_AUTOCLEAR and
 * LO_FLAGS_PARTSCAN (plus LO_FLAGS_READ_ONLY on the read-only path). The
 * device path is returned through *device_path.
 *
 * Failing system calls are logged with errno. The "neither regular file nor
 * block device" case is a plain type check that sets no errno, so it is
 * logged without an errno suffix.
 *
 * NOTE(review): what is stored in *loop_nr and the returned value are not
 * part of this listing. */
2485 static int setup_image(char **device_path, int *loop_nr) {
2486 struct loop_info64 info = {
2487 .lo_flags = LO_FLAGS_AUTOCLEAR|LO_FLAGS_PARTSCAN
2489 _cleanup_close_ int fd = -1, control = -1, loop = -1;
2490 _cleanup_free_ char* loopdev = NULL;
2494 assert(device_path);
2498 fd = open(arg_image, O_CLOEXEC|(arg_read_only ? O_RDONLY : O_RDWR)|O_NONBLOCK|O_NOCTTY);
2500 return log_error_errno(errno, "Failed to open %s: %m", arg_image);
2502 if (fstat(fd, &st) < 0)
2503 return log_error_errno(errno, "Failed to stat %s: %m", arg_image);
2505 if (S_ISBLK(st.st_mode)) {
2508 p = strdup(arg_image);
2522 if (!S_ISREG(st.st_mode)) {
2523 log_error("%s is not a regular file or block device.", arg_image);
2527 control = open("/dev/loop-control", O_RDWR|O_CLOEXEC|O_NOCTTY|O_NONBLOCK);
2529 return log_error_errno(errno, "Failed to open /dev/loop-control: %m");
2531 nr = ioctl(control, LOOP_CTL_GET_FREE);
2533 return log_error_errno(errno, "Failed to allocate loop device: %m");
2535 if (asprintf(&loopdev, "/dev/loop%i", nr) < 0)
2538 loop = open(loopdev, O_CLOEXEC|(arg_read_only ? O_RDONLY : O_RDWR)|O_NONBLOCK|O_NOCTTY);
2540 return log_error_errno(errno, "Failed to open loop device %s: %m", loopdev);
2542 if (ioctl(loop, LOOP_SET_FD, fd) < 0)
2543 return log_error_errno(errno, "Failed to set loopback file descriptor on %s: %m", loopdev);
2546 info.lo_flags |= LO_FLAGS_READ_ONLY;
2548 if (ioctl(loop, LOOP_SET_STATUS64, &info) < 0)
2549 return log_error_errno(errno, "Failed to set loopback settings on %s: %m", loopdev);
2551 *device_path = loopdev;
/* wait_for_block_device(): obtains the udev device for block device devnum
 * once udev has initialized it.
 *
 * The device is looked up with udev_device_new_from_devnum() and tested with
 * udev_device_get_is_initialized(). For the not-yet-initialized case a udev
 * monitor filtered on the "block" subsystem is created and enabled, its file
 * descriptor is polled without timeout, and incoming devices are received
 * from the monitor.
 *
 * Failures are logged and returned. udev_monitor_get_fd() reports failure
 * through its (negative) return value, so that value is what gets logged and
 * returned.
 *
 * NOTE(review): the loop structure, the comparison of received devices and
 * the hand-over through *ret are not part of this listing. */
2562 static int wait_for_block_device(struct udev *udev, dev_t devnum, struct udev_device **ret) {
2563 _cleanup_udev_monitor_unref_ struct udev_monitor *monitor = NULL;
2570 _cleanup_udev_device_unref_ struct udev_device *d = NULL;
2571 struct pollfd pfd = {
2575 d = udev_device_new_from_devnum(udev, 'b', devnum);
2579 r = udev_device_get_is_initialized(d);
2581 return log_error_errno(r, "Failed to check if device is initialized: %m");
2587 d = udev_device_unref(d);
2590 monitor = udev_monitor_new_from_netlink(udev, "udev");
2594 r = udev_monitor_filter_add_match_subsystem_devtype(monitor, "block", NULL);
2596 return log_error_errno(r, "Failed to add block match: %m");
2598 r = udev_monitor_enable_receiving(monitor);
2600 return log_error_errno(r, "Failed to turn on monitor: %m");
2605 pfd.fd = udev_monitor_get_fd(monitor);
/* r is stale here (last set by udev_monitor_enable_receiving()); the error is in pfd.fd */
2607 return log_error_errno(pfd.fd, "Failed to get udev monitor fd: %m");
2609 r = poll(&pfd, 1, -1);
2611 return log_error_errno(errno, "Failed to wait for device initialization: %m");
2613 d = udev_monitor_receive_device(monitor);
/* Explanatory text included in the error messages of dissect_image(); it
 * states which partition layouts the image needs in order to be bootable
 * with systemd-nspawn. */
2619 #define PARTITION_TABLE_BLURB \
2620 "Note that the disk image needs to either contain only a single MBR partition of\n" \
2621 "type 0x83 that is marked bootable, or follow\n" \
2622 " http://www.freedesktop.org/wiki/Specifications/DiscoverablePartitionsSpec/\n" \
2623 "to be bootable with systemd-nspawn."
/* dissect_image(): finds the root, /home and /srv partitions of the disk
 * image and returns their device nodes.
 *
 * Procedure, as far as visible here:
 *  - probe the image descriptor fd with libblkid and require a partition
 *    table of type "gpt" or "dos" (MBR);
 *  - wait for the block device via wait_for_block_device() and enumerate its
 *    child devices through udev;
 *  - for every child that maps to a partition in the blkid partition list:
 *      GPT: partitions with GPT_FLAG_NO_AUTO are passed over; the type UUID
 *           selects GPT_HOME, GPT_SRV and, where defined at build time,
 *           GPT_ROOT_NATIVE and GPT_ROOT_SECONDARY. The partition number is
 *           compared with the one already recorded for that type, and the
 *           read-write state comes from GPT_FLAG_READ_ONLY.
 *      MBR: only partitions whose flags equal 0x80 (bootable) and whose type
 *           is 0x83 (Linux) qualify, and more than one such partition is an
 *           error.
 *  - fail when neither a root nor a secondary root partition was found;
 *    otherwise hand out root (or the secondary root as fallback), home and
 *    srv through the *_device and *_device_rw output parameters.
 *
 * The final error message indicates that a build without blkid support
 * refuses --image= altogether.
 *
 * NOTE(review): the first parameter line, the preprocessor conditionals'
 * closing lines and many short statements are not part of this listing; the
 * "lowest partition number wins" reading of the nr comparisons should be
 * confirmed against the full source. */
2625 static int dissect_image(
2627 char **root_device, bool *root_device_rw,
2628 char **home_device, bool *home_device_rw,
2629 char **srv_device, bool *srv_device_rw,
2633 int home_nr = -1, srv_nr = -1;
2634 #ifdef GPT_ROOT_NATIVE
2637 #ifdef GPT_ROOT_SECONDARY
2638 int secondary_root_nr = -1;
2641 _cleanup_free_ char *home = NULL, *root = NULL, *secondary_root = NULL, *srv = NULL;
2642 _cleanup_udev_enumerate_unref_ struct udev_enumerate *e = NULL;
2643 _cleanup_udev_device_unref_ struct udev_device *d = NULL;
2644 _cleanup_blkid_free_probe_ blkid_probe b = NULL;
2645 _cleanup_udev_unref_ struct udev *udev = NULL;
2646 struct udev_list_entry *first, *item;
2647 bool home_rw = true, root_rw = true, secondary_root_rw = true, srv_rw = true;
2648 const char *pttype = NULL;
2652 bool is_gpt, is_mbr;
2655 assert(root_device);
2656 assert(home_device);
2661 b = blkid_new_probe();
2666 r = blkid_probe_set_device(b, fd, 0, 0);
2671 log_error_errno(errno, "Failed to set device on blkid probe: %m");
2675 blkid_probe_enable_partitions(b, 1);
2676 blkid_probe_set_partitions_flags(b, BLKID_PARTS_ENTRY_DETAILS);
2679 r = blkid_do_safeprobe(b);
2680 if (r == -2 || r == 1) {
2681 log_error("Failed to identify any partition table on\n"
2683 PARTITION_TABLE_BLURB, arg_image);
2685 } else if (r != 0) {
2688 log_error_errno(errno, "Failed to probe: %m");
2692 blkid_probe_lookup_value(b, "PTTYPE", &pttype, NULL);
2694 is_gpt = streq_ptr(pttype, "gpt");
2695 is_mbr = streq_ptr(pttype, "dos");
2697 if (!is_gpt && !is_mbr) {
2698 log_error("No GPT or MBR partition table discovered on\n"
2700 PARTITION_TABLE_BLURB, arg_image);
2705 pl = blkid_probe_get_partitions(b);
2710 log_error("Failed to list partitions of %s", arg_image);
2718 if (fstat(fd, &st) < 0)
2719 return log_error_errno(errno, "Failed to stat block device: %m");
2721 r = wait_for_block_device(udev, st.st_rdev, &d);
2725 e = udev_enumerate_new(udev);
2729 r = udev_enumerate_add_match_parent(e, d);
2733 r = udev_enumerate_scan_devices(e);
2735 return log_error_errno(r, "Failed to scan for partition devices of %s: %m", arg_image);
2737 first = udev_enumerate_get_list_entry(e);
2738 udev_list_entry_foreach(item, first) {
2739 _cleanup_udev_device_unref_ struct udev_device *q;
2741 unsigned long long flags;
2747 q = udev_device_new_from_syspath(udev, udev_list_entry_get_name(item));
2752 log_error_errno(errno, "Failed to get partition device of %s: %m", arg_image);
2756 qn = udev_device_get_devnum(q);
2760 if (st.st_rdev == qn)
2763 node = udev_device_get_devnode(q);
2767 pp = blkid_partlist_devno_to_partition(pl, qn);
2771 flags = blkid_partition_get_flags(pp);
2772 if (is_gpt && (flags & GPT_FLAG_NO_AUTO))
2774 if (is_mbr && (flags != 0x80)) /* Bootable flag */
2777 nr = blkid_partition_get_partno(pp);
2785 stype = blkid_partition_get_type_string(pp);
2789 if (sd_id128_from_string(stype, &type_id) < 0)
2792 if (sd_id128_equal(type_id, GPT_HOME)) {
2794 if (home && nr >= home_nr)
2798 home_rw = !(flags & GPT_FLAG_READ_ONLY);
2800 r = free_and_strdup(&home, node);
2804 } else if (sd_id128_equal(type_id, GPT_SRV)) {
2806 if (srv && nr >= srv_nr)
2810 srv_rw = !(flags & GPT_FLAG_READ_ONLY);
2812 r = free_and_strdup(&srv, node);
2816 #ifdef GPT_ROOT_NATIVE
2817 else if (sd_id128_equal(type_id, GPT_ROOT_NATIVE)) {
2819 if (root && nr >= root_nr)
2823 root_rw = !(flags & GPT_FLAG_READ_ONLY);
2825 r = free_and_strdup(&root, node);
2830 #ifdef GPT_ROOT_SECONDARY
2831 else if (sd_id128_equal(type_id, GPT_ROOT_SECONDARY)) {
2833 if (secondary_root && nr >= secondary_root_nr)
2836 secondary_root_nr = nr;
2837 secondary_root_rw = !(flags & GPT_FLAG_READ_ONLY);
2839 r = free_and_strdup(&secondary_root, node);
2845 } else if (is_mbr) {
2848 type = blkid_partition_get_type(pp);
2849 if (type != 0x83) /* Linux partition */
2852 /* Note that there's a certain, intended
2853 * asymmetry here: while for GPT we simply
2854 * take the first valid partition and ignore
2855 * all others of the same type, for MBR we
2856 * fail if there are multiple suitable
2857 * partitions. This is because the GPT
2858 * partition types are defined by us, and
2859 * hence we can define their lookup semantics,
2860 * while for the MBR logic we reuse existing
2861 * definitions, and simply don't want to make
2862 * out the situation. */
2865 log_error("Identified multiple bootable Linux 0x83 partitions on\n"
2867 PARTITION_TABLE_BLURB, arg_image);
2873 r = free_and_strdup(&root, node);
2879 if (!root && !secondary_root) {
2880 log_error("Failed to identify root partition in disk image\n"
2882 PARTITION_TABLE_BLURB, arg_image);
2887 *root_device = root;
2890 *root_device_rw = root_rw;
2892 } else if (secondary_root) {
2893 *root_device = secondary_root;
2894 secondary_root = NULL;
2896 *root_device_rw = secondary_root_rw;
2901 *home_device = home;
2904 *home_device_rw = home_rw;
2911 *srv_device_rw = srv_rw;
2916 log_error("--image= is not supported, compiled without blkid support.");
/* mount_device(): mounts the block device what at where + directory,
 * detecting the file system type with libblkid.
 *
 * what      - device node to mount
 * where     - mount root the target path is built from
 * directory - path below where (appended to it to form the mount point)
 * rw        - mount read-write when true, MS_RDONLY otherwise
 *
 * The superblock prober is enabled with BLKID_SUBLKS_TYPE and run through
 * blkid_do_safeprobe(). That function returns 1 when nothing was detected
 * and -2 when the result is ambivalent; both mean "cannot determine the file
 * system type". Any other non-zero value (-1, a real probing error) is
 * reported with errno. This matches the check dissect_image() applies to the
 * same call. LUKS containers ("crypto_LUKS") are rejected. The mount always
 * uses MS_NODEV.
 *
 * The final error message indicates that a build without blkid support
 * refuses --image= altogether.
 *
 * NOTE(review): the handling of a NULL directory and the return statements
 * are not part of this listing. */
2921 static int mount_device(const char *what, const char *where, const char *directory, bool rw) {
2923 _cleanup_blkid_free_probe_ blkid_probe b = NULL;
2924 const char *fstype, *p;
2934 p = strappenda(where, directory);
2939 b = blkid_new_probe_from_filename(what);
2943 log_error_errno(errno, "Failed to allocate prober for %s: %m", what);
2947 blkid_probe_enable_superblocks(b, 1);
2948 blkid_probe_set_superblocks_flags(b, BLKID_SUBLKS_TYPE);
2951 r = blkid_do_safeprobe(b);
2952 if (r == -2 || r == 1) {
2953 log_error("Cannot determine file system type of %s", what);
2955 } else if (r != 0) {
2958 log_error_errno(errno, "Failed to probe %s: %m", what);
2963 if (blkid_probe_lookup_value(b, "TYPE", &fstype, NULL) < 0) {
2966 log_error("Failed to determine file system type of %s", what);
2970 if (streq(fstype, "crypto_LUKS")) {
2971 log_error("nspawn currently does not support LUKS disk images.");
2975 if (mount(what, p, fstype, MS_NODEV|(rw ? 0 : MS_RDONLY), NULL) < 0)
2976 return log_error_errno(errno, "Failed to mount %s: %m", what);
2980 log_error("--image= is not supported, compiled without blkid support.");
/* mount_devices(): mounts the partitions found by dissect_image() below
 * arg_directory using mount_device(): the root device at the top, the home
 * device at /home and the srv device at /srv, each with its own read-write
 * flag. A failing mount is logged and returned.
 *
 * NOTE(review): the first parameter line and the conditions deciding whether
 * each device is mounted at all are not part of this listing; presumably a
 * NULL device is skipped -- confirm. */
2985 static int mount_devices(
2987 const char *root_device, bool root_device_rw,
2988 const char *home_device, bool home_device_rw,
2989 const char *srv_device, bool srv_device_rw) {
2995 r = mount_device(root_device, arg_directory, NULL, root_device_rw);
2997 return log_error_errno(r, "Failed to mount root directory: %m");
3001 r = mount_device(home_device, arg_directory, "/home", home_device_rw);
3003 return log_error_errno(r, "Failed to mount home directory: %m");
3007 r = mount_device(srv_device, arg_directory, "/srv", srv_device_rw);
3009 return log_error_errno(r, "Failed to mount server data directory: %m");
/* loop_remove(): releases loop device number nr.
 *
 * If image_fd points to an open descriptor, the LOOP_CLR_FD ioctl is issued
 * on it and the descriptor is closed, with *image_fd updated to the result
 * of safe_close(). Then /dev/loop-control is opened and LOOP_CTL_REMOVE is
 * issued for nr. This is best-effort cleanup: failures are only logged (at
 * debug or warning level) and nothing is returned.
 *
 * NOTE(review): an early return for an invalid nr, if any, is not part of
 * this listing. */
3015 static void loop_remove(int nr, int *image_fd) {
3016 _cleanup_close_ int control = -1;
3022 if (image_fd && *image_fd >= 0) {
3023 r = ioctl(*image_fd, LOOP_CLR_FD);
3025 log_debug_errno(errno, "Failed to close loop image: %m");
3026 *image_fd = safe_close(*image_fd);
3029 control = open("/dev/loop-control", O_RDWR|O_CLOEXEC|O_NOCTTY|O_NONBLOCK);
3031 log_warning_errno(errno, "Failed to open /dev/loop-control: %m");
3035 r = ioctl(control, LOOP_CTL_REMOVE, nr);
3037 log_debug_errno(errno, "Failed to remove loop %d: %m", nr);
/* Runs "getent <database> <key>" in a forked child whose standard output is
 * connected to the write end of a freshly created pipe.
 *
 * database - getent database name (the callers in this file pass "passwd"
 *            and "initgroups").
 * key      - key to look up in that database.
 * rpid     - output parameter for the child; NOTE(review): the assignment is
 *            not visible in this excerpt, presumably it receives the child's
 *            PID — confirm.
 *
 * Failures of pipe2() and fork() are logged and returned via
 * log_error_errno(). NOTE(review): the success return is not visible here;
 * the callers treat the result as a readable file descriptor (they pass it
 * to fdopen()), so it is presumably the read end of the pipe — confirm. */
3040 static int spawn_getent(const char *database, const char *key, pid_t *rpid) {
3048 if (pipe2(pipe_fds, O_CLOEXEC) < 0)
3049 return log_error_errno(errno, "Failed to allocate pipe: %m");
3053 return log_error_errno(errno, "Failed to fork getent child: %m");
3054 else if (pid == 0) {
/* Child. A NULL-only environment vector is handed to execle() below. */
3056 char *empty_env = NULL;
/* Make the pipe's write end our stdout; flags 0 means no O_CLOEXEC on it. */
3058 if (dup3(pipe_fds[1], STDOUT_FILENO, 0) < 0)
3059 _exit(EXIT_FAILURE);
/* Close the original pipe descriptors unless they are one of fds 0-2. */
3061 if (pipe_fds[0] > 2)
3062 safe_close(pipe_fds[0]);
3063 if (pipe_fds[1] > 2)
3064 safe_close(pipe_fds[1]);
/* stdin and stderr are both pointed at /dev/null. */
3066 nullfd = open("/dev/null", O_RDWR);
3068 _exit(EXIT_FAILURE);
3070 if (dup3(nullfd, STDIN_FILENO, 0) < 0)
3071 _exit(EXIT_FAILURE);
3073 if (dup3(nullfd, STDERR_FILENO, 0) < 0)
3074 _exit(EXIT_FAILURE);
3079 reset_all_signal_handlers();
3080 close_all_fds(NULL, 0);
/* Try both common locations of getent; reaching _exit() means both
 * execle() calls failed. */
3082 execle("/usr/bin/getent", "getent", database, key, NULL, &empty_env);
3083 execle("/bin/getent", "getent", database, key, NULL, &empty_env);
3084 _exit(EXIT_FAILURE);
/* Parent: close the write end of the pipe. */
3087 pipe_fds[1] = safe_close(pipe_fds[1]);
/* Switches the calling process to the identity selected by arg_user.
 *
 * If arg_user is unset, "root" or "0", supplementary groups, GID and UID are
 * all reset to 0. Otherwise the user is resolved by running "getent passwd"
 * and "getent initgroups" through spawn_getent(): the passwd line is split at
 * its ':' separators to obtain the UID, GID and home directory, and the
 * initgroups line yields the supplementary group list. The home directory is
 * created if needed, ownership of stdin/stdout/stderr is handed to the user
 * (best effort), and finally setgroups(), setresgid() and setresuid() are
 * applied in that order.
 *
 * _home - output parameter; NOTE(review): the assignment is not visible in
 *         this excerpt, presumably it receives the resolved home directory
 *         (main() falls back to "/root" when it stays NULL) — confirm.
 *
 * Errors are logged; the visible error paths return the result of
 * log_error_errno().
 *
 * Changes compared to the previous revision: the error messages now name the
 * system calls that are actually made (setresgid()/setresuid() instead of
 * setregid()/setreuid()), and the intentionally ignored fchown() results are
 * marked with (void). */
3094 static int change_uid_gid(char **_home) {
3095 char line[LINE_MAX], *x, *u, *g, *h;
3096 const char *word, *state;
3097 _cleanup_free_ uid_t *uids = NULL;
3098 _cleanup_free_ char *home = NULL;
3099 _cleanup_fclose_ FILE *f = NULL;
3100 _cleanup_close_ int fd = -1;
3101 unsigned n_uids = 0;
3110 if (!arg_user || streq(arg_user, "root") || streq(arg_user, "0")) {
3111 /* Reset everything fully to 0, just in case */
3113 if (setgroups(0, NULL) < 0)
3114 return log_error_errno(errno, "setgroups() failed: %m");
3116 if (setresgid(0, 0, 0) < 0)
3117 return log_error_errno(errno, "setresgid() failed: %m");
3119 if (setresuid(0, 0, 0) < 0)
3120 return log_error_errno(errno, "setresuid() failed: %m");
3126 /* First, get user credentials */
3127 fd = spawn_getent("passwd", arg_user, &pid);
3131 f = fdopen(fd, "r");
3136 if (!fgets(line, sizeof(line), f)) {
3139 log_error("Failed to resolve user %s.", arg_user);
3143 log_error_errno(errno, "Failed to read from getent: %m");
3149 wait_for_terminate_and_warn("getent passwd", pid, true);
/* Walk the ':'-separated passwd fields one by one; each failed lookup of a
 * separator is reported with the name of the field that should end there. */
3151 x = strchr(line, ':');
3153 log_error("/etc/passwd entry has invalid user field.");
3157 u = strchr(x+1, ':');
3159 log_error("/etc/passwd entry has invalid password field.");
3166 log_error("/etc/passwd entry has invalid UID field.");
3174 log_error("/etc/passwd entry has invalid GID field.");
3179 h = strchr(x+1, ':');
3181 log_error("/etc/passwd entry has invalid GECOS field.");
3188 log_error("/etc/passwd entry has invalid home directory field.");
3194 r = parse_uid(u, &uid);
3196 log_error("Failed to parse UID of user.");
3200 r = parse_gid(g, &gid);
3202 log_error("Failed to parse GID of user.");
3210 /* Second, get group memberships */
3211 fd = spawn_getent("initgroups", arg_user, &pid);
3216 f = fdopen(fd, "r");
3221 if (!fgets(line, sizeof(line), f)) {
3223 log_error("Failed to resolve user %s.", arg_user);
3227 log_error_errno(errno, "Failed to read from getent: %m");
3233 wait_for_terminate_and_warn("getent initgroups", pid, true);
3235 /* Skip over the username and subsequent separator whitespace */
3237 x += strcspn(x, WHITESPACE);
3238 x += strspn(x, WHITESPACE);
/* Every remaining word is one numeric group ID; the array grows on demand. */
3240 FOREACH_WORD(word, l, x, state) {
3246 if (!GREEDY_REALLOC(uids, sz, n_uids+1))
3249 r = parse_uid(c, &uids[n_uids++]);
3251 log_error("Failed to parse group data from getent.");
3256 r = mkdir_parents(home, 0775);
3258 return log_error_errno(r, "Failed to make home root directory: %m");
/* An already existing home directory is not treated as an error. */
3260 r = mkdir_safe(home, 0755, uid, gid);
3261 if (r < 0 && r != -EEXIST)
3262 return log_error_errno(r, "Failed to make home directory: %m");
/* Best effort only: hand the standard streams to the target user. The
 * results are deliberately ignored. */
3264 (void) fchown(STDIN_FILENO, uid, gid);
3265 (void) fchown(STDOUT_FILENO, uid, gid);
3266 (void) fchown(STDERR_FILENO, uid, gid);
/* Supplementary groups first, then GID, then UID. */
3268 if (setgroups(n_uids, uids) < 0)
3269 return log_error_errno(errno, "Failed to set auxiliary groups: %m");
3271 if (setresgid(gid, gid, gid) < 0)
3272 return log_error_errno(errno, "setresgid() failed: %m");
3274 if (setresuid(uid, uid, uid) < 0)
3275 return log_error_errno(errno, "setresuid() failed: %m");
3287 * < 0 : wait_for_terminate() failed to get the state of the
3288 * container, the container was terminated by a signal, or
3289 * failed for an unknown reason. No change is made to the
3290 * container argument.
3291 * > 0 : The program executed in the container terminated with an
3292 * error. The exit code of the program executed in the
3293 * container is returned. The container argument has been set
3294 * to CONTAINER_TERMINATED.
3295 * 0 : The container is being rebooted, has been shut down or exited
3296 * successfully. The container argument has been set to either
3297 * CONTAINER_TERMINATED or CONTAINER_REBOOTED.
3299 * That is, success is indicated by a return value of zero, and an
3300 * error is indicated by a non-zero value.
/* Waits for the container process 'pid' to terminate via
 * wait_for_terminate() and classifies the result by status.si_code and
 * status.si_status. The outcome is logged and, on the paths visible here,
 * stored in *container as CONTAINER_TERMINATED or CONTAINER_REBOOTED.
 * Informational messages are logged at LOG_DEBUG instead of LOG_INFO when
 * arg_quiet is set.
 *
 * NOTE(review): the case labels and several return statements are not
 * visible in this excerpt; the comments below describe only what the shown
 * lines establish. */
3302 static int wait_for_container(pid_t pid, ContainerStatus *container) {
3306 r = wait_for_terminate(pid, &status);
3308 return log_warning_errno(r, "Failed to wait for container: %m");
3310 switch (status.si_code) {
/* Exit path: si_status carries the exit code, which is logged and then
 * returned as is, zero or not. */
3313 if (status.si_status == 0) {
3314 log_full(arg_quiet ? LOG_DEBUG : LOG_INFO, "Container %s exited successfully.", arg_machine);
3317 log_full(arg_quiet ? LOG_DEBUG : LOG_INFO, "Container %s failed with error code %i.", arg_machine, status.si_status);
3319 *container = CONTAINER_TERMINATED;
3320 return status.si_status;
/* Signal path: SIGINT is reported as a shutdown and SIGHUP as a reboot
 * request. */
3323 if (status.si_status == SIGINT) {
3325 log_full(arg_quiet ? LOG_DEBUG : LOG_INFO, "Container %s has been shut down.", arg_machine);
3326 *container = CONTAINER_TERMINATED;
3329 } else if (status.si_status == SIGHUP) {
3331 log_full(arg_quiet ? LOG_DEBUG : LOG_INFO, "Container %s is being rebooted.", arg_machine);
3332 *container = CONTAINER_REBOOTED;
3336 /* CLD_KILLED fallthrough */
/* Any other signal is reported as an error. */
3339 log_error("Container %s terminated by signal %s.", arg_machine, signal_to_string(status.si_status));
3343 log_error("Container %s failed due to unknown reason.", arg_machine);
/* Signal handler that intentionally does nothing. main() installs it for
 * SIGCHLD; per the comment there, the purpose is to interrupt the parent's
 * blocking calls when the child dies early. */
3350 static void nop_handler(int sig) {}
/* sd-event signal callback, registered by main() for SIGINT and SIGTERM.
 *
 * userdata carries the container's PID packed with UINT32_TO_PTR(). If
 * sending SIGRTMIN+3 to that PID succeeds, a notice is logged and the
 * source's userdata is cleared so that the PID is no longer available on the
 * next invocation. Otherwise the event loop is asked to exit with code 0.
 *
 * NOTE(review): the guard around the kill() call and the return statements
 * are not visible in this excerpt — confirm against the full file. */
3352 static int on_orderly_shutdown(sd_event_source *s, const struct signalfd_siginfo *si, void *userdata) {
3355 pid = PTR_TO_UINT32(userdata);
/* First attempt: ask the container to halt in an orderly fashion. */
3357 if (kill(pid, SIGRTMIN+3) >= 0) {
3358 log_info("Trying to halt container. Send SIGTERM again to trigger immediate termination.");
3359 sd_event_source_set_userdata(s, NULL);
/* Fallback: leave the event loop. */
3364 sd_event_exit(sd_event_source_get_event(s), 0);
/* Derives whichever of arg_image, arg_directory and arg_machine the command
 * line left unset.
 *
 * When neither an image nor a directory was given, the machine name is
 * looked up with image_find(); a raw image sets arg_image, anything else
 * sets arg_directory, and the image's read-only flag is OR-ed into
 * arg_read_only. Alternatively the current working directory is used.
 *
 * The machine name is taken from the host name when the directory is "/",
 * otherwise from the base name of the image or directory. It is then cleaned
 * with hostname_cleanup() and validated; for ephemeral containers a random
 * 64-bit hex suffix is appended.
 *
 * NOTE(review): several conditions and return statements are not visible in
 * this excerpt; the description above covers only the lines shown.
 *
 * Change compared to the previous revision: the "No image for machine"
 * message no longer ends in ": %m". It follows the branch that already
 * reports a real error with log_error_errno(), so it is presumably the
 * non-error "not found" result, where %m would expand to an unrelated stale
 * errno text. */
3368 static int determine_names(void) {
3371 if (!arg_image && !arg_directory) {
3373 _cleanup_(image_unrefp) Image *i = NULL;
3375 r = image_find(arg_machine, &i);
3377 return log_error_errno(r, "Failed to find image for machine '%s': %m", arg_machine);
3379 log_error("No image for machine '%s'.", arg_machine);
/* Raw disk images go to arg_image, everything else to arg_directory. */
3383 if (i->type == IMAGE_RAW)
3384 r = set_sanitized_path(&arg_image, i->path);
3386 r = set_sanitized_path(&arg_directory, i->path);
3388 return log_error_errno(r, "Invalid image directory: %m");
3390 arg_read_only = arg_read_only || i->read_only;
3392 arg_directory = get_current_dir_name();
3394 if (!arg_directory && !arg_machine) {
3395 log_error("Failed to determine path, please use -D or -i.");
/* Pick a machine name: the host name for "/", else the path's base name. */
3401 if (arg_directory && path_equal(arg_directory, "/"))
3402 arg_machine = gethostname_malloc();
3404 arg_machine = strdup(basename(arg_image ?: arg_directory));
3409 hostname_cleanup(arg_machine, false);
3410 if (!machine_name_is_valid(arg_machine)) {
3411 log_error("Failed to determine machine name automatically, please use -M.");
3415 if (arg_ephemeral) {
3418 /* Add a random suffix when this is an
3419 * ephemeral machine, so that we can run many
3420 * instances at once without manually having
3421 * to specify -M each time. */
3423 if (asprintf(&b, "%s-%016" PRIx64, arg_machine, random_u64()) < 0)
/* Entry point of systemd-nspawn.
 *
 * Visible phases, in order:
 *   1. parse the command line and determine image/directory/machine names;
 *   2. require root and a systemd-booted host, collect passed-in file
 *      descriptors;
 *   3. prepare the container tree: either a directory (optionally an
 *      ephemeral or template-based btrfs snapshot) or a disk image mounted
 *      below a temporary directory;
 *   4. allocate the pseudo terminal and set up the signal mask;
 *   5. clone the container payload into new namespaces; the child sets up
 *      mounts, devices, environment and credentials and executes the init
 *      system, the given command or a shell, while the parent configures
 *      networking, registers the machine and forwards the pty until the
 *      container ends;
 *   6. release the loop device, snapshot and global argument storage.
 *
 * The process exit status is EXIT_FAILURE when r is negative at the end,
 * otherwise the value of ret.
 *
 * NOTE(review): many short lines (error checks, jumps to the cleanup label,
 * closing braces) are not visible in this excerpt; the comments describe
 * only what the shown lines establish.
 *
 * Changes compared to the previous revision: the fdset_cloexec() failure
 * message now includes "%m" so the error passed to log_error_errno() is
 * actually printed, the ignored chdir() result before starting the shell is
 * marked (void), and a typo in the barrier comment is fixed. */
3434 int main(int argc, char *argv[]) {
3436 _cleanup_free_ char *device_path = NULL, *root_device = NULL, *home_device = NULL, *srv_device = NULL, *console = NULL;
3437 bool root_device_rw = true, home_device_rw = true, srv_device_rw = true;
3438 _cleanup_close_ int master = -1, image_fd = -1;
3439 _cleanup_fdset_free_ FDSet *fds = NULL;
3440 int r, n_fd_passed, loop_nr = -1;
3441 char veth_name[IFNAMSIZ];
3442 bool secondary = false, remove_subvol = false;
3443 sigset_t mask, mask_chld;
3445 int ret = EXIT_SUCCESS;
3446 union in_addr_union exposed = {};
3447 _cleanup_release_lock_file_ LockFile tree_global_lock = LOCK_FILE_INIT, tree_local_lock = LOCK_FILE_INIT;
/* Phase 1: logging, command line, names. */
3449 log_parse_environment();
3452 r = parse_argv(argc, argv);
3456 r = determine_names();
/* Phase 2: environment checks. */
3460 if (geteuid() != 0) {
3461 log_error("Need to be root.");
3466 if (sd_booted() <= 0) {
3467 log_error("Not running on a systemd system.");
/* Collect the file descriptors handed to us and close all others. */
3473 n_fd_passed = sd_listen_fds(false);
3474 if (n_fd_passed > 0) {
3475 r = fdset_new_listen_fds(&fds, false);
3477 log_error_errno(r, "Failed to collect file descriptors: %m");
3481 fdset_close_others(fds);
/* Phase 3a: the container lives in a directory tree. */
3484 if (arg_directory) {
3487 if (path_equal(arg_directory, "/") && !arg_ephemeral) {
3488 log_error("Spawning container on root directory is not supported. Consider using --ephemeral.");
3493 if (arg_ephemeral) {
3494 _cleanup_release_lock_file_ LockFile original_lock = LOCK_FILE_INIT;
3497 /* If the specified path is a mount point we
3498 * generate the new snapshot immediately
3499 * inside it under a random name. However if
3500 * the specified is not a mount point we
3501 * create the new snapshot in the parent
3502 * directory, just next to it. */
3503 r = path_is_mount_point(arg_directory, false);
3505 log_error_errno(r, "Failed to determine whether directory %s is mount point: %m", arg_directory);
3509 r = tempfn_random_child(arg_directory, &np);
3511 r = tempfn_random(arg_directory, &np);
3513 log_error_errno(r, "Failed to generate name for snapshot: %m");
/* Shared lock for read-only use, exclusive otherwise; never block. */
3517 r = image_path_lock(np, (arg_read_only ? LOCK_SH : LOCK_EX) | LOCK_NB, &tree_global_lock, &tree_local_lock);
3519 log_error_errno(r, "Failed to lock %s: %m", np);
3523 r = btrfs_subvol_snapshot(arg_directory, np, arg_read_only, true);
3526 log_error_errno(r, "Failed to create snapshot %s from %s: %m", np, arg_directory);
3530 free(arg_directory);
/* Remember to delete the snapshot again during cleanup. */
3533 remove_subvol = true;
3536 r = image_path_lock(arg_directory, (arg_read_only ? LOCK_SH : LOCK_EX) | LOCK_NB, &tree_global_lock, &tree_local_lock);
3538 log_error_errno(r, "Directory tree %s is currently busy.", arg_directory);
3542 log_error_errno(r, "Failed to lock %s: %m", arg_directory);
/* Populate the directory from the template snapshot. */
3547 r = btrfs_subvol_snapshot(arg_template, arg_directory, arg_read_only, true);
3550 log_info("Directory %s already exists, not populating from template %s.", arg_directory, arg_template);
3552 log_error_errno(r, "Couldn't create snapshot %s from %s: %m", arg_directory, arg_template);
3556 log_info("Populated %s from template %s.", arg_directory, arg_template);
/* Sanity checks on the tree before it is used. */
3562 if (path_is_os_tree(arg_directory) <= 0) {
3563 log_error("Directory %s doesn't look like an OS root directory (os-release file is missing). Refusing.", arg_directory);
3570 p = strappenda(arg_directory,
3571 argc > optind && path_is_absolute(argv[optind]) ? argv[optind] : "/usr/bin/");
3572 if (access(p, F_OK) < 0) {
3573 log_error("Directory %s lacks the binary to execute or doesn't look like a binary tree. Refusing.", arg_directory);
/* Phase 3b: the container lives in a disk image; it is mounted below a
 * freshly created temporary directory. */
3580 char template[] = "/tmp/nspawn-root-XXXXXX";
3583 assert(!arg_template);
3585 r = image_path_lock(arg_image, (arg_read_only ? LOCK_SH : LOCK_EX) | LOCK_NB, &tree_global_lock, &tree_local_lock);
3587 r = log_error_errno(r, "Disk image %s is currently busy.", arg_image);
3591 r = log_error_errno(r, "Failed to create image lock: %m");
3595 if (!mkdtemp(template)) {
3596 log_error_errno(errno, "Failed to create temporary directory: %m");
3601 arg_directory = strdup(template);
3602 if (!arg_directory) {
3607 image_fd = setup_image(&device_path, &loop_nr);
3613 r = dissect_image(image_fd,
3614 &root_device, &root_device_rw,
3615 &home_device, &home_device_rw,
3616 &srv_device, &srv_device_rw,
/* Phase 4: allocate the pseudo terminal used as the container console. */
3622 master = posix_openpt(O_RDWR|O_NOCTTY|O_CLOEXEC|O_NDELAY);
3624 r = log_error_errno(errno, "Failed to acquire pseudo tty: %m");
3628 r = ptsname_malloc(master, &console);
3630 r = log_error_errno(r, "Failed to determine tty name: %m");
3635 log_info("Spawning container %s on %s.\nPress ^] three times within 1s to kill container.",
3636 arg_machine, arg_image ?: arg_directory);
3638 if (unlockpt(master) < 0) {
3639 r = log_error_errno(errno, "Failed to unlock tty: %m");
/* Block the signals the event loop handles; mask_chld holds SIGCHLD alone. */
3643 assert_se(sigemptyset(&mask) == 0);
3644 sigset_add_many(&mask, SIGCHLD, SIGWINCH, SIGTERM, SIGINT, -1);
3645 assert_se(sigprocmask(SIG_BLOCK, &mask, NULL) == 0);
3647 assert_se(sigemptyset(&mask_chld) == 0);
3648 assert_se(sigaddset(&mask_chld, SIGCHLD) == 0);
/* Phase 5: state for one container run. */
3651 _cleanup_close_pair_ int kmsg_socket_pair[2] = { -1, -1 }, rtnl_socket_pair[2] = { -1, -1 };
3652 ContainerStatus container_status;
3653 _cleanup_(barrier_destroy) Barrier barrier = BARRIER_NULL;
3654 struct sigaction sa = {
3655 .sa_handler = nop_handler,
3656 .sa_flags = SA_NOCLDSTOP,
3659 r = barrier_create(&barrier);
3661 log_error_errno(r, "Cannot initialize IPC barrier: %m");
3665 if (socketpair(AF_UNIX, SOCK_DGRAM|SOCK_CLOEXEC, 0, kmsg_socket_pair) < 0) {
3666 r = log_error_errno(errno, "Failed to create kmsg socket pair: %m");
3670 if (socketpair(AF_UNIX, SOCK_DGRAM|SOCK_CLOEXEC, 0, rtnl_socket_pair) < 0) {
3671 r = log_error_errno(errno, "Failed to create rtnl socket pair: %m");
3675 /* Child can be killed before execv(), so handle SIGCHLD
3676 * in order to interrupt parent's blocking calls and
3677 * give it a chance to call wait() and terminate. */
3678 r = sigprocmask(SIG_UNBLOCK, &mask_chld, NULL);
3680 r = log_error_errno(errno, "Failed to change the signal mask: %m");
3684 r = sigaction(SIGCHLD, &sa, NULL);
3686 r = log_error_errno(errno, "Failed to install SIGCHLD handler: %m");
/* Always a new mount namespace; IPC/PID/UTS unless the system is shared;
 * network namespace only for private networking. */
3690 pid = raw_clone(SIGCHLD|CLONE_NEWNS|
3691 (arg_share_system ? 0 : CLONE_NEWIPC|CLONE_NEWPID|CLONE_NEWUTS)|
3692 (arg_private_network ? CLONE_NEWNET : 0), NULL);
3694 if (errno == EINVAL)
3695 r = log_error_errno(errno, "clone() failed, do you have namespace support enabled in your kernel? (You need UTS, IPC, PID and NET namespacing built in): %m");
3697 r = log_error_errno(errno, "clone() failed: %m");
/* Child side: every failure from here on ends in _exit(EXIT_FAILURE). */
3704 _cleanup_free_ char *home = NULL;
3706 const char *envp[] = {
3707 "PATH=" DEFAULT_PATH_SPLIT_USR,
3708 "container=systemd-nspawn", /* LXC sets container=lxc, so follow the scheme here */
3713 NULL, /* container_uuid */
3714 NULL, /* LISTEN_FDS */
3715 NULL, /* LISTEN_PID */
3720 barrier_set_role(&barrier, BARRIER_CHILD);
3722 envp[n_env] = strv_find_prefix(environ, "TERM=");
/* Drop the pty master and the inherited standard streams; the console is
 * reopened as fd 0 below and duplicated onto fds 1 and 2. */
3726 master = safe_close(master);
3728 close_nointr(STDIN_FILENO);
3729 close_nointr(STDOUT_FILENO);
3730 close_nointr(STDERR_FILENO);
3732 kmsg_socket_pair[0] = safe_close(kmsg_socket_pair[0]);
3733 rtnl_socket_pair[0] = safe_close(rtnl_socket_pair[0]);
3735 reset_all_signal_handlers();
3736 reset_signal_mask();
3738 r = open_terminal(console, O_RDWR);
3739 if (r != STDIN_FILENO) {
3745 log_error_errno(r, "Failed to open console: %m");
3746 _exit(EXIT_FAILURE);
3749 if (dup2(STDIN_FILENO, STDOUT_FILENO) != STDOUT_FILENO ||
3750 dup2(STDIN_FILENO, STDERR_FILENO) != STDERR_FILENO) {
3751 log_error_errno(errno, "Failed to duplicate console: %m");
3752 _exit(EXIT_FAILURE);
3756 log_error_errno(errno, "setsid() failed: %m");
3757 _exit(EXIT_FAILURE);
3760 if (reset_audit_loginuid() < 0)
3761 _exit(EXIT_FAILURE);
/* Have the kernel send us SIGKILL when the parent dies. */
3763 if (prctl(PR_SET_PDEATHSIG, SIGKILL) < 0) {
3764 log_error_errno(errno, "PR_SET_PDEATHSIG failed: %m");
3765 _exit(EXIT_FAILURE);
3768 /* Mark everything as slave, so that we still
3769 * receive mounts from the real root, but don't
3770 * propagate mounts to the real root. */
3771 if (mount(NULL, "/", NULL, MS_SLAVE|MS_REC, NULL) < 0) {
3772 log_error_errno(errno, "MS_SLAVE|MS_REC failed: %m");
3773 _exit(EXIT_FAILURE);
3776 if (mount_devices(arg_directory,
3777 root_device, root_device_rw,
3778 home_device, home_device_rw,
3779 srv_device, srv_device_rw) < 0)
3780 _exit(EXIT_FAILURE);
3782 /* Turn directory into bind mount */
3783 if (mount(arg_directory, arg_directory, "bind", MS_BIND|MS_REC, NULL) < 0) {
3784 log_error_errno(errno, "Failed to make bind mount: %m");
3785 _exit(EXIT_FAILURE);
3788 r = setup_volatile(arg_directory);
3790 _exit(EXIT_FAILURE);
3792 if (setup_volatile_state(arg_directory) < 0)
3793 _exit(EXIT_FAILURE);
3795 r = base_filesystem_create(arg_directory);
3797 _exit(EXIT_FAILURE);
3799 if (arg_read_only) {
3800 r = bind_remount_recursive(arg_directory, true);
3802 log_error_errno(r, "Failed to make tree read-only: %m");
3803 _exit(EXIT_FAILURE);
/* Populate the tree with API mounts, device nodes and the console. */
3807 if (mount_all(arg_directory) < 0)
3808 _exit(EXIT_FAILURE);
3810 if (copy_devnodes(arg_directory) < 0)
3811 _exit(EXIT_FAILURE);
3813 if (setup_ptmx(arg_directory) < 0)
3814 _exit(EXIT_FAILURE);
3816 dev_setup(arg_directory);
3818 if (setup_propagate(arg_directory) < 0)
3819 _exit(EXIT_FAILURE);
3821 if (setup_seccomp() < 0)
3822 _exit(EXIT_FAILURE);
3824 if (setup_dev_console(arg_directory, console) < 0)
3825 _exit(EXIT_FAILURE);
3827 if (setup_kmsg(arg_directory, kmsg_socket_pair[1]) < 0)
3828 _exit(EXIT_FAILURE);
3829 kmsg_socket_pair[1] = safe_close(kmsg_socket_pair[1]);
3831 if (send_rtnl(rtnl_socket_pair[1]) < 0)
3832 _exit(EXIT_FAILURE);
3833 rtnl_socket_pair[1] = safe_close(rtnl_socket_pair[1]);
3835 /* Tell the parent that we are ready, and that
3836 * it can cgroupify us so that we lack access
3837 * to certain devices and resources. */
3838 (void) barrier_place(&barrier);
3840 if (setup_boot_id(arg_directory) < 0)
3841 _exit(EXIT_FAILURE);
3843 if (setup_timezone(arg_directory) < 0)
3844 _exit(EXIT_FAILURE);
3846 if (setup_resolv_conf(arg_directory) < 0)
3847 _exit(EXIT_FAILURE);
3849 if (setup_journal(arg_directory) < 0)
3850 _exit(EXIT_FAILURE);
/* Writable bind mounts first, then the read-only ones. */
3852 if (mount_binds(arg_directory, arg_bind, false) < 0)
3853 _exit(EXIT_FAILURE);
3855 if (mount_binds(arg_directory, arg_bind_ro, true) < 0)
3856 _exit(EXIT_FAILURE);
3858 if (mount_tmpfs(arg_directory) < 0)
3859 _exit(EXIT_FAILURE);
3861 /* Wait until we are cgroup-ified, so that we
3862 * can mount the right cgroup path writable */
3863 (void) barrier_sync_next(&barrier);
3865 if (mount_cgroup(arg_directory) < 0)
3866 _exit(EXIT_FAILURE);
/* Switch the root: move the tree onto "/", chroot into it and re-enter "/". */
3868 if (chdir(arg_directory) < 0) {
3869 log_error_errno(errno, "chdir(%s) failed: %m", arg_directory);
3870 _exit(EXIT_FAILURE);
3873 if (mount(arg_directory, "/", NULL, MS_MOVE, NULL) < 0) {
3874 log_error_errno(errno, "mount(MS_MOVE) failed: %m");
3875 _exit(EXIT_FAILURE);
3878 if (chroot(".") < 0) {
3879 log_error_errno(errno, "chroot() failed: %m");
3880 _exit(EXIT_FAILURE);
3883 if (chdir("/") < 0) {
3884 log_error_errno(errno, "chdir() failed: %m");
3885 _exit(EXIT_FAILURE);
3890 if (arg_private_network)
3893 if (drop_capabilities() < 0) {
3894 log_error_errno(errno, "drop_capabilities() failed: %m");
3895 _exit(EXIT_FAILURE);
3898 r = change_uid_gid(&home);
3900 _exit(EXIT_FAILURE);
/* Fill the reserved envp slots; "/root" and "root" are the fallbacks. */
3902 if ((asprintf((char**)(envp + n_env++), "HOME=%s", home ? home: "/root") < 0) ||
3903 (asprintf((char**)(envp + n_env++), "USER=%s", arg_user ? arg_user : "root") < 0) ||
3904 (asprintf((char**)(envp + n_env++), "LOGNAME=%s", arg_user ? arg_user : "root") < 0)) {
3906 _exit(EXIT_FAILURE);
3909 if (!sd_id128_equal(arg_uuid, SD_ID128_NULL)) {
3912 if (asprintf((char**)(envp + n_env++), "container_uuid=%s", id128_format_as_uuid(arg_uuid, as_uuid)) < 0) {
3914 _exit(EXIT_FAILURE);
/* Passed-in descriptors must survive exec, so O_CLOEXEC is cleared and
 * LISTEN_FDS/LISTEN_PID are set for the payload. */
3918 if (fdset_size(fds) > 0) {
3919 r = fdset_cloexec(fds, false);
3921 log_error_errno(r, "Failed to unset O_CLOEXEC for file descriptors: %m");
3922 _exit(EXIT_FAILURE);
3925 if ((asprintf((char **)(envp + n_env++), "LISTEN_FDS=%u", n_fd_passed) < 0) ||
3926 (asprintf((char **)(envp + n_env++), "LISTEN_PID=1") < 0)) {
3928 _exit(EXIT_FAILURE);
/* 0xffffffff is the value of arg_personality for which no explicit
 * personality() call is made. */
3934 if (arg_personality != 0xffffffffLU) {
3935 if (personality(arg_personality) < 0) {
3936 log_error_errno(errno, "personality() failed: %m");
3937 _exit(EXIT_FAILURE);
3939 } else if (secondary) {
3940 if (personality(PER_LINUX32) < 0) {
3941 log_error_errno(errno, "personality() failed: %m");
3942 _exit(EXIT_FAILURE);
3947 if (arg_selinux_context)
3948 if (setexeccon((security_context_t) arg_selinux_context) < 0) {
3949 log_error_errno(errno, "setexeccon(\"%s\") failed: %m", arg_selinux_context);
3950 _exit(EXIT_FAILURE);
3954 if (!strv_isempty(arg_setenv)) {
3957 n = strv_env_merge(2, envp, arg_setenv);
3960 _exit(EXIT_FAILURE);
3965 env_use = (char**) envp;
3967 /* Wait until the parent is ready with the setup, too... */
3968 if (!barrier_place_and_sync(&barrier))
3969 _exit(EXIT_FAILURE);
3975 /* Automatically search for the init system */
3977 l = 1 + argc - optind;
3978 a = newa(char*, l + 1);
3979 memcpy(a + 1, argv + optind, l * sizeof(char*));
3981 a[0] = (char*) "/usr/lib/systemd/systemd";
3982 execve(a[0], a, env_use);
3984 a[0] = (char*) "/lib/systemd/systemd";
3985 execve(a[0], a, env_use);
3987 a[0] = (char*) "/sbin/init";
3988 execve(a[0], a, env_use);
3989 } else if (argc > optind)
3990 execvpe(argv[optind], argv + optind, env_use);
/* Best effort: start the shell in the home directory; the shell is run
 * regardless of whether the chdir() succeeded. */
3992 (void) chdir(home ? home : "/root");
3993 execle("/bin/bash", "-bash", NULL, env_use);
3994 execle("/bin/sh", "-sh", NULL, env_use);
3997 log_error_errno(errno, "execv() failed: %m");
3998 _exit(EXIT_FAILURE);
/* Parent side. */
4001 barrier_set_role(&barrier, BARRIER_PARENT);
4005 kmsg_socket_pair[1] = safe_close(kmsg_socket_pair[1]);
4006 rtnl_socket_pair[1] = safe_close(rtnl_socket_pair[1]);
4008 /* Wait for the most basic Child-setup to be done,
4009 * before we add hardware to it, and place it in a
4011 if (barrier_sync_next(&barrier)) {
4014 r = move_network_interfaces(pid);
4018 r = setup_veth(pid, veth_name, &ifi);
4022 r = setup_bridge(veth_name, &ifi);
4026 r = setup_macvlan(pid);
4030 r = register_machine(pid, ifi);
4034 /* Block SIGCHLD here, before notifying child.
4035 * process_pty() will handle it with the other signals. */
4036 r = sigprocmask(SIG_BLOCK, &mask_chld, NULL);
4040 /* Reset signal to default */
4041 r = default_signals(SIGCHLD, -1);
4045 /* Notify the child that the parent is ready with all
4046 * its setup, and that the child can now hand over
4047 * control to the code to run inside the container. */
4048 (void) barrier_place(&barrier);
4050 /* And wait that the child is completely ready now. */
4051 if (barrier_place_and_sync(&barrier)) {
4052 _cleanup_event_unref_ sd_event *event = NULL;
4053 _cleanup_(pty_forward_freep) PTYForward *forward = NULL;
4054 _cleanup_rtnl_unref_ sd_rtnl *rtnl = NULL;
4059 "STATUS=Container running.\n"
4060 "X_NSPAWN_LEADER_PID=" PID_FMT, pid);
/* Run an event loop that forwards the pty and reacts to signals. */
4062 r = sd_event_new(&event);
4064 log_error_errno(r, "Failed to get default event source: %m");
4069 /* Try to kill the init system on SIGINT or SIGTERM */
4070 sd_event_add_signal(event, NULL, SIGINT, on_orderly_shutdown, UINT32_TO_PTR(pid));
4071 sd_event_add_signal(event, NULL, SIGTERM, on_orderly_shutdown, UINT32_TO_PTR(pid));
4073 /* Immediately exit */
4074 sd_event_add_signal(event, NULL, SIGINT, NULL, NULL);
4075 sd_event_add_signal(event, NULL, SIGTERM, NULL, NULL);
4078 /* simply exit on sigchld */
4079 sd_event_add_signal(event, NULL, SIGCHLD, NULL, NULL);
4081 if (arg_expose_ports) {
4082 r = watch_rtnl(event, rtnl_socket_pair[0], &exposed, &rtnl);
4086 (void) expose_ports(rtnl, &exposed);
4089 rtnl_socket_pair[0] = safe_close(rtnl_socket_pair[0]);
4091 r = pty_forward_new(event, master, true, &forward);
4093 log_error_errno(r, "Failed to create PTY forwarder: %m");
4097 r = sd_event_loop(event);
4099 log_error_errno(r, "Failed to run event loop: %m");
4103 pty_forward_get_last_char(forward, &last_char);
4105 forward = pty_forward_free(forward);
4107 if (!arg_quiet && last_char != '\n')
4110 /* Kill if it is not dead yet anyway */
4111 terminate_machine(pid);
4115 /* Normally redundant, but better safe than sorry */
/* Collect the container's exit state and decide between exit and reboot. */
4118 r = wait_for_container(pid, &container_status);
4122 /* We failed to wait for the container, or the
4123 * container exited abnormally */
4125 else if (r > 0 || container_status == CONTAINER_TERMINATED){
4126 /* The container exited with a non-zero
4127 * status, or with zero status and no reboot
4133 /* CONTAINER_REBOOTED, loop again */
4135 if (arg_keep_unit) {
4136 /* Special handling if we are running as a
4137 * service: instead of simply restarting the
4138 * machine we want to restart the entire
4139 * service, so let's inform systemd about this
4140 * with the special exit code 133. The service
4141 * file uses RestartForceExitStatus=133 so
4142 * that this results in a full nspawn
4143 * restart. This is necessary since we might
4144 * have cgroup parameters set we want to have
4151 flush_ports(&exposed);
4157 "STATUS=Terminating...");
4159 loop_remove(loop_nr, &image_fd);
4164 if (remove_subvol && arg_directory) {
4167 k = btrfs_subvol_remove(arg_directory);
4169 log_warning_errno(k, "Cannot remove subvolume '%s', ignoring: %m", arg_directory);
4175 p = strappenda("/run/systemd/nspawn/propagate/", arg_machine);
4176 (void) rm_rf(p, false, true, false);
4179 free(arg_directory);
4184 strv_free(arg_setenv);
4185 strv_free(arg_network_interfaces);
4186 strv_free(arg_network_macvlan);
4187 strv_free(arg_bind);
4188 strv_free(arg_bind_ro);
4189 strv_free(arg_tmpfs);
4191 flush_ports(&exposed);
4193 while (arg_expose_ports) {
4194 ExposePort *p = arg_expose_ports;
4195 LIST_REMOVE(ports, arg_expose_ports, p);
4199 return r < 0 ? EXIT_FAILURE : ret;