1 /*-*- Mode: C; c-basic-offset: 8; indent-tabs-mode: nil -*-*/
4 This file is part of systemd.
6 Copyright 2010 Lennart Poettering
8 systemd is free software; you can redistribute it and/or modify it
9 under the terms of the GNU Lesser General Public License as published by
10 the Free Software Foundation; either version 2.1 of the License, or
11 (at your option) any later version.
13 systemd is distributed in the hope that it will be useful, but
14 WITHOUT ANY WARRANTY; without even the implied warranty of
15 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
16 Lesser General Public License for more details.
18 You should have received a copy of the GNU Lesser General Public License
19 along with systemd; If not, see <http://www.gnu.org/licenses/>.
25 #include <sys/types.h>
26 #include <sys/syscall.h>
27 #include <sys/mount.h>
33 #include <sys/prctl.h>
36 #include <sys/signalfd.h>
40 #include <sys/socket.h>
41 #include <linux/netlink.h>
43 #include <linux/veth.h>
44 #include <sys/personality.h>
45 #include <linux/loop.h>
48 #include <selinux/selinux.h>
56 #include <blkid/blkid.h>
59 #include "sd-daemon.h"
69 #include "cgroup-util.h"
71 #include "path-util.h"
72 #include "loopback-setup.h"
73 #include "dev-setup.h"
78 #include "bus-error.h"
80 #include "bus-kernel.h"
83 #include "rtnl-util.h"
84 #include "udev-util.h"
85 #include "blkid-util.h"
87 #include "siphash24.h"
89 #include "base-filesystem.h"
91 #include "event-util.h"
92 #include "capability.h"
94 #include "btrfs-util.h"
95 #include "machine-image.h"
98 #include "seccomp-util.h"
/* Enumeration types. LinkJournal and Volatile are the types of
 * arg_link_journal and arg_volatile below. */
101 typedef enum ContainerStatus {
102 CONTAINER_TERMINATED,
106 typedef enum LinkJournal {
113 typedef enum Volatile {
/* Settings selected on the command line. parse_argv() assigns these and
 * the mount/setup helpers further down read them. */
119 static char *arg_directory = NULL;
120 static char *arg_template = NULL;
121 static char *arg_user = NULL;
122 static sd_id128_t arg_uuid = {};
123 static char *arg_machine = NULL;
124 static const char *arg_selinux_context = NULL;
125 static const char *arg_selinux_apifs_context = NULL;
126 static const char *arg_slice = NULL;
127 static bool arg_private_network = false;
128 static bool arg_read_only = false;
129 static bool arg_boot = false;
130 static bool arg_ephemeral = false;
131 static LinkJournal arg_link_journal = LINK_AUTO;
132 static bool arg_link_journal_try = false;
/* Default set of capabilities to retain, one bit per CAP_* number.
 * parse_argv() ORs in the --capability= bits (plus CAP_NET_ADMIN when a
 * private network was requested) and masks out the --drop-capability=
 * bits; drop_capabilities() passes the complement of this mask to
 * capability_bounding_set_drop(). */
133 static uint64_t arg_retain =
134 (1ULL << CAP_CHOWN) |
135 (1ULL << CAP_DAC_OVERRIDE) |
136 (1ULL << CAP_DAC_READ_SEARCH) |
137 (1ULL << CAP_FOWNER) |
138 (1ULL << CAP_FSETID) |
139 (1ULL << CAP_IPC_OWNER) |
141 (1ULL << CAP_LEASE) |
142 (1ULL << CAP_LINUX_IMMUTABLE) |
143 (1ULL << CAP_NET_BIND_SERVICE) |
144 (1ULL << CAP_NET_BROADCAST) |
145 (1ULL << CAP_NET_RAW) |
146 (1ULL << CAP_SETGID) |
147 (1ULL << CAP_SETFCAP) |
148 (1ULL << CAP_SETPCAP) |
149 (1ULL << CAP_SETUID) |
150 (1ULL << CAP_SYS_ADMIN) |
151 (1ULL << CAP_SYS_CHROOT) |
152 (1ULL << CAP_SYS_NICE) |
153 (1ULL << CAP_SYS_PTRACE) |
154 (1ULL << CAP_SYS_TTY_CONFIG) |
155 (1ULL << CAP_SYS_RESOURCE) |
156 (1ULL << CAP_SYS_BOOT) |
157 (1ULL << CAP_AUDIT_WRITE) |
158 (1ULL << CAP_AUDIT_CONTROL) |
/* arg_bind and arg_bind_ro hold (source, destination) pairs and arg_tmpfs
 * holds (path, options) pairs: parse_argv() appends two entries per
 * option and the mount helpers walk them with STRV_FOREACH_PAIR(). */
160 static char **arg_bind = NULL;
161 static char **arg_bind_ro = NULL;
162 static char **arg_tmpfs = NULL;
163 static char **arg_setenv = NULL;
164 static bool arg_quiet = false;
165 static bool arg_share_system = false;
166 static bool arg_register = true;
167 static bool arg_keep_unit = false;
168 static char **arg_network_interfaces = NULL;
169 static char **arg_network_macvlan = NULL;
170 static bool arg_network_veth = false;
171 static const char *arg_network_bridge = NULL;
/* 0xffffffffLU is also the value parse_argv() rejects as an unknown or
 * unsupported personality. */
172 static unsigned long arg_personality = 0xffffffffLU;
173 static char *arg_image = NULL;
174 static Volatile arg_volatile = VOLATILE_NO;
/* Print the usage summary to stdout. The program name shown in the first
 * line is taken from program_invocation_short_name. */
176 static void help(void) {
177 printf("%s [OPTIONS...] [PATH] [ARGUMENTS...]\n\n"
178 "Spawn a minimal namespace container for debugging, testing and building.\n\n"
179 " -h --help Show this help\n"
180 " --version Print version string\n"
181 " -q --quiet Do not show status information\n"
182 " -D --directory=PATH Root directory for the container\n"
183 " --template=PATH Initialize root directory from template directory,\n"
185 " -x --ephemeral Run container with snapshot of root directory, and\n"
186 " remove it after exit\n"
187 " -i --image=PATH File system device or disk image for the container\n"
188 " -b --boot Boot up full system (i.e. invoke init)\n"
189 " -u --user=USER Run the command under specified user or uid\n"
190 " -M --machine=NAME Set the machine name for the container\n"
191 " --uuid=UUID Set a specific machine UUID for the container\n"
192 " -S --slice=SLICE Place the container in the specified slice\n"
193 " --private-network Disable network in container\n"
194 " --network-interface=INTERFACE\n"
195 " Assign an existing network interface to the\n"
197 " --network-macvlan=INTERFACE\n"
198 " Create a macvlan network interface based on an\n"
199 " existing network interface to the container\n"
200 " --network-veth Add a virtual ethernet connection between host\n"
202 " --network-bridge=INTERFACE\n"
203 " Add a virtual ethernet connection between host\n"
204 " and container and add it to an existing bridge on\n"
206 " -Z --selinux-context=SECLABEL\n"
207 " Set the SELinux security context to be used by\n"
208 " processes in the container\n"
209 " -L --selinux-apifs-context=SECLABEL\n"
210 " Set the SELinux security context to be used by\n"
211 " API/tmpfs file systems in the container\n"
212 " --capability=CAP In addition to the default, retain specified\n"
214 " --drop-capability=CAP Drop the specified capability from the default set\n"
215 " --link-journal=MODE Link up guest journal, one of no, auto, guest, host,\n"
216 " try-guest, try-host\n"
217 " -j Equivalent to --link-journal=try-guest\n"
218 " --read-only Mount the root directory read-only\n"
219 " --bind=PATH[:PATH] Bind mount a file or directory from the host into\n"
221 " --bind-ro=PATH[:PATH] Similar, but creates a read-only bind mount\n"
222 " --tmpfs=PATH:[OPTIONS] Mount an empty tmpfs to the specified directory\n"
223 " --setenv=NAME=VALUE Pass an environment variable to PID 1\n"
224 " --share-system Share system namespaces with host\n"
225 " --register=BOOLEAN Register container as machine\n"
226 " --keep-unit Do not register a scope for the machine, reuse\n"
227 " the service unit nspawn is running in\n"
228 " --volatile[=MODE] Run the system in volatile mode\n",
229 program_invocation_short_name);
/* Store a cleaned-up version of path in *b.
 *
 * The path is resolved with canonicalize_file_name() or made absolute
 * against the current directory with path_make_absolute_cwd()
 * (NOTE(review): presumably the latter is the fallback when resolving
 * fails -- confirm), then run through path_kill_slashes() before being
 * assigned to *b. Callers pass the return value to log_error_errno(), so
 * failures are presumably reported as a negative errno-style value. */
232 static int set_sanitized_path(char **b, const char *path) {
238 p = canonicalize_file_name(path);
243 p = path_make_absolute_cwd(path);
249 *b = path_kill_slashes(p);
/* Parse the command line into the file-level arg_* settings.
 *
 * Options are read with getopt_long(). --capability= and
 * --drop-capability= are accumulated in the local masks plus/minus and
 * folded into arg_retain once all options have been read. After the
 * option loop, a series of checks logs an error for each combination of
 * options that may not be used together.
 *
 * NOTE(review): problems are reported through log_error() and
 * log_error_errno(); presumably a negative errno-style value is returned
 * in those cases -- confirm. */
253 static int parse_argv(int argc, char *argv[]) {
270 ARG_NETWORK_INTERFACE,
279 static const struct option options[] = {
280 { "help", no_argument, NULL, 'h' },
281 { "version", no_argument, NULL, ARG_VERSION },
282 { "directory", required_argument, NULL, 'D' },
283 { "template", required_argument, NULL, ARG_TEMPLATE },
284 { "ephemeral", no_argument, NULL, 'x' },
285 { "user", required_argument, NULL, 'u' },
286 { "private-network", no_argument, NULL, ARG_PRIVATE_NETWORK },
287 { "boot", no_argument, NULL, 'b' },
288 { "uuid", required_argument, NULL, ARG_UUID },
289 { "read-only", no_argument, NULL, ARG_READ_ONLY },
290 { "capability", required_argument, NULL, ARG_CAPABILITY },
291 { "drop-capability", required_argument, NULL, ARG_DROP_CAPABILITY },
292 { "link-journal", required_argument, NULL, ARG_LINK_JOURNAL },
293 { "bind", required_argument, NULL, ARG_BIND },
294 { "bind-ro", required_argument, NULL, ARG_BIND_RO },
295 { "tmpfs", required_argument, NULL, ARG_TMPFS },
296 { "machine", required_argument, NULL, 'M' },
297 { "slice", required_argument, NULL, 'S' },
298 { "setenv", required_argument, NULL, ARG_SETENV },
299 { "selinux-context", required_argument, NULL, 'Z' },
300 { "selinux-apifs-context", required_argument, NULL, 'L' },
301 { "quiet", no_argument, NULL, 'q' },
302 { "share-system", no_argument, NULL, ARG_SHARE_SYSTEM },
303 { "register", required_argument, NULL, ARG_REGISTER },
304 { "keep-unit", no_argument, NULL, ARG_KEEP_UNIT },
305 { "network-interface", required_argument, NULL, ARG_NETWORK_INTERFACE },
306 { "network-macvlan", required_argument, NULL, ARG_NETWORK_MACVLAN },
307 { "network-veth", no_argument, NULL, ARG_NETWORK_VETH },
308 { "network-bridge", required_argument, NULL, ARG_NETWORK_BRIDGE },
309 { "personality", required_argument, NULL, ARG_PERSONALITY },
310 { "image", required_argument, NULL, 'i' },
311 { "volatile", optional_argument, NULL, ARG_VOLATILE },
/* Bits collected from --capability= (plus) and --drop-capability= (minus) */
316 uint64_t plus = 0, minus = 0;
/* The leading '+' makes getopt_long() stop at the first non-option
 * argument (see getopt(3)). */
321 while ((c = getopt_long(argc, argv, "+hD:u:bL:M:jS:Z:qi:x", options, NULL)) >= 0)
330 puts(PACKAGE_STRING);
331 puts(SYSTEMD_FEATURES);
335 r = set_sanitized_path(&arg_directory, optarg);
337 return log_error_errno(r, "Invalid root directory: %m");
342 r = set_sanitized_path(&arg_template, optarg);
344 return log_error_errno(r, "Invalid template directory: %m");
349 r = set_sanitized_path(&arg_image, optarg);
351 return log_error_errno(r, "Invalid image path: %m");
356 arg_ephemeral = true;
361 arg_user = strdup(optarg);
367 case ARG_NETWORK_BRIDGE:
368 arg_network_bridge = optarg;
/* --network-veth and --network-interface= also switch on the private
 * network. */
372 case ARG_NETWORK_VETH:
373 arg_network_veth = true;
374 arg_private_network = true;
377 case ARG_NETWORK_INTERFACE:
378 if (strv_extend(&arg_network_interfaces, optarg) < 0)
381 arg_private_network = true;
384 case ARG_NETWORK_MACVLAN:
385 if (strv_extend(&arg_network_macvlan, optarg) < 0)
390 case ARG_PRIVATE_NETWORK:
391 arg_private_network = true;
399 r = sd_id128_from_string(optarg, &arg_uuid);
401 log_error("Invalid UUID: %s", optarg);
411 if (isempty(optarg)) {
415 if (!machine_name_is_valid(optarg)) {
416 log_error("Invalid machine name: %s", optarg);
420 r = free_and_strdup(&arg_machine, optarg);
428 arg_selinux_context = optarg;
432 arg_selinux_apifs_context = optarg;
436 arg_read_only = true;
/* Capability lists are comma-separated; the word "all" sets every bit of
 * the respective mask. */
440 case ARG_DROP_CAPABILITY: {
441 const char *state, *word;
444 FOREACH_WORD_SEPARATOR(word, length, optarg, ",", state) {
445 _cleanup_free_ char *t;
447 t = strndup(word, length);
451 if (streq(t, "all")) {
452 if (c == ARG_CAPABILITY)
453 plus = (uint64_t) -1;
455 minus = (uint64_t) -1;
459 cap = capability_from_name(t);
461 log_error("Failed to parse capability %s.", t);
465 if (c == ARG_CAPABILITY)
466 plus |= 1ULL << (uint64_t) cap;
468 minus |= 1ULL << (uint64_t) cap;
476 arg_link_journal = LINK_GUEST;
477 arg_link_journal_try = true;
/* The "try-" variants select the same mode as their plain counterpart
 * and additionally set arg_link_journal_try. */
480 case ARG_LINK_JOURNAL:
481 if (streq(optarg, "auto")) {
482 arg_link_journal = LINK_AUTO;
483 arg_link_journal_try = false;
484 } else if (streq(optarg, "no")) {
485 arg_link_journal = LINK_NO;
486 arg_link_journal_try = false;
487 } else if (streq(optarg, "guest")) {
488 arg_link_journal = LINK_GUEST;
489 arg_link_journal_try = false;
490 } else if (streq(optarg, "host")) {
491 arg_link_journal = LINK_HOST;
492 arg_link_journal_try = false;
493 } else if (streq(optarg, "try-guest")) {
494 arg_link_journal = LINK_GUEST;
495 arg_link_journal_try = true;
496 } else if (streq(optarg, "try-host")) {
497 arg_link_journal = LINK_HOST;
498 arg_link_journal_try = true;
500 log_error("Failed to parse link journal mode %s", optarg);
/* --bind= and --bind-ro= share this code; each option appends a
 * (source, destination) pair of absolute paths to the chosen list. */
508 _cleanup_free_ char *a = NULL, *b = NULL;
512 x = c == ARG_BIND ? &arg_bind : &arg_bind_ro;
514 e = strchr(optarg, ':');
516 a = strndup(optarg, e - optarg);
526 if (!path_is_absolute(a) || !path_is_absolute(b)) {
527 log_error("Invalid bind mount specification: %s", optarg);
531 r = strv_extend(x, a);
535 r = strv_extend(x, b);
/* --tmpfs= appends a (path, options) pair to arg_tmpfs; "mode=0755" is
 * the option string used as a default. */
543 _cleanup_free_ char *a = NULL, *b = NULL;
546 e = strchr(optarg, ':');
548 a = strndup(optarg, e - optarg);
552 b = strdup("mode=0755");
558 if (!path_is_absolute(a)) {
559 log_error("Invalid tmpfs specification: %s", optarg);
563 r = strv_push(&arg_tmpfs, a);
569 r = strv_push(&arg_tmpfs, b);
581 if (!env_assignment_is_valid(optarg)) {
582 log_error("Environment variable assignment '%s' is not valid.", optarg);
586 n = strv_env_set(arg_setenv, optarg);
590 strv_free(arg_setenv);
599 case ARG_SHARE_SYSTEM:
600 arg_share_system = true;
604 r = parse_boolean(optarg);
606 log_error("Failed to parse --register= argument: %s", optarg);
614 arg_keep_unit = true;
617 case ARG_PERSONALITY:
619 arg_personality = personality_from_string(optarg);
620 if (arg_personality == 0xffffffffLU) {
621 log_error("Unknown or unsupported personality '%s'.", optarg);
/* --volatile accepts a boolean or the word "state". */
630 arg_volatile = VOLATILE_YES;
632 r = parse_boolean(optarg);
634 if (streq(optarg, "state"))
635 arg_volatile = VOLATILE_STATE;
637 log_error("Failed to parse --volatile= argument: %s", optarg);
641 arg_volatile = r ? VOLATILE_YES : VOLATILE_NO;
650 assert_not_reached("Unhandled option");
/* Sharing the system namespaces turns machine registration off. */
653 if (arg_share_system)
654 arg_register = false;
656 if (arg_boot && arg_share_system) {
657 log_error("--boot and --share-system may not be combined.");
661 if (arg_keep_unit && cg_pid_get_owner_uid(0, NULL) >= 0) {
662 log_error("--keep-unit may not be used when invoked from a user session.");
666 if (arg_directory && arg_image) {
667 log_error("--directory= and --image= may not be combined.");
671 if (arg_template && arg_image) {
672 log_error("--template= and --image= may not be combined.");
676 if (arg_template && !(arg_directory || arg_machine)) {
677 log_error("--template= needs --directory= or --machine=.");
681 if (arg_ephemeral && arg_template) {
682 log_error("--ephemeral and --template= may not be combined.");
686 if (arg_ephemeral && arg_image) {
687 log_error("--ephemeral and --image= may not be combined.");
691 if (arg_ephemeral && !IN_SET(arg_link_journal, LINK_NO, LINK_AUTO)) {
692 log_error("--ephemeral and --link-journal= may not be combined.");
696 if (arg_volatile != VOLATILE_NO && arg_read_only) {
697 log_error("Cannot combine --read-only with --volatile. Note that --volatile already implies a read-only base hierarchy.");
/* Final capability mask: defaults plus requested additions (and
 * CAP_NET_ADMIN for a private network), minus everything that was
 * explicitly dropped. */
701 arg_retain = (arg_retain | plus | (arg_private_network ? 1ULL << CAP_NET_ADMIN : 0)) & ~minus;
/* Mount the entries of mount_table below dest.
 *
 * For every entry the target path is built from dest and the entry's
 * "where" field, checked with path_is_mount_point(), created with
 * mkdir_p() and finally passed to mount(). When
 * arg_selinux_apifs_context is set, tmpfs and devpts entries get a
 * context="..." option appended to their mount options.
 *
 * Failures of entries marked fatal are logged as errors, failures of the
 * others only as warnings. */
706 static int mount_all(const char *dest) {
708 typedef struct MountPoint {
717 static const MountPoint mount_table[] = {
718 { "proc", "/proc", "proc", NULL, MS_NOSUID|MS_NOEXEC|MS_NODEV, true },
719 { "/proc/sys", "/proc/sys", NULL, NULL, MS_BIND, true }, /* Bind mount first */
720 { NULL, "/proc/sys", NULL, NULL, MS_BIND|MS_RDONLY|MS_REMOUNT, true }, /* Then, make it r/o */
721 { "sysfs", "/sys", "sysfs", NULL, MS_RDONLY|MS_NOSUID|MS_NOEXEC|MS_NODEV, true },
722 { "tmpfs", "/dev", "tmpfs", "mode=755", MS_NOSUID|MS_STRICTATIME, true },
723 { "devpts", "/dev/pts", "devpts","newinstance,ptmxmode=0666,mode=620,gid=" STRINGIFY(TTY_GID), MS_NOSUID|MS_NOEXEC, true },
724 { "tmpfs", "/dev/shm", "tmpfs", "mode=1777", MS_NOSUID|MS_NODEV|MS_STRICTATIME, true },
725 { "tmpfs", "/run", "tmpfs", "mode=755", MS_NOSUID|MS_NODEV|MS_STRICTATIME, true },
727 { "/sys/fs/selinux", "/sys/fs/selinux", NULL, NULL, MS_BIND, false }, /* Bind mount first */
728 { NULL, "/sys/fs/selinux", NULL, NULL, MS_BIND|MS_RDONLY|MS_REMOUNT, false }, /* Then, make it r/o */
735 for (k = 0; k < ELEMENTSOF(mount_table); k++) {
736 _cleanup_free_ char *where = NULL;
738 _cleanup_free_ char *options = NULL;
743 where = strjoin(dest, "/", mount_table[k].where, NULL);
747 t = path_is_mount_point(where, true);
749 log_error_errno(t, "Failed to detect whether %s is a mount point: %m", where);
757 /* Skip this entry if it is not a remount. */
758 if (mount_table[k].what && t > 0)
761 t = mkdir_p(where, 0755);
763 if (mount_table[k].fatal) {
764 log_error_errno(t, "Failed to create directory %s: %m", where);
769 log_warning_errno(t, "Failed to create directory %s: %m", where);
/* Label tmpfs and devpts mounts with the requested SELinux context. */
775 if (arg_selinux_apifs_context &&
776 (streq_ptr(mount_table[k].what, "tmpfs") || streq_ptr(mount_table[k].what, "devpts"))) {
777 options = strjoin(mount_table[k].options, ",context=\"", arg_selinux_apifs_context, "\"", NULL);
784 o = mount_table[k].options;
787 if (mount(mount_table[k].what,
790 mount_table[k].flags,
793 if (mount_table[k].fatal) {
794 log_error_errno(errno, "mount(%s) failed: %m", where);
799 log_warning_errno(errno, "mount(%s) failed: %m", where);
/* Bind mount every (source, destination) pair of l into the tree at dest.
 *
 * For each pair the source is stat()ed and the destination path is built
 * by appending the second element to dest. If the destination exists, its
 * file type must match the source's. Otherwise a mount point of the
 * matching type is created; block and character devices are refused. The
 * source is then bind mounted onto the destination and remounted
 * read-only with bind_remount_recursive().
 * NOTE(review): the read-only remount is presumably only done when ro is
 * set -- confirm.
 *
 * Errors are logged and returned through log_error_errno(). */
806 static int mount_binds(const char *dest, char **l, bool ro) {
809 STRV_FOREACH_PAIR(x, y, l) {
810 _cleanup_free_ char *where = NULL;
811 struct stat source_st, dest_st;
814 if (stat(*x, &source_st) < 0)
815 return log_error_errno(errno, "Failed to stat %s: %m", *x);
817 where = strappend(dest, *y);
821 r = stat(where, &dest_st);
823 if ((source_st.st_mode & S_IFMT) != (dest_st.st_mode & S_IFMT)) {
824 log_error("The file types of %s and %s do not match. Refusing bind mount", *x, where);
827 } else if (errno == ENOENT) {
828 r = mkdir_parents_label(where, 0755);
830 return log_error_errno(r, "Failed to bind mount %s: %m", *x);
832 log_error_errno(errno, "Failed to bind mount %s: %m", *x);
836 /* Create the mount point, but be conservative -- refuse to create block
837 * and char devices. */
838 if (S_ISDIR(source_st.st_mode)) {
839 r = mkdir_label(where, 0755);
/* mkdir_label() reports its error through the negative return value
 * (which is what gets logged below), so test r rather than errno. */
840 if (r < 0 && r != -EEXIST)
841 return log_error_errno(r, "Failed to create mount point %s: %m", where);
842 } else if (S_ISFIFO(source_st.st_mode)) {
843 r = mkfifo(where, 0644);
844 if (r < 0 && errno != EEXIST)
845 return log_error_errno(errno, "Failed to create mount point %s: %m", where);
846 } else if (S_ISSOCK(source_st.st_mode)) {
847 r = mknod(where, 0644 | S_IFSOCK, 0);
848 if (r < 0 && errno != EEXIST)
849 return log_error_errno(errno, "Failed to create mount point %s: %m", where);
850 } else if (S_ISREG(source_st.st_mode)) {
853 return log_error_errno(r, "Failed to create mount point %s: %m", where);
855 log_error("Refusing to create mountpoint for file: %s", *x);
859 if (mount(*x, where, "bind", MS_BIND, NULL) < 0)
860 return log_error_errno(errno, "mount(%s) failed: %m", where);
863 r = bind_remount_recursive(where, true);
865 return log_error_errno(r, "Read-Only bind mount failed: %m");
/* Mount one cgroup hierarchy at <dest>/sys/fs/cgroup/<hierarchy>.
 *
 * controller is passed to mount() as the option string for the "cgroup"
 * file system; read_only adds MS_RDONLY to the mount flags. The target
 * is first checked with path_is_mount_point().
 * NOTE(review): presumably an already mounted target is left alone --
 * confirm. */
872 static int mount_cgroup_hierarchy(const char *dest, const char *controller, const char *hierarchy, bool read_only) {
876 to = strappenda(dest, "/sys/fs/cgroup/", hierarchy);
878 r = path_is_mount_point(to, false);
880 return log_error_errno(r, "Failed to determine if %s is mounted already: %m", to);
886 if (mount("cgroup", to, "cgroup", MS_NOSUID|MS_NOEXEC|MS_NODEV|(read_only ? MS_RDONLY : 0), controller) < 0)
887 return log_error_errno(errno, "Failed to mount to %s: %m", to);
/* Set up the cgroup file system tree below <dest>/sys/fs/cgroup.
 *
 * Steps, in order:
 *  - collect the kernel's controllers and this process's own cgroup path;
 *  - mount a tmpfs on <dest>/sys/fs/cgroup;
 *  - for each controller, look at /sys/fs/cgroup/<controller> on the
 *    host: a symlink means a combined hierarchy, which is mounted
 *    read-only under the link target's name and symlinked from the
 *    controller name; anything else is mounted read-only as a hierarchy
 *    of its own;
 *  - mount the name=systemd hierarchy writable, turn our own cgroup into
 *    a bind mount, then remount the systemd root read-only;
 *  - remount the tmpfs read-only. */
892 static int mount_cgroup(const char *dest) {
893 _cleanup_set_free_free_ Set *controllers = NULL;
894 _cleanup_free_ char *own_cgroup_path = NULL;
895 const char *cgroup_root, *systemd_root, *systemd_own;
898 controllers = set_new(&string_hash_ops);
902 r = cg_kernel_controllers(controllers);
904 return log_error_errno(r, "Failed to determine cgroup controllers: %m");
906 r = cg_pid_get_path(NULL, 0, &own_cgroup_path);
908 return log_error_errno(r, "Failed to determine our own cgroup path: %m");
910 cgroup_root = strappenda(dest, "/sys/fs/cgroup");
911 if (mount("tmpfs", cgroup_root, "tmpfs", MS_NOSUID|MS_NOEXEC|MS_NODEV|MS_STRICTATIME, "mode=755") < 0)
912 return log_error_errno(errno, "Failed to mount tmpfs to /sys/fs/cgroup: %m");
915 _cleanup_free_ char *controller = NULL, *origin = NULL, *combined = NULL;
/* Controllers are taken out of the set one at a time. */
917 controller = set_steal_first(controllers);
921 origin = strappend("/sys/fs/cgroup/", controller);
925 r = readlink_malloc(origin, &combined);
927 /* Not a symbolic link, but directly a single cgroup hierarchy */
929 r = mount_cgroup_hierarchy(dest, controller, controller, true);
934 return log_error_errno(r, "Failed to read link %s: %m", origin);
936 _cleanup_free_ char *target = NULL;
938 target = strjoin(dest, "/sys/fs/cgroup/", controller, NULL);
942 /* A symbolic link, a combination of controllers in one hierarchy */
944 if (!filename_is_valid(combined)) {
945 log_warning("Ignoring invalid combined hierarchy %s.", combined);
949 r = mount_cgroup_hierarchy(dest, combined, combined, true);
953 if (symlink(combined, target) < 0)
954 return log_error_errno(errno, "Failed to create symlink for combined hiearchy: %m");
/* The systemd hierarchy is the only one mounted writable at first. */
958 r = mount_cgroup_hierarchy(dest, "name=systemd", "systemd", false);
962 /* Make our own cgroup a (writable) bind mount */
963 systemd_own = strappenda(dest, "/sys/fs/cgroup/systemd", own_cgroup_path);
964 if (mount(systemd_own, systemd_own, NULL, MS_BIND, NULL) < 0)
965 return log_error_errno(errno, "Failed to turn %s into a bind mount: %m", own_cgroup_path);
967 /* And then remount the systemd cgroup root read-only */
968 systemd_root = strappenda(dest, "/sys/fs/cgroup/systemd");
969 if (mount(NULL, systemd_root, NULL, MS_BIND|MS_REMOUNT|MS_NOSUID|MS_NOEXEC|MS_NODEV|MS_RDONLY, NULL) < 0)
970 return log_error_errno(errno, "Failed to mount cgroup root read-only: %m");
/* Finally make the tmpfs holding the hierarchies read-only as well. */
972 if (mount(NULL, cgroup_root, NULL, MS_REMOUNT|MS_NOSUID|MS_NOEXEC|MS_NODEV|MS_STRICTATIME|MS_RDONLY, "mode=755") < 0)
973 return log_error_errno(errno, "Failed to remount %s read-only: %m", cgroup_root);
/* Mount a tmpfs for every (path, options) pair in arg_tmpfs.
 *
 * The path is taken relative to dest. The mount point is created with
 * mkdir_label(), where an already existing directory is not an error,
 * and the pair's second element is handed to mount() as the option
 * string. */
978 static int mount_tmpfs(const char *dest) {
981 STRV_FOREACH_PAIR(i, o, arg_tmpfs) {
982 _cleanup_free_ char *where = NULL;
985 where = strappend(dest, *i);
989 r = mkdir_label(where, 0755);
990 if (r < 0 && r != -EEXIST)
991 return log_error_errno(r, "Creating mount point for tmpfs %s failed: %m", where);
993 if (mount("tmpfs", where, "tmpfs", MS_NODEV|MS_STRICTATIME, *o) < 0)
994 return log_error_errno(errno, "tmpfs mount to %s failed: %m", where);
/* Make <dest>/etc/localtime point to the same zone as the host's
 * /etc/localtime.
 *
 * The host link must point into /usr/share/zoneinfo/ (relative or
 * absolute form). Nothing is changed when the container link already
 * names the same zone, or when the zone file does not exist inside the
 * container. Otherwise the container's link is replaced by a relative
 * symlink "../usr/share/zoneinfo/<zone>". Most problems are only logged
 * as warnings. */
1000 static int setup_timezone(const char *dest) {
1001 _cleanup_free_ char *where = NULL, *p = NULL, *q = NULL, *check = NULL, *what = NULL;
1007 /* Fix the timezone, if possible */
1008 r = readlink_malloc("/etc/localtime", &p);
1010 log_warning("/etc/localtime is not a symlink, not updating container timezone.");
/* z: the zone name, i.e. what follows the zoneinfo prefix on the host */
1014 z = path_startswith(p, "../usr/share/zoneinfo/");
1016 z = path_startswith(p, "/usr/share/zoneinfo/");
1018 log_warning("/etc/localtime does not point into /usr/share/zoneinfo/, not updating container timezone.");
1022 where = strappend(dest, "/etc/localtime");
/* y: the zone name the container currently points to, if any */
1026 r = readlink_malloc(where, &q);
1028 y = path_startswith(q, "../usr/share/zoneinfo/");
1030 y = path_startswith(q, "/usr/share/zoneinfo/");
1032 /* Already pointing to the right place? Then do nothing .. */
1033 if (y && streq(y, z))
1037 check = strjoin(dest, "/usr/share/zoneinfo/", z, NULL);
1041 if (access(check, F_OK) < 0) {
1042 log_warning("Timezone %s does not exist in container, not updating container timezone.", z);
1046 what = strappend("../usr/share/zoneinfo/", z);
1050 r = mkdir_parents(where, 0755);
1052 log_error_errno(r, "Failed to create directory for timezone info %s in container: %m", where);
1058 if (r < 0 && errno != ENOENT) {
1059 log_error_errno(errno, "Failed to remove existing timezone info %s in container: %m", where);
1064 if (symlink(what, where) < 0) {
1065 log_error_errno(errno, "Failed to correct timezone of container: %m");
/* Copy the host's /etc/resolv.conf to <dest>/etc/resolv.conf.
 *
 * This is a best-effort step: failures to create the parent directory or
 * to copy the file are logged as warnings only. The function also
 * checks arg_private_network before doing any work.
 * NOTE(review): presumably it returns early in that case -- confirm. */
1072 static int setup_resolv_conf(const char *dest) {
1073 _cleanup_free_ char *where = NULL;
1078 if (arg_private_network)
1081 /* Fix resolv.conf, if possible */
1082 where = strappend(dest, "/etc/resolv.conf");
1086 /* We don't really care for the results of this really. If it
1087 * fails, it fails, but meh... */
1088 r = mkdir_parents(where, 0755);
1090 log_warning_errno(r, "Failed to create parent directory for resolv.conf %s: %m", where);
1095 r = copy_file("/etc/resolv.conf", where, O_TRUNC|O_NOFOLLOW, 0644);
1097 log_warning_errno(r, "Failed to copy /etc/resolv.conf to %s: %m", where);
/* Implement --volatile=state: remount directory read-only and mount a
 * fresh tmpfs on <directory>/var. Only relevant when arg_volatile is
 * VOLATILE_STATE. */
1105 static int setup_volatile_state(const char *directory) {
1111 if (arg_volatile != VOLATILE_STATE)
1114 /* --volatile=state means we simply overmount /var
1115 with a tmpfs, and the rest read-only. */
1117 r = bind_remount_recursive(directory, true);
1119 return log_error_errno(r, "Failed to remount %s read-only: %m", directory);
1121 p = strappenda(directory, "/var");
/* NOTE(review): the message below names directory, although the path
 * being set up here appears to be p -- confirm and adjust. */
1123 if (r < 0 && errno != EEXIST)
1124 return log_error_errno(errno, "Failed to create %s: %m", directory);
1126 if (mount("tmpfs", p, "tmpfs", MS_STRICTATIME, "mode=755") < 0)
1127 return log_error_errno(errno, "Failed to mount tmpfs to /var: %m");
/* Implement --volatile=yes: build a new root in a temporary directory
 * and move it over directory.
 *
 * A tmpfs is mounted on a mkdtemp() directory, <directory>/usr is bind
 * mounted (recursively) to its /usr and remounted read-only, and the
 * result is then moved onto directory with MS_MOVE. tmpfs_mounted and
 * bind_mounted record how far the setup got.
 * NOTE(review): presumably these flags drive cleanup on the failure
 * paths -- confirm. */
1132 static int setup_volatile(const char *directory) {
1133 bool tmpfs_mounted = false, bind_mounted = false;
1134 char template[] = "/tmp/nspawn-volatile-XXXXXX";
1140 if (arg_volatile != VOLATILE_YES)
1143 /* --volatile=yes means we mount a tmpfs to the root dir, and
1144 the original /usr to use inside it, and that read-only. */
1146 if (!mkdtemp(template))
1147 return log_error_errno(errno, "Failed to create temporary directory: %m");
1149 if (mount("tmpfs", template, "tmpfs", MS_STRICTATIME, "mode=755") < 0) {
1150 log_error_errno(errno, "Failed to mount tmpfs for root directory: %m");
1155 tmpfs_mounted = true;
/* f: the original /usr, t: its place inside the new tmpfs root */
1157 f = strappenda(directory, "/usr");
1158 t = strappenda(template, "/usr");
1161 if (r < 0 && errno != EEXIST) {
1162 log_error_errno(errno, "Failed to create %s: %m", t);
1167 if (mount(f, t, "bind", MS_BIND|MS_REC, NULL) < 0) {
1168 log_error_errno(errno, "Failed to create /usr bind mount: %m");
1173 bind_mounted = true;
1175 r = bind_remount_recursive(t, true);
1177 log_error_errno(r, "Failed to remount %s read-only: %m", t);
1181 if (mount(template, directory, NULL, MS_MOVE, NULL) < 0) {
1182 log_error_errno(errno, "Failed to move root mount: %m");
/* Format the 16 bytes of id as a lowercase hex UUID string in the
 * 8-4-4-4-12 digit grouping. s is the caller's buffer; 37 bytes hold the
 * 36 characters of that form plus the terminating NUL. */
1200 static char* id128_format_as_uuid(sd_id128_t id, char s[37]) {
1203 "%02x%02x%02x%02x-%02x%02x-%02x%02x-%02x%02x-%02x%02x%02x%02x%02x%02x",
1204 SD_ID128_FORMAT_VAL(id));
/* Give the container its own boot ID.
 *
 * A random 128-bit ID is formatted as a UUID and written to
 * <dest>/dev/proc-sys-kernel-random-boot-id, and that file is bind
 * mounted over <dest>/proc/sys/kernel/random/boot_id and then remounted
 * read-only. Failing to make it read-only is only a warning. The
 * function also checks arg_share_system before doing any work.
 * NOTE(review): presumably it returns early in that case -- confirm. */
1209 static int setup_boot_id(const char *dest) {
1210 _cleanup_free_ char *from = NULL, *to = NULL;
1211 sd_id128_t rnd = {};
1217 if (arg_share_system)
1220 /* Generate a new randomized boot ID, so that each boot-up of
1221 * the container gets a new one */
1223 from = strappend(dest, "/dev/proc-sys-kernel-random-boot-id");
1224 to = strappend(dest, "/proc/sys/kernel/random/boot_id");
1228 r = sd_id128_randomize(&rnd);
1230 return log_error_errno(r, "Failed to generate random boot id: %m");
1232 id128_format_as_uuid(rnd, as_uuid);
1234 r = write_string_file(from, as_uuid);
1236 return log_error_errno(r, "Failed to write boot id: %m");
1238 if (mount(from, to, "bind", MS_BIND, NULL) < 0) {
1239 log_error_errno(errno, "Failed to bind mount boot id: %m");
1241 } else if (mount(from, to, "bind", MS_BIND|MS_REMOUNT|MS_RDONLY, NULL))
1242 log_warning_errno(errno, "Failed to make boot id read-only: %m");
/* Recreate a fixed list of host device nodes below <dest>/dev.
 *
 * For every name in devnodes the host node /dev/<name> is stat()ed. A
 * stat() failure other than ENOENT is an error, and anything that is
 * neither a character nor a block device is rejected. The node is then
 * created at <dest>/dev/<name> with mknod(), using the host node's mode
 * and device number. */
1248 static int copy_devnodes(const char *dest) {
1250 static const char devnodes[] =
1261 _cleanup_umask_ mode_t u;
1267 NULSTR_FOREACH(d, devnodes) {
1268 _cleanup_free_ char *from = NULL, *to = NULL;
1271 from = strappend("/dev/", d);
1272 to = strjoin(dest, "/dev/", d, NULL);
1276 if (stat(from, &st) < 0) {
1278 if (errno != ENOENT)
1279 return log_error_errno(errno, "Failed to stat %s: %m", from);
1281 } else if (!S_ISCHR(st.st_mode) && !S_ISBLK(st.st_mode)) {
1283 log_error("%s is not a char or block device, cannot copy", from);
1287 r = mkdir_parents(to, 0775);
1289 log_error_errno(r, "Failed to create parent directory of %s: %m", to);
/* Report the node that could not be created, not the container root. */
1293 if (mknod(to, st.st_mode, st.st_rdev) < 0)
1294 return log_error_errno(errno, "mknod(%s) failed: %m", to);
/* Create <dest>/dev/ptmx as a symlink to "pts/ptmx". */
1301 static int setup_ptmx(const char *dest) {
1302 _cleanup_free_ char *p = NULL;
1304 p = strappend(dest, "/dev/ptmx");
1308 if (symlink("pts/ptmx", p) < 0)
1309 return log_error_errno(errno, "Failed to create /dev/ptmx symlink: %m");
/* Make the TTY named by console available as <dest>/dev/console.
 *
 * The TTY's mode and ownership are set to 0600, uid 0, gid 0. A
 * placeholder device node carrying /dev/null's type and device number is
 * created at <dest>/dev/console, and console is bind mounted on top of
 * it. */
1314 static int setup_dev_console(const char *dest, const char *console) {
1315 _cleanup_umask_ mode_t u;
1325 if (stat("/dev/null", &st) < 0)
1326 return log_error_errno(errno, "Failed to stat /dev/null: %m");
1328 r = chmod_and_chown(console, 0600, 0, 0);
1330 return log_error_errno(r, "Failed to correct access mode for TTY: %m");
1332 /* We need to bind mount the right tty to /dev/console since
1333 * ptys can only exist on pts file systems. To have something
1334 * to bind mount things on we create a device node first, and
1335 * use /dev/null for that since the cgroups device policy
1336 * allows us to create that freely, while we cannot create
1337 * /dev/console. (Note that the major minor doesn't actually
1338 * matter here, since we mount it over anyway). */
1340 to = strappenda(dest, "/dev/console");
1341 if (mknod(to, (st.st_mode & ~07777) | 0600, st.st_rdev) < 0)
1342 return log_error_errno(errno, "mknod() for /dev/console failed: %m");
1344 if (mount(console, to, "bind", MS_BIND, NULL) < 0)
1345 return log_error_errno(errno, "Bind mount for /dev/console failed: %m");
/* Provide a FIFO-backed /proc/kmsg inside the container.
 *
 * A FIFO is created at <dest>/dev/kmsg and bind mounted onto
 * <dest>/proc/kmsg. The FIFO is then opened and the file descriptor is
 * sent over kmsg_socket as SCM_RIGHTS ancillary data, so that it stays
 * open while the child runs.
 *
 * kmsg_socket must be a valid (non-negative) socket descriptor. */
1350 static int setup_kmsg(const char *dest, int kmsg_socket) {
1351 _cleanup_free_ char *from = NULL, *to = NULL;
1353 _cleanup_umask_ mode_t u;
/* Control message buffer sized for exactly one file descriptor */
1355 struct cmsghdr cmsghdr;
1356 uint8_t buf[CMSG_SPACE(sizeof(int))];
1358 struct msghdr mh = {
1359 .msg_control = &control,
1360 .msg_controllen = sizeof(control),
1362 struct cmsghdr *cmsg;
1365 assert(kmsg_socket >= 0);
1369 /* We create the kmsg FIFO as /dev/kmsg, but immediately
1370 * delete it after bind mounting it to /proc/kmsg. While FIFOs
1371 * on the reading side behave very similar to /proc/kmsg,
1372 * their writing side behaves differently from /dev/kmsg in
1373 * that writing blocks when nothing is reading. In order to
1374 * avoid any problems with containers deadlocking due to this
1375 * we simply make /dev/kmsg unavailable to the container. */
1376 if (asprintf(&from, "%s/dev/kmsg", dest) < 0 ||
1377 asprintf(&to, "%s/proc/kmsg", dest) < 0)
1380 if (mkfifo(from, 0600) < 0)
1381 return log_error_errno(errno, "mkfifo() for /dev/kmsg failed: %m");
1383 r = chmod_and_chown(from, 0600, 0, 0);
1385 return log_error_errno(r, "Failed to correct access mode for /dev/kmsg: %m");
1387 if (mount(from, to, "bind", MS_BIND, NULL) < 0)
1388 return log_error_errno(errno, "Bind mount for /proc/kmsg failed: %m");
1390 fd = open(from, O_RDWR|O_NDELAY|O_CLOEXEC);
1392 return log_error_errno(errno, "Failed to open fifo: %m");
/* Pack fd into a single SCM_RIGHTS control message. */
1394 cmsg = CMSG_FIRSTHDR(&mh);
1395 cmsg->cmsg_level = SOL_SOCKET;
1396 cmsg->cmsg_type = SCM_RIGHTS;
1397 cmsg->cmsg_len = CMSG_LEN(sizeof(int));
1398 memcpy(CMSG_DATA(cmsg), &fd, sizeof(int));
1400 mh.msg_controllen = cmsg->cmsg_len;
1402 /* Store away the fd in the socket, so that it stays open as
1403 * long as we run the child */
1404 k = sendmsg(kmsg_socket, &mh, MSG_DONTWAIT|MSG_NOSIGNAL);
1408 return log_error_errno(errno, "Failed to send FIFO fd: %m");
1410 /* And now make the FIFO unavailable as /dev/kmsg... */
/* Set the hostname to arg_machine via sethostname_idempotent(). The
 * function also checks arg_share_system before doing so.
 * NOTE(review): presumably the hostname is left untouched when the
 * system namespaces are shared -- confirm. */
1415 static int setup_hostname(void) {
1417 if (arg_share_system)
1420 if (sethostname_idempotent(arg_machine) < 0)
/* Link the container's journal directory with the host's, according to
 * arg_link_journal.
 *
 * The container's machine ID is read from <directory>/etc/machine-id and
 * compared with the host's; equal IDs are refused. The two paths involved
 * are /var/log/journal/<id> on the host (p) and the same path below
 * directory (q). In LINK_GUEST mode p becomes a symlink to q. In
 * LINK_HOST mode p is created on the host and bind mounted onto q. When
 * arg_link_journal_try is set, failures to create the symlink or the host
 * directory are only logged at debug level.
 *
 * Calls such as mkdir_p() and readlink_and_make_absolute() report their
 * error in the negative return value, so that value (r) is what gets
 * logged for them; errno is only used after plain system calls. */
1426 static int setup_journal(const char *directory) {
1427 sd_id128_t machine_id, this_id;
1428 _cleanup_free_ char *p = NULL, *b = NULL, *q = NULL, *d = NULL;
1432 /* Don't link journals in ephemeral mode */
1436 p = strappend(directory, "/etc/machine-id");
1440 r = read_one_line_file(p, &b);
1441 if (r == -ENOENT && arg_link_journal == LINK_AUTO)
1444 return log_error_errno(r, "Failed to read machine ID from %s: %m", p);
1447 if (isempty(id) && arg_link_journal == LINK_AUTO)
1450 /* Verify validity */
1451 r = sd_id128_from_string(id, &machine_id);
1453 return log_error_errno(r, "Failed to parse machine ID from %s: %m", p);
1455 r = sd_id128_get_machine(&this_id);
1457 return log_error_errno(r, "Failed to retrieve machine ID: %m");
1459 if (sd_id128_equal(machine_id, this_id)) {
1460 log_full(arg_link_journal == LINK_AUTO ? LOG_WARNING : LOG_ERR,
1461 "Host and machine ids are equal (%s): refusing to link journals", id);
1462 if (arg_link_journal == LINK_AUTO)
1467 if (arg_link_journal == LINK_NO)
/* p: journal directory on the host, q: journal directory in the guest */
1471 p = strappend("/var/log/journal/", id);
1472 q = strjoin(directory, "/var/log/journal/", id, NULL);
1476 if (path_is_mount_point(p, false) > 0) {
1477 if (arg_link_journal != LINK_AUTO) {
1478 log_error("%s: already a mount point, refusing to use for journal", p);
1485 if (path_is_mount_point(q, false) > 0) {
1486 if (arg_link_journal != LINK_AUTO) {
1487 log_error("%s: already a mount point, refusing to use for journal", q);
1494 r = readlink_and_make_absolute(p, &d);
1496 if ((arg_link_journal == LINK_GUEST ||
1497 arg_link_journal == LINK_AUTO) &&
1500 r = mkdir_p(q, 0755);
1502 log_warning_errno(r, "Failed to create directory %s: %m", q);
1507 return log_error_errno(errno, "Failed to remove symlink %s: %m", p);
1508 } else if (r == -EINVAL) {
1510 if (arg_link_journal == LINK_GUEST &&
1513 if (errno == ENOTDIR) {
1514 log_error("%s already exists and is neither a symlink nor a directory", p);
1517 log_error_errno(errno, "Failed to remove %s: %m", p);
1521 } else if (r != -ENOENT) {
1522 log_error_errno(r, "readlink(%s) failed: %m", p);
1526 if (arg_link_journal == LINK_GUEST) {
1528 if (symlink(q, p) < 0) {
1529 if (arg_link_journal_try) {
1530 log_debug_errno(errno, "Failed to symlink %s to %s, skipping journal setup: %m", q, p);
1533 log_error_errno(errno, "Failed to symlink %s to %s: %m", q, p);
1538 r = mkdir_p(q, 0755);
1540 log_warning_errno(r, "Failed to create directory %s: %m", q);
1544 if (arg_link_journal == LINK_HOST) {
1545 /* don't create parents here -- if the host doesn't have
1546 * permanent journal set up, don't force it here */
1549 if (arg_link_journal_try) {
1550 log_debug_errno(errno, "Failed to create %s, skipping journal setup: %m", p);
1553 log_error_errno(errno, "Failed to create %s: %m", p);
1558 } else if (access(p, F_OK) < 0)
1561 if (dir_is_empty(q) == 0)
1562 log_warning("%s is not empty, proceeding anyway.", q);
1564 r = mkdir_p(q, 0755);
1566 log_error_errno(r, "Failed to create %s: %m", q);
1570 if (mount(p, q, "bind", MS_BIND, NULL) < 0)
1571 return log_error_errno(errno, "Failed to bind mount journal from host into guest: %m");
/* Hands the complement of arg_retain to capability_bounding_set_drop(),
 * i.e. every capability not listed in arg_retain is named for dropping. */
1576 static int drop_capabilities(void) {
1577 return capability_bounding_set_drop(~arg_retain, false);
/* Registers the container with org.freedesktop.machine1 on the system bus.
 *
 * With arg_keep_unit set, RegisterMachineWithNetwork is invoked directly.
 * Otherwise a CreateMachineWithNetwork message is assembled by hand so
 * that an array of (sv) unit properties can be appended: Slice (only when
 * arg_slice is non-empty), DevicePolicy=strict and a DeviceAllow list.
 *
 * In both variants arg_uuid and arg_directory are passed along, and
 * local_ifindex is forwarded preceded by a count that is 1 when the index
 * is positive and 0 otherwise. A failing bus call is logged together with
 * the message taken from the bus error. */
1580 static int register_machine(pid_t pid, int local_ifindex) {
1581 _cleanup_bus_error_free_ sd_bus_error error = SD_BUS_ERROR_NULL;
1582 _cleanup_bus_close_unref_ sd_bus *bus = NULL;
1588 r = sd_bus_default_system(&bus);
1590 return log_error_errno(r, "Failed to open system bus: %m");
1592 if (arg_keep_unit) {
1593 r = sd_bus_call_method(
1595 "org.freedesktop.machine1",
1596 "/org/freedesktop/machine1",
1597 "org.freedesktop.machine1.Manager",
1598 "RegisterMachineWithNetwork",
1603 SD_BUS_MESSAGE_APPEND_ID128(arg_uuid),
1607 strempty(arg_directory),
1608 local_ifindex > 0 ? 1 : 0, local_ifindex);
1610 _cleanup_bus_message_unref_ sd_bus_message *m = NULL;
1612 r = sd_bus_message_new_method_call(
1615 "org.freedesktop.machine1",
1616 "/org/freedesktop/machine1",
1617 "org.freedesktop.machine1.Manager",
1618 "CreateMachineWithNetwork");
1620 return log_error_errno(r, "Failed to create message: %m");
1622 r = sd_bus_message_append(
1626 SD_BUS_MESSAGE_APPEND_ID128(arg_uuid),
1630 strempty(arg_directory),
1631 local_ifindex > 0 ? 1 : 0, local_ifindex);
1633 return log_error_errno(r, "Failed to append message arguments: %m");
/* The unit properties follow as an array of (name, variant) structs. */
1635 r = sd_bus_message_open_container(m, 'a', "(sv)");
1637 return log_error_errno(r, "Failed to open container: %m");
1639 if (!isempty(arg_slice)) {
1640 r = sd_bus_message_append(m, "(sv)", "Slice", "s", arg_slice);
1642 return log_error_errno(r, "Failed to append slice: %m");
1645 r = sd_bus_message_append(m, "(sv)", "DevicePolicy", "s", "strict");
1647 return log_error_errno(r, "Failed to add device policy: %m");
1649 r = sd_bus_message_append(m, "(sv)", "DeviceAllow", "a(ss)", 9,
1650 /* Allow the container to
1651 * access and create the API
1652 * device nodes, so that
1653 * PrivateDevices= in the
1654 * container can work
1659 "/dev/random", "rwm",
1660 "/dev/urandom", "rwm",
1662 "/dev/net/tun", "rwm",
1663 /* Allow the container
1664 * access to ptys. However,
1666 * container to ever create
1667 * these device nodes. */
1668 "/dev/pts/ptmx", "rw",
1671 return log_error_errno(r, "Failed to add device whitelist: %m");
1673 r = sd_bus_message_close_container(m);
1675 return log_error_errno(r, "Failed to close container: %m");
1677 r = sd_bus_call(bus, m, 0, &error, NULL);
1681 log_error("Failed to register machine: %s", bus_error_message(&error, r));
/* Asks org.freedesktop.machine1 to terminate the machine of this container.
 *
 * A first call on the Manager interface yields a reply from which the
 * object path of the machine is read; a second call is then made on the
 * org.freedesktop.machine1.Machine interface. Failure of either call is
 * logged at debug level only, since the machine may already be gone. */
1688 static int terminate_machine(pid_t pid) {
1689 _cleanup_bus_error_free_ sd_bus_error error = SD_BUS_ERROR_NULL;
1690 _cleanup_bus_message_unref_ sd_bus_message *reply = NULL;
1691 _cleanup_bus_close_unref_ sd_bus *bus = NULL;
1698 r = sd_bus_default_system(&bus);
1700 return log_error_errno(r, "Failed to open system bus: %m");
1702 r = sd_bus_call_method(
1704 "org.freedesktop.machine1",
1705 "/org/freedesktop/machine1",
1706 "org.freedesktop.machine1.Manager",
1713 /* Note that the machine might already have been
1714 * cleaned up automatically, hence don't consider it a
1715 * failure if we cannot get the machine object. */
1716 log_debug("Failed to get machine: %s", bus_error_message(&error, r));
/* The reply carries a single object path ("o"). */
1720 r = sd_bus_message_read(reply, "o", &path);
1722 return bus_log_parse_error(r);
1724 r = sd_bus_call_method(
1726 "org.freedesktop.machine1",
1728 "org.freedesktop.machine1.Machine",
1734 log_debug("Failed to terminate machine: %s", bus_error_message(&error, r));
/* Resets the audit login UID of the calling process.
 *
 * /proc/self/loginuid is read first; if it already holds 4294967295
 * (that is, (uint32_t) -1) nothing needs to be written. Otherwise that
 * value is written to the file. When the write fails, a multi-line
 * explanation pointing at old kernels with audit enabled is logged,
 * including the error string for r. arg_share_system is consulted before
 * any of this. */
1741 static int reset_audit_loginuid(void) {
1742 _cleanup_free_ char *p = NULL;
1745 if (arg_share_system)
1748 r = read_one_line_file("/proc/self/loginuid", &p);
1752 return log_error_errno(r, "Failed to read /proc/self/loginuid: %m");
1754 /* Already reset? */
1755 if (streq(p, "4294967295"))
1758 r = write_string_file("/proc/self/loginuid", "4294967295");
1760 log_error("Failed to reset audit login UID. This probably means that your kernel is too\n"
1761 "old and you have audit enabled. Note that the auditing subsystem is known to\n"
1762 "be incompatible with containers on old kernels. Please make sure to upgrade\n"
1763 "your kernel or to off auditing with 'audit=0' on the kernel command line before\n"
1764 "using systemd-nspawn. Sleeping for 5s... (%s)\n", strerror(-r));
/* Fixed 128-bit keys handed to generate_mac() as the siphash24 key. There
 * is one key per kind of interface (host side of the veth pair, container
 * side of the veth pair, macvlan), so the same machine name hashes to a
 * different address for each kind. */
1772 #define HOST_HASH_KEY SD_ID128_MAKE(1a,37,6f,c7,46,ec,45,0b,ad,a3,d5,31,06,60,5d,b1)
1773 #define CONTAINER_HASH_KEY SD_ID128_MAKE(c3,c4,f9,19,b5,57,b2,1c,e6,cf,14,27,03,9c,ee,a2)
1774 #define MACVLAN_HASH_KEY SD_ID128_MAKE(00,13,6d,bc,66,83,44,81,bb,0c,f9,51,1f,24,a6,6f)
/* Derives a stable MAC address for an interface of this container.
 *
 * The input buffer v starts with the machine ID of the host, followed by
 * the container name (arg_machine); idx can be copied in after the name.
 * The buffer is hashed with siphash24 using hash_key, and the first
 * ETH_ALEN bytes of the result are stored in *mac. Finally the multicast
 * bit is cleared and the locally-administered bit is set in the first
 * octet. */
1776 static int generate_mac(struct ether_addr *mac, sd_id128_t hash_key, uint64_t idx) {
1782 l = strlen(arg_machine);
1783 sz = sizeof(sd_id128_t) + l;
1789 /* fetch some persistent data unique to the host */
1790 r = sd_id128_get_machine((sd_id128_t*) v);
1794 /* combine with some data unique (on this host) to this
1795 * container instance */
1796 i = mempcpy(v + sizeof(sd_id128_t), arg_machine, l);
1799 memcpy(i, &idx, sizeof(idx));
1802 /* Let's hash the host machine ID plus the container name. We
1803 * use a fixed, but originally randomly created hash key here. */
1804 siphash24(result, v, sz, hash_key.bytes);
/* Compile-time check that the hash output is large enough for a MAC. */
1806 assert_cc(ETH_ALEN <= sizeof(result));
1807 memcpy(mac->ether_addr_octet, result, ETH_ALEN);
1809 /* see eth_random_addr in the kernel */
1810 mac->ether_addr_octet[0] &= 0xfe; /* clear multicast bit */
1811 mac->ether_addr_octet[0] |= 0x02; /* set local assignment bit (IEEE802) */
/* Creates a veth pair for the container over rtnetlink.
 *
 * The host-side interface is named "ve-<machine>", or "vb-<machine>" when
 * arg_network_bridge is set, and the name is written into iface_name. The
 * peer is named "host0" and is created with IFLA_NET_NS_PID set to pid.
 * Both ends receive addresses from generate_mac(), using distinct hash
 * keys. After the netlink call the host-side name is resolved with
 * if_nametoindex(). arg_private_network and arg_network_veth are consulted
 * before anything is created. */
1816 static int setup_veth(pid_t pid, char iface_name[IFNAMSIZ], int *ifi) {
1817 _cleanup_rtnl_message_unref_ sd_rtnl_message *m = NULL;
1818 _cleanup_rtnl_unref_ sd_rtnl *rtnl = NULL;
1819 struct ether_addr mac_host, mac_container;
1822 if (!arg_private_network)
1825 if (!arg_network_veth)
1828 /* Use two different interface name prefixes depending whether
1829 * we are in bridge mode or not. */
1830 snprintf(iface_name, IFNAMSIZ - 1, "%s-%s",
1831 arg_network_bridge ? "vb" : "ve", arg_machine);
1833 r = generate_mac(&mac_container, CONTAINER_HASH_KEY, 0);
1835 return log_error_errno(r, "Failed to generate predictable MAC address for container side: %m");
1837 r = generate_mac(&mac_host, HOST_HASH_KEY, 0);
1839 return log_error_errno(r, "Failed to generate predictable MAC address for host side: %m");
1841 r = sd_rtnl_open(&rtnl, 0);
1843 return log_error_errno(r, "Failed to connect to netlink: %m");
1845 r = sd_rtnl_message_new_link(rtnl, &m, RTM_NEWLINK, 0);
1847 return log_error_errno(r, "Failed to allocate netlink message: %m");
/* Attributes of the host-side end */
1849 r = sd_rtnl_message_append_string(m, IFLA_IFNAME, iface_name);
1851 return log_error_errno(r, "Failed to add netlink interface name: %m");
1853 r = sd_rtnl_message_append_ether_addr(m, IFLA_ADDRESS, &mac_host);
1855 return log_error_errno(r, "Failed to add netlink MAC address: %m");
/* Three nested containers: IFLA_LINKINFO, IFLA_INFO_DATA ("veth") and
 * VETH_INFO_PEER; the peer attributes go into the innermost one. */
1857 r = sd_rtnl_message_open_container(m, IFLA_LINKINFO);
1859 return log_error_errno(r, "Failed to open netlink container: %m");
1861 r = sd_rtnl_message_open_container_union(m, IFLA_INFO_DATA, "veth");
1863 return log_error_errno(r, "Failed to open netlink container: %m");
1865 r = sd_rtnl_message_open_container(m, VETH_INFO_PEER);
1867 return log_error_errno(r, "Failed to open netlink container: %m");
1869 r = sd_rtnl_message_append_string(m, IFLA_IFNAME, "host0");
1871 return log_error_errno(r, "Failed to add netlink interface name: %m");
1873 r = sd_rtnl_message_append_ether_addr(m, IFLA_ADDRESS, &mac_container);
1875 return log_error_errno(r, "Failed to add netlink MAC address: %m");
1877 r = sd_rtnl_message_append_u32(m, IFLA_NET_NS_PID, pid);
1879 return log_error_errno(r, "Failed to add netlink namespace field: %m");
1881 r = sd_rtnl_message_close_container(m);
1883 return log_error_errno(r, "Failed to close netlink container: %m");
1885 r = sd_rtnl_message_close_container(m);
1887 return log_error_errno(r, "Failed to close netlink container: %m");
1889 r = sd_rtnl_message_close_container(m);
1891 return log_error_errno(r, "Failed to close netlink container: %m");
1893 r = sd_rtnl_call(rtnl, m, 0, NULL);
1895 return log_error_errno(r, "Failed to add new veth interfaces: %m");
1897 i = (int) if_nametoindex(iface_name);
1899 return log_error_errno(errno, "Failed to resolve interface %s: %m", iface_name);
/* Enslaves the interface veth_name to the bridge arg_network_bridge.
 *
 * The bridge name is resolved to an interface index, then an RTM_SETLINK
 * message is sent that sets IFF_UP, names the interface via IFLA_IFNAME
 * and sets IFLA_MASTER to the bridge index. arg_private_network,
 * arg_network_veth and arg_network_bridge are consulted first. */
1906 static int setup_bridge(const char veth_name[], int *ifi) {
1907 _cleanup_rtnl_message_unref_ sd_rtnl_message *m = NULL;
1908 _cleanup_rtnl_unref_ sd_rtnl *rtnl = NULL;
1911 if (!arg_private_network)
1914 if (!arg_network_veth)
1917 if (!arg_network_bridge)
1920 bridge = (int) if_nametoindex(arg_network_bridge);
1922 return log_error_errno(errno, "Failed to resolve interface %s: %m", arg_network_bridge);
1926 r = sd_rtnl_open(&rtnl, 0);
1928 return log_error_errno(r, "Failed to connect to netlink: %m");
1930 r = sd_rtnl_message_new_link(rtnl, &m, RTM_SETLINK, 0);
1932 return log_error_errno(r, "Failed to allocate netlink message: %m");
1934 r = sd_rtnl_message_link_set_flags(m, IFF_UP, IFF_UP);
1936 return log_error_errno(r, "Failed to set IFF_UP flag: %m");
1938 r = sd_rtnl_message_append_string(m, IFLA_IFNAME, veth_name);
1940 return log_error_errno(r, "Failed to add netlink interface name field: %m");
1942 r = sd_rtnl_message_append_u32(m, IFLA_MASTER, bridge);
1944 return log_error_errno(r, "Failed to add netlink master field: %m");
1946 r = sd_rtnl_call(rtnl, m, 0, NULL);
1948 return log_error_errno(r, "Failed to add veth interface to bridge: %m");
/* Resolves the network interface name to its interface index and checks
 * the interface against udev.
 *
 * The udev device is looked up through the device id "n<ifindex>". An
 * interface that udev does not report as initialized is rejected with an
 * error message. */
1953 static int parse_interface(struct udev *udev, const char *name) {
1954 _cleanup_udev_device_unref_ struct udev_device *d = NULL;
/* Room for the "n" prefix, the decimal index and the terminating NUL */
1955 char ifi_str[2 + DECIMAL_STR_MAX(int)];
1958 ifi = (int) if_nametoindex(name);
1960 return log_error_errno(errno, "Failed to resolve interface %s: %m", name);
1962 sprintf(ifi_str, "n%i", ifi);
1963 d = udev_device_new_from_device_id(udev, ifi_str);
1965 return log_error_errno(errno, "Failed to get udev device for interface %s: %m", name);
1967 if (udev_device_get_is_initialized(d) <= 0) {
1968 log_error("Network interface %s is not initialized yet.", name);
/* Moves the interfaces listed in arg_network_interfaces to the process pid.
 *
 * Each name is resolved through parse_interface(); for the resulting
 * index an RTM_SETLINK message carrying IFLA_NET_NS_PID = pid is sent.
 * arg_private_network is consulted first, and an empty interface list is
 * checked for before any connection is opened. */
1975 static int move_network_interfaces(pid_t pid) {
1976 _cleanup_udev_unref_ struct udev *udev = NULL;
1977 _cleanup_rtnl_unref_ sd_rtnl *rtnl = NULL;
1981 if (!arg_private_network)
1984 if (strv_isempty(arg_network_interfaces))
1987 r = sd_rtnl_open(&rtnl, 0);
1989 return log_error_errno(r, "Failed to connect to netlink: %m");
1993 log_error("Failed to connect to udev.");
1997 STRV_FOREACH(i, arg_network_interfaces) {
1998 _cleanup_rtnl_message_unref_ sd_rtnl_message *m = NULL;
2001 ifi = parse_interface(udev, *i);
2005 r = sd_rtnl_message_new_link(rtnl, &m, RTM_SETLINK, ifi);
2007 return log_error_errno(r, "Failed to allocate netlink message: %m");
2009 r = sd_rtnl_message_append_u32(m, IFLA_NET_NS_PID, pid);
2011 return log_error_errno(r, "Failed to append namespace PID to netlink message: %m");
2013 r = sd_rtnl_call(rtnl, m, 0, NULL);
2015 return log_error_errno(r, "Failed to move interface %s to namespace: %m", *i);
/* Creates one macvlan interface per entry of arg_network_macvlan.
 *
 * For each listed interface an RTM_NEWLINK message is built with:
 * IFLA_LINK set to the index of the underlying interface, the name
 * "mv-<name>" shortened to IFNAMSIZ-1 characters, a MAC address from
 * generate_mac() (MACVLAN_HASH_KEY, with a running index), IFLA_NET_NS_PID
 * set to pid, and the macvlan mode MACVLAN_MODE_BRIDGE. */
2021 static int setup_macvlan(pid_t pid) {
2022 _cleanup_udev_unref_ struct udev *udev = NULL;
2023 _cleanup_rtnl_unref_ sd_rtnl *rtnl = NULL;
2028 if (!arg_private_network)
2031 if (strv_isempty(arg_network_macvlan))
2034 r = sd_rtnl_open(&rtnl, 0);
2036 return log_error_errno(r, "Failed to connect to netlink: %m");
2040 log_error("Failed to connect to udev.");
2044 STRV_FOREACH(i, arg_network_macvlan) {
2045 _cleanup_rtnl_message_unref_ sd_rtnl_message *m = NULL;
2046 _cleanup_free_ char *n = NULL;
2047 struct ether_addr mac;
2050 ifi = parse_interface(udev, *i);
/* idx advances per interface, so every macvlan gets its own address */
2054 r = generate_mac(&mac, MACVLAN_HASH_KEY, idx++);
2056 return log_error_errno(r, "Failed to create MACVLAN MAC address: %m");
2058 r = sd_rtnl_message_new_link(rtnl, &m, RTM_NEWLINK, 0);
2060 return log_error_errno(r, "Failed to allocate netlink message: %m");
2062 r = sd_rtnl_message_append_u32(m, IFLA_LINK, ifi);
2064 return log_error_errno(r, "Failed to add netlink interface index: %m");
2066 n = strappend("mv-", *i);
2070 strshorten(n, IFNAMSIZ-1);
2072 r = sd_rtnl_message_append_string(m, IFLA_IFNAME, n);
2074 return log_error_errno(r, "Failed to add netlink interface name: %m");
2076 r = sd_rtnl_message_append_ether_addr(m, IFLA_ADDRESS, &mac);
2078 return log_error_errno(r, "Failed to add netlink MAC address: %m");
2080 r = sd_rtnl_message_append_u32(m, IFLA_NET_NS_PID, pid);
2082 return log_error_errno(r, "Failed to add netlink namespace field: %m");
2084 r = sd_rtnl_message_open_container(m, IFLA_LINKINFO);
2086 return log_error_errno(r, "Failed to open netlink container: %m");
2088 r = sd_rtnl_message_open_container_union(m, IFLA_INFO_DATA, "macvlan");
2090 return log_error_errno(r, "Failed to open netlink container: %m");
2092 r = sd_rtnl_message_append_u32(m, IFLA_MACVLAN_MODE, MACVLAN_MODE_BRIDGE);
2094 return log_error_errno(r, "Failed to append macvlan mode: %m");
2096 r = sd_rtnl_message_close_container(m);
2098 return log_error_errno(r, "Failed to close netlink container: %m");
2100 r = sd_rtnl_message_close_container(m);
2102 return log_error_errno(r, "Failed to close netlink container: %m");
2104 r = sd_rtnl_call(rtnl, m, 0, NULL);
2106 return log_error_errno(r, "Failed to add new macvlan interfaces: %m");
/* Installs the seccomp filter for the container payload.
 *
 * The filter allows everything by default (SCMP_ACT_ALLOW). Every syscall
 * in the blacklist table is made to fail with EPERM, and one extra rule
 * with argument matches on AF_NETLINK and NETLINK_AUDIT fails with
 * EAFNOSUPPORT. The SCMP_FLTATR_CTL_NNP attribute is set to 0 before the
 * filter is loaded, and the filter context is released afterwards. */
2112 static int setup_seccomp(void) {
2115 static const int blacklist[] = {
2116 SCMP_SYS(kexec_load),
2117 SCMP_SYS(open_by_handle_at),
2118 SCMP_SYS(init_module),
2119 SCMP_SYS(finit_module),
2120 SCMP_SYS(delete_module),
2127 scmp_filter_ctx seccomp;
2131 seccomp = seccomp_init(SCMP_ACT_ALLOW);
2135 r = seccomp_add_secondary_archs(seccomp);
2137 log_error_errno(r, "Failed to add secondary archs to seccomp filter: %m");
2141 for (i = 0; i < ELEMENTSOF(blacklist); i++) {
2142 r = seccomp_rule_add(seccomp, SCMP_ACT_ERRNO(EPERM), blacklist[i], 0);
2144 continue; /* unknown syscall */
2146 log_error_errno(r, "Failed to block syscall: %m");
2152 Audit is broken in containers, much of the userspace audit
2153 hookup will fail if running inside a container. We don't
2154 care and just turn off creation of audit sockets.
2156 This will make socket(AF_NETLINK, *, NETLINK_AUDIT) fail
2157 with EAFNOSUPPORT which audit userspace uses as indication
2158 that audit is disabled in the kernel.
2161 r = seccomp_rule_add(
2163 SCMP_ACT_ERRNO(EAFNOSUPPORT),
2166 SCMP_A0(SCMP_CMP_EQ, AF_NETLINK),
2167 SCMP_A2(SCMP_CMP_EQ, NETLINK_AUDIT));
2169 log_error_errno(r, "Failed to add audit seccomp rule: %m");
2173 r = seccomp_attr_set(seccomp, SCMP_FLTATR_CTL_NNP, 0);
2175 log_error_errno(r, "Failed to unset NO_NEW_PRIVS: %m");
2179 r = seccomp_load(seccomp);
2181 log_error_errno(r, "Failed to install seccomp audit filter: %m");
2184 seccomp_release(seccomp);
/* Sets up the mount propagation directory of the container.
 *
 * /run/systemd/nspawn/propagate/<machine> is created on the host and bind
 * mounted onto <root>/run/systemd/nspawn/incoming, and that bind mount is
 * then remounted read-only. Directory creation is best-effort; the results
 * of the mkdir calls are deliberately not checked.
 *
 * Both mount() failures are logged with the errno text (%m), so the
 * reason for the failure is visible in the log. */
2192 static int setup_propagate(const char *root) {
2195 (void) mkdir_p("/run/systemd/nspawn/", 0755);
2196 (void) mkdir_p("/run/systemd/nspawn/propagate", 0600);
2197 p = strappenda("/run/systemd/nspawn/propagate/", arg_machine);
2198 (void) mkdir_p(p, 0600);
2200 q = strappenda(root, "/run/systemd/nspawn/incoming");
2201 mkdir_parents(q, 0755);
2204 if (mount(p, q, NULL, MS_BIND, NULL) < 0)
2205 return log_error_errno(errno, "Failed to install propagation bind mount: %m");
/* A bind mount cannot be created read-only in one step, hence the remount. */
2207 if (mount(NULL, q, NULL, MS_BIND|MS_REMOUNT|MS_RDONLY, NULL) < 0)
2208 return log_error_errno(errno, "Failed to make propagation mount read-only: %m");
/* Opens arg_image and makes it available as a block device.
 *
 * The image is opened read-only or read-write depending on arg_read_only.
 * If it already is a block device, its path is duplicated for use as is.
 * If it is a regular file, a free loop device is requested from
 * /dev/loop-control, opened, attached to the image with LOOP_SET_FD and
 * configured with LO_FLAGS_AUTOCLEAR|LO_FLAGS_PARTSCAN (plus
 * LO_FLAGS_READ_ONLY where applicable). The loop device path is handed
 * back through device_path. Anything that is neither a block device nor a
 * regular file is rejected.
 *
 * The rejection is a plain type check with no failed system call behind
 * it, so it is logged without an errno string. */
2213 static int setup_image(char **device_path, int *loop_nr) {
2214 struct loop_info64 info = {
2215 .lo_flags = LO_FLAGS_AUTOCLEAR|LO_FLAGS_PARTSCAN
2217 _cleanup_close_ int fd = -1, control = -1, loop = -1;
2218 _cleanup_free_ char* loopdev = NULL;
2222 assert(device_path);
2226 fd = open(arg_image, O_CLOEXEC|(arg_read_only ? O_RDONLY : O_RDWR)|O_NONBLOCK|O_NOCTTY);
2228 return log_error_errno(errno, "Failed to open %s: %m", arg_image);
2230 if (fstat(fd, &st) < 0)
2231 return log_error_errno(errno, "Failed to stat %s: %m", arg_image);
/* A block device can be used directly, no loop device is needed. */
2233 if (S_ISBLK(st.st_mode)) {
2236 p = strdup(arg_image);
2250 if (!S_ISREG(st.st_mode)) {
2251 log_error("%s is not a regular file or block device.", arg_image);
2255 control = open("/dev/loop-control", O_RDWR|O_CLOEXEC|O_NOCTTY|O_NONBLOCK);
2257 return log_error_errno(errno, "Failed to open /dev/loop-control: %m");
2259 nr = ioctl(control, LOOP_CTL_GET_FREE);
2261 return log_error_errno(errno, "Failed to allocate loop device: %m");
2263 if (asprintf(&loopdev, "/dev/loop%i", nr) < 0)
2266 loop = open(loopdev, O_CLOEXEC|(arg_read_only ? O_RDONLY : O_RDWR)|O_NONBLOCK|O_NOCTTY);
2268 return log_error_errno(errno, "Failed to open loop device %s: %m", loopdev);
2270 if (ioctl(loop, LOOP_SET_FD, fd) < 0)
2271 return log_error_errno(errno, "Failed to set loopback file descriptor on %s: %m", loopdev);
2274 info.lo_flags |= LO_FLAGS_READ_ONLY;
2276 if (ioctl(loop, LOOP_SET_STATUS64, &info) < 0)
2277 return log_error_errno(errno, "Failed to set loopback settings on %s: %m", loopdev);
2279 *device_path = loopdev;
/* Finds the partitions to mount inside a GPT disk image.
 *
 * The image is probed with libblkid with partition details enabled; it
 * must carry a partition table of type "gpt". The partition devices are
 * then enumerated through udev as children of the block device behind fd.
 * For every partition the GPT flags, partition number and type UUID are
 * read; GPT_FLAG_NO_AUTO is tested on the flags. The type UUID is compared
 * against GPT_HOME, GPT_SRV and, where defined at compile time,
 * GPT_ROOT_NATIVE and GPT_ROOT_SECONDARY. For a match the device node and
 * the read-write state (the inverse of GPT_FLAG_READ_ONLY) are recorded.
 *
 * At least one root partition (native or secondary) is required. On
 * return the native root takes precedence over the secondary root, and
 * the device nodes and read-write flags are stored through the output
 * parameters. When built without blkid support an error is logged. */
2290 static int dissect_image(
2292 char **root_device, bool *root_device_rw,
2293 char **home_device, bool *home_device_rw,
2294 char **srv_device, bool *srv_device_rw,
2298 int home_nr = -1, srv_nr = -1;
2299 #ifdef GPT_ROOT_NATIVE
2302 #ifdef GPT_ROOT_SECONDARY
2303 int secondary_root_nr = -1;
2306 _cleanup_free_ char *home = NULL, *root = NULL, *secondary_root = NULL, *srv = NULL;
2307 _cleanup_udev_enumerate_unref_ struct udev_enumerate *e = NULL;
2308 _cleanup_udev_device_unref_ struct udev_device *d = NULL;
2309 _cleanup_blkid_free_probe_ blkid_probe b = NULL;
2310 _cleanup_udev_unref_ struct udev *udev = NULL;
2311 struct udev_list_entry *first, *item;
2312 bool home_rw = true, root_rw = true, secondary_root_rw = true, srv_rw = true;
2313 const char *pttype = NULL;
2319 assert(root_device);
2320 assert(home_device);
2325 b = blkid_new_probe();
2330 r = blkid_probe_set_device(b, fd, 0, 0);
2335 log_error_errno(errno, "Failed to set device on blkid probe: %m");
2339 blkid_probe_enable_partitions(b, 1);
2340 blkid_probe_set_partitions_flags(b, BLKID_PARTS_ENTRY_DETAILS);
2343 r = blkid_do_safeprobe(b);
/* -2 and 1 are both reported as "no partition table identified" */
2344 if (r == -2 || r == 1) {
2345 log_error("Failed to identify any partition table on %s.\n"
2346 "Note that the disk image needs to follow http://www.freedesktop.org/wiki/Specifications/DiscoverablePartitionsSpec/ to be supported by systemd-nspawn.", arg_image);
2348 } else if (r != 0) {
2351 log_error_errno(errno, "Failed to probe: %m");
2355 blkid_probe_lookup_value(b, "PTTYPE", &pttype, NULL);
2356 if (!streq_ptr(pttype, "gpt")) {
2357 log_error("Image %s does not carry a GUID Partition Table.\n"
2358 "Note that the disk image needs to follow http://www.freedesktop.org/wiki/Specifications/DiscoverablePartitionsSpec/ to be supported by systemd-nspawn.", arg_image);
2363 pl = blkid_probe_get_partitions(b);
2368 log_error("Failed to list partitions of %s", arg_image);
2376 if (fstat(fd, &st) < 0)
2377 return log_error_errno(errno, "Failed to stat block device: %m");
/* Enumerate the udev devices that have the image block device as parent. */
2379 d = udev_device_new_from_devnum(udev, 'b', st.st_rdev);
2383 e = udev_enumerate_new(udev);
2387 r = udev_enumerate_add_match_parent(e, d);
2391 r = udev_enumerate_scan_devices(e);
2393 return log_error_errno(r, "Failed to scan for partition devices of %s: %m", arg_image);
2395 first = udev_enumerate_get_list_entry(e);
2396 udev_list_entry_foreach(item, first) {
2397 _cleanup_udev_device_unref_ struct udev_device *q;
2398 const char *stype, *node;
2399 unsigned long long flags;
2406 q = udev_device_new_from_syspath(udev, udev_list_entry_get_name(item));
2411 log_error_errno(errno, "Failed to get partition device of %s: %m", arg_image);
2415 qn = udev_device_get_devnum(q);
/* The enumeration can also yield the whole-disk device itself. */
2419 if (st.st_rdev == qn)
2422 node = udev_device_get_devnode(q);
2426 pp = blkid_partlist_devno_to_partition(pl, qn);
2430 flags = blkid_partition_get_flags(pp);
2431 if (flags & GPT_FLAG_NO_AUTO)
2434 nr = blkid_partition_get_partno(pp);
2438 stype = blkid_partition_get_type_string(pp);
2442 if (sd_id128_from_string(stype, &type_id) < 0)
2445 if (sd_id128_equal(type_id, GPT_HOME)) {
2447 if (home && nr >= home_nr)
2451 home_rw = !(flags & GPT_FLAG_READ_ONLY);
2454 home = strdup(node);
2457 } else if (sd_id128_equal(type_id, GPT_SRV)) {
2459 if (srv && nr >= srv_nr)
2463 srv_rw = !(flags & GPT_FLAG_READ_ONLY);
2470 #ifdef GPT_ROOT_NATIVE
2471 else if (sd_id128_equal(type_id, GPT_ROOT_NATIVE)) {
2473 if (root && nr >= root_nr)
2477 root_rw = !(flags & GPT_FLAG_READ_ONLY);
2480 root = strdup(node);
2485 #ifdef GPT_ROOT_SECONDARY
2486 else if (sd_id128_equal(type_id, GPT_ROOT_SECONDARY)) {
2488 if (secondary_root && nr >= secondary_root_nr)
2491 secondary_root_nr = nr;
2492 secondary_root_rw = !(flags & GPT_FLAG_READ_ONLY);
2495 free(secondary_root);
2496 secondary_root = strdup(node);
2497 if (!secondary_root)
2503 if (!root && !secondary_root) {
2504 log_error("Failed to identify root partition in disk image %s.\n"
2505 "Note that the disk image needs to follow http://www.freedesktop.org/wiki/Specifications/DiscoverablePartitionsSpec/ to be supported by systemd-nspawn.", arg_image);
/* Hand the results to the caller; the native root wins over the secondary one. */
2510 *root_device = root;
2513 *root_device_rw = root_rw;
2515 } else if (secondary_root) {
2516 *root_device = secondary_root;
2517 secondary_root = NULL;
2519 *root_device_rw = secondary_root_rw;
2524 *home_device = home;
2527 *home_device_rw = home_rw;
2534 *srv_device_rw = srv_rw;
2539 log_error("--image= is not supported, compiled without blkid support.");
/* Mounts the block device what at the path formed by where + directory.
 *
 * The file system type is detected with a libblkid superblock probe.
 * blkid_do_safeprobe() returns 1 when nothing was detected and -2 when the
 * result is ambivalent; both mean the type cannot be determined. Any other
 * non-zero value (-1) is a real probing error and is reported with errno.
 * This is the same classification that dissect_image() applies.
 *
 * Devices of type "crypto_LUKS" are refused. The mount is done with
 * MS_NODEV, plus MS_RDONLY when rw is false. When built without blkid
 * support an error is logged. */
2544 static int mount_device(const char *what, const char *where, const char *directory, bool rw) {
2546 _cleanup_blkid_free_probe_ blkid_probe b = NULL;
2547 const char *fstype, *p;
2557 p = strappenda(where, directory);
2562 b = blkid_new_probe_from_filename(what);
2566 log_error_errno(errno, "Failed to allocate prober for %s: %m", what);
/* Only the file system type is needed from the superblock probe. */
2570 blkid_probe_enable_superblocks(b, 1);
2571 blkid_probe_set_superblocks_flags(b, BLKID_SUBLKS_TYPE);
2574 r = blkid_do_safeprobe(b);
2575 if (r == -2 || r == 1) {
2576 log_error("Cannot determine file system type of %s", what);
2578 } else if (r != 0) {
2581 log_error_errno(errno, "Failed to probe %s: %m", what);
2586 if (blkid_probe_lookup_value(b, "TYPE", &fstype, NULL) < 0) {
2589 log_error("Failed to determine file system type of %s", what);
2593 if (streq(fstype, "crypto_LUKS")) {
2594 log_error("nspawn currently does not support LUKS disk images.");
2598 if (mount(what, p, fstype, MS_NODEV|(rw ? 0 : MS_RDONLY), NULL) < 0)
2599 return log_error_errno(errno, "Failed to mount %s: %m", what);
2603 log_error("--image= is not supported, compiled without blkid support.");
/* Mounts the partitions of a disk image below arg_directory through
 * mount_device(): the root device on arg_directory itself, the home
 * device on "/home" and the srv device on "/srv", each with its own
 * read-write flag. */
2608 static int mount_devices(
2610 const char *root_device, bool root_device_rw,
2611 const char *home_device, bool home_device_rw,
2612 const char *srv_device, bool srv_device_rw) {
2618 r = mount_device(root_device, arg_directory, NULL, root_device_rw);
2620 return log_error_errno(r, "Failed to mount root directory: %m");
2624 r = mount_device(home_device, arg_directory, "/home", home_device_rw);
2626 return log_error_errno(r, "Failed to mount home directory: %m");
2630 r = mount_device(srv_device, arg_directory, "/srv", srv_device_rw);
2632 return log_error_errno(r, "Failed to mount server data directory: %m");
/* Detaches and removes the loop device with number nr.
 *
 * If image_fd points at an open descriptor, LOOP_CLR_FD is issued on it
 * and the descriptor is closed, with the result of safe_close() stored
 * back into *image_fd. Then LOOP_CTL_REMOVE is issued for nr on
 * /dev/loop-control. All failures are logged as warnings only. */
2638 static void loop_remove(int nr, int *image_fd) {
2639 _cleanup_close_ int control = -1;
2645 if (image_fd && *image_fd >= 0) {
2646 r = ioctl(*image_fd, LOOP_CLR_FD);
2648 log_warning_errno(errno, "Failed to close loop image: %m");
2649 *image_fd = safe_close(*image_fd);
2652 control = open("/dev/loop-control", O_RDWR|O_CLOEXEC|O_NOCTTY|O_NONBLOCK);
2654 log_warning_errno(errno, "Failed to open /dev/loop-control: %m");
2658 r = ioctl(control, LOOP_CTL_REMOVE, nr);
2660 log_warning_errno(errno, "Failed to remove loop %d: %m", nr);
/* Forks a child process that runs "getent <database> <key>".
 *
 * In the child, the write end of a pipe becomes stdout, and /dev/null
 * becomes stdin and stderr. Signal handlers are reset and all other file
 * descriptors are closed before the exec. getent is run with an empty
 * environment; /usr/bin/getent is tried first and /bin/getent second, and
 * the child exits with EXIT_FAILURE if neither can be executed. The
 * parent closes its copy of the write end of the pipe. */
2663 static int spawn_getent(const char *database, const char *key, pid_t *rpid) {
2671 if (pipe2(pipe_fds, O_CLOEXEC) < 0)
2672 return log_error_errno(errno, "Failed to allocate pipe: %m");
2676 return log_error_errno(errno, "Failed to fork getent child: %m");
2677 else if (pid == 0) {
2679 char *empty_env = NULL;
2681 if (dup3(pipe_fds[1], STDOUT_FILENO, 0) < 0)
2682 _exit(EXIT_FAILURE);
/* Close the original pipe descriptors unless they are one of the
 * standard descriptors 0..2. */
2684 if (pipe_fds[0] > 2)
2685 safe_close(pipe_fds[0]);
2686 if (pipe_fds[1] > 2)
2687 safe_close(pipe_fds[1]);
2689 nullfd = open("/dev/null", O_RDWR);
2691 _exit(EXIT_FAILURE);
2693 if (dup3(nullfd, STDIN_FILENO, 0) < 0)
2694 _exit(EXIT_FAILURE);
2696 if (dup3(nullfd, STDERR_FILENO, 0) < 0)
2697 _exit(EXIT_FAILURE);
2702 reset_all_signal_handlers();
2703 close_all_fds(NULL, 0);
/* The second execle() is only reached when the first one failed. */
2705 execle("/usr/bin/getent", "getent", database, key, NULL, &empty_env);
2706 execle("/bin/getent", "getent", database, key, NULL, &empty_env);
2707 _exit(EXIT_FAILURE);
2710 pipe_fds[1] = safe_close(pipe_fds[1]);
/* Switches the process credentials to the user named by arg_user.
 *
 * When arg_user is unset, "root" or "0", the supplementary groups and the
 * real, effective and saved IDs are all reset to 0. Otherwise the passwd
 * entry of the user and the supplementary groups are obtained by running
 * getent through spawn_getent() ("passwd" first, "initgroups" second) and
 * parsing the single line each prints. The home directory and its parents
 * are created, the standard file descriptors are chowned to the user, and
 * the credentials are then applied in the order setgroups(), setresgid(),
 * setresuid().
 *
 * The error messages name the calls that are actually made (setresgid()
 * and setresuid()), so a log line points at the right system call. */
2717 static int change_uid_gid(char **_home) {
2718 char line[LINE_MAX], *x, *u, *g, *h;
2719 const char *word, *state;
2720 _cleanup_free_ uid_t *uids = NULL;
2721 _cleanup_free_ char *home = NULL;
2722 _cleanup_fclose_ FILE *f = NULL;
2723 _cleanup_close_ int fd = -1;
2724 unsigned n_uids = 0;
2733 if (!arg_user || streq(arg_user, "root") || streq(arg_user, "0")) {
2734 /* Reset everything fully to 0, just in case */
2736 if (setgroups(0, NULL) < 0)
2737 return log_error_errno(errno, "setgroups() failed: %m");
2739 if (setresgid(0, 0, 0) < 0)
2740 return log_error_errno(errno, "setresgid() failed: %m");
2742 if (setresuid(0, 0, 0) < 0)
2743 return log_error_errno(errno, "setresuid() failed: %m");
2749 /* First, get user credentials */
2750 fd = spawn_getent("passwd", arg_user, &pid);
2754 f = fdopen(fd, "r");
2759 if (!fgets(line, sizeof(line), f)) {
2762 log_error("Failed to resolve user %s.", arg_user);
2766 log_error_errno(errno, "Failed to read from getent: %m");
2772 wait_for_terminate_and_warn("getent passwd", pid, true);
/* Walk the colon-separated fields of the passwd line one by one. */
2774 x = strchr(line, ':');
2776 log_error("/etc/passwd entry has invalid user field.");
2780 u = strchr(x+1, ':');
2782 log_error("/etc/passwd entry has invalid password field.");
2789 log_error("/etc/passwd entry has invalid UID field.");
2797 log_error("/etc/passwd entry has invalid GID field.");
2802 h = strchr(x+1, ':');
2804 log_error("/etc/passwd entry has invalid GECOS field.");
2811 log_error("/etc/passwd entry has invalid home directory field.");
2817 r = parse_uid(u, &uid);
2819 log_error("Failed to parse UID of user.");
2823 r = parse_gid(g, &gid);
2825 log_error("Failed to parse GID of user.");
2833 /* Second, get group memberships */
2834 fd = spawn_getent("initgroups", arg_user, &pid);
2839 f = fdopen(fd, "r");
2844 if (!fgets(line, sizeof(line), f)) {
2846 log_error("Failed to resolve user %s.", arg_user);
2850 log_error_errno(errno, "Failed to read from getent: %m");
2856 wait_for_terminate_and_warn("getent initgroups", pid, true);
2858 /* Skip over the username and subsequent separator whitespace */
2860 x += strcspn(x, WHITESPACE);
2861 x += strspn(x, WHITESPACE);
/* Each remaining word is a numeric group ID; collect them in uids. */
2863 FOREACH_WORD(word, l, x, state) {
2869 if (!GREEDY_REALLOC(uids, sz, n_uids+1))
2872 r = parse_uid(c, &uids[n_uids++]);
2874 log_error("Failed to parse group data from getent.");
2879 r = mkdir_parents(home, 0775);
2881 return log_error_errno(r, "Failed to make home root directory: %m");
2883 r = mkdir_safe(home, 0755, uid, gid);
2884 if (r < 0 && r != -EEXIST)
2885 return log_error_errno(r, "Failed to make home directory: %m");
/* Best effort: the results of these fchown() calls are not checked. */
2887 fchown(STDIN_FILENO, uid, gid);
2888 fchown(STDOUT_FILENO, uid, gid);
2889 fchown(STDERR_FILENO, uid, gid);
2891 if (setgroups(n_uids, uids) < 0)
2892 return log_error_errno(errno, "Failed to set auxiliary groups: %m");
2894 if (setresgid(gid, gid, gid) < 0)
2895 return log_error_errno(errno, "setresgid() failed: %m");
2897 if (setresuid(uid, uid, uid) < 0)
2898 return log_error_errno(errno, "setresuid() failed: %m");
2910 * < 0 : wait_for_terminate() failed to get the state of the
2911 * container, the container was terminated by a signal, or
2912 * failed for an unknown reason. No change is made to the
2913 * container argument.
2914 * > 0 : The program executed in the container terminated with an
2915 * error. The exit code of the program executed in the
2916 * container is returned. The container argument has been set
2917 * to CONTAINER_TERMINATED.
2918 * 0 : The container is being rebooted, has been shut down or exited
2919 * successfully. The container argument has been set to either
2920 * CONTAINER_TERMINATED or CONTAINER_REBOOTED.
2922 * That is, success is indicated by a return value of zero, and an
2923 * error is indicated by a non-zero value.
2925 static int wait_for_container(pid_t pid, ContainerStatus *container) {
/* Waits for the process pid and classifies how it ended, using si_code
 * and si_status of the resulting status. */
2929 r = wait_for_terminate(pid, &status);
2931 return log_warning_errno(r, "Failed to wait for container: %m");
2933 switch (status.si_code) {
/* For a regular exit, si_status is the exit code and is returned as is. */
2936 if (status.si_status == 0) {
2937 log_full(arg_quiet ? LOG_DEBUG : LOG_INFO, "Container %s exited successfully.", arg_machine);
2940 log_full(arg_quiet ? LOG_DEBUG : LOG_INFO, "Container %s failed with error code %i.", arg_machine, status.si_status);
2942 *container = CONTAINER_TERMINATED;
2943 return status.si_status;
/* SIGINT is reported as a shutdown of the container, SIGHUP as a reboot. */
2946 if (status.si_status == SIGINT) {
2948 log_full(arg_quiet ? LOG_DEBUG : LOG_INFO, "Container %s has been shut down.", arg_machine);
2949 *container = CONTAINER_TERMINATED;
2952 } else if (status.si_status == SIGHUP) {
2954 log_full(arg_quiet ? LOG_DEBUG : LOG_INFO, "Container %s is being rebooted.", arg_machine);
2955 *container = CONTAINER_REBOOTED;
2959 /* CLD_KILLED fallthrough */
2962 log_error("Container %s terminated by signal %s.", arg_machine, signal_to_string(status.si_status));
2966 log_error("Container %s failed due to unknown reason.", arg_machine);
/* Signal handler that intentionally does nothing. */
2973 static void nop_handler(int sig) {}
/* Event source callback for an orderly shutdown request.
 *
 * The PID to signal is carried in userdata. If sending SIGRTMIN+3 to it
 * succeeds, the user is told that a second SIGTERM triggers immediate
 * termination, and userdata of the event source is cleared. The function
 * also contains a call that makes the event loop exit with code 0. */
2975 static int on_orderly_shutdown(sd_event_source *s, const struct signalfd_siginfo *si, void *userdata) {
2978 pid = PTR_TO_UINT32(userdata);
2980 if (kill(pid, SIGRTMIN+3) >= 0) {
2981 log_info("Trying to halt container. Send SIGTERM again to trigger immediate termination.");
2982 sd_event_source_set_userdata(s, NULL);
2987 sd_event_exit(sd_event_source_get_event(s), 0);
/* Fills in whichever of arg_image, arg_directory and arg_machine the
 * command line left unset.
 *
 * With neither an image nor a directory given, the image is looked up by
 * machine name through image_find(): a GPT image sets arg_image, any other
 * type sets arg_directory, and a read-only image forces arg_read_only.
 * get_current_dir_name() is the other source for arg_directory. A machine
 * name is derived from the host name when the directory is "/", otherwise
 * from the basename of the image or directory; it is cleaned up with
 * hostname_cleanup() and must pass machine_name_is_valid(). In ephemeral
 * mode a random 64-bit hexadecimal suffix is appended to the name.
 *
 * "No image for machine" is not the result of a failed system call, so it
 * is logged without an errno string. */
2991 static int determine_names(void) {
2994 if (!arg_image && !arg_directory) {
2996 _cleanup_(image_unrefp) Image *i = NULL;
2998 r = image_find(arg_machine, &i);
3000 return log_error_errno(r, "Failed to find image for machine '%s': %m", arg_machine);
3002 log_error("No image for machine '%s'.", arg_machine);
3006 if (i->type == IMAGE_GPT)
3007 r = set_sanitized_path(&arg_image, i->path);
3009 r = set_sanitized_path(&arg_directory, i->path);
3011 return log_error_errno(r, "Invalid image directory: %m");
3013 arg_read_only = arg_read_only || i->read_only;
3015 arg_directory = get_current_dir_name();
3017 if (!arg_directory && !arg_machine) {
3018 log_error("Failed to determine path, please use -D or -i.");
/* Derive a machine name from the host name or from the path. */
3024 if (arg_directory && path_equal(arg_directory, "/"))
3025 arg_machine = gethostname_malloc();
3027 arg_machine = strdup(basename(arg_image ?: arg_directory));
3032 hostname_cleanup(arg_machine, false);
3033 if (!machine_name_is_valid(arg_machine)) {
3034 log_error("Failed to determine machine name automatically, please use -M.");
3038 if (arg_ephemeral) {
3041 /* Add a random suffix when this is an
3042 * ephemeral machine, so that we can run many
3043 * instances at once without manually having
3044 * to specify -M each time. */
3046 if (asprintf(&b, "%s-%016" PRIx64, arg_machine, random_u64()) < 0)
/*
 * Program entry point.
 *
 * Visible flow: parse the command line and settle the directory/image and
 * machine name; refuse to continue unless running as root on a
 * systemd-booted host; collect file descriptors reported by
 * sd_listen_fds(); prepare the root tree (snapshot from a template,
 * ephemeral snapshot, or a temporary directory plus a dissected image);
 * allocate a pseudo terminal and a kmsg socket pair; then clone a child
 * into new namespaces. The child builds the container file system, moves
 * into it, drops capabilities, changes user, assembles the environment and
 * executes an init system, the requested command, or a shell. The parent
 * sets up networking, registers the machine, forwards the pty through an
 * sd-event loop, waits for the container and releases its resources.
 *
 * Returns EXIT_FAILURE when the last recorded status r is negative,
 * otherwise ret.
 */
3057 int main(int argc, char *argv[]) {
3059 _cleanup_free_ char *device_path = NULL, *root_device = NULL, *home_device = NULL, *srv_device = NULL, *console = NULL;
3060 bool root_device_rw = true, home_device_rw = true, srv_device_rw = true;
3061 _cleanup_close_ int master = -1, image_fd = -1;
3062 _cleanup_close_pair_ int kmsg_socket_pair[2] = { -1, -1 };
3063 _cleanup_fdset_free_ FDSet *fds = NULL;
3064 int r, n_fd_passed, loop_nr = -1;
3065 char veth_name[IFNAMSIZ];
3066 bool secondary = false, remove_subvol = false;
3067 sigset_t mask, mask_chld;
3069 int ret = EXIT_SUCCESS;
3071 log_parse_environment();
3074 r = parse_argv(argc, argv);
3078 r = determine_names();
3082 if (geteuid() != 0) {
3083 log_error("Need to be root.");
3088 if (sd_booted() <= 0) {
3089 log_error("Not running on a systemd system.");
/* Collect the file descriptors reported by sd_listen_fds(); their count is
 * exported to the payload as LISTEN_FDS further down. */
3095 n_fd_passed = sd_listen_fds(false);
3096 if (n_fd_passed > 0) {
3097 r = fdset_new_listen_fds(&fds, false);
3099 log_error_errno(r, "Failed to collect file descriptors: %m");
3103 fdset_close_others(fds);
/* Directory mode: validate, populate or snapshot the tree to boot from. */
3106 if (arg_directory) {
3109 if (path_equal(arg_directory, "/") && !arg_ephemeral) {
3110 log_error("Spawning container on root directory is not supported. Consider using --ephemeral.");
3116 r = btrfs_subvol_snapshot(arg_template, arg_directory, arg_read_only, true);
3119 log_info("Directory %s already exists, not populating from template %s.", arg_directory, arg_template);
3121 log_error_errno(r, "Couldn't create snapshort %s from %s: %m", arg_directory, arg_template);
3125 log_info("Populated %s from template %s.", arg_directory, arg_template);
3128 } else if (arg_ephemeral) {
3131 /* If the specified path is a mount point we
3132 * generate the new snapshot immediately
3133 * inside it under a random name. However if
3134 * the specified is not a mount point we
3135 * create the new snapshot in the parent
3136 * directory, just next to it. */
3137 r = path_is_mount_point(arg_directory, false);
3139 log_error_errno(r, "Failed to determine whether directory %s is mount point: %m", arg_directory);
3143 r = tempfn_random_child(arg_directory, &np);
3145 r = tempfn_random(arg_directory, &np);
3147 log_error_errno(r, "Failed to generate name for snapshot: %m");
3151 r = btrfs_subvol_snapshot(arg_directory, np, arg_read_only, true);
3154 log_error_errno(r, "Failed to create snapshot %s from %s: %m", np, arg_directory);
3158 free(arg_directory);
3161 remove_subvol = true;
3165 if (path_is_os_tree(arg_directory) <= 0) {
3166 log_error("Directory %s doesn't look like an OS root directory (os-release file is missing). Refusing.", arg_directory);
3173 p = strappenda(arg_directory,
3174 argc > optind && path_is_absolute(argv[optind]) ? argv[optind] : "/usr/bin/");
3175 if (access(p, F_OK) < 0) {
3176 log_error("Directory %s lacks the binary to execute or doesn't look like a binary tree. Refusing.", arg_directory);
/* Image mode: arg_directory becomes a fresh temporary directory, and the
 * image is handled by setup_image() and dissect_image() below. */
3183 char template[] = "/tmp/nspawn-root-XXXXXX";
3186 assert(!arg_template);
3188 if (!mkdtemp(template)) {
3189 log_error_errno(errno, "Failed to create temporary directory: %m");
3194 arg_directory = strdup(template);
3195 if (!arg_directory) {
3200 image_fd = setup_image(&device_path, &loop_nr);
3206 r = dissect_image(image_fd,
3207 &root_device, &root_device_rw,
3208 &home_device, &home_device_rw,
3209 &srv_device, &srv_device_rw,
/* Allocate the pty master; the matching slave name is stored in console
 * and later opened by the child as its terminal. */
3215 master = posix_openpt(O_RDWR|O_NOCTTY|O_CLOEXEC|O_NDELAY);
3217 r = log_error_errno(errno, "Failed to acquire pseudo tty: %m");
3221 r = ptsname_malloc(master, &console);
3223 r = log_error_errno(r, "Failed to determine tty name: %m");
3228 log_info("Spawning container %s on %s.\nPress ^] three times within 1s to kill container.",
3229 arg_machine, arg_image ?: arg_directory);
3231 if (unlockpt(master) < 0) {
3232 r = log_error_errno(errno, "Failed to unlock tty: %m");
3236 if (socketpair(AF_UNIX, SOCK_DGRAM|SOCK_NONBLOCK|SOCK_CLOEXEC, 0, kmsg_socket_pair) < 0) {
3237 r = log_error_errno(errno, "Failed to create kmsg socket pair: %m");
/* Block these signals for the process; SIGINT, SIGTERM and SIGCHLD are
 * picked up later through sd_event_add_signal(). */
3241 assert_se(sigemptyset(&mask) == 0);
3242 sigset_add_many(&mask, SIGCHLD, SIGWINCH, SIGTERM, SIGINT, -1);
3243 assert_se(sigprocmask(SIG_BLOCK, &mask, NULL) == 0);
3245 assert_se(sigemptyset(&mask_chld) == 0);
3246 assert_se(sigaddset(&mask_chld, SIGCHLD) == 0);
3249 ContainerStatus container_status;
3250 _cleanup_(barrier_destroy) Barrier barrier = BARRIER_NULL;
3251 struct sigaction sa = {
3252 .sa_handler = nop_handler,
3253 .sa_flags = SA_NOCLDSTOP,
3256 r = barrier_create(&barrier);
3258 log_error_errno(r, "Cannot initialize IPC barrier: %m");
3262 /* Child can be killed before execv(), so handle SIGCHLD
3263 * in order to interrupt parent's blocking calls and
3264 * give it a chance to call wait() and terminate. */
3265 r = sigprocmask(SIG_UNBLOCK, &mask_chld, NULL);
3267 r = log_error_errno(errno, "Failed to change the signal mask: %m");
3271 r = sigaction(SIGCHLD, &sa, NULL);
3273 r = log_error_errno(errno, "Failed to install SIGCHLD handler: %m");
3277 pid = raw_clone(SIGCHLD|CLONE_NEWNS|
3278 (arg_share_system ? 0 : CLONE_NEWIPC|CLONE_NEWPID|CLONE_NEWUTS)|
3279 (arg_private_network ? CLONE_NEWNET : 0), NULL);
3281 if (errno == EINVAL)
3282 r = log_error_errno(errno, "clone() failed, do you have namespace support enabled in your kernel? (You need UTS, IPC, PID and NET namespacing built in): %m");
3284 r = log_error_errno(errno, "clone() failed: %m");
/* Child side: everything down to the final _exit() runs in the cloned
 * process (it takes the BARRIER_CHILD role below). */
3291 _cleanup_free_ char *home = NULL;
3293 const char *envp[] = {
3294 "PATH=" DEFAULT_PATH_SPLIT_USR,
3295 "container=systemd-nspawn", /* LXC sets container=lxc, so follow the scheme here */
3300 NULL, /* container_uuid */
3301 NULL, /* LISTEN_FDS */
3302 NULL, /* LISTEN_PID */
3307 barrier_set_role(&barrier, BARRIER_CHILD);
3309 envp[n_env] = strv_find_prefix(environ, "TERM=");
3313 master = safe_close(master);
3315 close_nointr(STDIN_FILENO);
3316 close_nointr(STDOUT_FILENO);
3317 close_nointr(STDERR_FILENO);
3319 kmsg_socket_pair[0] = safe_close(kmsg_socket_pair[0]);
3321 reset_all_signal_handlers();
3322 reset_signal_mask();
/* The console must land on stdin; stdout and stderr are then duplicated
 * from it. */
3324 r = open_terminal(console, O_RDWR);
3325 if (r != STDIN_FILENO) {
3331 log_error_errno(r, "Failed to open console: %m");
3332 _exit(EXIT_FAILURE);
3335 if (dup2(STDIN_FILENO, STDOUT_FILENO) != STDOUT_FILENO ||
3336 dup2(STDIN_FILENO, STDERR_FILENO) != STDERR_FILENO) {
3337 log_error_errno(errno, "Failed to duplicate console: %m");
3338 _exit(EXIT_FAILURE);
3342 log_error_errno(errno, "setsid() failed: %m");
3343 _exit(EXIT_FAILURE);
3346 if (reset_audit_loginuid() < 0)
3347 _exit(EXIT_FAILURE);
3349 if (prctl(PR_SET_PDEATHSIG, SIGKILL) < 0) {
3350 log_error_errno(errno, "PR_SET_PDEATHSIG failed: %m");
3351 _exit(EXIT_FAILURE);
3354 /* Mark everything as slave, so that we still
3355 * receive mounts from the real root, but don't
3356 * propagate mounts to the real root. */
3357 if (mount(NULL, "/", NULL, MS_SLAVE|MS_REC, NULL) < 0) {
3358 log_error_errno(errno, "MS_SLAVE|MS_REC failed: %m");
3359 _exit(EXIT_FAILURE);
3362 if (mount_devices(arg_directory,
3363 root_device, root_device_rw,
3364 home_device, home_device_rw,
3365 srv_device, srv_device_rw) < 0)
3366 _exit(EXIT_FAILURE);
3368 /* Turn directory into bind mount */
3369 if (mount(arg_directory, arg_directory, "bind", MS_BIND|MS_REC, NULL) < 0) {
3370 log_error_errno(errno, "Failed to make bind mount: %m");
3371 _exit(EXIT_FAILURE);
3374 r = setup_volatile(arg_directory);
3376 _exit(EXIT_FAILURE);
3378 if (setup_volatile_state(arg_directory) < 0)
3379 _exit(EXIT_FAILURE);
3381 r = base_filesystem_create(arg_directory);
3383 _exit(EXIT_FAILURE);
3385 if (arg_read_only) {
3386 r = bind_remount_recursive(arg_directory, true);
3388 log_error_errno(r, "Failed to make tree read-only: %m");
3389 _exit(EXIT_FAILURE);
3393 if (mount_all(arg_directory) < 0)
3394 _exit(EXIT_FAILURE);
3396 if (copy_devnodes(arg_directory) < 0)
3397 _exit(EXIT_FAILURE);
3399 if (setup_ptmx(arg_directory) < 0)
3400 _exit(EXIT_FAILURE);
3402 dev_setup(arg_directory);
3404 if (setup_propagate(arg_directory) < 0)
3405 _exit(EXIT_FAILURE);
3407 if (setup_seccomp() < 0)
3408 _exit(EXIT_FAILURE);
3410 if (setup_dev_console(arg_directory, console) < 0)
3411 _exit(EXIT_FAILURE);
3413 if (setup_kmsg(arg_directory, kmsg_socket_pair[1]) < 0)
3414 _exit(EXIT_FAILURE);
3416 kmsg_socket_pair[1] = safe_close(kmsg_socket_pair[1]);
3418 /* Tell the parent that we are ready, and that
3419 * it can cgroupify us so that we lack access
3420 * to certain devices and resources. */
3421 (void) barrier_place(&barrier);
3423 if (setup_boot_id(arg_directory) < 0)
3424 _exit(EXIT_FAILURE);
3426 if (setup_timezone(arg_directory) < 0)
3427 _exit(EXIT_FAILURE);
3429 if (setup_resolv_conf(arg_directory) < 0)
3430 _exit(EXIT_FAILURE);
3432 if (setup_journal(arg_directory) < 0)
3433 _exit(EXIT_FAILURE);
3435 if (mount_binds(arg_directory, arg_bind, false) < 0)
3436 _exit(EXIT_FAILURE);
3438 if (mount_binds(arg_directory, arg_bind_ro, true) < 0)
3439 _exit(EXIT_FAILURE);
3441 if (mount_tmpfs(arg_directory) < 0)
3442 _exit(EXIT_FAILURE);
3444 /* Wait until we are cgroup-ified, so that we
3445 * can mount the right cgroup path writable */
3446 (void) barrier_sync_next(&barrier);
3448 if (mount_cgroup(arg_directory) < 0)
3449 _exit(EXIT_FAILURE);
/* Enter the container tree: move it onto "/", chroot into it and reset
 * the working directory. */
3451 if (chdir(arg_directory) < 0) {
3452 log_error_errno(errno, "chdir(%s) failed: %m", arg_directory);
3453 _exit(EXIT_FAILURE);
3456 if (mount(arg_directory, "/", NULL, MS_MOVE, NULL) < 0) {
3457 log_error_errno(errno, "mount(MS_MOVE) failed: %m");
3458 _exit(EXIT_FAILURE);
3461 if (chroot(".") < 0) {
3462 log_error_errno(errno, "chroot() failed: %m");
3463 _exit(EXIT_FAILURE);
3466 if (chdir("/") < 0) {
3467 log_error_errno(errno, "chdir() failed: %m");
3468 _exit(EXIT_FAILURE);
3473 if (arg_private_network)
3476 if (drop_capabilities() < 0) {
3477 log_error_errno(errno, "drop_capabilities() failed: %m");
3478 _exit(EXIT_FAILURE);
3481 r = change_uid_gid(&home);
3483 _exit(EXIT_FAILURE);
/* Fill the NULL slots reserved in envp[] above, advancing n_env for each
 * entry written. */
3485 if ((asprintf((char**)(envp + n_env++), "HOME=%s", home ? home: "/root") < 0) ||
3486 (asprintf((char**)(envp + n_env++), "USER=%s", arg_user ? arg_user : "root") < 0) ||
3487 (asprintf((char**)(envp + n_env++), "LOGNAME=%s", arg_user ? arg_user : "root") < 0)) {
3489 _exit(EXIT_FAILURE);
3492 if (!sd_id128_equal(arg_uuid, SD_ID128_NULL)) {
3495 if (asprintf((char**)(envp + n_env++), "container_uuid=%s", id128_format_as_uuid(arg_uuid, as_uuid)) < 0) {
3497 _exit(EXIT_FAILURE);
3501 if (fdset_size(fds) > 0) {
3502 r = fdset_cloexec(fds, false);
3504 log_error_errno(r, "Failed to unset O_CLOEXEC for file descriptors.");
3505 _exit(EXIT_FAILURE);
3508 if ((asprintf((char **)(envp + n_env++), "LISTEN_FDS=%u", n_fd_passed) < 0) ||
3509 (asprintf((char **)(envp + n_env++), "LISTEN_PID=1") < 0)) {
3511 _exit(EXIT_FAILURE);
3517 if (arg_personality != 0xffffffffLU) {
3518 if (personality(arg_personality) < 0) {
3519 log_error_errno(errno, "personality() failed: %m");
3520 _exit(EXIT_FAILURE);
3522 } else if (secondary) {
3523 if (personality(PER_LINUX32) < 0) {
3524 log_error_errno(errno, "personality() failed: %m");
3525 _exit(EXIT_FAILURE);
3530 if (arg_selinux_context)
3531 if (setexeccon((security_context_t) arg_selinux_context) < 0) {
3532 log_error_errno(errno, "setexeccon(\"%s\") failed: %m", arg_selinux_context);
3533 _exit(EXIT_FAILURE);
3537 if (!strv_isempty(arg_setenv)) {
3540 n = strv_env_merge(2, envp, arg_setenv);
3543 _exit(EXIT_FAILURE);
3548 env_use = (char**) envp;
3550 /* Wait until the parent is ready with the setup, too... */
3551 if (!barrier_place_and_sync(&barrier))
3552 _exit(EXIT_FAILURE);
3558 /* Automatically search for the init system */
3560 l = 1 + argc - optind;
3561 a = newa(char*, l + 1);
3562 memcpy(a + 1, argv + optind, l * sizeof(char*));
3564 a[0] = (char*) "/usr/lib/systemd/systemd";
3565 execve(a[0], a, env_use);
3567 a[0] = (char*) "/lib/systemd/systemd";
3568 execve(a[0], a, env_use);
3570 a[0] = (char*) "/sbin/init";
3571 execve(a[0], a, env_use);
3572 } else if (argc > optind)
3573 execvpe(argv[optind], argv + optind, env_use);
3575 chdir(home ? home : "/root");
3576 execle("/bin/bash", "-bash", NULL, env_use);
3577 execle("/bin/sh", "-sh", NULL, env_use);
3580 log_error_errno(errno, "execv() failed: %m");
3581 _exit(EXIT_FAILURE);
/* Parent side from here on (it takes the BARRIER_PARENT role). */
3584 barrier_set_role(&barrier, BARRIER_PARENT);
3588 /* Wait for the most basic Child-setup to be done,
3589 * before we add hardware to it, and place it in a
3591 if (barrier_sync_next(&barrier)) {
3592 _cleanup_event_unref_ sd_event *event = NULL;
3593 _cleanup_(pty_forward_freep) PTYForward *forward = NULL;
3597 r = move_network_interfaces(pid);
3601 r = setup_veth(pid, veth_name, &ifi);
3605 r = setup_bridge(veth_name, &ifi);
3609 r = setup_macvlan(pid);
3613 r = register_machine(pid, ifi);
3617 /* Block SIGCHLD here, before notifying child.
3618 * process_pty() will handle it with the other signals. */
3619 r = sigprocmask(SIG_BLOCK, &mask_chld, NULL);
3623 /* Reset signal to default */
3624 r = default_signals(SIGCHLD, -1);
3628 /* Notify the child that the parent is ready with all
3629 * its setup, and that the child can now hand over
3630 * control to the code to run inside the container. */
3631 (void) barrier_place(&barrier);
3633 /* And wait that the child is completely ready now. */
3634 (void) barrier_place_and_sync(&barrier);
3638 "STATUS=Container running.");
3640 r = sd_event_new(&event);
3642 log_error_errno(r, "Failed to get default event source: %m");
3647 /* Try to kill the init system on SIGINT or SIGTERM */
3648 sd_event_add_signal(event, NULL, SIGINT, on_orderly_shutdown, UINT32_TO_PTR(pid));
3649 sd_event_add_signal(event, NULL, SIGTERM, on_orderly_shutdown, UINT32_TO_PTR(pid));
3651 /* Immediately exit */
3652 sd_event_add_signal(event, NULL, SIGINT, NULL, NULL);
3653 sd_event_add_signal(event, NULL, SIGTERM, NULL, NULL);
3656 /* simply exit on sigchld */
3657 sd_event_add_signal(event, NULL, SIGCHLD, NULL, NULL);
3659 r = pty_forward_new(event, master, true, &forward);
3661 log_error_errno(r, "Failed to create PTY forwarder: %m");
3665 r = sd_event_loop(event);
3667 log_error_errno(r, "Failed to run event loop: %m");
3671 pty_forward_get_last_char(forward, &last_char);
3673 forward = pty_forward_free(forward);
3675 if (!arg_quiet && last_char != '\n')
3678 /* Kill if it is not dead yet anyway */
3679 terminate_machine(pid);
3682 /* Normally redundant, but better safe than sorry */
3685 r = wait_for_container(pid, &container_status);
3689 /* We failed to wait for the container, or the
3690 * container exited abnormally */
3692 else if (r > 0 || container_status == CONTAINER_TERMINATED){
3693 /* The container exited with a non-zero
3694 * status, or with zero status and no reboot
3700 /* CONTAINER_REBOOTED, loop again */
3702 if (arg_keep_unit) {
3703 /* Special handling if we are running as a
3704 * service: instead of simply restarting the
3705 * machine we want to restart the entire
3706 * service, so let's inform systemd about this
3707 * with the special exit code 133. The service
3708 * file uses RestartForceExitStatus=133 so
3709 * that this results in a full nspawn
3710 * restart. This is necessary since we might
3711 * have cgroup parameters set we want to have
3722 "STATUS=Terminating...");
3724 loop_remove(loop_nr, &image_fd);
3729 if (remove_subvol && arg_directory) {
3732 k = btrfs_subvol_remove(arg_directory);
3734 log_warning_errno(k, "Cannot remove subvolume '%s', ignoring: %m", arg_directory);
3740 p = strappenda("/run/systemd/nspawn/propagate", arg_machine);
3741 (void) rm_rf(p, false, true, false);
3744 free(arg_directory);
3749 strv_free(arg_setenv);
3750 strv_free(arg_network_interfaces);
3751 strv_free(arg_network_macvlan);
3752 strv_free(arg_bind);
3753 strv_free(arg_bind_ro);
3754 strv_free(arg_tmpfs);
3756 return r < 0 ? EXIT_FAILURE : ret;