1 /*-*- Mode: C; c-basic-offset: 8; indent-tabs-mode: nil -*-*/
4 This file is part of systemd.
6 Copyright 2010 Lennart Poettering
8 systemd is free software; you can redistribute it and/or modify it
9 under the terms of the GNU Lesser General Public License as published by
10 the Free Software Foundation; either version 2.1 of the License, or
11 (at your option) any later version.
13 systemd is distributed in the hope that it will be useful, but
14 WITHOUT ANY WARRANTY; without even the implied warranty of
15 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
16 Lesser General Public License for more details.
18 You should have received a copy of the GNU Lesser General Public License
19 along with systemd; If not, see <http://www.gnu.org/licenses/>.
25 #include <sys/types.h>
26 #include <sys/syscall.h>
27 #include <sys/mount.h>
33 #include <sys/prctl.h>
34 #include <sys/capability.h>
37 #include <sys/signalfd.h>
41 #include <sys/socket.h>
42 #include <linux/netlink.h>
44 #include <linux/veth.h>
45 #include <sys/personality.h>
46 #include <linux/loop.h>
49 #include <selinux/selinux.h>
57 #include <blkid/blkid.h>
60 #include "sd-daemon.h"
70 #include "cgroup-util.h"
72 #include "path-util.h"
73 #include "loopback-setup.h"
74 #include "dev-setup.h"
79 #include "bus-error.h"
81 #include "bus-kernel.h"
84 #include "rtnl-util.h"
85 #include "udev-util.h"
86 #include "blkid-util.h"
88 #include "siphash24.h"
90 #include "base-filesystem.h"
92 #include "event-util.h"
94 #include "btrfs-util.h"
97 #include "seccomp-util.h"
100 typedef enum ContainerStatus {
101 CONTAINER_TERMINATED,
105 typedef enum LinkJournal {
112 typedef enum Volatile {
/* Runtime settings. parse_argv() assigns these from the command-line
 * options; the initializers here are the defaults. */
118 static char *arg_directory = NULL;
119 static char *arg_template = NULL;
120 static char *arg_user = NULL;
121 static sd_id128_t arg_uuid = {};
122 static char *arg_machine = NULL;
123 static const char *arg_selinux_context = NULL;
124 static const char *arg_selinux_apifs_context = NULL;
125 static const char *arg_slice = NULL;
126 static bool arg_private_network = false;
127 static bool arg_read_only = false;
128 static bool arg_boot = false;
129 static bool arg_ephemeral = false;
130 static LinkJournal arg_link_journal = LINK_AUTO;
131 static bool arg_link_journal_try = false;
/* Capability mask kept for the container. parse_argv() ORs in the
 * --capability= bits, ORs in CAP_NET_ADMIN when arg_private_network is set,
 * and finally masks out the --drop-capability= bits. drop_capabilities()
 * passes the complement of this mask to capability_bounding_set_drop(). */
132 static uint64_t arg_retain =
133 (1ULL << CAP_CHOWN) |
134 (1ULL << CAP_DAC_OVERRIDE) |
135 (1ULL << CAP_DAC_READ_SEARCH) |
136 (1ULL << CAP_FOWNER) |
137 (1ULL << CAP_FSETID) |
138 (1ULL << CAP_IPC_OWNER) |
140 (1ULL << CAP_LEASE) |
141 (1ULL << CAP_LINUX_IMMUTABLE) |
142 (1ULL << CAP_NET_BIND_SERVICE) |
143 (1ULL << CAP_NET_BROADCAST) |
144 (1ULL << CAP_NET_RAW) |
145 (1ULL << CAP_SETGID) |
146 (1ULL << CAP_SETFCAP) |
147 (1ULL << CAP_SETPCAP) |
148 (1ULL << CAP_SETUID) |
149 (1ULL << CAP_SYS_ADMIN) |
150 (1ULL << CAP_SYS_CHROOT) |
151 (1ULL << CAP_SYS_NICE) |
152 (1ULL << CAP_SYS_PTRACE) |
153 (1ULL << CAP_SYS_TTY_CONFIG) |
154 (1ULL << CAP_SYS_RESOURCE) |
155 (1ULL << CAP_SYS_BOOT) |
156 (1ULL << CAP_AUDIT_WRITE) |
157 (1ULL << CAP_AUDIT_CONTROL) |
/* arg_bind, arg_bind_ro and arg_tmpfs hold their entries as consecutive
 * string pairs: parse_argv() appends two strings per option and the mount
 * helpers walk them with STRV_FOREACH_PAIR(). */
159 static char **arg_bind = NULL;
160 static char **arg_bind_ro = NULL;
161 static char **arg_tmpfs = NULL;
162 static char **arg_setenv = NULL;
163 static bool arg_quiet = false;
164 static bool arg_share_system = false;
165 static bool arg_register = true;
166 static bool arg_keep_unit = false;
167 static char **arg_network_interfaces = NULL;
168 static char **arg_network_macvlan = NULL;
169 static bool arg_network_veth = false;
170 static const char *arg_network_bridge = NULL;
/* 0xffffffffLU is also the value parse_argv() rejects as an unknown
 * personality when personality_from_string() returns it. */
171 static unsigned long arg_personality = 0xffffffffLU;
172 static char *arg_image = NULL;
173 static Volatile arg_volatile = VOLATILE_NO;
/* Print the usage summary to stdout; the leading %s is filled with
 * program_invocation_short_name.
 * NOTE(review): the option table in parse_argv() accepts --personality=,
 * but no entry for it is visible in this text -- confirm and document it. */
175 static void help(void) {
176 printf("%s [OPTIONS...] [PATH] [ARGUMENTS...]\n\n"
177 "Spawn a minimal namespace container for debugging, testing and building.\n\n"
178 " -h --help Show this help\n"
179 " --version Print version string\n"
180 " -q --quiet Do not show status information\n"
181 " -D --directory=PATH Root directory for the container\n"
182 " --template=PATH Initialize root directory from template directory,\n"
184 " -x --ephemeral Run container with snapshot of root directory, and\n"
185 " remove it after exit\n"
186 " -i --image=PATH File system device or disk image for the container\n"
187 " -b --boot Boot up full system (i.e. invoke init)\n"
188 " -u --user=USER Run the command under specified user or uid\n"
189 " -M --machine=NAME Set the machine name for the container\n"
190 " --uuid=UUID Set a specific machine UUID for the container\n"
191 " -S --slice=SLICE Place the container in the specified slice\n"
192 " --private-network Disable network in container\n"
193 " --network-interface=INTERFACE\n"
194 " Assign an existing network interface to the\n"
196 " --network-macvlan=INTERFACE\n"
197 " Create a macvlan network interface based on an\n"
198 " existing network interface to the container\n"
199 " --network-veth Add a virtual ethernet connection between host\n"
201 " --network-bridge=INTERFACE\n"
202 " Add a virtual ethernet connection between host\n"
203 " and container and add it to an existing bridge on\n"
205 " -Z --selinux-context=SECLABEL\n"
206 " Set the SELinux security context to be used by\n"
207 " processes in the container\n"
208 " -L --selinux-apifs-context=SECLABEL\n"
209 " Set the SELinux security context to be used by\n"
210 " API/tmpfs file systems in the container\n"
211 " --capability=CAP In addition to the default, retain specified\n"
213 " --drop-capability=CAP Drop the specified capability from the default set\n"
214 " --link-journal=MODE Link up guest journal, one of no, auto, guest, host,\n"
215 " try-guest, try-host\n"
216 " -j Equivalent to --link-journal=try-guest\n"
217 " --read-only Mount the root directory read-only\n"
218 " --bind=PATH[:PATH] Bind mount a file or directory from the host into\n"
220 " --bind-ro=PATH[:PATH] Similar, but creates a read-only bind mount\n"
221 " --tmpfs=PATH:[OPTIONS] Mount an empty tmpfs to the specified directory\n"
222 " --setenv=NAME=VALUE Pass an environment variable to PID 1\n"
223 " --share-system Share system namespaces with host\n"
224 " --register=BOOLEAN Register container as machine\n"
225 " --keep-unit Do not register a scope for the machine, reuse\n"
226 " the service unit nspawn is running in\n"
227 " --volatile[=MODE] Run the system in volatile mode\n",
228 program_invocation_short_name);
/* Store a cleaned-up form of path in *b. The visible steps resolve the path
 * with canonicalize_file_name() or path_make_absolute_cwd() and pass the
 * result through path_kill_slashes() before storing it.
 * NOTE(review): the conditions linking these calls and the return statements
 * are not visible in this excerpt; parse_argv() hands the return value to
 * log_error_errno() on its error path. */
231 static int set_sanitized_path(char **b, const char *path) {
237 p = canonicalize_file_name(path);
242 p = path_make_absolute_cwd(path);
248 *b = path_kill_slashes(p);
/* Parse the command line into the arg_* settings, then reject option
 * combinations that cannot work together and compute the final capability
 * mask in arg_retain. Problems are reported through log_error() or
 * log_error_errno(). */
252 static int parse_argv(int argc, char *argv[]) {
269 ARG_NETWORK_INTERFACE,
278 static const struct option options[] = {
279 { "help", no_argument, NULL, 'h' },
280 { "version", no_argument, NULL, ARG_VERSION },
281 { "directory", required_argument, NULL, 'D' },
282 { "template", required_argument, NULL, ARG_TEMPLATE },
283 { "ephemeral", no_argument, NULL, 'x' },
284 { "user", required_argument, NULL, 'u' },
285 { "private-network", no_argument, NULL, ARG_PRIVATE_NETWORK },
286 { "boot", no_argument, NULL, 'b' },
287 { "uuid", required_argument, NULL, ARG_UUID },
288 { "read-only", no_argument, NULL, ARG_READ_ONLY },
289 { "capability", required_argument, NULL, ARG_CAPABILITY },
290 { "drop-capability", required_argument, NULL, ARG_DROP_CAPABILITY },
291 { "link-journal", required_argument, NULL, ARG_LINK_JOURNAL },
292 { "bind", required_argument, NULL, ARG_BIND },
293 { "bind-ro", required_argument, NULL, ARG_BIND_RO },
294 { "tmpfs", required_argument, NULL, ARG_TMPFS },
295 { "machine", required_argument, NULL, 'M' },
296 { "slice", required_argument, NULL, 'S' },
297 { "setenv", required_argument, NULL, ARG_SETENV },
298 { "selinux-context", required_argument, NULL, 'Z' },
299 { "selinux-apifs-context", required_argument, NULL, 'L' },
300 { "quiet", no_argument, NULL, 'q' },
301 { "share-system", no_argument, NULL, ARG_SHARE_SYSTEM },
302 { "register", required_argument, NULL, ARG_REGISTER },
303 { "keep-unit", no_argument, NULL, ARG_KEEP_UNIT },
304 { "network-interface", required_argument, NULL, ARG_NETWORK_INTERFACE },
305 { "network-macvlan", required_argument, NULL, ARG_NETWORK_MACVLAN },
306 { "network-veth", no_argument, NULL, ARG_NETWORK_VETH },
307 { "network-bridge", required_argument, NULL, ARG_NETWORK_BRIDGE },
308 { "personality", required_argument, NULL, ARG_PERSONALITY },
309 { "image", required_argument, NULL, 'i' },
310 { "volatile", optional_argument, NULL, ARG_VOLATILE },
/* Capability bits requested via --capability= (plus) and
 * --drop-capability= (minus); applied to arg_retain at the very end. */
315 uint64_t plus = 0, minus = 0;
/* The leading '+' in the option string makes getopt_long() stop at the
 * first non-option argument (see getopt(3)). */
320 while ((c = getopt_long(argc, argv, "+hD:u:bL:M:jS:Z:qi:x", options, NULL)) >= 0)
329 puts(PACKAGE_STRING);
330 puts(SYSTEMD_FEATURES);
334 r = set_sanitized_path(&arg_directory, optarg);
336 return log_error_errno(r, "Invalid root directory: %m");
341 r = set_sanitized_path(&arg_template, optarg);
343 return log_error_errno(r, "Invalid template directory: %m");
348 r = set_sanitized_path(&arg_image, optarg);
350 return log_error_errno(r, "Invalid image path: %m");
355 arg_ephemeral = true;
360 arg_user = strdup(optarg);
366 case ARG_NETWORK_BRIDGE:
367 arg_network_bridge = optarg;
371 case ARG_NETWORK_VETH:
372 arg_network_veth = true;
373 arg_private_network = true;
376 case ARG_NETWORK_INTERFACE:
377 if (strv_extend(&arg_network_interfaces, optarg) < 0)
380 arg_private_network = true;
383 case ARG_NETWORK_MACVLAN:
384 if (strv_extend(&arg_network_macvlan, optarg) < 0)
389 case ARG_PRIVATE_NETWORK:
390 arg_private_network = true;
398 r = sd_id128_from_string(optarg, &arg_uuid);
400 log_error("Invalid UUID: %s", optarg);
410 if (isempty(optarg)) {
414 if (!machine_name_is_valid(optarg)) {
415 log_error("Invalid machine name: %s", optarg);
419 r = free_and_strdup(&arg_machine, optarg);
427 arg_selinux_context = optarg;
431 arg_selinux_apifs_context = optarg;
435 arg_read_only = true;
/* The argument is a comma-separated list of capability names. "all" sets
 * every bit; the option being processed (c) decides whether the bits go
 * into plus or minus. */
439 case ARG_DROP_CAPABILITY: {
440 const char *state, *word;
443 FOREACH_WORD_SEPARATOR(word, length, optarg, ",", state) {
444 _cleanup_free_ char *t;
446 t = strndup(word, length);
450 if (streq(t, "all")) {
451 if (c == ARG_CAPABILITY)
452 plus = (uint64_t) -1;
454 minus = (uint64_t) -1;
458 cap = capability_from_name(t);
460 log_error("Failed to parse capability %s.", t);
464 if (c == ARG_CAPABILITY)
465 plus |= 1ULL << (uint64_t) cap;
467 minus |= 1ULL << (uint64_t) cap;
475 arg_link_journal = LINK_GUEST;
476 arg_link_journal_try = true;
/* The try-* modes select the same link mode as their plain counterpart and
 * additionally set arg_link_journal_try. */
479 case ARG_LINK_JOURNAL:
480 if (streq(optarg, "auto"))
481 arg_link_journal = LINK_AUTO;
482 else if (streq(optarg, "no"))
483 arg_link_journal = LINK_NO;
484 else if (streq(optarg, "guest"))
485 arg_link_journal = LINK_GUEST;
486 else if (streq(optarg, "host"))
487 arg_link_journal = LINK_HOST;
488 else if (streq(optarg, "try-guest")) {
489 arg_link_journal = LINK_GUEST;
490 arg_link_journal_try = true;
491 } else if (streq(optarg, "try-host")) {
492 arg_link_journal = LINK_HOST;
493 arg_link_journal_try = true;
495 log_error("Failed to parse link journal mode %s", optarg);
/* --bind= / --bind-ro=: a is the part before the first ':'; both paths must
 * be absolute, and they are appended to the chosen list as a pair. */
503 _cleanup_free_ char *a = NULL, *b = NULL;
507 x = c == ARG_BIND ? &arg_bind : &arg_bind_ro;
509 e = strchr(optarg, ':');
511 a = strndup(optarg, e - optarg);
521 if (!path_is_absolute(a) || !path_is_absolute(b)) {
522 log_error("Invalid bind mount specification: %s", optarg);
526 r = strv_extend(x, a);
530 r = strv_extend(x, b);
/* --tmpfs=: a is the mount path, b the mount options ("mode=0755" appears
 * as a default); both are pushed onto arg_tmpfs as a pair. */
538 _cleanup_free_ char *a = NULL, *b = NULL;
541 e = strchr(optarg, ':');
543 a = strndup(optarg, e - optarg);
547 b = strdup("mode=0755");
553 if (!path_is_absolute(a)) {
554 log_error("Invalid tmpfs specification: %s", optarg);
558 r = strv_push(&arg_tmpfs, a);
564 r = strv_push(&arg_tmpfs, b);
576 if (!env_assignment_is_valid(optarg)) {
577 log_error("Environment variable assignment '%s' is not valid.", optarg);
581 n = strv_env_set(arg_setenv, optarg);
585 strv_free(arg_setenv);
594 case ARG_SHARE_SYSTEM:
595 arg_share_system = true;
599 r = parse_boolean(optarg);
601 log_error("Failed to parse --register= argument: %s", optarg);
609 arg_keep_unit = true;
612 case ARG_PERSONALITY:
614 arg_personality = personality_from_string(optarg);
615 if (arg_personality == 0xffffffffLU) {
616 log_error("Unknown or unsupported personality '%s'.", optarg);
/* --volatile takes an optional argument: a boolean or the literal "state". */
625 arg_volatile = VOLATILE_YES;
627 r = parse_boolean(optarg);
629 if (streq(optarg, "state"))
630 arg_volatile = VOLATILE_STATE;
632 log_error("Failed to parse --volatile= argument: %s", optarg);
636 arg_volatile = r ? VOLATILE_YES : VOLATILE_NO;
645 assert_not_reached("Unhandled option");
/* Cross-option checks. --share-system switches machine registration off. */
648 if (arg_share_system)
649 arg_register = false;
651 if (arg_boot && arg_share_system) {
652 log_error("--boot and --share-system may not be combined.");
656 if (arg_keep_unit && cg_pid_get_owner_uid(0, NULL) >= 0) {
657 log_error("--keep-unit may not be used when invoked from a user session.");
661 if (arg_directory && arg_image) {
662 log_error("--directory= and --image= may not be combined.");
666 if (arg_template && arg_image) {
667 log_error("--template= and --image= may not be combined.");
671 if (arg_template && !(arg_directory || arg_machine)) {
672 log_error("--template= needs --directory= or --machine=.");
676 if (arg_ephemeral && arg_template) {
677 log_error("--ephemeral and --template= may not be combined.");
681 if (arg_ephemeral && arg_image) {
682 log_error("--ephemeral and --image= may not be combined.");
686 if (arg_volatile != VOLATILE_NO && arg_read_only) {
687 log_error("Cannot combine --read-only with --volatile. Note that --volatile already implies a read-only base hierarchy.");
/* Final mask: defaults plus requested additions, plus CAP_NET_ADMIN for a
 * private network, minus everything explicitly dropped. */
691 arg_retain = (arg_retain | plus | (arg_private_network ? 1ULL << CAP_NET_ADMIN : 0)) & ~minus;
/* Mount the entries of mount_table below dest. For entries whose fatal flag
 * is set, a failed mkdir_p() or mount() is logged as an error; for the
 * others it is only logged as a warning. When arg_selinux_apifs_context is
 * set, entries whose source is "tmpfs" or "devpts" get a context="..."
 * option appended to their mount options. */
696 static int mount_all(const char *dest) {
698 typedef struct MountPoint {
707 static const MountPoint mount_table[] = {
708 { "proc", "/proc", "proc", NULL, MS_NOSUID|MS_NOEXEC|MS_NODEV, true },
709 { "/proc/sys", "/proc/sys", NULL, NULL, MS_BIND, true }, /* Bind mount first */
710 { NULL, "/proc/sys", NULL, NULL, MS_BIND|MS_RDONLY|MS_REMOUNT, true }, /* Then, make it r/o */
711 { "sysfs", "/sys", "sysfs", NULL, MS_RDONLY|MS_NOSUID|MS_NOEXEC|MS_NODEV, true },
712 { "tmpfs", "/dev", "tmpfs", "mode=755", MS_NOSUID|MS_STRICTATIME, true },
713 { "devpts", "/dev/pts", "devpts","newinstance,ptmxmode=0666,mode=620,gid=" STRINGIFY(TTY_GID), MS_NOSUID|MS_NOEXEC, true },
714 { "tmpfs", "/dev/shm", "tmpfs", "mode=1777", MS_NOSUID|MS_NODEV|MS_STRICTATIME, true },
715 { "tmpfs", "/run", "tmpfs", "mode=755", MS_NOSUID|MS_NODEV|MS_STRICTATIME, true },
717 { "/sys/fs/selinux", "/sys/fs/selinux", NULL, NULL, MS_BIND, false }, /* Bind mount first */
718 { NULL, "/sys/fs/selinux", NULL, NULL, MS_BIND|MS_RDONLY|MS_REMOUNT, false }, /* Then, make it r/o */
725 for (k = 0; k < ELEMENTSOF(mount_table); k++) {
726 _cleanup_free_ char *where = NULL;
728 _cleanup_free_ char *options = NULL;
733 where = strjoin(dest, "/", mount_table[k].where, NULL);
737 t = path_is_mount_point(where, true);
739 log_error_errno(t, "Failed to detect whether %s is a mount point: %m", where);
747 /* Skip this entry if it is not a remount. */
748 if (mount_table[k].what && t > 0)
751 t = mkdir_p(where, 0755);
753 if (mount_table[k].fatal) {
754 log_error_errno(t, "Failed to create directory %s: %m", where);
759 log_warning_errno(t, "Failed to create directory %s: %m", where);
765 if (arg_selinux_apifs_context &&
766 (streq_ptr(mount_table[k].what, "tmpfs") || streq_ptr(mount_table[k].what, "devpts"))) {
767 options = strjoin(mount_table[k].options, ",context=\"", arg_selinux_apifs_context, "\"", NULL);
774 o = mount_table[k].options;
777 if (mount(mount_table[k].what,
780 mount_table[k].flags,
783 if (mount_table[k].fatal) {
784 log_error_errno(errno, "mount(%s) failed: %m", where);
789 log_warning_errno(errno, "mount(%s) failed: %m", where);
/* Bind mount every (source, destination) pair of l into the container rooted
 * at dest. An existing destination must have the same file type as the
 * source. A missing destination is created to match the source (directory,
 * FIFO, socket or regular file); other file types are refused.
 * NOTE(review): the read-only remount near the end is presumably guarded by
 * ro; the guard is not visible in this excerpt. */
796 static int mount_binds(const char *dest, char **l, bool ro) {
799 STRV_FOREACH_PAIR(x, y, l) {
800 _cleanup_free_ char *where = NULL;
801 struct stat source_st, dest_st;
804 if (stat(*x, &source_st) < 0)
805 return log_error_errno(errno, "Failed to stat %s: %m", *x);
807 where = strappend(dest, *y);
811 r = stat(where, &dest_st);
813 if ((source_st.st_mode & S_IFMT) != (dest_st.st_mode & S_IFMT)) {
814 log_error("The file types of %s and %s do not match. Refusing bind mount", *x, where);
817 } else if (errno == ENOENT) {
818 r = mkdir_parents_label(where, 0755);
820 return log_error_errno(r, "Failed to bind mount %s: %m", *x);
822 log_error_errno(errno, "Failed to bind mount %s: %m", *x);
826 /* Create the mount point, but be conservative -- refuse to create block
827 * and char devices. */
828 if (S_ISDIR(source_st.st_mode)) {
/* mkdir_label() reports failure through its return value (the same call in
 * mount_tmpfs() is checked with r != -EEXIST), so test r rather than errno,
 * which this helper is not shown to set. */
829 r = mkdir_label(where, 0755);
830 if (r < 0 && r != -EEXIST)
831 return log_error_errno(r, "Failed to create mount point %s: %m", where);
832 } else if (S_ISFIFO(source_st.st_mode)) {
833 r = mkfifo(where, 0644);
834 if (r < 0 && errno != EEXIST)
835 return log_error_errno(errno, "Failed to create mount point %s: %m", where);
836 } else if (S_ISSOCK(source_st.st_mode)) {
837 r = mknod(where, 0644 | S_IFSOCK, 0);
838 if (r < 0 && errno != EEXIST)
839 return log_error_errno(errno, "Failed to create mount point %s: %m", where);
840 } else if (S_ISREG(source_st.st_mode)) {
843 return log_error_errno(r, "Failed to create mount point %s: %m", where);
845 log_error("Refusing to create mountpoint for file: %s", *x);
849 if (mount(*x, where, "bind", MS_BIND, NULL) < 0)
850 return log_error_errno(errno, "mount(%s) failed: %m", where);
853 r = bind_remount_recursive(where, true);
855 return log_error_errno(r, "Read-Only bind mount failed: %m");
/* Mount a tmpfs for every (path, options) pair in arg_tmpfs. The mount point
 * is dest with the path appended and is created first; an already existing
 * directory (-EEXIST) is not treated as an error. */
862 static int mount_tmpfs(const char *dest) {
865 STRV_FOREACH_PAIR(i, o, arg_tmpfs) {
866 _cleanup_free_ char *where = NULL;
869 where = strappend(dest, *i);
873 r = mkdir_label(where, 0755);
874 if (r < 0 && r != -EEXIST)
875 return log_error_errno(r, "Creating mount point for tmpfs %s failed: %m", where);
877 if (mount("tmpfs", where, "tmpfs", MS_NODEV|MS_STRICTATIME, *o) < 0)
878 return log_error_errno(errno, "tmpfs mount to %s failed: %m", where);
/* Point the container's /etc/localtime at the same zone as the host's. The
 * host link must lead into /usr/share/zoneinfo/ (relative or absolute form),
 * and the zone file must exist below dest; otherwise a warning is logged.
 * The container link is replaced by a relative symlink
 * "../usr/share/zoneinfo/<zone>". */
884 static int setup_timezone(const char *dest) {
885 _cleanup_free_ char *where = NULL, *p = NULL, *q = NULL, *check = NULL, *what = NULL;
891 /* Fix the timezone, if possible */
892 r = readlink_malloc("/etc/localtime", &p);
894 log_warning("/etc/localtime is not a symlink, not updating container timezone.");
/* z is the zone name: the part of the host link after the zoneinfo prefix. */
898 z = path_startswith(p, "../usr/share/zoneinfo/");
900 z = path_startswith(p, "/usr/share/zoneinfo/");
902 log_warning("/etc/localtime does not point into /usr/share/zoneinfo/, not updating container timezone.");
906 where = strappend(dest, "/etc/localtime");
/* y is the zone name the container currently links to, if any. */
910 r = readlink_malloc(where, &q);
912 y = path_startswith(q, "../usr/share/zoneinfo/");
914 y = path_startswith(q, "/usr/share/zoneinfo/");
916 /* Already pointing to the right place? Then do nothing .. */
917 if (y && streq(y, z))
921 check = strjoin(dest, "/usr/share/zoneinfo/", z, NULL);
925 if (access(check, F_OK) < 0) {
926 log_warning("Timezone %s does not exist in container, not updating container timezone.", z);
930 what = strappend("../usr/share/zoneinfo/", z);
934 r = mkdir_parents(where, 0755);
936 log_error_errno(r, "Failed to create directory for timezone info %s in container: %m", where);
942 if (r < 0 && errno != ENOENT) {
943 log_error_errno(errno, "Failed to remove existing timezone info %s in container: %m", where);
948 if (symlink(what, where) < 0) {
949 log_error_errno(errno, "Failed to correct timezone of container: %m");
/* Copy the host's /etc/resolv.conf to the same path below dest. Failures to
 * create the parent directory or to copy the file are logged as warnings.
 * NOTE(review): the arg_private_network check presumably skips the copy;
 * the statement it guards is not visible in this excerpt. */
956 static int setup_resolv_conf(const char *dest) {
957 _cleanup_free_ char *where = NULL;
962 if (arg_private_network)
965 /* Fix resolv.conf, if possible */
966 where = strappend(dest, "/etc/resolv.conf");
970 /* We do not much care about the result of this. If it
971 * fails, it fails. */
972 r = mkdir_parents(where, 0755);
974 log_warning_errno(r, "Failed to create parent directory for resolv.conf %s: %m", where);
979 r = copy_file("/etc/resolv.conf", where, O_TRUNC|O_NOFOLLOW, 0644);
981 log_warning_errno(r, "Failed to copy /etc/resolv.conf to %s: %m", where);
/* Handle --volatile=state: remount directory read-only (recursively) and
 * mount a fresh tmpfs over its /var. Only acts when arg_volatile is
 * VOLATILE_STATE. */
989 static int setup_volatile_state(const char *directory) {
995 if (arg_volatile != VOLATILE_STATE)
998 /* --volatile=state means we simply overmount /var
999 with a tmpfs, and the rest read-only. */
1001 r = bind_remount_recursive(directory, true);
1003 return log_error_errno(r, "Failed to remount %s read-only: %m", directory);
1005 p = strappenda(directory, "/var");
/* NOTE(review): the message below prints directory, but the path being
 * created here looks like p (directory + "/var") -- confirm and fix. */
1007 if (r < 0 && errno != EEXIST)
1008 return log_error_errno(errno, "Failed to create %s: %m", directory);
1010 if (mount("tmpfs", p, "tmpfs", MS_STRICTATIME, "mode=755") < 0)
1011 return log_error_errno(errno, "Failed to mount tmpfs to /var: %m");
/* Handle --volatile=yes: build the new root in a temporary directory (a
 * tmpfs holding a read-only bind mount of the original /usr) and then move
 * that mount onto directory with MS_MOVE. Only acts when arg_volatile is
 * VOLATILE_YES. tmpfs_mounted and bind_mounted record how far setup got;
 * presumably they drive cleanup on failure, which is not visible here. */
1016 static int setup_volatile(const char *directory) {
1017 bool tmpfs_mounted = false, bind_mounted = false;
1018 char template[] = "/tmp/nspawn-volatile-XXXXXX";
1024 if (arg_volatile != VOLATILE_YES)
1027 /* --volatile=yes means we mount a tmpfs as the root dir and bind
1028 mount the original /usr into it, read-only. */
1030 if (!mkdtemp(template))
1031 return log_error_errno(errno, "Failed to create temporary directory: %m");
1033 if (mount("tmpfs", template, "tmpfs", MS_STRICTATIME, "mode=755") < 0) {
1034 log_error_errno(errno, "Failed to mount tmpfs for root directory: %m");
1039 tmpfs_mounted = true;
/* f is the original /usr, t its place inside the temporary root. */
1041 f = strappenda(directory, "/usr");
1042 t = strappenda(template, "/usr");
1045 if (r < 0 && errno != EEXIST) {
1046 log_error_errno(errno, "Failed to create %s: %m", t);
1051 if (mount(f, t, "bind", MS_BIND|MS_REC, NULL) < 0) {
1052 log_error_errno(errno, "Failed to create /usr bind mount: %m");
1057 bind_mounted = true;
1059 r = bind_remount_recursive(t, true);
1061 log_error_errno(r, "Failed to remount %s read-only: %m", t);
1065 if (mount(template, directory, NULL, MS_MOVE, NULL) < 0) {
1066 log_error_errno(errno, "Failed to move root mount: %m");
/* Format id into s as 16 bytes of lowercase hex, grouped 4-2-2-2-6 bytes
 * with dashes between the groups (32 digits + 4 dashes + NUL fits the
 * 37-byte buffer). NOTE(review): presumably returns s; the return statement
 * is not visible in this excerpt. */
1084 static char* id128_format_as_uuid(sd_id128_t id, char s[37]) {
1087 "%02x%02x%02x%02x-%02x%02x-%02x%02x-%02x%02x-%02x%02x%02x%02x%02x%02x",
1088 SD_ID128_FORMAT_VAL(id));
/* Give the container its own boot ID: write a freshly randomized ID,
 * formatted as a UUID, to a file below dest/dev and bind mount that file
 * over dest/proc/sys/kernel/random/boot_id. The mount is then remounted
 * read-only; failure of that step is only a warning. The arg_share_system
 * check comes before any of this. */
1093 static int setup_boot_id(const char *dest) {
1094 _cleanup_free_ char *from = NULL, *to = NULL;
1095 sd_id128_t rnd = {};
1101 if (arg_share_system)
1104 /* Generate a new randomized boot ID, so that each boot-up of
1105 * the container gets a new one */
1107 from = strappend(dest, "/dev/proc-sys-kernel-random-boot-id");
1108 to = strappend(dest, "/proc/sys/kernel/random/boot_id");
1112 r = sd_id128_randomize(&rnd);
1114 return log_error_errno(r, "Failed to generate random boot id: %m");
1116 id128_format_as_uuid(rnd, as_uuid);
1118 r = write_string_file(from, as_uuid);
1120 return log_error_errno(r, "Failed to write boot id: %m");
1122 if (mount(from, to, "bind", MS_BIND, NULL) < 0) {
1123 log_error_errno(errno, "Failed to bind mount boot id: %m");
1125 } else if (mount(from, to, "bind", MS_BIND|MS_REMOUNT|MS_RDONLY, NULL))
1126 log_warning_errno(errno, "Failed to make boot id read-only: %m");
/* Recreate host device nodes inside the container: every name listed in
 * devnodes is stat()ed under /dev on the host and created below dest/dev
 * with the same mode and device number. A host node that does not exist
 * (ENOENT) is not treated as a stat failure; a host entry that is neither a
 * character nor a block device is reported as an error. */
1132 static int copy_devnodes(const char *dest) {
1134 static const char devnodes[] =
1145 _cleanup_umask_ mode_t u;
1151 NULSTR_FOREACH(d, devnodes) {
1152 _cleanup_free_ char *from = NULL, *to = NULL;
1155 from = strappend("/dev/", d);
1156 to = strjoin(dest, "/dev/", d, NULL);
1160 if (stat(from, &st) < 0) {
1162 if (errno != ENOENT)
1163 return log_error_errno(errno, "Failed to stat %s: %m", from);
1165 } else if (!S_ISCHR(st.st_mode) && !S_ISBLK(st.st_mode)) {
1167 log_error("%s is not a char or block device, cannot copy", from);
1171 r = mkdir_parents(to, 0775);
1173 log_error_errno(r, "Failed to create parent directory of %s: %m", to);
1177 if (mknod(to, st.st_mode, st.st_rdev) < 0)
/* Name the node that could not be created, not the container root. */
1178 return log_error_errno(errno, "mknod(%s) failed: %m", to);
/* Create dest/dev/ptmx as a relative symlink to "pts/ptmx". */
1185 static int setup_ptmx(const char *dest) {
1186 _cleanup_free_ char *p = NULL;
1188 p = strappend(dest, "/dev/ptmx");
1192 if (symlink("pts/ptmx", p) < 0)
1193 return log_error_errno(errno, "Failed to create /dev/ptmx symlink: %m");
/* Make the TTY named by console appear as dest/dev/console: restrict the
 * TTY to mode 0600 owned by 0:0, create a placeholder device node that
 * copies /dev/null's type and device number, and bind mount the TTY over
 * it. */
1198 static int setup_dev_console(const char *dest, const char *console) {
1199 _cleanup_umask_ mode_t u;
1209 if (stat("/dev/null", &st) < 0)
1210 return log_error_errno(errno, "Failed to stat /dev/null: %m");
1212 r = chmod_and_chown(console, 0600, 0, 0);
1214 return log_error_errno(r, "Failed to correct access mode for TTY: %m");
1216 /* We need to bind mount the right tty to /dev/console since
1217 * ptys can only exist on pts file systems. To have something
1218 * to bind mount things on we create a device node first, and
1219 * use /dev/null for that since the cgroups device policy
1220 * allows us to create that freely, while we cannot create
1221 * /dev/console. (Note that the major minor doesn't actually
1222 * matter here, since we mount it over anyway). */
1224 to = strappenda(dest, "/dev/console");
1225 if (mknod(to, (st.st_mode & ~07777) | 0600, st.st_rdev) < 0)
1226 return log_error_errno(errno, "mknod() for /dev/console failed: %m");
1228 if (mount(console, to, "bind", MS_BIND, NULL) < 0)
1229 return log_error_errno(errno, "Bind mount for /dev/console failed: %m");
/* Provide /proc/kmsg in the container as a FIFO: create the FIFO at
 * dest/dev/kmsg, bind mount it onto dest/proc/kmsg, open it, and pass the
 * open file descriptor over kmsg_socket as an SCM_RIGHTS control message. */
1234 static int setup_kmsg(const char *dest, int kmsg_socket) {
1235 _cleanup_free_ char *from = NULL, *to = NULL;
1237 _cleanup_umask_ mode_t u;
1239 struct cmsghdr cmsghdr;
1240 uint8_t buf[CMSG_SPACE(sizeof(int))];
1242 struct msghdr mh = {
1243 .msg_control = &control,
1244 .msg_controllen = sizeof(control),
1246 struct cmsghdr *cmsg;
1249 assert(kmsg_socket >= 0);
1253 /* We create the kmsg FIFO as /dev/kmsg, but immediately
1254 * delete it after bind mounting it to /proc/kmsg. While FIFOs
1255 * on the reading side behave very similar to /proc/kmsg,
1256 * their writing side behaves differently from /dev/kmsg in
1257 * that writing blocks when nothing is reading. In order to
1258 * avoid any problems with containers deadlocking due to this
1259 * we simply make /dev/kmsg unavailable to the container. */
1260 if (asprintf(&from, "%s/dev/kmsg", dest) < 0 ||
1261 asprintf(&to, "%s/proc/kmsg", dest) < 0)
1264 if (mkfifo(from, 0600) < 0)
1265 return log_error_errno(errno, "mkfifo() for /dev/kmsg failed: %m");
1267 r = chmod_and_chown(from, 0600, 0, 0);
1269 return log_error_errno(r, "Failed to correct access mode for /dev/kmsg: %m");
1271 if (mount(from, to, "bind", MS_BIND, NULL) < 0)
1272 return log_error_errno(errno, "Bind mount for /proc/kmsg failed: %m");
1274 fd = open(from, O_RDWR|O_NDELAY|O_CLOEXEC);
1276 return log_error_errno(errno, "Failed to open fifo: %m");
/* Fill in a single SCM_RIGHTS control message carrying fd. */
1278 cmsg = CMSG_FIRSTHDR(&mh);
1279 cmsg->cmsg_level = SOL_SOCKET;
1280 cmsg->cmsg_type = SCM_RIGHTS;
1281 cmsg->cmsg_len = CMSG_LEN(sizeof(int));
1282 memcpy(CMSG_DATA(cmsg), &fd, sizeof(int));
1284 mh.msg_controllen = cmsg->cmsg_len;
1286 /* Store away the fd in the socket, so that it stays open as
1287 * long as we run the child */
1288 k = sendmsg(kmsg_socket, &mh, MSG_DONTWAIT|MSG_NOSIGNAL);
1292 return log_error_errno(errno, "Failed to send FIFO fd: %m");
1294 /* And now make the FIFO unavailable as /dev/kmsg... */
/* Set the host name to arg_machine through sethostname_idempotent(). The
 * arg_share_system check comes first; the statement it guards and the
 * return values are not visible in this excerpt. */
1299 static int setup_hostname(void) {
1301 if (arg_share_system)
1304 if (sethostname_idempotent(arg_machine) < 0)
/* Connect the container's journal directory with the host's according to
 * arg_link_journal. The directory name is the machine ID read from
 * directory/etc/machine-id; p is /var/log/journal/<id> on the host and q is
 * the same path inside the container. In guest mode the host path becomes a
 * symlink to q; otherwise, when the host path exists, it is bind mounted
 * onto q. Linking is refused when the container's machine ID equals the
 * host's.
 *
 * mkdir_p() and readlink_and_make_absolute() hand back their error in r
 * (r is compared against -EINVAL and -ENOENT below), so failures of those
 * helpers are logged with r rather than with errno, which they are not
 * shown to set. */
1310 static int setup_journal(const char *directory) {
1311 sd_id128_t machine_id, this_id;
1312 _cleanup_free_ char *p = NULL, *b = NULL, *q = NULL, *d = NULL;
1316 p = strappend(directory, "/etc/machine-id");
1320 r = read_one_line_file(p, &b);
1321 if (r == -ENOENT && arg_link_journal == LINK_AUTO)
1324 return log_error_errno(r, "Failed to read machine ID from %s: %m", p);
1327 if (isempty(id) && arg_link_journal == LINK_AUTO)
1330 /* Verify validity */
1331 r = sd_id128_from_string(id, &machine_id);
1333 return log_error_errno(r, "Failed to parse machine ID from %s: %m", p);
1335 r = sd_id128_get_machine(&this_id);
1337 return log_error_errno(r, "Failed to retrieve machine ID: %m");
1339 if (sd_id128_equal(machine_id, this_id)) {
1340 log_full(arg_link_journal == LINK_AUTO ? LOG_WARNING : LOG_ERR,
1341 "Host and machine ids are equal (%s): refusing to link journals", id);
1342 if (arg_link_journal == LINK_AUTO)
1348 if (arg_link_journal == LINK_NO)
1352 p = strappend("/var/log/journal/", id);
1353 q = strjoin(directory, "/var/log/journal/", id, NULL);
1357 if (path_is_mount_point(p, false) > 0) {
1358 if (arg_link_journal != LINK_AUTO) {
1359 log_error("%s: already a mount point, refusing to use for journal", p);
1366 if (path_is_mount_point(q, false) > 0) {
1367 if (arg_link_journal != LINK_AUTO) {
1368 log_error("%s: already a mount point, refusing to use for journal", q);
1375 r = readlink_and_make_absolute(p, &d);
1377 if ((arg_link_journal == LINK_GUEST ||
1378 arg_link_journal == LINK_AUTO) &&
1381 r = mkdir_p(q, 0755);
1383 log_warning_errno(r, "Failed to create directory %s: %m", q);
1388 return log_error_errno(errno, "Failed to remove symlink %s: %m", p);
1389 } else if (r == -EINVAL) {
1391 if (arg_link_journal == LINK_GUEST &&
1394 if (errno == ENOTDIR) {
1395 log_error("%s already exists and is neither a symlink nor a directory", p);
1398 log_error_errno(errno, "Failed to remove %s: %m", p);
1402 } else if (r != -ENOENT) {
1403 log_error_errno(r, "readlink(%s) failed: %m", p);
1407 if (arg_link_journal == LINK_GUEST) {
1409 if (symlink(q, p) < 0) {
1410 if (arg_link_journal_try) {
1411 log_debug_errno(errno, "Failed to symlink %s to %s, skipping journal setup: %m", q, p);
1414 log_error_errno(errno, "Failed to symlink %s to %s: %m", q, p);
1419 r = mkdir_p(q, 0755);
1421 log_warning_errno(r, "Failed to create directory %s: %m", q);
1425 if (arg_link_journal == LINK_HOST) {
1426 /* don't create parents here -- if the host doesn't have
1427 * permanent journal set up, don't force it here */
1430 if (arg_link_journal_try) {
1431 log_debug_errno(errno, "Failed to create %s, skipping journal setup: %m", p);
1434 log_error_errno(errno, "Failed to create %s: %m", p);
1439 } else if (access(p, F_OK) < 0)
1442 if (dir_is_empty(q) == 0)
1443 log_warning("%s is not empty, proceeding anyway.", q);
1445 r = mkdir_p(q, 0755);
1447 log_error_errno(r, "Failed to create %s: %m", q);
1451 if (mount(p, q, "bind", MS_BIND, NULL) < 0)
1452 return log_error_errno(errno, "Failed to bind mount journal from host into guest: %m");
/* Pass the complement of arg_retain (every capability bit not in the
 * retained set) to capability_bounding_set_drop() and return its result. */
1457 static int drop_capabilities(void) {
1458 return capability_bounding_set_drop(~arg_retain, false);
/* Register the container with org.freedesktop.machine1 on the system bus.
 * With arg_keep_unit, RegisterMachineWithNetwork is called directly.
 * Otherwise a CreateMachineWithNetwork message is built by hand so that an
 * array of (sv) properties can be appended: Slice (only when arg_slice is
 * non-empty), DevicePolicy=strict and a DeviceAllow list. local_ifindex is
 * sent as a one-element list when positive and as an empty list otherwise. */
1461 static int register_machine(pid_t pid, int local_ifindex) {
1462 _cleanup_bus_error_free_ sd_bus_error error = SD_BUS_ERROR_NULL;
1463 _cleanup_bus_close_unref_ sd_bus *bus = NULL;
1469 r = sd_bus_default_system(&bus);
1471 return log_error_errno(r, "Failed to open system bus: %m");
1473 if (arg_keep_unit) {
1474 r = sd_bus_call_method(
1476 "org.freedesktop.machine1",
1477 "/org/freedesktop/machine1",
1478 "org.freedesktop.machine1.Manager",
1479 "RegisterMachineWithNetwork",
1484 SD_BUS_MESSAGE_APPEND_ID128(arg_uuid),
1488 strempty(arg_directory),
1489 local_ifindex > 0 ? 1 : 0, local_ifindex);
1491 _cleanup_bus_message_unref_ sd_bus_message *m = NULL;
1493 r = sd_bus_message_new_method_call(
1496 "org.freedesktop.machine1",
1497 "/org/freedesktop/machine1",
1498 "org.freedesktop.machine1.Manager",
1499 "CreateMachineWithNetwork");
1501 return log_error_errno(r, "Failed to create message: %m");
1503 r = sd_bus_message_append(
1507 SD_BUS_MESSAGE_APPEND_ID128(arg_uuid),
1511 strempty(arg_directory),
1512 local_ifindex > 0 ? 1 : 0, local_ifindex);
1514 return log_error_errno(r, "Failed to append message arguments: %m");
1516 r = sd_bus_message_open_container(m, 'a', "(sv)");
1518 return log_error_errno(r, "Failed to open container: %m");
1520 if (!isempty(arg_slice)) {
1521 r = sd_bus_message_append(m, "(sv)", "Slice", "s", arg_slice);
1523 return log_error_errno(r, "Failed to append slice: %m");
1526 r = sd_bus_message_append(m, "(sv)", "DevicePolicy", "s", "strict");
1528 return log_error_errno(r, "Failed to add device policy: %m");
1530 r = sd_bus_message_append(m, "(sv)", "DeviceAllow", "a(ss)", 9,
1531 /* Allow the container to
1532 * access and create the API
1533 * device nodes, so that
1534 * PrivateDevices= in the
1535 * container can work
1540 "/dev/random", "rwm",
1541 "/dev/urandom", "rwm",
1543 "/dev/net/tun", "rwm",
1544 /* Allow the container
1545 * access to ptys. However,
1547 * container to ever create
1548 * these device nodes. */
1549 "/dev/pts/ptmx", "rw",
1552 return log_error_errno(r, "Failed to add device whitelist: %m");
1554 r = sd_bus_message_close_container(m);
1556 return log_error_errno(r, "Failed to close container: %m");
1558 r = sd_bus_call(bus, m, 0, &error, NULL);
1562 log_error("Failed to register machine: %s", bus_error_message(&error, r));
/* Asks org.freedesktop.machine1 to terminate a machine (presumably the one
 * that pid belongs to; the lookup arguments are not visible here).
 *
 * A first call on the Manager interface yields a reply from which an object
 * path is read; a second call is then made on the
 * org.freedesktop.machine1.Machine interface. A failure of either bus call
 * is only logged at debug level. */
1569 static int terminate_machine(pid_t pid) {
1570 _cleanup_bus_error_free_ sd_bus_error error = SD_BUS_ERROR_NULL;
1571 _cleanup_bus_message_unref_ sd_bus_message *reply = NULL;
1572 _cleanup_bus_close_unref_ sd_bus *bus = NULL;
1579 r = sd_bus_default_system(&bus);
1581 return log_error_errno(r, "Failed to open system bus: %m");
1583 r = sd_bus_call_method(
1585 "org.freedesktop.machine1",
1586 "/org/freedesktop/machine1",
1587 "org.freedesktop.machine1.Manager",
1594 /* Note that the machine might already have been
1595 * cleaned up automatically, hence don't consider it a
1596 * failure if we cannot get the machine object. */
1597 log_debug("Failed to get machine: %s", bus_error_message(&error, r));
/* The reply carries the object path of the machine. */
1601 r = sd_bus_message_read(reply, "o", &path);
1603 return bus_log_parse_error(r);
1605 r = sd_bus_call_method(
1607 "org.freedesktop.machine1",
1609 "org.freedesktop.machine1.Machine",
1615 log_debug("Failed to terminate machine: %s", bus_error_message(&error, r));
/* Resets the audit login UID of the calling process by writing the string
 * 4294967295 to /proc/self/loginuid.
 *
 * The arg_share_system check comes first (its body is not visible in this
 * listing; presumably an early return). The current value is read with
 * read_one_line_file() and the write is skipped when the file already
 * holds 4294967295. A failed write is reported with a long hint about old
 * kernels and the audit=0 kernel command line switch.
 *
 * NOTE(review): the hint reads "or to off auditing"; a verb such as "turn"
 * seems to be missing from that message. */
1622 static int reset_audit_loginuid(void) {
1623 _cleanup_free_ char *p = NULL;
1626 if (arg_share_system)
1629 r = read_one_line_file("/proc/self/loginuid", &p);
1633 return log_error_errno(r, "Failed to read /proc/self/loginuid: %m");
1635 /* Already reset? */
1636 if (streq(p, "4294967295"))
1639 r = write_string_file("/proc/self/loginuid", "4294967295");
1641 log_error("Failed to reset audit login UID. This probably means that your kernel is too\n"
1642 "old and you have audit enabled. Note that the auditing subsystem is known to\n"
1643 "be incompatible with containers on old kernels. Please make sure to upgrade\n"
1644 "your kernel or to off auditing with 'audit=0' on the kernel command line before\n"
1645 "using systemd-nspawn. Sleeping for 5s... (%s)\n", strerror(-r));
/* Fixed 128-bit keys passed to generate_mac() as the siphash24 key: one for
 * the host side of the veth link, one for the container side, and one for
 * MACVLAN interfaces. */
1653 #define HOST_HASH_KEY SD_ID128_MAKE(1a,37,6f,c7,46,ec,45,0b,ad,a3,d5,31,06,60,5d,b1)
1654 #define CONTAINER_HASH_KEY SD_ID128_MAKE(c3,c4,f9,19,b5,57,b2,1c,e6,cf,14,27,03,9c,ee,a2)
1655 #define MACVLAN_HASH_KEY SD_ID128_MAKE(00,13,6d,bc,66,83,44,81,bb,0c,f9,51,1f,24,a6,6f)
/* Derives a MAC address for this container and stores it in *mac.
 *
 * The hashed input is the host machine ID from sd_id128_get_machine()
 * followed by the arg_machine string; idx is copied behind the name (the
 * condition guarding that copy is not visible in this listing). The buffer
 * is hashed with siphash24() keyed by hash_key, and the first ETH_ALEN
 * bytes of the result become the address. The multicast bit is then
 * cleared and the locally-assigned bit set. */
1657 static int generate_mac(struct ether_addr *mac, sd_id128_t hash_key, uint64_t idx) {
1663 l = strlen(arg_machine);
1664 sz = sizeof(sd_id128_t) + l;
1670 /* fetch some persistent data unique to the host */
1671 r = sd_id128_get_machine((sd_id128_t*) v);
1675 /* combine with some data unique (on this host) to this
1676 * container instance */
1677 i = mempcpy(v + sizeof(sd_id128_t), arg_machine, l);
1680 memcpy(i, &idx, sizeof(idx));
1683 /* Let's hash the host machine ID plus the container name. We
1684 * use a fixed, but originally randomly created hash key here. */
1685 siphash24(result, v, sz, hash_key.bytes);
/* Compile-time guard: the hash output must cover a whole address. */
1687 assert_cc(ETH_ALEN <= sizeof(result));
1688 memcpy(mac->ether_addr_octet, result, ETH_ALEN);
1690 /* see eth_random_addr in the kernel */
1691 mac->ether_addr_octet[0] &= 0xfe; /* clear multicast bit */
1692 mac->ether_addr_octet[0] |= 0x02; /* set local assignment bit (IEEE802) */
/* Creates a veth pair for the container over rtnetlink.
 *
 * The arg_private_network and arg_network_veth checks come first (their
 * bodies are not visible here; presumably early returns). The host-side
 * name is written into iface_name as vb-<machine> when arg_network_bridge
 * is set and ve-<machine> otherwise, limited by the IFNAMSIZ - 1 size
 * passed to snprintf(). The peer is named host0 and is tagged with
 * IFLA_NET_NS_PID = pid. Both ends get addresses from generate_mac().
 * After the RTM_NEWLINK call the host-side name is resolved with
 * if_nametoindex(); presumably the index is handed back through ifi. */
1697 static int setup_veth(pid_t pid, char iface_name[IFNAMSIZ], int *ifi) {
1698 _cleanup_rtnl_message_unref_ sd_rtnl_message *m = NULL;
1699 _cleanup_rtnl_unref_ sd_rtnl *rtnl = NULL;
1700 struct ether_addr mac_host, mac_container;
1703 if (!arg_private_network)
1706 if (!arg_network_veth)
1709 /* Use two different interface name prefixes depending whether
1710 * we are in bridge mode or not. */
1711 snprintf(iface_name, IFNAMSIZ - 1, "%s-%s",
1712 arg_network_bridge ? "vb" : "ve", arg_machine);
1714 r = generate_mac(&mac_container, CONTAINER_HASH_KEY, 0);
1716 return log_error_errno(r, "Failed to generate predictable MAC address for container side: %m");
1718 r = generate_mac(&mac_host, HOST_HASH_KEY, 0);
1720 return log_error_errno(r, "Failed to generate predictable MAC address for host side: %m");
1722 r = sd_rtnl_open(&rtnl, 0);
1724 return log_error_errno(r, "Failed to connect to netlink: %m");
1726 r = sd_rtnl_message_new_link(rtnl, &m, RTM_NEWLINK, 0);
1728 return log_error_errno(r, "Failed to allocate netlink message: %m");
/* Attributes of the host-side end. */
1730 r = sd_rtnl_message_append_string(m, IFLA_IFNAME, iface_name);
1732 return log_error_errno(r, "Failed to add netlink interface name: %m");
1734 r = sd_rtnl_message_append_ether_addr(m, IFLA_ADDRESS, &mac_host);
1736 return log_error_errno(r, "Failed to add netlink MAC address: %m");
/* Three nested containers: LINKINFO, INFO_DATA for veth, VETH_INFO_PEER. */
1738 r = sd_rtnl_message_open_container(m, IFLA_LINKINFO);
1740 return log_error_errno(r, "Failed to open netlink container: %m");
1742 r = sd_rtnl_message_open_container_union(m, IFLA_INFO_DATA, "veth");
1744 return log_error_errno(r, "Failed to open netlink container: %m");
1746 r = sd_rtnl_message_open_container(m, VETH_INFO_PEER);
1748 return log_error_errno(r, "Failed to open netlink container: %m");
/* Attributes of the peer end. */
1750 r = sd_rtnl_message_append_string(m, IFLA_IFNAME, "host0");
1752 return log_error_errno(r, "Failed to add netlink interface name: %m");
1754 r = sd_rtnl_message_append_ether_addr(m, IFLA_ADDRESS, &mac_container);
1756 return log_error_errno(r, "Failed to add netlink MAC address: %m");
1758 r = sd_rtnl_message_append_u32(m, IFLA_NET_NS_PID, pid);
1760 return log_error_errno(r, "Failed to add netlink namespace field: %m");
/* Close the three containers opened above. */
1762 r = sd_rtnl_message_close_container(m);
1764 return log_error_errno(r, "Failed to close netlink container: %m");
1766 r = sd_rtnl_message_close_container(m);
1768 return log_error_errno(r, "Failed to close netlink container: %m");
1770 r = sd_rtnl_message_close_container(m);
1772 return log_error_errno(r, "Failed to close netlink container: %m");
1774 r = sd_rtnl_call(rtnl, m, 0, NULL);
1776 return log_error_errno(r, "Failed to add new veth interfaces: %m");
1778 i = (int) if_nametoindex(iface_name);
1780 return log_error_errno(errno, "Failed to resolve interface %s: %m", iface_name);
/* Attaches the host-side veth interface veth_name to the bridge named by
 * arg_network_bridge.
 *
 * The arg_private_network, arg_network_veth and arg_network_bridge checks
 * come first (bodies not visible here; presumably early returns). The
 * bridge name is resolved with if_nametoindex(), then one RTM_SETLINK
 * message sets IFF_UP on veth_name and puts the bridge index into
 * IFLA_MASTER. Presumably the bridge index is handed back through ifi. */
1787 static int setup_bridge(const char veth_name[], int *ifi) {
1788 _cleanup_rtnl_message_unref_ sd_rtnl_message *m = NULL;
1789 _cleanup_rtnl_unref_ sd_rtnl *rtnl = NULL;
1792 if (!arg_private_network)
1795 if (!arg_network_veth)
1798 if (!arg_network_bridge)
1801 bridge = (int) if_nametoindex(arg_network_bridge);
1803 return log_error_errno(errno, "Failed to resolve interface %s: %m", arg_network_bridge);
1807 r = sd_rtnl_open(&rtnl, 0);
1809 return log_error_errno(r, "Failed to connect to netlink: %m");
1811 r = sd_rtnl_message_new_link(rtnl, &m, RTM_SETLINK, 0);
1813 return log_error_errno(r, "Failed to allocate netlink message: %m");
1815 r = sd_rtnl_message_link_set_flags(m, IFF_UP, IFF_UP);
1817 return log_error_errno(r, "Failed to set IFF_UP flag: %m");
1819 r = sd_rtnl_message_append_string(m, IFLA_IFNAME, veth_name);
1821 return log_error_errno(r, "Failed to add netlink interface name field: %m");
1823 r = sd_rtnl_message_append_u32(m, IFLA_MASTER, bridge);
1825 return log_error_errno(r, "Failed to add netlink master field: %m");
1827 r = sd_rtnl_call(rtnl, m, 0, NULL);
1829 return log_error_errno(r, "Failed to add veth interface to bridge: %m");
/* Resolves the network interface name to its index and checks it with udev.
 *
 * The index comes from if_nametoindex(). The udev device is looked up by
 * the device id n<ifindex>, and an error is logged when
 * udev_device_get_is_initialized() does not report the device as
 * initialized. Callers assign the result to an interface index variable,
 * so the index is presumably the success return value. */
1834 static int parse_interface(struct udev *udev, const char *name) {
1835 _cleanup_udev_device_unref_ struct udev_device *d = NULL;
1836 char ifi_str[2 + DECIMAL_STR_MAX(int)];
1839 ifi = (int) if_nametoindex(name);
1841 return log_error_errno(errno, "Failed to resolve interface %s: %m", name);
1843 sprintf(ifi_str, "n%i", ifi);
1844 d = udev_device_new_from_device_id(udev, ifi_str);
1846 return log_error_errno(errno, "Failed to get udev device for interface %s: %m", name);
1848 if (udev_device_get_is_initialized(d) <= 0) {
1849 log_error("Network interface %s is not initialized yet.", name);
/* Moves every interface listed in arg_network_interfaces into the network
 * namespace of process pid.
 *
 * The arg_private_network and empty-list checks come first (bodies not
 * visible here; presumably early returns). For each name the index is
 * obtained through parse_interface() and one RTM_SETLINK message carrying
 * IFLA_NET_NS_PID = pid is sent for that index. */
1856 static int move_network_interfaces(pid_t pid) {
1857 _cleanup_udev_unref_ struct udev *udev = NULL;
1858 _cleanup_rtnl_unref_ sd_rtnl *rtnl = NULL;
1862 if (!arg_private_network)
1865 if (strv_isempty(arg_network_interfaces))
1868 r = sd_rtnl_open(&rtnl, 0);
1870 return log_error_errno(r, "Failed to connect to netlink: %m");
1874 log_error("Failed to connect to udev.");
1878 STRV_FOREACH(i, arg_network_interfaces) {
1879 _cleanup_rtnl_message_unref_ sd_rtnl_message *m = NULL;
1882 ifi = parse_interface(udev, *i);
1886 r = sd_rtnl_message_new_link(rtnl, &m, RTM_SETLINK, ifi);
1888 return log_error_errno(r, "Failed to allocate netlink message: %m");
1890 r = sd_rtnl_message_append_u32(m, IFLA_NET_NS_PID, pid);
1892 return log_error_errno(r, "Failed to append namespace PID to netlink message: %m");
1894 r = sd_rtnl_call(rtnl, m, 0, NULL);
1896 return log_error_errno(r, "Failed to move interface %s to namespace: %m", *i);
/* Creates one MACVLAN interface per entry of arg_network_macvlan and places
 * it in the network namespace of process pid.
 *
 * The arg_private_network and empty-list checks come first (bodies not
 * visible here; presumably early returns). For each parent interface:
 * its index is obtained through parse_interface(), a MAC address is
 * derived with generate_mac() using MACVLAN_HASH_KEY and a counter that
 * grows by one per entry, and an RTM_NEWLINK message is sent with
 * IFLA_LINK set to the parent index, the name mv-<parent> shortened to
 * IFNAMSIZ-1 characters, IFLA_NET_NS_PID = pid, and MACVLAN_MODE_BRIDGE. */
1902 static int setup_macvlan(pid_t pid) {
1903 _cleanup_udev_unref_ struct udev *udev = NULL;
1904 _cleanup_rtnl_unref_ sd_rtnl *rtnl = NULL;
1909 if (!arg_private_network)
1912 if (strv_isempty(arg_network_macvlan))
1915 r = sd_rtnl_open(&rtnl, 0);
1917 return log_error_errno(r, "Failed to connect to netlink: %m");
1921 log_error("Failed to connect to udev.");
1925 STRV_FOREACH(i, arg_network_macvlan) {
1926 _cleanup_rtnl_message_unref_ sd_rtnl_message *m = NULL;
1927 _cleanup_free_ char *n = NULL;
1928 struct ether_addr mac;
1931 ifi = parse_interface(udev, *i);
1935 r = generate_mac(&mac, MACVLAN_HASH_KEY, idx++);
1937 return log_error_errno(r, "Failed to create MACVLAN MAC address: %m");
1939 r = sd_rtnl_message_new_link(rtnl, &m, RTM_NEWLINK, 0);
1941 return log_error_errno(r, "Failed to allocate netlink message: %m");
1943 r = sd_rtnl_message_append_u32(m, IFLA_LINK, ifi);
1945 return log_error_errno(r, "Failed to add netlink interface index: %m");
/* New interface name: mv- prefix plus the parent name, cut to fit. */
1947 n = strappend("mv-", *i);
1951 strshorten(n, IFNAMSIZ-1);
1953 r = sd_rtnl_message_append_string(m, IFLA_IFNAME, n);
1955 return log_error_errno(r, "Failed to add netlink interface name: %m");
1957 r = sd_rtnl_message_append_ether_addr(m, IFLA_ADDRESS, &mac);
1959 return log_error_errno(r, "Failed to add netlink MAC address: %m");
1961 r = sd_rtnl_message_append_u32(m, IFLA_NET_NS_PID, pid);
1963 return log_error_errno(r, "Failed to add netlink namespace field: %m");
1965 r = sd_rtnl_message_open_container(m, IFLA_LINKINFO);
1967 return log_error_errno(r, "Failed to open netlink container: %m");
1969 r = sd_rtnl_message_open_container_union(m, IFLA_INFO_DATA, "macvlan");
1971 return log_error_errno(r, "Failed to open netlink container: %m");
1973 r = sd_rtnl_message_append_u32(m, IFLA_MACVLAN_MODE, MACVLAN_MODE_BRIDGE);
1975 return log_error_errno(r, "Failed to append macvlan mode: %m");
1977 r = sd_rtnl_message_close_container(m);
1979 return log_error_errno(r, "Failed to close netlink container: %m");
1981 r = sd_rtnl_message_close_container(m);
1983 return log_error_errno(r, "Failed to close netlink container: %m");
1985 r = sd_rtnl_call(rtnl, m, 0, NULL);
1987 return log_error_errno(r, "Failed to add new macvlan interfaces: %m");
/* Builds and loads a seccomp filter.
 *
 * The filter is created with SCMP_ACT_ALLOW as default action, then:
 *  - secondary architectures are added with seccomp_add_secondary_archs(),
 *  - every syscall in the blacklist table gets an EPERM rule (a rule that
 *    cannot be added for an unknown syscall is skipped),
 *  - a rule returning EAFNOSUPPORT is added for a call whose first
 *    argument equals AF_NETLINK and whose third equals NETLINK_AUDIT,
 *  - SCMP_FLTATR_CTL_NNP is set to 0 before seccomp_load().
 * The filter context is released with seccomp_release() at the end. */
1993 static int setup_seccomp(void) {
1996 static const int blacklist[] = {
1997 SCMP_SYS(kexec_load),
1998 SCMP_SYS(open_by_handle_at),
1999 SCMP_SYS(init_module),
2000 SCMP_SYS(finit_module),
2001 SCMP_SYS(delete_module),
2008 scmp_filter_ctx seccomp;
2012 seccomp = seccomp_init(SCMP_ACT_ALLOW);
2016 r = seccomp_add_secondary_archs(seccomp);
2018 log_error_errno(r, "Failed to add secondary archs to seccomp filter: %m");
2022 for (i = 0; i < ELEMENTSOF(blacklist); i++) {
2023 r = seccomp_rule_add(seccomp, SCMP_ACT_ERRNO(EPERM), blacklist[i], 0);
2025 continue; /* unknown syscall */
2027 log_error_errno(r, "Failed to block syscall: %m");
2033 Audit is broken in containers, much of the userspace audit
2034 hookup will fail if running inside a container. We don't
2035 care and just turn off creation of audit sockets.
2037 This will make socket(AF_NETLINK, *, NETLINK_AUDIT) fail
2038 with EAFNOSUPPORT which audit userspace uses as indication
2039 that audit is disabled in the kernel.
2042 r = seccomp_rule_add(
2044 SCMP_ACT_ERRNO(EAFNOSUPPORT),
2047 SCMP_A0(SCMP_CMP_EQ, AF_NETLINK),
2048 SCMP_A2(SCMP_CMP_EQ, NETLINK_AUDIT));
2050 log_error_errno(r, "Failed to add audit seccomp rule: %m");
2054 r = seccomp_attr_set(seccomp, SCMP_FLTATR_CTL_NNP, 0);
2056 log_error_errno(r, "Failed to unset NO_NEW_PRIVS: %m");
2060 r = seccomp_load(seccomp);
2062 log_error_errno(r, "Failed to install seccomp audit filter: %m");
2065 seccomp_release(seccomp);
/* Opens the disk image arg_image and makes it available as a block device.
 *
 * The image is opened read-only when arg_read_only is set and read-write
 * otherwise. A block device is used directly (its path is duplicated with
 * strdup(); the rest of that branch is not visible in this listing). A
 * regular file is attached to a free loop device: the number comes from
 * LOOP_CTL_GET_FREE on /dev/loop-control, the node /dev/loop<nr> is opened,
 * bound with LOOP_SET_FD and configured through LOOP_SET_STATUS64 with
 * LO_FLAGS_AUTOCLEAR|LO_FLAGS_PARTSCAN (LO_FLAGS_READ_ONLY may be added).
 * The device path is handed to the caller through *device_path.
 *
 * Anything that is neither a block device nor a regular file is rejected.
 * No system call has failed at that point, so the message is logged
 * without an errno value: errno would be stale and its text misleading. */
2073 static int setup_image(char **device_path, int *loop_nr) {
2074 struct loop_info64 info = {
2075 .lo_flags = LO_FLAGS_AUTOCLEAR|LO_FLAGS_PARTSCAN
2077 _cleanup_close_ int fd = -1, control = -1, loop = -1;
2078 _cleanup_free_ char* loopdev = NULL;
2082 assert(device_path);
2086 fd = open(arg_image, O_CLOEXEC|(arg_read_only ? O_RDONLY : O_RDWR)|O_NONBLOCK|O_NOCTTY);
2088 return log_error_errno(errno, "Failed to open %s: %m", arg_image);
2090 if (fstat(fd, &st) < 0)
2091 return log_error_errno(errno, "Failed to stat %s: %m", arg_image);
2093 if (S_ISBLK(st.st_mode)) {
2096 p = strdup(arg_image);
2110 if (!S_ISREG(st.st_mode)) {
2111 log_error("%s is not a regular file or block device.", arg_image);
/* Ask the loop control device for an unused loop device number. */
2115 control = open("/dev/loop-control", O_RDWR|O_CLOEXEC|O_NOCTTY|O_NONBLOCK);
2117 return log_error_errno(errno, "Failed to open /dev/loop-control: %m");
2119 nr = ioctl(control, LOOP_CTL_GET_FREE);
2121 return log_error_errno(errno, "Failed to allocate loop device: %m");
2123 if (asprintf(&loopdev, "/dev/loop%i", nr) < 0)
2126 loop = open(loopdev, O_CLOEXEC|(arg_read_only ? O_RDONLY : O_RDWR)|O_NONBLOCK|O_NOCTTY);
2128 return log_error_errno(errno, "Failed to open loop device %s: %m", loopdev);
/* Bind the image file to the loop device, then apply the flags. */
2130 if (ioctl(loop, LOOP_SET_FD, fd) < 0)
2131 return log_error_errno(errno, "Failed to set loopback file descriptor on %s: %m", loopdev);
2134 info.lo_flags |= LO_FLAGS_READ_ONLY;
2136 if (ioctl(loop, LOOP_SET_STATUS64, &info) < 0)
2137 return log_error_errno(errno, "Failed to set loopback settings on %s: %m", loopdev);
2139 *device_path = loopdev;
/* Inspects the partition table of the image behind fd and picks the
 * partitions to mount as root, /home and /srv.
 *
 * Steps visible below:
 *  - libblkid probes the device with partition details enabled; a probe
 *    result of -2 or 1 and a PTTYPE other than gpt are both rejected with a
 *    pointer to the Discoverable Partitions Specification,
 *  - udev enumerates the child devices of the block device,
 *  - for each child, the matching libblkid partition is looked up by device
 *    number; partitions flagged GPT_FLAG_NO_AUTO are tested for, and the
 *    GPT type string is parsed into an id,
 *  - the id is compared with GPT_HOME, GPT_SRV and, when defined,
 *    GPT_ROOT_NATIVE and GPT_ROOT_SECONDARY; for each kind a candidate is
 *    compared by partition number against the one already found, and the
 *    inverse of GPT_FLAG_READ_ONLY is recorded as its writability.
 * The native root is tested before the secondary root when filling
 * *root_device. Results are returned through the *_device and
 * *_device_rw out-parameters. Without blkid support only an error is
 * logged. */
2150 static int dissect_image(
2152 char **root_device, bool *root_device_rw,
2153 char **home_device, bool *home_device_rw,
2154 char **srv_device, bool *srv_device_rw,
2158 int home_nr = -1, srv_nr = -1;
2159 #ifdef GPT_ROOT_NATIVE
2162 #ifdef GPT_ROOT_SECONDARY
2163 int secondary_root_nr = -1;
2166 _cleanup_free_ char *home = NULL, *root = NULL, *secondary_root = NULL, *srv = NULL;
2167 _cleanup_udev_enumerate_unref_ struct udev_enumerate *e = NULL;
2168 _cleanup_udev_device_unref_ struct udev_device *d = NULL;
2169 _cleanup_blkid_free_probe_ blkid_probe b = NULL;
2170 _cleanup_udev_unref_ struct udev *udev = NULL;
2171 struct udev_list_entry *first, *item;
2172 bool home_rw = true, root_rw = true, secondary_root_rw = true, srv_rw = true;
2173 const char *pttype = NULL;
2179 assert(root_device);
2180 assert(home_device);
2185 b = blkid_new_probe();
2190 r = blkid_probe_set_device(b, fd, 0, 0);
2195 log_error_errno(errno, "Failed to set device on blkid probe: %m");
2199 blkid_probe_enable_partitions(b, 1);
2200 blkid_probe_set_partitions_flags(b, BLKID_PARTS_ENTRY_DETAILS);
2203 r = blkid_do_safeprobe(b);
/* -2 and 1 are both treated as: no usable partition table identified. */
2204 if (r == -2 || r == 1) {
2205 log_error("Failed to identify any partition table on %s.\n"
2206 "Note that the disk image needs to follow http://www.freedesktop.org/wiki/Specifications/DiscoverablePartitionsSpec/ to be supported by systemd-nspawn.", arg_image);
2208 } else if (r != 0) {
2211 log_error_errno(errno, "Failed to probe: %m");
2215 blkid_probe_lookup_value(b, "PTTYPE", &pttype, NULL);
2216 if (!streq_ptr(pttype, "gpt")) {
2217 log_error("Image %s does not carry a GUID Partition Table.\n"
2218 "Note that the disk image needs to follow http://www.freedesktop.org/wiki/Specifications/DiscoverablePartitionsSpec/ to be supported by systemd-nspawn.", arg_image);
2223 pl = blkid_probe_get_partitions(b);
2228 log_error("Failed to list partitions of %s", arg_image);
2236 if (fstat(fd, &st) < 0)
2237 return log_error_errno(errno, "Failed to stat block device: %m");
2239 d = udev_device_new_from_devnum(udev, 'b', st.st_rdev);
2243 e = udev_enumerate_new(udev);
2247 r = udev_enumerate_add_match_parent(e, d);
2251 r = udev_enumerate_scan_devices(e);
2253 return log_error_errno(r, "Failed to scan for partition devices of %s: %m", arg_image);
2255 first = udev_enumerate_get_list_entry(e);
2256 udev_list_entry_foreach(item, first) {
2257 _cleanup_udev_device_unref_ struct udev_device *q;
2258 const char *stype, *node;
2259 unsigned long long flags;
2266 q = udev_device_new_from_syspath(udev, udev_list_entry_get_name(item));
2271 log_error_errno(errno, "Failed to get partition device of %s: %m", arg_image);
2275 qn = udev_device_get_devnum(q);
/* Same device number as the whole disk: this entry is not a partition. */
2279 if (st.st_rdev == qn)
2282 node = udev_device_get_devnode(q);
2286 pp = blkid_partlist_devno_to_partition(pl, qn);
2290 flags = blkid_partition_get_flags(pp);
2291 if (flags & GPT_FLAG_NO_AUTO)
2294 nr = blkid_partition_get_partno(pp);
2298 stype = blkid_partition_get_type_string(pp);
2302 if (sd_id128_from_string(stype, &type_id) < 0)
2305 if (sd_id128_equal(type_id, GPT_HOME)) {
2307 if (home && nr >= home_nr)
2311 home_rw = !(flags & GPT_FLAG_READ_ONLY);
2314 home = strdup(node);
2317 } else if (sd_id128_equal(type_id, GPT_SRV)) {
2319 if (srv && nr >= srv_nr)
2323 srv_rw = !(flags & GPT_FLAG_READ_ONLY);
2330 #ifdef GPT_ROOT_NATIVE
2331 else if (sd_id128_equal(type_id, GPT_ROOT_NATIVE)) {
2333 if (root && nr >= root_nr)
2337 root_rw = !(flags & GPT_FLAG_READ_ONLY);
2340 root = strdup(node);
2345 #ifdef GPT_ROOT_SECONDARY
2346 else if (sd_id128_equal(type_id, GPT_ROOT_SECONDARY)) {
2348 if (secondary_root && nr >= secondary_root_nr)
2351 secondary_root_nr = nr;
2352 secondary_root_rw = !(flags & GPT_FLAG_READ_ONLY);
2355 free(secondary_root);
2356 secondary_root = strdup(node);
2357 if (!secondary_root)
2363 if (!root && !secondary_root) {
2364 log_error("Failed to identify root partition in disk image %s.\n"
2365 "Note that the disk image needs to follow http://www.freedesktop.org/wiki/Specifications/DiscoverablePartitionsSpec/ to be supported by systemd-nspawn.", arg_image);
/* Hand the selected device paths and their writability to the caller. */
2370 *root_device = root;
2373 *root_device_rw = root_rw;
2375 } else if (secondary_root) {
2376 *root_device = secondary_root;
2377 secondary_root = NULL;
2379 *root_device_rw = secondary_root_rw;
2384 *home_device = home;
2387 *home_device_rw = home_rw;
2394 *srv_device_rw = srv_rw;
2399 log_error("--image= is not supported, compiled without blkid support.");
/* Mounts the block device `what` below `where`.
 *
 * The mount point is `where` with `directory` appended via strappenda().
 * The file system type is detected with a libblkid superblock probe and the
 * device is mounted with MS_NODEV, plus MS_RDONLY when rw is false.
 * crypto_LUKS volumes are refused. Without blkid support only an error is
 * logged.
 *
 * blkid_do_safeprobe() returns 1 when nothing was detected and -2 when the
 * result is ambivalent; both mean the type cannot be determined, matching
 * the check used in dissect_image(). A result of -1 is a genuine probing
 * error and is reported through the errno branch instead. */
2404 static int mount_device(const char *what, const char *where, const char *directory, bool rw) {
2406 _cleanup_blkid_free_probe_ blkid_probe b = NULL;
2407 const char *fstype, *p;
2417 p = strappenda(where, directory);
2422 b = blkid_new_probe_from_filename(what);
2426 log_error_errno(errno, "Failed to allocate prober for %s: %m", what);
2430 blkid_probe_enable_superblocks(b, 1);
2431 blkid_probe_set_superblocks_flags(b, BLKID_SUBLKS_TYPE);
2434 r = blkid_do_safeprobe(b);
2435 if (r == -2 || r == 1) {
2436 log_error("Cannot determine file system type of %s", what);
2438 } else if (r != 0) {
2441 log_error_errno(errno, "Failed to probe %s: %m", what);
2446 if (blkid_probe_lookup_value(b, "TYPE", &fstype, NULL) < 0) {
2449 log_error("Failed to determine file system type of %s", what);
2453 if (streq(fstype, "crypto_LUKS")) {
2454 log_error("nspawn currently does not support LUKS disk images.");
2458 if (mount(what, p, fstype, MS_NODEV|(rw ? 0 : MS_RDONLY), NULL) < 0)
2459 return log_error_errno(errno, "Failed to mount %s: %m", what);
2463 log_error("--image= is not supported, compiled without blkid support.");
/* Mounts the dissected partitions into the container tree via
 * mount_device(): the root device on arg_directory itself, the home device
 * on arg_directory plus /home, and the srv device on arg_directory plus
 * /srv, each with its own read-write flag. The conditions guarding the
 * individual mounts are not visible in this listing. A failure is logged
 * and returned. */
2468 static int mount_devices(
2470 const char *root_device, bool root_device_rw,
2471 const char *home_device, bool home_device_rw,
2472 const char *srv_device, bool srv_device_rw) {
2478 r = mount_device(root_device, arg_directory, NULL, root_device_rw);
2480 return log_error_errno(r, "Failed to mount root directory: %m");
2484 r = mount_device(home_device, arg_directory, "/home", home_device_rw);
2486 return log_error_errno(r, "Failed to mount home directory: %m");
2490 r = mount_device(srv_device, arg_directory, "/srv", srv_device_rw);
2492 return log_error_errno(r, "Failed to mount server data directory: %m");
/* Best-effort teardown of loop device number nr.
 *
 * When image_fd points at an open descriptor, LOOP_CLR_FD is issued on it
 * and the descriptor is closed, with *image_fd overwritten by the result
 * of safe_close(). Then /dev/loop-control is opened and LOOP_CTL_REMOVE is
 * issued for nr. Failures are only logged as warnings; nothing is
 * returned. */
2498 static void loop_remove(int nr, int *image_fd) {
2499 _cleanup_close_ int control = -1;
2505 if (image_fd && *image_fd >= 0) {
2506 r = ioctl(*image_fd, LOOP_CLR_FD);
2508 log_warning_errno(errno, "Failed to close loop image: %m");
2509 *image_fd = safe_close(*image_fd);
2512 control = open("/dev/loop-control", O_RDWR|O_CLOEXEC|O_NOCTTY|O_NONBLOCK);
2514 log_warning_errno(errno, "Failed to open /dev/loop-control: %m");
2518 r = ioctl(control, LOOP_CTL_REMOVE, nr);
2520 log_warning_errno(errno, "Failed to remove loop %d: %m", nr);
/* Runs getent for `database` and `key` in a child process whose standard
 * output is connected to a pipe.
 *
 * In the child: the write end of the pipe becomes stdout, /dev/null
 * becomes stdin and stderr, signal handlers are reset, all other file
 * descriptors are closed, and getent is executed with an empty
 * environment from /usr/bin, falling back to /bin. Any failure there ends
 * the child with EXIT_FAILURE.
 *
 * In the parent the write end is closed. Callers use the return value as
 * the descriptor to read from, and presumably the child pid is stored in
 * *rpid (that assignment is not visible in this listing). */
2523 static int spawn_getent(const char *database, const char *key, pid_t *rpid) {
2531 if (pipe2(pipe_fds, O_CLOEXEC) < 0)
2532 return log_error_errno(errno, "Failed to allocate pipe: %m");
2536 return log_error_errno(errno, "Failed to fork getent child: %m");
2537 else if (pid == 0) {
2539 char *empty_env = NULL;
2541 if (dup3(pipe_fds[1], STDOUT_FILENO, 0) < 0)
2542 _exit(EXIT_FAILURE);
/* Only close the pipe ends when they are not one of the stdio slots. */
2544 if (pipe_fds[0] > 2)
2545 safe_close(pipe_fds[0]);
2546 if (pipe_fds[1] > 2)
2547 safe_close(pipe_fds[1]);
2549 nullfd = open("/dev/null", O_RDWR);
2551 _exit(EXIT_FAILURE);
2553 if (dup3(nullfd, STDIN_FILENO, 0) < 0)
2554 _exit(EXIT_FAILURE);
2556 if (dup3(nullfd, STDERR_FILENO, 0) < 0)
2557 _exit(EXIT_FAILURE);
2562 reset_all_signal_handlers();
2563 close_all_fds(NULL, 0);
/* The second execle() is only reached when the first one failed. */
2565 execle("/usr/bin/getent", "getent", database, key, NULL, &empty_env);
2566 execle("/bin/getent", "getent", database, key, NULL, &empty_env);
2567 _exit(EXIT_FAILURE);
2570 pipe_fds[1] = safe_close(pipe_fds[1]);
/* Switches the calling process to the user named by arg_user.
 *
 * When arg_user is unset, root or 0: supplementary groups are cleared and
 * all group and user IDs are set to 0.
 *
 * Otherwise the passwd entry and the supplementary groups are obtained by
 * running getent (databases passwd and initgroups) through spawn_getent()
 * and parsing one output line each. The home directory is created with
 * mkdir_parents() and mkdir_safe(), stdin, stdout and stderr are chowned
 * to the user on a best-effort basis (the fchown() results are ignored),
 * and finally setgroups(), setresgid() and setresuid() are applied in that
 * order.
 *
 * The error messages name the system calls that are actually used here,
 * setresgid() and setresuid(), so a logged failure can be matched to the
 * failing call. */
2577 static int change_uid_gid(char **_home) {
2578 char line[LINE_MAX], *x, *u, *g, *h;
2579 const char *word, *state;
2580 _cleanup_free_ uid_t *uids = NULL;
2581 _cleanup_free_ char *home = NULL;
2582 _cleanup_fclose_ FILE *f = NULL;
2583 _cleanup_close_ int fd = -1;
2584 unsigned n_uids = 0;
2593 if (!arg_user || streq(arg_user, "root") || streq(arg_user, "0")) {
2594 /* Reset everything fully to 0, just in case */
2596 if (setgroups(0, NULL) < 0)
2597 return log_error_errno(errno, "setgroups() failed: %m");
2599 if (setresgid(0, 0, 0) < 0)
2600 return log_error_errno(errno, "setresgid() failed: %m");
2602 if (setresuid(0, 0, 0) < 0)
2603 return log_error_errno(errno, "setresuid() failed: %m");
2609 /* First, get user credentials */
2610 fd = spawn_getent("passwd", arg_user, &pid);
2614 f = fdopen(fd, "r");
2619 if (!fgets(line, sizeof(line), f)) {
2622 log_error("Failed to resolve user %s.", arg_user);
2626 log_error_errno(errno, "Failed to read from getent: %m");
2632 wait_for_terminate_and_warn("getent passwd", pid, true);
/* Walk the colon-separated passwd fields of the line just read. */
2634 x = strchr(line, ':');
2636 log_error("/etc/passwd entry has invalid user field.");
2640 u = strchr(x+1, ':');
2642 log_error("/etc/passwd entry has invalid password field.");
2649 log_error("/etc/passwd entry has invalid UID field.");
2657 log_error("/etc/passwd entry has invalid GID field.");
2662 h = strchr(x+1, ':');
2664 log_error("/etc/passwd entry has invalid GECOS field.");
2671 log_error("/etc/passwd entry has invalid home directory field.");
2677 r = parse_uid(u, &uid);
2679 log_error("Failed to parse UID of user.");
2683 r = parse_gid(g, &gid);
2685 log_error("Failed to parse GID of user.");
2693 /* Second, get group memberships */
2694 fd = spawn_getent("initgroups", arg_user, &pid);
2699 f = fdopen(fd, "r");
2704 if (!fgets(line, sizeof(line), f)) {
2706 log_error("Failed to resolve user %s.", arg_user);
2710 log_error_errno(errno, "Failed to read from getent: %m");
2716 wait_for_terminate_and_warn("getent initgroups", pid, true);
2718 /* Skip over the username and subsequent separator whitespace */
2720 x += strcspn(x, WHITESPACE);
2721 x += strspn(x, WHITESPACE);
2723 FOREACH_WORD(word, l, x, state) {
2729 if (!GREEDY_REALLOC(uids, sz, n_uids+1))
2732 r = parse_uid(c, &uids[n_uids++]);
2734 log_error("Failed to parse group data from getent.");
2739 r = mkdir_parents(home, 0775);
2741 return log_error_errno(r, "Failed to make home root directory: %m");
2743 r = mkdir_safe(home, 0755, uid, gid);
2744 if (r < 0 && r != -EEXIST)
2745 return log_error_errno(r, "Failed to make home directory: %m");
/* Best effort only: the results of these three calls are not checked. */
2747 fchown(STDIN_FILENO, uid, gid);
2748 fchown(STDOUT_FILENO, uid, gid);
2749 fchown(STDERR_FILENO, uid, gid);
2751 if (setgroups(n_uids, uids) < 0)
2752 return log_error_errno(errno, "Failed to set auxiliary groups: %m");
2754 if (setresgid(gid, gid, gid) < 0)
2755 return log_error_errno(errno, "setresgid() failed: %m");
2757 if (setresuid(uid, uid, uid) < 0)
2758 return log_error_errno(errno, "setresuid() failed: %m");
2770 * < 0 : wait_for_terminate() failed to get the state of the
2771 * container, the container was terminated by a signal, or
2772 * failed for an unknown reason. No change is made to the
2773 * container argument.
2774 * > 0 : The program executed in the container terminated with an
2775 * error. The exit code of the program executed in the
2776 * container is returned. The container argument has been set
2777 * to CONTAINER_TERMINATED.
2778 * 0 : The container is being rebooted, has been shut down or exited
2779 * successfully. The container argument has been set to either
2780 * CONTAINER_TERMINATED or CONTAINER_REBOOTED.
2782 * That is, success is indicated by a return value of zero, and an
2783 * error is indicated by a non-zero value.
/* Waits for the container process pid with wait_for_terminate() and turns
 * the resulting siginfo into a status.
 *
 * Visible handling, selected by status.si_code (the case labels are not
 * part of this listing):
 *  - an exit is logged as success for status 0 and as failure otherwise,
 *    *container becomes CONTAINER_TERMINATED and the exit status is
 *    returned,
 *  - signal status SIGINT is logged as a shutdown and sets
 *    CONTAINER_TERMINATED,
 *  - signal status SIGHUP is logged as a reboot and sets
 *    CONTAINER_REBOOTED,
 *  - any other signal, and any other si_code, is logged as an error.
 * Informational messages are logged at debug level when arg_quiet is set. */
2785 static int wait_for_container(pid_t pid, ContainerStatus *container) {
2789 r = wait_for_terminate(pid, &status);
2791 return log_warning_errno(r, "Failed to wait for container: %m");
2793 switch (status.si_code) {
2796 if (status.si_status == 0) {
2797 log_full(arg_quiet ? LOG_DEBUG : LOG_INFO, "Container %s exited successfully.", arg_machine);
2800 log_full(arg_quiet ? LOG_DEBUG : LOG_INFO, "Container %s failed with error code %i.", arg_machine, status.si_status);
2802 *container = CONTAINER_TERMINATED;
2803 return status.si_status;
2806 if (status.si_status == SIGINT) {
2808 log_full(arg_quiet ? LOG_DEBUG : LOG_INFO, "Container %s has been shut down.", arg_machine);
2809 *container = CONTAINER_TERMINATED;
2812 } else if (status.si_status == SIGHUP) {
2814 log_full(arg_quiet ? LOG_DEBUG : LOG_INFO, "Container %s is being rebooted.", arg_machine);
2815 *container = CONTAINER_REBOOTED;
2819 /* CLD_KILLED fallthrough */
2822 log_error("Container %s terminated by signal %s.", arg_machine, signal_to_string(status.si_status));
2826 log_error("Container %s failed due to unknown reason.", arg_machine);
/* Signal handler that does nothing. main() installs it for SIGCHLD, where
 * the accompanying comment explains that the signal should interrupt
 * blocking calls of the parent. */
2833 static void nop_handler(int sig) {}
/* Signal event callback that tries to halt the container gracefully.
 *
 * userdata carries the container pid encoded as an integer. If sending
 * SIGRTMIN+3 to that pid succeeds, a notice is logged and the userdata of
 * the event source is cleared, so a later invocation decodes NULL instead
 * of the pid. The sd_event_exit() call ends the event loop with code 0;
 * the control flow joining the two parts is not visible in this listing. */
2835 static int on_orderly_shutdown(sd_event_source *s, const struct signalfd_siginfo *si, void *userdata) {
2838 pid = PTR_TO_UINT32(userdata);
2840 if (kill(pid, SIGRTMIN+3) >= 0) {
2841 log_info("Trying to halt container. Send SIGTERM again to trigger immediate termination.");
2842 sd_event_source_set_userdata(s, NULL);
2847 sd_event_exit(sd_event_source_get_event(s), 0);
/* Fills in arg_directory and arg_machine when they were not given.
 *
 * With neither arg_image nor arg_directory set, the directory becomes
 * either /var/lib/container/<arg_machine> or the current working
 * directory (the condition choosing between the two is not visible in
 * this listing); failure to obtain one is logged with a hint to use -D.
 *
 * The machine name is derived from the basename of arg_image, or of
 * arg_directory when no image is set, cleaned with hostname_cleanup() and
 * checked with machine_name_is_valid(); an invalid name is logged with a
 * hint to use -M. */
2851 static int determine_names(void) {
2853 if (!arg_image && !arg_directory) {
2855 arg_directory = strappend("/var/lib/container/", arg_machine);
2857 arg_directory = get_current_dir_name();
2859 if (!arg_directory) {
2860 log_error("Failed to determine path, please use -D.");
2866 arg_machine = strdup(basename(arg_image ?: arg_directory));
2870 hostname_cleanup(arg_machine, false);
2871 if (!machine_name_is_valid(arg_machine)) {
2872 log_error("Failed to determine machine name automatically, please use -M.");
2880 int main(int argc, char *argv[]) {
2882 _cleanup_free_ char *device_path = NULL, *root_device = NULL, *home_device = NULL, *srv_device = NULL;
2883 bool root_device_rw = true, home_device_rw = true, srv_device_rw = true;
2884 _cleanup_close_ int master = -1, image_fd = -1;
2885 _cleanup_close_pair_ int kmsg_socket_pair[2] = { -1, -1 };
2886 _cleanup_fdset_free_ FDSet *fds = NULL;
2887 int r, n_fd_passed, loop_nr = -1;
2888 const char *console = NULL;
2889 char veth_name[IFNAMSIZ];
2890 bool secondary = false, remove_subvol = false;
2891 sigset_t mask, mask_chld;
2893 int ret = EXIT_SUCCESS;
2895 log_parse_environment();
2898 r = parse_argv(argc, argv);
2902 r = determine_names();
2906 if (geteuid() != 0) {
2907 log_error("Need to be root.");
2912 if (sd_booted() <= 0) {
2913 log_error("Not running on a systemd system.");
2919 n_fd_passed = sd_listen_fds(false);
2920 if (n_fd_passed > 0) {
2921 r = fdset_new_listen_fds(&fds, false);
2923 log_error_errno(r, "Failed to collect file descriptors: %m");
2927 fdset_close_others(fds);
2930 if (arg_directory) {
2933 if (path_equal(arg_directory, "/")) {
2934 log_error("Spawning container on root directory not supported.");
2940 r = btrfs_subvol_snapshot(arg_template, arg_directory, arg_read_only, true);
2943 log_info("Directory %s already exists, not populating from template %s.", arg_directory, arg_template);
2945 log_error_errno(r, "Couldn't create snapshort %s from %s: %m", arg_directory, arg_template);
2949 log_info("Populated %s from template %s.", arg_directory, arg_template);
2952 } else if (arg_ephemeral) {
2955 r = tempfn_random(arg_directory, &np);
2957 log_error_errno(r, "Failed to generate name for snapshot: %m");
2961 r = btrfs_subvol_snapshot(arg_directory, np, arg_read_only, true);
2964 log_error_errno(r, "Failed to create snapshot %s from %s: %m", np, arg_directory);
2968 free(arg_directory);
2971 remove_subvol = true;
2975 if (path_is_os_tree(arg_directory) <= 0) {
2976 log_error("Directory %s doesn't look like an OS root directory (os-release file is missing). Refusing.", arg_directory);
2983 p = strappenda(arg_directory,
2984 argc > optind && path_is_absolute(argv[optind]) ? argv[optind] : "/usr/bin/");
2985 if (access(p, F_OK) < 0) {
2986 log_error("Directory %s lacks the binary to execute or doesn't look like a binary tree. Refusing.", arg_directory);
2993 char template[] = "/tmp/nspawn-root-XXXXXX";
2996 assert(!arg_template);
2998 if (!mkdtemp(template)) {
2999 log_error_errno(errno, "Failed to create temporary directory: %m");
3004 arg_directory = strdup(template);
3005 if (!arg_directory) {
3010 image_fd = setup_image(&device_path, &loop_nr);
3016 r = dissect_image(image_fd,
3017 &root_device, &root_device_rw,
3018 &home_device, &home_device_rw,
3019 &srv_device, &srv_device_rw,
3025 master = posix_openpt(O_RDWR|O_NOCTTY|O_CLOEXEC|O_NDELAY);
3027 r = log_error_errno(errno, "Failed to acquire pseudo tty: %m");
3031 console = ptsname(master);
3033 r = log_error_errno(errno, "Failed to determine tty name: %m");
3038 log_info("Spawning container %s on %s.\nPress ^] three times within 1s to kill container.",
3039 arg_machine, arg_image ?: arg_directory);
3041 if (unlockpt(master) < 0) {
3042 r = log_error_errno(errno, "Failed to unlock tty: %m");
3046 if (socketpair(AF_UNIX, SOCK_DGRAM|SOCK_NONBLOCK|SOCK_CLOEXEC, 0, kmsg_socket_pair) < 0) {
3047 r = log_error_errno(errno, "Failed to create kmsg socket pair: %m");
3053 "STATUS=Container running.");
3055 assert_se(sigemptyset(&mask) == 0);
3056 sigset_add_many(&mask, SIGCHLD, SIGWINCH, SIGTERM, SIGINT, -1);
3057 assert_se(sigprocmask(SIG_BLOCK, &mask, NULL) == 0);
3059 assert_se(sigemptyset(&mask_chld) == 0);
3060 assert_se(sigaddset(&mask_chld, SIGCHLD) == 0);
3063 ContainerStatus container_status;
3064 _cleanup_(barrier_destroy) Barrier barrier = BARRIER_NULL;
3065 struct sigaction sa = {
3066 .sa_handler = nop_handler,
3067 .sa_flags = SA_NOCLDSTOP,
3070 r = barrier_create(&barrier);
3072 log_error_errno(r, "Cannot initialize IPC barrier: %m");
3076 /* Child can be killed before execv(), so handle SIGCHLD
3077 * in order to interrupt parent's blocking calls and
3078 * give it a chance to call wait() and terminate. */
3079 r = sigprocmask(SIG_UNBLOCK, &mask_chld, NULL);
3081 r = log_error_errno(errno, "Failed to change the signal mask: %m");
3085 r = sigaction(SIGCHLD, &sa, NULL);
3087 r = log_error_errno(errno, "Failed to install SIGCHLD handler: %m");
3091 pid = syscall(__NR_clone, SIGCHLD|CLONE_NEWNS|
3092 (arg_share_system ? 0 : CLONE_NEWIPC|CLONE_NEWPID|CLONE_NEWUTS)|
3093 (arg_private_network ? CLONE_NEWNET : 0), NULL);
3095 if (errno == EINVAL)
3096 r = log_error_errno(errno, "clone() failed, do you have namespace support enabled in your kernel? (You need UTS, IPC, PID and NET namespacing built in): %m");
3098 r = log_error_errno(errno, "clone() failed: %m");
3105 _cleanup_free_ char *home = NULL;
3107 const char *envp[] = {
3108 "PATH=" DEFAULT_PATH_SPLIT_USR,
3109 "container=systemd-nspawn", /* LXC sets container=lxc, so follow the scheme here */
3114 NULL, /* container_uuid */
3115 NULL, /* LISTEN_FDS */
3116 NULL, /* LISTEN_PID */
3121 barrier_set_role(&barrier, BARRIER_CHILD);
3123 envp[n_env] = strv_find_prefix(environ, "TERM=");
3127 master = safe_close(master);
3129 close_nointr(STDIN_FILENO);
3130 close_nointr(STDOUT_FILENO);
3131 close_nointr(STDERR_FILENO);
3133 kmsg_socket_pair[0] = safe_close(kmsg_socket_pair[0]);
3135 reset_all_signal_handlers();
3136 reset_signal_mask();
3138 r = open_terminal(console, O_RDWR);
3139 if (r != STDIN_FILENO) {
3145 log_error_errno(r, "Failed to open console: %m");
3146 _exit(EXIT_FAILURE);
3149 if (dup2(STDIN_FILENO, STDOUT_FILENO) != STDOUT_FILENO ||
3150 dup2(STDIN_FILENO, STDERR_FILENO) != STDERR_FILENO) {
3151 log_error_errno(errno, "Failed to duplicate console: %m");
3152 _exit(EXIT_FAILURE);
3156 log_error_errno(errno, "setsid() failed: %m");
3157 _exit(EXIT_FAILURE);
3160 if (reset_audit_loginuid() < 0)
3161 _exit(EXIT_FAILURE);
3163 if (prctl(PR_SET_PDEATHSIG, SIGKILL) < 0) {
3164 log_error_errno(errno, "PR_SET_PDEATHSIG failed: %m");
3165 _exit(EXIT_FAILURE);
3168 /* Mark everything as slave, so that we still
3169 * receive mounts from the real root, but don't
3170 * propagate mounts to the real root. */
3171 if (mount(NULL, "/", NULL, MS_SLAVE|MS_REC, NULL) < 0) {
3172 log_error_errno(errno, "MS_SLAVE|MS_REC failed: %m");
3173 _exit(EXIT_FAILURE);
3176 if (mount_devices(arg_directory,
3177 root_device, root_device_rw,
3178 home_device, home_device_rw,
3179 srv_device, srv_device_rw) < 0)
3180 _exit(EXIT_FAILURE);
3182 /* Turn directory into bind mount */
3183 if (mount(arg_directory, arg_directory, "bind", MS_BIND|MS_REC, NULL) < 0) {
3184 log_error_errno(errno, "Failed to make bind mount: %m");
3185 _exit(EXIT_FAILURE);
3188 r = setup_volatile(arg_directory);
3190 _exit(EXIT_FAILURE);
3192 if (setup_volatile_state(arg_directory) < 0)
3193 _exit(EXIT_FAILURE);
3195 r = base_filesystem_create(arg_directory);
3197 _exit(EXIT_FAILURE);
3199 if (arg_read_only) {
3200 r = bind_remount_recursive(arg_directory, true);
3202 log_error_errno(r, "Failed to make tree read-only: %m");
3203 _exit(EXIT_FAILURE);
3207 if (mount_all(arg_directory) < 0)
3208 _exit(EXIT_FAILURE);
3210 if (copy_devnodes(arg_directory) < 0)
3211 _exit(EXIT_FAILURE);
3213 if (setup_ptmx(arg_directory) < 0)
3214 _exit(EXIT_FAILURE);
3216 dev_setup(arg_directory);
3218 if (setup_seccomp() < 0)
3219 _exit(EXIT_FAILURE);
3221 if (setup_dev_console(arg_directory, console) < 0)
3222 _exit(EXIT_FAILURE);
3224 if (setup_kmsg(arg_directory, kmsg_socket_pair[1]) < 0)
3225 _exit(EXIT_FAILURE);
3227 kmsg_socket_pair[1] = safe_close(kmsg_socket_pair[1]);
3229 if (setup_boot_id(arg_directory) < 0)
3230 _exit(EXIT_FAILURE);
3232 if (setup_timezone(arg_directory) < 0)
3233 _exit(EXIT_FAILURE);
3235 if (setup_resolv_conf(arg_directory) < 0)
3236 _exit(EXIT_FAILURE);
3238 if (setup_journal(arg_directory) < 0)
3239 _exit(EXIT_FAILURE);
3241 if (mount_binds(arg_directory, arg_bind, false) < 0)
3242 _exit(EXIT_FAILURE);
3244 if (mount_binds(arg_directory, arg_bind_ro, true) < 0)
3245 _exit(EXIT_FAILURE);
3247 if (mount_tmpfs(arg_directory) < 0)
3248 _exit(EXIT_FAILURE);
3250 /* Tell the parent that we are ready, and that
3251 * it can cgroupify us so that we lack access
3252 * to certain devices and resources. */
3253 (void)barrier_place(&barrier);
3255 if (chdir(arg_directory) < 0) {
3256 log_error_errno(errno, "chdir(%s) failed: %m", arg_directory);
3257 _exit(EXIT_FAILURE);
3260 if (mount(arg_directory, "/", NULL, MS_MOVE, NULL) < 0) {
3261 log_error_errno(errno, "mount(MS_MOVE) failed: %m");
3262 _exit(EXIT_FAILURE);
3265 if (chroot(".") < 0) {
3266 log_error_errno(errno, "chroot() failed: %m");
3267 _exit(EXIT_FAILURE);
3270 if (chdir("/") < 0) {
3271 log_error_errno(errno, "chdir() failed: %m");
3272 _exit(EXIT_FAILURE);
3277 if (arg_private_network)
3280 if (drop_capabilities() < 0) {
3281 log_error_errno(errno, "drop_capabilities() failed: %m");
3282 _exit(EXIT_FAILURE);
3285 r = change_uid_gid(&home);
3287 _exit(EXIT_FAILURE);
3289 if ((asprintf((char**)(envp + n_env++), "HOME=%s", home ? home: "/root") < 0) ||
3290 (asprintf((char**)(envp + n_env++), "USER=%s", arg_user ? arg_user : "root") < 0) ||
3291 (asprintf((char**)(envp + n_env++), "LOGNAME=%s", arg_user ? arg_user : "root") < 0)) {
3293 _exit(EXIT_FAILURE);
3296 if (!sd_id128_equal(arg_uuid, SD_ID128_NULL)) {
3299 if (asprintf((char**)(envp + n_env++), "container_uuid=%s", id128_format_as_uuid(arg_uuid, as_uuid)) < 0) {
3301 _exit(EXIT_FAILURE);
3305 if (fdset_size(fds) > 0) {
3306 r = fdset_cloexec(fds, false);
3308 log_error_errno(r, "Failed to unset O_CLOEXEC for file descriptors.");
3309 _exit(EXIT_FAILURE);
3312 if ((asprintf((char **)(envp + n_env++), "LISTEN_FDS=%u", n_fd_passed) < 0) ||
3313 (asprintf((char **)(envp + n_env++), "LISTEN_PID=1") < 0)) {
3315 _exit(EXIT_FAILURE);
3321 if (arg_personality != 0xffffffffLU) {
3322 if (personality(arg_personality) < 0) {
3323 log_error_errno(errno, "personality() failed: %m");
3324 _exit(EXIT_FAILURE);
3326 } else if (secondary) {
3327 if (personality(PER_LINUX32) < 0) {
3328 log_error_errno(errno, "personality() failed: %m");
3329 _exit(EXIT_FAILURE);
3334 if (arg_selinux_context)
3335 if (setexeccon((security_context_t) arg_selinux_context) < 0) {
3336 log_error_errno(errno, "setexeccon(\"%s\") failed: %m", arg_selinux_context);
3337 _exit(EXIT_FAILURE);
3341 if (!strv_isempty(arg_setenv)) {
3344 n = strv_env_merge(2, envp, arg_setenv);
3347 _exit(EXIT_FAILURE);
3352 env_use = (char**) envp;
3354 /* Wait until the parent is ready with the setup, too... */
3355 if (!barrier_place_and_sync(&barrier))
3356 _exit(EXIT_FAILURE);
3362 /* Automatically search for the init system */
3364 l = 1 + argc - optind;
3365 a = newa(char*, l + 1);
3366 memcpy(a + 1, argv + optind, l * sizeof(char*));
3368 a[0] = (char*) "/usr/lib/systemd/systemd";
3369 execve(a[0], a, env_use);
3371 a[0] = (char*) "/lib/systemd/systemd";
3372 execve(a[0], a, env_use);
3374 a[0] = (char*) "/sbin/init";
3375 execve(a[0], a, env_use);
3376 } else if (argc > optind)
3377 execvpe(argv[optind], argv + optind, env_use);
3379 chdir(home ? home : "/root");
3380 execle("/bin/bash", "-bash", NULL, env_use);
3381 execle("/bin/sh", "-sh", NULL, env_use);
3384 log_error_errno(errno, "execv() failed: %m");
3385 _exit(EXIT_FAILURE);
3388 barrier_set_role(&barrier, BARRIER_PARENT);
3392 /* wait for child-setup to be done */
3393 if (barrier_place_and_sync(&barrier)) {
3394 _cleanup_event_unref_ sd_event *event = NULL;
3395 _cleanup_(pty_forward_freep) PTYForward *forward = NULL;
3398 r = move_network_interfaces(pid);
3402 r = setup_veth(pid, veth_name, &ifi);
3406 r = setup_bridge(veth_name, &ifi);
3410 r = setup_macvlan(pid);
3414 r = register_machine(pid, ifi);
3418 /* Block SIGCHLD here, before notifying child.
3419 * process_pty() will handle it with the other signals. */
3420 r = sigprocmask(SIG_BLOCK, &mask_chld, NULL);
3424 /* Reset signal to default */
3425 r = default_signals(SIGCHLD, -1);
3429 /* Notify the child that the parent is ready with all
3430 * its setup, and that the child can now hand over
3431 * control to the code to run inside the container. */
3432 (void)barrier_place(&barrier);
3434 r = sd_event_new(&event);
3436 log_error_errno(r, "Failed to get default event source: %m");
3441 /* Try to kill the init system on SIGINT or SIGTERM */
3442 sd_event_add_signal(event, NULL, SIGINT, on_orderly_shutdown, UINT32_TO_PTR(pid));
3443 sd_event_add_signal(event, NULL, SIGTERM, on_orderly_shutdown, UINT32_TO_PTR(pid));
3445 /* Immediately exit */
3446 sd_event_add_signal(event, NULL, SIGINT, NULL, NULL);
3447 sd_event_add_signal(event, NULL, SIGTERM, NULL, NULL);
3450 /* simply exit on sigchld */
3451 sd_event_add_signal(event, NULL, SIGCHLD, NULL, NULL);
3453 r = pty_forward_new(event, master, &forward);
3455 log_error_errno(r, "Failed to create PTY forwarder: %m");
3459 r = sd_event_loop(event);
3461 log_error_errno(r, "Failed to run event loop: %m");
3465 forward = pty_forward_free(forward);
3470 /* Kill if it is not dead yet anyway */
3471 terminate_machine(pid);
3474 /* Normally redundant, but better safe than sorry */
3477 r = wait_for_container(pid, &container_status);
3481 /* We failed to wait for the container, or the
3482 * container exited abnormally */
3484 else if (r > 0 || container_status == CONTAINER_TERMINATED){
3485 /* The container exited with a non-zero
3486 * status, or with zero status and no reboot
3492 /* CONTAINER_REBOOTED, loop again */
3494 if (arg_keep_unit) {
3495 /* Special handling if we are running as a
3496 * service: instead of simply restarting the
3497 * machine we want to restart the entire
3498 * service, so let's inform systemd about this
3499 * with the special exit code 133. The service
3500 * file uses RestartForceExitStatus=133 so
3501 * that this results in a full nspawn
3502 * restart. This is necessary since we might
3503 * have cgroup parameters set we want to have
3514 "STATUS=Terminating...");
3516 loop_remove(loop_nr, &image_fd);
3521 if (remove_subvol && arg_directory) {
3524 k = btrfs_subvol_remove(arg_directory);
3526 log_warning_errno(k, "Cannot remove subvolume '%s', ignoring: %m", arg_directory);
3529 free(arg_directory);
3534 strv_free(arg_setenv);
3535 strv_free(arg_network_interfaces);
3536 strv_free(arg_network_macvlan);
3537 strv_free(arg_bind);
3538 strv_free(arg_bind_ro);
3539 strv_free(arg_tmpfs);
3541 return r < 0 ? EXIT_FAILURE : ret;