1 /*-*- Mode: C; c-basic-offset: 8; indent-tabs-mode: nil -*-*/
4 This file is part of systemd.
6 Copyright 2010 Lennart Poettering
8 systemd is free software; you can redistribute it and/or modify it
9 under the terms of the GNU Lesser General Public License as published by
10 the Free Software Foundation; either version 2.1 of the License, or
11 (at your option) any later version.
13 systemd is distributed in the hope that it will be useful, but
14 WITHOUT ANY WARRANTY; without even the implied warranty of
15 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
16 Lesser General Public License for more details.
18 You should have received a copy of the GNU Lesser General Public License
19 along with systemd; If not, see <http://www.gnu.org/licenses/>.
25 #include <sys/types.h>
26 #include <sys/syscall.h>
27 #include <sys/mount.h>
33 #include <sys/prctl.h>
34 #include <sys/capability.h>
37 #include <sys/signalfd.h>
41 #include <sys/socket.h>
42 #include <linux/netlink.h>
43 #include <sys/eventfd.h>
45 #include <linux/veth.h>
48 #include <selinux/selinux.h>
55 #include "sd-daemon.h"
65 #include "cgroup-util.h"
67 #include "path-util.h"
68 #include "loopback-setup.h"
69 #include "dev-setup.h"
74 #include "bus-error.h"
76 #include "bus-kernel.h"
79 #include "rtnl-util.h"
80 #include "udev-util.h"
/* Journal linking mode chosen with --link-journal=; parse_argv() assigns one
 * of LINK_AUTO, LINK_NO, LINK_GUEST or LINK_HOST. */
82 typedef enum LinkJournal {
/* Settings controlled by the command line (see parse_argv()). The
 * initializers are the defaults used when the matching option is absent. */
89 static char *arg_directory = NULL;
90 static char *arg_user = NULL;
91 static sd_id128_t arg_uuid = {};
92 static char *arg_machine = NULL;
93 static char *arg_selinux_context = NULL;
94 static char *arg_selinux_apifs_context = NULL;
95 static const char *arg_slice = NULL;
96 static bool arg_private_network = false;
97 static bool arg_read_only = false;
98 static bool arg_boot = false;
99 static LinkJournal arg_link_journal = LINK_AUTO;
/* Bit mask (one bit per CAP_* number) of capabilities to retain.
 * parse_argv() ORs in --capability= additions, adds CAP_NET_ADMIN when a
 * private network is requested and masks out --drop-capability= removals;
 * drop_capabilities() hands the complement to
 * capability_bounding_set_drop(). */
100 static uint64_t arg_retain =
101 (1ULL << CAP_CHOWN) |
102 (1ULL << CAP_DAC_OVERRIDE) |
103 (1ULL << CAP_DAC_READ_SEARCH) |
104 (1ULL << CAP_FOWNER) |
105 (1ULL << CAP_FSETID) |
106 (1ULL << CAP_IPC_OWNER) |
108 (1ULL << CAP_LEASE) |
109 (1ULL << CAP_LINUX_IMMUTABLE) |
110 (1ULL << CAP_NET_BIND_SERVICE) |
111 (1ULL << CAP_NET_BROADCAST) |
112 (1ULL << CAP_NET_RAW) |
113 (1ULL << CAP_SETGID) |
114 (1ULL << CAP_SETFCAP) |
115 (1ULL << CAP_SETPCAP) |
116 (1ULL << CAP_SETUID) |
117 (1ULL << CAP_SYS_ADMIN) |
118 (1ULL << CAP_SYS_CHROOT) |
119 (1ULL << CAP_SYS_NICE) |
120 (1ULL << CAP_SYS_PTRACE) |
121 (1ULL << CAP_SYS_TTY_CONFIG) |
122 (1ULL << CAP_SYS_RESOURCE) |
123 (1ULL << CAP_SYS_BOOT) |
124 (1ULL << CAP_AUDIT_WRITE) |
125 (1ULL << CAP_AUDIT_CONTROL) |
/* arg_bind / arg_bind_ro hold consecutive source,destination entries:
 * parse_argv() appends them in that order and mount_binds() walks them as
 * pairs. */
127 static char **arg_bind = NULL;
128 static char **arg_bind_ro = NULL;
129 static char **arg_setenv = NULL;
130 static bool arg_quiet = false;
131 static bool arg_share_system = false;
132 static bool arg_register = true;
133 static bool arg_keep_unit = false;
134 static char **arg_network_interfaces = NULL;
135 static bool arg_network_veth = false;
/* Print the usage summary to stdout, prefixed with
 * program_invocation_short_name.
 *
 * The option descriptions must mirror what parse_argv() does: the -j entry
 * names the "guest" journal mode, and the --network-veth entry no longer
 * contains the duplicated article. */
137 static int help(void) {
139 printf("%s [OPTIONS...] [PATH] [ARGUMENTS...]\n\n"
140 "Spawn a minimal namespace container for debugging, testing and building.\n\n"
141 " -h --help Show this help\n"
142 " --version Print version string\n"
143 " -q --quiet Do not show status information\n"
144 " -D --directory=NAME Root directory for the container\n"
145 " -b --boot Boot up full system (i.e. invoke init)\n"
146 " -u --user=USER Run the command under specified user or uid\n"
147 " -M --machine=NAME Set the machine name for the container\n"
148 " --uuid=UUID Set a specific machine UUID for the container\n"
149 " -S --slice=SLICE Place the container in the specified slice\n"
150 " --private-network Disable network in container\n"
151 " --network-interface=INTERFACE\n"
152 " Assign an existing network interface to the\n"
154 " --network-veth Add a virtual ethernet connection between host\n"
156 " -Z --selinux-context=SECLABEL\n"
157 " Set the SELinux security context to be used by\n"
158 " processes in the container\n"
159 " -L --selinux-apifs-context=SECLABEL\n"
160 " Set the SELinux security context to be used by\n"
161 " API/tmpfs file systems in the container\n"
162 " --capability=CAP In addition to the default, retain specified\n"
164 " --drop-capability=CAP Drop the specified capability from the default set\n"
165 " --link-journal=MODE Link up guest journal, one of no, auto, guest, host\n"
166 " -j Equivalent to --link-journal=guest\n"
167 " --read-only Mount the root directory read-only\n"
168 " --bind=PATH[:PATH] Bind mount a file or directory from the host into\n"
170 " --bind-ro=PATH[:PATH] Similar, but creates a read-only bind mount\n"
171 " --setenv=NAME=VALUE Pass an environment variable to PID 1\n"
172 " --share-system Share system namespaces with host\n"
173 " --register=BOOLEAN Register container as machine\n"
174 " --keep-unit Do not register a scope for the machine, reuse\n"
175 " the service unit nspawn is running in\n",
176 program_invocation_short_name);
/* Parse the command line into the arg_* globals.
 *
 * Options are read with getopt_long(); the leading '+' in the short option
 * string makes parsing stop at the first non-option argument, so the
 * arguments of the command to run in the container are left alone.
 * --capability=/--drop-capability= are accumulated in the local masks
 * "plus"/"minus" and folded into arg_retain once all options are read.
 *
 * NOTE(review): the return statements are not visible in this excerpt;
 * main() stores the result in an int. */
181 static int parse_argv(int argc, char *argv[]) {
197 ARG_NETWORK_INTERFACE,
201 static const struct option options[] = {
202 { "help", no_argument, NULL, 'h' },
203 { "version", no_argument, NULL, ARG_VERSION },
204 { "directory", required_argument, NULL, 'D' },
205 { "user", required_argument, NULL, 'u' },
206 { "private-network", no_argument, NULL, ARG_PRIVATE_NETWORK },
207 { "boot", no_argument, NULL, 'b' },
208 { "uuid", required_argument, NULL, ARG_UUID },
209 { "read-only", no_argument, NULL, ARG_READ_ONLY },
210 { "capability", required_argument, NULL, ARG_CAPABILITY },
211 { "drop-capability", required_argument, NULL, ARG_DROP_CAPABILITY },
212 { "link-journal", required_argument, NULL, ARG_LINK_JOURNAL },
213 { "bind", required_argument, NULL, ARG_BIND },
214 { "bind-ro", required_argument, NULL, ARG_BIND_RO },
215 { "machine", required_argument, NULL, 'M' },
216 { "slice", required_argument, NULL, 'S' },
217 { "setenv", required_argument, NULL, ARG_SETENV },
218 { "selinux-context", required_argument, NULL, 'Z' },
219 { "selinux-apifs-context", required_argument, NULL, 'L' },
220 { "quiet", no_argument, NULL, 'q' },
221 { "share-system", no_argument, NULL, ARG_SHARE_SYSTEM },
222 { "register", required_argument, NULL, ARG_REGISTER },
223 { "keep-unit", no_argument, NULL, ARG_KEEP_UNIT },
224 { "network-interface", required_argument, NULL, ARG_NETWORK_INTERFACE },
225 { "network-veth", no_argument, NULL, ARG_NETWORK_VETH },
230 uint64_t plus = 0, minus = 0;
235 while ((c = getopt_long(argc, argv, "+hD:u:bL:M:jS:Z:q", options, NULL)) >= 0) {
243 puts(PACKAGE_STRING);
244 puts(SYSTEMD_FEATURES);
249 arg_directory = canonicalize_file_name(optarg);
250 if (!arg_directory) {
251 log_error("Invalid root directory: %m");
259 arg_user = strdup(optarg);
/* A veth link only makes sense with a private network namespace, so
 * --network-veth also turns on arg_private_network. */
265 case ARG_NETWORK_VETH:
266 arg_network_veth = true;
267 arg_private_network = true;
270 case ARG_NETWORK_INTERFACE:
/* optarg points into argv and is not heap memory: store a copy, as is
 * done for --bind below, instead of handing the raw pointer to the
 * string vector. */
271 if (strv_extend(&arg_network_interfaces, optarg) < 0)
276 case ARG_PRIVATE_NETWORK:
277 arg_private_network = true;
285 r = sd_id128_from_string(optarg, &arg_uuid);
287 log_error("Invalid UUID: %s", optarg);
293 arg_slice = strdup(optarg);
300 if (isempty(optarg)) {
305 if (!hostname_is_valid(optarg)) {
306 log_error("Invalid machine name: %s", optarg);
311 arg_machine = strdup(optarg);
319 arg_selinux_context = optarg;
323 arg_selinux_apifs_context = optarg;
327 arg_read_only = true;
/* Shared handling for --capability= and --drop-capability=: the argument
 * is a comma-separated list of capability names, or "all". */
331 case ARG_DROP_CAPABILITY: {
335 FOREACH_WORD_SEPARATOR(word, length, optarg, ",", state) {
/* Initialized so the cleanup handler never sees an indeterminate pointer. */
336 _cleanup_free_ char *t = NULL;
339 t = strndup(word, length);
343 if (streq(t, "all")) {
344 if (c == ARG_CAPABILITY)
345 plus = (uint64_t) -1;
347 minus = (uint64_t) -1;
349 if (cap_from_name(t, &cap) < 0) {
350 log_error("Failed to parse capability %s.", t);
354 if (c == ARG_CAPABILITY)
355 plus |= 1ULL << (uint64_t) cap;
357 minus |= 1ULL << (uint64_t) cap;
365 arg_link_journal = LINK_GUEST;
368 case ARG_LINK_JOURNAL:
369 if (streq(optarg, "auto"))
370 arg_link_journal = LINK_AUTO;
371 else if (streq(optarg, "no"))
372 arg_link_journal = LINK_NO;
373 else if (streq(optarg, "guest"))
374 arg_link_journal = LINK_GUEST;
375 else if (streq(optarg, "host"))
376 arg_link_journal = LINK_HOST;
378 log_error("Failed to parse link journal mode %s", optarg);
/* --bind=/--bind-ro=: "a" is the source path and "b" the destination; both
 * must be absolute and are appended to the selected list as a pair. */
386 _cleanup_free_ char *a = NULL, *b = NULL;
390 x = c == ARG_BIND ? &arg_bind : &arg_bind_ro;
392 e = strchr(optarg, ':');
394 a = strndup(optarg, e - optarg);
404 if (!path_is_absolute(a) || !path_is_absolute(b)) {
405 log_error("Invalid bind mount specification: %s", optarg);
409 r = strv_extend(x, a);
413 r = strv_extend(x, b);
423 if (!env_assignment_is_valid(optarg)) {
424 log_error("Environment variable assignment '%s' is not valid.", optarg);
428 n = strv_env_set(arg_setenv, optarg);
432 strv_free(arg_setenv);
441 case ARG_SHARE_SYSTEM:
442 arg_share_system = true;
446 r = parse_boolean(optarg);
448 log_error("Failed to parse --register= argument: %s", optarg);
456 arg_keep_unit = true;
463 assert_not_reached("Unhandled option");
/* Cross-option checks, done once all options have been read. */
467 if (arg_share_system)
468 arg_register = false;
470 if (arg_boot && arg_share_system) {
471 log_error("--boot and --share-system may not be combined.");
475 if (arg_keep_unit && cg_pid_get_owner_uid(0, NULL) >= 0) {
476 log_error("--keep-unit may not be used when invoked from a user session.");
/* Final capability set: defaults plus additions (and CAP_NET_ADMIN for a
 * private network), minus everything explicitly dropped. */
480 arg_retain = (arg_retain | plus | (arg_private_network ? 1ULL << CAP_NET_ADMIN : 0)) & ~minus;
/* Mount the file systems listed in mount_table below the container root
 * "dest".
 *
 * For every entry the target path <dest>/<where> is built and probed with
 * path_is_mount_point(), the directory is created, and mount() is called
 * with the entry's source, flags and options. A failing mount is only
 * logged when the entry is marked fatal. */
485 static int mount_all(const char *dest) {
487 typedef struct MountPoint {
/* Entry layout as used below: what (source), where (target relative to
 * dest), a third field that is presumably the fs type (its name is not
 * visible in this excerpt), options, flags, fatal. Entries with a NULL
 * source are remounts of the preceding bind mount. */
496 static const MountPoint mount_table[] = {
497 { "proc", "/proc", "proc", NULL, MS_NOSUID|MS_NOEXEC|MS_NODEV, true },
498 { "/proc/sys", "/proc/sys", NULL, NULL, MS_BIND, true }, /* Bind mount first */
499 { NULL, "/proc/sys", NULL, NULL, MS_BIND|MS_RDONLY|MS_REMOUNT, true }, /* Then, make it r/o */
500 { "sysfs", "/sys", "sysfs", NULL, MS_RDONLY|MS_NOSUID|MS_NOEXEC|MS_NODEV, true },
501 { "tmpfs", "/dev", "tmpfs", "mode=755", MS_NOSUID|MS_STRICTATIME, true },
502 { "devpts", "/dev/pts", "devpts","newinstance,ptmxmode=0666,mode=620,gid=" STRINGIFY(TTY_GID), MS_NOSUID|MS_NOEXEC, true },
503 { "tmpfs", "/dev/shm", "tmpfs", "mode=1777", MS_NOSUID|MS_NODEV|MS_STRICTATIME, true },
504 { "tmpfs", "/run", "tmpfs", "mode=755", MS_NOSUID|MS_NODEV|MS_STRICTATIME, true },
506 { "/sys/fs/selinux", "/sys/fs/selinux", NULL, NULL, MS_BIND, false }, /* Bind mount first */
507 { NULL, "/sys/fs/selinux", NULL, NULL, MS_BIND|MS_RDONLY|MS_REMOUNT, false }, /* Then, make it r/o */
514 for (k = 0; k < ELEMENTSOF(mount_table); k++) {
515 _cleanup_free_ char *where = NULL;
517 _cleanup_free_ char *options = NULL;
522 where = strjoin(dest, "/", mount_table[k].where, NULL);
526 t = path_is_mount_point(where, true);
528 log_error("Failed to detect whether %s is a mount point: %s", where, strerror(-t));
/* Skip entries whose target is already a mount point, unless the entry is
 * a remount (remount entries have no source). */
537 if (mount_table[k].what && t > 0)
540 mkdir_p(where, 0755);
/* With --selinux-apifs-context= the tmpfs and devpts instances get an
 * additional context="..." mount option appended to the table's options. */
543 if (arg_selinux_apifs_context &&
544 (streq_ptr(mount_table[k].what, "tmpfs") || streq_ptr(mount_table[k].what, "devpts"))) {
545 options = strjoin(mount_table[k].options, ",context=\"", arg_selinux_apifs_context, "\"", NULL);
552 o = mount_table[k].options;
555 if (mount(mount_table[k].what,
558 mount_table[k].flags,
560 mount_table[k].fatal) {
562 log_error("mount(%s) failed: %m", where);
/* Bind mount every source/destination pair of "l" into the container.
 *
 * dest:  container root; each destination is appended to it.
 * l:     string vector holding consecutive source, destination entries.
 * flags: if non-zero, applied in a second MS_REMOUNT|MS_BIND pass after the
 *        plain bind mount. */
572 static int mount_binds(const char *dest, char **l, unsigned long flags) {
575 STRV_FOREACH_PAIR(x, y, l) {
577 struct stat source_st, dest_st;
580 if (stat(*x, &source_st) < 0) {
581 log_error("failed to stat %s: %m", *x);
585 where = strappenda(dest, *y);
586 r = stat(where, &dest_st);
/* An existing destination must have the same file type as the source. */
588 if ((source_st.st_mode & S_IFMT) != (dest_st.st_mode & S_IFMT)) {
589 log_error("The file types of %s and %s do not match. Refusing bind mount",
/* Destination missing: create its parent directories first. */
593 } else if (errno == ENOENT) {
594 r = mkdir_parents_label(where, 0755);
596 log_error("Failed to bind mount %s: %s", *x, strerror(-r));
600 log_error("Failed to bind mount %s: %s", *x, strerror(errno));
603 /* Create the mount point, but be conservative -- refuse to create block
604 * and char devices. */
605 if (S_ISDIR(source_st.st_mode))
606 mkdir_label(where, 0755);
607 else if (S_ISFIFO(source_st.st_mode))
609 else if (S_ISSOCK(source_st.st_mode))
610 mknod(where, 0644 | S_IFSOCK, 0);
611 else if (S_ISREG(source_st.st_mode))
614 log_error("Refusing to create mountpoint for file: %s", *x);
618 if (mount(*x, where, "bind", MS_BIND, NULL) < 0) {
619 log_error("mount(%s) failed: %m", where);
/* Extra flags are applied with a separate remount of the bind mount. */
623 if (flags && mount(NULL, where, NULL, MS_REMOUNT|MS_BIND|flags, NULL) < 0) {
624 log_error("mount(%s) failed: %m", where);
/* Point the container's /etc/localtime at the same zone the host uses.
 *
 * Only acts when the host's /etc/localtime is a symlink into
 * /usr/share/zoneinfo/ and the zone file also exists below "dest". Nothing
 * is changed when the container's link already names the same zone. */
632 static int setup_timezone(const char *dest) {
633 _cleanup_free_ char *where = NULL, *p = NULL, *q = NULL, *check = NULL, *what = NULL;
639 /* Fix the timezone, if possible */
640 r = readlink_malloc("/etc/localtime", &p);
642 log_warning("/etc/localtime is not a symlink, not updating container timezone.");
/* z: zone name of the host, i.e. the link target with the zoneinfo prefix
 * (relative or absolute form) stripped. */
646 z = path_startswith(p, "../usr/share/zoneinfo/");
648 z = path_startswith(p, "/usr/share/zoneinfo/");
650 log_warning("/etc/localtime does not point into /usr/share/zoneinfo/, not updating container timezone.");
654 where = strappend(dest, "/etc/localtime");
/* y: zone name currently configured inside the container, if any. */
658 r = readlink_malloc(where, &q);
660 y = path_startswith(q, "../usr/share/zoneinfo/");
662 y = path_startswith(q, "/usr/share/zoneinfo/");
665 /* Already pointing to the right place? Then do nothing .. */
666 if (y && streq(y, z))
670 check = strjoin(dest, "/usr/share/zoneinfo/", z, NULL);
674 if (access(check, F_OK) < 0) {
675 log_warning("Timezone %s does not exist in container, not updating container timezone.", z);
/* The new link is relative, so it resolves inside the container tree. */
679 what = strappend("../usr/share/zoneinfo/", z);
684 if (symlink(what, where) < 0) {
685 log_error("Failed to correct timezone of container: %m");
/* Copy the host's /etc/resolv.conf to <dest>/etc/resolv.conf.
 *
 * The copy is preceded by a check of arg_private_network.
 * NOTE(review): the statement guarded by that check is not visible in this
 * excerpt; presumably the copy is skipped for private networks. */
692 static int setup_resolv_conf(const char *dest) {
693 char _cleanup_free_ *where = NULL;
697 if (arg_private_network)
700 /* Fix resolv.conf, if possible */
701 where = strappend(dest, "/etc/resolv.conf");
/* Best effort only: the result of the copy is deliberately ignored. */
707 copy_file("/etc/resolv.conf", where, O_TRUNC|O_NOFOLLOW);
/* Give the container a boot ID of its own.
 *
 * A random 128-bit ID is formatted as a UUID string, written to
 * <dest>/dev/proc-sys-kernel-random-boot-id and bind mounted over
 * <dest>/proc/sys/kernel/random/boot_id. The function starts with a check
 * of arg_share_system. */
712 static int setup_boot_id(const char *dest) {
713 _cleanup_free_ char *from = NULL, *to = NULL;
720 if (arg_share_system)
723 /* Generate a new randomized boot ID, so that each boot-up of
724 * the container gets a new one */
726 from = strappend(dest, "/dev/proc-sys-kernel-random-boot-id");
727 to = strappend(dest, "/proc/sys/kernel/random/boot_id");
731 r = sd_id128_randomize(&rnd);
733 log_error("Failed to generate random boot id: %s", strerror(-r));
/* Render the ID in 8-4-4-4-12 UUID notation. */
737 snprintf(as_uuid, sizeof(as_uuid),
738 "%02x%02x%02x%02x-%02x%02x-%02x%02x-%02x%02x-%02x%02x%02x%02x%02x%02x",
739 SD_ID128_FORMAT_VAL(rnd));
740 char_array_0(as_uuid);
742 r = write_string_file(from, as_uuid);
744 log_error("Failed to write boot id: %s", strerror(-r));
/* Failing to bind mount is an error; failing to make the mount read-only
 * afterwards only produces a warning. */
748 if (mount(from, to, "bind", MS_BIND, NULL) < 0) {
749 log_error("Failed to bind mount boot id: %m");
751 } else if (mount(from, to, "bind", MS_BIND|MS_REMOUNT|MS_RDONLY, NULL))
752 log_warning("Failed to make boot id read-only: %m");
/* Recreate a fixed list of host device nodes inside the container's /dev.
 *
 * For every name in "devnodes" the host node /dev/<name> is stat()ed. A
 * stat() failure is logged unless the node simply does not exist (ENOENT),
 * anything that is not a character or block device is rejected, and
 * otherwise mknod() creates <dest>/dev/<name> with the same mode and device
 * number. */
758 static int copy_devnodes(const char *dest) {
760 static const char devnodes[] =
770 _cleanup_umask_ mode_t u;
776 NULSTR_FOREACH(d, devnodes) {
777 _cleanup_free_ char *from = NULL, *to = NULL;
780 from = strappend("/dev/", d);
781 to = strjoin(dest, "/dev/", d, NULL);
785 if (stat(from, &st) < 0) {
787 if (errno != ENOENT) {
788 log_error("Failed to stat %s: %m", from);
792 } else if (!S_ISCHR(st.st_mode) && !S_ISBLK(st.st_mode)) {
794 log_error("%s is not a char or block device, cannot copy", from);
797 } else if (mknod(to, st.st_mode, st.st_rdev) < 0) {
/* Report the node that could not be created, not the container root. */
799 log_error("mknod(%s) failed: %m", to);
/* Create <dest>/dev/ptmx as a relative symlink to pts/ptmx, i.e. to the
 * ptmx node of the devpts instance that mount_all() mounts on /dev/pts. */
807 static int setup_ptmx(const char *dest) {
808 _cleanup_free_ char *p = NULL;
810 p = strappend(dest, "/dev/ptmx");
814 if (symlink("pts/ptmx", p) < 0) {
815 log_error("Failed to create /dev/ptmx symlink: %m");
/* Make the TTY "console" available as <dest>/dev/console.
 *
 * The TTY must be a character device; its access mode is set to 0600 and
 * its owner to 0:0. A device node with the TTY's device number is then
 * created at <dest>/dev/console and the TTY is bind mounted on top of it. */
822 static int setup_dev_console(const char *dest, const char *console) {
824 _cleanup_free_ char *to = NULL;
826 _cleanup_umask_ mode_t u;
833 if (stat(console, &st) < 0) {
834 log_error("Failed to stat %s: %m", console);
837 } else if (!S_ISCHR(st.st_mode)) {
838 log_error("/dev/console is not a char device");
842 r = chmod_and_chown(console, 0600, 0, 0);
844 log_error("Failed to correct access mode for TTY: %s", strerror(-r));
848 if (asprintf(&to, "%s/dev/console", dest) < 0)
851 /* We need to bind mount the right tty to /dev/console since
852 * ptys can only exist on pts file systems. To have something
853 * to bind mount things on we create a device node first, that
854 * has the right major/minor (note that the major minor
855 * doesn't actually matter here, since we mount it over
858 if (mknod(to, (st.st_mode & ~07777) | 0600, st.st_rdev) < 0) {
859 log_error("mknod() for /dev/console failed: %m");
863 if (mount(console, to, "bind", MS_BIND, NULL) < 0) {
864 log_error("Bind mount for /dev/console failed: %m");
/* Provide a FIFO-backed /proc/kmsg inside the container.
 *
 * A FIFO is created at <dest>/dev/kmsg and bind mounted to
 * <dest>/proc/kmsg. It is then opened, and the descriptor is sent over
 * "kmsg_socket" as an SCM_RIGHTS control message so that it stays
 * referenced after the local copy is closed. */
871 static int setup_kmsg(const char *dest, int kmsg_socket) {
872 _cleanup_free_ char *from = NULL, *to = NULL;
874 _cleanup_umask_ mode_t u;
/* Control buffer sized for exactly one file descriptor. */
876 struct cmsghdr cmsghdr;
877 uint8_t buf[CMSG_SPACE(sizeof(int))];
880 .msg_control = &control,
881 .msg_controllen = sizeof(control),
883 struct cmsghdr *cmsg;
886 assert(kmsg_socket >= 0);
890 /* We create the kmsg FIFO as /dev/kmsg, but immediately
891 * delete it after bind mounting it to /proc/kmsg. While FIFOs
892 * on the reading side behave very similar to /proc/kmsg,
893 * their writing side behaves differently from /dev/kmsg in
894 * that writing blocks when nothing is reading. In order to
895 * avoid any problems with containers deadlocking due to this
896 * we simply make /dev/kmsg unavailable to the container. */
897 if (asprintf(&from, "%s/dev/kmsg", dest) < 0 ||
898 asprintf(&to, "%s/proc/kmsg", dest) < 0)
901 if (mkfifo(from, 0600) < 0) {
902 log_error("mkfifo() for /dev/kmsg failed: %m");
906 r = chmod_and_chown(from, 0600, 0, 0);
908 log_error("Failed to correct access mode for /dev/kmsg: %s", strerror(-r));
912 if (mount(from, to, "bind", MS_BIND, NULL) < 0) {
913 log_error("Bind mount for /proc/kmsg failed: %m");
/* O_NDELAY keeps open() from blocking on the FIFO. */
917 fd = open(from, O_RDWR|O_NDELAY|O_CLOEXEC);
919 log_error("Failed to open fifo: %m");
/* Build the SCM_RIGHTS message carrying the FIFO descriptor. */
923 cmsg = CMSG_FIRSTHDR(&mh);
924 cmsg->cmsg_level = SOL_SOCKET;
925 cmsg->cmsg_type = SCM_RIGHTS;
926 cmsg->cmsg_len = CMSG_LEN(sizeof(int));
927 memcpy(CMSG_DATA(cmsg), &fd, sizeof(int));
929 mh.msg_controllen = cmsg->cmsg_len;
931 /* Store away the fd in the socket, so that it stays open as
932 * long as we run the child */
933 k = sendmsg(kmsg_socket, &mh, MSG_DONTWAIT|MSG_NOSIGNAL);
934 close_nointr_nofail(fd);
937 log_error("Failed to send FIFO fd: %m");
941 /* And now make the FIFO unavailable as /dev/kmsg... */
/* Set the kernel hostname to arg_machine via sethostname(). The call is
 * preceded by a check of arg_share_system.
 * NOTE(review): the statement guarded by that check is not visible here. */
946 static int setup_hostname(void) {
948 if (arg_share_system)
951 if (sethostname(arg_machine, strlen(arg_machine)) < 0)
/* Connect the container's journal directory with the host's according to
 * arg_link_journal.
 *
 * The container's machine ID is read from <directory>/etc/machine-id and
 * must differ from the host's. Two paths are derived from it:
 *   p = /var/log/journal/<id>              (on the host)
 *   q = <directory>/var/log/journal/<id>   (inside the container tree)
 * In guest mode p becomes a symlink to q; otherwise p is bind mounted onto
 * q. In LINK_AUTO mode several failure conditions are tolerated that are
 * errors in the explicit modes. */
957 static int setup_journal(const char *directory) {
958 sd_id128_t machine_id, this_id;
959 _cleanup_free_ char *p = NULL, *b = NULL, *q = NULL, *d = NULL;
963 p = strappend(directory, "/etc/machine-id");
967 r = read_one_line_file(p, &b);
968 if (r == -ENOENT && arg_link_journal == LINK_AUTO)
971 log_error("Failed to read machine ID from %s: %s", p, strerror(-r));
976 if (isempty(id) && arg_link_journal == LINK_AUTO)
979 /* Verify validity */
980 r = sd_id128_from_string(id, &machine_id);
982 log_error("Failed to parse machine ID from %s: %s", p, strerror(-r));
986 r = sd_id128_get_machine(&this_id);
988 log_error("Failed to retrieve machine ID: %s", strerror(-r));
/* Identical IDs would make host and container share one journal
 * directory; this is a warning in auto mode and an error otherwise. */
992 if (sd_id128_equal(machine_id, this_id)) {
993 log_full(arg_link_journal == LINK_AUTO ? LOG_WARNING : LOG_ERR,
994 "Host and machine ids are equal (%s): refusing to link journals", id);
995 if (arg_link_journal == LINK_AUTO)
1001 if (arg_link_journal == LINK_NO)
1005 p = strappend("/var/log/journal/", id);
1006 q = strjoin(directory, "/var/log/journal/", id, NULL);
/* Neither path may already be a mount point. */
1010 if (path_is_mount_point(p, false) > 0) {
1011 if (arg_link_journal != LINK_AUTO) {
1012 log_error("%s: already a mount point, refusing to use for journal", p);
1019 if (path_is_mount_point(q, false) > 0) {
1020 if (arg_link_journal != LINK_AUTO) {
1021 log_error("%s: already a mount point, refusing to use for journal", q);
/* Inspect what currently exists at the host path: a symlink (its target is
 * returned in d), something that is not a symlink (-EINVAL), or nothing
 * (-ENOENT). */
1028 r = readlink_and_make_absolute(p, &d);
1030 if ((arg_link_journal == LINK_GUEST ||
1031 arg_link_journal == LINK_AUTO) &&
1034 r = mkdir_p(q, 0755);
1036 log_warning("failed to create directory %s: %m", q);
1040 if (unlink(p) < 0) {
1041 log_error("Failed to remove symlink %s: %m", p);
1044 } else if (r == -EINVAL) {
1046 if (arg_link_journal == LINK_GUEST &&
1049 if (errno == ENOTDIR) {
1050 log_error("%s already exists and is neither a symlink nor a directory", p);
1053 log_error("Failed to remove %s: %m", p);
1057 } else if (r != -ENOENT) {
1058 log_error("readlink(%s) failed: %m", p);
/* Guest mode: the host path becomes a symlink into the container tree. */
1062 if (arg_link_journal == LINK_GUEST) {
1064 if (symlink(q, p) < 0) {
1065 log_error("Failed to symlink %s to %s: %m", q, p);
1069 r = mkdir_p(q, 0755);
1071 log_warning("failed to create directory %s: %m", q);
/* Host mode creates the host directory if needed; the remaining path
 * first checks with access() that it exists. */
1075 if (arg_link_journal == LINK_HOST) {
1076 r = mkdir_p(p, 0755);
1078 log_error("Failed to create %s: %m", p);
1082 } else if (access(p, F_OK) < 0)
/* Refuse to hide existing journal files in the container under the mount. */
1085 if (dir_is_empty(q) == 0) {
1086 log_error("%s not empty.", q);
1090 r = mkdir_p(q, 0755);
1092 log_error("Failed to create %s: %m", q);
1096 if (mount(p, q, "bind", MS_BIND, NULL) < 0) {
1097 log_error("Failed to bind mount journal from host into guest: %m");
/* Create the directory <dest>/dev/kdbus and bind mount "path" onto it.
 * main() obtains a kdbus domain path before spawning the container;
 * presumably that is what is passed here (the call site is not visible). */
1104 static int setup_kdbus(const char *dest, const char *path) {
1110 p = strappenda(dest, "/dev/kdbus");
1111 if (mkdir(p, 0755) < 0) {
1112 log_error("Failed to create kdbus path: %m");
1116 if (mount(path, p, "bind", MS_BIND, NULL) < 0) {
1117 log_error("Failed to mount kdbus domain path: %m");
/* Pass the complement of arg_retain (every capability not in the retain
 * mask) to capability_bounding_set_drop() and return its result. */
1124 static int drop_capabilities(void) {
1125 return capability_bounding_set_drop(~arg_retain, false);
/* Register the container with the org.freedesktop.machine1 Manager object
 * on the system bus.
 *
 * Two call variants exist, selected by arg_keep_unit. Both pass arg_uuid
 * and arg_directory; the second additionally passes a "Slice" property when
 * arg_slice is non-empty.
 * NOTE(review): the method names and the remaining arguments (including how
 * "pid" is used) are not visible in this excerpt. */
1128 static int register_machine(pid_t pid) {
1129 _cleanup_bus_error_free_ sd_bus_error error = SD_BUS_ERROR_NULL;
1130 _cleanup_bus_unref_ sd_bus *bus = NULL;
1136 r = sd_bus_default_system(&bus);
1138 log_error("Failed to open system bus: %s", strerror(-r));
1142 if (arg_keep_unit) {
1143 r = sd_bus_call_method(
1145 "org.freedesktop.machine1",
1146 "/org/freedesktop/machine1",
1147 "org.freedesktop.machine1.Manager",
1153 SD_BUS_MESSAGE_APPEND_ID128(arg_uuid),
1157 strempty(arg_directory));
1159 r = sd_bus_call_method(
1161 "org.freedesktop.machine1",
1162 "/org/freedesktop/machine1",
1163 "org.freedesktop.machine1.Manager",
1169 SD_BUS_MESSAGE_APPEND_ID128(arg_uuid),
1173 strempty(arg_directory),
/* The leading boolean is the number of properties that follow (0 or 1). */
1174 !isempty(arg_slice), "Slice", "s", arg_slice);
1178 log_error("Failed to register machine: %s", bus_error_message(&error, r));
/* Ask the machine1 service to terminate the machine.
 *
 * First a Manager method is called whose reply carries the machine's object
 * path; then a method of the org.freedesktop.machine1.Machine interface is
 * invoked. Failures of either call are only logged at debug level.
 * NOTE(review): the method names and the use of "pid" are not visible in
 * this excerpt. */
1185 static int terminate_machine(pid_t pid) {
1186 _cleanup_bus_error_free_ sd_bus_error error = SD_BUS_ERROR_NULL;
1187 _cleanup_bus_message_unref_ sd_bus_message *reply = NULL;
1188 _cleanup_bus_unref_ sd_bus *bus = NULL;
1195 r = sd_bus_default_system(&bus);
1197 log_error("Failed to open system bus: %s", strerror(-r));
1201 r = sd_bus_call_method(
1203 "org.freedesktop.machine1",
1204 "/org/freedesktop/machine1",
1205 "org.freedesktop.machine1.Manager",
1212 /* Note that the machine might already have been
1213 * cleaned up automatically, hence don't consider it a
1214 * failure if we cannot get the machine object. */
1215 log_debug("Failed to get machine: %s", bus_error_message(&error, r));
/* The reply holds a single object path ("o"). */
1219 r = sd_bus_message_read(reply, "o", &path);
1221 return bus_log_parse_error(r);
1223 r = sd_bus_call_method(
1225 "org.freedesktop.machine1",
1227 "org.freedesktop.machine1.Machine",
1233 log_debug("Failed to terminate machine: %s", bus_error_message(&error, r));
/* Reset the audit login UID of the current process.
 *
 * Reads /proc/self/loginuid and, unless it already contains 4294967295
 * (that is, (uint32_t) -1), writes that value back. If the write fails, a
 * multi-line hint about old kernels with auditing enabled is logged. The
 * function starts with a check of arg_share_system. */
1240 static int reset_audit_loginuid(void) {
1241 _cleanup_free_ char *p = NULL;
1244 if (arg_share_system)
1247 r = read_one_line_file("/proc/self/loginuid", &p);
1251 log_error("Failed to read /proc/self/loginuid: %s", strerror(-r));
1255 /* Already reset? */
1256 if (streq(p, "4294967295"))
1259 r = write_string_file("/proc/self/loginuid", "4294967295");
/* Message wording fixed: "or to off auditing" was missing its verb. */
1261 log_error("Failed to reset audit login UID. This probably means that your kernel is too\n"
1262 "old and you have audit enabled. Note that the auditing subsystem is known to\n"
1263 "be incompatible with containers on old kernels. Please make sure to upgrade\n"
1264 "your kernel or turn off auditing with 'audit=0' on the kernel command line before\n"
1265 "using systemd-nspawn. Sleeping for 5s... (%s)\n", strerror(-r));
/* Create a veth pair through rtnetlink.
 *
 * One end is named "host0". The peer is named "ve-" followed by the
 * (possibly truncated) machine name and is assigned to the network
 * namespace referred to by "netns_fd". The function is guarded by
 * arg_private_network and arg_network_veth.
 *
 * The netlink message is nested as
 *   IFLA_IFNAME, IFLA_LINKINFO { IFLA_INFO_KIND, IFLA_INFO_DATA {
 *     VETH_INFO_PEER { IFLA_IFNAME, IFLA_NET_NS_FD } } }
 * hence the three close_container calls at the end. */
1273 static int setup_veth(int netns_fd) {
1274 _cleanup_rtnl_message_unref_ sd_rtnl_message *m = NULL;
1275 _cleanup_rtnl_unref_ sd_rtnl *rtnl = NULL;
1276 char iface_name[IFNAMSIZ] = "ve-";
1279 if (!arg_private_network)
1282 if (!arg_network_veth)
/* Copy at most IFNAMSIZ - 3 - 1 bytes: strncpy() does not terminate the
 * destination when the source is too long, so the last byte of the
 * zero-initialized array is left untouched to keep the name terminated. */
1285 strncpy(iface_name+3, arg_machine, sizeof(iface_name) - 3 - 1);
1287 r = sd_rtnl_open(0, &rtnl);
1289 log_error("Failed to connect to netlink: %s", strerror(-r));
1293 r = sd_rtnl_message_new_link(RTM_NEWLINK, 0, &m);
1295 log_error("Failed to allocate netlink message: %s", strerror(-r));
1299 r = sd_rtnl_message_append_string(m, IFLA_IFNAME, "host0");
1301 log_error("Failed to add netlink interface name: %s", strerror(-r));
1305 r = sd_rtnl_message_open_container(m, IFLA_LINKINFO);
1307 log_error("Failed to open netlink container: %s", strerror(-r));
1311 r = sd_rtnl_message_append_string(m, IFLA_INFO_KIND, "veth");
1313 log_error("Failed to append netlink kind: %s", strerror(-r));
1317 r = sd_rtnl_message_open_container(m, IFLA_INFO_DATA);
1319 log_error("Failed to open netlink container: %s", strerror(-r));
1323 r = sd_rtnl_message_open_container(m, VETH_INFO_PEER);
1325 log_error("Failed to open netlink container: %s", strerror(-r));
1329 r = sd_rtnl_message_append_string(m, IFLA_IFNAME, iface_name);
1331 log_error("Failed to add netlink interface name: %s", strerror(-r));
1335 r = sd_rtnl_message_append_u32(m, IFLA_NET_NS_FD, netns_fd);
1337 log_error("Failed to add netlink namespace field: %s", strerror(-r));
1341 r = sd_rtnl_message_close_container(m);
1343 log_error("Failed to close netlink container: %s", strerror(-r));
1347 r = sd_rtnl_message_close_container(m);
1349 log_error("Failed to close netlink container: %s", strerror(-r));
1353 r = sd_rtnl_message_close_container(m);
1355 log_error("Failed to close netlink container: %s", strerror(-r));
1359 r = sd_rtnl_call(rtnl, m, 0, NULL);
1361 log_error("Failed to add new veth interfaces: %s", strerror(-r));
/* Move every interface listed in arg_network_interfaces into the network
 * namespace of process "pid".
 *
 * For each name the interface index is resolved, the matching udev device
 * is looked up and must be initialized, and an RTM_NEWLINK message carrying
 * IFLA_NET_NS_PID is sent over rtnetlink. The function is guarded by
 * arg_private_network and by the list being non-empty. */
1368 static int move_network_interfaces(pid_t pid) {
1369 _cleanup_udev_unref_ struct udev *udev = NULL;
1370 _cleanup_rtnl_unref_ sd_rtnl *rtnl = NULL;
1374 if (!arg_private_network)
1377 if (strv_isempty(arg_network_interfaces))
1380 r = sd_rtnl_open(0, &rtnl);
1382 log_error("Failed to connect to netlink: %s", strerror(-r));
1388 log_error("Failed to connect to udev.");
1392 STRV_FOREACH(i, arg_network_interfaces) {
1393 _cleanup_rtnl_message_unref_ sd_rtnl_message *m = NULL;
1394 _cleanup_udev_device_unref_ struct udev_device *d = NULL;
/* Room for the "n" prefix, the decimal index and the terminating NUL. */
1395 char ifi_str[2 + DECIMAL_STR_MAX(int)];
1398 ifi = (int) if_nametoindex(*i);
1400 log_error("Failed to resolve interface %s: %m", *i);
/* The udev device ID used for the lookup is "n" followed by the
 * interface index. */
1404 sprintf(ifi_str, "n%i", ifi);
1405 d = udev_device_new_from_device_id(udev, ifi_str);
1407 log_error("Failed to get udev device for interface %s: %m", *i);
1411 if (udev_device_get_is_initialized(d) <= 0) {
1412 log_error("Network interface %s is not initialized yet.", *i);
1416 r = sd_rtnl_message_new_link(RTM_NEWLINK, ifi, &m);
1418 log_error("Failed to allocate netlink message: %s", strerror(-r));
1422 r = sd_rtnl_message_append_u32(m, IFLA_NET_NS_PID, pid);
1424 log_error("Failed to append namespace PID to netlink message: %s", strerror(-r));
1428 r = sd_rtnl_call(rtnl, m, 0, NULL);
1430 log_error("Failed to move interface %s to namespace: %s", *i, strerror(-r));
/* Install a seccomp filter that allows everything by default but adds one
 * rule returning EAFNOSUPPORT when argument 0 is AF_NETLINK and argument 2
 * is NETLINK_AUDIT (the prose below names socket() as the affected call).
 * The SCMP_FLTATR_CTL_NNP attribute is set to 0 before the filter is
 * loaded, and the filter context is released afterwards. */
1438 static int audit_still_doesnt_work_in_containers(void) {
1441 scmp_filter_ctx seccomp;
1445 Audit is broken in containers, much of the userspace audit
1446 hookup will fail if running inside a container. We don't
1447 care and just turn off creation of audit sockets.
1449 This will make socket(AF_NETLINK, *, NETLINK_AUDIT) fail
1450 with EAFNOSUPPORT which audit userspace uses as indication
1451 that audit is disabled in the kernel.
1454 seccomp = seccomp_init(SCMP_ACT_ALLOW);
1458 r = seccomp_rule_add_exact(
1460 SCMP_ACT_ERRNO(EAFNOSUPPORT),
1463 SCMP_A0(SCMP_CMP_EQ, AF_NETLINK),
1464 SCMP_A2(SCMP_CMP_EQ, NETLINK_AUDIT));
1466 log_error("Failed to add audit seccomp rule: %s", strerror(-r));
1470 r = seccomp_attr_set(seccomp, SCMP_FLTATR_CTL_NNP, 0);
1472 log_error("Failed to unset NO_NEW_PRIVS: %s", strerror(-r));
1476 r = seccomp_load(seccomp);
1478 log_error("Failed to install seccomp audit filter: %s", strerror(-r));
1481 seccomp_release(seccomp);
1489 int main(int argc, char *argv[]) {
1491 _cleanup_close_ int master = -1, kdbus_fd = -1, sync_fd = -1, netns_fd = -1;
1492 _cleanup_close_pipe_ int kmsg_socket_pair[2] = { -1, -1 };
1493 _cleanup_free_ char *kdbus_domain = NULL;
1494 _cleanup_fdset_free_ FDSet *fds = NULL;
1495 const char *console = NULL;
1496 int r = EXIT_FAILURE, k;
1501 log_parse_environment();
1504 k = parse_argv(argc, argv);
1512 if (arg_directory) {
1515 p = path_make_absolute_cwd(arg_directory);
1516 free(arg_directory);
1519 arg_directory = get_current_dir_name();
1521 if (!arg_directory) {
1522 log_error("Failed to determine path, please use -D.");
1526 path_kill_slashes(arg_directory);
1529 arg_machine = strdup(basename(arg_directory));
1535 hostname_cleanup(arg_machine, false);
1536 if (isempty(arg_machine)) {
1537 log_error("Failed to determine machine name automatically, please use -M.");
1542 if (geteuid() != 0) {
1543 log_error("Need to be root.");
1547 if (sd_booted() <= 0) {
1548 log_error("Not running on a systemd system.");
1552 if (path_equal(arg_directory, "/")) {
1553 log_error("Spawning container on root directory not supported.");
1558 if (path_is_os_tree(arg_directory) <= 0) {
1559 log_error("Directory %s doesn't look like an OS root directory (/etc/os-release is missing). Refusing.", arg_directory);
1565 p = strappenda(arg_directory,
1566 argc > optind && path_is_absolute(argv[optind]) ? argv[optind] : "/usr/bin/");
1567 if (access(p, F_OK) < 0) {
1568 log_error("Directory %s lacks the binary to execute or doesn't look like a binary tree. Refusing.", arg_directory);
1575 n_fd_passed = sd_listen_fds(false);
1576 if (n_fd_passed > 0) {
1577 k = fdset_new_listen_fds(&fds, false);
1579 log_error("Failed to collect file descriptors: %s", strerror(-k));
1583 fdset_close_others(fds);
1586 master = posix_openpt(O_RDWR|O_NOCTTY|O_CLOEXEC|O_NDELAY);
1588 log_error("Failed to acquire pseudo tty: %m");
1592 console = ptsname(master);
1594 log_error("Failed to determine tty name: %m");
1599 log_info("Spawning container %s on %s. Press ^] three times within 1s to abort execution.", arg_machine, arg_directory);
1601 if (unlockpt(master) < 0) {
1602 log_error("Failed to unlock tty: %m");
1606 if (arg_network_veth) {
1607 netns_fd = open("/proc/self/ns/net", O_RDWR|O_CLOEXEC);
1609 log_error("Failed to open network namespace fd: %m");
1614 if (access("/dev/kdbus/control", F_OK) >= 0) {
1616 if (arg_share_system) {
1617 kdbus_domain = strdup("/dev/kdbus");
1618 if (!kdbus_domain) {
1625 ns = strappenda("machine-", arg_machine);
1626 kdbus_fd = bus_kernel_create_domain(ns, &kdbus_domain);
1628 log_debug("Failed to create kdbus domain: %s", strerror(-r));
1630 log_debug("Successfully created kdbus domain as %s", kdbus_domain);
1634 if (socketpair(AF_UNIX, SOCK_DGRAM|SOCK_NONBLOCK|SOCK_CLOEXEC, 0, kmsg_socket_pair) < 0) {
1635 log_error("Failed to create kmsg socket pair: %m");
1639 sd_notify(0, "READY=1");
1641 assert_se(sigemptyset(&mask) == 0);
1642 sigset_add_many(&mask, SIGCHLD, SIGWINCH, SIGTERM, SIGINT, -1);
1643 assert_se(sigprocmask(SIG_BLOCK, &mask, NULL) == 0);
1648 sync_fd = eventfd(0, EFD_CLOEXEC);
1650 log_error("Failed to create event fd: %m");
1654 pid = syscall(__NR_clone,
1655 SIGCHLD|CLONE_NEWNS|
1656 (arg_share_system ? 0 : CLONE_NEWIPC|CLONE_NEWPID|CLONE_NEWUTS)|
1657 (arg_private_network ? CLONE_NEWNET : 0), NULL);
1659 if (errno == EINVAL)
1660 log_error("clone() failed, do you have namespace support enabled in your kernel? (You need UTS, IPC, PID and NET namespacing built in): %m");
1662 log_error("clone() failed: %m");
1669 const char *home = NULL;
1670 uid_t uid = (uid_t) -1;
1671 gid_t gid = (gid_t) -1;
1673 const char *envp[] = {
1674 "PATH=" DEFAULT_PATH_SPLIT_USR,
1675 "container=systemd-nspawn", /* LXC sets container=lxc, so follow the scheme here */
1680 NULL, /* container_uuid */
1681 NULL, /* LISTEN_FDS */
1682 NULL, /* LISTEN_PID */
1688 envp[n_env] = strv_find_prefix(environ, "TERM=");
1692 close_nointr_nofail(master);
1695 close_nointr(STDIN_FILENO);
1696 close_nointr(STDOUT_FILENO);
1697 close_nointr(STDERR_FILENO);
1699 close_nointr_nofail(kmsg_socket_pair[0]);
1700 kmsg_socket_pair[0] = -1;
1702 reset_all_signal_handlers();
1704 assert_se(sigemptyset(&mask) == 0);
1705 assert_se(sigprocmask(SIG_SETMASK, &mask, NULL) == 0);
1707 k = open_terminal(console, O_RDWR);
1708 if (k != STDIN_FILENO) {
1710 close_nointr_nofail(k);
1714 log_error("Failed to open console: %s", strerror(-k));
1718 if (dup2(STDIN_FILENO, STDOUT_FILENO) != STDOUT_FILENO ||
1719 dup2(STDIN_FILENO, STDERR_FILENO) != STDERR_FILENO) {
1720 log_error("Failed to duplicate console: %m");
1725 log_error("setsid() failed: %m");
1729 if (reset_audit_loginuid() < 0)
1732 if (prctl(PR_SET_PDEATHSIG, SIGKILL) < 0) {
1733 log_error("PR_SET_PDEATHSIG failed: %m");
1737 /* Mark everything as slave, so that we still
1738 * receive mounts from the real root, but don't
1739 * propagate mounts to the real root. */
1740 if (mount(NULL, "/", NULL, MS_SLAVE|MS_REC, NULL) < 0) {
1741 log_error("MS_SLAVE|MS_REC failed: %m");
1745 /* Turn directory into bind mount */
1746 if (mount(arg_directory, arg_directory, "bind", MS_BIND|MS_REC, NULL) < 0) {
1747 log_error("Failed to make bind mount.");
1752 if (mount(arg_directory, arg_directory, "bind", MS_BIND|MS_REMOUNT|MS_RDONLY|MS_REC, NULL) < 0) {
1753 log_error("Failed to make read-only.");
1757 if (mount_all(arg_directory) < 0)
1760 if (copy_devnodes(arg_directory) < 0)
1763 if (setup_ptmx(arg_directory) < 0)
1766 dev_setup(arg_directory);
1768 if (setup_veth(netns_fd) < 0)
1771 if (netns_fd >= 0) {
1772 close_nointr_nofail(netns_fd);
1776 if (audit_still_doesnt_work_in_containers() < 0)
1779 if (setup_dev_console(arg_directory, console) < 0)
1782 if (setup_kmsg(arg_directory, kmsg_socket_pair[1]) < 0)
1785 close_nointr_nofail(kmsg_socket_pair[1]);
1786 kmsg_socket_pair[1] = -1;
1788 if (setup_boot_id(arg_directory) < 0)
1791 if (setup_timezone(arg_directory) < 0)
1794 if (setup_resolv_conf(arg_directory) < 0)
1797 if (setup_journal(arg_directory) < 0)
1800 if (mount_binds(arg_directory, arg_bind, 0) < 0)
1803 if (mount_binds(arg_directory, arg_bind_ro, MS_RDONLY) < 0)
1806 if (setup_kdbus(arg_directory, kdbus_domain) < 0)
1809 if (chdir(arg_directory) < 0) {
1810 log_error("chdir(%s) failed: %m", arg_directory);
1814 if (mount(arg_directory, "/", NULL, MS_MOVE, NULL) < 0) {
1815 log_error("mount(MS_MOVE) failed: %m");
1819 if (chroot(".") < 0) {
1820 log_error("chroot() failed: %m");
1824 if (chdir("/") < 0) {
1825 log_error("chdir() failed: %m");
1831 if (arg_private_network)
1834 if (drop_capabilities() < 0) {
1835 log_error("drop_capabilities() failed: %m");
1841 /* Note that this resolves user names
1842 * inside the container, and hence
1843 * accesses the NSS modules from the
1844 * container and not the host. This is
1847 if (get_user_creds((const char**)&arg_user, &uid, &gid, &home, NULL) < 0) {
1848 log_error("get_user_creds() failed: %m");
1852 if (mkdir_parents_label(home, 0775) < 0) {
1853 log_error("mkdir_parents_label() failed: %m");
1857 if (mkdir_safe_label(home, 0775, uid, gid) < 0) {
1858 log_error("mkdir_safe_label() failed: %m");
1862 if (initgroups((const char*)arg_user, gid) < 0) {
1863 log_error("initgroups() failed: %m");
1867 if (setresgid(gid, gid, gid) < 0) {
1868 log_error("setregid() failed: %m");
1872 if (setresuid(uid, uid, uid) < 0) {
1873 log_error("setreuid() failed: %m");
1877 /* Reset everything fully to 0, just in case */
1879 if (setgroups(0, NULL) < 0) {
1880 log_error("setgroups() failed: %m");
1884 if (setresgid(0, 0, 0) < 0) {
1885 log_error("setregid() failed: %m");
1889 if (setresuid(0, 0, 0) < 0) {
1890 log_error("setreuid() failed: %m");
1895 if ((asprintf((char**)(envp + n_env++), "HOME=%s", home ? home: "/root") < 0) ||
1896 (asprintf((char**)(envp + n_env++), "USER=%s", arg_user ? arg_user : "root") < 0) ||
1897 (asprintf((char**)(envp + n_env++), "LOGNAME=%s", arg_user ? arg_user : "root") < 0)) {
1902 if (!sd_id128_equal(arg_uuid, SD_ID128_NULL)) {
1903 if (asprintf((char**)(envp + n_env++), "container_uuid=" SD_ID128_FORMAT_STR, SD_ID128_FORMAT_VAL(arg_uuid)) < 0) {
1909 if (fdset_size(fds) > 0) {
1910 k = fdset_cloexec(fds, false);
1912 log_error("Failed to unset O_CLOEXEC for file descriptors.");
1916 if ((asprintf((char **)(envp + n_env++), "LISTEN_FDS=%u", n_fd_passed) < 0) ||
1917 (asprintf((char **)(envp + n_env++), "LISTEN_PID=1") < 0)) {
1925 eventfd_read(sync_fd, &x);
1926 close_nointr_nofail(sync_fd);
1929 if (!strv_isempty(arg_setenv)) {
1932 n = strv_env_merge(2, envp, arg_setenv);
1940 env_use = (char**) envp;
1943 if (arg_selinux_context)
1944 if (setexeccon(arg_selinux_context) < 0)
1945 log_error("setexeccon(\"%s\") failed: %m", arg_selinux_context);
1951 /* Automatically search for the init system */
1953 l = 1 + argc - optind;
1954 a = newa(char*, l + 1);
1955 memcpy(a + 1, argv + optind, l * sizeof(char*));
1957 a[0] = (char*) "/usr/lib/systemd/systemd";
1958 execve(a[0], a, env_use);
1960 a[0] = (char*) "/lib/systemd/systemd";
1961 execve(a[0], a, env_use);
1963 a[0] = (char*) "/sbin/init";
1964 execve(a[0], a, env_use);
1965 } else if (argc > optind)
1966 execvpe(argv[optind], argv + optind, env_use);
1968 chdir(home ? home : "/root");
1969 execle("/bin/bash", "-bash", NULL, env_use);
1970 execle("/bin/sh", "-sh", NULL, env_use);
1973 log_error("execv() failed: %m");
1976 _exit(EXIT_FAILURE);
1982 r = register_machine(pid);
1986 r = move_network_interfaces(pid);
1990 eventfd_write(sync_fd, 1);
1991 close_nointr_nofail(sync_fd);
1994 k = process_pty(master, &mask, arg_boot ? pid : 0, SIGRTMIN+3);
2003 /* Kill if it is not dead yet anyway */
2004 terminate_machine(pid);
2006 /* Redundant, but better safe than sorry */
2009 k = wait_for_terminate(pid, &status);
2017 if (status.si_code == CLD_EXITED) {
2018 r = status.si_status;
2019 if (status.si_status != 0) {
2020 log_error("Container %s failed with error code %i.", arg_machine, status.si_status);
2025 log_debug("Container %s exited successfully.", arg_machine);
2027 } else if (status.si_code == CLD_KILLED &&
2028 status.si_status == SIGINT) {
2031 log_info("Container %s has been shut down.", arg_machine);
2034 } else if (status.si_code == CLD_KILLED &&
2035 status.si_status == SIGHUP) {
2038 log_info("Container %s is being rebooted.", arg_machine);
2040 } else if (status.si_code == CLD_KILLED ||
2041 status.si_code == CLD_DUMPED) {
2043 log_error("Container %s terminated by signal %s.", arg_machine, signal_to_string(status.si_status));
2047 log_error("Container %s failed due to unknown reason.", arg_machine);
2057 free(arg_directory);
2060 free(arg_network_interfaces);