1 /*-*- Mode: C; c-basic-offset: 8; indent-tabs-mode: nil -*-*/
4 This file is part of systemd.
6 Copyright 2010 Lennart Poettering
8 systemd is free software; you can redistribute it and/or modify it
9 under the terms of the GNU Lesser General Public License as published by
10 the Free Software Foundation; either version 2.1 of the License, or
11 (at your option) any later version.
13 systemd is distributed in the hope that it will be useful, but
14 WITHOUT ANY WARRANTY; without even the implied warranty of
15 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
16 Lesser General Public License for more details.
18 You should have received a copy of the GNU Lesser General Public License
19 along with systemd; If not, see <http://www.gnu.org/licenses/>.
25 #include <sys/types.h>
26 #include <sys/syscall.h>
27 #include <sys/mount.h>
33 #include <sys/prctl.h>
34 #include <sys/capability.h>
37 #include <sys/signalfd.h>
41 #include <sys/socket.h>
42 #include <linux/netlink.h>
43 #include <linux/rtnetlink.h>
44 #include <sys/eventfd.h>
46 #include <linux/veth.h>
49 #include <selinux/selinux.h>
56 #include "sd-daemon.h"
66 #include "cgroup-util.h"
68 #include "path-util.h"
69 #include "loopback-setup.h"
70 #include "dev-setup.h"
75 #include "bus-error.h"
77 #include "bus-kernel.h"
80 #include "rtnl-util.h"
81 #include "udev-util.h"
83 typedef enum LinkJournal {
/* Command-line configuration. These file-scope variables hold the
 * defaults; parse_argv() below overwrites them from the options given
 * on the command line, and the setup helpers further down read them. */
90 static char *arg_directory = NULL;
91 static char *arg_user = NULL;
92 static sd_id128_t arg_uuid = {};
93 static char *arg_machine = NULL;
94 static char *arg_selinux_context = NULL;
95 static char *arg_selinux_apifs_context = NULL;
96 static const char *arg_slice = NULL;
97 static bool arg_private_network = false;
98 static bool arg_read_only = false;
99 static bool arg_boot = false;
100 static LinkJournal arg_link_journal = LINK_AUTO;
/* Bit mask of capabilities, one bit per CAP_* number. This is the
 * default set; parse_argv() ORs in the --capability= bits (and
 * CAP_NET_ADMIN when a private network is requested) and then masks out
 * the --drop-capability= bits. drop_capabilities() passes the
 * complement of this mask to capability_bounding_set_drop(). */
101 static uint64_t arg_retain =
102 (1ULL << CAP_CHOWN) |
103 (1ULL << CAP_DAC_OVERRIDE) |
104 (1ULL << CAP_DAC_READ_SEARCH) |
105 (1ULL << CAP_FOWNER) |
106 (1ULL << CAP_FSETID) |
107 (1ULL << CAP_IPC_OWNER) |
109 (1ULL << CAP_LEASE) |
110 (1ULL << CAP_LINUX_IMMUTABLE) |
111 (1ULL << CAP_NET_BIND_SERVICE) |
112 (1ULL << CAP_NET_BROADCAST) |
113 (1ULL << CAP_NET_RAW) |
114 (1ULL << CAP_SETGID) |
115 (1ULL << CAP_SETFCAP) |
116 (1ULL << CAP_SETPCAP) |
117 (1ULL << CAP_SETUID) |
118 (1ULL << CAP_SYS_ADMIN) |
119 (1ULL << CAP_SYS_CHROOT) |
120 (1ULL << CAP_SYS_NICE) |
121 (1ULL << CAP_SYS_PTRACE) |
122 (1ULL << CAP_SYS_TTY_CONFIG) |
123 (1ULL << CAP_SYS_RESOURCE) |
124 (1ULL << CAP_SYS_BOOT) |
125 (1ULL << CAP_AUDIT_WRITE) |
126 (1ULL << CAP_AUDIT_CONTROL) |
/* arg_bind and arg_bind_ro are flat string lists holding
 * (source, destination) pairs: parse_argv() appends two entries per
 * --bind=/--bind-ro= option and mount_binds() walks them pairwise. */
128 static char **arg_bind = NULL;
129 static char **arg_bind_ro = NULL;
130 static char **arg_setenv = NULL;
131 static bool arg_quiet = false;
132 static bool arg_share_system = false;
133 static bool arg_register = true;
134 static bool arg_keep_unit = false;
135 static char **arg_network_interfaces = NULL;
136 static bool arg_network_veth = false;
/* Prints the usage summary with printf(); the single %s in the format
 * is filled with program_invocation_short_name.
 *
 * NOTE(review): the "-j" line says it is equivalent to
 * --link-journal=host, but the assignment in parse_argv() that sits
 * right before the ARG_LINK_JOURNAL case sets LINK_GUEST. That is
 * presumably the 'j' case (its label is not visible in this listing);
 * confirm and align text and code.
 * NOTE(review): the --network-veth line contains the typo "a a". */
138 static int help(void) {
140 printf("%s [OPTIONS...] [PATH] [ARGUMENTS...]\n\n"
141 "Spawn a minimal namespace container for debugging, testing and building.\n\n"
142 " -h --help Show this help\n"
143 " --version Print version string\n"
144 " -q --quiet Do not show status information\n"
145 " -D --directory=NAME Root directory for the container\n"
146 " -b --boot Boot up full system (i.e. invoke init)\n"
147 " -u --user=USER Run the command under specified user or uid\n"
148 " -M --machine=NAME Set the machine name for the container\n"
149 " --uuid=UUID Set a specific machine UUID for the container\n"
150 " -S --slice=SLICE Place the container in the specified slice\n"
151 " --private-network Disable network in container\n"
152 " --network-interface=INTERFACE\n"
153 " Assign an existing network interface to the\n"
155 " --network-veth Add a a virtual ethernet connection between host\n"
157 " -Z --selinux-context=SECLABEL\n"
158 " Set the SELinux security context to be used by\n"
159 " processes in the container\n"
160 " -L --selinux-apifs-context=SECLABEL\n"
161 " Set the SELinux security context to be used by\n"
162 " API/tmpfs file systems in the container\n"
163 " --capability=CAP In addition to the default, retain specified\n"
165 " --drop-capability=CAP Drop the specified capability from the default set\n"
166 " --link-journal=MODE Link up guest journal, one of no, auto, guest, host\n"
167 " -j Equivalent to --link-journal=host\n"
168 " --read-only Mount the root directory read-only\n"
169 " --bind=PATH[:PATH] Bind mount a file or directory from the host into\n"
171 " --bind-ro=PATH[:PATH] Similar, but creates a read-only bind mount\n"
172 " --setenv=NAME=VALUE Pass an environment variable to PID 1\n"
173 " --share-system Share system namespaces with host\n"
174 " --register=BOOLEAN Register container as machine\n"
175 " --keep-unit Do not register a scope for the machine, reuse\n"
176 " the service unit nspawn is running in\n",
177 program_invocation_short_name);
/* Parses the command line with getopt_long() and stores the result in
 * the arg_* globals.
 *
 * Capability options are not applied immediately: they are collected in
 * the local masks "plus" and "minus" and folded into arg_retain once all
 * options have been processed. After the option loop a few combinations
 * are rejected (--boot with --share-system, --keep-unit when
 * cg_pid_get_owner_uid() succeeds for the calling process).
 *
 * NOTE(review): the SELinux options store optarg itself, whereas e.g.
 * the user, slice and machine options store a strdup() copy; and
 * --network-interface pushes optarg itself into the list. Confirm the
 * ownership rules of strv_push() and of whoever frees these globals. */
182 static int parse_argv(int argc, char *argv[]) {
198 ARG_NETWORK_INTERFACE,
/* Long option table; options without a short form map to ARG_* ids. */
202 static const struct option options[] = {
203 { "help", no_argument, NULL, 'h' },
204 { "version", no_argument, NULL, ARG_VERSION },
205 { "directory", required_argument, NULL, 'D' },
206 { "user", required_argument, NULL, 'u' },
207 { "private-network", no_argument, NULL, ARG_PRIVATE_NETWORK },
208 { "boot", no_argument, NULL, 'b' },
209 { "uuid", required_argument, NULL, ARG_UUID },
210 { "read-only", no_argument, NULL, ARG_READ_ONLY },
211 { "capability", required_argument, NULL, ARG_CAPABILITY },
212 { "drop-capability", required_argument, NULL, ARG_DROP_CAPABILITY },
213 { "link-journal", required_argument, NULL, ARG_LINK_JOURNAL },
214 { "bind", required_argument, NULL, ARG_BIND },
215 { "bind-ro", required_argument, NULL, ARG_BIND_RO },
216 { "machine", required_argument, NULL, 'M' },
217 { "slice", required_argument, NULL, 'S' },
218 { "setenv", required_argument, NULL, ARG_SETENV },
219 { "selinux-context", required_argument, NULL, 'Z' },
220 { "selinux-apifs-context", required_argument, NULL, 'L' },
221 { "quiet", no_argument, NULL, 'q' },
222 { "share-system", no_argument, NULL, ARG_SHARE_SYSTEM },
223 { "register", required_argument, NULL, ARG_REGISTER },
224 { "keep-unit", no_argument, NULL, ARG_KEEP_UNIT },
225 { "network-interface", required_argument, NULL, ARG_NETWORK_INTERFACE },
226 { "network-veth", no_argument, NULL, ARG_NETWORK_VETH },
/* Capability bits requested via --capability= / --drop-capability=. */
231 uint64_t plus = 0, minus = 0;
236 while ((c = getopt_long(argc, argv, "+hD:u:bL:M:jS:Z:q", options, NULL)) >= 0) {
244 puts(PACKAGE_STRING);
245 puts(SYSTEMD_FEATURES);
250 arg_directory = canonicalize_file_name(optarg);
251 if (!arg_directory) {
252 log_error("Invalid root directory: %m");
260 arg_user = strdup(optarg);
/* A veth link implies a private network namespace. */
266 case ARG_NETWORK_VETH:
267 arg_network_veth = true;
268 arg_private_network = true;
271 case ARG_NETWORK_INTERFACE:
272 if (strv_push(&arg_network_interfaces, optarg) < 0)
277 case ARG_PRIVATE_NETWORK:
278 arg_private_network = true;
286 r = sd_id128_from_string(optarg, &arg_uuid);
288 log_error("Invalid UUID: %s", optarg);
294 arg_slice = strdup(optarg);
301 if (isempty(optarg)) {
306 if (!hostname_is_valid(optarg)) {
307 log_error("Invalid machine name: %s", optarg);
312 arg_machine = strdup(optarg);
320 arg_selinux_context = optarg;
324 arg_selinux_apifs_context = optarg;
328 arg_read_only = true;
/* The argument is a comma-separated list of capability names; the
 * word "all" selects every bit. Whether a word lands in "plus" or
 * "minus" depends on which of the two options is being handled. */
332 case ARG_DROP_CAPABILITY: {
336 FOREACH_WORD_SEPARATOR(word, length, optarg, ",", state) {
337 _cleanup_free_ char *t;
340 t = strndup(word, length);
344 if (streq(t, "all")) {
345 if (c == ARG_CAPABILITY)
346 plus = (uint64_t) -1;
348 minus = (uint64_t) -1;
350 if (cap_from_name(t, &cap) < 0) {
351 log_error("Failed to parse capability %s.", t);
355 if (c == ARG_CAPABILITY)
356 plus |= 1ULL << (uint64_t) cap;
358 minus |= 1ULL << (uint64_t) cap;
366 arg_link_journal = LINK_GUEST;
369 case ARG_LINK_JOURNAL:
370 if (streq(optarg, "auto"))
371 arg_link_journal = LINK_AUTO;
372 else if (streq(optarg, "no"))
373 arg_link_journal = LINK_NO;
374 else if (streq(optarg, "guest"))
375 arg_link_journal = LINK_GUEST;
376 else if (streq(optarg, "host"))
377 arg_link_journal = LINK_HOST;
379 log_error("Failed to parse link journal mode %s", optarg);
/* --bind= / --bind-ro=: the argument is split at the first ':' and
 * both resulting paths must be absolute; they are appended to the
 * selected list as two consecutive entries. */
387 _cleanup_free_ char *a = NULL, *b = NULL;
391 x = c == ARG_BIND ? &arg_bind : &arg_bind_ro;
393 e = strchr(optarg, ':');
395 a = strndup(optarg, e - optarg);
405 if (!path_is_absolute(a) || !path_is_absolute(b)) {
406 log_error("Invalid bind mount specification: %s", optarg);
410 r = strv_extend(x, a);
414 r = strv_extend(x, b);
424 if (!env_assignment_is_valid(optarg)) {
425 log_error("Environment variable assignment '%s' is not valid.", optarg);
429 n = strv_env_set(arg_setenv, optarg);
433 strv_free(arg_setenv);
442 case ARG_SHARE_SYSTEM:
443 arg_share_system = true;
447 r = parse_boolean(optarg);
449 log_error("Failed to parse --register= argument: %s", optarg);
457 arg_keep_unit = true;
464 assert_not_reached("Unhandled option");
/* Sharing the system namespaces turns machine registration off. */
468 if (arg_share_system)
469 arg_register = false;
471 if (arg_boot && arg_share_system) {
472 log_error("--boot and --share-system may not be combined.");
476 if (arg_keep_unit && cg_pid_get_owner_uid(0, NULL) >= 0) {
477 log_error("--keep-unit may not be used when invoked from a user session.");
/* Final capability set: defaults plus requested additions (and
 * CAP_NET_ADMIN for private networking), minus requested drops. */
481 arg_retain = (arg_retain | plus | (arg_private_network ? 1ULL << CAP_NET_ADMIN : 0)) & ~minus;
/* Mounts the entries of mount_table below the container root "dest".
 *
 * For each entry the target path is built as dest + "/" + where and
 * checked with path_is_mount_point(). The target directory is created
 * with mkdir_p(). When arg_selinux_apifs_context is set, tmpfs and
 * devpts entries get a context="..." option appended to their mount
 * options. A mount() failure is reported only for entries whose "fatal"
 * field is true.
 *
 * NOTE(review): the MountPoint field list is not visible in this
 * listing; the initializer columns appear to be what, where, type,
 * options, flags, fatal - confirm against the struct definition. */
486 static int mount_all(const char *dest) {
488 typedef struct MountPoint {
497 static const MountPoint mount_table[] = {
498 { "proc", "/proc", "proc", NULL, MS_NOSUID|MS_NOEXEC|MS_NODEV, true },
499 { "/proc/sys", "/proc/sys", NULL, NULL, MS_BIND, true }, /* Bind mount first */
500 { NULL, "/proc/sys", NULL, NULL, MS_BIND|MS_RDONLY|MS_REMOUNT, true }, /* Then, make it r/o */
501 { "sysfs", "/sys", "sysfs", NULL, MS_RDONLY|MS_NOSUID|MS_NOEXEC|MS_NODEV, true },
502 { "tmpfs", "/dev", "tmpfs", "mode=755", MS_NOSUID|MS_STRICTATIME, true },
503 { "devpts", "/dev/pts", "devpts","newinstance,ptmxmode=0666,mode=620,gid=" STRINGIFY(TTY_GID), MS_NOSUID|MS_NOEXEC, true },
504 { "tmpfs", "/dev/shm", "tmpfs", "mode=1777", MS_NOSUID|MS_NODEV|MS_STRICTATIME, true },
505 { "tmpfs", "/run", "tmpfs", "mode=755", MS_NOSUID|MS_NODEV|MS_STRICTATIME, true },
507 { "/sys/fs/selinux", "/sys/fs/selinux", NULL, NULL, MS_BIND, false }, /* Bind mount first */
508 { NULL, "/sys/fs/selinux", NULL, NULL, MS_BIND|MS_RDONLY|MS_REMOUNT, false }, /* Then, make it r/o */
515 for (k = 0; k < ELEMENTSOF(mount_table); k++) {
516 _cleanup_free_ char *where = NULL;
518 _cleanup_free_ char *options = NULL;
523 where = strjoin(dest, "/", mount_table[k].where, NULL);
527 t = path_is_mount_point(where, true);
529 log_error("Failed to detect whether %s is a mount point: %s", where, strerror(-t));
/* Entries with a NULL "what" are the remount steps of the table; only
 * entries that name a source are tested against "already mounted". */
537 /* Skip this entry if it is not a remount. */
538 if (mount_table[k].what && t > 0)
541 mkdir_p(where, 0755);
544 if (arg_selinux_apifs_context &&
545 (streq_ptr(mount_table[k].what, "tmpfs") || streq_ptr(mount_table[k].what, "devpts"))) {
546 options = strjoin(mount_table[k].options, ",context=\"", arg_selinux_apifs_context, "\"", NULL);
553 o = mount_table[k].options;
556 if (mount(mount_table[k].what,
559 mount_table[k].flags,
561 mount_table[k].fatal) {
563 log_error("mount(%s) failed: %m", where);
/* Bind mounts host paths into the container tree.
 *
 * dest:  container root; each destination path is appended to it.
 * l:     string list walked in pairs (*x = host source, *y = destination
 *        inside the container).
 * flags: extra mount flags; when non-zero the fresh bind mount is
 *        remounted with MS_REMOUNT|MS_BIND|flags.
 *
 * If the destination already exists its file type must match the
 * source. If it does not exist (ENOENT) the parent directories are
 * created and a mount point of a matching type is made; per the comment
 * in the body, device nodes are never created. */
573 static int mount_binds(const char *dest, char **l, unsigned long flags) {
576 STRV_FOREACH_PAIR(x, y, l) {
578 struct stat source_st, dest_st;
581 if (stat(*x, &source_st) < 0) {
582 log_error("failed to stat %s: %m", *x);
586 where = strappenda(dest, *y);
587 r = stat(where, &dest_st);
589 if ((source_st.st_mode & S_IFMT) != (dest_st.st_mode & S_IFMT)) {
590 log_error("The file types of %s and %s do not match. Refusing bind mount",
594 } else if (errno == ENOENT) {
595 r = mkdir_parents_label(where, 0755);
597 log_error("Failed to bind mount %s: %s", *x, strerror(-r));
601 log_error("Failed to bind mount %s: %s", *x, strerror(errno));
604 /* Create the mount point, but be conservative -- refuse to create block
605 * and char devices. */
606 if (S_ISDIR(source_st.st_mode))
607 mkdir_label(where, 0755);
608 else if (S_ISFIFO(source_st.st_mode))
610 else if (S_ISSOCK(source_st.st_mode))
611 mknod(where, 0644 | S_IFSOCK, 0);
612 else if (S_ISREG(source_st.st_mode))
615 log_error("Refusing to create mountpoint for file: %s", *x);
619 if (mount(*x, where, "bind", MS_BIND, NULL) < 0) {
620 log_error("mount(%s) failed: %m", where);
/* Second step: apply the caller's flags (e.g. read-only) by remounting. */
624 if (flags && mount(NULL, where, NULL, MS_REMOUNT|MS_BIND|flags, NULL) < 0) {
625 log_error("mount(%s) failed: %m", where);
/* Points the container's /etc/localtime at the same zone as the host.
 *
 * The host's /etc/localtime must be a symlink into /usr/share/zoneinfo/
 * (relative "../usr/share/zoneinfo/" or absolute form); the remainder
 * of the link target is taken as the zone name. The container's link is
 * read the same way and compared against it. The zone file must exist
 * below dest/usr/share/zoneinfo/. The new link is created as the
 * relative target "../usr/share/zoneinfo/<zone>".
 *
 * Conditions that prevent an update are logged as warnings; only a
 * failing symlink() is logged as an error. */
633 static int setup_timezone(const char *dest) {
634 _cleanup_free_ char *where = NULL, *p = NULL, *q = NULL, *check = NULL, *what = NULL;
640 /* Fix the timezone, if possible */
641 r = readlink_malloc("/etc/localtime", &p);
643 log_warning("/etc/localtime is not a symlink, not updating container timezone.");
647 z = path_startswith(p, "../usr/share/zoneinfo/");
649 z = path_startswith(p, "/usr/share/zoneinfo/");
651 log_warning("/etc/localtime does not point into /usr/share/zoneinfo/, not updating container timezone.");
655 where = strappend(dest, "/etc/localtime");
659 r = readlink_malloc(where, &q);
661 y = path_startswith(q, "../usr/share/zoneinfo/");
663 y = path_startswith(q, "/usr/share/zoneinfo/");
666 /* Already pointing to the right place? Then do nothing .. */
667 if (y && streq(y, z))
671 check = strjoin(dest, "/usr/share/zoneinfo/", z, NULL);
675 if (access(check, F_OK) < 0) {
676 log_warning("Timezone %s does not exist in container, not updating container timezone.", z);
680 what = strappend("../usr/share/zoneinfo/", z);
685 if (symlink(what, where) < 0) {
686 log_error("Failed to correct timezone of container: %m");
/* Copies the host's /etc/resolv.conf to dest/etc/resolv.conf.
 *
 * arg_private_network is checked first (the branch body is not visible
 * in this listing). The copy is best effort: the return value of
 * copy_file() is deliberately ignored, as the comment in the body
 * explains. */
693 static int setup_resolv_conf(const char *dest) {
694 char _cleanup_free_ *where = NULL;
698 if (arg_private_network)
701 /* Fix resolv.conf, if possible */
702 where = strappend(dest, "/etc/resolv.conf");
706 /* We don't really care for the results of this really. If it
707 * fails, it fails, but meh... */
708 copy_file("/etc/resolv.conf", where, O_TRUNC|O_NOFOLLOW);
/* Gives the container its own boot ID.
 *
 * A random 128-bit ID is generated, formatted as a dash-separated UUID
 * string and written to dest/dev/proc-sys-kernel-random-boot-id. That
 * file is then bind mounted over dest/proc/sys/kernel/random/boot_id
 * and the bind mount is remounted read-only. Failure of the read-only
 * remount is only logged as a warning. arg_share_system is checked
 * first (the branch body is not visible in this listing). */
713 static int setup_boot_id(const char *dest) {
714 _cleanup_free_ char *from = NULL, *to = NULL;
721 if (arg_share_system)
724 /* Generate a new randomized boot ID, so that each boot-up of
725 * the container gets a new one */
727 from = strappend(dest, "/dev/proc-sys-kernel-random-boot-id");
728 to = strappend(dest, "/proc/sys/kernel/random/boot_id");
732 r = sd_id128_randomize(&rnd);
734 log_error("Failed to generate random boot id: %s", strerror(-r));
738 snprintf(as_uuid, sizeof(as_uuid),
739 "%02x%02x%02x%02x-%02x%02x-%02x%02x-%02x%02x-%02x%02x%02x%02x%02x%02x",
740 SD_ID128_FORMAT_VAL(rnd));
741 char_array_0(as_uuid);
743 r = write_string_file(from, as_uuid);
745 log_error("Failed to write boot id: %s", strerror(-r));
749 if (mount(from, to, "bind", MS_BIND, NULL) < 0) {
750 log_error("Failed to bind mount boot id: %m");
752 } else if (mount(from, to, "bind", MS_BIND|MS_REMOUNT|MS_RDONLY, NULL))
753 log_warning("Failed to make boot id read-only: %m");
/* Recreates a fixed list of host device nodes below dest/dev/.
 *
 * "devnodes" is a NUL-separated name list (its contents are not visible
 * in this listing). For each name, /dev/<name> is stat()ed on the host:
 * a stat() failure is logged unless errno is ENOENT, anything that is
 * neither a character nor a block device is rejected, and otherwise a
 * node with the same mode and device number is created with mknod() at
 * dest/dev/<name>.
 *
 * NOTE(review): the mknod() failure message prints "dest" (the
 * container root) rather than "to" (the node path that failed) -
 * looks unintended. */
759 static int copy_devnodes(const char *dest) {
761 static const char devnodes[] =
771 _cleanup_umask_ mode_t u;
777 NULSTR_FOREACH(d, devnodes) {
778 _cleanup_free_ char *from = NULL, *to = NULL;
781 from = strappend("/dev/", d);
782 to = strjoin(dest, "/dev/", d, NULL);
786 if (stat(from, &st) < 0) {
788 if (errno != ENOENT) {
789 log_error("Failed to stat %s: %m", from);
793 } else if (!S_ISCHR(st.st_mode) && !S_ISBLK(st.st_mode)) {
795 log_error("%s is not a char or block device, cannot copy", from);
798 } else if (mknod(to, st.st_mode, st.st_rdev) < 0) {
800 log_error("mknod(%s) failed: %m", dest);
/* Creates dest/dev/ptmx as a symlink with the relative target
 * "pts/ptmx" and logs an error if symlink() fails. */
808 static int setup_ptmx(const char *dest) {
809 _cleanup_free_ char *p = NULL;
811 p = strappend(dest, "/dev/ptmx");
815 if (symlink("pts/ptmx", p) < 0) {
816 log_error("Failed to create /dev/ptmx symlink: %m");
/* Makes the TTY "console" available as dest/dev/console.
 *
 * console: path of the TTY on the host side; it must be a character
 *          device. Its access mode is set to 0600 and its owner to
 *          uid/gid 0 via chmod_and_chown().
 *
 * A device node with mode 0600 and the TTY's device number is created
 * at dest/dev/console and the TTY is then bind mounted on top of it;
 * the comment in the body explains why the node is needed.
 *
 * NOTE(review): in this listing that explanatory comment has lost its
 * closing line; restore the terminator before compiling. */
823 static int setup_dev_console(const char *dest, const char *console) {
825 _cleanup_free_ char *to = NULL;
827 _cleanup_umask_ mode_t u;
834 if (stat(console, &st) < 0) {
835 log_error("Failed to stat %s: %m", console);
838 } else if (!S_ISCHR(st.st_mode)) {
839 log_error("/dev/console is not a char device");
843 r = chmod_and_chown(console, 0600, 0, 0);
845 log_error("Failed to correct access mode for TTY: %s", strerror(-r));
849 if (asprintf(&to, "%s/dev/console", dest) < 0)
852 /* We need to bind mount the right tty to /dev/console since
853 * ptys can only exist on pts file systems. To have something
854 * to bind mount things on we create a device node first, that
855 * has the right major/minor (note that the major minor
856 * doesn't actually matter here, since we mount it over
859 if (mknod(to, (st.st_mode & ~07777) | 0600, st.st_rdev) < 0) {
860 log_error("mknod() for /dev/console failed: %m");
864 if (mount(console, to, "bind", MS_BIND, NULL) < 0) {
865 log_error("Bind mount for /dev/console failed: %m");
/* Provides a /proc/kmsg replacement inside the container.
 *
 * dest:        container root.
 * kmsg_socket: socket (must be >= 0) over which the opened FIFO file
 *              descriptor is handed out as an SCM_RIGHTS control
 *              message.
 *
 * A FIFO is created at dest/dev/kmsg, chmod/chowned to 0600 root:root,
 * and bind mounted onto dest/proc/kmsg. It is then opened and the fd is
 * sent through kmsg_socket; the local copy of the fd is closed right
 * after sendmsg(). The comments in the body give the rationale. */
872 static int setup_kmsg(const char *dest, int kmsg_socket) {
873 _cleanup_free_ char *from = NULL, *to = NULL;
875 _cleanup_umask_ mode_t u;
877 struct cmsghdr cmsghdr;
878 uint8_t buf[CMSG_SPACE(sizeof(int))];
881 .msg_control = &control,
882 .msg_controllen = sizeof(control),
884 struct cmsghdr *cmsg;
887 assert(kmsg_socket >= 0);
891 /* We create the kmsg FIFO as /dev/kmsg, but immediately
892 * delete it after bind mounting it to /proc/kmsg. While FIFOs
893 * on the reading side behave very similar to /proc/kmsg,
894 * their writing side behaves differently from /dev/kmsg in
895 * that writing blocks when nothing is reading. In order to
896 * avoid any problems with containers deadlocking due to this
897 * we simply make /dev/kmsg unavailable to the container. */
898 if (asprintf(&from, "%s/dev/kmsg", dest) < 0 ||
899 asprintf(&to, "%s/proc/kmsg", dest) < 0)
902 if (mkfifo(from, 0600) < 0) {
903 log_error("mkfifo() for /dev/kmsg failed: %m");
907 r = chmod_and_chown(from, 0600, 0, 0);
909 log_error("Failed to correct access mode for /dev/kmsg: %s", strerror(-r));
913 if (mount(from, to, "bind", MS_BIND, NULL) < 0) {
914 log_error("Bind mount for /proc/kmsg failed: %m");
918 fd = open(from, O_RDWR|O_NDELAY|O_CLOEXEC);
920 log_error("Failed to open fifo: %m");
/* Build a single SCM_RIGHTS control message carrying the FIFO fd. */
924 cmsg = CMSG_FIRSTHDR(&mh);
925 cmsg->cmsg_level = SOL_SOCKET;
926 cmsg->cmsg_type = SCM_RIGHTS;
927 cmsg->cmsg_len = CMSG_LEN(sizeof(int));
928 memcpy(CMSG_DATA(cmsg), &fd, sizeof(int));
930 mh.msg_controllen = cmsg->cmsg_len;
932 /* Store away the fd in the socket, so that it stays open as
933 * long as we run the child */
934 k = sendmsg(kmsg_socket, &mh, MSG_DONTWAIT|MSG_NOSIGNAL);
935 close_nointr_nofail(fd);
938 log_error("Failed to send FIFO fd: %m");
942 /* And now make the FIFO unavailable as /dev/kmsg... */
/* Sets the hostname to arg_machine with sethostname().
 * arg_share_system is checked first (the branch body is not visible in
 * this listing). */
947 static int setup_hostname(void) {
949 if (arg_share_system)
952 if (sethostname(arg_machine, strlen(arg_machine)) < 0)
/* Connects the container's journal directory with the host's, as
 * selected by arg_link_journal.
 *
 * directory: container root.
 *
 * The container's machine ID is read from directory/etc/machine-id and
 * validated; linking is refused when it equals the host's machine ID.
 * Two paths are then derived from the ID:
 *   p = /var/log/journal/<id>               (on the host)
 *   q = directory/var/log/journal/<id>      (inside the container tree)
 * Neither may already be a mount point. In LINK_GUEST mode p is made a
 * symlink pointing to q; in LINK_HOST mode p is created and bind
 * mounted onto q, which must be empty. In LINK_AUTO mode several of the
 * failure conditions are tolerated instead of reported as errors. */
958 static int setup_journal(const char *directory) {
959 sd_id128_t machine_id, this_id;
960 _cleanup_free_ char *p = NULL, *b = NULL, *q = NULL, *d = NULL;
964 p = strappend(directory, "/etc/machine-id");
968 r = read_one_line_file(p, &b);
969 if (r == -ENOENT && arg_link_journal == LINK_AUTO)
972 log_error("Failed to read machine ID from %s: %s", p, strerror(-r));
977 if (isempty(id) && arg_link_journal == LINK_AUTO)
980 /* Verify validity */
981 r = sd_id128_from_string(id, &machine_id);
983 log_error("Failed to parse machine ID from %s: %s", p, strerror(-r));
987 r = sd_id128_get_machine(&this_id);
989 log_error("Failed to retrieve machine ID: %s", strerror(-r));
/* Same ID as the host: a warning in auto mode, an error otherwise. */
993 if (sd_id128_equal(machine_id, this_id)) {
994 log_full(arg_link_journal == LINK_AUTO ? LOG_WARNING : LOG_ERR,
995 "Host and machine ids are equal (%s): refusing to link journals", id);
996 if (arg_link_journal == LINK_AUTO)
1002 if (arg_link_journal == LINK_NO)
1006 p = strappend("/var/log/journal/", id);
1007 q = strjoin(directory, "/var/log/journal/", id, NULL);
1011 if (path_is_mount_point(p, false) > 0) {
1012 if (arg_link_journal != LINK_AUTO) {
1013 log_error("%s: already a mount point, refusing to use for journal", p);
1020 if (path_is_mount_point(q, false) > 0) {
1021 if (arg_link_journal != LINK_AUTO) {
1022 log_error("%s: already a mount point, refusing to use for journal", q);
/* Inspect what currently sits at the host-side path p: an existing
 * symlink, something that is not a symlink (-EINVAL), or nothing. */
1029 r = readlink_and_make_absolute(p, &d);
1031 if ((arg_link_journal == LINK_GUEST ||
1032 arg_link_journal == LINK_AUTO) &&
1035 r = mkdir_p(q, 0755);
1037 log_warning("failed to create directory %s: %m", q);
1041 if (unlink(p) < 0) {
1042 log_error("Failed to remove symlink %s: %m", p);
1045 } else if (r == -EINVAL) {
1047 if (arg_link_journal == LINK_GUEST &&
1050 if (errno == ENOTDIR) {
1051 log_error("%s already exists and is neither a symlink nor a directory", p);
1054 log_error("Failed to remove %s: %m", p);
1058 } else if (r != -ENOENT) {
1059 log_error("readlink(%s) failed: %m", p);
1063 if (arg_link_journal == LINK_GUEST) {
1065 if (symlink(q, p) < 0) {
1066 log_error("Failed to symlink %s to %s: %m", q, p);
1070 r = mkdir_p(q, 0755);
1072 log_warning("failed to create directory %s: %m", q);
1076 if (arg_link_journal == LINK_HOST) {
1077 r = mkdir_p(p, 0755);
1079 log_error("Failed to create %s: %m", p);
1083 } else if (access(p, F_OK) < 0)
1086 if (dir_is_empty(q) == 0) {
1087 log_error("%s not empty.", q);
1091 r = mkdir_p(q, 0755);
1093 log_error("Failed to create %s: %m", q);
1097 if (mount(p, q, "bind", MS_BIND, NULL) < 0) {
1098 log_error("Failed to bind mount journal from host into guest: %m");
/* Creates the directory dest/dev/kdbus (mode 0755) and bind mounts
 * "path" onto it. Both failures are logged as errors. */
1105 static int setup_kdbus(const char *dest, const char *path) {
1111 p = strappenda(dest, "/dev/kdbus");
1112 if (mkdir(p, 0755) < 0) {
1113 log_error("Failed to create kdbus path: %m");
1117 if (mount(path, p, "bind", MS_BIND, NULL) < 0) {
1118 log_error("Failed to mount kdbus domain path: %m");
/* Passes the complement of arg_retain (i.e. every capability that is
 * not to be retained) to capability_bounding_set_drop() and returns its
 * result. */
1125 static int drop_capabilities(void) {
1126 return capability_bounding_set_drop(~arg_retain, false);
/* Registers the container through a method call on the
 * org.freedesktop.machine1.Manager interface of the system bus.
 *
 * Two call variants exist, selected by arg_keep_unit; only the second
 * one appends a "Slice" property, and only when arg_slice is non-empty.
 * Both pass arg_uuid and arg_directory. The method names and the
 * remaining arguments are not visible in this listing. A failed call is
 * logged with the bus error message. */
1129 static int register_machine(pid_t pid) {
1130 _cleanup_bus_error_free_ sd_bus_error error = SD_BUS_ERROR_NULL;
1131 _cleanup_bus_unref_ sd_bus *bus = NULL;
1137 r = sd_bus_default_system(&bus);
1139 log_error("Failed to open system bus: %s", strerror(-r));
1143 if (arg_keep_unit) {
1144 r = sd_bus_call_method(
1146 "org.freedesktop.machine1",
1147 "/org/freedesktop/machine1",
1148 "org.freedesktop.machine1.Manager",
1154 SD_BUS_MESSAGE_APPEND_ID128(arg_uuid),
1158 strempty(arg_directory));
1160 r = sd_bus_call_method(
1162 "org.freedesktop.machine1",
1163 "/org/freedesktop/machine1",
1164 "org.freedesktop.machine1.Manager",
1170 SD_BUS_MESSAGE_APPEND_ID128(arg_uuid),
1174 strempty(arg_directory),
1175 !isempty(arg_slice), "Slice", "s", arg_slice);
1179 log_error("Failed to register machine: %s", bus_error_message(&error, r));
/* Asks the machine1 service on the system bus to terminate the
 * container's machine object.
 *
 * First a Manager method is called whose reply carries an object path
 * (read with signature "o"); then a method of the
 * org.freedesktop.machine1.Machine interface is called. Failures of
 * either bus call are logged at debug level only; the comment in the
 * body explains why a missing machine object is not treated as an
 * error. */
1186 static int terminate_machine(pid_t pid) {
1187 _cleanup_bus_error_free_ sd_bus_error error = SD_BUS_ERROR_NULL;
1188 _cleanup_bus_message_unref_ sd_bus_message *reply = NULL;
1189 _cleanup_bus_unref_ sd_bus *bus = NULL;
1196 r = sd_bus_default_system(&bus);
1198 log_error("Failed to open system bus: %s", strerror(-r));
1202 r = sd_bus_call_method(
1204 "org.freedesktop.machine1",
1205 "/org/freedesktop/machine1",
1206 "org.freedesktop.machine1.Manager",
1213 /* Note that the machine might already have been
1214 * cleaned up automatically, hence don't consider it a
1215 * failure if we cannot get the machine object. */
1216 log_debug("Failed to get machine: %s", bus_error_message(&error, r));
1220 r = sd_bus_message_read(reply, "o", &path);
1222 return bus_log_parse_error(r);
1224 r = sd_bus_call_method(
1226 "org.freedesktop.machine1",
1228 "org.freedesktop.machine1.Machine",
1234 log_debug("Failed to terminate machine: %s", bus_error_message(&error, r));
/* Resets the audit login UID of the calling process.
 *
 * /proc/self/loginuid is read; if it does not already contain
 * "4294967295" that value is written to it. A failed write produces a
 * long explanatory error message about audit and old kernels.
 * arg_share_system is checked first (the branch body is not visible in
 * this listing).
 *
 * NOTE(review): the error text reads "or to off auditing"; a word
 * (presumably "turn") is missing. */
1241 static int reset_audit_loginuid(void) {
1242 _cleanup_free_ char *p = NULL;
1245 if (arg_share_system)
1248 r = read_one_line_file("/proc/self/loginuid", &p);
1252 log_error("Failed to read /proc/self/loginuid: %s", strerror(-r));
1256 /* Already reset? */
1257 if (streq(p, "4294967295"))
1260 r = write_string_file("/proc/self/loginuid", "4294967295");
1262 log_error("Failed to reset audit login UID. This probably means that your kernel is too\n"
1263 "old and you have audit enabled. Note that the auditing subsystem is known to\n"
1264 "be incompatible with containers on old kernels. Please make sure to upgrade\n"
1265 "your kernel or to off auditing with 'audit=0' on the kernel command line before\n"
1266 "using systemd-nspawn. Sleeping for 5s... (%s)\n", strerror(-r));
/* Creates a veth pair through an RTM_NEWLINK netlink request.
 *
 * netns_fd: file descriptor of a network namespace; it is attached to
 *           the peer end as IFLA_NET_NS_FD.
 *
 * The request names the link itself "host0" and, inside the nested
 * IFLA_LINKINFO / IFLA_INFO_DATA / VETH_INFO_PEER containers, names the
 * peer "ve-" followed by arg_machine. Only acts when both
 * arg_private_network and arg_network_veth are set.
 *
 * NOTE(review): strncpy() may fill the remaining IFNAMSIZ - 3 bytes
 * completely and then leaves iface_name without a terminating NUL when
 * arg_machine is that long or longer - unless a line not visible here
 * terminates it. Confirm and cap the copy at one byte less.
 * NOTE(review): the VETH_INFO_PEER error message starts with a stray
 * "z ", and both IFLA_IFNAME appends report "netlink kind" although
 * they append an interface name. */
1274 static int setup_veth(int netns_fd) {
1275 _cleanup_rtnl_message_unref_ sd_rtnl_message *m = NULL;
1276 _cleanup_rtnl_unref_ sd_rtnl *rtnl = NULL;
1277 char iface_name[IFNAMSIZ] = "ve-";
1280 if (!arg_private_network)
1283 if (!arg_network_veth)
1286 strncpy(iface_name+3, arg_machine, sizeof(iface_name) - 3);
1288 r = sd_rtnl_open(0, &rtnl);
1290 log_error("Failed to connect to netlink: %s", strerror(-r));
1294 r = sd_rtnl_message_new_link(RTM_NEWLINK, 0, &m);
1296 log_error("Failed to allocate netlink message: %s", strerror(-r));
1300 r = sd_rtnl_message_append_string(m, IFLA_IFNAME, "host0");
1302 log_error("Failed to append netlink kind: %s", strerror(-r));
1306 r = sd_rtnl_message_open_container(m, IFLA_LINKINFO);
1308 log_error("Failed to open netlink container: %s", strerror(-r));
1312 r = sd_rtnl_message_append_string(m, IFLA_INFO_KIND, "veth");
1314 log_error("Failed to append netlink kind: %s", strerror(-r));
1318 r = sd_rtnl_message_open_container(m, IFLA_INFO_DATA);
1320 log_error("Failed to open netlink container: %s", strerror(-r));
1324 r = sd_rtnl_message_open_container(m, VETH_INFO_PEER);
1326 log_error("z Failed to open netlink container: %s", strerror(-r));
1330 r = sd_rtnl_message_append_string(m, IFLA_IFNAME, iface_name);
1332 log_error("Failed to append netlink kind: %s", strerror(-r));
1336 r = sd_rtnl_message_append_u32(m, IFLA_NET_NS_FD, netns_fd);
1338 log_error("Failed to add netlink namespace field: %s", strerror(-r));
/* Three containers were opened above, so three are closed here. */
1342 r = sd_rtnl_message_close_container(m);
1344 log_error("Failed to close netlink container: %s", strerror(-r));
1348 r = sd_rtnl_message_close_container(m);
1350 log_error("Failed to close netlink container: %s", strerror(-r));
1354 r = sd_rtnl_message_close_container(m);
1356 log_error("Failed to close netlink container: %s", strerror(-r));
1360 r = sd_rtnl_call(rtnl, m, 0, NULL);
1362 log_error("Failed to add new veth interfaces: %s", strerror(-r));
/* Moves every interface listed in arg_network_interfaces into the
 * network namespace of process "pid".
 *
 * Only acts when arg_private_network is set and the list is non-empty.
 * For each name the interface index is resolved with if_nametoindex(),
 * the matching udev device is looked up via the device id "n<index>"
 * and must report itself as initialized, and then an RTM_NEWLINK
 * request carrying IFLA_NET_NS_PID = pid is sent over netlink. */
1369 static int move_network_interfaces(pid_t pid) {
1370 _cleanup_udev_unref_ struct udev *udev = NULL;
1371 _cleanup_rtnl_unref_ sd_rtnl *rtnl = NULL;
1375 if (!arg_private_network)
1378 if (strv_isempty(arg_network_interfaces))
1381 r = sd_rtnl_open(0, &rtnl);
1383 log_error("Failed to connect to netlink: %s", strerror(-r));
1389 log_error("Failed to connect to udev.");
1393 STRV_FOREACH(i, arg_network_interfaces) {
1394 _cleanup_rtnl_message_unref_ sd_rtnl_message *m = NULL;
1395 _cleanup_udev_device_unref_ struct udev_device *d = NULL;
/* Room for the "n" prefix, the decimal index and the trailing NUL. */
1396 char ifi_str[2 + DECIMAL_STR_MAX(int)];
1399 ifi = (int) if_nametoindex(*i);
1401 log_error("Failed to resolve interface %s: %m", *i);
1405 sprintf(ifi_str, "n%i", ifi);
1406 d = udev_device_new_from_device_id(udev, ifi_str);
1408 log_error("Failed to get udev device for interface %s: %m", *i);
1412 if (udev_device_get_is_initialized(d) <= 0) {
1413 log_error("Network interface %s is not initialized yet.", *i);
1417 r = sd_rtnl_message_new_link(RTM_NEWLINK, ifi, &m);
1419 log_error("Failed to allocate netlink message: %s", strerror(-r));
1423 r = sd_rtnl_message_append_u32(m, IFLA_NET_NS_PID, pid);
1425 log_error("Failed to append namespace PID to netlink message: %s", strerror(-r));
1429 r = sd_rtnl_call(rtnl, m, 0, NULL);
1431 log_error("Failed to move interface %s to namespace: %s", *i, strerror(-r));
/* Installs a seccomp filter that blocks creation of audit netlink
 * sockets.
 *
 * The filter allows everything by default (SCMP_ACT_ALLOW) and adds one
 * rule returning EAFNOSUPPORT when argument 0 equals AF_NETLINK and
 * argument 2 equals NETLINK_AUDIT; the text in the body states the
 * intended effect on socket(). The SCMP_FLTATR_CTL_NNP attribute is set
 * to 0 before the filter is loaded, and the filter context is released
 * afterwards. */
1439 static int audit_still_doesnt_work_in_containers(void) {
1442 scmp_filter_ctx seccomp;
1446 Audit is broken in containers, much of the userspace audit
1447 hookup will fail if running inside a container. We don't
1448 care and just turn off creation of audit sockets.
1450 This will make socket(AF_NETLINK, *, NETLINK_AUDIT) fail
1451 with EAFNOSUPPORT which audit userspace uses as indication
1452 that audit is disabled in the kernel.
1455 seccomp = seccomp_init(SCMP_ACT_ALLOW);
1459 r = seccomp_rule_add_exact(
1461 SCMP_ACT_ERRNO(EAFNOSUPPORT),
1464 SCMP_A0(SCMP_CMP_EQ, AF_NETLINK),
1465 SCMP_A2(SCMP_CMP_EQ, NETLINK_AUDIT));
1467 log_error("Failed to add audit seccomp rule: %s", strerror(-r));
1471 r = seccomp_attr_set(seccomp, SCMP_FLTATR_CTL_NNP, 0);
1473 log_error("Failed to unset NO_NEW_PRIVS: %s", strerror(-r));
1477 r = seccomp_load(seccomp);
1479 log_error("Failed to install seccomp audit filter: %s", strerror(-r));
1482 seccomp_release(seccomp);
1490 int main(int argc, char *argv[]) {
1492 _cleanup_close_ int master = -1, kdbus_fd = -1, sync_fd = -1, netns_fd = -1;
1493 _cleanup_close_pipe_ int kmsg_socket_pair[2] = { -1, -1 };
1494 _cleanup_free_ char *kdbus_domain = NULL;
1495 _cleanup_fdset_free_ FDSet *fds = NULL;
1496 const char *console = NULL;
1497 int r = EXIT_FAILURE, k;
1502 log_parse_environment();
1505 k = parse_argv(argc, argv);
1513 if (arg_directory) {
1516 p = path_make_absolute_cwd(arg_directory);
1517 free(arg_directory);
1520 arg_directory = get_current_dir_name();
1522 if (!arg_directory) {
1523 log_error("Failed to determine path, please use -D.");
1527 path_kill_slashes(arg_directory);
1530 arg_machine = strdup(basename(arg_directory));
1536 hostname_cleanup(arg_machine, false);
1537 if (isempty(arg_machine)) {
1538 log_error("Failed to determine machine name automatically, please use -M.");
1543 if (geteuid() != 0) {
1544 log_error("Need to be root.");
1548 if (sd_booted() <= 0) {
1549 log_error("Not running on a systemd system.");
1553 if (path_equal(arg_directory, "/")) {
1554 log_error("Spawning container on root directory not supported.");
1559 if (path_is_os_tree(arg_directory) <= 0) {
1560 log_error("Directory %s doesn't look like an OS root directory (/etc/os-release is missing). Refusing.", arg_directory);
1566 p = strappenda(arg_directory,
1567 argc > optind && path_is_absolute(argv[optind]) ? argv[optind] : "/usr/bin/");
1568 if (access(p, F_OK) < 0) {
1569 log_error("Directory %s lacks the binary to execute or doesn't look like a binary tree. Refusing.", arg_directory);
1576 n_fd_passed = sd_listen_fds(false);
1577 if (n_fd_passed > 0) {
1578 k = fdset_new_listen_fds(&fds, false);
1580 log_error("Failed to collect file descriptors: %s", strerror(-k));
1584 fdset_close_others(fds);
1587 master = posix_openpt(O_RDWR|O_NOCTTY|O_CLOEXEC|O_NDELAY);
1589 log_error("Failed to acquire pseudo tty: %m");
1593 console = ptsname(master);
1595 log_error("Failed to determine tty name: %m");
1600 log_info("Spawning container %s on %s. Press ^] three times within 1s to abort execution.", arg_machine, arg_directory);
1602 if (unlockpt(master) < 0) {
1603 log_error("Failed to unlock tty: %m");
1607 if (arg_network_veth) {
1608 netns_fd = open("/proc/self/ns/net", O_RDWR|O_CLOEXEC);
1610 log_error("Failed to open network namespace fd: %m");
1615 if (access("/dev/kdbus/control", F_OK) >= 0) {
1617 if (arg_share_system) {
1618 kdbus_domain = strdup("/dev/kdbus");
1619 if (!kdbus_domain) {
1626 ns = strappenda("machine-", arg_machine);
1627 kdbus_fd = bus_kernel_create_domain(ns, &kdbus_domain);
1629 log_debug("Failed to create kdbus domain: %s", strerror(-r));
1631 log_debug("Successfully created kdbus domain as %s", kdbus_domain);
1635 if (socketpair(AF_UNIX, SOCK_DGRAM|SOCK_NONBLOCK|SOCK_CLOEXEC, 0, kmsg_socket_pair) < 0) {
1636 log_error("Failed to create kmsg socket pair: %m");
1640 sd_notify(0, "READY=1");
1642 assert_se(sigemptyset(&mask) == 0);
1643 sigset_add_many(&mask, SIGCHLD, SIGWINCH, SIGTERM, SIGINT, -1);
1644 assert_se(sigprocmask(SIG_BLOCK, &mask, NULL) == 0);
1649 sync_fd = eventfd(0, EFD_CLOEXEC);
1651 log_error("Failed to create event fd: %m");
1655 pid = syscall(__NR_clone,
1656 SIGCHLD|CLONE_NEWNS|
1657 (arg_share_system ? 0 : CLONE_NEWIPC|CLONE_NEWPID|CLONE_NEWUTS)|
1658 (arg_private_network ? CLONE_NEWNET : 0), NULL);
1660 if (errno == EINVAL)
1661 log_error("clone() failed, do you have namespace support enabled in your kernel? (You need UTS, IPC, PID and NET namespacing built in): %m");
1663 log_error("clone() failed: %m");
1670 const char *home = NULL;
1671 uid_t uid = (uid_t) -1;
1672 gid_t gid = (gid_t) -1;
1674 const char *envp[] = {
1675 "PATH=" DEFAULT_PATH_SPLIT_USR,
1676 "container=systemd-nspawn", /* LXC sets container=lxc, so follow the scheme here */
1681 NULL, /* container_uuid */
1682 NULL, /* LISTEN_FDS */
1683 NULL, /* LISTEN_PID */
1689 envp[n_env] = strv_find_prefix(environ, "TERM=");
1693 close_nointr_nofail(master);
1696 close_nointr(STDIN_FILENO);
1697 close_nointr(STDOUT_FILENO);
1698 close_nointr(STDERR_FILENO);
1700 close_nointr_nofail(kmsg_socket_pair[0]);
1701 kmsg_socket_pair[0] = -1;
1703 reset_all_signal_handlers();
1705 assert_se(sigemptyset(&mask) == 0);
1706 assert_se(sigprocmask(SIG_SETMASK, &mask, NULL) == 0);
1708 k = open_terminal(console, O_RDWR);
1709 if (k != STDIN_FILENO) {
1711 close_nointr_nofail(k);
1715 log_error("Failed to open console: %s", strerror(-k));
1719 if (dup2(STDIN_FILENO, STDOUT_FILENO) != STDOUT_FILENO ||
1720 dup2(STDIN_FILENO, STDERR_FILENO) != STDERR_FILENO) {
1721 log_error("Failed to duplicate console: %m");
1726 log_error("setsid() failed: %m");
1730 if (reset_audit_loginuid() < 0)
1733 if (prctl(PR_SET_PDEATHSIG, SIGKILL) < 0) {
1734 log_error("PR_SET_PDEATHSIG failed: %m");
1738 /* Mark everything as slave, so that we still
1739 * receive mounts from the real root, but don't
1740 * propagate mounts to the real root. */
1741 if (mount(NULL, "/", NULL, MS_SLAVE|MS_REC, NULL) < 0) {
1742 log_error("MS_SLAVE|MS_REC failed: %m");
1746 /* Turn directory into bind mount */
1747 if (mount(arg_directory, arg_directory, "bind", MS_BIND|MS_REC, NULL) < 0) {
1748 log_error("Failed to make bind mount.");
1753 if (mount(arg_directory, arg_directory, "bind", MS_BIND|MS_REMOUNT|MS_RDONLY|MS_REC, NULL) < 0) {
1754 log_error("Failed to make read-only.");
1758 if (mount_all(arg_directory) < 0)
1761 if (copy_devnodes(arg_directory) < 0)
1764 if (setup_ptmx(arg_directory) < 0)
1767 dev_setup(arg_directory);
1769 if (setup_veth(netns_fd) < 0)
1772 if (netns_fd >= 0) {
1773 close_nointr_nofail(netns_fd);
1777 if (audit_still_doesnt_work_in_containers() < 0)
1780 if (setup_dev_console(arg_directory, console) < 0)
1783 if (setup_kmsg(arg_directory, kmsg_socket_pair[1]) < 0)
1786 close_nointr_nofail(kmsg_socket_pair[1]);
1787 kmsg_socket_pair[1] = -1;
1789 if (setup_boot_id(arg_directory) < 0)
1792 if (setup_timezone(arg_directory) < 0)
1795 if (setup_resolv_conf(arg_directory) < 0)
1798 if (setup_journal(arg_directory) < 0)
1801 if (mount_binds(arg_directory, arg_bind, 0) < 0)
1804 if (mount_binds(arg_directory, arg_bind_ro, MS_RDONLY) < 0)
1807 if (setup_kdbus(arg_directory, kdbus_domain) < 0)
1810 if (chdir(arg_directory) < 0) {
1811 log_error("chdir(%s) failed: %m", arg_directory);
1815 if (mount(arg_directory, "/", NULL, MS_MOVE, NULL) < 0) {
1816 log_error("mount(MS_MOVE) failed: %m");
1820 if (chroot(".") < 0) {
1821 log_error("chroot() failed: %m");
1825 if (chdir("/") < 0) {
1826 log_error("chdir() failed: %m");
1832 if (arg_private_network)
1835 if (drop_capabilities() < 0) {
1836 log_error("drop_capabilities() failed: %m");
1842 /* Note that this resolves user names
1843 * inside the container, and hence
1844 * accesses the NSS modules from the
1845 * container and not the host. This is
1848 if (get_user_creds((const char**)&arg_user, &uid, &gid, &home, NULL) < 0) {
1849 log_error("get_user_creds() failed: %m");
1853 if (mkdir_parents_label(home, 0775) < 0) {
1854 log_error("mkdir_parents_label() failed: %m");
1858 if (mkdir_safe_label(home, 0775, uid, gid) < 0) {
1859 log_error("mkdir_safe_label() failed: %m");
1863 if (initgroups((const char*)arg_user, gid) < 0) {
1864 log_error("initgroups() failed: %m");
1868 if (setresgid(gid, gid, gid) < 0) {
1869 log_error("setregid() failed: %m");
1873 if (setresuid(uid, uid, uid) < 0) {
1874 log_error("setreuid() failed: %m");
1878 /* Reset everything fully to 0, just in case */
1880 if (setgroups(0, NULL) < 0) {
1881 log_error("setgroups() failed: %m");
1885 if (setresgid(0, 0, 0) < 0) {
1886 log_error("setregid() failed: %m");
1890 if (setresuid(0, 0, 0) < 0) {
1891 log_error("setreuid() failed: %m");
1896 if ((asprintf((char**)(envp + n_env++), "HOME=%s", home ? home: "/root") < 0) ||
1897 (asprintf((char**)(envp + n_env++), "USER=%s", arg_user ? arg_user : "root") < 0) ||
1898 (asprintf((char**)(envp + n_env++), "LOGNAME=%s", arg_user ? arg_user : "root") < 0)) {
1903 if (!sd_id128_equal(arg_uuid, SD_ID128_NULL)) {
1904 if (asprintf((char**)(envp + n_env++), "container_uuid=" SD_ID128_FORMAT_STR, SD_ID128_FORMAT_VAL(arg_uuid)) < 0) {
1910 if (fdset_size(fds) > 0) {
1911 k = fdset_cloexec(fds, false);
1913 log_error("Failed to unset O_CLOEXEC for file descriptors.");
1917 if ((asprintf((char **)(envp + n_env++), "LISTEN_FDS=%u", n_fd_passed) < 0) ||
1918 (asprintf((char **)(envp + n_env++), "LISTEN_PID=1") < 0)) {
1926 eventfd_read(sync_fd, &x);
1927 close_nointr_nofail(sync_fd);
1930 if (!strv_isempty(arg_setenv)) {
1933 n = strv_env_merge(2, envp, arg_setenv);
1941 env_use = (char**) envp;
1944 if (arg_selinux_context)
1945 if (setexeccon(arg_selinux_context) < 0)
1946 log_error("setexeccon(\"%s\") failed: %m", arg_selinux_context);
1952 /* Automatically search for the init system */
1954 l = 1 + argc - optind;
1955 a = newa(char*, l + 1);
1956 memcpy(a + 1, argv + optind, l * sizeof(char*));
1958 a[0] = (char*) "/usr/lib/systemd/systemd";
1959 execve(a[0], a, env_use);
1961 a[0] = (char*) "/lib/systemd/systemd";
1962 execve(a[0], a, env_use);
1964 a[0] = (char*) "/sbin/init";
1965 execve(a[0], a, env_use);
1966 } else if (argc > optind)
1967 execvpe(argv[optind], argv + optind, env_use);
1969 chdir(home ? home : "/root");
1970 execle("/bin/bash", "-bash", NULL, env_use);
1971 execle("/bin/sh", "-sh", NULL, env_use);
1974 log_error("execv() failed: %m");
1977 _exit(EXIT_FAILURE);
1983 r = register_machine(pid);
1987 r = move_network_interfaces(pid);
1991 eventfd_write(sync_fd, 1);
1992 close_nointr_nofail(sync_fd);
1995 k = process_pty(master, &mask, arg_boot ? pid : 0, SIGRTMIN+3);
2004 /* Kill if it is not dead yet anyway */
2005 terminate_machine(pid);
2007 /* Redundant, but better safe than sorry */
2010 k = wait_for_terminate(pid, &status);
2018 if (status.si_code == CLD_EXITED) {
2019 r = status.si_status;
2020 if (status.si_status != 0) {
2021 log_error("Container %s failed with error code %i.", arg_machine, status.si_status);
2026 log_debug("Container %s exited successfully.", arg_machine);
2028 } else if (status.si_code == CLD_KILLED &&
2029 status.si_status == SIGINT) {
2032 log_info("Container %s has been shut down.", arg_machine);
2035 } else if (status.si_code == CLD_KILLED &&
2036 status.si_status == SIGHUP) {
2039 log_info("Container %s is being rebooted.", arg_machine);
2041 } else if (status.si_code == CLD_KILLED ||
2042 status.si_code == CLD_DUMPED) {
2044 log_error("Container %s terminated by signal %s.", arg_machine, signal_to_string(status.si_status));
2048 log_error("Container %s failed due to unknown reason.", arg_machine);
2058 free(arg_directory);
2061 free(arg_network_interfaces);