1 /*-*- Mode: C; c-basic-offset: 8; indent-tabs-mode: nil -*-*/
4 This file is part of systemd.
6 Copyright 2010 Lennart Poettering
8 systemd is free software; you can redistribute it and/or modify it
9 under the terms of the GNU Lesser General Public License as published by
10 the Free Software Foundation; either version 2.1 of the License, or
11 (at your option) any later version.
13 systemd is distributed in the hope that it will be useful, but
14 WITHOUT ANY WARRANTY; without even the implied warranty of
15 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
16 Lesser General Public License for more details.
18 You should have received a copy of the GNU Lesser General Public License
19 along with systemd; If not, see <http://www.gnu.org/licenses/>.
25 #include <sys/types.h>
26 #include <sys/syscall.h>
27 #include <sys/mount.h>
33 #include <sys/prctl.h>
34 #include <sys/capability.h>
37 #include <sys/signalfd.h>
41 #include <sys/socket.h>
42 #include <linux/netlink.h>
43 #include <linux/rtnetlink.h>
44 #include <sys/eventfd.h>
46 #include <linux/veth.h>
49 #include <selinux/selinux.h>
52 #include "sd-daemon.h"
62 #include "cgroup-util.h"
64 #include "path-util.h"
65 #include "loopback-setup.h"
66 #include "dev-setup.h"
71 #include "bus-error.h"
73 #include "bus-kernel.h"
76 #include "rtnl-util.h"
77 #include "udev-util.h"
79 typedef enum LinkJournal {
/* Settings collected from the command line by parse_argv(). The initializers
 * below are the defaults that apply when the corresponding option is not
 * given. */
86 static char *arg_directory = NULL;
87 static char *arg_user = NULL;
88 static sd_id128_t arg_uuid = {};
89 static char *arg_machine = NULL;
90 static char *arg_selinux_context = NULL;
91 static char *arg_selinux_apifs_context = NULL;
92 static const char *arg_slice = NULL;
93 static bool arg_private_network = false;
94 static bool arg_read_only = false;
95 static bool arg_boot = false;
96 static LinkJournal arg_link_journal = LINK_AUTO;
/* Bit mask with one bit per CAP_* number. parse_argv() adjusts it for
 * --capability=/--drop-capability=, and drop_capabilities() hands its
 * complement to capability_bounding_set_drop(). */
97 static uint64_t arg_retain =
99 (1ULL << CAP_DAC_OVERRIDE) |
100 (1ULL << CAP_DAC_READ_SEARCH) |
101 (1ULL << CAP_FOWNER) |
102 (1ULL << CAP_FSETID) |
103 (1ULL << CAP_IPC_OWNER) |
105 (1ULL << CAP_LEASE) |
106 (1ULL << CAP_LINUX_IMMUTABLE) |
107 (1ULL << CAP_NET_BIND_SERVICE) |
108 (1ULL << CAP_NET_BROADCAST) |
109 (1ULL << CAP_NET_RAW) |
110 (1ULL << CAP_SETGID) |
111 (1ULL << CAP_SETFCAP) |
112 (1ULL << CAP_SETPCAP) |
113 (1ULL << CAP_SETUID) |
114 (1ULL << CAP_SYS_ADMIN) |
115 (1ULL << CAP_SYS_CHROOT) |
116 (1ULL << CAP_SYS_NICE) |
117 (1ULL << CAP_SYS_PTRACE) |
118 (1ULL << CAP_SYS_TTY_CONFIG) |
119 (1ULL << CAP_SYS_RESOURCE) |
120 (1ULL << CAP_SYS_BOOT) |
121 (1ULL << CAP_AUDIT_WRITE) |
122 (1ULL << CAP_AUDIT_CONTROL) |
/* arg_bind and arg_bind_ro hold (source, destination) path pairs; see the
 * --bind=/--bind-ro= handling in parse_argv() and mount_binds(). */
124 static char **arg_bind = NULL;
125 static char **arg_bind_ro = NULL;
126 static char **arg_setenv = NULL;
127 static bool arg_quiet = false;
128 static bool arg_share_system = false;
129 static bool arg_register = true;
130 static bool arg_keep_unit = false;
131 static char **arg_network_interfaces = NULL;
132 static bool arg_network_veth = false;
134 static int help(void) {
136 printf("%s [OPTIONS...] [PATH] [ARGUMENTS...]\n\n"
137 "Spawn a minimal namespace container for debugging, testing and building.\n\n"
138 " -h --help Show this help\n"
139 " --version Print version string\n"
140 " -q --quiet Do not show status information\n"
141 " -D --directory=NAME Root directory for the container\n"
142 " -b --boot Boot up full system (i.e. invoke init)\n"
143 " -u --user=USER Run the command under specified user or uid\n"
144 " -M --machine=NAME Set the machine name for the container\n"
145 " --uuid=UUID Set a specific machine UUID for the container\n"
146 " -S --slice=SLICE Place the container in the specified slice\n"
147 " --private-network Disable network in container\n"
148 " --network-interface=INTERFACE\n"
149 " Assign an existing network interface to the\n"
151 " --network-veth Add a a virtual ethernet connection between host\n"
153 " -Z --selinux-context=SECLABEL\n"
154 " Set the SELinux security context to be used by\n"
155 " processes in the container\n"
156 " -L --selinux-apifs-context=SECLABEL\n"
157 " Set the SELinux security context to be used by\n"
158 " API/tmpfs file systems in the container\n"
159 " --capability=CAP In addition to the default, retain specified\n"
161 " --drop-capability=CAP Drop the specified capability from the default set\n"
162 " --link-journal=MODE Link up guest journal, one of no, auto, guest, host\n"
163 " -j Equivalent to --link-journal=host\n"
164 " --read-only Mount the root directory read-only\n"
165 " --bind=PATH[:PATH] Bind mount a file or directory from the host into\n"
167 " --bind-ro=PATH[:PATH] Similar, but creates a read-only bind mount\n"
168 " --setenv=NAME=VALUE Pass an environment variable to PID 1\n"
169 " --share-system Share system namespaces with host\n"
170 " --register=BOOLEAN Register container as machine\n"
171 " --keep-unit Do not register a scope for the machine, reuse\n"
172 " the service unit nspawn is running in\n",
173 program_invocation_short_name);
/* Parses the command line with getopt_long() and stores the result in the
 * file-scope arg_* variables. Invalid option arguments are reported through
 * log_error(). After the option loop a few cross-option rules are applied and
 * the final capability mask arg_retain is computed. */
178 static int parse_argv(int argc, char *argv[]) {
194 ARG_NETWORK_INTERFACE,
198 static const struct option options[] = {
199 { "help", no_argument, NULL, 'h' },
200 { "version", no_argument, NULL, ARG_VERSION },
201 { "directory", required_argument, NULL, 'D' },
202 { "user", required_argument, NULL, 'u' },
203 { "private-network", no_argument, NULL, ARG_PRIVATE_NETWORK },
204 { "boot", no_argument, NULL, 'b' },
205 { "uuid", required_argument, NULL, ARG_UUID },
206 { "read-only", no_argument, NULL, ARG_READ_ONLY },
207 { "capability", required_argument, NULL, ARG_CAPABILITY },
208 { "drop-capability", required_argument, NULL, ARG_DROP_CAPABILITY },
209 { "link-journal", required_argument, NULL, ARG_LINK_JOURNAL },
210 { "bind", required_argument, NULL, ARG_BIND },
211 { "bind-ro", required_argument, NULL, ARG_BIND_RO },
212 { "machine", required_argument, NULL, 'M' },
213 { "slice", required_argument, NULL, 'S' },
214 { "setenv", required_argument, NULL, ARG_SETENV },
215 { "selinux-context", required_argument, NULL, 'Z' },
216 { "selinux-apifs-context", required_argument, NULL, 'L' },
217 { "quiet", no_argument, NULL, 'q' },
218 { "share-system", no_argument, NULL, ARG_SHARE_SYSTEM },
219 { "register", required_argument, NULL, ARG_REGISTER },
220 { "keep-unit", no_argument, NULL, ARG_KEEP_UNIT },
221 { "network-interface", required_argument, NULL, ARG_NETWORK_INTERFACE },
222 { "network-veth", no_argument, NULL, ARG_NETWORK_VETH },
/* Capability bits collected from --capability= (plus) and
 * --drop-capability= (minus); merged into arg_retain at the end. */
227 uint64_t plus = 0, minus = 0;
/* The leading '+' in the option string is the GNU getopt request to stop at
 * the first non-option argument, so switches after [PATH] are left alone. */
232 while ((c = getopt_long(argc, argv, "+hD:u:bL:M:jS:Z:q", options, NULL)) >= 0) {
240 puts(PACKAGE_STRING);
241 puts(SYSTEMD_FEATURES);
246 arg_directory = canonicalize_file_name(optarg);
247 if (!arg_directory) {
248 log_error("Invalid root directory: %m");
256 arg_user = strdup(optarg);
/* --network-veth implies --private-network. */
262 case ARG_NETWORK_VETH:
263 arg_network_veth = true;
264 arg_private_network = true;
267 case ARG_NETWORK_INTERFACE:
268 if (strv_push(&arg_network_interfaces, optarg) < 0)
273 case ARG_PRIVATE_NETWORK:
274 arg_private_network = true;
282 r = sd_id128_from_string(optarg, &arg_uuid);
284 log_error("Invalid UUID: %s", optarg);
290 arg_slice = strdup(optarg);
297 if (isempty(optarg)) {
302 if (!hostname_is_valid(optarg)) {
303 log_error("Invalid machine name: %s", optarg);
308 arg_machine = strdup(optarg);
/* The two SELinux contexts point straight into argv (optarg is stored, not
 * copied). */
316 arg_selinux_context = optarg;
320 arg_selinux_apifs_context = optarg;
324 arg_read_only = true;
328 case ARG_DROP_CAPABILITY: {
/* The argument is a comma-separated list of capability names. */
332 FOREACH_WORD_SEPARATOR(word, length, optarg, ",", state) {
333 _cleanup_free_ char *t;
336 t = strndup(word, length);
/* "all" selects every capability bit at once. */
340 if (streq(t, "all")) {
341 if (c == ARG_CAPABILITY)
342 plus = (uint64_t) -1;
344 minus = (uint64_t) -1;
346 if (cap_from_name(t, &cap) < 0) {
347 log_error("Failed to parse capability %s.", t);
351 if (c == ARG_CAPABILITY)
352 plus |= 1ULL << (uint64_t) cap;
354 minus |= 1ULL << (uint64_t) cap;
/* NOTE(review): this assignment sits directly before the --link-journal=
 * case and is presumably the body of the -j switch; help() describes -j as
 * "--link-journal=host" -- confirm which of the two is intended. */
362 arg_link_journal = LINK_GUEST;
365 case ARG_LINK_JOURNAL:
366 if (streq(optarg, "auto"))
367 arg_link_journal = LINK_AUTO;
368 else if (streq(optarg, "no"))
369 arg_link_journal = LINK_NO;
370 else if (streq(optarg, "guest"))
371 arg_link_journal = LINK_GUEST;
372 else if (streq(optarg, "host"))
373 arg_link_journal = LINK_HOST;
375 log_error("Failed to parse link journal mode %s", optarg);
/* --bind= and --bind-ro= share this code: x selects the list, and the two
 * paths a and b are appended to it as consecutive entries. Both must be
 * absolute paths. */
383 _cleanup_free_ char *a = NULL, *b = NULL;
387 x = c == ARG_BIND ? &arg_bind : &arg_bind_ro;
389 e = strchr(optarg, ':');
391 a = strndup(optarg, e - optarg);
401 if (!path_is_absolute(a) || !path_is_absolute(b)) {
402 log_error("Invalid bind mount specification: %s", optarg);
406 r = strv_extend(x, a);
410 r = strv_extend(x, b);
420 if (!env_assignment_is_valid(optarg)) {
421 log_error("Environment variable assignment '%s' is not valid.", optarg);
425 n = strv_env_set(arg_setenv, optarg);
429 strv_free(arg_setenv);
438 case ARG_SHARE_SYSTEM:
439 arg_share_system = true;
443 r = parse_boolean(optarg);
445 log_error("Failed to parse --register= argument: %s", optarg);
453 arg_keep_unit = true;
460 assert_not_reached("Unhandled option");
/* --share-system turns machine registration off. */
464 if (arg_share_system)
465 arg_register = false;
467 if (arg_boot && arg_share_system) {
468 log_error("--boot and --share-system may not be combined.");
472 if (arg_keep_unit && cg_pid_get_owner_uid(0, NULL) >= 0) {
473 log_error("--keep-unit may not be used when invoked from a user session.");
/* CAP_NET_ADMIN is added only when the container gets a private network.
 * The "& ~minus" is applied last, so dropped capabilities win over the
 * defaults and over anything added with --capability=. */
477 arg_retain = (arg_retain | plus | (arg_private_network ? 1ULL << CAP_NET_ADMIN : 0)) & ~minus;
/* Walks mount_table and mounts each entry below the container root dest.
 * Each row supplies the mount source (.what), the target path relative to
 * dest (.where), mount options (.options), mount flags (.flags) and a
 * .fatal marker that takes part in the mount() failure check below. */
482 static int mount_all(const char *dest) {
484 typedef struct MountPoint {
493 static const MountPoint mount_table[] = {
494 { "proc", "/proc", "proc", NULL, MS_NOSUID|MS_NOEXEC|MS_NODEV, true },
495 { "/proc/sys", "/proc/sys", NULL, NULL, MS_BIND, true }, /* Bind mount first */
496 { NULL, "/proc/sys", NULL, NULL, MS_BIND|MS_RDONLY|MS_REMOUNT, true }, /* Then, make it r/o */
497 { "sysfs", "/sys", "sysfs", NULL, MS_RDONLY|MS_NOSUID|MS_NOEXEC|MS_NODEV, true },
498 { "tmpfs", "/dev", "tmpfs", "mode=755", MS_NOSUID|MS_STRICTATIME, true },
499 { "devpts", "/dev/pts", "devpts","newinstance,ptmxmode=0666,mode=620,gid=" STRINGIFY(TTY_GID), MS_NOSUID|MS_NOEXEC, true },
500 { "tmpfs", "/dev/shm", "tmpfs", "mode=1777", MS_NOSUID|MS_NODEV|MS_STRICTATIME, true },
501 { "tmpfs", "/run", "tmpfs", "mode=755", MS_NOSUID|MS_NODEV|MS_STRICTATIME, true },
503 { "/sys/fs/selinux", "/sys/fs/selinux", NULL, NULL, MS_BIND, false }, /* Bind mount first */
504 { NULL, "/sys/fs/selinux", NULL, NULL, MS_BIND|MS_RDONLY|MS_REMOUNT, false }, /* Then, make it r/o */
511 for (k = 0; k < ELEMENTSOF(mount_table); k++) {
512 _cleanup_free_ char *where = NULL;
514 _cleanup_free_ char *options = NULL;
/* Absolute target path: <dest>/<where>. */
519 where = strjoin(dest, "/", mount_table[k].where, NULL);
523 t = path_is_mount_point(where, true);
525 log_error("Failed to detect whether %s is a mount point: %s", where, strerror(-t));
533 /* Skip this entry if it is not a remount. */
/* Rows with a NULL .what are the remount rows of the table; only rows that
 * name a source are tested against "already a mount point" here. */
534 if (mount_table[k].what && t > 0)
537 mkdir_p(where, 0755);
/* When an SELinux API file system context was given on the command line,
 * tmpfs and devpts mounts get context="<label>" appended to their options. */
540 if (arg_selinux_apifs_context &&
541 (streq_ptr(mount_table[k].what, "tmpfs") || streq_ptr(mount_table[k].what, "devpts"))) {
542 options = strjoin(mount_table[k].options, ",context=\"", arg_selinux_apifs_context, "\"", NULL);
549 o = mount_table[k].options;
552 if (mount(mount_table[k].what,
555 mount_table[k].flags,
557 mount_table[k].fatal) {
559 log_error("mount(%s) failed: %m", where);
/* Bind mounts every (source, destination) pair of the list l into the
 * container tree at dest. The destination of each pair is taken relative to
 * dest. When flags is non-zero the new mount is remounted with
 * MS_REMOUNT|MS_BIND|flags; main() passes MS_RDONLY for the --bind-ro= list
 * and 0 for the --bind= list. */
569 static int mount_binds(const char *dest, char **l, unsigned long flags) {
572 STRV_FOREACH_PAIR(x, y, l) {
574 struct stat source_st, dest_st;
577 if (stat(*x, &source_st) < 0) {
578 log_error("failed to stat %s: %m", *x);
/* where = <dest><destination>; r tells whether the target already exists. */
582 where = strappenda(dest, *y);
583 r = stat(where, &dest_st);
/* An existing target must have the same file type as the source. */
585 if ((source_st.st_mode & S_IFMT) != (dest_st.st_mode & S_IFMT)) {
586 log_error("The file types of %s and %s do not match. Refusing bind mount",
/* Target does not exist yet: create its parent directories first. */
590 } else if (errno == ENOENT) {
591 r = mkdir_parents_label(where, 0755);
593 log_error("Failed to bind mount %s: %s", *x, strerror(-r));
597 log_error("Failed to bind mount %s: %s", *x, strerror(errno));
600 /* Create the mount point, but be conservative -- refuse to create block
601 * and char devices. */
602 if (S_ISDIR(source_st.st_mode))
603 mkdir_label(where, 0755);
604 else if (S_ISFIFO(source_st.st_mode))
606 else if (S_ISSOCK(source_st.st_mode))
607 mknod(where, 0644 | S_IFSOCK, 0);
608 else if (S_ISREG(source_st.st_mode))
611 log_error("Refusing to create mountpoint for file: %s", *x);
615 if (mount(*x, where, "bind", MS_BIND, NULL) < 0) {
616 log_error("mount(%s) failed: %m", where);
/* Extra flags are applied with a second mount() call on the same target. */
620 if (flags && mount(NULL, where, NULL, MS_REMOUNT|MS_BIND|flags, NULL) < 0) {
621 log_error("mount(%s) failed: %m", where);
/* Points <dest>/etc/localtime at the same zone below /usr/share/zoneinfo
 * that the host's /etc/localtime symlink refers to. Several situations are
 * only warned about (host file is not a symlink, it points outside of
 * /usr/share/zoneinfo/, or the zone is missing in the container). */
629 static int setup_timezone(const char *dest) {
630 _cleanup_free_ char *where = NULL, *p = NULL, *q = NULL, *check = NULL, *what = NULL;
636 /* Fix the timezone, if possible */
637 r = readlink_malloc("/etc/localtime", &p);
639 log_warning("/etc/localtime is not a symlink, not updating container timezone.");
/* z is the zone name: the part of the host link target that follows the
 * zoneinfo directory, accepted in relative or absolute form. */
643 z = path_startswith(p, "../usr/share/zoneinfo/");
645 z = path_startswith(p, "/usr/share/zoneinfo/");
647 log_warning("/etc/localtime does not point into /usr/share/zoneinfo/, not updating container timezone.");
651 where = strappend(dest, "/etc/localtime");
/* y is the zone name the container's existing link points to, if any. */
655 r = readlink_malloc(where, &q);
657 y = path_startswith(q, "../usr/share/zoneinfo/");
659 y = path_startswith(q, "/usr/share/zoneinfo/");
662 /* Already pointing to the right place? Then do nothing .. */
663 if (y && streq(y, z))
/* The zone file has to exist inside the container tree. */
667 check = strjoin(dest, "/usr/share/zoneinfo/", z, NULL);
671 if (access(check, F_OK) < 0) {
672 log_warning("Timezone %s does not exist in container, not updating container timezone.", z);
/* The new link target is written in the relative "../usr/share/zoneinfo/"
 * form. */
676 what = strappend("../usr/share/zoneinfo/", z);
681 if (symlink(what, where) < 0) {
682 log_error("Failed to correct timezone of container: %m");
689 static int setup_resolv_conf(const char *dest) {
690 char _cleanup_free_ *where = NULL;
694 if (arg_private_network)
697 /* Fix resolv.conf, if possible */
698 where = strappend(dest, "/etc/resolv.conf");
702 /* We don't really care for the results of this really. If it
703 * fails, it fails, but meh... */
704 copy_file("/etc/resolv.conf", where, O_TRUNC|O_NOFOLLOW);
/* Gives the container a boot ID of its own: a random ID is written, in UUID
 * notation, to <dest>/dev/proc-sys-kernel-random-boot-id and that file is
 * bind mounted over <dest>/proc/sys/kernel/random/boot_id. The arg_share_system
 * setting is checked first. */
709 static int setup_boot_id(const char *dest) {
710 _cleanup_free_ char *from = NULL, *to = NULL;
717 if (arg_share_system)
720 /* Generate a new randomized boot ID, so that each boot-up of
721 * the container gets a new one */
/* from: file that receives the ID; to: path it is bind mounted over. */
723 from = strappend(dest, "/dev/proc-sys-kernel-random-boot-id");
724 to = strappend(dest, "/proc/sys/kernel/random/boot_id");
728 r = sd_id128_randomize(&rnd);
730 log_error("Failed to generate random boot id: %s", strerror(-r));
/* Format the 128 bit ID with dashes in the 8-4-4-4-12 hex digit layout. */
734 snprintf(as_uuid, sizeof(as_uuid),
735 "%02x%02x%02x%02x-%02x%02x-%02x%02x-%02x%02x-%02x%02x%02x%02x%02x%02x",
736 SD_ID128_FORMAT_VAL(rnd));
737 char_array_0(as_uuid);
739 r = write_string_file(from, as_uuid);
741 log_error("Failed to write boot id: %s", strerror(-r));
/* A failing bind mount is logged as an error; a failing read-only remount
 * afterwards only produces a warning. */
745 if (mount(from, to, "bind", MS_BIND, NULL) < 0) {
746 log_error("Failed to bind mount boot id: %m");
748 } else if (mount(from, to, "bind", MS_BIND|MS_REMOUNT|MS_RDONLY, NULL))
749 log_warning("Failed to make boot id read-only: %m");
755 static int copy_devnodes(const char *dest) {
757 static const char devnodes[] =
767 _cleanup_umask_ mode_t u;
773 NULSTR_FOREACH(d, devnodes) {
774 _cleanup_free_ char *from = NULL, *to = NULL;
777 from = strappend("/dev/", d);
778 to = strjoin(dest, "/dev/", d, NULL);
782 if (stat(from, &st) < 0) {
784 if (errno != ENOENT) {
785 log_error("Failed to stat %s: %m", from);
789 } else if (!S_ISCHR(st.st_mode) && !S_ISBLK(st.st_mode)) {
791 log_error("%s is not a char or block device, cannot copy", from);
794 } else if (mknod(to, st.st_mode, st.st_rdev) < 0) {
796 log_error("mknod(%s) failed: %m", dest);
804 static int setup_ptmx(const char *dest) {
805 _cleanup_free_ char *p = NULL;
807 p = strappend(dest, "/dev/ptmx");
811 if (symlink("pts/ptmx", p) < 0) {
812 log_error("Failed to create /dev/ptmx symlink: %m");
/* Makes the tty named by console available as <dest>/dev/console: a device
 * node is created at that path and the tty is then bind mounted over it.
 * console must refer to a character device. */
819 static int setup_dev_console(const char *dest, const char *console) {
821 _cleanup_free_ char *to = NULL;
823 _cleanup_umask_ mode_t u;
830 if (stat(console, &st) < 0) {
831 log_error("Failed to stat %s: %m", console);
834 } else if (!S_ISCHR(st.st_mode)) {
835 log_error("/dev/console is not a char device");
/* presumably: mode 0600, owner uid 0 and gid 0 -- verify against
 * chmod_and_chown() */
839 r = chmod_and_chown(console, 0600, 0, 0);
841 log_error("Failed to correct access mode for TTY: %s", strerror(-r));
845 if (asprintf(&to, "%s/dev/console", dest) < 0)
848 /* We need to bind mount the right tty to /dev/console since
849 * ptys can only exist on pts file systems. To have something
850 * to bind mount things on we create a device node first, that
851 * has the right major/minor (note that the major minor
852 * doesn't actually matter here, since we mount it over
/* The node keeps the file type bits of the tty; its permission bits are
 * replaced by 0600. */
855 if (mknod(to, (st.st_mode & ~07777) | 0600, st.st_rdev) < 0) {
856 log_error("mknod() for /dev/console failed: %m");
860 if (mount(console, to, "bind", MS_BIND, NULL) < 0) {
861 log_error("Bind mount for /dev/console failed: %m");
/* Provides /proc/kmsg inside the container through a FIFO: the FIFO is
 * created at <dest>/dev/kmsg, bind mounted over <dest>/proc/kmsg, opened, and
 * the resulting file descriptor is sent over kmsg_socket as SCM_RIGHTS
 * ancillary data. kmsg_socket must be a valid descriptor (asserted). */
868 static int setup_kmsg(const char *dest, int kmsg_socket) {
869 _cleanup_free_ char *from = NULL, *to = NULL;
871 _cleanup_umask_ mode_t u;
873 struct cmsghdr cmsghdr;
874 uint8_t buf[CMSG_SPACE(sizeof(int))];
877 .msg_control = &control,
878 .msg_controllen = sizeof(control),
880 struct cmsghdr *cmsg;
883 assert(kmsg_socket >= 0);
887 /* We create the kmsg FIFO as /dev/kmsg, but immediately
888 * delete it after bind mounting it to /proc/kmsg. While FIFOs
889 * on the reading side behave very similar to /proc/kmsg,
890 * their writing side behaves differently from /dev/kmsg in
891 * that writing blocks when nothing is reading. In order to
892 * avoid any problems with containers deadlocking due to this
893 * we simply make /dev/kmsg unavailable to the container. */
894 if (asprintf(&from, "%s/dev/kmsg", dest) < 0 ||
895 asprintf(&to, "%s/proc/kmsg", dest) < 0)
898 if (mkfifo(from, 0600) < 0) {
899 log_error("mkfifo() for /dev/kmsg failed: %m");
903 r = chmod_and_chown(from, 0600, 0, 0);
905 log_error("Failed to correct access mode for /dev/kmsg: %s", strerror(-r));
909 if (mount(from, to, "bind", MS_BIND, NULL) < 0) {
910 log_error("Bind mount for /proc/kmsg failed: %m");
/* Opened read/write and non-blocking. */
914 fd = open(from, O_RDWR|O_NDELAY|O_CLOEXEC);
916 log_error("Failed to open fifo: %m");
/* Build a single control message that carries the one descriptor. */
920 cmsg = CMSG_FIRSTHDR(&mh);
921 cmsg->cmsg_level = SOL_SOCKET;
922 cmsg->cmsg_type = SCM_RIGHTS;
923 cmsg->cmsg_len = CMSG_LEN(sizeof(int));
924 memcpy(CMSG_DATA(cmsg), &fd, sizeof(int));
926 mh.msg_controllen = cmsg->cmsg_len;
928 /* Store away the fd in the socket, so that it stays open as
929 * long as we run the child */
/* The local copy of the descriptor is closed right after sending. */
930 k = sendmsg(kmsg_socket, &mh, MSG_DONTWAIT|MSG_NOSIGNAL);
931 close_nointr_nofail(fd);
934 log_error("Failed to send FIFO fd: %m");
938 /* And now make the FIFO unavailable as /dev/kmsg... */
943 static int setup_hostname(void) {
945 if (arg_share_system)
948 if (sethostname(arg_machine, strlen(arg_machine)) < 0)
/* Connects the journal directory of the container with the one of the host,
 * as selected by arg_link_journal. The container's machine ID is read from
 * <directory>/etc/machine-id; the two directories involved are
 *
 *   p = /var/log/journal/<id>              (on the host)
 *   q = <directory>/var/log/journal/<id>   (inside the container tree)
 *
 * In LINK_GUEST mode p is made a symlink to q. Otherwise, when the host
 * directory is usable, p is bind mounted onto q. In LINK_AUTO mode several
 * conditions that are errors in the explicit modes are tolerated. */
954 static int setup_journal(const char *directory) {
955 sd_id128_t machine_id, this_id;
956 _cleanup_free_ char *p = NULL, *b = NULL, *q = NULL, *d = NULL;
960 p = strappend(directory, "/etc/machine-id");
964 r = read_one_line_file(p, &b);
965 if (r == -ENOENT && arg_link_journal == LINK_AUTO)
968 log_error("Failed to read machine ID from %s: %s", p, strerror(-r));
973 if (isempty(id) && arg_link_journal == LINK_AUTO)
976 /* Verify validity */
977 r = sd_id128_from_string(id, &machine_id);
979 log_error("Failed to parse machine ID from %s: %s", p, strerror(-r));
983 r = sd_id128_get_machine(&this_id);
985 log_error("Failed to retrieve machine ID: %s", strerror(-r));
/* Identical host and container IDs: logged as a warning in LINK_AUTO mode
 * and as an error otherwise. */
989 if (sd_id128_equal(machine_id, this_id)) {
990 log_full(arg_link_journal == LINK_AUTO ? LOG_WARNING : LOG_ERR,
991 "Host and machine ids are equal (%s): refusing to link journals", id);
992 if (arg_link_journal == LINK_AUTO)
998 if (arg_link_journal == LINK_NO)
/* From here on p no longer names the machine-id file but the host-side
 * journal directory. */
1002 p = strappend("/var/log/journal/", id);
1003 q = strjoin(directory, "/var/log/journal/", id, NULL);
1007 if (path_is_mount_point(p, false) > 0) {
1008 if (arg_link_journal != LINK_AUTO) {
1009 log_error("%s: already a mount point, refusing to use for journal", p);
1016 if (path_is_mount_point(q, false) > 0) {
1017 if (arg_link_journal != LINK_AUTO) {
1018 log_error("%s: already a mount point, refusing to use for journal", q);
/* Inspect what currently exists at the host path: r tells whether it is a
 * symlink (target stored in d), something else (-EINVAL) or absent (-ENOENT). */
1025 r = readlink_and_make_absolute(p, &d);
1027 if ((arg_link_journal == LINK_GUEST ||
1028 arg_link_journal == LINK_AUTO) &&
1031 r = mkdir_p(q, 0755);
1033 log_warning("failed to create directory %s: %m", q);
1037 if (unlink(p) < 0) {
1038 log_error("Failed to remove symlink %s: %m", p);
1041 } else if (r == -EINVAL) {
1043 if (arg_link_journal == LINK_GUEST &&
1046 if (errno == ENOTDIR) {
1047 log_error("%s already exists and is neither a symlink nor a directory", p);
1050 log_error("Failed to remove %s: %m", p);
1054 } else if (r != -ENOENT) {
1055 log_error("readlink(%s) failed: %m", p);
/* LINK_GUEST: the host path becomes a symlink to the container's directory. */
1059 if (arg_link_journal == LINK_GUEST) {
1061 if (symlink(q, p) < 0) {
1062 log_error("Failed to symlink %s to %s: %m", q, p);
1066 r = mkdir_p(q, 0755);
1068 log_warning("failed to create directory %s: %m", q);
/* LINK_HOST creates the host directory if necessary; in the remaining
 * case the host directory is only probed with access(). */
1072 if (arg_link_journal == LINK_HOST) {
1073 r = mkdir_p(p, 0755);
1075 log_error("Failed to create %s: %m", p);
1079 } else if (access(p, F_OK) < 0)
/* dir_is_empty() returning 0 for the container-side directory is reported
 * as an error. */
1082 if (dir_is_empty(q) == 0) {
1083 log_error("%s not empty.", q);
1087 r = mkdir_p(q, 0755);
1089 log_error("Failed to create %s: %m", q);
1093 if (mount(p, q, "bind", MS_BIND, NULL) < 0) {
1094 log_error("Failed to bind mount journal from host into guest: %m");
1101 static int setup_kdbus(const char *dest, const char *path) {
1107 p = strappenda(dest, "/dev/kdbus");
1108 if (mkdir(p, 0755) < 0) {
1109 log_error("Failed to create kdbus path: %m");
1113 if (mount(path, p, "bind", MS_BIND, NULL) < 0) {
1114 log_error("Failed to mount kdbus domain path: %m");
1121 static int drop_capabilities(void) {
1122 return capability_bounding_set_drop(~arg_retain, false);
/* Registers the container with the org.freedesktop.machine1.Manager object
 * on the system bus. Two variants of the method call exist, selected by
 * arg_keep_unit. Both pass the machine UUID and the root directory; only the
 * second one additionally passes a "Slice" property. */
1125 static int register_machine(pid_t pid) {
1126 _cleanup_bus_error_free_ sd_bus_error error = SD_BUS_ERROR_NULL;
1127 _cleanup_bus_unref_ sd_bus *bus = NULL;
1133 r = sd_bus_default_system(&bus);
1135 log_error("Failed to open system bus: %s", strerror(-r));
1139 if (arg_keep_unit) {
1140 r = sd_bus_call_method(
1142 "org.freedesktop.machine1",
1143 "/org/freedesktop/machine1",
1144 "org.freedesktop.machine1.Manager",
1150 SD_BUS_MESSAGE_APPEND_ID128(arg_uuid),
1154 strempty(arg_directory));
1156 r = sd_bus_call_method(
1158 "org.freedesktop.machine1",
1159 "/org/freedesktop/machine1",
1160 "org.freedesktop.machine1.Manager",
1166 SD_BUS_MESSAGE_APPEND_ID128(arg_uuid),
1170 strempty(arg_directory),
/* presumably the leading boolean is the number of property entries that
 * follow (0 or 1), so "Slice" is only sent when arg_slice is non-empty --
 * TODO confirm against the method signature */
1171 !isempty(arg_slice), "Slice", "s", arg_slice);
1175 log_error("Failed to register machine: %s", bus_error_message(&error, r));
/* Asks the machine1 service on the system bus to terminate the machine.
 * A first call on the Manager interface yields a reply from which an object
 * path is read; a second call is then made on the Machine interface. A
 * failure of either call is logged at debug level only. */
1182 static int terminate_machine(pid_t pid) {
1183 _cleanup_bus_error_free_ sd_bus_error error = SD_BUS_ERROR_NULL;
1184 _cleanup_bus_message_unref_ sd_bus_message *reply = NULL;
1185 _cleanup_bus_unref_ sd_bus *bus = NULL;
1192 r = sd_bus_default_system(&bus);
1194 log_error("Failed to open system bus: %s", strerror(-r));
1198 r = sd_bus_call_method(
1200 "org.freedesktop.machine1",
1201 "/org/freedesktop/machine1",
1202 "org.freedesktop.machine1.Manager",
1209 /* Note that the machine might already have been
1210 * cleaned up automatically, hence don't consider it a
1211 * failure if we cannot get the machine object. */
1212 log_debug("Failed to get machine: %s", bus_error_message(&error, r));
/* The reply carries a single object path ("o"). */
1216 r = sd_bus_message_read(reply, "o", &path);
1218 return bus_log_parse_error(r);
1220 r = sd_bus_call_method(
1222 "org.freedesktop.machine1",
1224 "org.freedesktop.machine1.Machine",
1230 log_debug("Failed to terminate machine: %s", bus_error_message(&error, r));
1237 static int reset_audit_loginuid(void) {
1238 _cleanup_free_ char *p = NULL;
1241 if (arg_share_system)
1244 r = read_one_line_file("/proc/self/loginuid", &p);
1248 log_error("Failed to read /proc/self/loginuid: %s", strerror(-r));
1252 /* Already reset? */
1253 if (streq(p, "4294967295"))
1256 r = write_string_file("/proc/self/loginuid", "4294967295");
1258 log_error("Failed to reset audit login UID. This probably means that your kernel is too\n"
1259 "old and you have audit enabled. Note that the auditing subsystem is known to\n"
1260 "be incompatible with containers on old kernels. Please make sure to upgrade\n"
1261 "your kernel or to off auditing with 'audit=0' on the kernel command line before\n"
1262 "using systemd-nspawn. Sleeping for 5s... (%s)\n", strerror(-r));
1270 static int setup_veth(int netns_fd) {
1271 _cleanup_rtnl_message_unref_ sd_rtnl_message *m = NULL;
1272 _cleanup_rtnl_unref_ sd_rtnl *rtnl = NULL;
1273 char iface_name[IFNAMSIZ] = "ve-";
1276 if (!arg_private_network)
1279 if (!arg_network_veth)
1282 strncpy(iface_name+3, arg_machine, sizeof(iface_name) - 3);
1284 r = sd_rtnl_open(0, &rtnl);
1286 log_error("Failed to connect to netlink: %s", strerror(-r));
1290 r = sd_rtnl_message_new_link(RTM_NEWLINK, 0, &m);
1292 log_error("Failed to allocate netlink message: %s", strerror(-r));
1296 r = sd_rtnl_message_append_string(m, IFLA_IFNAME, "host0");
1298 log_error("Failed to append netlink kind: %s", strerror(-r));
1302 r = sd_rtnl_message_open_container(m, IFLA_LINKINFO, 0);
1304 log_error("Failed to open netlink container: %s", strerror(-r));
1308 r = sd_rtnl_message_append_string(m, IFLA_INFO_KIND, "veth");
1310 log_error("Failed to append netlink kind: %s", strerror(-r));
1314 r = sd_rtnl_message_open_container(m, IFLA_INFO_DATA, 0);
1316 log_error("Failed to open netlink container: %s", strerror(-r));
1320 r = sd_rtnl_message_open_container(m, VETH_INFO_PEER, sizeof(struct ifinfomsg));
1322 log_error("z Failed to open netlink container: %s", strerror(-r));
1326 r = sd_rtnl_message_append_string(m, IFLA_IFNAME, iface_name);
1328 log_error("Failed to append netlink kind: %s", strerror(-r));
1332 r = sd_rtnl_message_append_u32(m, IFLA_NET_NS_FD, netns_fd);
1334 log_error("Failed to add netlink namespace field: %s", strerror(-r));
1338 r = sd_rtnl_message_close_container(m);
1340 log_error("Failed to close netlink container: %s", strerror(-r));
1344 r = sd_rtnl_message_close_container(m);
1346 log_error("Failed to close netlink container: %s", strerror(-r));
1350 r = sd_rtnl_message_close_container(m);
1352 log_error("Failed to close netlink container: %s", strerror(-r));
1356 r = sd_rtnl_call(rtnl, m, 0, NULL);
1358 log_error("Failed to add new veth interfaces: %s", strerror(-r));
/* Moves every network interface named in arg_network_interfaces into the
 * network namespace of process pid, by sending one RTM_NEWLINK message per
 * interface that carries an IFLA_NET_NS_PID attribute. Before an interface is
 * moved, udev is consulted to make sure the device is initialized. The
 * arg_private_network setting and an empty interface list are checked first. */
1365 static int move_network_interfaces(pid_t pid) {
1366 _cleanup_udev_unref_ struct udev *udev = NULL;
1367 _cleanup_rtnl_unref_ sd_rtnl *rtnl = NULL;
1371 if (!arg_private_network)
1374 if (strv_isempty(arg_network_interfaces))
1377 r = sd_rtnl_open(0, &rtnl);
1379 log_error("Failed to connect to netlink: %s", strerror(-r));
1385 log_error("Failed to connect to udev.");
1389 STRV_FOREACH(i, arg_network_interfaces) {
1390 _cleanup_rtnl_message_unref_ sd_rtnl_message *m = NULL;
1391 _cleanup_udev_device_unref_ struct udev_device *d = NULL;
1392 char ifi_str[2 + DECIMAL_STR_MAX(int)];
/* Resolve the interface name to its index. */
1395 ifi = (int) if_nametoindex(*i);
1397 log_error("Failed to resolve interface %s: %m", *i);
/* The udev device is looked up by an ID of the form "n<ifindex>". */
1401 sprintf(ifi_str, "n%i", ifi);
1402 d = udev_device_new_from_device_id(udev, ifi_str);
1404 log_error("Failed to get udev device for interface %s: %m", *i);
1408 if (udev_device_get_is_initialized(d) <= 0) {
1409 log_error("Network interface %s is not initialized yet.", *i);
1413 r = sd_rtnl_message_new_link(RTM_NEWLINK, ifi, &m);
1415 log_error("Failed to allocate netlink message: %s", strerror(-r));
1419 r = sd_rtnl_message_append_u32(m, IFLA_NET_NS_PID, pid);
1421 log_error("Failed to append namespace PID to netlink message: %s", strerror(-r));
1425 r = sd_rtnl_call(rtnl, m, 0, NULL);
1427 log_error("Failed to move interface %s to namespace: %s", *i, strerror(-r));
1435 int main(int argc, char *argv[]) {
1437 _cleanup_close_ int master = -1, kdbus_fd = -1, sync_fd = -1, netns_fd = -1;
1438 _cleanup_close_pipe_ int kmsg_socket_pair[2] = { -1, -1 };
1439 _cleanup_free_ char *kdbus_domain = NULL;
1440 _cleanup_fdset_free_ FDSet *fds = NULL;
1441 const char *console = NULL;
1442 int r = EXIT_FAILURE, k;
1447 log_parse_environment();
1450 k = parse_argv(argc, argv);
1458 if (arg_directory) {
1461 p = path_make_absolute_cwd(arg_directory);
1462 free(arg_directory);
1465 arg_directory = get_current_dir_name();
1467 if (!arg_directory) {
1468 log_error("Failed to determine path, please use -D.");
1472 path_kill_slashes(arg_directory);
1475 arg_machine = strdup(basename(arg_directory));
1481 hostname_cleanup(arg_machine, false);
1482 if (isempty(arg_machine)) {
1483 log_error("Failed to determine machine name automatically, please use -M.");
1488 if (geteuid() != 0) {
1489 log_error("Need to be root.");
1493 if (sd_booted() <= 0) {
1494 log_error("Not running on a systemd system.");
1498 if (path_equal(arg_directory, "/")) {
1499 log_error("Spawning container on root directory not supported.");
1503 if (arg_boot && path_is_os_tree(arg_directory) <= 0) {
1504 log_error("Directory %s doesn't look like an OS root directory (/etc/os-release is missing). Refusing.", arg_directory);
1509 n_fd_passed = sd_listen_fds(false);
1510 if (n_fd_passed > 0) {
1511 k = fdset_new_listen_fds(&fds, false);
1513 log_error("Failed to collect file descriptors: %s", strerror(-k));
1517 fdset_close_others(fds);
1520 master = posix_openpt(O_RDWR|O_NOCTTY|O_CLOEXEC|O_NDELAY);
1522 log_error("Failed to acquire pseudo tty: %m");
1526 console = ptsname(master);
1528 log_error("Failed to determine tty name: %m");
1533 log_info("Spawning container %s on %s. Press ^] three times within 1s to abort execution.", arg_machine, arg_directory);
1535 if (unlockpt(master) < 0) {
1536 log_error("Failed to unlock tty: %m");
1540 if (arg_network_veth) {
1541 netns_fd = open("/proc/self/ns/net", O_RDWR|O_CLOEXEC);
1543 log_error("Failed to open network namespace fd: %m");
1548 if (access("/dev/kdbus/control", F_OK) >= 0) {
1550 if (arg_share_system) {
1551 kdbus_domain = strdup("/dev/kdbus");
1552 if (!kdbus_domain) {
1559 ns = strappenda("machine-", arg_machine);
1560 kdbus_fd = bus_kernel_create_domain(ns, &kdbus_domain);
1562 log_debug("Failed to create kdbus domain: %s", strerror(-r));
1564 log_debug("Successfully created kdbus domain as %s", kdbus_domain);
1568 if (socketpair(AF_UNIX, SOCK_DGRAM|SOCK_NONBLOCK|SOCK_CLOEXEC, 0, kmsg_socket_pair) < 0) {
1569 log_error("Failed to create kmsg socket pair: %m");
1573 sd_notify(0, "READY=1");
1575 assert_se(sigemptyset(&mask) == 0);
1576 sigset_add_many(&mask, SIGCHLD, SIGWINCH, SIGTERM, SIGINT, -1);
1577 assert_se(sigprocmask(SIG_BLOCK, &mask, NULL) == 0);
1582 sync_fd = eventfd(0, EFD_CLOEXEC);
1584 log_error("Failed to create event fd: %m");
1588 pid = syscall(__NR_clone,
1589 SIGCHLD|CLONE_NEWNS|
1590 (arg_share_system ? 0 : CLONE_NEWIPC|CLONE_NEWPID|CLONE_NEWUTS)|
1591 (arg_private_network ? CLONE_NEWNET : 0), NULL);
1593 if (errno == EINVAL)
1594 log_error("clone() failed, do you have namespace support enabled in your kernel? (You need UTS, IPC, PID and NET namespacing built in): %m");
1596 log_error("clone() failed: %m");
1603 const char *home = NULL;
1604 uid_t uid = (uid_t) -1;
1605 gid_t gid = (gid_t) -1;
1607 const char *envp[] = {
1608 "PATH=" DEFAULT_PATH_SPLIT_USR,
1609 "container=systemd-nspawn", /* LXC sets container=lxc, so follow the scheme here */
1614 NULL, /* container_uuid */
1615 NULL, /* LISTEN_FDS */
1616 NULL, /* LISTEN_PID */
1622 envp[n_env] = strv_find_prefix(environ, "TERM=");
1626 close_nointr_nofail(master);
1629 close_nointr(STDIN_FILENO);
1630 close_nointr(STDOUT_FILENO);
1631 close_nointr(STDERR_FILENO);
1633 close_nointr_nofail(kmsg_socket_pair[0]);
1634 kmsg_socket_pair[0] = -1;
1636 reset_all_signal_handlers();
1638 assert_se(sigemptyset(&mask) == 0);
1639 assert_se(sigprocmask(SIG_SETMASK, &mask, NULL) == 0);
1641 k = open_terminal(console, O_RDWR);
1642 if (k != STDIN_FILENO) {
1644 close_nointr_nofail(k);
1648 log_error("Failed to open console: %s", strerror(-k));
1652 if (dup2(STDIN_FILENO, STDOUT_FILENO) != STDOUT_FILENO ||
1653 dup2(STDIN_FILENO, STDERR_FILENO) != STDERR_FILENO) {
1654 log_error("Failed to duplicate console: %m");
1659 log_error("setsid() failed: %m");
1663 if (reset_audit_loginuid() < 0)
1666 if (prctl(PR_SET_PDEATHSIG, SIGKILL) < 0) {
1667 log_error("PR_SET_PDEATHSIG failed: %m");
1671 /* Mark everything as slave, so that we still
1672 * receive mounts from the real root, but don't
1673 * propagate mounts to the real root. */
1674 if (mount(NULL, "/", NULL, MS_SLAVE|MS_REC, NULL) < 0) {
1675 log_error("MS_SLAVE|MS_REC failed: %m");
1679 /* Turn directory into bind mount */
1680 if (mount(arg_directory, arg_directory, "bind", MS_BIND|MS_REC, NULL) < 0) {
1681 log_error("Failed to make bind mount.");
1686 if (mount(arg_directory, arg_directory, "bind", MS_BIND|MS_REMOUNT|MS_RDONLY|MS_REC, NULL) < 0) {
1687 log_error("Failed to make read-only.");
1691 if (mount_all(arg_directory) < 0)
1694 if (copy_devnodes(arg_directory) < 0)
1697 if (setup_ptmx(arg_directory) < 0)
1700 dev_setup(arg_directory);
1702 if (setup_veth(netns_fd) < 0)
1705 if (netns_fd >= 0) {
1706 close_nointr_nofail(netns_fd);
1710 if (setup_dev_console(arg_directory, console) < 0)
1713 if (setup_kmsg(arg_directory, kmsg_socket_pair[1]) < 0)
1716 close_nointr_nofail(kmsg_socket_pair[1]);
1717 kmsg_socket_pair[1] = -1;
1719 if (setup_boot_id(arg_directory) < 0)
1722 if (setup_timezone(arg_directory) < 0)
1725 if (setup_resolv_conf(arg_directory) < 0)
1728 if (setup_journal(arg_directory) < 0)
1731 if (mount_binds(arg_directory, arg_bind, 0) < 0)
1734 if (mount_binds(arg_directory, arg_bind_ro, MS_RDONLY) < 0)
1737 if (setup_kdbus(arg_directory, kdbus_domain) < 0)
1740 if (chdir(arg_directory) < 0) {
1741 log_error("chdir(%s) failed: %m", arg_directory);
1745 if (mount(arg_directory, "/", NULL, MS_MOVE, NULL) < 0) {
1746 log_error("mount(MS_MOVE) failed: %m");
1750 if (chroot(".") < 0) {
1751 log_error("chroot() failed: %m");
1755 if (chdir("/") < 0) {
1756 log_error("chdir() failed: %m");
1762 if (arg_private_network)
1765 if (drop_capabilities() < 0) {
1766 log_error("drop_capabilities() failed: %m");
1772 /* Note that this resolves user names
1773 * inside the container, and hence
1774 * accesses the NSS modules from the
1775 * container and not the host. This is
1778 if (get_user_creds((const char**)&arg_user, &uid, &gid, &home, NULL) < 0) {
1779 log_error("get_user_creds() failed: %m");
1783 if (mkdir_parents_label(home, 0775) < 0) {
1784 log_error("mkdir_parents_label() failed: %m");
1788 if (mkdir_safe_label(home, 0775, uid, gid) < 0) {
1789 log_error("mkdir_safe_label() failed: %m");
1793 if (initgroups((const char*)arg_user, gid) < 0) {
1794 log_error("initgroups() failed: %m");
1798 if (setresgid(gid, gid, gid) < 0) {
1799 log_error("setregid() failed: %m");
1803 if (setresuid(uid, uid, uid) < 0) {
1804 log_error("setreuid() failed: %m");
1808 /* Reset everything fully to 0, just in case */
1810 if (setgroups(0, NULL) < 0) {
1811 log_error("setgroups() failed: %m");
1815 if (setresgid(0, 0, 0) < 0) {
1816 log_error("setregid() failed: %m");
1820 if (setresuid(0, 0, 0) < 0) {
1821 log_error("setreuid() failed: %m");
1826 if ((asprintf((char**)(envp + n_env++), "HOME=%s", home ? home: "/root") < 0) ||
1827 (asprintf((char**)(envp + n_env++), "USER=%s", arg_user ? arg_user : "root") < 0) ||
1828 (asprintf((char**)(envp + n_env++), "LOGNAME=%s", arg_user ? arg_user : "root") < 0)) {
1833 if (!sd_id128_equal(arg_uuid, SD_ID128_NULL)) {
1834 if (asprintf((char**)(envp + n_env++), "container_uuid=" SD_ID128_FORMAT_STR, SD_ID128_FORMAT_VAL(arg_uuid)) < 0) {
1840 if (fdset_size(fds) > 0) {
1841 k = fdset_cloexec(fds, false);
1843 log_error("Failed to unset O_CLOEXEC for file descriptors.");
1847 if ((asprintf((char **)(envp + n_env++), "LISTEN_FDS=%u", n_fd_passed) < 0) ||
1848 (asprintf((char **)(envp + n_env++), "LISTEN_PID=1") < 0)) {
1856 eventfd_read(sync_fd, &x);
1857 close_nointr_nofail(sync_fd);
1860 if (!strv_isempty(arg_setenv)) {
1863 n = strv_env_merge(2, envp, arg_setenv);
1871 env_use = (char**) envp;
1874 if (arg_selinux_context)
1875 if (setexeccon(arg_selinux_context) < 0)
1876 log_error("setexeccon(\"%s\") failed: %m", arg_selinux_context);
1882 /* Automatically search for the init system */
1884 l = 1 + argc - optind;
1885 a = newa(char*, l + 1);
1886 memcpy(a + 1, argv + optind, l * sizeof(char*));
1888 a[0] = (char*) "/usr/lib/systemd/systemd";
1889 execve(a[0], a, env_use);
1891 a[0] = (char*) "/lib/systemd/systemd";
1892 execve(a[0], a, env_use);
1894 a[0] = (char*) "/sbin/init";
1895 execve(a[0], a, env_use);
1896 } else if (argc > optind)
1897 execvpe(argv[optind], argv + optind, env_use);
1899 chdir(home ? home : "/root");
1900 execle("/bin/bash", "-bash", NULL, env_use);
1903 log_error("execv() failed: %m");
1906 _exit(EXIT_FAILURE);
1912 r = register_machine(pid);
1916 r = move_network_interfaces(pid);
1920 eventfd_write(sync_fd, 1);
1921 close_nointr_nofail(sync_fd);
1924 k = process_pty(master, &mask, arg_boot ? pid : 0, SIGRTMIN+3);
1933 /* Kill if it is not dead yet anyway */
1934 terminate_machine(pid);
1936 /* Redundant, but better safe than sorry */
1939 k = wait_for_terminate(pid, &status);
1947 if (status.si_code == CLD_EXITED) {
1948 r = status.si_status;
1949 if (status.si_status != 0) {
1950 log_error("Container %s failed with error code %i.", arg_machine, status.si_status);
1955 log_debug("Container %s exited successfully.", arg_machine);
1957 } else if (status.si_code == CLD_KILLED &&
1958 status.si_status == SIGINT) {
1961 log_info("Container %s has been shut down.", arg_machine);
1964 } else if (status.si_code == CLD_KILLED &&
1965 status.si_status == SIGHUP) {
1968 log_info("Container %s is being rebooted.", arg_machine);
1970 } else if (status.si_code == CLD_KILLED ||
1971 status.si_code == CLD_DUMPED) {
1973 log_error("Container %s terminated by signal %s.", arg_machine, signal_to_string(status.si_status));
1977 log_error("Container %s failed due to unknown reason.", arg_machine);
1987 free(arg_directory);
1990 free(arg_network_interfaces);