1 /*-*- Mode: C; c-basic-offset: 8; indent-tabs-mode: nil -*-*/
4 This file is part of systemd.
6 Copyright 2010 Lennart Poettering
8 systemd is free software; you can redistribute it and/or modify it
9 under the terms of the GNU Lesser General Public License as published by
10 the Free Software Foundation; either version 2.1 of the License, or
11 (at your option) any later version.
13 systemd is distributed in the hope that it will be useful, but
14 WITHOUT ANY WARRANTY; without even the implied warranty of
15 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
16 Lesser General Public License for more details.
18 You should have received a copy of the GNU Lesser General Public License
19 along with systemd; If not, see <http://www.gnu.org/licenses/>.
25 #include <sys/types.h>
26 #include <sys/syscall.h>
27 #include <sys/mount.h>
33 #include <sys/prctl.h>
34 #include <sys/capability.h>
37 #include <sys/signalfd.h>
41 #include <sys/socket.h>
42 #include <linux/netlink.h>
43 #include <sys/eventfd.h>
45 #include <selinux/selinux.h>
48 #include "sd-daemon.h"
57 #include "cgroup-util.h"
59 #include "path-util.h"
60 #include "loopback-setup.h"
61 #include "dev-setup.h"
66 #include "bus-error.h"
68 #include "bus-kernel.h"
/* Mode for linking the container journal with the host; the LINK_* values are
 * selected by --link-journal= in parse_argv() and acted on in setup_journal(). */
72 typedef enum LinkJournal {
/* Command line settings. parse_argv() fills these in; main() and the setup
 * helpers read them. */
79 static char *arg_directory = NULL;
80 static char *arg_user = NULL;
81 static sd_id128_t arg_uuid = {};
82 static char *arg_machine = NULL;
83 static char *arg_selinux_context = NULL;
84 static char *arg_selinux_apifs_context = NULL;
85 static const char *arg_slice = NULL;
86 static bool arg_private_network = false;
87 static bool arg_read_only = false;
88 static bool arg_boot = false;
89 static LinkJournal arg_link_journal = LINK_AUTO;
/* Bit mask of capabilities (bit number = CAP_* value). --capability= sets and
 * --drop-capability= clears bits; drop_capabilities() hands the complement of
 * this mask to capability_bounding_set_drop(). */
90 static uint64_t arg_retain =
92 (1ULL << CAP_DAC_OVERRIDE) |
93 (1ULL << CAP_DAC_READ_SEARCH) |
94 (1ULL << CAP_FOWNER) |
95 (1ULL << CAP_FSETID) |
96 (1ULL << CAP_IPC_OWNER) |
99 (1ULL << CAP_LINUX_IMMUTABLE) |
100 (1ULL << CAP_NET_BIND_SERVICE) |
101 (1ULL << CAP_NET_BROADCAST) |
102 (1ULL << CAP_NET_RAW) |
103 (1ULL << CAP_SETGID) |
104 (1ULL << CAP_SETFCAP) |
105 (1ULL << CAP_SETPCAP) |
106 (1ULL << CAP_SETUID) |
107 (1ULL << CAP_SYS_ADMIN) |
108 (1ULL << CAP_SYS_CHROOT) |
109 (1ULL << CAP_SYS_NICE) |
110 (1ULL << CAP_SYS_PTRACE) |
111 (1ULL << CAP_SYS_TTY_CONFIG) |
112 (1ULL << CAP_SYS_RESOURCE) |
113 (1ULL << CAP_SYS_BOOT) |
114 (1ULL << CAP_AUDIT_WRITE) |
115 (1ULL << CAP_AUDIT_CONTROL) |
/* Bind mount lists hold (source, destination) pairs appended by parse_argv()
 * and consumed by mount_binds(). */
117 static char **arg_bind = NULL;
118 static char **arg_bind_ro = NULL;
119 static char **arg_setenv = NULL;
120 static bool arg_quiet = false;
121 static bool arg_share_system = false;
/* Registration defaults to on; parse_argv() turns it off for --share-system. */
122 static bool arg_register = true;
123 static bool arg_keep_unit = false;
/* Prints the usage summary to stdout. The leading %s of the format string is
 * filled with program_invocation_short_name.
 *
 * The -j line documents the switch as --link-journal=guest, which is the mode
 * parse_argv() assigns for the short option (LINK_GUEST); it previously
 * claimed "host". */
125 static int help(void) {
127 printf("%s [OPTIONS...] [PATH] [ARGUMENTS...]\n\n"
128 "Spawn a minimal namespace container for debugging, testing and building.\n\n"
129 " -h --help Show this help\n"
130 " --version Print version string\n"
131 " -D --directory=NAME Root directory for the container\n"
132 " -b --boot Boot up full system (i.e. invoke init)\n"
133 " -u --user=USER Run the command under specified user or uid\n"
134 " --uuid=UUID Set a specific machine UUID for the container\n"
135 " -M --machine=NAME Set the machine name for the container\n"
136 " -S --slice=SLICE Place the container in the specified slice\n"
137 " -Z --selinux-context=SECLABEL\n"
138 " Set the SELinux security context to be used by\n"
139 " processes in the container\n"
140 " -L --selinux-apifs-context=SECLABEL\n"
141 " Set the SELinux security context to be used by\n"
142 " API/tmpfs file systems in the container\n"
143 " --private-network Disable network in container\n"
144 " --share-system Share system namespaces with host\n"
145 " --read-only Mount the root directory read-only\n"
146 " --capability=CAP In addition to the default, retain specified\n"
148 " --drop-capability=CAP Drop the specified capability from the default set\n"
149 " --link-journal=MODE Link up guest journal, one of no, auto, guest, host\n"
150 " -j Equivalent to --link-journal=guest\n"
151 " --bind=PATH[:PATH] Bind mount a file or directory from the host into\n"
153 " --bind-ro=PATH[:PATH] Similar, but creates a read-only bind mount\n"
154 " --setenv=NAME=VALUE Pass an environment variable to PID 1\n"
155 " --register=BOOLEAN Register container as machine\n"
156 " --keep-unit Do not register a scope for the machine, reuse\n"
157 " the service unit nspawn is running in\n"
158 " -q --quiet Do not show status information\n",
159 program_invocation_short_name);
/* Parses the command line with getopt_long() and stores the result in the
 * arg_* globals. Invalid values are reported with log_error(). After the
 * option loop, combinations that cannot work together are rejected. */
164 static int parse_argv(int argc, char *argv[]) {
182 static const struct option options[] = {
183 { "help", no_argument, NULL, 'h' },
184 { "version", no_argument, NULL, ARG_VERSION },
185 { "directory", required_argument, NULL, 'D' },
186 { "user", required_argument, NULL, 'u' },
187 { "private-network", no_argument, NULL, ARG_PRIVATE_NETWORK },
188 { "boot", no_argument, NULL, 'b' },
189 { "uuid", required_argument, NULL, ARG_UUID },
190 { "read-only", no_argument, NULL, ARG_READ_ONLY },
191 { "capability", required_argument, NULL, ARG_CAPABILITY },
192 { "drop-capability", required_argument, NULL, ARG_DROP_CAPABILITY },
193 { "link-journal", required_argument, NULL, ARG_LINK_JOURNAL },
194 { "bind", required_argument, NULL, ARG_BIND },
195 { "bind-ro", required_argument, NULL, ARG_BIND_RO },
196 { "machine", required_argument, NULL, 'M' },
197 { "slice", required_argument, NULL, 'S' },
198 { "setenv", required_argument, NULL, ARG_SETENV },
199 { "selinux-context", required_argument, NULL, 'Z' },
200 { "selinux-apifs-context", required_argument, NULL, 'L' },
201 { "quiet", no_argument, NULL, 'q' },
202 { "share-system", no_argument, NULL, ARG_SHARE_SYSTEM },
203 { "register", required_argument, NULL, ARG_REGISTER },
204 { "keep-unit", no_argument, NULL, ARG_KEEP_UNIT },
/* The leading '+' in the option string makes getopt stop at the first
 * non-option argument, so switches meant for the container command are
 * not interpreted here. */
213 while ((c = getopt_long(argc, argv, "+hD:u:bL:M:jS:Z:q", options, NULL)) >= 0) {
221 puts(PACKAGE_STRING);
222 puts(SYSTEMD_FEATURES);
/* Store the root directory in canonical form; a NULL result is treated
 * as an invalid directory. */
227 arg_directory = canonicalize_file_name(optarg);
228 if (!arg_directory) {
229 log_error("Invalid root directory: %m");
237 arg_user = strdup(optarg);
243 case ARG_PRIVATE_NETWORK:
244 arg_private_network = true;
252 r = sd_id128_from_string(optarg, &arg_uuid);
254 log_error("Invalid UUID: %s", optarg);
260 arg_slice = strdup(optarg);
267 if (isempty(optarg)) {
272 if (!hostname_is_valid(optarg)) {
273 log_error("Invalid machine name: %s", optarg);
278 arg_machine = strdup(optarg);
/* The SELinux labels are kept as pointers into argv (optarg), not copied. */
286 arg_selinux_context = optarg;
290 arg_selinux_apifs_context = optarg;
294 arg_read_only = true;
298 case ARG_DROP_CAPABILITY: {
/* The argument is a comma separated list of capability names. */
302 FOREACH_WORD_SEPARATOR(word, length, optarg, ",", state) {
306 t = strndup(word, length);
310 if (cap_from_name(t, &cap) < 0) {
311 log_error("Failed to parse capability %s.", t);
/* --capability= adds the bit to the retained set, otherwise the bit
 * is cleared from it. */
318 if (c == ARG_CAPABILITY)
319 arg_retain |= 1ULL << (uint64_t) cap;
321 arg_retain &= ~(1ULL << (uint64_t) cap);
/* Selects LINK_GUEST; presumably the handler of the -j shorthand — TODO confirm. */
328 arg_link_journal = LINK_GUEST;
331 case ARG_LINK_JOURNAL:
332 if (streq(optarg, "auto"))
333 arg_link_journal = LINK_AUTO;
334 else if (streq(optarg, "no"))
335 arg_link_journal = LINK_NO;
336 else if (streq(optarg, "guest"))
337 arg_link_journal = LINK_GUEST;
338 else if (streq(optarg, "host"))
339 arg_link_journal = LINK_HOST;
341 log_error("Failed to parse link journal mode %s", optarg);
/* --bind= and --bind-ro= share this code: the argument is split at the
 * first ':' into source and destination, both must be absolute, and the
 * two paths are appended to the selected list as a pair. */
349 _cleanup_free_ char *a = NULL, *b = NULL;
353 x = c == ARG_BIND ? &arg_bind : &arg_bind_ro;
355 e = strchr(optarg, ':');
357 a = strndup(optarg, e - optarg);
367 if (!path_is_absolute(a) || !path_is_absolute(b)) {
368 log_error("Invalid bind mount specification: %s", optarg);
372 r = strv_extend(x, a);
376 r = strv_extend(x, b);
386 if (!env_assignment_is_valid(optarg)) {
387 log_error("Environment variable assignment '%s' is not valid.", optarg);
/* strv_env_set() presumably returns a new list with the assignment
 * applied; the previous list is freed afterwards — TODO confirm. */
391 n = strv_env_set(arg_setenv, optarg);
395 strv_free(arg_setenv);
404 case ARG_SHARE_SYSTEM:
405 arg_share_system = true;
409 r = parse_boolean(optarg);
411 log_error("Failed to parse --register= argument: %s", optarg);
419 arg_keep_unit = true;
426 assert_not_reached("Unhandled option");
/* Sharing the system namespaces turns machine registration off. */
430 if (arg_share_system)
431 arg_register = false;
433 if (arg_boot && arg_share_system) {
434 log_error("--boot and --share-system may not be combined.");
/* NOTE(review): a successful cg_pid_get_owner_uid() is taken to mean that we
 * were started from a user session — confirm against the helper. */
438 if (arg_keep_unit && cg_pid_get_owner_uid(0, NULL) >= 0) {
439 log_error("--keep-unit may not be used when invoked from a user session.");
/* Mounts the API file systems listed in mount_table (proc, sysfs, /dev,
 * devpts, tmpfs instances, ...) below the container root "dest". */
446 static int mount_all(const char *dest) {
448 typedef struct MountPoint {
/* Table columns as used below: what, where, file system type, options,
 * mount flags, fatal. Read-only bind mounts are done in two steps: a plain
 * bind mount followed by a remount entry whose "what" is NULL. */
457 static const MountPoint mount_table[] = {
458 { "proc", "/proc", "proc", NULL, MS_NOSUID|MS_NOEXEC|MS_NODEV, true },
459 { "/proc/sys", "/proc/sys", NULL, NULL, MS_BIND, true }, /* Bind mount first */
460 { NULL, "/proc/sys", NULL, NULL, MS_BIND|MS_RDONLY|MS_REMOUNT, true }, /* Then, make it r/o */
461 { "sysfs", "/sys", "sysfs", NULL, MS_RDONLY|MS_NOSUID|MS_NOEXEC|MS_NODEV, true },
462 { "tmpfs", "/dev", "tmpfs", "mode=755", MS_NOSUID|MS_STRICTATIME, true },
463 { "devpts", "/dev/pts", "devpts","newinstance,ptmxmode=0666,mode=620,gid=" STRINGIFY(TTY_GID), MS_NOSUID|MS_NOEXEC, true },
464 { "tmpfs", "/dev/shm", "tmpfs", "mode=1777", MS_NOSUID|MS_NODEV|MS_STRICTATIME, true },
465 { "tmpfs", "/run", "tmpfs", "mode=755", MS_NOSUID|MS_NODEV|MS_STRICTATIME, true },
467 { "/sys/fs/selinux", "/sys/fs/selinux", NULL, NULL, MS_BIND, false }, /* Bind mount first */
468 { NULL, "/sys/fs/selinux", NULL, NULL, MS_BIND|MS_RDONLY|MS_REMOUNT, false }, /* Then, make it r/o */
475 for (k = 0; k < ELEMENTSOF(mount_table); k++) {
476 _cleanup_free_ char *where = NULL;
478 _cleanup_free_ char *options = NULL;
/* Mount point path inside the container tree. */
483 where = strjoin(dest, "/", mount_table[k].where, NULL);
487 t = path_is_mount_point(where, true);
489 log_error("Failed to detect whether %s is a mount point: %s", where, strerror(-t));
497 /* Skip this entry if it is not a remount. */
498 if (mount_table[k].what && t > 0)
/* Best effort: the result of mkdir_p() is not checked here. */
501 mkdir_p(where, 0755);
/* For tmpfs and devpts mounts, append a context="..." mount option that
 * carries the --selinux-apifs-context label. */
504 if (arg_selinux_apifs_context &&
505 (streq_ptr(mount_table[k].what, "tmpfs") || streq_ptr(mount_table[k].what, "devpts"))) {
506 options = strjoin(mount_table[k].options, ",context=\"", arg_selinux_apifs_context, "\"", NULL);
513 o = mount_table[k].options;
/* The entry's "fatal" flag takes part in deciding whether a mount()
 * problem is reported as an error. */
516 if (mount(mount_table[k].what,
519 mount_table[k].flags,
521 mount_table[k].fatal) {
523 log_error("mount(%s) failed: %m", where);
/* Bind mounts every (source, destination) pair of the list "l" into the
 * container tree below "dest". If "flags" is non-zero (main() passes
 * MS_RDONLY for --bind-ro=), the new mount is remounted with those flags. */
533 static int mount_binds(const char *dest, char **l, unsigned long flags) {
536 STRV_FOREACH_PAIR(x, y, l) {
538 struct stat source_st, dest_st;
541 if (stat(*x, &source_st) < 0) {
542 log_error("failed to stat %s: %m", *x);
/* Destination path below the container root. */
546 where = strappenda(dest, *y);
547 r = stat(where, &dest_st);
/* Destination exists: source and destination must have the same file type. */
549 if ((source_st.st_mode & S_IFMT) != (dest_st.st_mode & S_IFMT)) {
550 log_error("The file types of %s and %s do not match. Refusing bind mount",
/* Destination is missing: create its parent directories first. */
554 } else if (errno == ENOENT) {
555 r = mkdir_parents_label(where, 0755);
557 log_error("Failed to bind mount %s: %s", *x, strerror(-r));
561 log_error("Failed to bind mount %s: %s", *x, strerror(errno));
564 /* Create the mount point, but be conservative -- refuse to create block
565 * and char devices. */
566 if (S_ISDIR(source_st.st_mode))
567 mkdir_label(where, 0755);
568 else if (S_ISFIFO(source_st.st_mode))
570 else if (S_ISSOCK(source_st.st_mode))
571 mknod(where, 0644 | S_IFSOCK, 0);
572 else if (S_ISREG(source_st.st_mode))
575 log_error("Refusing to create mountpoint for file: %s", *x);
/* First the plain bind mount ... */
579 if (mount(*x, where, "bind", MS_BIND, NULL) < 0) {
580 log_error("mount(%s) failed: %m", where);
/* ... then, only if extra flags were requested, a bind remount that
 * applies them. */
584 if (flags && mount(NULL, where, NULL, MS_REMOUNT|MS_BIND|flags, NULL) < 0) {
585 log_error("mount(%s) failed: %m", where);
/* Points <dest>/etc/localtime at the same zone the host's /etc/localtime
 * symlink refers to. Nothing is changed if the host file is not a symlink into
 * /usr/share/zoneinfo/, if the container already points to that zone, or if the
 * zone file is missing in the container; those cases only produce warnings. */
593 static int setup_timezone(const char *dest) {
594 _cleanup_free_ char *where = NULL, *p = NULL, *q = NULL, *check = NULL, *what = NULL;
600 /* Fix the timezone, if possible */
601 r = readlink_malloc("/etc/localtime", &p);
603 log_warning("/etc/localtime is not a symlink, not updating container timezone.");
/* Both the relative and the absolute link form are accepted; z is the
 * zone name that follows the zoneinfo prefix. */
607 z = path_startswith(p, "../usr/share/zoneinfo/");
609 z = path_startswith(p, "/usr/share/zoneinfo/");
611 log_warning("/etc/localtime does not point into /usr/share/zoneinfo/, not updating container timezone.");
615 where = strappend(dest, "/etc/localtime");
/* Same analysis for the link currently present in the container (y). */
619 r = readlink_malloc(where, &q);
621 y = path_startswith(q, "../usr/share/zoneinfo/");
623 y = path_startswith(q, "/usr/share/zoneinfo/");
626 /* Already pointing to the right place? Then do nothing .. */
627 if (y && streq(y, z))
/* The zone file has to exist inside the container tree. */
631 check = strjoin(dest, "/usr/share/zoneinfo/", z, NULL);
635 if (access(check, F_OK) < 0) {
636 log_warning("Timezone %s does not exist in container, not updating container timezone.", z);
/* The new link is always written in the relative form. */
640 what = strappend("../usr/share/zoneinfo/", z);
645 if (symlink(what, where) < 0) {
646 log_error("Failed to correct timezone of container: %m");
/* Copies the host's /etc/resolv.conf to <dest>/etc/resolv.conf. The copy is
 * best effort: the result of copy_file() is ignored. */
653 static int setup_resolv_conf(const char *dest) {
654 char _cleanup_free_ *where = NULL;
/* Special-cased for --private-network; presumably the copy is skipped
 * then — TODO confirm. */
658 if (arg_private_network)
661 /* Fix resolv.conf, if possible */
662 where = strappend(dest, "/etc/resolv.conf");
666 /* We don't really care for the results of this really. If it
667 * fails, it fails, but meh... */
668 copy_file("/etc/resolv.conf", where, O_TRUNC|O_NOFOLLOW);
/* Gives the container its own boot ID: a random 128 bit ID is formatted as a
 * UUID string, written to a file below <dest>/dev and bind mounted over
 * <dest>/proc/sys/kernel/random/boot_id. */
673 static int setup_boot_id(const char *dest) {
674 _cleanup_free_ char *from = NULL, *to = NULL;
/* Special-cased for --share-system; presumably nothing is done then —
 * TODO confirm. */
681 if (arg_share_system)
684 /* Generate a new randomized boot ID, so that each boot-up of
685 * the container gets a new one */
687 from = strappend(dest, "/dev/proc-sys-kernel-random-boot-id");
688 to = strappend(dest, "/proc/sys/kernel/random/boot_id");
692 r = sd_id128_randomize(&rnd);
694 log_error("Failed to generate random boot id: %s", strerror(-r));
/* Format the ID in the dashed 8-4-4-4-12 UUID notation. */
698 snprintf(as_uuid, sizeof(as_uuid),
699 "%02x%02x%02x%02x-%02x%02x-%02x%02x-%02x%02x-%02x%02x%02x%02x%02x%02x",
700 SD_ID128_FORMAT_VAL(rnd));
701 char_array_0(as_uuid);
703 r = write_string_file(from, as_uuid);
705 log_error("Failed to write boot id: %s", strerror(-r));
/* A failing bind mount is an error; failing to make it read-only
 * afterwards only produces a warning. */
709 if (mount(from, to, "bind", MS_BIND, NULL) < 0) {
710 log_error("Failed to bind mount boot id: %m");
712 } else if (mount(from, to, "bind", MS_BIND|MS_REMOUNT|MS_RDONLY, NULL))
713 log_warning("Failed to make boot id read-only: %m");
/* Recreates the device nodes named in "devnodes" below <dest>/dev, using the
 * mode and device number of the corresponding node in the host's /dev.
 *
 * dest: root directory of the container tree.
 *
 * The mknod() failure message names the node that could not be created ("to");
 * it previously printed the container root ("dest") instead. */
719 static int copy_devnodes(const char *dest) {
721 static const char devnodes[] =
731 _cleanup_umask_ mode_t u;
737 NULSTR_FOREACH(d, devnodes) {
738 _cleanup_free_ char *from = NULL, *to = NULL;
/* from: node on the host, to: node to create in the container. */
741 from = strappend("/dev/", d);
742 to = strjoin(dest, "/dev/", d, NULL);
746 if (stat(from, &st) < 0) {
/* A node that does not exist on the host is not reported. */
748 if (errno != ENOENT) {
749 log_error("Failed to stat %s: %m", from);
753 } else if (!S_ISCHR(st.st_mode) && !S_ISBLK(st.st_mode)) {
755 log_error("%s is not a char or block device, cannot copy", from);
758 } else if (mknod(to, st.st_mode, st.st_rdev) < 0) {
760 log_error("mknod(%s) failed: %m", to);
/* Creates <dest>/dev/ptmx as a symlink to "pts/ptmx", i.e. to the ptmx node
 * below /dev/pts of the container. */
768 static int setup_ptmx(const char *dest) {
769 _cleanup_free_ char *p = NULL;
771 p = strappend(dest, "/dev/ptmx");
775 if (symlink("pts/ptmx", p) < 0) {
776 log_error("Failed to create /dev/ptmx symlink: %m");
/* Makes the tty "console" available as <dest>/dev/console: a character device
 * node is created there and the tty is then bind mounted on top of it.
 *
 * dest:    root directory of the container tree.
 * console: path of the tty to use (main() passes the ptsname() of its pty). */
783 static int setup_dev_console(const char *dest, const char *console) {
785 _cleanup_free_ char *to = NULL;
787 _cleanup_umask_ mode_t u;
/* The console has to be an existing character device. */
794 if (stat(console, &st) < 0) {
795 log_error("Failed to stat %s: %m", console);
798 } else if (!S_ISCHR(st.st_mode)) {
799 log_error("/dev/console is not a char device");
/* Restrict the tty to mode 0600, owned by uid 0 / gid 0. */
803 r = chmod_and_chown(console, 0600, 0, 0);
805 log_error("Failed to correct access mode for TTY: %s", strerror(-r));
809 if (asprintf(&to, "%s/dev/console", dest) < 0)
812 /* We need to bind mount the right tty to /dev/console since
813 * ptys can only exist on pts file systems. To have something
814 * to bind mount things on we create a device node first, that
815 * has the right major/minor (note that the major minor
816 * doesn't actually matter here, since we mount it over
819 if (mknod(to, (st.st_mode & ~07777) | 0600, st.st_rdev) < 0) {
820 log_error("mknod() for /dev/console failed: %m");
824 if (mount(console, to, "bind", MS_BIND, NULL) < 0) {
825 log_error("Bind mount for /dev/console failed: %m");
// Provides <dest>/proc/kmsg to the container as a FIFO: the FIFO is created
// as <dest>/dev/kmsg, bind mounted over <dest>/proc/kmsg, and an open file
// descriptor for it is sent over kmsg_socket so that it stays referenced.
//
// dest:        root directory of the container tree.
// kmsg_socket: socket (must be >= 0) that receives the FIFO fd via SCM_RIGHTS.
832 static int setup_kmsg(const char *dest, int kmsg_socket) {
833 _cleanup_free_ char *from = NULL, *to = NULL;
835 _cleanup_umask_ mode_t u;
837 struct cmsghdr cmsghdr;
838 uint8_t buf[CMSG_SPACE(sizeof(int))];
841 .msg_control = &control,
842 .msg_controllen = sizeof(control),
844 struct cmsghdr *cmsg;
847 assert(kmsg_socket >= 0);
851 /* We create the kmsg FIFO as /dev/kmsg, but immediately
852 * delete it after bind mounting it to /proc/kmsg. While FIFOs
853 * on the reading side behave very similar to /proc/kmsg,
854 * their writing side behaves differently from /dev/kmsg in
855 * that writing blocks when nothing is reading. In order to
856 * avoid any problems with containers deadlocking due to this
857 * we simply make /dev/kmsg unavailable to the container. */
858 if (asprintf(&from, "%s/dev/kmsg", dest) < 0 ||
859 asprintf(&to, "%s/proc/kmsg", dest) < 0)
862 if (mkfifo(from, 0600) < 0) {
863 log_error("mkfifo() for /dev/kmsg failed: %m");
867 r = chmod_and_chown(from, 0600, 0, 0);
869 log_error("Failed to correct access mode for /dev/kmsg: %s", strerror(-r));
873 if (mount(from, to, "bind", MS_BIND, NULL) < 0) {
874 log_error("Bind mount for /proc/kmsg failed: %m");
/* Opened read/write and non-blocking, so the open itself does not wait
 * for a peer on the FIFO. */
878 fd = open(from, O_RDWR|O_NDELAY|O_CLOEXEC);
880 log_error("Failed to open fifo: %m");
/* Build a single SCM_RIGHTS control message that carries the fd. */
884 cmsg = CMSG_FIRSTHDR(&mh);
885 cmsg->cmsg_level = SOL_SOCKET;
886 cmsg->cmsg_type = SCM_RIGHTS;
887 cmsg->cmsg_len = CMSG_LEN(sizeof(int));
888 memcpy(CMSG_DATA(cmsg), &fd, sizeof(int));
890 mh.msg_controllen = cmsg->cmsg_len;
892 /* Store away the fd in the socket, so that it stays open as
893 * long as we run the child */
894 k = sendmsg(kmsg_socket, &mh, MSG_DONTWAIT|MSG_NOSIGNAL);
/* Our own copy of the fd is closed right after the send attempt. */
895 close_nointr_nofail(fd);
898 log_error("Failed to send FIFO fd: %m");
902 /* And now make the FIFO unavailable as /dev/kmsg... */
/* Sets the host name seen inside the container to the machine name
 * (arg_machine). --share-system is special-cased; presumably the host name is
 * left alone then — TODO confirm. */
907 static int setup_hostname(void) {
909 if (arg_share_system)
912 if (sethostname(arg_machine, strlen(arg_machine)) < 0)
/* Connects the journal directory of the container with the host according to
 * arg_link_journal. The directory name is derived from the machine ID read
 * from <directory>/etc/machine-id.
 *
 * NOTE(review): several messages below use %m although the failing helper
 * (mkdir_p(), readlink_and_make_absolute()) reports its result through r —
 * confirm that errno is meaningful at those points. */
918 static int setup_journal(const char *directory) {
919 sd_id128_t machine_id, this_id;
920 _cleanup_free_ char *p = NULL, *b = NULL, *q = NULL, *d = NULL;
/* Read the machine ID of the container. */
924 p = strappend(directory, "/etc/machine-id");
928 r = read_one_line_file(p, &b);
929 if (r == -ENOENT && arg_link_journal == LINK_AUTO)
932 log_error("Failed to read machine ID from %s: %s", p, strerror(-r));
937 if (isempty(id) && arg_link_journal == LINK_AUTO)
940 /* Verify validity */
941 r = sd_id128_from_string(id, &machine_id);
943 log_error("Failed to parse machine ID from %s: %s", p, strerror(-r));
947 r = sd_id128_get_machine(&this_id);
949 log_error("Failed to retrieve machine ID: %s", strerror(-r));
/* Container and host must not share a machine ID; in LINK_AUTO mode this
 * is logged as a warning, otherwise as an error. */
953 if (sd_id128_equal(machine_id, this_id)) {
954 log_full(arg_link_journal == LINK_AUTO ? LOG_WARNING : LOG_ERR,
955 "Host and machine ids are equal (%s): refusing to link journals", id);
956 if (arg_link_journal == LINK_AUTO)
962 if (arg_link_journal == LINK_NO)
/* p: journal directory for this machine ID on the host,
 * q: the same directory inside the container tree. */
966 p = strappend("/var/log/journal/", id);
967 q = strjoin(directory, "/var/log/journal/", id, NULL);
/* Neither side may already be a mount point; outside LINK_AUTO mode this
 * is reported as an error. */
971 if (path_is_mount_point(p, false) > 0) {
972 if (arg_link_journal != LINK_AUTO) {
973 log_error("%s: already a mount point, refusing to use for journal", p);
980 if (path_is_mount_point(q, false) > 0) {
981 if (arg_link_journal != LINK_AUTO) {
982 log_error("%s: already a mount point, refusing to use for journal", q);
/* Inspect what currently exists at the host path p: the result of the
 * readlink helper distinguishes a symlink, a non-symlink (-EINVAL) and a
 * missing path (-ENOENT). */
989 r = readlink_and_make_absolute(p, &d);
991 if ((arg_link_journal == LINK_GUEST ||
992 arg_link_journal == LINK_AUTO) &&
995 r = mkdir_p(q, 0755);
997 log_warning("failed to create directory %s: %m", q);
1001 if (unlink(p) < 0) {
1002 log_error("Failed to remove symlink %s: %m", p);
1005 } else if (r == -EINVAL) {
1007 if (arg_link_journal == LINK_GUEST &&
1010 if (errno == ENOTDIR) {
1011 log_error("%s already exists and is neither a symlink nor a directory", p);
1014 log_error("Failed to remove %s: %m", p);
1018 } else if (r != -ENOENT) {
1019 log_error("readlink(%s) failed: %m", p);
/* LINK_GUEST: the host path becomes a symlink to the directory inside the
 * container tree. */
1023 if (arg_link_journal == LINK_GUEST) {
1025 if (symlink(q, p) < 0) {
1026 log_error("Failed to symlink %s to %s: %m", q, p);
1030 r = mkdir_p(q, 0755);
1032 log_warning("failed to create directory %s: %m", q);
/* LINK_HOST: make sure the host directory exists. */
1036 if (arg_link_journal == LINK_HOST) {
1037 r = mkdir_p(p, 0755);
1039 log_error("Failed to create %s: %m", p);
1043 } else if (access(p, F_OK) < 0)
/* The directory inside the container has to be empty before the host
 * directory is bind mounted over it. */
1046 if (dir_is_empty(q) == 0) {
1047 log_error("%s not empty.", q);
1051 r = mkdir_p(q, 0755);
1053 log_error("Failed to create %s: %m", q);
1057 if (mount(p, q, "bind", MS_BIND, NULL) < 0) {
1058 log_error("Failed to bind mount journal from host into guest: %m");
/* Creates the directory <dest>/dev/kdbus and bind mounts "path" (main() passes
 * the kdbus domain path) onto it. */
1065 static int setup_kdbus(const char *dest, const char *path) {
1071 p = strappenda(dest, "/dev/kdbus");
1072 if (mkdir(p, 0755) < 0) {
1073 log_error("Failed to create kdbus path: %m");
1077 if (mount(path, p, "bind", MS_BIND, NULL) < 0) {
1078 log_error("Failed to mount kdbus domain path: %m");
/* Hands the complement of arg_retain to capability_bounding_set_drop(), i.e.
 * every capability that is not in the retained set is passed as "to drop".
 * Returns the helper's result unchanged. */
1085 static int drop_capabilities(void) {
1086 return capability_bounding_set_drop(~arg_retain, false);
/* Announces the container to org.freedesktop.machine1 via a method call on the
 * Manager interface over the system bus. Two variants of the call exist: one
 * for --keep-unit, and one that additionally passes an optional "Slice"
 * property. Failures are logged with log_error().
 *
 * pid: PID of the container's first process (how it is passed on is not
 *      visible here — TODO confirm). */
1089 static int register_machine(pid_t pid) {
1090 _cleanup_bus_error_free_ sd_bus_error error = SD_BUS_ERROR_NULL;
1091 _cleanup_bus_unref_ sd_bus *bus = NULL;
1097 r = sd_bus_default_system(&bus);
1099 log_error("Failed to open system bus: %s", strerror(-r));
1103 if (arg_keep_unit) {
1104 r = sd_bus_call_method(
1106 "org.freedesktop.machine1",
1107 "/org/freedesktop/machine1",
1108 "org.freedesktop.machine1.Manager",
1114 SD_BUS_MESSAGE_APPEND_ID128(arg_uuid),
1118 strempty(arg_directory));
1120 r = sd_bus_call_method(
1122 "org.freedesktop.machine1",
1123 "/org/freedesktop/machine1",
1124 "org.freedesktop.machine1.Manager",
1130 SD_BUS_MESSAGE_APPEND_ID128(arg_uuid),
1134 strempty(arg_directory),
/* The first argument on this line is 1 when a slice was requested and 0
 * otherwise; presumably it is the number of properties that follow —
 * TODO confirm against the method signature. */
1135 !isempty(arg_slice), "Slice", "s", arg_slice);
1139 log_error("Failed to register machine: %s", bus_error_message(&error, r));
/* Asks org.freedesktop.machine1 to terminate the machine: a Manager call
 * yields the object path of the machine ("o" in the reply), then a method on
 * the org.freedesktop.machine1.Machine interface is invoked. Bus call
 * failures are only logged at debug level. */
1146 static int terminate_machine(pid_t pid) {
1147 _cleanup_bus_error_free_ sd_bus_error error = SD_BUS_ERROR_NULL;
1148 _cleanup_bus_message_unref_ sd_bus_message *reply = NULL;
1149 _cleanup_bus_unref_ sd_bus *bus = NULL;
1156 r = sd_bus_default_system(&bus);
1158 log_error("Failed to open system bus: %s", strerror(-r));
1162 r = sd_bus_call_method(
1164 "org.freedesktop.machine1",
1165 "/org/freedesktop/machine1",
1166 "org.freedesktop.machine1.Manager",
1173 /* Note that the machine might already have been
1174 * cleaned up automatically, hence don't consider it a
1175 * failure if we cannot get the machine object. */
1176 log_debug("Failed to get machine: %s", bus_error_message(&error, r));
/* The reply carries the object path of the machine. */
1180 r = sd_bus_message_read(reply, "o", &path);
1182 return bus_log_parse_error(r);
1184 r = sd_bus_call_method(
1186 "org.freedesktop.machine1",
1188 "org.freedesktop.machine1.Machine",
1194 log_debug("Failed to terminate machine: %s", bus_error_message(&error, r));
/* Probes for the kernel audit subsystem by trying to create a NETLINK_AUDIT
 * netlink socket; the socket is closed again right away. */
1201 static bool audit_enabled(void) {
1204 fd = socket(AF_NETLINK, SOCK_RAW, NETLINK_AUDIT);
1206 close_nointr_nofail(fd);
/* Entry point: parses the command line, validates the environment (root,
 * systemd host, usable root directory), allocates the console pty, then
 * clone()s the container process. The child prepares the container tree
 * (mounts, device nodes, console, journal, ...), switches root, sets up
 * credentials and environment and finally exec()s init or the requested
 * command. The parent registers the machine, relays the pty and evaluates the
 * exit status of the container. */
1212 int main(int argc, char *argv[]) {
1214 int r = EXIT_FAILURE, k;
1215 _cleanup_close_ int master = -1, kdbus_fd = -1, sync_fd = -1;
1217 const char *console = NULL;
1219 _cleanup_close_pipe_ int kmsg_socket_pair[2] = { -1, -1 };
1220 _cleanup_fdset_free_ FDSet *fds = NULL;
1221 _cleanup_free_ char *kdbus_domain = NULL;
1223 log_parse_environment();
1226 k = parse_argv(argc, argv);
/* Make the root directory absolute; without a directory argument the
 * current working directory is used. */
1234 if (arg_directory) {
1237 p = path_make_absolute_cwd(arg_directory);
1238 free(arg_directory);
1241 arg_directory = get_current_dir_name();
1243 if (!arg_directory) {
1244 log_error("Failed to determine path, please use -D.");
1248 path_kill_slashes(arg_directory);
/* Machine name derived from the last component of the root directory,
 * cleaned up with hostname_cleanup(). */
1251 arg_machine = strdup(basename(arg_directory));
1257 hostname_cleanup(arg_machine, false);
1258 if (isempty(arg_machine)) {
1259 log_error("Failed to determine machine name automatically, please use -M.");
1264 if (geteuid() != 0) {
1265 log_error("Need to be root.");
1269 if (sd_booted() <= 0) {
1270 log_error("Not running on a systemd system.");
1274 if (arg_boot && audit_enabled()) {
1275 log_warning("The kernel auditing subsystem is known to be incompatible with containers.\n"
1276 "Please make sure to turn off auditing with 'audit=0' on the kernel command\n"
1277 "line before using systemd-nspawn. Sleeping for 5s...\n");
1281 if (path_equal(arg_directory, "/")) {
1282 log_error("Spawning container on root directory not supported.");
1286 if (arg_boot && path_is_os_tree(arg_directory) <= 0) {
1287 log_error("Directory %s doesn't look like an OS root directory (/etc/os-release is missing). Refusing.", arg_directory);
/* Collect file descriptors passed in via socket activation; their count
 * is later exported to the container as LISTEN_FDS. */
1292 n_fd_passed = sd_listen_fds(false);
1293 if (n_fd_passed > 0) {
1294 k = fdset_new_listen_fds(&fds, false);
1296 log_error("Failed to collect file descriptors: %s", strerror(-k));
1300 fdset_close_others(fds);
/* Allocate the pty whose slave side becomes the container console. */
1303 master = posix_openpt(O_RDWR|O_NOCTTY|O_CLOEXEC|O_NDELAY);
1305 log_error("Failed to acquire pseudo tty: %m");
1309 console = ptsname(master);
1311 log_error("Failed to determine tty name: %m");
1316 log_info("Spawning container %s on %s. Press ^] three times within 1s to abort execution.", arg_machine, arg_directory);
1318 if (unlockpt(master) < 0) {
1319 log_error("Failed to unlock tty: %m");
/* kdbus is only set up when its control node exists on the host. With
 * --share-system the host's /dev/kdbus is used, otherwise a domain named
 * "machine-<name>" is created. */
1324 if (access("/dev/kdbus/control", F_OK) >= 0) {
1326 if (arg_share_system) {
1327 kdbus_domain = strdup("/dev/kdbus");
1328 if (!kdbus_domain) {
1335 ns = strappenda("machine-", arg_machine);
1336 kdbus_fd = bus_kernel_create_domain(ns, &kdbus_domain);
/* NOTE(review): the message formats r, but the result of
 * bus_kernel_create_domain() was stored in kdbus_fd — this likely should
 * use -kdbus_fd; confirm together with the guarding condition. */
1338 log_debug("Failed to create kdbus domain: %s", strerror(-r));
1340 log_debug("Successfully created kdbus domain as %s", kdbus_domain);
/* Socket pair that setup_kmsg() in the child uses to park the kmsg FIFO fd. */
1344 if (socketpair(AF_UNIX, SOCK_DGRAM|SOCK_NONBLOCK|SOCK_CLOEXEC, 0, kmsg_socket_pair) < 0) {
1345 log_error("Failed to create kmsg socket pair: %m");
1349 sd_notify(0, "READY=1");
1351 assert_se(sigemptyset(&mask) == 0);
1352 sigset_add_many(&mask, SIGCHLD, SIGWINCH, SIGTERM, SIGINT, -1);
1353 assert_se(sigprocmask(SIG_BLOCK, &mask, NULL) == 0);
/* Event fd for parent/child synchronization: the child reads from it
 * before exec, the parent writes to it after register_machine(). */
1358 sync_fd = eventfd(0, EFD_CLOEXEC);
1360 log_error("Failed to create event fd: %m");
/* Raw clone(): always a new mount namespace; new IPC/PID/UTS namespaces
 * unless --share-system; a new network namespace with --private-network. */
1364 pid = syscall(__NR_clone,
1365 SIGCHLD|CLONE_NEWNS|
1366 (arg_share_system ? 0 : CLONE_NEWIPC|CLONE_NEWPID|CLONE_NEWUTS)|
1367 (arg_private_network ? CLONE_NEWNET : 0), NULL);
1369 if (errno == EINVAL)
1370 log_error("clone() failed, do you have namespace support enabled in your kernel? (You need UTS, IPC, PID and NET namespacing built in): %m");
1372 log_error("clone() failed: %m");
/* Container (child) side: environment template; the NULL slots are
 * filled in further down. */
1379 const char *home = NULL;
1380 uid_t uid = (uid_t) -1;
1381 gid_t gid = (gid_t) -1;
1383 const char *envp[] = {
1384 "PATH=" DEFAULT_PATH_SPLIT_USR,
1385 "container=systemd-nspawn", /* LXC sets container=lxc, so follow the scheme here */
1390 NULL, /* container_uuid */
1391 NULL, /* LISTEN_FDS */
1392 NULL, /* LISTEN_PID */
/* TERM is inherited from our own environment. */
1398 envp[n_env] = strv_find_prefix(environ, "TERM=");
1402 close_nointr_nofail(master);
1405 close_nointr(STDIN_FILENO);
1406 close_nointr(STDOUT_FILENO);
1407 close_nointr(STDERR_FILENO);
1409 close_nointr_nofail(kmsg_socket_pair[0]);
1410 kmsg_socket_pair[0] = -1;
1412 reset_all_signal_handlers();
1414 assert_se(sigemptyset(&mask) == 0);
1415 assert_se(sigprocmask(SIG_SETMASK, &mask, NULL) == 0);
/* Open the pty slave as stdin and duplicate it to stdout and stderr. */
1417 k = open_terminal(console, O_RDWR);
1418 if (k != STDIN_FILENO) {
1420 close_nointr_nofail(k);
1424 log_error("Failed to open console: %s", strerror(-k));
1428 if (dup2(STDIN_FILENO, STDOUT_FILENO) != STDOUT_FILENO ||
1429 dup2(STDIN_FILENO, STDERR_FILENO) != STDERR_FILENO) {
1430 log_error("Failed to duplicate console: %m");
1435 log_error("setsid() failed: %m");
/* Have the kernel send SIGKILL to this process when the parent dies. */
1439 if (prctl(PR_SET_PDEATHSIG, SIGKILL) < 0) {
1440 log_error("PR_SET_PDEATHSIG failed: %m");
1444 /* Mark everything as slave, so that we still
1445 * receive mounts from the real root, but don't
1446 * propagate mounts to the real root. */
1447 if (mount(NULL, "/", NULL, MS_SLAVE|MS_REC, NULL) < 0) {
1448 log_error("MS_SLAVE|MS_REC failed: %m");
1452 /* Turn directory into bind mount */
1453 if (mount(arg_directory, arg_directory, "bind", MS_BIND|MS_REC, NULL) < 0) {
1454 log_error("Failed to make bind mount.");
1459 if (mount(arg_directory, arg_directory, "bind", MS_BIND|MS_REMOUNT|MS_RDONLY|MS_REC, NULL) < 0) {
1460 log_error("Failed to make read-only.");
/* Populate the container tree. The order matters: API file systems
 * first, then device nodes and console, then the remaining pieces. */
1464 if (mount_all(arg_directory) < 0)
1467 if (copy_devnodes(arg_directory) < 0)
1470 if (setup_ptmx(arg_directory) < 0)
1473 dev_setup(arg_directory);
1475 if (setup_dev_console(arg_directory, console) < 0)
1478 if (setup_kmsg(arg_directory, kmsg_socket_pair[1]) < 0)
1481 close_nointr_nofail(kmsg_socket_pair[1]);
1482 kmsg_socket_pair[1] = -1;
1484 if (setup_boot_id(arg_directory) < 0)
1487 if (setup_timezone(arg_directory) < 0)
1490 if (setup_resolv_conf(arg_directory) < 0)
1493 if (setup_journal(arg_directory) < 0)
1496 if (mount_binds(arg_directory, arg_bind, 0) < 0)
1499 if (mount_binds(arg_directory, arg_bind_ro, MS_RDONLY) < 0)
1502 if (setup_kdbus(arg_directory, kdbus_domain) < 0)
/* Switch root: move the prepared tree onto "/", chroot into it and
 * change to the new root directory. */
1505 if (chdir(arg_directory) < 0) {
1506 log_error("chdir(%s) failed: %m", arg_directory);
1510 if (mount(arg_directory, "/", NULL, MS_MOVE, NULL) < 0) {
1511 log_error("mount(MS_MOVE) failed: %m");
1515 if (chroot(".") < 0) {
1516 log_error("chroot() failed: %m");
1520 if (chdir("/") < 0) {
1521 log_error("chdir() failed: %m");
1527 if (arg_private_network)
1530 if (drop_capabilities() < 0) {
1531 log_error("drop_capabilities() failed: %m");
/* NOTE(review): in the credential handling that follows, the failure
 * messages for setresgid()/setresuid() name setregid()/setreuid(), and
 * %m is used after helpers that may report errors via their return
 * value — confirm. */
1537 /* Note that this resolves user names
1538 * inside the container, and hence
1539 * accesses the NSS modules from the
1540 * container and not the host. This is
1543 if (get_user_creds((const char**)&arg_user, &uid, &gid, &home, NULL) < 0) {
1544 log_error("get_user_creds() failed: %m");
1548 if (mkdir_parents_label(home, 0775) < 0) {
1549 log_error("mkdir_parents_label() failed: %m");
1553 if (mkdir_safe_label(home, 0775, uid, gid) < 0) {
1554 log_error("mkdir_safe_label() failed: %m");
1558 if (initgroups((const char*)arg_user, gid) < 0) {
1559 log_error("initgroups() failed: %m");
1563 if (setresgid(gid, gid, gid) < 0) {
1564 log_error("setregid() failed: %m");
1568 if (setresuid(uid, uid, uid) < 0) {
1569 log_error("setreuid() failed: %m");
1573 /* Reset everything fully to 0, just in case */
1575 if (setgroups(0, NULL) < 0) {
1576 log_error("setgroups() failed: %m");
1580 if (setresgid(0, 0, 0) < 0) {
1581 log_error("setregid() failed: %m");
1585 if (setresuid(0, 0, 0) < 0) {
1586 log_error("setreuid() failed: %m");
/* Fill the free environment slots: HOME, USER and LOGNAME (defaulting
 * to root), the container UUID if one was given, and the socket
 * activation variables if fds were passed. */
1591 if ((asprintf((char**)(envp + n_env++), "HOME=%s", home ? home: "/root") < 0) ||
1592 (asprintf((char**)(envp + n_env++), "USER=%s", arg_user ? arg_user : "root") < 0) ||
1593 (asprintf((char**)(envp + n_env++), "LOGNAME=%s", arg_user ? arg_user : "root") < 0)) {
1598 if (!sd_id128_equal(arg_uuid, SD_ID128_NULL)) {
1599 if (asprintf((char**)(envp + n_env++), "container_uuid=" SD_ID128_FORMAT_STR, SD_ID128_FORMAT_VAL(arg_uuid)) < 0) {
1605 if (fdset_size(fds) > 0) {
/* Passed fds must survive the exec, so close-on-exec is cleared. */
1606 k = fdset_cloexec(fds, false);
1608 log_error("Failed to unset O_CLOEXEC for file descriptors.");
1612 if ((asprintf((char **)(envp + n_env++), "LISTEN_FDS=%u", n_fd_passed) < 0) ||
1613 (asprintf((char **)(envp + n_env++), "LISTEN_PID=1") < 0)) {
/* Block until the parent signals via the event fd. */
1621 eventfd_read(sync_fd, &x);
1622 close_nointr_nofail(sync_fd);
/* --setenv= assignments are merged into the environment, if any. */
1625 if (!strv_isempty(arg_setenv)) {
1628 n = strv_env_merge(2, envp, arg_setenv);
1636 env_use = (char**) envp;
/* A setexeccon() failure is logged. */
1639 if (arg_selinux_context)
1640 if (setexeccon(arg_selinux_context) < 0)
1641 log_error("setexeccon(\"%s\") failed: %m", arg_selinux_context);
1647 /* Automatically search for the init system */
/* The remaining command line arguments are passed on to init; the
 * candidate binaries are tried in order, each execve() only returning
 * on failure. */
1649 l = 1 + argc - optind;
1650 a = newa(char*, l + 1);
1651 memcpy(a + 1, argv + optind, l * sizeof(char*));
1653 a[0] = (char*) "/usr/lib/systemd/systemd";
1654 execve(a[0], a, env_use);
1656 a[0] = (char*) "/lib/systemd/systemd";
1657 execve(a[0], a, env_use);
1659 a[0] = (char*) "/sbin/init";
1660 execve(a[0], a, env_use);
/* Not booting: run the given command, or a login-style bash from the
 * home directory when no command was given. */
1661 } else if (argc > optind)
1662 execvpe(argv[optind], argv + optind, env_use);
1664 chdir(home ? home : "/root");
1665 execle("/bin/bash", "-bash", NULL, env_use);
1668 log_error("execv() failed: %m");
1671 _exit(EXIT_FAILURE);
/* Parent side: register the machine, then release the child that is
 * waiting on the event fd. */
1677 r = register_machine(pid);
1681 eventfd_write(sync_fd, 1);
1682 close_nointr_nofail(sync_fd);
/* Relay data between our terminal and the pty until the session ends.
 * When booting, the container PID and SIGRTMIN+3 are handed to
 * process_pty() as well. */
1685 k = process_pty(master, &mask, arg_boot ? pid : 0, SIGRTMIN+3);
1694 /* Kill if it is not dead yet anyway */
1695 terminate_machine(pid);
1697 /* Redundant, but better safe than sorry */
1700 k = wait_for_terminate(pid, &status);
/* Evaluate how the container ended: a regular exit propagates its exit
 * code, SIGINT and SIGHUP are logged as shutdown and reboot, any other
 * signal or core dump is an error. */
1708 if (status.si_code == CLD_EXITED) {
1709 r = status.si_status;
1710 if (status.si_status != 0) {
1711 log_error("Container %s failed with error code %i.", arg_machine, status.si_status);
1716 log_debug("Container %s exited successfully.", arg_machine);
1718 } else if (status.si_code == CLD_KILLED &&
1719 status.si_status == SIGINT) {
1722 log_info("Container %s has been shut down.", arg_machine);
1725 } else if (status.si_code == CLD_KILLED &&
1726 status.si_status == SIGHUP) {
1729 log_info("Container %s is being rebooted.", arg_machine);
1731 } else if (status.si_code == CLD_KILLED ||
1732 status.si_code == CLD_DUMPED) {
1734 log_error("Container %s terminated by signal %s.", arg_machine, signal_to_string(status.si_status));
1738 log_error("Container %s failed due to unknown reason.", arg_machine);
1748 free(arg_directory);