1 /*-*- Mode: C; c-basic-offset: 8; indent-tabs-mode: nil -*-*/
4 This file is part of systemd.
6 Copyright 2010 Lennart Poettering
8 systemd is free software; you can redistribute it and/or modify it
9 under the terms of the GNU Lesser General Public License as published by
10 the Free Software Foundation; either version 2.1 of the License, or
11 (at your option) any later version.
13 systemd is distributed in the hope that it will be useful, but
14 WITHOUT ANY WARRANTY; without even the implied warranty of
15 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
16 Lesser General Public License for more details.
18 You should have received a copy of the GNU Lesser General Public License
19 along with systemd; If not, see <http://www.gnu.org/licenses/>.
25 #include <sys/types.h>
26 #include <sys/syscall.h>
27 #include <sys/mount.h>
33 #include <sys/prctl.h>
34 #include <sys/capability.h>
37 #include <sys/signalfd.h>
41 #include <sys/socket.h>
42 #include <linux/netlink.h>
43 #include <sys/eventfd.h>
45 #include <selinux/selinux.h>
48 #include "sd-daemon.h"
57 #include "cgroup-util.h"
59 #include "path-util.h"
60 #include "loopback-setup.h"
61 #include "dev-setup.h"
66 #include "bus-error.h"
68 #include "bus-kernel.h"
/* Journal linking mode selected with --link-journal= or -j.
 * NOTE(review): the enumerators are not visible in this view; LINK_AUTO,
 * LINK_NO, LINK_GUEST and LINK_HOST are the values used further down. */
72 typedef enum LinkJournal {
/* Command line settings: written by parse_argv(), read by the setup helpers
 * and by main(). */
79 static char *arg_directory = NULL;
80 static char *arg_user = NULL;
81 static sd_id128_t arg_uuid = {};
82 static char *arg_machine = NULL;
83 static char *arg_selinux_context = NULL;
84 static char *arg_selinux_apifs_context = NULL;
85 static const char *arg_slice = NULL;
86 static bool arg_private_network = false;
87 static bool arg_read_only = false;
88 static bool arg_boot = false;
89 static LinkJournal arg_link_journal = LINK_AUTO;
/* Bit mask of capabilities, one bit per CAP_* constant. --capability= sets
 * bits and --drop-capability= clears them; drop_capabilities() hands the
 * complement of this mask to capability_bounding_set_drop().
 * NOTE(review): some initializer lines are missing from this view, so the
 * list below may not be the complete default set. */
90 static uint64_t arg_retain =
92 (1ULL << CAP_DAC_OVERRIDE) |
93 (1ULL << CAP_DAC_READ_SEARCH) |
94 (1ULL << CAP_FOWNER) |
95 (1ULL << CAP_FSETID) |
96 (1ULL << CAP_IPC_OWNER) |
99 (1ULL << CAP_LINUX_IMMUTABLE) |
100 (1ULL << CAP_NET_BIND_SERVICE) |
101 (1ULL << CAP_NET_BROADCAST) |
102 (1ULL << CAP_NET_RAW) |
103 (1ULL << CAP_SETGID) |
104 (1ULL << CAP_SETFCAP) |
105 (1ULL << CAP_SETPCAP) |
106 (1ULL << CAP_SETUID) |
107 (1ULL << CAP_SYS_ADMIN) |
108 (1ULL << CAP_SYS_CHROOT) |
109 (1ULL << CAP_SYS_NICE) |
110 (1ULL << CAP_SYS_PTRACE) |
111 (1ULL << CAP_SYS_TTY_CONFIG) |
112 (1ULL << CAP_SYS_RESOURCE) |
113 (1ULL << CAP_SYS_BOOT) |
114 (1ULL << CAP_AUDIT_WRITE) |
115 (1ULL << CAP_AUDIT_CONTROL) |
/* Flat source/destination pair lists for --bind= and --bind-ro=; each option
 * appends two entries (see parse_argv) and mount_binds() walks them pairwise. */
117 static char **arg_bind = NULL;
118 static char **arg_bind_ro = NULL;
/* Extra environment assignments collected from --setenv=. */
119 static char **arg_setenv = NULL;
120 static bool arg_quiet = false;
121 static bool arg_share_system = false;
/* Registration is on by default; parse_argv() switches it off when
 * --share-system is given. */
122 static bool arg_register = true;
123 static bool arg_keep_unit = false;
/* Prints the usage text with printf(), substituting
 * program_invocation_short_name for the leading %s.
 * NOTE(review): the end of the function (return value) and two continuation
 * lines of the text are not visible in this view. */
125 static int help(void) {
127 printf("%s [OPTIONS...] [PATH] [ARGUMENTS...]\n\n"
128 "Spawn a minimal namespace container for debugging, testing and building.\n\n"
129 " -h --help Show this help\n"
130 " --version Print version string\n"
131 " -D --directory=NAME Root directory for the container\n"
132 " -b --boot Boot up full system (i.e. invoke init)\n"
133 " -u --user=USER Run the command under specified user or uid\n"
134 " --uuid=UUID Set a specific machine UUID for the container\n"
135 " -M --machine=NAME Set the machine name for the container\n"
136 " -S --slice=SLICE Place the container in the specified slice\n"
137 " -Z --selinux-context=SECLABEL\n"
138 " Set the SELinux security context to be used by\n"
139 " processes in the container\n"
140 " -L --selinux-apifs-context=SECLABEL\n"
141 " Set the SELinux security context to be used by\n"
142 " API/tmpfs file systems in the container\n"
143 " --private-network Disable network in container\n"
144 " --share-system Share system namespaces with host\n"
145 " --read-only Mount the root directory read-only\n"
146 " --capability=CAP In addition to the default, retain specified\n"
148 " --drop-capability=CAP Drop the specified capability from the default set\n"
149 " --link-journal=MODE Link up guest journal, one of no, auto, guest, host\n"
/* NOTE(review): the text below says host, but parse_argv() contains an
 * assignment of LINK_GUEST right before the ARG_LINK_JOURNAL case, which
 * looks like the -j handler. One of the two is wrong; confirm against the
 * manual page and fix the text or the code. */
150 " -j Equivalent to --link-journal=host\n"
151 " --bind=PATH[:PATH] Bind mount a file or directory from the host into\n"
153 " --bind-ro=PATH[:PATH] Similar, but creates a read-only bind mount\n"
154 " --setenv=NAME=VALUE Pass an environment variable to PID 1\n"
155 " --register=BOOLEAN Register container as machine\n"
156 " --keep-unit Do not register a scope for the machine, reuse\n"
157 " the service unit nspawn is running in\n"
158 " -q --quiet Do not show status information\n",
159 program_invocation_short_name);
/* Parses the command line with getopt_long() and stores the result in the
 * file-scope arg_* variables, then rejects option combinations that cannot
 * work together.
 * NOTE(review): the case labels, break statements, error returns and the
 * ARG_* enum are mostly missing from this view; the comments below are tied
 * to the statements that are visible. */
164 static int parse_argv(int argc, char *argv[]) {
182 static const struct option options[] = {
183 { "help", no_argument, NULL, 'h' },
184 { "version", no_argument, NULL, ARG_VERSION },
185 { "directory", required_argument, NULL, 'D' },
186 { "user", required_argument, NULL, 'u' },
187 { "private-network", no_argument, NULL, ARG_PRIVATE_NETWORK },
188 { "boot", no_argument, NULL, 'b' },
189 { "uuid", required_argument, NULL, ARG_UUID },
190 { "read-only", no_argument, NULL, ARG_READ_ONLY },
191 { "capability", required_argument, NULL, ARG_CAPABILITY },
192 { "drop-capability", required_argument, NULL, ARG_DROP_CAPABILITY },
193 { "link-journal", required_argument, NULL, ARG_LINK_JOURNAL },
194 { "bind", required_argument, NULL, ARG_BIND },
195 { "bind-ro", required_argument, NULL, ARG_BIND_RO },
196 { "machine", required_argument, NULL, 'M' },
197 { "slice", required_argument, NULL, 'S' },
198 { "setenv", required_argument, NULL, ARG_SETENV },
199 { "selinux-context", required_argument, NULL, 'Z' },
200 { "selinux-apifs-context", required_argument, NULL, 'L' },
201 { "quiet", no_argument, NULL, 'q' },
202 { "share-system", no_argument, NULL, ARG_SHARE_SYSTEM },
203 { "register", required_argument, NULL, ARG_REGISTER },
204 { "keep-unit", no_argument, NULL, ARG_KEEP_UNIT },
/* The leading + in the short option string is the GNU getopt request to stop
 * at the first non-option argument, so options after PATH are left for the
 * command run in the container. */
213 while ((c = getopt_long(argc, argv, "+hD:u:bL:M:jS:Z:q", options, NULL)) >= 0) {
221 puts(PACKAGE_STRING);
222 puts(SYSTEMD_FEATURES);
/* canonicalize_file_name() returns a newly allocated absolute path or NULL;
 * main() frees arg_directory. */
227 arg_directory = canonicalize_file_name(optarg);
228 if (!arg_directory) {
229 log_error("Invalid root directory: %m");
237 arg_user = strdup(optarg);
243 case ARG_PRIVATE_NETWORK:
244 arg_private_network = true;
252 r = sd_id128_from_string(optarg, &arg_uuid);
254 log_error("Invalid UUID: %s", optarg);
/* NOTE(review): arg_slice is declared const char * but receives a strdup()
 * result here; the NULL check is not visible in this view. */
260 arg_slice = strdup(optarg);
267 if (isempty(optarg)) {
272 if (!hostname_is_valid(optarg)) {
273 log_error("Invalid machine name: %s", optarg);
278 arg_machine = strdup(optarg);
/* NOTE(review): unlike the strdup() copies above, both SELinux contexts keep
 * the optarg pointer itself, i.e. they alias argv storage. */
286 arg_selinux_context = optarg;
290 arg_selinux_apifs_context = optarg;
294 arg_read_only = true;
/* --capability= and --drop-capability= share one handler: the argument is a
 * comma separated list of capability names, each of which sets or clears its
 * bit in arg_retain depending on which option was given. */
298 case ARG_DROP_CAPABILITY: {
302 FOREACH_WORD_SEPARATOR(word, length, optarg, ",", state) {
306 t = strndup(word, length);
310 if (cap_from_name(t, &cap) < 0) {
311 log_error("Failed to parse capability %s.", t);
318 if (c == ARG_CAPABILITY)
319 arg_retain |= 1ULL << (uint64_t) cap;
321 arg_retain &= ~(1ULL << (uint64_t) cap);
/* NOTE(review): presumably the -j case (its label is not visible). It selects
 * LINK_GUEST while help() documents -j as --link-journal=host; confirm which
 * is intended. */
328 arg_link_journal = LINK_GUEST;
331 case ARG_LINK_JOURNAL:
332 if (streq(optarg, "auto"))
333 arg_link_journal = LINK_AUTO;
334 else if (streq(optarg, "no"))
335 arg_link_journal = LINK_NO;
336 else if (streq(optarg, "guest"))
337 arg_link_journal = LINK_GUEST;
338 else if (streq(optarg, "host"))
339 arg_link_journal = LINK_HOST;
341 log_error("Failed to parse link journal mode %s", optarg);
/* --bind= and --bind-ro=: the argument is split at the first colon into a and
 * b, both must be absolute paths, and both are appended to the selected list
 * so that the list holds source/destination pairs.
 * NOTE(review): how b is set when no colon is present is not visible here. */
349 _cleanup_free_ char *a = NULL, *b = NULL;
353 x = c == ARG_BIND ? &arg_bind : &arg_bind_ro;
355 e = strchr(optarg, ':');
357 a = strndup(optarg, e - optarg);
367 if (!path_is_absolute(a) || !path_is_absolute(b)) {
368 log_error("Invalid bind mount specification: %s", optarg);
372 r = strv_extend(x, a);
376 r = strv_extend(x, b);
/* --setenv=: strv_env_set() yields a new list n; the old list is released.
 * Presumably arg_setenv is then pointed at n in a line not visible here. */
386 if (!env_assignment_is_valid(optarg)) {
387 log_error("Environment variable assignment '%s' is not valid.", optarg);
391 n = strv_env_set(arg_setenv, optarg);
395 strv_free(arg_setenv);
404 case ARG_SHARE_SYSTEM:
405 arg_share_system = true;
409 r = parse_boolean(optarg);
411 log_error("Failed to parse --register= argument: %s", optarg);
419 arg_keep_unit = true;
426 assert_not_reached("Unhandled option");
/* Post-processing: --share-system always disables machine registration. */
430 if (arg_share_system)
431 arg_register = false;
433 if (arg_boot && arg_share_system) {
434 log_error("--boot and --share-system may not be combined.");
/* A non-negative result from cg_pid_get_owner_uid() for the calling process
 * is treated as running inside a user session, where --keep-unit is refused. */
438 if (arg_keep_unit && cg_pid_get_owner_uid(0, NULL) >= 0) {
439 log_error("--keep-unit may not be used when invoked from a user session.");
/* Mounts the API file systems listed in mount_table below the container root
 * dest. Entries are processed in table order.
 * NOTE(review): the MountPoint field list is not visible; from the uses below
 * the columns appear to be what, where, type, options, flags, fatal. */
446 static int mount_all(const char *dest) {
448 typedef struct MountPoint {
457 static const MountPoint mount_table[] = {
458 { "proc", "/proc", "proc", NULL, MS_NOSUID|MS_NOEXEC|MS_NODEV, true },
459 { "/proc/sys", "/proc/sys", NULL, NULL, MS_BIND, true }, /* Bind mount first */
460 { NULL, "/proc/sys", NULL, NULL, MS_BIND|MS_RDONLY|MS_REMOUNT, true }, /* Then, make it r/o */
461 { "sysfs", "/sys", "sysfs", NULL, MS_RDONLY|MS_NOSUID|MS_NOEXEC|MS_NODEV, true },
462 { "tmpfs", "/dev", "tmpfs", "mode=755", MS_NOSUID|MS_STRICTATIME, true },
463 { "devpts", "/dev/pts", "devpts","newinstance,ptmxmode=0666,mode=620,gid=" STRINGIFY(TTY_GID), MS_NOSUID|MS_NOEXEC, true },
464 { "tmpfs", "/dev/shm", "tmpfs", "mode=1777", MS_NOSUID|MS_NODEV|MS_STRICTATIME, true },
465 { "tmpfs", "/run", "tmpfs", "mode=755", MS_NOSUID|MS_NODEV|MS_STRICTATIME, true },
467 { "/sys/fs/selinux", "/sys/fs/selinux", NULL, NULL, MS_BIND, false }, /* Bind mount first */
468 { NULL, "/sys/fs/selinux", NULL, NULL, MS_BIND|MS_RDONLY|MS_REMOUNT, false }, /* Then, make it r/o */
475 for (k = 0; k < ELEMENTSOF(mount_table); k++) {
476 _cleanup_free_ char *where = NULL;
478 _cleanup_free_ char *options = NULL;
/* Target path inside the container: dest + "/" + table path. */
483 where = strjoin(dest, "/", mount_table[k].where, NULL);
487 t = path_is_mount_point(where, true);
489 log_error("Failed to detect whether %s is a mount point: %s", where, strerror(-t));
/* Entries with a NULL what are the remount rows of the table; only rows with
 * a source are tested against an already existing mount. */
497 /* Skip this entry if it is not a remount. */
498 if (mount_table[k].what && t > 0)
501 mkdir_p(where, 0755);
/* With --selinux-apifs-context= the tmpfs and devpts rows get a context=
 * option appended to their table options; all such rows in the table above
 * have a non-NULL options string. */
504 if (arg_selinux_apifs_context &&
505 (streq_ptr(mount_table[k].what, "tmpfs") || streq_ptr(mount_table[k].what, "devpts"))) {
506 options = strjoin(mount_table[k].options, ",context=\"", arg_selinux_apifs_context, "\"", NULL);
513 o = mount_table[k].options;
/* A failing mount() is only reported when the row is marked fatal. */
516 if (mount(mount_table[k].what,
519 mount_table[k].flags,
521 mount_table[k].fatal) {
523 log_error("mount(%s) failed: %m", where);
/* Bind mounts every source/destination pair of the list l below dest.
 * x iterates over host sources and y over the matching container paths.
 * When flags is non-zero each mount is additionally remounted with
 * MS_REMOUNT|MS_BIND|flags (main() passes MS_RDONLY for --bind-ro=). */
533 static int mount_binds(const char *dest, char **l, unsigned long flags) {
536 STRV_FOREACH_PAIR(x, y, l) {
538 struct stat source_st, dest_st;
541 if (stat(*x, &source_st) < 0) {
542 log_error("failed to stat %s: %m", *x);
546 where = strappenda(dest, *y);
547 r = stat(where, &dest_st);
/* Existing destination: its file type must match the source. */
549 if ((source_st.st_mode & S_IFMT) != (dest_st.st_mode & S_IFMT)) {
550 log_error("The file types of %s and %s do not match. Refusing bind mount",
/* Missing destination: create its parent directories first. */
554 } else if (errno == ENOENT) {
555 r = mkdir_parents_label(where, 0755);
557 log_error("Failed to bind mount %s: %s", *x, strerror(-r));
561 log_error("Failed to bind mount %s: %s", *x, strerror(errno));
/* NOTE(review): the results of mkdir_label() and mknod() below are not
 * checked in the visible lines; a failure would only surface in the mount()
 * call that follows. */
564 /* Create the mount point, but be conservative -- refuse to create block
565 * and char devices. */
566 if (S_ISDIR(source_st.st_mode))
567 mkdir_label(where, 0755);
568 else if (S_ISFIFO(source_st.st_mode))
570 else if (S_ISSOCK(source_st.st_mode))
571 mknod(where, 0644 | S_IFSOCK, 0);
572 else if (S_ISREG(source_st.st_mode))
575 log_error("Refusing to create mountpoint for file: %s", *x);
579 if (mount(*x, where, "bind", MS_BIND, NULL) < 0) {
580 log_error("mount(%s) failed: %m", where);
/* Second pass applies the extra flags to the bind mount just created. */
584 if (flags && mount(NULL, where, NULL, MS_REMOUNT|MS_BIND|flags, NULL) < 0) {
585 log_error("mount(%s) failed: %m", where);
/* Points /etc/localtime inside the container at the same zoneinfo entry the
 * host uses. The host link must point into /usr/share/zoneinfo/ (absolute or
 * via ../usr/share/zoneinfo/); the zone must exist in the container.
 * NOTE(review): the return statements are not visible; the warnings suggest
 * the unsupported cases are skipped rather than treated as fatal. */
593 static int setup_timezone(const char *dest) {
594 _cleanup_free_ char *where = NULL, *p = NULL, *q = NULL, *check = NULL, *what = NULL;
600 /* Fix the timezone, if possible */
601 r = readlink_malloc("/etc/localtime", &p);
603 log_warning("/etc/localtime is not a symlink, not updating container timezone.");
/* z is the zone name relative to the zoneinfo directory on the host. */
607 z = path_startswith(p, "../usr/share/zoneinfo/");
609 z = path_startswith(p, "/usr/share/zoneinfo/");
611 log_warning("/etc/localtime does not point into /usr/share/zoneinfo/, not updating container timezone.");
615 where = strappend(dest, "/etc/localtime");
/* y is the zone the container link currently names, if it has one. */
619 r = readlink_malloc(where, &q);
621 y = path_startswith(q, "../usr/share/zoneinfo/");
623 y = path_startswith(q, "/usr/share/zoneinfo/");
626 /* Already pointing to the right place? Then do nothing .. */
627 if (y && streq(y, z))
631 check = strjoin(dest, "/usr/share/zoneinfo/", z, NULL);
635 if (access(check, F_OK) < 0) {
636 log_warning("Timezone %s does not exist in container, not updating container timezone.", z);
/* The new link is written in relative form. Presumably the old link is
 * removed in a line not visible here before symlink() is called. */
640 what = strappend("../usr/share/zoneinfo/", z);
645 if (symlink(what, where) < 0) {
646 log_error("Failed to correct timezone of container: %m");
/* Copies the host /etc/resolv.conf to the same path below dest. Nothing is
 * copied when --private-network is in effect. The copy is best effort. */
653 static int setup_resolv_conf(const char *dest) {
/* NOTE(review): attribute is written after the type here, while the rest of
 * the file writes _cleanup_free_ char *. */
654 char _cleanup_free_ *where = NULL;
658 if (arg_private_network)
661 /* Fix resolv.conf, if possible */
662 where = strappend(dest, "/etc/resolv.conf");
666 /* We don't really care for the results of this really. If it
667 * fails, it fails, but meh... */
668 copy_file("/etc/resolv.conf", where, O_TRUNC|O_NOFOLLOW);
/* Gives the container its own boot ID: a random 128 bit ID is formatted as a
 * UUID string, written to a file under dest/dev and bind mounted over
 * dest/proc/sys/kernel/random/boot_id. Skipped with --share-system. */
673 static int setup_boot_id(const char *dest) {
674 _cleanup_free_ char *from = NULL, *to = NULL;
681 if (arg_share_system)
684 /* Generate a new randomized boot ID, so that each boot-up of
685 * the container gets a new one */
687 from = strappend(dest, "/dev/proc-sys-kernel-random-boot-id");
688 to = strappend(dest, "/proc/sys/kernel/random/boot_id");
692 r = sd_id128_randomize(&rnd);
694 log_error("Failed to generate random boot id: %s", strerror(-r));
698 snprintf(as_uuid, sizeof(as_uuid),
699 "%02x%02x%02x%02x-%02x%02x-%02x%02x-%02x%02x-%02x%02x%02x%02x%02x%02x",
700 SD_ID128_FORMAT_VAL(rnd));
701 char_array_0(as_uuid);
703 r = write_string_file(from, as_uuid);
705 log_error("Failed to write boot id: %s", strerror(-r));
/* Failing to bind mount is reported as an error; failing to make the mount
 * read-only afterwards is only a warning. */
709 if (mount(from, to, "bind", MS_BIND, NULL) < 0) {
710 log_error("Failed to bind mount boot id: %m");
712 } else if (mount(from, to, "bind", MS_BIND|MS_REMOUNT|MS_RDONLY, NULL))
713 log_warning("Failed to make boot id read-only: %m");
/* Recreates selected host device nodes below dest/dev: each name in the
 * devnodes list is looked up under /dev on the host and re-created with
 * mknod() using the same mode and device number.
 * NOTE(review): the contents of the devnodes list are not visible here. */
719 static int copy_devnodes(const char *dest) {
721 static const char devnodes[] =
731 _cleanup_umask_ mode_t u;
737 NULSTR_FOREACH(d, devnodes) {
738 _cleanup_free_ char *from = NULL, *to = NULL;
741 from = strappend("/dev/", d);
742 to = strjoin(dest, "/dev/", d, NULL);
/* A node missing on the host (ENOENT) is not reported; other stat() errors
 * are logged. */
746 if (stat(from, &st) < 0) {
748 if (errno != ENOENT) {
749 log_error("Failed to stat %s: %m", from);
753 } else if (!S_ISCHR(st.st_mode) && !S_ISBLK(st.st_mode)) {
755 log_error("%s is not a char or block device, cannot copy", from);
/* NOTE(review): the message below prints dest, but the path that failed is
 * to; the log therefore names the container root instead of the node. */
758 } else if (mknod(to, st.st_mode, st.st_rdev) < 0) {
760 log_error("mknod(%s) failed: %m", dest);
/* Creates dest/dev/ptmx as a relative symlink to pts/ptmx, i.e. to the ptmx
 * node of the devpts instance mounted at dev/pts by mount_all(). */
768 static int setup_ptmx(const char *dest) {
769 _cleanup_free_ char *p = NULL;
771 p = strappend(dest, "/dev/ptmx");
775 if (symlink("pts/ptmx", p) < 0) {
776 log_error("Failed to create /dev/ptmx symlink: %m");
/* Makes the pty named by console available as dest/dev/console: the tty is
 * checked to be a character device, set to mode 0600 owned by 0:0, a device
 * node is created as mount target and the tty is bind mounted over it. */
783 static int setup_dev_console(const char *dest, const char *console) {
785 _cleanup_free_ char *to = NULL;
787 _cleanup_umask_ mode_t u;
794 if (stat(console, &st) < 0) {
795 log_error("Failed to stat %s: %m", console);
/* NOTE(review): the message below hard-codes /dev/console although the path
 * that was checked is the console argument. */
798 } else if (!S_ISCHR(st.st_mode)) {
799 log_error("/dev/console is not a char device");
803 r = chmod_and_chown(console, 0600, 0, 0);
805 log_error("Failed to correct access mode for TTY: %s", strerror(-r));
809 if (asprintf(&to, "%s/dev/console", dest) < 0)
812 /* We need to bind mount the right tty to /dev/console since
813 * ptys can only exist on pts file systems. To have something
814 * to bind mount things on we create a device node first, that
815 * has the right major/minor (note that the major minor
816 * doesn't actually matter here, since we mount it over
819 if (mknod(to, (st.st_mode & ~07777) | 0600, st.st_rdev) < 0) {
820 log_error("mknod() for /dev/console failed: %m");
824 if (mount(console, to, "bind", MS_BIND, NULL) < 0) {
825 log_error("Bind mount for /dev/console failed: %m");
832 static int setup_kmsg(const char *dest, int kmsg_socket) {
833 _cleanup_free_ char *from = NULL, *to = NULL;
835 _cleanup_umask_ mode_t u;
837 struct cmsghdr cmsghdr;
838 uint8_t buf[CMSG_SPACE(sizeof(int))];
841 .msg_control = &control,
842 .msg_controllen = sizeof(control),
844 struct cmsghdr *cmsg;
847 assert(kmsg_socket >= 0);
851 /* We create the kmsg FIFO as /dev/kmsg, but immediately
852 * delete it after bind mounting it to /proc/kmsg. While FIFOs
853 * on the reading side behave very similar to /proc/kmsg,
854 * their writing side behaves differently from /dev/kmsg in
855 * that writing blocks when nothing is reading. In order to
856 * avoid any problems with containers deadlocking due to this
857 * we simply make /dev/kmsg unavailable to the container. */
/* from is the FIFO path under dest/dev, to is the bind mount target under
 * dest/proc. */
858 if (asprintf(&from, "%s/dev/kmsg", dest) < 0 ||
859 asprintf(&to, "%s/proc/kmsg", dest) < 0)
862 if (mkfifo(from, 0600) < 0) {
863 log_error("mkfifo() for /dev/kmsg failed: %m");
867 r = chmod_and_chown(from, 0600, 0, 0);
869 log_error("Failed to correct access mode for /dev/kmsg: %s", strerror(-r));
873 if (mount(from, to, "bind", MS_BIND, NULL) < 0) {
874 log_error("Bind mount for /proc/kmsg failed: %m");
/* The FIFO is opened read-write and non-blocking so the open itself does not
 * wait for a peer. */
878 fd = open(from, O_RDWR|O_NDELAY|O_CLOEXEC);
880 log_error("Failed to open fifo: %m");
/* Build a single SCM_RIGHTS control message carrying fd, and shrink the
 * control length to the size actually used. */
884 cmsg = CMSG_FIRSTHDR(&mh);
885 cmsg->cmsg_level = SOL_SOCKET;
886 cmsg->cmsg_type = SCM_RIGHTS;
887 cmsg->cmsg_len = CMSG_LEN(sizeof(int));
888 memcpy(CMSG_DATA(cmsg), &fd, sizeof(int));
890 mh.msg_controllen = cmsg->cmsg_len;
892 /* Store away the fd in the socket, so that it stays open as
893 * long as we run the child */
/* The local fd is closed right after sendmsg(), before the result k is
 * examined; the copy queued in the socket is what keeps the FIFO open. */
894 k = sendmsg(kmsg_socket, &mh, MSG_DONTWAIT|MSG_NOSIGNAL);
895 close_nointr_nofail(fd);
898 log_error("Failed to send FIFO fd: %m");
902 /* And now make the FIFO unavailable as /dev/kmsg... */
/* Sets the hostname of the calling process to arg_machine. Nothing is done
 * with --share-system, where main() does not request a new UTS namespace.
 * NOTE(review): the return statements are not visible in this view. */
907 static int setup_hostname(void) {
909 if (arg_share_system)
912 if (sethostname(arg_machine, strlen(arg_machine)) < 0)
/* Connects the journal directory of the container with the host according to
 * arg_link_journal. p is /var/log/journal/ID on the host, q the same path
 * below directory, where ID is the machine ID read from the container.
 * NOTE(review): several conditions and all return statements are missing
 * from this view (including the declaration and assignment of id), so the
 * comments below describe only the visible statements. */
918 static int setup_journal(const char *directory) {
919 sd_id128_t machine_id, this_id;
920 _cleanup_free_ char *p = NULL, *b = NULL, *q = NULL, *d = NULL;
924 p = strappend(directory, "/etc/machine-id");
/* In auto mode a missing or empty machine-id file is tolerated. */
928 r = read_one_line_file(p, &b);
929 if (r == -ENOENT && arg_link_journal == LINK_AUTO)
932 log_error("Failed to read machine ID from %s: %s", p, strerror(-r));
937 if (isempty(id) && arg_link_journal == LINK_AUTO)
940 /* Verify validity */
941 r = sd_id128_from_string(id, &machine_id);
943 log_error("Failed to parse machine ID from %s: %s", p, strerror(-r));
947 r = sd_id128_get_machine(&this_id);
949 log_error("Failed to retrieve machine ID: %s", strerror(-r));
/* A container that shares the machine ID of the host is never linked; the
 * log level is a warning in auto mode and an error otherwise. */
953 if (sd_id128_equal(machine_id, this_id)) {
954 log_full(arg_link_journal == LINK_AUTO ? LOG_WARNING : LOG_ERR,
955 "Host and machine ids are equal (%s): refusing to link journals", id);
956 if (arg_link_journal == LINK_AUTO)
962 if (arg_link_journal == LINK_NO)
966 p = strappend("/var/log/journal/", id);
967 q = strjoin(directory, "/var/log/journal/", id, NULL);
/* Either side already being a mount point is an error only when a mode was
 * requested explicitly. */
971 if (path_is_mount_point(p, false) > 0) {
972 if (arg_link_journal != LINK_AUTO) {
973 log_error("%s: already a mount point, refusing to use for journal", p);
980 if (path_is_mount_point(q, false) > 0) {
981 if (arg_link_journal != LINK_AUTO) {
982 log_error("%s: already a mount point, refusing to use for journal", q);
/* Inspect what currently exists at the host path p: a symlink (r >= 0,
 * presumably), something that is not a symlink (-EINVAL), or nothing
 * (-ENOENT). */
989 r = readlink_and_make_absolute(p, &d);
991 if ((arg_link_journal == LINK_GUEST ||
992 arg_link_journal == LINK_AUTO) &&
/* NOTE(review): here and below %m is used after helpers whose result is
 * stored in r (mkdir_p, readlink_and_make_absolute); whether errno still
 * describes the failure at that point is not visible here - confirm, or
 * format strerror(-r) instead. */
995 r = mkdir_p(q, 0755);
997 log_warning("failed to create directory %s: %m", q);
1001 if (unlink(p) < 0) {
1002 log_error("Failed to remove symlink %s: %m", p);
1005 } else if (r == -EINVAL) {
1007 if (arg_link_journal == LINK_GUEST &&
1010 if (errno == ENOTDIR) {
1011 log_error("%s already exists and is neither a symlink nor a directory", p);
1014 log_error("Failed to remove %s: %m", p);
1018 } else if (r != -ENOENT) {
1019 log_error("readlink(%s) failed: %m", p);
/* Guest mode: the host path becomes a symlink to the container directory. */
1023 if (arg_link_journal == LINK_GUEST) {
1025 if (symlink(q, p) < 0) {
1026 log_error("Failed to symlink %s to %s: %m", q, p);
1030 r = mkdir_p(q, 0755);
1032 log_warning("failed to create directory %s: %m", q);
/* Host mode: the host directory is created if needed. In the remaining case
 * linking depends on the host directory already existing. */
1036 if (arg_link_journal == LINK_HOST) {
1037 r = mkdir_p(p, 0755);
1039 log_error("Failed to create %s: %m", p);
1043 } else if (access(p, F_OK) < 0)
/* The container directory must not already hold data before the host
 * directory is bind mounted on top of it. */
1046 if (dir_is_empty(q) == 0) {
1047 log_error("%s not empty.", q);
1051 r = mkdir_p(q, 0755);
1053 log_error("Failed to create %s: %m", q);
1057 if (mount(p, q, "bind", MS_BIND, NULL) < 0) {
1058 log_error("Failed to bind mount journal from host into guest: %m");
/* Creates the directory dest/dev/kdbus and bind mounts the kdbus domain
 * directory path onto it.
 * NOTE(review): main() initialises kdbus_domain to NULL and only sets it when
 * /dev/kdbus/control exists; confirm a NULL path is handled in the lines not
 * visible here before mount() is reached. */
1065 static int setup_kdbus(const char *dest, const char *path) {
1071 p = strappenda(dest, "/dev/kdbus");
1072 if (mkdir(p, 0755) < 0) {
1073 log_error("Failed to create kdbus path: %m");
1077 if (mount(path, p, "bind", MS_BIND, NULL) < 0) {
1078 log_error("Failed to mount kdbus domain path: %m");
/* Drops every capability whose bit is not set in arg_retain from the
 * bounding set by passing the complemented mask to
 * capability_bounding_set_drop(), and returns that result unchanged. */
1085 static int drop_capabilities(void) {
1086 return capability_bounding_set_drop(~arg_retain, false);
/* Announces the container process pid to org.freedesktop.machine1 over the
 * system bus. Two different Manager calls are made depending on
 * --keep-unit; only the second one passes a Slice property.
 * NOTE(review): the method names and most call arguments are on lines not
 * visible in this view. */
1089 static int register_machine(pid_t pid) {
1090 _cleanup_bus_error_free_ sd_bus_error error = SD_BUS_ERROR_NULL;
1091 _cleanup_bus_unref_ sd_bus *bus = NULL;
1097 r = sd_bus_default_system(&bus);
1099 log_error("Failed to open system bus: %s", strerror(-r));
1103 if (arg_keep_unit) {
1104 r = sd_bus_call_method(
1106 "org.freedesktop.machine1",
1107 "/org/freedesktop/machine1",
1108 "org.freedesktop.machine1.Manager",
1114 SD_BUS_MESSAGE_APPEND_ID128(arg_uuid),
1118 strempty(arg_directory));
1120 r = sd_bus_call_method(
1122 "org.freedesktop.machine1",
1123 "/org/freedesktop/machine1",
1124 "org.freedesktop.machine1.Manager",
1130 SD_BUS_MESSAGE_APPEND_ID128(arg_uuid),
1134 strempty(arg_directory),
/* The first argument evaluates to 1 when a slice was given and 0 otherwise;
 * presumably it is the element count of the property array that follows. */
1135 !isempty(arg_slice), "Slice", "s", arg_slice);
1139 log_error("Failed to register machine: %s", bus_error_message(&error, r));
/* Asks org.freedesktop.machine1 to terminate the machine belonging to pid:
 * a Manager call yields the object path of the machine, then a call on the
 * Machine interface of that path is made. Failures of either call are logged
 * at debug level only.
 * NOTE(review): the method names and the return statements are on lines not
 * visible in this view. */
1146 static int terminate_machine(pid_t pid) {
1147 _cleanup_bus_error_free_ sd_bus_error error = SD_BUS_ERROR_NULL;
1148 _cleanup_bus_message_unref_ sd_bus_message *reply = NULL;
1149 _cleanup_bus_unref_ sd_bus *bus = NULL;
1156 r = sd_bus_default_system(&bus);
1158 log_error("Failed to open system bus: %s", strerror(-r));
1162 r = sd_bus_call_method(
1164 "org.freedesktop.machine1",
1165 "/org/freedesktop/machine1",
1166 "org.freedesktop.machine1.Manager",
1173 /* Note that the machine might already have been
1174 * cleaned up automatically, hence don't consider it a
1175 * failure if we cannot get the machine object. */
1176 log_debug("Failed to get machine: %s", bus_error_message(&error, r));
/* The reply carries a single object path. */
1180 r = sd_bus_message_read(reply, "o", &path);
1182 return bus_log_parse_error(r);
1184 r = sd_bus_call_method(
1186 "org.freedesktop.machine1",
1188 "org.freedesktop.machine1.Machine",
1194 log_debug("Failed to terminate machine: %s", bus_error_message(&error, r));
/* Resets the audit login UID of the calling process by writing 4294967295
 * (all bits set in 32 bit) to /proc/self/loginuid, unless the file already
 * holds that value. Skipped with --share-system. */
1201 static int reset_audit_loginuid(void) {
1202 _cleanup_free_ char *p = NULL;
1205 if (arg_share_system)
1208 r = read_one_line_file("/proc/self/loginuid", &p);
1212 log_error("Failed to read /proc/self/loginuid: %s", strerror(-r));
1216 /* Already reset? */
1217 if (streq(p, "4294967295"))
1220 r = write_string_file("/proc/self/loginuid", "4294967295");
/* NOTE(review): the message below reads "or to off auditing"; a verb such as
 * turn appears to be missing. The message also announces a 5s sleep, which
 * presumably happens in a line not visible here. */
1222 log_error("Failed to reset audit login UID. This probably means that your kernel is too\n"
1223 "old and you have audit enabled. Note that the auditing subsystem is known to\n"
1224 "be incompatible with containers on old kernels. Please make sure to upgrade\n"
1225 "your kernel or to off auditing with 'audit=0' on the kernel command line before\n"
1226 "using systemd-nspawn. Sleeping for 5s... (%s)\n", strerror(-r));
/* Entry point: parses options, prepares a pty and helper sockets, clones a
 * child into new namespaces, lets the child build the container file system
 * and exec the payload, and in the parent registers the machine, forwards
 * the pty and translates the child exit status.
 * NOTE(review): many conditions, goto/return statements and labels are
 * missing from this view; comments describe the visible statements only. */
1234 int main(int argc, char *argv[]) {
1236 int r = EXIT_FAILURE, k;
1237 _cleanup_close_ int master = -1, kdbus_fd = -1, sync_fd = -1;
1239 const char *console = NULL;
1241 _cleanup_close_pipe_ int kmsg_socket_pair[2] = { -1, -1 };
1242 _cleanup_fdset_free_ FDSet *fds = NULL;
1243 _cleanup_free_ char *kdbus_domain = NULL;
1245 log_parse_environment();
1248 k = parse_argv(argc, argv);
/* Container root: the -D argument made absolute, else the current directory. */
1256 if (arg_directory) {
1259 p = path_make_absolute_cwd(arg_directory);
1260 free(arg_directory);
1263 arg_directory = get_current_dir_name();
1265 if (!arg_directory) {
1266 log_error("Failed to determine path, please use -D.");
1270 path_kill_slashes(arg_directory);
/* Default machine name: last path component of the root directory, cleaned
 * up into a valid hostname. Presumably only used when -M was not given. */
1273 arg_machine = strdup(basename(arg_directory));
1279 hostname_cleanup(arg_machine, false);
1280 if (isempty(arg_machine)) {
1281 log_error("Failed to determine machine name automatically, please use -M.");
/* Preconditions: root privileges, a systemd host, and a root directory other
 * than the host root. */
1286 if (geteuid() != 0) {
1287 log_error("Need to be root.");
1291 if (sd_booted() <= 0) {
1292 log_error("Not running on a systemd system.");
1296 if (path_equal(arg_directory, "/")) {
1297 log_error("Spawning container on root directory not supported.");
1301 if (arg_boot && path_is_os_tree(arg_directory) <= 0) {
1302 log_error("Directory %s doesn't look like an OS root directory (/etc/os-release is missing). Refusing.", arg_directory);
/* File descriptors passed in via socket activation are collected so they can
 * be handed on to the container payload. */
1307 n_fd_passed = sd_listen_fds(false);
1308 if (n_fd_passed > 0) {
1309 k = fdset_new_listen_fds(&fds, false);
1311 log_error("Failed to collect file descriptors: %s", strerror(-k));
1315 fdset_close_others(fds);
/* Allocate the pty whose slave side becomes the container console. */
1318 master = posix_openpt(O_RDWR|O_NOCTTY|O_CLOEXEC|O_NDELAY);
1320 log_error("Failed to acquire pseudo tty: %m");
1324 console = ptsname(master);
1326 log_error("Failed to determine tty name: %m");
1331 log_info("Spawning container %s on %s. Press ^] three times within 1s to abort execution.", arg_machine, arg_directory);
1333 if (unlockpt(master) < 0) {
1334 log_error("Failed to unlock tty: %m");
/* kdbus: with --share-system the host /dev/kdbus is reused, otherwise a
 * domain named machine-NAME is created. */
1339 if (access("/dev/kdbus/control", F_OK) >= 0) {
1341 if (arg_share_system) {
1342 kdbus_domain = strdup("/dev/kdbus");
1343 if (!kdbus_domain) {
1350 ns = strappenda("machine-", arg_machine);
/* NOTE(review): the result is stored in kdbus_fd, but the failure message
 * formats r, which was not assigned by this call. Confirm which variable the
 * condition (not visible here) tests; kdbus_fd looks like the intended one. */
1351 kdbus_fd = bus_kernel_create_domain(ns, &kdbus_domain);
1353 log_debug("Failed to create kdbus domain: %s", strerror(-r));
1355 log_debug("Successfully created kdbus domain as %s", kdbus_domain);
/* Socket pair used by setup_kmsg() in the child to park the kmsg FIFO fd. */
1359 if (socketpair(AF_UNIX, SOCK_DGRAM|SOCK_NONBLOCK|SOCK_CLOEXEC, 0, kmsg_socket_pair) < 0) {
1360 log_error("Failed to create kmsg socket pair: %m");
1364 sd_notify(0, "READY=1");
/* These signals are blocked here; the mask is later passed to process_pty(). */
1366 assert_se(sigemptyset(&mask) == 0);
1367 sigset_add_many(&mask, SIGCHLD, SIGWINCH, SIGTERM, SIGINT, -1);
1368 assert_se(sigprocmask(SIG_BLOCK, &mask, NULL) == 0);
/* Event fd used as a gate: the child reads from it before exec, the parent
 * writes to it after register_machine(). */
1373 sync_fd = eventfd(0, EFD_CLOEXEC);
1375 log_error("Failed to create event fd: %m");
/* Raw clone: always a new mount namespace; new IPC, PID and UTS namespaces
 * unless --share-system; a new network namespace with --private-network. */
1379 pid = syscall(__NR_clone,
1380 SIGCHLD|CLONE_NEWNS|
1381 (arg_share_system ? 0 : CLONE_NEWIPC|CLONE_NEWPID|CLONE_NEWUTS)|
1382 (arg_private_network ? CLONE_NEWNET : 0), NULL);
1384 if (errno == EINVAL)
1385 log_error("clone() failed, do you have namespace support enabled in your kernel? (You need UTS, IPC, PID and NET namespacing built in): %m");
1387 log_error("clone() failed: %m");
/* Presumably the start of the child branch (the condition is not visible). */
1394 const char *home = NULL;
1395 uid_t uid = (uid_t) -1;
1396 gid_t gid = (gid_t) -1;
/* Environment for the payload; the NULL slots are filled in below through
 * the running index n_env. */
1398 const char *envp[] = {
1399 "PATH=" DEFAULT_PATH_SPLIT_USR,
1400 "container=systemd-nspawn", /* LXC sets container=lxc, so follow the scheme here */
1405 NULL, /* container_uuid */
1406 NULL, /* LISTEN_FDS */
1407 NULL, /* LISTEN_PID */
1413 envp[n_env] = strv_find_prefix(environ, "TERM=");
/* Detach from the inherited stdio and reopen the pty slave as fd 0, 1, 2. */
1417 close_nointr_nofail(master);
1420 close_nointr(STDIN_FILENO);
1421 close_nointr(STDOUT_FILENO);
1422 close_nointr(STDERR_FILENO);
1424 close_nointr_nofail(kmsg_socket_pair[0]);
1425 kmsg_socket_pair[0] = -1;
1427 reset_all_signal_handlers();
1429 assert_se(sigemptyset(&mask) == 0);
1430 assert_se(sigprocmask(SIG_SETMASK, &mask, NULL) == 0);
/* With fd 0 just closed, the terminal is expected to land on STDIN_FILENO. */
1432 k = open_terminal(console, O_RDWR);
1433 if (k != STDIN_FILENO) {
1435 close_nointr_nofail(k);
1439 log_error("Failed to open console: %s", strerror(-k));
1443 if (dup2(STDIN_FILENO, STDOUT_FILENO) != STDOUT_FILENO ||
1444 dup2(STDIN_FILENO, STDERR_FILENO) != STDERR_FILENO) {
1445 log_error("Failed to duplicate console: %m");
1450 log_error("setsid() failed: %m");
1454 if (reset_audit_loginuid() < 0)
/* The child is killed with SIGKILL when the parent dies. */
1457 if (prctl(PR_SET_PDEATHSIG, SIGKILL) < 0) {
1458 log_error("PR_SET_PDEATHSIG failed: %m");
1462 /* Mark everything as slave, so that we still
1463 * receive mounts from the real root, but don't
1464 * propagate mounts to the real root. */
1465 if (mount(NULL, "/", NULL, MS_SLAVE|MS_REC, NULL) < 0) {
1466 log_error("MS_SLAVE|MS_REC failed: %m");
/* NOTE(review): the two messages below do not include the errno text, unlike
 * the other mount() failures in this function. */
1470 /* Turn directory into bind mount */
1471 if (mount(arg_directory, arg_directory, "bind", MS_BIND|MS_REC, NULL) < 0) {
1472 log_error("Failed to make bind mount.");
/* Presumably guarded by arg_read_only in a line not visible here. */
1477 if (mount(arg_directory, arg_directory, "bind", MS_BIND|MS_REMOUNT|MS_RDONLY|MS_REC, NULL) < 0) {
1478 log_error("Failed to make read-only.");
/* Populate the container tree; any helper returning a negative value aborts
 * the child (except dev_setup(), whose result is not checked). */
1482 if (mount_all(arg_directory) < 0)
1485 if (copy_devnodes(arg_directory) < 0)
1488 if (setup_ptmx(arg_directory) < 0)
1491 dev_setup(arg_directory);
1493 if (setup_dev_console(arg_directory, console) < 0)
1496 if (setup_kmsg(arg_directory, kmsg_socket_pair[1]) < 0)
1499 close_nointr_nofail(kmsg_socket_pair[1]);
1500 kmsg_socket_pair[1] = -1;
1502 if (setup_boot_id(arg_directory) < 0)
1505 if (setup_timezone(arg_directory) < 0)
1508 if (setup_resolv_conf(arg_directory) < 0)
1511 if (setup_journal(arg_directory) < 0)
1514 if (mount_binds(arg_directory, arg_bind, 0) < 0)
1517 if (mount_binds(arg_directory, arg_bind_ro, MS_RDONLY) < 0)
1520 if (setup_kdbus(arg_directory, kdbus_domain) < 0)
/* Switch root: move the prepared tree onto /, chroot into it and reset the
 * working directory. */
1523 if (chdir(arg_directory) < 0) {
1524 log_error("chdir(%s) failed: %m", arg_directory);
1528 if (mount(arg_directory, "/", NULL, MS_MOVE, NULL) < 0) {
1529 log_error("mount(MS_MOVE) failed: %m");
1533 if (chroot(".") < 0) {
1534 log_error("chroot() failed: %m");
1538 if (chdir("/") < 0) {
1539 log_error("chdir() failed: %m");
1545 if (arg_private_network)
/* NOTE(review): %m is used although drop_capabilities() reports through its
 * return value; likewise get_user_creds() and the mkdir helpers further
 * down. Also, the setresgid()/setresuid() failure messages below name
 * setregid()/setreuid(). */
1548 if (drop_capabilities() < 0) {
1549 log_error("drop_capabilities() failed: %m");
1555 /* Note that this resolves user names
1556 * inside the container, and hence
1557 * accesses the NSS modules from the
1558 * container and not the host. This is
1561 if (get_user_creds((const char**)&arg_user, &uid, &gid, &home, NULL) < 0) {
1562 log_error("get_user_creds() failed: %m");
1566 if (mkdir_parents_label(home, 0775) < 0) {
1567 log_error("mkdir_parents_label() failed: %m");
1571 if (mkdir_safe_label(home, 0775, uid, gid) < 0) {
1572 log_error("mkdir_safe_label() failed: %m");
1576 if (initgroups((const char*)arg_user, gid) < 0) {
1577 log_error("initgroups() failed: %m");
1581 if (setresgid(gid, gid, gid) < 0) {
1582 log_error("setregid() failed: %m");
1586 if (setresuid(uid, uid, uid) < 0) {
1587 log_error("setreuid() failed: %m");
1591 /* Reset everything fully to 0, just in case */
1593 if (setgroups(0, NULL) < 0) {
1594 log_error("setgroups() failed: %m");
1598 if (setresgid(0, 0, 0) < 0) {
1599 log_error("setregid() failed: %m");
1603 if (setresuid(0, 0, 0) < 0) {
1604 log_error("setreuid() failed: %m");
/* Fill the reserved envp slots: HOME, USER, LOGNAME, then optionally
 * container_uuid and the socket activation variables. */
1609 if ((asprintf((char**)(envp + n_env++), "HOME=%s", home ? home: "/root") < 0) ||
1610 (asprintf((char**)(envp + n_env++), "USER=%s", arg_user ? arg_user : "root") < 0) ||
1611 (asprintf((char**)(envp + n_env++), "LOGNAME=%s", arg_user ? arg_user : "root") < 0)) {
1616 if (!sd_id128_equal(arg_uuid, SD_ID128_NULL)) {
1617 if (asprintf((char**)(envp + n_env++), "container_uuid=" SD_ID128_FORMAT_STR, SD_ID128_FORMAT_VAL(arg_uuid)) < 0) {
/* Passed-in fds must survive exec, so close-on-exec is cleared; LISTEN_PID
 * is set to 1. */
1623 if (fdset_size(fds) > 0) {
1624 k = fdset_cloexec(fds, false);
1626 log_error("Failed to unset O_CLOEXEC for file descriptors.");
1630 if ((asprintf((char **)(envp + n_env++), "LISTEN_FDS=%u", n_fd_passed) < 0) ||
1631 (asprintf((char **)(envp + n_env++), "LISTEN_PID=1") < 0)) {
/* Wait here until the parent has written to the event fd. */
1639 eventfd_read(sync_fd, &x);
1640 close_nointr_nofail(sync_fd);
/* --setenv= assignments are merged on top of envp when present. */
1643 if (!strv_isempty(arg_setenv)) {
1646 n = strv_env_merge(2, envp, arg_setenv);
1654 env_use = (char**) envp;
/* A setexeccon() failure is logged; no abort is visible in this view. */
1657 if (arg_selinux_context)
1658 if (setexeccon(arg_selinux_context) < 0)
1659 log_error("setexeccon(\"%s\") failed: %m", arg_selinux_context);
/* Payload selection. Boot mode tries three init paths in turn with the
 * remaining arguments appended: l counts those arguments plus the NULL
 * terminator of argv, and a[0] is reserved for the init path. Otherwise the
 * given command is run, or a login bash in the home directory. */
1665 /* Automatically search for the init system */
1667 l = 1 + argc - optind;
1668 a = newa(char*, l + 1);
1669 memcpy(a + 1, argv + optind, l * sizeof(char*));
1671 a[0] = (char*) "/usr/lib/systemd/systemd";
1672 execve(a[0], a, env_use);
1674 a[0] = (char*) "/lib/systemd/systemd";
1675 execve(a[0], a, env_use);
1677 a[0] = (char*) "/sbin/init";
1678 execve(a[0], a, env_use);
1679 } else if (argc > optind)
1680 execvpe(argv[optind], argv + optind, env_use);
1682 chdir(home ? home : "/root");
1683 execle("/bin/bash", "-bash", NULL, env_use);
/* Only reached when every exec attempt above returned. */
1686 log_error("execv() failed: %m");
1689 _exit(EXIT_FAILURE);
/* Parent side: register the machine, then release the child. */
1695 r = register_machine(pid);
1699 eventfd_write(sync_fd, 1);
1700 close_nointr_nofail(sync_fd);
/* Relay the pty until it finishes; in boot mode the child pid and SIGRTMIN+3
 * are handed to process_pty(). */
1703 k = process_pty(master, &mask, arg_boot ? pid : 0, SIGRTMIN+3);
1712 /* Kill if it is not dead yet anyway */
1713 terminate_machine(pid);
1715 /* Redundant, but better safe than sorry */
1718 k = wait_for_terminate(pid, &status);
/* Exit status mapping: a normal exit passes the status code through; death
 * by SIGINT is reported as shutdown and by SIGHUP as reboot; any other
 * signal or a core dump is an error. */
1726 if (status.si_code == CLD_EXITED) {
1727 r = status.si_status;
1728 if (status.si_status != 0) {
1729 log_error("Container %s failed with error code %i.", arg_machine, status.si_status);
1734 log_debug("Container %s exited successfully.", arg_machine);
1736 } else if (status.si_code == CLD_KILLED &&
1737 status.si_status == SIGINT) {
1740 log_info("Container %s has been shut down.", arg_machine);
1743 } else if (status.si_code == CLD_KILLED &&
1744 status.si_status == SIGHUP) {
1747 log_info("Container %s is being rebooted.", arg_machine);
1749 } else if (status.si_code == CLD_KILLED ||
1750 status.si_code == CLD_DUMPED) {
1752 log_error("Container %s terminated by signal %s.", arg_machine, signal_to_string(status.si_status));
1756 log_error("Container %s failed due to unknown reason.", arg_machine);
1766 free(arg_directory);