1 /*-*- Mode: C; c-basic-offset: 8; indent-tabs-mode: nil -*-*/
4 This file is part of systemd.
6 Copyright 2010 Lennart Poettering
8 systemd is free software; you can redistribute it and/or modify it
9 under the terms of the GNU Lesser General Public License as published by
10 the Free Software Foundation; either version 2.1 of the License, or
11 (at your option) any later version.
13 systemd is distributed in the hope that it will be useful, but
14 WITHOUT ANY WARRANTY; without even the implied warranty of
15 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
16 Lesser General Public License for more details.
18 You should have received a copy of the GNU Lesser General Public License
19 along with systemd; If not, see <http://www.gnu.org/licenses/>.
25 #include <sys/types.h>
26 #include <sys/syscall.h>
27 #include <sys/mount.h>
33 #include <sys/prctl.h>
34 #include <sys/capability.h>
37 #include <sys/signalfd.h>
41 #include <sys/socket.h>
42 #include <linux/netlink.h>
43 #include <sys/eventfd.h>
45 #include <selinux/selinux.h>
48 #include "sd-daemon.h"
57 #include "cgroup-util.h"
59 #include "path-util.h"
60 #include "loopback-setup.h"
61 #include "dev-setup.h"
66 #include "bus-error.h"
68 #include "bus-kernel.h"
/* Selects how the container journal is linked with the host journal. The
 * values used in this file are LINK_NO, LINK_AUTO, LINK_GUEST and LINK_HOST
 * (see the --link-journal= handling in parse_argv()). */
72 typedef enum LinkJournal {
/* Command line state. These defaults are overwritten by parse_argv(). */
79 static char *arg_directory = NULL;
80 static char *arg_user = NULL;
81 static sd_id128_t arg_uuid = {};
82 static char *arg_machine = NULL;
/* NOTE(review): parse_argv() stores optarg directly in the two SELinux
 * variables below (no copy), unlike arg_user and arg_machine which receive
 * strdup() results. */
83 static char *arg_selinux_context = NULL;
84 static char *arg_selinux_apifs_context = NULL;
/* NOTE(review): declared const, yet parse_argv() assigns it a strdup() result. */
85 static const char *arg_slice = NULL;
86 static bool arg_private_network = false;
87 static bool arg_read_only = false;
88 static bool arg_boot = false;
89 static LinkJournal arg_link_journal = LINK_AUTO;
/* Bitmask of capabilities to keep, one bit per CAP_* number. parse_argv()
 * sets bits for --capability= and clears them for --drop-capability=;
 * drop_capabilities() passes the complement to capability_bounding_set_drop(). */
90 static uint64_t arg_retain =
92 (1ULL << CAP_DAC_OVERRIDE) |
93 (1ULL << CAP_DAC_READ_SEARCH) |
94 (1ULL << CAP_FOWNER) |
95 (1ULL << CAP_FSETID) |
96 (1ULL << CAP_IPC_OWNER) |
99 (1ULL << CAP_LINUX_IMMUTABLE) |
100 (1ULL << CAP_NET_BIND_SERVICE) |
101 (1ULL << CAP_NET_BROADCAST) |
102 (1ULL << CAP_NET_RAW) |
103 (1ULL << CAP_SETGID) |
104 (1ULL << CAP_SETFCAP) |
105 (1ULL << CAP_SETPCAP) |
106 (1ULL << CAP_SETUID) |
107 (1ULL << CAP_SYS_ADMIN) |
108 (1ULL << CAP_SYS_CHROOT) |
109 (1ULL << CAP_SYS_NICE) |
110 (1ULL << CAP_SYS_PTRACE) |
111 (1ULL << CAP_SYS_TTY_CONFIG) |
112 (1ULL << CAP_SYS_RESOURCE) |
113 (1ULL << CAP_SYS_BOOT) |
114 (1ULL << CAP_AUDIT_WRITE) |
115 (1ULL << CAP_AUDIT_CONTROL) |
/* String vectors filled by --bind= and --bind-ro= with two entries per
 * option (source path, then destination path); mount_binds() walks them
 * pairwise. */
117 static char **arg_bind = NULL;
118 static char **arg_bind_ro = NULL;
/* Environment assignments collected from --setenv=. */
119 static char **arg_setenv = NULL;
120 static bool arg_quiet = false;
121 static bool arg_share_system = false;
/* Defaults to true; parse_argv() forces it to false when --share-system is set. */
122 static bool arg_register = true;
/* Print the usage summary to stdout. The single %s in the format string is
 * filled with program_invocation_short_name. */
124 static int help(void) {
126 printf("%s [OPTIONS...] [PATH] [ARGUMENTS...]\n\n"
127 "Spawn a minimal namespace container for debugging, testing and building.\n\n"
128 " -h --help Show this help\n"
129 " --version Print version string\n"
130 " -D --directory=NAME Root directory for the container\n"
131 " -b --boot Boot up full system (i.e. invoke init)\n"
132 " -u --user=USER Run the command under specified user or uid\n"
133 " --uuid=UUID Set a specific machine UUID for the container\n"
134 " -M --machine=NAME Set the machine name for the container\n"
135 " -S --slice=SLICE Place the container in the specified slice\n"
136 " -Z --selinux-context=SECLABEL\n"
137 " Set the SELinux security context to be used by\n"
138 " processes in the container\n"
139 " -L --selinux-apifs-context=SECLABEL\n"
140 " Set the SELinux security context to be used by\n"
141 " API/tmpfs file systems in the container\n"
142 " --private-network Disable network in container\n"
143 " --share-system Share system namespaces with host\n"
144 " --read-only Mount the root directory read-only\n"
145 " --capability=CAP In addition to the default, retain specified\n"
147 " --drop-capability=CAP Drop the specified capability from the default set\n"
148 " --link-journal=MODE Link up guest journal, one of no, auto, guest, host\n"
/* NOTE(review): the text below says -j equals --link-journal=host, but the
 * assignment in parse_argv() that appears to belong to the -j handler sets
 * LINK_GUEST. Confirm which is intended and align text and code. */
149 " -j Equivalent to --link-journal=host\n"
150 " --bind=PATH[:PATH] Bind mount a file or directory from the host into\n"
152 " --bind-ro=PATH[:PATH] Similar, but creates a read-only bind mount\n"
153 " --setenv=NAME=VALUE Pass an environment variable to PID 1\n"
154 " --register=BOOLEAN Register container as machine\n"
155 " -q --quiet Do not show status information\n",
156 program_invocation_short_name);
/* Parse the command line into the arg_* globals.
 *
 * argc/argv are the values received by main(). Long options are described by
 * the options[] table; short options by the getopt string further down.
 * Invalid values are reported through log_error(). */
161 static int parse_argv(int argc, char *argv[]) {
178 static const struct option options[] = {
179 { "help", no_argument, NULL, 'h' },
180 { "version", no_argument, NULL, ARG_VERSION },
181 { "directory", required_argument, NULL, 'D' },
182 { "user", required_argument, NULL, 'u' },
183 { "private-network", no_argument, NULL, ARG_PRIVATE_NETWORK },
184 { "boot", no_argument, NULL, 'b' },
185 { "uuid", required_argument, NULL, ARG_UUID },
186 { "read-only", no_argument, NULL, ARG_READ_ONLY },
187 { "capability", required_argument, NULL, ARG_CAPABILITY },
188 { "drop-capability", required_argument, NULL, ARG_DROP_CAPABILITY },
189 { "link-journal", required_argument, NULL, ARG_LINK_JOURNAL },
190 { "bind", required_argument, NULL, ARG_BIND },
191 { "bind-ro", required_argument, NULL, ARG_BIND_RO },
192 { "machine", required_argument, NULL, 'M' },
193 { "slice", required_argument, NULL, 'S' },
194 { "setenv", required_argument, NULL, ARG_SETENV },
195 { "selinux-context", required_argument, NULL, 'Z' },
196 { "selinux-apifs-context", required_argument, NULL, 'L' },
197 { "quiet", no_argument, NULL, 'q' },
198 { "share-system", no_argument, NULL, ARG_SHARE_SYSTEM },
199 { "register", required_argument, NULL, ARG_REGISTER },
/* The leading + in the option string makes getopt_long() stop at the first
 * non-option argument (see getopt(3)), so arguments after PATH are left
 * for the container command. */
208 while ((c = getopt_long(argc, argv, "+hD:u:bL:M:jS:Z:q", options, NULL)) >= 0) {
216 puts(PACKAGE_STRING);
217 puts(SYSTEMD_FEATURES);
/* --directory: store the canonicalized path; a NULL result is reported
 * with the errno text via %m. */
222 arg_directory = canonicalize_file_name(optarg);
223 if (!arg_directory) {
224 log_error("Invalid root directory: %m");
232 arg_user = strdup(optarg);
238 case ARG_PRIVATE_NETWORK:
239 arg_private_network = true;
247 r = sd_id128_from_string(optarg, &arg_uuid);
249 log_error("Invalid UUID: %s", optarg);
255 arg_slice = strdup(optarg);
/* --machine: the name must pass hostname_is_valid() before it is copied. */
262 if (isempty(optarg)) {
267 if (!hostname_is_valid(optarg)) {
268 log_error("Invalid machine name: %s", optarg);
273 arg_machine = strdup(optarg);
/* The SELinux contexts keep pointing at optarg, i.e. at argv storage. */
281 arg_selinux_context = optarg;
285 arg_selinux_apifs_context = optarg;
289 arg_read_only = true;
/* --capability= and --drop-capability= share this handler: the argument is a
 * comma separated list of capability names; each parsed capability has its
 * bit set in arg_retain for ARG_CAPABILITY and cleared otherwise. */
293 case ARG_DROP_CAPABILITY: {
297 FOREACH_WORD_SEPARATOR(word, length, optarg, ",", state) {
301 t = strndup(word, length);
305 if (cap_from_name(t, &cap) < 0) {
306 log_error("Failed to parse capability %s.", t);
313 if (c == ARG_CAPABILITY)
314 arg_retain |= 1ULL << (uint64_t) cap;
316 arg_retain &= ~(1ULL << (uint64_t) cap);
/* NOTE(review): presumably the -j handler. It selects LINK_GUEST, whereas
 * the help() text describes -j as --link-journal=host; confirm. */
323 arg_link_journal = LINK_GUEST;
326 case ARG_LINK_JOURNAL:
327 if (streq(optarg, "auto"))
328 arg_link_journal = LINK_AUTO;
329 else if (streq(optarg, "no"))
330 arg_link_journal = LINK_NO;
331 else if (streq(optarg, "guest"))
332 arg_link_journal = LINK_GUEST;
333 else if (streq(optarg, "host"))
334 arg_link_journal = LINK_HOST;
336 log_error("Failed to parse link journal mode %s", optarg);
/* --bind= and --bind-ro=: the argument is searched for a colon to split it
 * into two paths (a and b). Both must be absolute; they are appended, a
 * first and b second, to arg_bind or arg_bind_ro. */
344 _cleanup_free_ char *a = NULL, *b = NULL;
348 x = c == ARG_BIND ? &arg_bind : &arg_bind_ro;
350 e = strchr(optarg, ':');
352 a = strndup(optarg, e - optarg);
362 if (!path_is_absolute(a) || !path_is_absolute(b)) {
363 log_error("Invalid bind mount specification: %s", optarg);
367 r = strv_extend(x, a);
371 r = strv_extend(x, b);
/* --setenv=: validate the assignment, then build a new vector with
 * strv_env_set() and release the previous arg_setenv. */
381 if (!env_assignment_is_valid(optarg)) {
382 log_error("Environment variable assignment '%s' is not valid.", optarg);
386 n = strv_env_set(arg_setenv, optarg);
390 strv_free(arg_setenv);
399 case ARG_SHARE_SYSTEM:
400 arg_share_system = true;
404 r = parse_boolean(optarg);
406 log_error("Failed to parse --register= argument: %s", optarg);
417 assert_not_reached("Unhandled option");
/* Cross-option rules: --share-system turns machine registration off
 * (NOTE(review): this looks like it also overrides an explicit
 * --register=yes; confirm) and cannot be combined with --boot. */
421 if (arg_share_system)
422 arg_register = false;
424 if (arg_boot && arg_share_system) {
425 log_error("--boot and --share-system may not be combined.");
/* Mount the API file systems listed in mount_table below the container
 * root given by dest. */
432 static int mount_all(const char *dest) {
434 typedef struct MountPoint {
/* Each entry is read below through its what, where, options, flags and
 * fatal members; the third initializer looks like the file system type
 * (presumably a member named type; confirm). Entries whose first field is
 * NULL are remounts of the preceding bind mount. */
443 static const MountPoint mount_table[] = {
444 { "proc", "/proc", "proc", NULL, MS_NOSUID|MS_NOEXEC|MS_NODEV, true },
445 { "/proc/sys", "/proc/sys", NULL, NULL, MS_BIND, true }, /* Bind mount first */
446 { NULL, "/proc/sys", NULL, NULL, MS_BIND|MS_RDONLY|MS_REMOUNT, true }, /* Then, make it r/o */
447 { "sysfs", "/sys", "sysfs", NULL, MS_RDONLY|MS_NOSUID|MS_NOEXEC|MS_NODEV, true },
448 { "tmpfs", "/dev", "tmpfs", "mode=755", MS_NOSUID|MS_STRICTATIME, true },
449 { "devpts", "/dev/pts", "devpts","newinstance,ptmxmode=0666,mode=620,gid=" STRINGIFY(TTY_GID), MS_NOSUID|MS_NOEXEC, true },
450 { "tmpfs", "/dev/shm", "tmpfs", "mode=1777", MS_NOSUID|MS_NODEV|MS_STRICTATIME, true },
451 { "tmpfs", "/run", "tmpfs", "mode=755", MS_NOSUID|MS_NODEV|MS_STRICTATIME, true },
453 { "/sys/fs/selinux", "/sys/fs/selinux", NULL, NULL, MS_BIND, false }, /* Bind mount first */
454 { NULL, "/sys/fs/selinux", NULL, NULL, MS_BIND|MS_RDONLY|MS_REMOUNT, false }, /* Then, make it r/o */
461 for (k = 0; k < ELEMENTSOF(mount_table); k++) {
462 _cleanup_free_ char *where = NULL;
464 _cleanup_free_ char *options = NULL;
/* Target path is the container root joined with the table entry path. */
469 where = strjoin(dest, "/", mount_table[k].where, NULL);
473 t = path_is_mount_point(where, true);
475 log_error("Failed to detect whether %s is a mount point: %s", where, strerror(-t));
483 /* Skip this entry if it is not a remount. */
484 if (mount_table[k].what && t > 0)
487 mkdir_p(where, 0755);
/* With --selinux-apifs-context= set, tmpfs and devpts entries get a
 * context= mount option appended to their table options. */
490 if (arg_selinux_apifs_context &&
491 (streq_ptr(mount_table[k].what, "tmpfs") || streq_ptr(mount_table[k].what, "devpts"))) {
492 options = strjoin(mount_table[k].options, ",context=\"", arg_selinux_apifs_context, "\"", NULL);
499 o = mount_table[k].options;
/* The fatal member takes part in the failure condition; presumably a failed
 * mount is only treated as an error for entries marked fatal (confirm). */
502 if (mount(mount_table[k].what,
505 mount_table[k].flags,
507 mount_table[k].fatal) {
509 log_error("mount(%s) failed: %m", where);
/* Bind mount every (source, destination) pair of the string vector l into
 * the container rooted at dest. A non-zero flags value is applied by a
 * second remount of each mount (main() passes MS_RDONLY for --bind-ro=). */
519 static int mount_binds(const char *dest, char **l, unsigned long flags) {
522 STRV_FOREACH_PAIR(x, y, l) {
524 struct stat source_st, dest_st;
/* *x is the source path and must exist. */
527 if (stat(*x, &source_st) < 0) {
528 log_error("failed to stat %s: %m", *x);
/* *y is appended to the container root to form the mount target. */
532 where = strappenda(dest, *y);
533 r = stat(where, &dest_st);
/* An existing target must have the same file type as the source. */
535 if ((source_st.st_mode & S_IFMT) != (dest_st.st_mode & S_IFMT)) {
536 log_error("The file types of %s and %s do not match. Refusing bind mount",
/* A missing target gets its parent directories created. */
540 } else if (errno == ENOENT) {
541 r = mkdir_parents_label(where, 0755);
543 log_error("Failed to bind mount %s: %s", *x, strerror(-r));
547 log_error("Failed to bind mount %s: %s", *x, strerror(errno));
550 /* Create the mount point, but be conservative -- refuse to create block
551 * and char devices. */
/* NOTE(review): the results of the mkdir_label() and mknod() calls visible
 * here are not checked; a failure would only surface in mount() below. */
552 if (S_ISDIR(source_st.st_mode))
553 mkdir_label(where, 0755);
554 else if (S_ISFIFO(source_st.st_mode))
556 else if (S_ISSOCK(source_st.st_mode))
557 mknod(where, 0644 | S_IFSOCK, 0);
558 else if (S_ISREG(source_st.st_mode))
561 log_error("Refusing to create mountpoint for file: %s", *x);
565 if (mount(*x, where, "bind", MS_BIND, NULL) < 0) {
566 log_error("mount(%s) failed: %m", where);
/* Extra flags need a separate MS_REMOUNT|MS_BIND pass on the new mount. */
570 if (flags && mount(NULL, where, NULL, MS_REMOUNT|MS_BIND|flags, NULL) < 0) {
571 log_error("mount(%s) failed: %m", where);
/* Point /etc/localtime inside the container (rooted at dest) at the same
 * zone the host /etc/localtime symlink refers to. Conditions that prevent
 * this are mostly reported as warnings. */
579 static int setup_timezone(const char *dest) {
580 _cleanup_free_ char *where = NULL, *p = NULL, *q = NULL, *check = NULL, *what = NULL;
586 /* Fix the timezone, if possible */
587 r = readlink_malloc("/etc/localtime", &p);
589 log_warning("/etc/localtime is not a symlink, not updating container timezone.");
/* z is the zone name: the part of the host link target after the zoneinfo
 * prefix, accepted in relative or absolute form. */
593 z = path_startswith(p, "../usr/share/zoneinfo/");
595 z = path_startswith(p, "/usr/share/zoneinfo/");
597 log_warning("/etc/localtime does not point into /usr/share/zoneinfo/, not updating container timezone.");
601 where = strappend(dest, "/etc/localtime");
/* y is the zone name the container link currently points to, if any. */
605 r = readlink_malloc(where, &q);
607 y = path_startswith(q, "../usr/share/zoneinfo/");
609 y = path_startswith(q, "/usr/share/zoneinfo/");
612 /* Already pointing to the right place? Then do nothing .. */
613 if (y && streq(y, z))
/* The zone file has to exist inside the container tree. */
617 check = strjoin(dest, "/usr/share/zoneinfo/", z, NULL);
621 if (access(check, F_OK) < 0) {
622 log_warning("Timezone %s does not exist in container, not updating container timezone.", z);
/* The new link uses the relative form of the target. */
626 what = strappend("../usr/share/zoneinfo/", z);
631 if (symlink(what, where) < 0) {
632 log_error("Failed to correct timezone of container: %m");
/* Copy the host /etc/resolv.conf to the same path below dest. The
 * arg_private_network check comes first, so the copy is presumably skipped
 * in that case. The result of the copy is deliberately ignored. */
639 static int setup_resolv_conf(const char *dest) {
/* NOTE(review): the attribute is written after the type here, unlike the
 * other _cleanup_free_ declarations in this file. */
640 char _cleanup_free_ *where = NULL;
644 if (arg_private_network)
647 /* Fix resolv.conf, if possible */
648 where = strappend(dest, "/etc/resolv.conf");
652 /* We don't really care for the results of this really. If it
653 * fails, it fails, but meh... */
654 copy_file("/etc/resolv.conf", where, O_TRUNC|O_NOFOLLOW);
/* Give the container its own boot ID: a random ID is written to a file
 * under dest/dev and bind mounted over dest/proc/sys/kernel/random/boot_id.
 * The arg_share_system check comes first, so this is presumably skipped
 * when system namespaces are shared. */
659 static int setup_boot_id(const char *dest) {
660 _cleanup_free_ char *from = NULL, *to = NULL;
667 if (arg_share_system)
670 /* Generate a new randomized boot ID, so that each boot-up of
671 * the container gets a new one */
673 from = strappend(dest, "/dev/proc-sys-kernel-random-boot-id");
674 to = strappend(dest, "/proc/sys/kernel/random/boot_id");
678 r = sd_id128_randomize(&rnd);
680 log_error("Failed to generate random boot id: %s", strerror(-r));
/* Format the 128 bit ID in the dashed 8-4-4-4-12 UUID layout. */
684 snprintf(as_uuid, sizeof(as_uuid),
685 "%02x%02x%02x%02x-%02x%02x-%02x%02x-%02x%02x-%02x%02x%02x%02x%02x%02x",
686 SD_ID128_FORMAT_VAL(rnd));
687 char_array_0(as_uuid);
689 r = write_string_file(from, as_uuid);
691 log_error("Failed to write boot id: %s", strerror(-r));
/* Failing to bind mount is an error; failing to remount the bind mount
 * read-only only produces a warning. */
695 if (mount(from, to, "bind", MS_BIND, NULL) < 0) {
696 log_error("Failed to bind mount boot id: %m");
698 } else if (mount(from, to, "bind", MS_BIND|MS_REMOUNT|MS_RDONLY, NULL))
699 log_warning("Failed to make boot id read-only: %m");
/* Recreate a fixed set of host device nodes below dest/dev. For each name
 * in devnodes the host node /dev/NAME is inspected with stat() and a node
 * with the same mode and device number is created in the container. */
705 static int copy_devnodes(const char *dest) {
707 static const char devnodes[] =
717 _cleanup_umask_ mode_t u;
723 NULSTR_FOREACH(d, devnodes) {
724 _cleanup_free_ char *from = NULL, *to = NULL;
727 from = strappend("/dev/", d);
728 to = strjoin(dest, "/dev/", d, NULL);
/* Only stat() errors other than ENOENT are logged, so a node missing on
 * the host is tolerated. */
732 if (stat(from, &st) < 0) {
734 if (errno != ENOENT) {
735 log_error("Failed to stat %s: %m", from);
739 } else if (!S_ISCHR(st.st_mode) && !S_ISBLK(st.st_mode)) {
741 log_error("%s is not a char or block device, cannot copy", from);
/* NOTE(review): the failing call operates on to, but the message below
 * prints dest (the container root); the path argument looks wrong. */
744 } else if (mknod(to, st.st_mode, st.st_rdev) < 0) {
746 log_error("mknod(%s) failed: %m", dest);
/* Create dest/dev/ptmx as a symlink to pts/ptmx. */
754 static int setup_ptmx(const char *dest) {
755 _cleanup_free_ char *p = NULL;
757 p = strappend(dest, "/dev/ptmx");
761 if (symlink("pts/ptmx", p) < 0) {
762 log_error("Failed to create /dev/ptmx symlink: %m");
/* Make the terminal named by console available as dev/console below dest:
 * the terminal must be a character device, its access mode and ownership
 * are reset, a device node is created at the target and the terminal is
 * bind mounted over that node. */
769 static int setup_dev_console(const char *dest, const char *console) {
771 _cleanup_free_ char *to = NULL;
773 _cleanup_umask_ mode_t u;
780 if (stat(console, &st) < 0) {
781 log_error("Failed to stat %s: %m", console);
784 } else if (!S_ISCHR(st.st_mode)) {
785 log_error("/dev/console is not a char device");
/* Restrict the terminal to mode 0600, owned by uid 0 and gid 0. */
789 r = chmod_and_chown(console, 0600, 0, 0);
791 log_error("Failed to correct access mode for TTY: %s", strerror(-r));
795 if (asprintf(&to, "%s/dev/console", dest) < 0)
798 /* We need to bind mount the right tty to /dev/console since
799 * ptys can only exist on pts file systems. To have something
800 * to bind mount things on we create a device node first, that
801 * has the right major/minor (note that the major minor
802 * doesn't actually matter here, since we mount it over
805 if (mknod(to, (st.st_mode & ~07777) | 0600, st.st_rdev) < 0) {
806 log_error("mknod() for /dev/console failed: %m");
810 if (mount(console, to, "bind", MS_BIND, NULL) < 0) {
811 log_error("Bind mount for /dev/console failed: %m");
818 static int setup_kmsg(const char *dest, int kmsg_socket) {
819 _cleanup_free_ char *from = NULL, *to = NULL;
821 _cleanup_umask_ mode_t u;
823 struct cmsghdr cmsghdr;
824 uint8_t buf[CMSG_SPACE(sizeof(int))];
827 .msg_control = &control,
828 .msg_controllen = sizeof(control),
830 struct cmsghdr *cmsg;
833 assert(kmsg_socket >= 0);
837 /* We create the kmsg FIFO as /dev/kmsg, but immediately
838 * delete it after bind mounting it to /proc/kmsg. While FIFOs
839 * on the reading side behave very similar to /proc/kmsg,
840 * their writing side behaves differently from /dev/kmsg in
841 * that writing blocks when nothing is reading. In order to
842 * avoid any problems with containers deadlocking due to this
843 * we simply make /dev/kmsg unavailable to the container. */
/* Summary of the steps below: build the FIFO path (dest/dev/kmsg) and the
 * mount target (dest/proc/kmsg), create the FIFO with mode 0600 and root
 * ownership, bind mount it, open it and pass the open descriptor over
 * kmsg_socket as an SCM_RIGHTS control message. */
844 if (asprintf(&from, "%s/dev/kmsg", dest) < 0 ||
845 asprintf(&to, "%s/proc/kmsg", dest) < 0)
848 if (mkfifo(from, 0600) < 0) {
849 log_error("mkfifo() for /dev/kmsg failed: %m");
853 r = chmod_and_chown(from, 0600, 0, 0);
855 log_error("Failed to correct access mode for /dev/kmsg: %s", strerror(-r));
859 if (mount(from, to, "bind", MS_BIND, NULL) < 0) {
860 log_error("Bind mount for /proc/kmsg failed: %m");
864 fd = open(from, O_RDWR|O_NDELAY|O_CLOEXEC);
866 log_error("Failed to open fifo: %m");
/* Fill in a single control message carrying the descriptor. */
870 cmsg = CMSG_FIRSTHDR(&mh);
871 cmsg->cmsg_level = SOL_SOCKET;
872 cmsg->cmsg_type = SCM_RIGHTS;
873 cmsg->cmsg_len = CMSG_LEN(sizeof(int));
874 memcpy(CMSG_DATA(cmsg), &fd, sizeof(int));
876 mh.msg_controllen = cmsg->cmsg_len;
878 /* Store away the fd in the socket, so that it stays open as
879 * long as we run the child */
/* The local descriptor is closed right after sendmsg(), before the send
 * result k is evaluated. */
880 k = sendmsg(kmsg_socket, &mh, MSG_DONTWAIT|MSG_NOSIGNAL);
881 close_nointr_nofail(fd);
884 log_error("Failed to send FIFO fd: %m");
888 /* And now make the FIFO unavailable as /dev/kmsg... */
/* Set the host name to arg_machine. The arg_share_system check comes first,
 * so this is presumably skipped when system namespaces are shared. */
893 static int setup_hostname(void) {
895 if (arg_share_system)
898 if (sethostname(arg_machine, strlen(arg_machine)) < 0)
/* Link the journal directory of the container with the host according to
 * arg_link_journal. directory is the container root. The journal directory
 * name is the machine ID read from etc/machine-id inside the container. */
904 static int setup_journal(const char *directory) {
905 sd_id128_t machine_id, this_id;
906 _cleanup_free_ char *p = NULL, *b = NULL, *q = NULL, *d = NULL;
910 p = strappend(directory, "/etc/machine-id");
/* In LINK_AUTO mode a missing or empty machine ID file is tolerated. */
914 r = read_one_line_file(p, &b);
915 if (r == -ENOENT && arg_link_journal == LINK_AUTO)
918 log_error("Failed to read machine ID from %s: %s", p, strerror(-r));
923 if (isempty(id) && arg_link_journal == LINK_AUTO)
926 /* Verify validity */
927 r = sd_id128_from_string(id, &machine_id);
929 log_error("Failed to parse machine ID from %s: %s", p, strerror(-r));
933 r = sd_id128_get_machine(&this_id);
935 log_error("Failed to retrieve machine ID: %s", strerror(-r));
/* A container sharing the machine ID of the host is not linked; this is
 * logged as a warning in LINK_AUTO mode and as an error otherwise. */
939 if (sd_id128_equal(machine_id, this_id)) {
940 log_full(arg_link_journal == LINK_AUTO ? LOG_WARNING : LOG_ERR,
941 "Host and machine ids are equal (%s): refusing to link journals", id);
942 if (arg_link_journal == LINK_AUTO)
948 if (arg_link_journal == LINK_NO)
/* From here on p is the journal directory on the host and q the journal
 * directory inside the container tree. */
952 p = strappend("/var/log/journal/", id);
953 q = strjoin(directory, "/var/log/journal/", id, NULL);
/* Existing mount points at either path are only reported as errors when
 * the mode is not LINK_AUTO. */
957 if (path_is_mount_point(p, false) > 0) {
958 if (arg_link_journal != LINK_AUTO) {
959 log_error("%s: already a mount point, refusing to use for journal", p);
966 if (path_is_mount_point(q, false) > 0) {
967 if (arg_link_journal != LINK_AUTO) {
968 log_error("%s: already a mount point, refusing to use for journal", q);
/* Inspect what currently exists at the host path: a symlink (target in d),
 * something that is not a symlink (-EINVAL), nothing (-ENOENT) or an error.
 * NOTE(review): several messages below print %m (errno) after helpers whose
 * result is taken from their return value r (mkdir_p(),
 * readlink_and_make_absolute()); if those helpers report errors as negative
 * return values, strerror(-r) would be the reliable text. Confirm. */
975 r = readlink_and_make_absolute(p, &d);
977 if ((arg_link_journal == LINK_GUEST ||
978 arg_link_journal == LINK_AUTO) &&
981 r = mkdir_p(q, 0755);
983 log_warning("failed to create directory %s: %m", q);
988 log_error("Failed to remove symlink %s: %m", p);
991 } else if (r == -EINVAL) {
993 if (arg_link_journal == LINK_GUEST &&
996 if (errno == ENOTDIR) {
997 log_error("%s already exists and is neither a symlink nor a directory", p);
1000 log_error("Failed to remove %s: %m", p);
1004 } else if (r != -ENOENT) {
1005 log_error("readlink(%s) failed: %m", p);
/* LINK_GUEST: the host path becomes a symlink to the container directory. */
1009 if (arg_link_journal == LINK_GUEST) {
1011 if (symlink(q, p) < 0) {
1012 log_error("Failed to symlink %s to %s: %m", q, p);
1016 r = mkdir_p(q, 0755);
1018 log_warning("failed to create directory %s: %m", q);
/* LINK_HOST: make sure the host directory exists. In the other remaining
 * case the host directory is only probed with access(). */
1022 if (arg_link_journal == LINK_HOST) {
1023 r = mkdir_p(p, 0755);
1025 log_error("Failed to create %s: %m", p);
1029 } else if (access(p, F_OK) < 0)
/* The container directory must not already contain entries; afterwards
 * the host directory is bind mounted onto it. */
1032 if (dir_is_empty(q) == 0) {
1033 log_error("%s not empty.", q);
1037 r = mkdir_p(q, 0755);
1039 log_error("Failed to create %s: %m", q);
1043 if (mount(p, q, "bind", MS_BIND, NULL) < 0) {
1044 log_error("Failed to bind mount journal from host into guest: %m");
/* Create dest/dev/kdbus and bind mount the kdbus domain directory given by
 * path onto it. */
1051 static int setup_kdbus(const char *dest, const char *path) {
1057 p = strappenda(dest, "/dev/kdbus");
1058 if (mkdir(p, 0755) < 0) {
1059 log_error("Failed to create kdbus path: %m");
1063 if (mount(path, p, "bind", MS_BIND, NULL) < 0) {
1064 log_error("Failed to mount kdbus domain path: %m");
/* Drop every capability whose bit is not set in arg_retain by passing the
 * inverted mask to capability_bounding_set_drop(); returns its result. */
1071 static int drop_capabilities(void) {
1072 return capability_bounding_set_drop(~arg_retain, false);
/* Register the container with the org.freedesktop.machine1 service on the
 * system bus. pid is the process ID of the container child created in
 * main(). Bus and call failures are logged with log_error(). */
1075 static int register_machine(pid_t pid) {
1076 _cleanup_bus_error_free_ sd_bus_error error = SD_BUS_ERROR_NULL;
1077 _cleanup_bus_unref_ sd_bus *bus = NULL;
1083 r = sd_bus_default_system(&bus);
1085 log_error("Failed to open system bus: %s", strerror(-r));
1089 r = sd_bus_call_method(
1091 "org.freedesktop.machine1",
1092 "/org/freedesktop/machine1",
1093 "org.freedesktop.machine1.Manager",
1099 SD_BUS_MESSAGE_APPEND_ID128(arg_uuid),
1103 strempty(arg_directory),
/* The boolean evaluates to 1 or 0; presumably it is the number of extra
 * properties, so Slice is only sent when arg_slice is non-empty (confirm
 * against the method signature). */
1104 !isempty(arg_slice), "Slice", "s", arg_slice);
1106 log_error("Failed to register machine: %s", bus_error_message(&error, r));
/* Ask org.freedesktop.machine1 to terminate the machine belonging to pid:
 * the first call on the Manager interface yields a reply from which an
 * object path is read, the second call goes to the Machine interface.
 * Failures of either call are only logged at debug level. */
1113 static int terminate_machine(pid_t pid) {
1114 _cleanup_bus_error_free_ sd_bus_error error = SD_BUS_ERROR_NULL;
1115 _cleanup_bus_message_unref_ sd_bus_message *reply = NULL;
1116 _cleanup_bus_unref_ sd_bus *bus = NULL;
1123 r = sd_bus_default_system(&bus);
1125 log_error("Failed to open system bus: %s", strerror(-r));
1129 r = sd_bus_call_method(
1131 "org.freedesktop.machine1",
1132 "/org/freedesktop/machine1",
1133 "org.freedesktop.machine1.Manager",
1140 /* Note that the machine might already have been
1141 * cleaned up automatically, hence don't consider it a
1142 * failure if we cannot get the machine object. */
1143 log_debug("Failed to get machine: %s", bus_error_message(&error, r));
/* The reply carries one object path (type o). */
1147 r = sd_bus_message_read(reply, "o", &path);
1149 return bus_log_parse_error(r);
1151 r = sd_bus_call_method(
1153 "org.freedesktop.machine1",
1155 "org.freedesktop.machine1.Machine",
1161 log_debug("Failed to terminate machine: %s", bus_error_message(&error, r));
/* Probe for kernel audit support by opening a NETLINK_AUDIT raw netlink
 * socket, which is closed again right away. Presumably the result is true
 * when the socket could be created (confirm). */
1168 static bool audit_enabled(void) {
1171 fd = socket(AF_NETLINK, SOCK_RAW, NETLINK_AUDIT);
1173 close_nointr_nofail(fd);
/* Entry point: parse options, validate the environment, allocate a pseudo
 * terminal, clone a child into new namespaces, set up the container file
 * system in the child and execute the payload there, while the parent
 * registers the machine, forwards the terminal and evaluates the exit
 * status of the child. */
1179 int main(int argc, char *argv[]) {
1181 int r = EXIT_FAILURE, k;
1182 _cleanup_close_ int master = -1, kdbus_fd = -1, sync_fd = -1;
1184 const char *console = NULL;
1186 _cleanup_close_pipe_ int kmsg_socket_pair[2] = { -1, -1 };
1187 _cleanup_fdset_free_ FDSet *fds = NULL;
1188 _cleanup_free_ char *kdbus_domain = NULL;
1190 log_parse_environment();
1193 k = parse_argv(argc, argv);
/* Resolve the container root: an explicit directory is made absolute,
 * otherwise the current working directory is used. */
1201 if (arg_directory) {
1204 p = path_make_absolute_cwd(arg_directory);
1205 free(arg_directory);
1208 arg_directory = get_current_dir_name();
1210 if (!arg_directory) {
1211 log_error("Failed to determine path, please use -D.");
1215 path_kill_slashes(arg_directory);
/* Derive a machine name from the last path component of the root
 * directory and sanitize it with hostname_cleanup(). */
1218 arg_machine = strdup(basename(arg_directory));
1224 hostname_cleanup(arg_machine, false);
1225 if (isempty(arg_machine)) {
1226 log_error("Failed to determine machine name automatically, please use -M.");
/* Preconditions: effective root, a systemd booted host, a root directory
 * other than / and, for --boot, something that looks like an OS tree. */
1231 if (geteuid() != 0) {
1232 log_error("Need to be root.");
1236 if (sd_booted() <= 0) {
1237 log_error("Not running on a systemd system.");
1241 if (arg_boot && audit_enabled()) {
1242 log_warning("The kernel auditing subsystem is known to be incompatible with containers.\n"
1243 "Please make sure to turn off auditing with 'audit=0' on the kernel command\n"
1244 "line before using systemd-nspawn. Sleeping for 5s...\n");
1248 if (path_equal(arg_directory, "/")) {
1249 log_error("Spawning container on root directory not supported.");
1253 if (arg_boot && path_is_os_tree(arg_directory) <= 0) {
1254 log_error("Directory %s doesn't look like an OS root directory (/etc/os-release is missing). Refusing.", arg_directory);
/* File descriptors received through socket activation are collected in fds
 * and later announced to the payload via LISTEN_FDS and LISTEN_PID. */
1259 n_fd_passed = sd_listen_fds(false);
1260 if (n_fd_passed > 0) {
1261 k = fdset_new_listen_fds(&fds, false);
1263 log_error("Failed to collect file descriptors: %s", strerror(-k));
1267 fdset_close_others(fds);
/* Allocate the pseudo terminal that becomes the container console. */
1270 master = posix_openpt(O_RDWR|O_NOCTTY|O_CLOEXEC|O_NDELAY);
1272 log_error("Failed to acquire pseudo tty: %m");
1276 console = ptsname(master);
1278 log_error("Failed to determine tty name: %m");
1283 log_info("Spawning container %s on %s. Press ^] three times within 1s to abort execution.", arg_machine, arg_directory);
1285 if (unlockpt(master) < 0) {
1286 log_error("Failed to unlock tty: %m");
/* kdbus: with --share-system the host /dev/kdbus is used directly,
 * otherwise a domain named machine-NAME is created. */
1291 if (access("/dev/kdbus/control", F_OK) >= 0) {
1293 if (arg_share_system) {
1294 kdbus_domain = strdup("/dev/kdbus");
1295 if (!kdbus_domain) {
1302 ns = strappenda("machine-", arg_machine);
/* NOTE(review): the result of bus_kernel_create_domain() is stored in
 * kdbus_fd, but the failure message formats strerror(-r). r does not
 * appear to be assigned from this call; this should likely use kdbus_fd. */
1303 kdbus_fd = bus_kernel_create_domain(ns, &kdbus_domain);
1305 log_debug("Failed to create kdbus domain: %s", strerror(-r));
1307 log_debug("Successfully created kdbus domain as %s", kdbus_domain);
/* Socket pair used by setup_kmsg() in the child to hand a FIFO file
 * descriptor back to the parent. */
1311 if (socketpair(AF_UNIX, SOCK_DGRAM|SOCK_NONBLOCK|SOCK_CLOEXEC, 0, kmsg_socket_pair) < 0) {
1312 log_error("Failed to create kmsg socket pair: %m");
1316 sd_notify(0, "READY=1");
/* Block the signals listed here; the mask is handed to process_pty(). */
1318 assert_se(sigemptyset(&mask) == 0);
1319 sigset_add_many(&mask, SIGCHLD, SIGWINCH, SIGTERM, SIGINT, -1);
1320 assert_se(sigprocmask(SIG_BLOCK, &mask, NULL) == 0);
/* Event fd used for synchronization: the child reads from it before
 * executing the payload, the parent writes to it after register_machine(). */
1325 sync_fd = eventfd(0, EFD_CLOEXEC);
1327 log_error("Failed to create event fd: %m");
/* Raw clone(): always a new mount namespace; new IPC, PID and UTS
 * namespaces unless --share-system; a new network namespace only with
 * --private-network. */
1331 pid = syscall(__NR_clone,
1332 SIGCHLD|CLONE_NEWNS|
1333 (arg_share_system ? 0 : CLONE_NEWIPC|CLONE_NEWPID|CLONE_NEWUTS)|
1334 (arg_private_network ? CLONE_NEWNET : 0), NULL);
1336 if (errno == EINVAL)
1337 log_error("clone() failed, do you have namespace support enabled in your kernel? (You need UTS, IPC, PID and NET namespacing built in): %m");
1339 log_error("clone() failed: %m");
/* Child side starts here (NOTE(review): the enclosing branch on pid is
 * assumed; confirm). */
1346 const char *home = NULL;
1347 uid_t uid = (uid_t) -1;
1348 gid_t gid = (gid_t) -1;
/* Environment for the payload; the NULL slots are filled in further down
 * using n_env as the write index. */
1350 const char *envp[] = {
1351 "PATH=" DEFAULT_PATH_SPLIT_USR,
1352 "container=systemd-nspawn", /* LXC sets container=lxc, so follow the scheme here */
1357 NULL, /* container_uuid */
1358 NULL, /* LISTEN_FDS */
1359 NULL, /* LISTEN_PID */
1365 envp[n_env] = strv_find_prefix(environ, "TERM=");
1369 close_nointr_nofail(master);
/* Replace stdin, stdout and stderr with the container console. */
1372 close_nointr(STDIN_FILENO);
1373 close_nointr(STDOUT_FILENO);
1374 close_nointr(STDERR_FILENO);
1376 close_nointr_nofail(kmsg_socket_pair[0]);
1377 kmsg_socket_pair[0] = -1;
1379 reset_all_signal_handlers();
1381 assert_se(sigemptyset(&mask) == 0);
1382 assert_se(sigprocmask(SIG_SETMASK, &mask, NULL) == 0);
1384 k = open_terminal(console, O_RDWR);
1385 if (k != STDIN_FILENO) {
1387 close_nointr_nofail(k);
1391 log_error("Failed to open console: %s", strerror(-k));
1395 if (dup2(STDIN_FILENO, STDOUT_FILENO) != STDOUT_FILENO ||
1396 dup2(STDIN_FILENO, STDERR_FILENO) != STDERR_FILENO) {
1397 log_error("Failed to duplicate console: %m");
1402 log_error("setsid() failed: %m");
/* Have the kernel send SIGKILL to the child when the parent dies. */
1406 if (prctl(PR_SET_PDEATHSIG, SIGKILL) < 0) {
1407 log_error("PR_SET_PDEATHSIG failed: %m");
1411 /* Mark everything as slave, so that we still
1412 * receive mounts from the real root, but don't
1413 * propagate mounts to the real root. */
1414 if (mount(NULL, "/", NULL, MS_SLAVE|MS_REC, NULL) < 0) {
1415 log_error("MS_SLAVE|MS_REC failed: %m");
/* NOTE(review): the two messages below do not include %m, unlike the other
 * mount() failure messages in this function. */
1419 /* Turn directory into bind mount */
1420 if (mount(arg_directory, arg_directory, "bind", MS_BIND|MS_REC, NULL) < 0) {
1421 log_error("Failed to make bind mount.");
1426 if (mount(arg_directory, arg_directory, "bind", MS_BIND|MS_REMOUNT|MS_RDONLY|MS_REC, NULL) < 0) {
1427 log_error("Failed to make read-only.");
/* Populate the container tree; each helper returns a negative value on
 * failure. */
1431 if (mount_all(arg_directory) < 0)
1434 if (copy_devnodes(arg_directory) < 0)
1437 if (setup_ptmx(arg_directory) < 0)
1440 dev_setup(arg_directory);
1442 if (setup_dev_console(arg_directory, console) < 0)
1445 if (setup_kmsg(arg_directory, kmsg_socket_pair[1]) < 0)
1448 close_nointr_nofail(kmsg_socket_pair[1]);
1449 kmsg_socket_pair[1] = -1;
1451 if (setup_boot_id(arg_directory) < 0)
1454 if (setup_timezone(arg_directory) < 0)
1457 if (setup_resolv_conf(arg_directory) < 0)
1460 if (setup_journal(arg_directory) < 0)
1463 if (mount_binds(arg_directory, arg_bind, 0) < 0)
1466 if (mount_binds(arg_directory, arg_bind_ro, MS_RDONLY) < 0)
1469 if (setup_kdbus(arg_directory, kdbus_domain) < 0)
/* Switch the root: enter the directory, move its mount onto /, chroot
 * into it and reset the working directory. */
1472 if (chdir(arg_directory) < 0) {
1473 log_error("chdir(%s) failed: %m", arg_directory);
1477 if (mount(arg_directory, "/", NULL, MS_MOVE, NULL) < 0) {
1478 log_error("mount(MS_MOVE) failed: %m");
1482 if (chroot(".") < 0) {
1483 log_error("chroot() failed: %m");
1487 if (chdir("/") < 0) {
1488 log_error("chdir() failed: %m");
1494 if (arg_private_network)
1497 if (drop_capabilities() < 0) {
1498 log_error("drop_capabilities() failed: %m");
/* Credential switch. NOTE(review): the calls below are setresgid() and
 * setresuid(), but their failure messages name setregid() and setreuid().
 * Also, drop_capabilities(), get_user_creds() and the mkdir helpers are
 * logged with %m; if they report errors as negative return values rather
 * than via errno the printed text may be misleading. Confirm. */
1504 /* Note that this resolves user names
1505 * inside the container, and hence
1506 * accesses the NSS modules from the
1507 * container and not the host. This is
1510 if (get_user_creds((const char**)&arg_user, &uid, &gid, &home, NULL) < 0) {
1511 log_error("get_user_creds() failed: %m");
1515 if (mkdir_parents_label(home, 0775) < 0) {
1516 log_error("mkdir_parents_label() failed: %m");
1520 if (mkdir_safe_label(home, 0775, uid, gid) < 0) {
1521 log_error("mkdir_safe_label() failed: %m");
1525 if (initgroups((const char*)arg_user, gid) < 0) {
1526 log_error("initgroups() failed: %m");
1530 if (setresgid(gid, gid, gid) < 0) {
1531 log_error("setregid() failed: %m");
1535 if (setresuid(uid, uid, uid) < 0) {
1536 log_error("setreuid() failed: %m");
1540 /* Reset everything fully to 0, just in case */
1542 if (setgroups(0, NULL) < 0) {
1543 log_error("setgroups() failed: %m");
1547 if (setresgid(0, 0, 0) < 0) {
1548 log_error("setregid() failed: %m");
1552 if (setresuid(0, 0, 0) < 0) {
1553 log_error("setreuid() failed: %m");
/* Fill the remaining environment slots; HOME falls back to /root and
 * USER and LOGNAME fall back to root. */
1558 if ((asprintf((char**)(envp + n_env++), "HOME=%s", home ? home: "/root") < 0) ||
1559 (asprintf((char**)(envp + n_env++), "USER=%s", arg_user ? arg_user : "root") < 0) ||
1560 (asprintf((char**)(envp + n_env++), "LOGNAME=%s", arg_user ? arg_user : "root") < 0)) {
1565 if (!sd_id128_equal(arg_uuid, SD_ID128_NULL)) {
1566 if (asprintf((char**)(envp + n_env++), "container_uuid=" SD_ID128_FORMAT_STR, SD_ID128_FORMAT_VAL(arg_uuid)) < 0) {
/* Passed file descriptors have O_CLOEXEC cleared so they survive exec. */
1572 if (fdset_size(fds) > 0) {
1573 k = fdset_cloexec(fds, false);
1575 log_error("Failed to unset O_CLOEXEC for file descriptors.");
1579 if ((asprintf((char **)(envp + n_env++), "LISTEN_FDS=%u", n_fd_passed) < 0) ||
1580 (asprintf((char **)(envp + n_env++), "LISTEN_PID=1") < 0)) {
/* Wait for the parent to signal the event fd before continuing. */
1588 eventfd_read(sync_fd, &x);
1589 close_nointr_nofail(sync_fd);
/* --setenv= assignments are merged with the built-in environment. */
1592 if (!strv_isempty(arg_setenv)) {
1595 n = strv_env_merge(2, envp, arg_setenv);
1603 env_use = (char**) envp;
/* A setexeccon() failure is logged here. */
1606 if (arg_selinux_context)
1607 if (setexeccon(arg_selinux_context) < 0)
1608 log_error("setexeccon(\"%s\") failed: %m", arg_selinux_context);
/* Boot mode: build an argument vector with one leading slot for the init
 * path followed by the remaining command line arguments, then try the
 * three init locations in order. */
1614 /* Automatically search for the init system */
1616 l = 1 + argc - optind;
1617 a = newa(char*, l + 1);
1618 memcpy(a + 1, argv + optind, l * sizeof(char*));
1620 a[0] = (char*) "/usr/lib/systemd/systemd";
1621 execve(a[0], a, env_use);
1623 a[0] = (char*) "/lib/systemd/systemd";
1624 execve(a[0], a, env_use);
1626 a[0] = (char*) "/sbin/init";
1627 execve(a[0], a, env_use);
/* Otherwise run the command given on the command line, or fall back to a
 * login shell started from the home directory. */
1628 } else if (argc > optind)
1629 execvpe(argv[optind], argv + optind, env_use);
1631 chdir(home ? home : "/root");
1632 execle("/bin/bash", "-bash", NULL, env_use);
/* Reached only when no exec call succeeded. NOTE(review): the message
 * names execv() although other exec variants are used above. */
1635 log_error("execv() failed: %m");
1638 _exit(EXIT_FAILURE);
/* Parent side: register the machine, release the child through the event
 * fd, then relay the terminal until the container finishes. */
1644 r = register_machine(pid);
1648 eventfd_write(sync_fd, 1);
1649 close_nointr_nofail(sync_fd);
1652 k = process_pty(master, &mask, arg_boot ? pid : 0, SIGRTMIN+3);
1661 /* Kill if it is not dead yet anyway */
1662 terminate_machine(pid);
1664 /* Redundant, but better safe than sorry */
1667 k = wait_for_terminate(pid, &status);
/* Interpret how the child ended: a normal exit passes its status through
 * r; death by SIGINT is reported as a shutdown and death by SIGHUP as a
 * reboot request; any other signal or core dump is an error. */
1675 if (status.si_code == CLD_EXITED) {
1676 r = status.si_status;
1677 if (status.si_status != 0) {
1678 log_error("Container %s failed with error code %i.", arg_machine, status.si_status);
1683 log_debug("Container %s exited successfully.", arg_machine);
1685 } else if (status.si_code == CLD_KILLED &&
1686 status.si_status == SIGINT) {
1689 log_info("Container %s has been shut down.", arg_machine);
1692 } else if (status.si_code == CLD_KILLED &&
1693 status.si_status == SIGHUP) {
1696 log_info("Container %s is being rebooted.", arg_machine);
1698 } else if (status.si_code == CLD_KILLED ||
1699 status.si_code == CLD_DUMPED) {
1701 log_error("Container %s terminated by signal %s.", arg_machine, signal_to_string(status.si_status));
1705 log_error("Container %s failed due to unknown reason.", arg_machine);
1715 free(arg_directory);