1 /*-*- Mode: C; c-basic-offset: 8; indent-tabs-mode: nil -*-*/
4 This file is part of systemd.
6 Copyright 2010 Lennart Poettering
8 systemd is free software; you can redistribute it and/or modify it
9 under the terms of the GNU Lesser General Public License as published by
10 the Free Software Foundation; either version 2.1 of the License, or
11 (at your option) any later version.
13 systemd is distributed in the hope that it will be useful, but
14 WITHOUT ANY WARRANTY; without even the implied warranty of
15 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
16 Lesser General Public License for more details.
18 You should have received a copy of the GNU Lesser General Public License
19 along with systemd; If not, see <http://www.gnu.org/licenses/>.
25 #include <sys/types.h>
26 #include <sys/syscall.h>
27 #include <sys/mount.h>
33 #include <sys/prctl.h>
34 #include <sys/capability.h>
37 #include <sys/signalfd.h>
41 #include <sys/socket.h>
42 #include <linux/netlink.h>
43 #include <sys/eventfd.h>
45 #include <selinux/selinux.h>
48 #include "sd-daemon.h"
57 #include "cgroup-util.h"
59 #include "path-util.h"
60 #include "loopback-setup.h"
61 #include "dev-setup.h"
66 #include "bus-error.h"
68 #include "bus-kernel.h"
/* Command-line configuration shared by the functions in this file.
 * Only the opening line of the LinkJournal enum is visible in this listing;
 * the enumerators LINK_NO, LINK_AUTO, LINK_GUEST and LINK_HOST are used
 * further down. arg_retain is a bit mask with one bit per capability
 * (1ULL << CAP_*); drop_capabilities() passes its complement to
 * capability_bounding_set_drop().
 * NOTE(review): arg_slice is declared const char * but is assigned the
 * result of strdup() in parse_argv() - confirm the intended ownership. */
72 typedef enum LinkJournal {
79 static char *arg_directory = NULL;
80 static char *arg_user = NULL;
81 static sd_id128_t arg_uuid = {};
82 static char *arg_machine = NULL;
83 static char *arg_selinux_context = NULL;
84 static char *arg_selinux_apifs_context = NULL;
85 static const char *arg_slice = NULL;
86 static bool arg_private_network = false;
87 static bool arg_read_only = false;
88 static bool arg_boot = false;
89 static LinkJournal arg_link_journal = LINK_AUTO;
90 static uint64_t arg_retain =
92 (1ULL << CAP_DAC_OVERRIDE) |
93 (1ULL << CAP_DAC_READ_SEARCH) |
94 (1ULL << CAP_FOWNER) |
95 (1ULL << CAP_FSETID) |
96 (1ULL << CAP_IPC_OWNER) |
99 (1ULL << CAP_LINUX_IMMUTABLE) |
100 (1ULL << CAP_NET_BIND_SERVICE) |
101 (1ULL << CAP_NET_BROADCAST) |
102 (1ULL << CAP_NET_RAW) |
103 (1ULL << CAP_SETGID) |
104 (1ULL << CAP_SETFCAP) |
105 (1ULL << CAP_SETPCAP) |
106 (1ULL << CAP_SETUID) |
107 (1ULL << CAP_SYS_ADMIN) |
108 (1ULL << CAP_SYS_CHROOT) |
109 (1ULL << CAP_SYS_NICE) |
110 (1ULL << CAP_SYS_PTRACE) |
111 (1ULL << CAP_SYS_TTY_CONFIG) |
112 (1ULL << CAP_SYS_RESOURCE) |
113 (1ULL << CAP_SYS_BOOT) |
114 (1ULL << CAP_AUDIT_WRITE) |
115 (1ULL << CAP_AUDIT_CONTROL) |
117 static char **arg_bind = NULL;
118 static char **arg_bind_ro = NULL;
119 static char **arg_setenv = NULL;
120 static bool arg_quiet = false;
/* Print the usage summary to stdout. The single %s in the format string is
 * filled with program_invocation_short_name.
 * NOTE(review): the -j line claims equivalence with --link-journal=host,
 * while parse_argv() shows an assignment of LINK_GUEST just before the
 * --link-journal case - confirm which mode -j is meant to select. */
122 static int help(void) {
124 printf("%s [OPTIONS...] [PATH] [ARGUMENTS...]\n\n"
125 "Spawn a minimal namespace container for debugging, testing and building.\n\n"
126 " -h --help Show this help\n"
127 " --version Print version string\n"
128 " -D --directory=NAME Root directory for the container\n"
129 " -b --boot Boot up full system (i.e. invoke init)\n"
130 " -u --user=USER Run the command under specified user or uid\n"
131 " --uuid=UUID Set a specific machine UUID for the container\n"
132 " -M --machine=NAME Set the machine name for the container\n"
133 " -S --slice=SLICE Place the container in the specified slice\n"
134 " -Z --selinux-context=SECLABEL\n"
135 " Set the SELinux security context to be used by\n"
136 " processes in the container\n"
137 " -L --selinux-apifs-context=SECLABEL\n"
138 " Set the SELinux security context to be used by\n"
139 " API/tmpfs file systems in the container\n"
140 " --private-network Disable network in container\n"
141 " --read-only Mount the root directory read-only\n"
142 " --capability=CAP In addition to the default, retain specified\n"
144 " --drop-capability=CAP Drop the specified capability from the default set\n"
145 " --link-journal=MODE Link up guest journal, one of no, auto, guest, host\n"
146 " -j Equivalent to --link-journal=host\n"
147 " --bind=PATH[:PATH] Bind mount a file or directory from the host into\n"
149 " --bind-ro=PATH[:PATH] Similar, but creates a read-only bind mount\n"
150 " --setenv=NAME=VALUE Pass an environment variable to PID 1\n"
151 " -q --quiet Do not show status information\n",
152 program_invocation_short_name);
/* Parse the command line with getopt_long() and store the results in the
 * arg_* globals.
 *  - --directory is resolved with canonicalize_file_name().
 *  - --uuid is parsed with sd_id128_from_string().
 *  - --machine is checked with hostname_is_valid() before being copied.
 *  - --capability and --drop-capability accept a comma separated list; each
 *    name is looked up with cap_from_name() and the matching bit is set in,
 *    or cleared from, arg_retain.
 *  - --link-journal accepts auto, no, guest or host.
 *  - --bind and --bind-ro look for the first colon in the argument, require
 *    absolute paths and append two paths to arg_bind or arg_bind_ro.
 *  - --setenv is validated with env_assignment_is_valid() and merged into
 *    arg_setenv with strv_env_set().
 * NOTE(review): the SELinux context options store optarg itself rather
 * than a copy, unlike the strdup() used for user, slice and machine. */
157 static int parse_argv(int argc, char *argv[]) {
172 static const struct option options[] = {
173 { "help", no_argument, NULL, 'h' },
174 { "version", no_argument, NULL, ARG_VERSION },
175 { "directory", required_argument, NULL, 'D' },
176 { "user", required_argument, NULL, 'u' },
177 { "private-network", no_argument, NULL, ARG_PRIVATE_NETWORK },
178 { "boot", no_argument, NULL, 'b' },
179 { "uuid", required_argument, NULL, ARG_UUID },
180 { "read-only", no_argument, NULL, ARG_READ_ONLY },
181 { "capability", required_argument, NULL, ARG_CAPABILITY },
182 { "drop-capability", required_argument, NULL, ARG_DROP_CAPABILITY },
183 { "link-journal", required_argument, NULL, ARG_LINK_JOURNAL },
184 { "bind", required_argument, NULL, ARG_BIND },
185 { "bind-ro", required_argument, NULL, ARG_BIND_RO },
186 { "machine", required_argument, NULL, 'M' },
187 { "slice", required_argument, NULL, 'S' },
188 { "setenv", required_argument, NULL, ARG_SETENV },
189 { "selinux-context", required_argument, NULL, 'Z' },
190 { "selinux-apifs-context", required_argument, NULL, 'L' },
191 { "quiet", no_argument, NULL, 'q' },
200 while ((c = getopt_long(argc, argv, "+hD:u:bL:M:jS:Z:q", options, NULL)) >= 0) {
208 puts(PACKAGE_STRING);
209 puts(SYSTEMD_FEATURES);
214 arg_directory = canonicalize_file_name(optarg);
215 if (!arg_directory) {
216 log_error("Invalid root directory: %m");
224 arg_user = strdup(optarg);
230 case ARG_PRIVATE_NETWORK:
231 arg_private_network = true;
239 r = sd_id128_from_string(optarg, &arg_uuid);
241 log_error("Invalid UUID: %s", optarg);
247 arg_slice = strdup(optarg);
254 if (!hostname_is_valid(optarg)) {
255 log_error("Invalid machine name: %s", optarg);
260 arg_machine = strdup(optarg);
267 arg_selinux_context = optarg;
271 arg_selinux_apifs_context = optarg;
275 arg_read_only = true;
279 case ARG_DROP_CAPABILITY: {
283 FOREACH_WORD_SEPARATOR(word, length, optarg, ",", state) {
287 t = strndup(word, length);
291 if (cap_from_name(t, &cap) < 0) {
292 log_error("Failed to parse capability %s.", t);
299 if (c == ARG_CAPABILITY)
300 arg_retain |= 1ULL << (uint64_t) cap;
302 arg_retain &= ~(1ULL << (uint64_t) cap);
309 arg_link_journal = LINK_GUEST;
312 case ARG_LINK_JOURNAL:
313 if (streq(optarg, "auto"))
314 arg_link_journal = LINK_AUTO;
315 else if (streq(optarg, "no"))
316 arg_link_journal = LINK_NO;
317 else if (streq(optarg, "guest"))
318 arg_link_journal = LINK_GUEST;
319 else if (streq(optarg, "host"))
320 arg_link_journal = LINK_HOST;
322 log_error("Failed to parse link journal mode %s", optarg);
330 _cleanup_free_ char *a = NULL, *b = NULL;
334 x = c == ARG_BIND ? &arg_bind : &arg_bind_ro;
336 e = strchr(optarg, ':');
338 a = strndup(optarg, e - optarg);
348 if (!path_is_absolute(a) || !path_is_absolute(b)) {
349 log_error("Invalid bind mount specification: %s", optarg);
353 r = strv_extend(x, a);
357 r = strv_extend(x, b);
367 if (!env_assignment_is_valid(optarg)) {
368 log_error("Environment variable assignment '%s' is not valid.", optarg);
372 n = strv_env_set(arg_setenv, optarg);
376 strv_free(arg_setenv);
389 assert_not_reached("Unhandled option");
/* Mount the API file systems listed in mount_table below the container
 * root dest. For every entry the target is dest joined with the entry
 * path; path_is_mount_point() is consulted first and, per the comment in
 * the loop, entries that name a source and are already mounted are skipped.
 * The target directory is created with mkdir_p(). When
 * arg_selinux_apifs_context is set, tmpfs and devpts entries get a
 * context= option appended to their mount options. The fatal flag of the
 * entry (last column) takes part in deciding whether a failed mount() is
 * reported as an error. */
396 static int mount_all(const char *dest) {
398 typedef struct MountPoint {
407 static const MountPoint mount_table[] = {
408 { "proc", "/proc", "proc", NULL, MS_NOSUID|MS_NOEXEC|MS_NODEV, true },
409 { "/proc/sys", "/proc/sys", NULL, NULL, MS_BIND, true }, /* Bind mount first */
410 { NULL, "/proc/sys", NULL, NULL, MS_BIND|MS_RDONLY|MS_REMOUNT, true }, /* Then, make it r/o */
411 { "sysfs", "/sys", "sysfs", NULL, MS_RDONLY|MS_NOSUID|MS_NOEXEC|MS_NODEV, true },
412 { "tmpfs", "/dev", "tmpfs", "mode=755", MS_NOSUID|MS_STRICTATIME, true },
413 { "devpts", "/dev/pts", "devpts","newinstance,ptmxmode=0666,mode=620,gid=" STRINGIFY(TTY_GID), MS_NOSUID|MS_NOEXEC, true },
414 { "tmpfs", "/dev/shm", "tmpfs", "mode=1777", MS_NOSUID|MS_NODEV|MS_STRICTATIME, true },
415 { "tmpfs", "/run", "tmpfs", "mode=755", MS_NOSUID|MS_NODEV|MS_STRICTATIME, true },
417 { "/sys/fs/selinux", "/sys/fs/selinux", NULL, NULL, MS_BIND, false }, /* Bind mount first */
418 { NULL, "/sys/fs/selinux", NULL, NULL, MS_BIND|MS_RDONLY|MS_REMOUNT, false }, /* Then, make it r/o */
425 for (k = 0; k < ELEMENTSOF(mount_table); k++) {
426 _cleanup_free_ char *where = NULL;
428 _cleanup_free_ char *options = NULL;
433 where = strjoin(dest, "/", mount_table[k].where, NULL);
437 t = path_is_mount_point(where, true);
439 log_error("Failed to detect whether %s is a mount point: %s", where, strerror(-t));
447 /* Skip this entry if it is not a remount. */
448 if (mount_table[k].what && t > 0)
451 mkdir_p(where, 0755);
454 if (arg_selinux_apifs_context &&
455 (streq_ptr(mount_table[k].what, "tmpfs") || streq_ptr(mount_table[k].what, "devpts"))) {
456 options = strjoin(mount_table[k].options, ",context=\"", arg_selinux_apifs_context, "\"", NULL);
463 o = mount_table[k].options;
466 if (mount(mount_table[k].what,
469 mount_table[k].flags,
471 mount_table[k].fatal) {
473 log_error("mount(%s) failed: %m", where);
/* Bind mount every (source, destination) pair of l into the container
 * rooted at dest; flags carries extra mount flags (main() passes 0 or
 * MS_RDONLY). The source is stat()ed, and the destination path is dest
 * followed by the destination string. If the destination exists its file
 * type must match the source. If it is missing (ENOENT) the parent
 * directories are created and a mount point is made for directories,
 * FIFOs, sockets and regular files; anything else is refused. After the
 * bind mount, a non-zero flags value triggers a second mount() with
 * MS_REMOUNT|MS_BIND|flags. */
483 static int mount_binds(const char *dest, char **l, unsigned long flags) {
486 STRV_FOREACH_PAIR(x, y, l) {
488 struct stat source_st, dest_st;
491 if (stat(*x, &source_st) < 0) {
492 log_error("failed to stat %s: %m", *x);
496 where = strappenda(dest, *y);
497 r = stat(where, &dest_st);
499 if ((source_st.st_mode & S_IFMT) != (dest_st.st_mode & S_IFMT)) {
500 log_error("The file types of %s and %s do not match. Refusing bind mount",
504 } else if (errno == ENOENT) {
505 r = mkdir_parents_label(where, 0755);
507 log_error("Failed to bind mount %s: %s", *x, strerror(-r));
511 log_error("Failed to bind mount %s: %s", *x, strerror(errno));
514 /* Create the mount point, but be conservative -- refuse to create block
515 * and char devices. */
516 if (S_ISDIR(source_st.st_mode))
517 mkdir_label(where, 0755);
518 else if (S_ISFIFO(source_st.st_mode))
520 else if (S_ISSOCK(source_st.st_mode))
521 mknod(where, 0644 | S_IFSOCK, 0);
522 else if (S_ISREG(source_st.st_mode))
525 log_error("Refusing to create mountpoint for file: %s", *x);
529 if (mount(*x, where, "bind", MS_BIND, NULL) < 0) {
530 log_error("mount(%s) failed: %m", where);
534 if (flags && mount(NULL, where, NULL, MS_REMOUNT|MS_BIND|flags, NULL) < 0) {
535 log_error("mount(%s) failed: %m", where);
/* Point /etc/localtime inside the container at the same zone the host uses.
 * The host symlink is read with readlink_malloc() and must lead into
 * /usr/share/zoneinfo/ (relative or absolute form); otherwise a warning is
 * logged. The container link is read the same way and compared with the
 * host zone. A warning is logged when the zone file does not exist below
 * dest. Otherwise a relative symlink ../usr/share/zoneinfo/<zone> is
 * created at dest/etc/localtime. */
543 static int setup_timezone(const char *dest) {
544 _cleanup_free_ char *where = NULL, *p = NULL, *q = NULL, *check = NULL, *what = NULL;
550 /* Fix the timezone, if possible */
551 r = readlink_malloc("/etc/localtime", &p);
553 log_warning("/etc/localtime is not a symlink, not updating container timezone.");
557 z = path_startswith(p, "../usr/share/zoneinfo/");
559 z = path_startswith(p, "/usr/share/zoneinfo/");
561 log_warning("/etc/localtime does not point into /usr/share/zoneinfo/, not updating container timezone.");
565 where = strappend(dest, "/etc/localtime");
569 r = readlink_malloc(where, &q);
571 y = path_startswith(q, "../usr/share/zoneinfo/");
573 y = path_startswith(q, "/usr/share/zoneinfo/");
576 /* Already pointing to the right place? Then do nothing .. */
577 if (y && streq(y, z))
581 check = strjoin(dest, "/usr/share/zoneinfo/", z, NULL);
585 if (access(check, F_OK) < 0) {
586 log_warning("Timezone %s does not exist in container, not updating container timezone.", z);
590 what = strappend("../usr/share/zoneinfo/", z);
595 if (symlink(what, where) < 0) {
596 log_error("Failed to correct timezone of container: %m");
/* Copy the host /etc/resolv.conf to dest/etc/resolv.conf. The
 * arg_private_network flag is checked before anything else is done. The
 * result of copy_file() is deliberately ignored, as the comment below
 * explains. */
603 static int setup_resolv_conf(const char *dest) {
604 char _cleanup_free_ *where = NULL;
608 if (arg_private_network)
611 /* Fix resolv.conf, if possible */
612 where = strappend(dest, "/etc/resolv.conf");
616 /* We don't really care for the results of this really. If it
617 * fails, it fails, but meh... */
618 copy_file("/etc/resolv.conf", where, O_TRUNC|O_NOFOLLOW);
/* Give the container its own boot ID. A random ID is generated with
 * sd_id128_randomize(), formatted as a UUID string, written to
 * dest/dev/proc-sys-kernel-random-boot-id, and that file is bind mounted
 * over dest/proc/sys/kernel/random/boot_id. A read-only remount is then
 * attempted; its failure is only logged as a warning. */
623 static int setup_boot_id(const char *dest) {
624 _cleanup_free_ char *from = NULL, *to = NULL;
631 /* Generate a new randomized boot ID, so that each boot-up of
632 * the container gets a new one */
634 from = strappend(dest, "/dev/proc-sys-kernel-random-boot-id");
635 to = strappend(dest, "/proc/sys/kernel/random/boot_id");
639 r = sd_id128_randomize(&rnd);
641 log_error("Failed to generate random boot id: %s", strerror(-r));
645 snprintf(as_uuid, sizeof(as_uuid),
646 "%02x%02x%02x%02x-%02x%02x-%02x%02x-%02x%02x-%02x%02x%02x%02x%02x%02x",
647 SD_ID128_FORMAT_VAL(rnd));
648 char_array_0(as_uuid);
650 r = write_string_file(from, as_uuid);
652 log_error("Failed to write boot id: %s", strerror(-r));
656 if (mount(from, to, "bind", MS_BIND, NULL) < 0) {
657 log_error("Failed to bind mount boot id: %m");
659 } else if (mount(from, to, "bind", MS_BIND|MS_REMOUNT|MS_RDONLY, NULL))
660 log_warning("Failed to make boot id read-only: %m");
/* Recreate the host device nodes named in devnodes below dest/dev. Each
 * /dev/<name> is stat()ed; a stat failure other than ENOENT is logged, a
 * node that is neither a character nor a block device is rejected, and
 * mknod() creates the copy with the same mode and device number.
 * The mknod() failure message reports the node path (to) that could not be
 * created rather than the container root. */
666 static int copy_devnodes(const char *dest) {
668 static const char devnodes[] =
678 _cleanup_umask_ mode_t u;
684 NULSTR_FOREACH(d, devnodes) {
685 _cleanup_free_ char *from = NULL, *to = NULL;
688 from = strappend("/dev/", d);
689 to = strjoin(dest, "/dev/", d, NULL);
693 if (stat(from, &st) < 0) {
695 if (errno != ENOENT) {
696 log_error("Failed to stat %s: %m", from);
700 } else if (!S_ISCHR(st.st_mode) && !S_ISBLK(st.st_mode)) {
702 log_error("%s is not a char or block device, cannot copy", from);
705 } else if (mknod(to, st.st_mode, st.st_rdev) < 0) {
707 log_error("mknod(%s) failed: %m", to);
/* Create dest/dev/ptmx as a symlink to pts/ptmx and log an error when
 * symlink() fails. */
715 static int setup_ptmx(const char *dest) {
716 _cleanup_free_ char *p = NULL;
718 p = strappend(dest, "/dev/ptmx");
722 if (symlink("pts/ptmx", p) < 0) {
723 log_error("Failed to create /dev/ptmx symlink: %m");
/* Make the tty named by console available as dest/dev/console. console must
 * be a character device; its mode and owner are set to 0600 and 0:0 with
 * chmod_and_chown(). A device node with the same device number and mode
 * 0600 is created at dest/dev/console and console is bind mounted over
 * it. */
730 static int setup_dev_console(const char *dest, const char *console) {
732 _cleanup_free_ char *to = NULL;
734 _cleanup_umask_ mode_t u;
741 if (stat(console, &st) < 0) {
742 log_error("Failed to stat %s: %m", console);
745 } else if (!S_ISCHR(st.st_mode)) {
746 log_error("/dev/console is not a char device");
750 r = chmod_and_chown(console, 0600, 0, 0);
752 log_error("Failed to correct access mode for TTY: %s", strerror(-r));
756 if (asprintf(&to, "%s/dev/console", dest) < 0)
759 /* We need to bind mount the right tty to /dev/console since
760 * ptys can only exist on pts file systems. To have something
761 * to bind mount things on we create a device node first, that
762 * has the right major/minor (note that the major minor
763 * doesn't actually matter here, since we mount it over
766 if (mknod(to, (st.st_mode & ~07777) | 0600, st.st_rdev) < 0) {
767 log_error("mknod() for /dev/console failed: %m");
771 if (mount(console, to, "bind", MS_BIND, NULL) < 0) {
772 log_error("Bind mount for /dev/console failed: %m");
779 static int setup_kmsg(const char *dest, int kmsg_socket) {
780 _cleanup_free_ char *from = NULL, *to = NULL;
782 _cleanup_umask_ mode_t u;
784 struct cmsghdr cmsghdr;
785 uint8_t buf[CMSG_SPACE(sizeof(int))];
788 .msg_control = &control,
789 .msg_controllen = sizeof(control),
791 struct cmsghdr *cmsg;
794 assert(kmsg_socket >= 0);
798 /* We create the kmsg FIFO as /dev/kmsg, but immediately
799 * delete it after bind mounting it to /proc/kmsg. While FIFOs
800 * on the reading side behave very similar to /proc/kmsg,
801 * their writing side behaves differently from /dev/kmsg in
802 * that writing blocks when nothing is reading. In order to
803 * avoid any problems with containers deadlocking due to this
804 * we simply make /dev/kmsg unavailable to the container. */
/* Summary of the visible steps: create the FIFO at dest/dev/kmsg, set its
 * mode and owner with chmod_and_chown(), bind mount it to dest/proc/kmsg,
 * open it and pass the descriptor over kmsg_socket as SCM_RIGHTS ancillary
 * data. The local descriptor is closed right after sendmsg(). */
805 if (asprintf(&from, "%s/dev/kmsg", dest) < 0 ||
806 asprintf(&to, "%s/proc/kmsg", dest) < 0)
809 if (mkfifo(from, 0600) < 0) {
810 log_error("mkfifo() for /dev/kmsg failed: %m");
814 r = chmod_and_chown(from, 0600, 0, 0);
816 log_error("Failed to correct access mode for /dev/kmsg: %s", strerror(-r));
820 if (mount(from, to, "bind", MS_BIND, NULL) < 0) {
821 log_error("Bind mount for /proc/kmsg failed: %m");
825 fd = open(from, O_RDWR|O_NDELAY|O_CLOEXEC);
827 log_error("Failed to open fifo: %m");
831 cmsg = CMSG_FIRSTHDR(&mh);
832 cmsg->cmsg_level = SOL_SOCKET;
833 cmsg->cmsg_type = SCM_RIGHTS;
834 cmsg->cmsg_len = CMSG_LEN(sizeof(int));
835 memcpy(CMSG_DATA(cmsg), &fd, sizeof(int));
837 mh.msg_controllen = cmsg->cmsg_len;
839 /* Store away the fd in the socket, so that it stays open as
840 * long as we run the child */
841 k = sendmsg(kmsg_socket, &mh, MSG_DONTWAIT|MSG_NOSIGNAL);
842 close_nointr_nofail(fd);
845 log_error("Failed to send FIFO fd: %m");
849 /* And now make the FIFO unavailable as /dev/kmsg... */
/* Apply arg_machine as the host name by calling sethostname(). */
854 static int setup_hostname(void) {
856 if (sethostname(arg_machine, strlen(arg_machine)) < 0)
/* Link the container journal directory with the host according to
 * arg_link_journal. The machine ID is read from directory/etc/machine-id
 * and parsed with sd_id128_from_string(); when it equals the host machine
 * ID a refusal is logged. p is /var/log/journal/<id> on the host and q is
 * the same path below directory. An existing mount point at p or q is
 * logged as an error unless the mode is LINK_AUTO. In LINK_GUEST mode p
 * becomes a symlink to q; in LINK_HOST mode p is created on the host.
 * Finally q is checked for emptiness, created, and p is bind mounted
 * onto it.
 * NOTE(review): several messages print %m after mkdir_p(), whose result is
 * taken from its return value r - confirm errno is meaningful there. */
862 static int setup_journal(const char *directory) {
863 sd_id128_t machine_id, this_id;
864 _cleanup_free_ char *p = NULL, *b = NULL, *q = NULL, *d = NULL;
868 p = strappend(directory, "/etc/machine-id");
872 r = read_one_line_file(p, &b);
873 if (r == -ENOENT && arg_link_journal == LINK_AUTO)
876 log_error("Failed to read machine ID from %s: %s", p, strerror(-r));
881 if (isempty(id) && arg_link_journal == LINK_AUTO)
884 /* Verify validity */
885 r = sd_id128_from_string(id, &machine_id);
887 log_error("Failed to parse machine ID from %s: %s", p, strerror(-r));
891 r = sd_id128_get_machine(&this_id);
893 log_error("Failed to retrieve machine ID: %s", strerror(-r));
897 if (sd_id128_equal(machine_id, this_id)) {
898 log_full(arg_link_journal == LINK_AUTO ? LOG_WARNING : LOG_ERR,
899 "Host and machine ids are equal (%s): refusing to link journals", id);
900 if (arg_link_journal == LINK_AUTO)
906 if (arg_link_journal == LINK_NO)
910 p = strappend("/var/log/journal/", id);
911 q = strjoin(directory, "/var/log/journal/", id, NULL);
915 if (path_is_mount_point(p, false) > 0) {
916 if (arg_link_journal != LINK_AUTO) {
917 log_error("%s: already a mount point, refusing to use for journal", p);
924 if (path_is_mount_point(q, false) > 0) {
925 if (arg_link_journal != LINK_AUTO) {
926 log_error("%s: already a mount point, refusing to use for journal", q);
933 r = readlink_and_make_absolute(p, &d);
935 if ((arg_link_journal == LINK_GUEST ||
936 arg_link_journal == LINK_AUTO) &&
939 r = mkdir_p(q, 0755);
941 log_warning("failed to create directory %s: %m", q);
946 log_error("Failed to remove symlink %s: %m", p);
949 } else if (r == -EINVAL) {
951 if (arg_link_journal == LINK_GUEST &&
954 if (errno == ENOTDIR) {
955 log_error("%s already exists and is neither a symlink nor a directory", p);
958 log_error("Failed to remove %s: %m", p);
962 } else if (r != -ENOENT) {
963 log_error("readlink(%s) failed: %m", p);
967 if (arg_link_journal == LINK_GUEST) {
969 if (symlink(q, p) < 0) {
970 log_error("Failed to symlink %s to %s: %m", q, p);
974 r = mkdir_p(q, 0755);
976 log_warning("failed to create directory %s: %m", q);
980 if (arg_link_journal == LINK_HOST) {
981 r = mkdir_p(p, 0755);
983 log_error("Failed to create %s: %m", p);
987 } else if (access(p, F_OK) < 0)
990 if (dir_is_empty(q) == 0) {
991 log_error("%s not empty.", q);
995 r = mkdir_p(q, 0755);
997 log_error("Failed to create %s: %m", q);
1001 if (mount(p, q, "bind", MS_BIND, NULL) < 0) {
1002 log_error("Failed to bind mount journal from host into guest: %m");
/* Create the directory dest/dev/kdbus (mode 0755) and bind mount path, the
 * kdbus domain path, onto it. Both failures are logged as errors. */
1009 static int setup_kdbus(const char *dest, const char *path) {
1015 p = strappenda(dest, "/dev/kdbus");
1016 if (mkdir(p, 0755) < 0) {
1017 log_error("Failed to create kdbus path: %m");
1021 if (mount(path, p, "bind", MS_BIND, NULL) < 0) {
1022 log_error("Failed to mount kdbus domain path: %m");
/* Hand the complement of arg_retain to capability_bounding_set_drop() and
 * return its result. */
1029 static int drop_capabilities(void) {
1030 return capability_bounding_set_drop(~arg_retain, false);
/* Call a method on org.freedesktop.machine1.Manager over the system bus to
 * register the container. The visible arguments include arg_uuid,
 * arg_directory (empty string when NULL) and, when arg_slice is non-empty,
 * a Slice property of type s. Failures to open the bus or to complete the
 * call are logged as errors. */
1033 static int register_machine(pid_t pid) {
1034 _cleanup_bus_error_free_ sd_bus_error error = SD_BUS_ERROR_NULL;
1035 _cleanup_bus_unref_ sd_bus *bus = NULL;
1038 r = sd_bus_default_system(&bus);
1040 log_error("Failed to open system bus: %s", strerror(-r));
1044 r = sd_bus_call_method(
1046 "org.freedesktop.machine1",
1047 "/org/freedesktop/machine1",
1048 "org.freedesktop.machine1.Manager",
1054 SD_BUS_MESSAGE_APPEND_ID128(arg_uuid),
1058 strempty(arg_directory),
1059 !isempty(arg_slice), "Slice", "s", arg_slice);
1061 log_error("Failed to register machine: %s", bus_error_message(&error, r));
/* Ask machine1 over the system bus to terminate the container. A first
 * call on the Manager interface yields a reply from which an object path
 * is read; a second call is made on the Machine interface. Failures of
 * either call are only logged at debug level, since the machine may
 * already have been cleaned up (see the comment below). */
1068 static int terminate_machine(pid_t pid) {
1069 _cleanup_bus_error_free_ sd_bus_error error = SD_BUS_ERROR_NULL;
1070 _cleanup_bus_message_unref_ sd_bus_message *reply = NULL;
1071 _cleanup_bus_unref_ sd_bus *bus = NULL;
1075 r = sd_bus_default_system(&bus);
1077 log_error("Failed to open system bus: %s", strerror(-r));
1081 r = sd_bus_call_method(
1083 "org.freedesktop.machine1",
1084 "/org/freedesktop/machine1",
1085 "org.freedesktop.machine1.Manager",
1092 /* Note that the machine might already have been
1093 * cleaned up automatically, hence don't consider it a
1094 * failure if we cannot get the machine object. */
1095 log_debug("Failed to get machine: %s", bus_error_message(&error, r));
1099 r = sd_bus_message_read(reply, "o", &path);
1101 return bus_log_parse_error(r);
1103 r = sd_bus_call_method(
1105 "org.freedesktop.machine1",
1107 "org.freedesktop.machine1.Machine",
1113 log_debug("Failed to terminate machine: %s", bus_error_message(&error, r));
/* Probe for kernel audit support by opening an AF_NETLINK raw socket with
 * protocol NETLINK_AUDIT; the descriptor is closed again.
 * NOTE(review): the return statements are not visible in this listing -
 * presumably true is returned when the socket could be opened; confirm. */
1120 static bool audit_enabled(void) {
1123 fd = socket(AF_NETLINK, SOCK_RAW, NETLINK_AUDIT);
1125 close_nointr_nofail(fd);
/* Entry point. Visible flow:
 *  1. Parse arguments, make arg_directory absolute (or use the current
 *     directory) and derive arg_machine from its base name when unset.
 *  2. Sanity checks: effective uid 0, sd_booted(), root directory is not /,
 *     and with --boot the directory must look like an OS tree.
 *  3. Collect passed listen fds, open a pty master, create the kdbus
 *     domain, the kmsg socket pair and an eventfd used for synchronisation.
 *  4. clone() a child in new IPC, mount, PID and UTS namespaces (plus a
 *     network namespace with --private-network).
 *  5. Child: attach the console to stdin/stdout/stderr, set up mounts and
 *     device nodes below arg_directory, move it to / and chroot, drop
 *     capabilities, switch user, build the environment, wait on the
 *     eventfd and exec init, the given command or /bin/bash.
 *  6. Parent: register the machine, release the child through the eventfd,
 *     forward the pty, terminate the machine and evaluate the exit status.
 * NOTE(review): after bus_kernel_create_domain() the debug message formats
 * strerror(-r), yet the result was stored in kdbus_fd - confirm.
 * NOTE(review): the messages following setresgid() and setresuid() name
 * setregid() and setreuid() - confirm the intended wording. */
1131 int main(int argc, char *argv[]) {
1133 int r = EXIT_FAILURE, k;
1134 _cleanup_close_ int master = -1, kdbus_fd = -1, sync_fd = -1;
1136 const char *console = NULL;
1138 _cleanup_close_pipe_ int kmsg_socket_pair[2] = { -1, -1 };
1139 _cleanup_fdset_free_ FDSet *fds = NULL;
1140 _cleanup_free_ char *kdbus_domain = NULL;
1143 log_parse_environment();
1146 k = parse_argv(argc, argv);
1154 if (arg_directory) {
1157 p = path_make_absolute_cwd(arg_directory);
1158 free(arg_directory);
1161 arg_directory = get_current_dir_name();
1163 if (!arg_directory) {
1164 log_error("Failed to determine path, please use -D.");
1168 path_kill_slashes(arg_directory);
1171 arg_machine = strdup(basename(arg_directory));
1177 hostname_cleanup(arg_machine, false);
1178 if (isempty(arg_machine)) {
1179 log_error("Failed to determine machine name automatically, please use -M.");
1184 if (geteuid() != 0) {
1185 log_error("Need to be root.");
1189 if (sd_booted() <= 0) {
1190 log_error("Not running on a systemd system.");
1194 if (arg_boot && audit_enabled()) {
1195 log_warning("The kernel auditing subsystem is known to be incompatible with containers.\n"
1196 "Please make sure to turn off auditing with 'audit=0' on the kernel command\n"
1197 "line before using systemd-nspawn. Sleeping for 5s...\n");
1201 if (path_equal(arg_directory, "/")) {
1202 log_error("Spawning container on root directory not supported.");
1206 if (arg_boot && path_is_os_tree(arg_directory) <= 0) {
1207 log_error("Directory %s doesn't look like an OS root directory (/etc/os-release is missing). Refusing.", arg_directory);
1212 n_fd_passed = sd_listen_fds(false);
1213 if (n_fd_passed > 0) {
1214 k = fdset_new_listen_fds(&fds, false);
1216 log_error("Failed to collect file descriptors: %s", strerror(-k));
1220 fdset_close_others(fds);
1223 master = posix_openpt(O_RDWR|O_NOCTTY|O_CLOEXEC|O_NDELAY);
1225 log_error("Failed to acquire pseudo tty: %m");
1229 console = ptsname(master);
1231 log_error("Failed to determine tty name: %m");
1236 log_info("Spawning container %s on %s. Press ^] three times within 1s to abort execution.", arg_machine, arg_directory);
1238 if (unlockpt(master) < 0) {
1239 log_error("Failed to unlock tty: %m");
1243 ns = strappenda("machine-", arg_machine);
1244 kdbus_fd = bus_kernel_create_domain(ns, &kdbus_domain);
1246 log_debug("Failed to create kdbus domain: %s", strerror(-r));
1248 log_debug("Successfully created kdbus domain as %s", kdbus_domain);
1250 if (socketpair(AF_UNIX, SOCK_DGRAM|SOCK_NONBLOCK|SOCK_CLOEXEC, 0, kmsg_socket_pair) < 0) {
1251 log_error("Failed to create kmsg socket pair: %m");
1255 sd_notify(0, "READY=1");
1257 assert_se(sigemptyset(&mask) == 0);
1258 sigset_add_many(&mask, SIGCHLD, SIGWINCH, SIGTERM, SIGINT, -1);
1259 assert_se(sigprocmask(SIG_BLOCK, &mask, NULL) == 0);
1264 sync_fd = eventfd(0, EFD_CLOEXEC);
1266 log_error("Failed to create event fd: %m");
1270 pid = syscall(__NR_clone, SIGCHLD|CLONE_NEWIPC|CLONE_NEWNS|CLONE_NEWPID|CLONE_NEWUTS|(arg_private_network ? CLONE_NEWNET : 0), NULL);
1272 if (errno == EINVAL)
1273 log_error("clone() failed, do you have namespace support enabled in your kernel? (You need UTS, IPC, PID and NET namespacing built in): %m");
1275 log_error("clone() failed: %m");
1282 const char *home = NULL;
1283 uid_t uid = (uid_t) -1;
1284 gid_t gid = (gid_t) -1;
1286 const char *envp[] = {
1287 "PATH=" DEFAULT_PATH_SPLIT_USR,
1288 "container=systemd-nspawn", /* LXC sets container=lxc, so follow the scheme here */
1293 NULL, /* container_uuid */
1294 NULL, /* LISTEN_FDS */
1295 NULL, /* LISTEN_PID */
1301 envp[n_env] = strv_find_prefix(environ, "TERM=");
1305 close_nointr_nofail(master);
1308 close_nointr(STDIN_FILENO);
1309 close_nointr(STDOUT_FILENO);
1310 close_nointr(STDERR_FILENO);
1312 close_nointr_nofail(kmsg_socket_pair[0]);
1313 kmsg_socket_pair[0] = -1;
1315 reset_all_signal_handlers();
1317 assert_se(sigemptyset(&mask) == 0);
1318 assert_se(sigprocmask(SIG_SETMASK, &mask, NULL) == 0);
1320 k = open_terminal(console, O_RDWR);
1321 if (k != STDIN_FILENO) {
1323 close_nointr_nofail(k);
1327 log_error("Failed to open console: %s", strerror(-k));
1331 if (dup2(STDIN_FILENO, STDOUT_FILENO) != STDOUT_FILENO ||
1332 dup2(STDIN_FILENO, STDERR_FILENO) != STDERR_FILENO) {
1333 log_error("Failed to duplicate console: %m");
1338 log_error("setsid() failed: %m");
1342 if (prctl(PR_SET_PDEATHSIG, SIGKILL) < 0) {
1343 log_error("PR_SET_PDEATHSIG failed: %m");
1347 /* Mark everything as slave, so that we still
1348 * receive mounts from the real root, but don't
1349 * propagate mounts to the real root. */
1350 if (mount(NULL, "/", NULL, MS_SLAVE|MS_REC, NULL) < 0) {
1351 log_error("MS_SLAVE|MS_REC failed: %m");
1355 /* Turn directory into bind mount */
1356 if (mount(arg_directory, arg_directory, "bind", MS_BIND|MS_REC, NULL) < 0) {
1357 log_error("Failed to make bind mount.");
1362 if (mount(arg_directory, arg_directory, "bind", MS_BIND|MS_REMOUNT|MS_RDONLY|MS_REC, NULL) < 0) {
1363 log_error("Failed to make read-only.");
1367 if (mount_all(arg_directory) < 0)
1370 if (copy_devnodes(arg_directory) < 0)
1373 if (setup_ptmx(arg_directory) < 0)
1376 dev_setup(arg_directory);
1378 if (setup_dev_console(arg_directory, console) < 0)
1381 if (setup_kmsg(arg_directory, kmsg_socket_pair[1]) < 0)
1384 close_nointr_nofail(kmsg_socket_pair[1]);
1385 kmsg_socket_pair[1] = -1;
1387 if (setup_boot_id(arg_directory) < 0)
1390 if (setup_timezone(arg_directory) < 0)
1393 if (setup_resolv_conf(arg_directory) < 0)
1396 if (setup_journal(arg_directory) < 0)
1399 if (mount_binds(arg_directory, arg_bind, 0) < 0)
1402 if (mount_binds(arg_directory, arg_bind_ro, MS_RDONLY) < 0)
1405 if (setup_kdbus(arg_directory, kdbus_domain) < 0)
1408 if (chdir(arg_directory) < 0) {
1409 log_error("chdir(%s) failed: %m", arg_directory);
1413 if (mount(arg_directory, "/", NULL, MS_MOVE, NULL) < 0) {
1414 log_error("mount(MS_MOVE) failed: %m");
1418 if (chroot(".") < 0) {
1419 log_error("chroot() failed: %m");
1423 if (chdir("/") < 0) {
1424 log_error("chdir() failed: %m");
1432 if (drop_capabilities() < 0) {
1433 log_error("drop_capabilities() failed: %m");
1439 /* Note that this resolves user names
1440 * inside the container, and hence
1441 * accesses the NSS modules from the
1442 * container and not the host. This is
1445 if (get_user_creds((const char**)&arg_user, &uid, &gid, &home, NULL) < 0) {
1446 log_error("get_user_creds() failed: %m");
1450 if (mkdir_parents_label(home, 0775) < 0) {
1451 log_error("mkdir_parents_label() failed: %m");
1455 if (mkdir_safe_label(home, 0775, uid, gid) < 0) {
1456 log_error("mkdir_safe_label() failed: %m");
1460 if (initgroups((const char*)arg_user, gid) < 0) {
1461 log_error("initgroups() failed: %m");
1465 if (setresgid(gid, gid, gid) < 0) {
1466 log_error("setregid() failed: %m");
1470 if (setresuid(uid, uid, uid) < 0) {
1471 log_error("setreuid() failed: %m");
1475 /* Reset everything fully to 0, just in case */
1477 if (setgroups(0, NULL) < 0) {
1478 log_error("setgroups() failed: %m");
1482 if (setresgid(0, 0, 0) < 0) {
1483 log_error("setregid() failed: %m");
1487 if (setresuid(0, 0, 0) < 0) {
1488 log_error("setreuid() failed: %m");
1493 if ((asprintf((char**)(envp + n_env++), "HOME=%s", home ? home: "/root") < 0) ||
1494 (asprintf((char**)(envp + n_env++), "USER=%s", arg_user ? arg_user : "root") < 0) ||
1495 (asprintf((char**)(envp + n_env++), "LOGNAME=%s", arg_user ? arg_user : "root") < 0)) {
1500 if (!sd_id128_equal(arg_uuid, SD_ID128_NULL)) {
1501 if (asprintf((char**)(envp + n_env++), "container_uuid=" SD_ID128_FORMAT_STR, SD_ID128_FORMAT_VAL(arg_uuid)) < 0) {
1507 if (fdset_size(fds) > 0) {
1508 k = fdset_cloexec(fds, false);
1510 log_error("Failed to unset O_CLOEXEC for file descriptors.");
1514 if ((asprintf((char **)(envp + n_env++), "LISTEN_FDS=%u", n_fd_passed) < 0) ||
1515 (asprintf((char **)(envp + n_env++), "LISTEN_PID=1") < 0)) {
1523 eventfd_read(sync_fd, &x);
1524 close_nointr_nofail(sync_fd);
1527 if (!strv_isempty(arg_setenv)) {
1530 n = strv_env_merge(2, envp, arg_setenv);
1538 env_use = (char**) envp;
1541 if (arg_selinux_context)
1542 if (setexeccon(arg_selinux_context) < 0)
1543 log_error("setexeccon(\"%s\") failed: %m", arg_selinux_context);
1549 /* Automatically search for the init system */
1551 l = 1 + argc - optind;
1552 a = newa(char*, l + 1);
1553 memcpy(a + 1, argv + optind, l * sizeof(char*));
1555 a[0] = (char*) "/usr/lib/systemd/systemd";
1556 execve(a[0], a, env_use);
1558 a[0] = (char*) "/lib/systemd/systemd";
1559 execve(a[0], a, env_use);
1561 a[0] = (char*) "/sbin/init";
1562 execve(a[0], a, env_use);
1563 } else if (argc > optind)
1564 execvpe(argv[optind], argv + optind, env_use);
1566 chdir(home ? home : "/root");
1567 execle("/bin/bash", "-bash", NULL, env_use);
1570 log_error("execv() failed: %m");
1573 _exit(EXIT_FAILURE);
1579 r = register_machine(pid);
1583 eventfd_write(sync_fd, 1);
1584 close_nointr_nofail(sync_fd);
1587 k = process_pty(master, &mask, arg_boot ? pid : 0, SIGRTMIN+3);
1596 /* Kill if it is not dead yet anyway */
1597 terminate_machine(pid);
1599 /* Redundant, but better safe than sorry */
1602 k = wait_for_terminate(pid, &status);
1610 if (status.si_code == CLD_EXITED) {
1611 r = status.si_status;
1612 if (status.si_status != 0) {
1613 log_error("Container %s failed with error code %i.", arg_machine, status.si_status);
1618 log_debug("Container %s exited successfully.", arg_machine);
1620 } else if (status.si_code == CLD_KILLED &&
1621 status.si_status == SIGINT) {
1624 log_info("Container %s has been shut down.", arg_machine);
1627 } else if (status.si_code == CLD_KILLED &&
1628 status.si_status == SIGHUP) {
1631 log_info("Container %s is being rebooted.", arg_machine);
1633 } else if (status.si_code == CLD_KILLED ||
1634 status.si_code == CLD_DUMPED) {
1636 log_error("Container %s terminated by signal %s.", arg_machine, signal_to_string(status.si_status));
1640 log_error("Container %s failed due to unknown reason.", arg_machine);
1650 free(arg_directory);