1 /*-*- Mode: C; c-basic-offset: 8; indent-tabs-mode: nil -*-*/
4 This file is part of systemd.
6 Copyright 2010 Lennart Poettering
8 systemd is free software; you can redistribute it and/or modify it
9 under the terms of the GNU Lesser General Public License as published by
10 the Free Software Foundation; either version 2.1 of the License, or
11 (at your option) any later version.
13 systemd is distributed in the hope that it will be useful, but
14 WITHOUT ANY WARRANTY; without even the implied warranty of
15 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
16 Lesser General Public License for more details.
18 You should have received a copy of the GNU Lesser General Public License
19 along with systemd; If not, see <http://www.gnu.org/licenses/>.
25 #include <sys/types.h>
26 #include <sys/syscall.h>
27 #include <sys/mount.h>
33 #include <sys/prctl.h>
34 #include <sys/capability.h>
37 #include <sys/signalfd.h>
41 #include <sys/socket.h>
42 #include <linux/netlink.h>
43 #include <sys/eventfd.h>
45 #include <selinux/selinux.h>
48 #include "sd-daemon.h"
57 #include "cgroup-util.h"
59 #include "path-util.h"
60 #include "loopback-setup.h"
61 #include "dev-setup.h"
66 #include "bus-error.h"
68 #include "bus-kernel.h"
/* Journal linking mode, chosen with --link-journal= or -j. The values used in
 * this file are LINK_NO, LINK_AUTO, LINK_GUEST and LINK_HOST. */
72 typedef enum LinkJournal {
/* Option state: parse_argv() stores the command line settings in the
 * following file-scope variables; main() and the setup helpers read them. */
79 static char *arg_directory = NULL;
80 static char *arg_user = NULL;
81 static sd_id128_t arg_uuid = {};
82 static char *arg_machine = NULL;
83 static char *arg_process_label = NULL;
84 static char *arg_file_label = NULL;
85 static const char *arg_slice = NULL;
86 static bool arg_private_network = false;
87 static bool arg_read_only = false;
88 static bool arg_boot = false;
89 static LinkJournal arg_link_journal = LINK_AUTO;
/* Capability mask, one bit per CAP_* number. --capability= sets bits and
 * --drop-capability= clears them (see parse_argv()); drop_capabilities()
 * passes the complement of this mask to capability_bounding_set_drop(). */
90 static uint64_t arg_retain =
92 (1ULL << CAP_DAC_OVERRIDE) |
93 (1ULL << CAP_DAC_READ_SEARCH) |
94 (1ULL << CAP_FOWNER) |
95 (1ULL << CAP_FSETID) |
96 (1ULL << CAP_IPC_OWNER) |
99 (1ULL << CAP_LINUX_IMMUTABLE) |
100 (1ULL << CAP_NET_BIND_SERVICE) |
101 (1ULL << CAP_NET_BROADCAST) |
102 (1ULL << CAP_NET_RAW) |
103 (1ULL << CAP_SETGID) |
104 (1ULL << CAP_SETFCAP) |
105 (1ULL << CAP_SETPCAP) |
106 (1ULL << CAP_SETUID) |
107 (1ULL << CAP_SYS_ADMIN) |
108 (1ULL << CAP_SYS_CHROOT) |
109 (1ULL << CAP_SYS_NICE) |
110 (1ULL << CAP_SYS_PTRACE) |
111 (1ULL << CAP_SYS_TTY_CONFIG) |
112 (1ULL << CAP_SYS_RESOURCE) |
113 (1ULL << CAP_SYS_BOOT) |
114 (1ULL << CAP_AUDIT_WRITE) |
115 (1ULL << CAP_AUDIT_CONTROL) |
/* Bind mount lists are stored as consecutive (source, destination) pairs;
 * see the --bind/--bind-ro handling in parse_argv() and mount_binds(). */
117 static char **arg_bind = NULL;
118 static char **arg_bind_ro = NULL;
119 static char **arg_setenv = NULL;
120 static bool arg_quiet = false;
/* Print the usage summary to stdout; the leading %s is filled in with
 * program_invocation_short_name.
 *
 * NOTE(review): the "-j" entry says it is equivalent to --link-journal=host,
 * while the assignment in parse_argv() that appears to handle -j stores
 * LINK_GUEST. One of the two is presumably wrong -- confirm and align. */
122 static int help(void) {
124 printf("%s [OPTIONS...] [PATH] [ARGUMENTS...]\n\n"
125 "Spawn a minimal namespace container for debugging, testing and building.\n\n"
126 " -h --help Show this help\n"
127 " --version Print version string\n"
128 " -D --directory=NAME Root directory for the container\n"
129 " -b --boot Boot up full system (i.e. invoke init)\n"
130 " -u --user=USER Run the command under specified user or uid\n"
131 " --uuid=UUID Set a specific machine UUID for the container\n"
132 " -M --machine=NAME Set the machine name for the container\n"
133 " -S --slice=SLICE Place the container in the specified slice\n"
134 " -L --file-label=LABEL Set the MAC file label to be used by tmpfs file\n"
135 " systems in the container\n"
136 " -Z --process-label=LABEL Set the MAC label to be used by processes in\n"
138 " --private-network Disable network in container\n"
139 " --read-only Mount the root directory read-only\n"
140 " --capability=CAP In addition to the default, retain specified\n"
142 " --drop-capability=CAP Drop the specified capability from the default set\n"
143 " --link-journal=MODE Link up guest journal, one of no, auto, guest, host\n"
144 " -j Equivalent to --link-journal=host\n"
145 " --bind=PATH[:PATH] Bind mount a file or directory from the host into\n"
147 " --bind-ro=PATH[:PATH] Similar, but creates a read-only bind mount\n"
148 " --setenv=NAME=VALUE Pass an environment variable to PID 1\n"
149 " -q --quiet Do not show status information\n",
150 program_invocation_short_name);
/* Parse the command line with getopt_long() and store the results in the
 * arg_* file-scope variables.
 *
 * The short option string starts with '+'; per getopt(3) this stops option
 * scanning at the first non-option argument, so options that belong to the
 * command run inside the container are left for main() to pass on via
 * argv + optind.
 *
 * Invalid values are reported with log_error(). */
155 static int parse_argv(int argc, char *argv[]) {
170 static const struct option options[] = {
171 { "help", no_argument, NULL, 'h' },
172 { "version", no_argument, NULL, ARG_VERSION },
173 { "directory", required_argument, NULL, 'D' },
174 { "user", required_argument, NULL, 'u' },
175 { "private-network", no_argument, NULL, ARG_PRIVATE_NETWORK },
176 { "boot", no_argument, NULL, 'b' },
177 { "uuid", required_argument, NULL, ARG_UUID },
178 { "read-only", no_argument, NULL, ARG_READ_ONLY },
179 { "capability", required_argument, NULL, ARG_CAPABILITY },
180 { "drop-capability", required_argument, NULL, ARG_DROP_CAPABILITY },
181 { "link-journal", required_argument, NULL, ARG_LINK_JOURNAL },
182 { "bind", required_argument, NULL, ARG_BIND },
183 { "bind-ro", required_argument, NULL, ARG_BIND_RO },
184 { "machine", required_argument, NULL, 'M' },
185 { "slice", required_argument, NULL, 'S' },
186 { "setenv", required_argument, NULL, ARG_SETENV },
187 { "process-label", required_argument, NULL, 'Z' },
188 { "file-label", required_argument, NULL, 'L' },
189 { "quiet", no_argument, NULL, 'q' },
198 while ((c = getopt_long(argc, argv, "+hD:u:bL:M:jS:Z:q", options, NULL)) >= 0) {
206 puts(PACKAGE_STRING);
207 puts(SYSTEMD_FEATURES);
/* The root directory is stored in canonicalized form. */
212 arg_directory = canonicalize_file_name(optarg);
213 if (!arg_directory) {
214 log_error("Invalid root directory: %m");
222 arg_user = strdup(optarg);
228 case ARG_PRIVATE_NETWORK:
229 arg_private_network = true;
237 r = sd_id128_from_string(optarg, &arg_uuid);
239 log_error("Invalid UUID: %s", optarg);
245 arg_slice = strdup(optarg);
252 if (!hostname_is_valid(optarg)) {
253 log_error("Invalid machine name: %s", optarg);
258 arg_machine = strdup(optarg);
/* The two label options keep pointing into argv (optarg is not copied). */
265 arg_file_label = optarg;
269 arg_process_label = optarg;
273 arg_read_only = true;
/* The argument is a comma-separated list of capability names. Each name is
 * resolved with cap_from_name(); its bit is then set in arg_retain for
 * --capability= and cleared otherwise. */
277 case ARG_DROP_CAPABILITY: {
281 FOREACH_WORD_SEPARATOR(word, length, optarg, ",", state) {
285 t = strndup(word, length);
289 if (cap_from_name(t, &cap) < 0) {
290 log_error("Failed to parse capability %s.", t);
297 if (c == ARG_CAPABILITY)
298 arg_retain |= 1ULL << (uint64_t) cap;
300 arg_retain &= ~(1ULL << (uint64_t) cap);
/* NOTE(review): presumably the -j case; it selects LINK_GUEST although the
 * help text documents -j as --link-journal=host -- confirm which is meant. */
307 arg_link_journal = LINK_GUEST;
310 case ARG_LINK_JOURNAL:
311 if (streq(optarg, "auto"))
312 arg_link_journal = LINK_AUTO;
313 else if (streq(optarg, "no"))
314 arg_link_journal = LINK_NO;
315 else if (streq(optarg, "guest"))
316 arg_link_journal = LINK_GUEST;
317 else if (streq(optarg, "host"))
318 arg_link_journal = LINK_HOST;
320 log_error("Failed to parse link journal mode %s", optarg);
/* --bind=/--bind-ro= take PATH[:PATH]. The argument is split at the first
 * ':', both paths must be absolute, and they are appended as a pair to
 * arg_bind or arg_bind_ro respectively. */
328 _cleanup_free_ char *a = NULL, *b = NULL;
332 x = c == ARG_BIND ? &arg_bind : &arg_bind_ro;
334 e = strchr(optarg, ':');
336 a = strndup(optarg, e - optarg);
346 if (!path_is_absolute(a) || !path_is_absolute(b)) {
347 log_error("Invalid bind mount specification: %s", optarg);
351 r = strv_extend(x, a);
355 r = strv_extend(x, b);
/* --setenv= takes a NAME=VALUE assignment, validated before it is merged
 * into arg_setenv. */
365 if (!env_assignment_is_valid(optarg)) {
366 log_error("Environment variable assignment '%s' is not valid.", optarg);
370 n = strv_env_set(arg_setenv, optarg);
374 strv_free(arg_setenv);
387 assert_not_reached("Unhandled option");
/* Mount the entries of mount_table below the container root.
 *
 * dest: path of the container root; each target is dest + "/" + the entry's
 *       "where" path.
 *
 * Each table row is used as (what, where, type, options, flags, fatal).
 * Rows with a NULL "what" are remounts that make the preceding bind mount
 * read-only. When arg_file_label is set, tmpfs and devpts rows get a
 * context="<label>" option appended. A mount failure is logged; the "fatal"
 * column takes part in the failure condition. */
394 static int mount_all(const char *dest) {
396 typedef struct MountPoint {
405 static const MountPoint mount_table[] = {
406 { "proc", "/proc", "proc", NULL, MS_NOSUID|MS_NOEXEC|MS_NODEV, true },
407 { "/proc/sys", "/proc/sys", NULL, NULL, MS_BIND, true }, /* Bind mount first */
408 { NULL, "/proc/sys", NULL, NULL, MS_BIND|MS_RDONLY|MS_REMOUNT, true }, /* Then, make it r/o */
409 { "sysfs", "/sys", "sysfs", NULL, MS_RDONLY|MS_NOSUID|MS_NOEXEC|MS_NODEV, true },
410 { "tmpfs", "/dev", "tmpfs", "mode=755", MS_NOSUID|MS_STRICTATIME, true },
411 { "devpts", "/dev/pts", "devpts","newinstance,ptmxmode=0666,mode=620,gid=" STRINGIFY(TTY_GID), MS_NOSUID|MS_NOEXEC, true },
412 { "tmpfs", "/dev/shm", "tmpfs", "mode=1777", MS_NOSUID|MS_NODEV|MS_STRICTATIME, true },
413 { "tmpfs", "/run", "tmpfs", "mode=755", MS_NOSUID|MS_NODEV|MS_STRICTATIME, true },
415 { "/sys/fs/selinux", "/sys/fs/selinux", NULL, NULL, MS_BIND, false }, /* Bind mount first */
416 { NULL, "/sys/fs/selinux", NULL, NULL, MS_BIND|MS_RDONLY|MS_REMOUNT, false }, /* Then, make it r/o */
423 for (k = 0; k < ELEMENTSOF(mount_table); k++) {
424 _cleanup_free_ char *where = NULL;
426 _cleanup_free_ char *options = NULL;
431 where = strjoin(dest, "/", mount_table[k].where, NULL);
435 t = path_is_mount_point(where, true);
437 log_error("Failed to detect whether %s is a mount point: %s", where, strerror(-t));
445 /* Skip this entry if it is not a remount. */
446 if (mount_table[k].what && t > 0)
/* Make sure the mount point exists; the result of mkdir_p() is not checked. */
449 mkdir_p(where, 0755);
452 if (arg_file_label && (streq_ptr(mount_table[k].what, "tmpfs") || streq_ptr(mount_table[k].what, "devpts"))) {
453 options = strjoin(mount_table[k].options, ",context=\"", arg_file_label, "\"", NULL);
460 o = mount_table[k].options;
463 if (mount(mount_table[k].what,
466 mount_table[k].flags,
468 mount_table[k].fatal) {
470 log_error("mount(%s) failed: %m", where);
/* Establish the bind mounts requested with --bind= or --bind-ro=.
 *
 * dest:  path of the container root; each destination is dest + second path.
 * l:     string list walked as (source, destination) pairs.
 * flags: extra mount flags; when non-zero each mount is remounted with
 *        MS_REMOUNT|MS_BIND|flags after the initial bind (main() passes
 *        MS_RDONLY for the read-only list).
 *
 * If the destination exists, its file type must match the source's. If it
 * does not exist (ENOENT), its parent directories and then a mount point of
 * the matching type are created. Errors are logged. */
480 static int mount_binds(const char *dest, char **l, unsigned long flags) {
483 STRV_FOREACH_PAIR(x, y, l) {
485 struct stat source_st, dest_st;
488 if (stat(*x, &source_st) < 0) {
489 log_error("failed to stat %s: %m", *x);
493 where = strappenda(dest, *y);
494 r = stat(where, &dest_st);
496 if ((source_st.st_mode & S_IFMT) != (dest_st.st_mode & S_IFMT)) {
497 log_error("The file types of %s and %s do not match. Refusing bind mount",
501 } else if (errno == ENOENT) {
502 r = mkdir_parents_label(where, 0755);
504 log_error("Failed to bind mount %s: %s", *x, strerror(-r));
508 log_error("Failed to bind mount %s: %s", *x, strerror(errno));
511 /* Create the mount point, but be conservative -- refuse to create block
512 * and char devices. */
513 if (S_ISDIR(source_st.st_mode))
514 mkdir_label(where, 0755);
515 else if (S_ISFIFO(source_st.st_mode))
517 else if (S_ISSOCK(source_st.st_mode))
518 mknod(where, 0644 | S_IFSOCK, 0);
519 else if (S_ISREG(source_st.st_mode))
522 log_error("Refusing to create mountpoint for file: %s", *x);
526 if (mount(*x, where, "bind", MS_BIND, NULL) < 0) {
527 log_error("mount(%s) failed: %m", where);
/* Second pass: apply the extra flags (e.g. read-only) via a bind remount. */
531 if (flags && mount(NULL, where, NULL, MS_REMOUNT|MS_BIND|flags, NULL) < 0) {
532 log_error("mount(%s) failed: %m", where);
/* Point the container's /etc/localtime at the same zone as the host's.
 *
 * dest: path of the container root.
 *
 * The host's /etc/localtime is read as a symlink, and the zone name z is
 * whatever follows "../usr/share/zoneinfo/" or "/usr/share/zoneinfo/" in its
 * target. The container's link is read the same way (y); if both name the
 * same zone nothing is done. Otherwise the zone file must exist below
 * dest/usr/share/zoneinfo/, and dest/etc/localtime is created as a symlink
 * to "../usr/share/zoneinfo/<zone>". The cases where the timezone cannot be
 * determined or is missing in the container only produce warnings. */
540 static int setup_timezone(const char *dest) {
541 _cleanup_free_ char *where = NULL, *p = NULL, *q = NULL, *check = NULL, *what = NULL;
547 /* Fix the timezone, if possible */
548 r = readlink_malloc("/etc/localtime", &p);
550 log_warning("/etc/localtime is not a symlink, not updating container timezone.");
554 z = path_startswith(p, "../usr/share/zoneinfo/");
556 z = path_startswith(p, "/usr/share/zoneinfo/");
558 log_warning("/etc/localtime does not point into /usr/share/zoneinfo/, not updating container timezone.");
562 where = strappend(dest, "/etc/localtime");
566 r = readlink_malloc(where, &q);
568 y = path_startswith(q, "../usr/share/zoneinfo/");
570 y = path_startswith(q, "/usr/share/zoneinfo/");
573 /* Already pointing to the right place? Then do nothing .. */
574 if (y && streq(y, z))
578 check = strjoin(dest, "/usr/share/zoneinfo/", z, NULL);
582 if (access(check, F_OK) < 0) {
583 log_warning("Timezone %s does not exist in container, not updating container timezone.", z);
587 what = strappend("../usr/share/zoneinfo/", z);
592 if (symlink(what, where) < 0) {
593 log_error("Failed to correct timezone of container: %m");
/* Copy the host's /etc/resolv.conf to dest/etc/resolv.conf.
 *
 * dest: path of the container root.
 *
 * arg_private_network is checked first; presumably nothing is copied when
 * the container gets its own network namespace. The copy is best effort:
 * the result of copy_file() is deliberately ignored. */
600 static int setup_resolv_conf(const char *dest) {
601 char _cleanup_free_ *where = NULL;
605 if (arg_private_network)
608 /* Fix resolv.conf, if possible */
609 where = strappend(dest, "/etc/resolv.conf");
613 /* We don't really care for the results of this really. If it
614 * fails, it fails, but meh... */
615 copy_file("/etc/resolv.conf", where, O_TRUNC|O_NOFOLLOW);
/* Give the container its own random boot ID.
 *
 * dest: path of the container root.
 *
 * A random 128-bit ID is formatted as a UUID string and written to
 * dest/dev/proc-sys-kernel-random-boot-id, and that file is bind mounted
 * over dest/proc/sys/kernel/random/boot_id. Failing to make the bind mount
 * read-only afterwards is only a warning. */
620 static int setup_boot_id(const char *dest) {
621 _cleanup_free_ char *from = NULL, *to = NULL;
628 /* Generate a new randomized boot ID, so that each boot-up of
629 * the container gets a new one */
631 from = strappend(dest, "/dev/proc-sys-kernel-random-boot-id");
632 to = strappend(dest, "/proc/sys/kernel/random/boot_id");
636 r = sd_id128_randomize(&rnd);
638 log_error("Failed to generate random boot id: %s", strerror(-r));
/* Format the ID in the 8-4-4-4-12 hex digit UUID layout. */
642 snprintf(as_uuid, sizeof(as_uuid),
643 "%02x%02x%02x%02x-%02x%02x-%02x%02x-%02x%02x-%02x%02x%02x%02x%02x%02x",
644 SD_ID128_FORMAT_VAL(rnd));
645 char_array_0(as_uuid);
647 r = write_string_file(from, as_uuid);
649 log_error("Failed to write boot id: %s", strerror(-r));
653 if (mount(from, to, "bind", MS_BIND, NULL) < 0) {
654 log_error("Failed to bind mount boot id: %m");
656 } else if (mount(from, to, "bind", MS_BIND|MS_REMOUNT|MS_RDONLY, NULL))
657 log_warning("Failed to make boot id read-only: %m");
/* Recreate a fixed set of host device nodes inside the container.
 *
 * dest: path of the container root.
 *
 * For every name in the devnodes list, /dev/<name> is stat()ed on the host
 * and a node with the same mode and device number is created with mknod()
 * at dest/dev/<name>. A stat() failure is logged as an error unless it is
 * ENOENT, and a source that is neither a char nor a block device is
 * reported as an error. */
663 static int copy_devnodes(const char *dest) {
665 static const char devnodes[] =
675 _cleanup_umask_ mode_t u;
681 NULSTR_FOREACH(d, devnodes) {
682 _cleanup_free_ char *from = NULL, *to = NULL;
685 from = strappend("/dev/", d);
686 to = strjoin(dest, "/dev/", d, NULL);
690 if (stat(from, &st) < 0) {
692 if (errno != ENOENT) {
693 log_error("Failed to stat %s: %m", from);
697 } else if (!S_ISCHR(st.st_mode) && !S_ISBLK(st.st_mode)) {
699 log_error("%s is not a char or block device, cannot copy", from);
702 } else if (mknod(to, st.st_mode, st.st_rdev) < 0) {
/* NOTE(review): the failing call was mknod(to, ...), but the message prints
 * dest (the container root) -- "to" looks like the intended argument. */
704 log_error("mknod(%s) failed: %m", dest);
/* Create dest/dev/ptmx as a symlink to "pts/ptmx", i.e. to the ptmx node of
 * the devpts instance mounted at /dev/pts by mount_all().
 *
 * dest: path of the container root. A symlink() failure is logged. */
712 static int setup_ptmx(const char *dest) {
713 _cleanup_free_ char *p = NULL;
715 p = strappend(dest, "/dev/ptmx");
719 if (symlink("pts/ptmx", p) < 0) {
720 log_error("Failed to create /dev/ptmx symlink: %m");
/* Make the given TTY available as /dev/console inside the container.
 *
 * dest:    path of the container root.
 * console: path of the TTY to expose (main() passes the ptsname() of the
 *          pty master).
 *
 * The TTY must be a character device. Its access mode and ownership are set
 * to 0600, uid 0, gid 0. A device node with the same type and device number
 * is created at dest/dev/console with mode 0600, and the TTY is bind mounted
 * over it. Every failing step is logged. */
727 static int setup_dev_console(const char *dest, const char *console) {
729 _cleanup_free_ char *to = NULL;
731 _cleanup_umask_ mode_t u;
738 if (stat(console, &st) < 0) {
739 log_error("Failed to stat %s: %m", console);
742 } else if (!S_ISCHR(st.st_mode)) {
743 log_error("/dev/console is not a char device");
747 r = chmod_and_chown(console, 0600, 0, 0);
749 log_error("Failed to correct access mode for TTY: %s", strerror(-r));
753 if (asprintf(&to, "%s/dev/console", dest) < 0)
756 /* We need to bind mount the right tty to /dev/console since
757 * ptys can only exist on pts file systems. To have something
758 * to bind mount things on we create a device node first, that
759 * has the right major/minor (note that the major minor
760 * doesn't actually matter here, since we mount it over
763 if (mknod(to, (st.st_mode & ~07777) | 0600, st.st_rdev) < 0) {
764 log_error("mknod() for /dev/console failed: %m");
768 if (mount(console, to, "bind", MS_BIND, NULL) < 0) {
769 log_error("Bind mount for /dev/console failed: %m");
776 static int setup_kmsg(const char *dest, int kmsg_socket) {
777 _cleanup_free_ char *from = NULL, *to = NULL;
779 _cleanup_umask_ mode_t u;
781 struct cmsghdr cmsghdr;
782 uint8_t buf[CMSG_SPACE(sizeof(int))];
785 .msg_control = &control,
786 .msg_controllen = sizeof(control),
788 struct cmsghdr *cmsg;
791 assert(kmsg_socket >= 0);
795 /* We create the kmsg FIFO as /dev/kmsg, but immediately
796 * delete it after bind mounting it to /proc/kmsg. While FIFOs
797 * on the reading side behave very similar to /proc/kmsg,
798 * their writing side behaves differently from /dev/kmsg in
799 * that writing blocks when nothing is reading. In order to
800 * avoid any problems with containers deadlocking due to this
801 * we simply make /dev/kmsg unavailable to the container. */
/* Summary of the steps below: a FIFO is created at dest/dev/kmsg (mode 0600,
 * uid 0, gid 0) and bind mounted onto dest/proc/kmsg. It is then opened
 * read/write and non-blocking, and the resulting fd is sent over kmsg_socket
 * as an SCM_RIGHTS control message. dest is the container root; kmsg_socket
 * must be a valid fd (asserted above). Each failing step is logged. */
802 if (asprintf(&from, "%s/dev/kmsg", dest) < 0 ||
803 asprintf(&to, "%s/proc/kmsg", dest) < 0)
806 if (mkfifo(from, 0600) < 0) {
807 log_error("mkfifo() for /dev/kmsg failed: %m");
811 r = chmod_and_chown(from, 0600, 0, 0);
813 log_error("Failed to correct access mode for /dev/kmsg: %s", strerror(-r));
817 if (mount(from, to, "bind", MS_BIND, NULL) < 0) {
818 log_error("Bind mount for /proc/kmsg failed: %m");
822 fd = open(from, O_RDWR|O_NDELAY|O_CLOEXEC);
824 log_error("Failed to open fifo: %m");
/* Build a single SCM_RIGHTS control message carrying the FIFO fd. */
828 cmsg = CMSG_FIRSTHDR(&mh);
829 cmsg->cmsg_level = SOL_SOCKET;
830 cmsg->cmsg_type = SCM_RIGHTS;
831 cmsg->cmsg_len = CMSG_LEN(sizeof(int));
832 memcpy(CMSG_DATA(cmsg), &fd, sizeof(int));
834 mh.msg_controllen = cmsg->cmsg_len;
836 /* Store away the fd in the socket, so that it stays open as
837 * long as we run the child */
838 k = sendmsg(kmsg_socket, &mh, MSG_DONTWAIT|MSG_NOSIGNAL);
/* The local copy of the fd is closed right after the send attempt. */
839 close_nointr_nofail(fd);
842 log_error("Failed to send FIFO fd: %m");
846 /* And now make the FIFO unavailable as /dev/kmsg... */
/* Set the host name to arg_machine with sethostname(). main() creates the
 * container process with CLONE_NEWUTS, so this is presumably called inside
 * the new UTS namespace -- the call site is not visible here; confirm. */
851 static int setup_hostname(void) {
853 if (sethostname(arg_machine, strlen(arg_machine)) < 0)
/* Connect the container's journal directory with the host's, as selected by
 * arg_link_journal.
 *
 * directory: path of the container root.
 *
 * The container's machine ID is read from directory/etc/machine-id and
 * parsed; linking is refused when it equals the host's machine ID. The two
 * paths involved are
 *     p = /var/log/journal/<id>               (on the host)
 *     q = directory/var/log/journal/<id>      (inside the container tree)
 * Existing mount points at either path are refused unless the mode is
 * LINK_AUTO. In LINK_GUEST mode p becomes a symlink to q; the final step
 * visible here bind mounts p onto q. Failures are logged. */
859 static int setup_journal(const char *directory) {
860 sd_id128_t machine_id, this_id;
861 _cleanup_free_ char *p = NULL, *b = NULL, *q = NULL, *d = NULL;
865 p = strappend(directory, "/etc/machine-id");
869 r = read_one_line_file(p, &b);
870 if (r == -ENOENT && arg_link_journal == LINK_AUTO)
873 log_error("Failed to read machine ID from %s: %s", p, strerror(-r));
878 if (isempty(id) && arg_link_journal == LINK_AUTO)
881 /* Verify validity */
882 r = sd_id128_from_string(id, &machine_id);
884 log_error("Failed to parse machine ID from %s: %s", p, strerror(-r));
888 r = sd_id128_get_machine(&this_id);
890 log_error("Failed to retrieve machine ID: %s", strerror(-r));
/* Identical IDs are reported as a warning in LINK_AUTO mode and as an error
 * in every other mode. */
894 if (sd_id128_equal(machine_id, this_id)) {
895 log_full(arg_link_journal == LINK_AUTO ? LOG_WARNING : LOG_ERR,
896 "Host and machine ids are equal (%s): refusing to link journals", id);
897 if (arg_link_journal == LINK_AUTO)
903 if (arg_link_journal == LINK_NO)
/* From here on p and q name the host-side and container-side journal
 * directories for this machine ID. */
907 p = strappend("/var/log/journal/", id);
908 q = strjoin(directory, "/var/log/journal/", id, NULL);
912 if (path_is_mount_point(p, false) > 0) {
913 if (arg_link_journal != LINK_AUTO) {
914 log_error("%s: already a mount point, refusing to use for journal", p);
921 if (path_is_mount_point(q, false) > 0) {
922 if (arg_link_journal != LINK_AUTO) {
923 log_error("%s: already a mount point, refusing to use for journal", q);
/* Inspect what currently exists at p: the branches below distinguish the
 * result of reading it as a symlink (including -EINVAL and -ENOENT). */
930 r = readlink_and_make_absolute(p, &d);
932 if ((arg_link_journal == LINK_GUEST ||
933 arg_link_journal == LINK_AUTO) &&
/* NOTE(review): here and further down the mkdir_p() result is stored in r,
 * but the messages print %m (errno) -- verify which one carries the error. */
936 r = mkdir_p(q, 0755);
938 log_warning("failed to create directory %s: %m", q);
943 log_error("Failed to remove symlink %s: %m", p);
946 } else if (r == -EINVAL) {
948 if (arg_link_journal == LINK_GUEST &&
951 if (errno == ENOTDIR) {
952 log_error("%s already exists and is neither a symlink nor a directory", p);
955 log_error("Failed to remove %s: %m", p);
959 } else if (r != -ENOENT) {
960 log_error("readlink(%s) failed: %m", p);
/* Guest mode: the host path p becomes a symlink pointing at q. */
964 if (arg_link_journal == LINK_GUEST) {
966 if (symlink(q, p) < 0) {
967 log_error("Failed to symlink %s to %s: %m", q, p);
971 r = mkdir_p(q, 0755);
973 log_warning("failed to create directory %s: %m", q);
977 if (arg_link_journal == LINK_HOST) {
978 r = mkdir_p(p, 0755);
980 log_error("Failed to create %s: %m", p);
984 } else if (access(p, F_OK) < 0)
/* q has to be empty before something is mounted on top of it. */
987 if (dir_is_empty(q) == 0) {
988 log_error("%s not empty.", q);
992 r = mkdir_p(q, 0755);
994 log_error("Failed to create %s: %m", q);
998 if (mount(p, q, "bind", MS_BIND, NULL) < 0) {
999 log_error("Failed to bind mount journal from host into guest: %m");
/* Expose a kdbus domain inside the container: create the directory
 * dest/dev/kdbus (mode 0755) and bind mount path onto it.
 *
 * dest: path of the container root.
 * path: source of the bind mount (main() passes kdbus_domain).
 *
 * mkdir() and mount() failures are logged. */
1006 static int setup_kdbus(const char *dest, const char *path) {
1012 p = strappenda(dest, "/dev/kdbus");
1013 if (mkdir(p, 0755) < 0) {
1014 log_error("Failed to create kdbus path: %m");
1018 if (mount(path, p, "bind", MS_BIND, NULL) < 0) {
1019 log_error("Failed to mount kdbus domain path: %m");
/* Hand the complement of arg_retain to capability_bounding_set_drop(), i.e.
 * presumably every capability that is not in the retain mask is dropped from
 * the bounding set. Returns that helper's result unchanged; main() treats a
 * negative value as failure. */
1026 static int drop_capabilities(void) {
1027 return capability_bounding_set_drop(~arg_retain, false);
/* Announce the container on the system bus.
 *
 * pid: PID of the container's first process, as passed by main().
 *
 * Calls a method on the org.freedesktop.machine1.Manager interface of
 * org.freedesktop.machine1 (the method name is not visible in this listing
 * -- TODO confirm). The arguments visible here are arg_uuid, arg_directory
 * (or "" when unset) and, only when arg_slice is non-empty, a "Slice"
 * string property. Failures are logged with the bus error message. */
1030 static int register_machine(pid_t pid) {
1031 _cleanup_bus_error_free_ sd_bus_error error = SD_BUS_ERROR_NULL;
1032 _cleanup_bus_unref_ sd_bus *bus = NULL;
1035 r = sd_bus_default_system(&bus);
1037 log_error("Failed to open system bus: %s", strerror(-r));
1041 r = sd_bus_call_method(
1043 "org.freedesktop.machine1",
1044 "/org/freedesktop/machine1",
1045 "org.freedesktop.machine1.Manager",
1051 SD_BUS_MESSAGE_APPEND_ID128(arg_uuid),
1055 strempty(arg_directory),
1056 !isempty(arg_slice), "Slice", "s", arg_slice);
1058 log_error("Failed to register machine: %s", bus_error_message(&error, r));
/* Ask the machine manager on the system bus to terminate the container.
 *
 * pid: PID of the container's first process, as passed by main().
 *
 * Two bus calls are made. The first goes to the
 * org.freedesktop.machine1.Manager interface and yields the machine's object
 * path (read from the reply as type "o"). The second goes to the
 * org.freedesktop.machine1.Machine interface. Failures of either call are
 * logged at debug level only. */
1065 static int terminate_machine(pid_t pid) {
1066 _cleanup_bus_error_free_ sd_bus_error error = SD_BUS_ERROR_NULL;
1067 _cleanup_bus_message_unref_ sd_bus_message *reply = NULL;
1068 _cleanup_bus_unref_ sd_bus *bus = NULL;
1072 r = sd_bus_default_system(&bus);
1074 log_error("Failed to open system bus: %s", strerror(-r));
1078 r = sd_bus_call_method(
1080 "org.freedesktop.machine1",
1081 "/org/freedesktop/machine1",
1082 "org.freedesktop.machine1.Manager",
1089 /* Note that the machine might already have been
1090 * cleaned up automatically, hence don't consider it a
1091 * failure if we cannot get the machine object. */
1092 log_debug("Failed to get machine: %s", bus_error_message(&error, r));
1096 r = sd_bus_message_read(reply, "o", &path);
1098 return bus_log_parse_error(r);
1100 r = sd_bus_call_method(
1102 "org.freedesktop.machine1",
1104 "org.freedesktop.machine1.Machine",
1110 log_debug("Failed to terminate machine: %s", bus_error_message(&error, r));
/* Probe for kernel audit support by opening an AF_NETLINK raw socket for
 * NETLINK_AUDIT; the socket is closed again right away. Presumably returns
 * true when the socket could be created -- the return statements are not
 * visible here; confirm. main() uses the result to warn before booting a
 * container. */
1117 static bool audit_enabled(void) {
1120 fd = socket(AF_NETLINK, SOCK_RAW, NETLINK_AUDIT);
1122 close_nointr_nofail(fd);
/* Entry point.
 *
 * Outline of the visible steps:
 *  - parse the command line, then determine the container directory (-D or
 *    the current directory) and the machine name (-M or the directory's
 *    base name);
 *  - require root and a systemd-booted host, and refuse "/" and directories
 *    that do not look like an OS tree;
 *  - collect passed-in listen fds, allocate a pty, create a kdbus domain,
 *    a socket pair for the kmsg fd and an eventfd for synchronization;
 *  - clone() the container's first process into new namespaces; that
 *    process sets up the terminal, mounts, devices, journal, credentials
 *    and environment, then executes init or the requested command;
 *  - the other process registers the machine, releases the eventfd, runs
 *    process_pty() on the pty master and finally evaluates the exit status.
 *
 * r starts out as EXIT_FAILURE; for a container that exited normally it is
 * set to the container's exit code. */
1128 int main(int argc, char *argv[]) {
1130 int r = EXIT_FAILURE, k;
1131 _cleanup_close_ int master = -1, kdbus_fd = -1, sync_fd = -1;
1133 const char *console = NULL;
1135 _cleanup_close_pipe_ int kmsg_socket_pair[2] = { -1, -1 };
1136 _cleanup_fdset_free_ FDSet *fds = NULL;
1137 _cleanup_free_ char *kdbus_domain = NULL;
1140 log_parse_environment();
1143 k = parse_argv(argc, argv);
/* Resolve the container directory: an explicit one is made absolute, the
 * fallback is the current working directory. */
1151 if (arg_directory) {
1154 p = path_make_absolute_cwd(arg_directory);
1155 free(arg_directory);
1158 arg_directory = get_current_dir_name();
1160 if (!arg_directory) {
1161 log_error("Failed to determine path, please use -D.");
1165 path_kill_slashes(arg_directory);
/* Derive a machine name from the directory's base name and clean it up with
 * hostname_cleanup(). */
1168 arg_machine = strdup(basename(arg_directory));
1174 hostname_cleanup(arg_machine, false);
1175 if (isempty(arg_machine)) {
1176 log_error("Failed to determine machine name automatically, please use -M.");
1181 if (geteuid() != 0) {
1182 log_error("Need to be root.");
1186 if (sd_booted() <= 0) {
1187 log_error("Not running on a systemd system.");
1191 if (arg_boot && audit_enabled()) {
1192 log_warning("The kernel auditing subsystem is known to be incompatible with containers.\n"
1193 "Please make sure to turn off auditing with 'audit=0' on the kernel command\n"
1194 "line before using systemd-nspawn. Sleeping for 5s...\n");
1198 if (path_equal(arg_directory, "/")) {
1199 log_error("Spawning container on root directory not supported.");
1203 if (path_is_os_tree(arg_directory) <= 0) {
1204 log_error("Directory %s doesn't look like an OS root directory (/etc/os-release is missing). Refusing.", arg_directory);
/* File descriptors received through socket activation are collected in fds;
 * further down they are announced via LISTEN_FDS/LISTEN_PID. */
1209 n_fd_passed = sd_listen_fds(false);
1210 if (n_fd_passed > 0) {
1211 k = fdset_new_listen_fds(&fds, false);
1213 log_error("Failed to collect file descriptors: %s", strerror(-k));
1217 fdset_close_others(fds);
/* Allocate the pseudo terminal; its slave side (console) is later passed to
 * setup_dev_console() and opened as the container's stdio. */
1220 master = posix_openpt(O_RDWR|O_NOCTTY|O_CLOEXEC|O_NDELAY);
1222 log_error("Failed to acquire pseudo tty: %m");
1226 console = ptsname(master);
1228 log_error("Failed to determine tty name: %m");
1233 log_info("Spawning container %s on %s. Press ^] three times within 1s to abort execution.", arg_machine, arg_directory);
1235 if (unlockpt(master) < 0) {
1236 log_error("Failed to unlock tty: %m");
1240 ns = strappenda("machine-", arg_machine);
/* NOTE(review): the result is stored in kdbus_fd, but the failure message
 * below formats r, which was initialized to EXIT_FAILURE above -- confirm
 * that r really holds the error at this point. */
1241 kdbus_fd = bus_kernel_create_domain(ns, &kdbus_domain);
1243 log_debug("Failed to create kdbus domain: %s", strerror(-r));
1245 log_debug("Successfully created kdbus domain as %s", kdbus_domain);
1247 if (socketpair(AF_UNIX, SOCK_DGRAM|SOCK_NONBLOCK|SOCK_CLOEXEC, 0, kmsg_socket_pair) < 0) {
1248 log_error("Failed to create kmsg socket pair: %m");
1252 sd_notify(0, "READY=1");
/* Block the signals listed here; the same mask is later handed to
 * process_pty(). */
1254 assert_se(sigemptyset(&mask) == 0);
1255 sigset_add_many(&mask, SIGCHLD, SIGWINCH, SIGTERM, SIGINT, -1);
1256 assert_se(sigprocmask(SIG_BLOCK, &mask, NULL) == 0);
1261 sync_fd = eventfd(0, EFD_CLOEXEC);
1263 log_error("Failed to create event fd: %m");
/* Create the container's first process in new IPC, mount, PID and UTS
 * namespaces, plus a new network namespace when --private-network is set. */
1267 pid = syscall(__NR_clone, SIGCHLD|CLONE_NEWIPC|CLONE_NEWNS|CLONE_NEWPID|CLONE_NEWUTS|(arg_private_network ? CLONE_NEWNET : 0), NULL);
1269 if (errno == EINVAL)
1270 log_error("clone() failed, do you have namespace support enabled in your kernel? (You need UTS, IPC, PID and NET namespacing built in): %m");
1272 log_error("clone() failed: %m");
1279 const char *home = NULL;
1280 uid_t uid = (uid_t) -1;
1281 gid_t gid = (gid_t) -1;
1283 const char *envp[] = {
1284 "PATH=" DEFAULT_PATH_SPLIT_USR,
1285 "container=systemd-nspawn", /* LXC sets container=lxc, so follow the scheme here */
1290 NULL, /* container_uuid */
1291 NULL, /* LISTEN_FDS */
1292 NULL, /* LISTEN_PID */
1298 envp[n_env] = strv_find_prefix(environ, "TERM=");
1302 close_nointr_nofail(master);
1305 close_nointr(STDIN_FILENO);
1306 close_nointr(STDOUT_FILENO);
1307 close_nointr(STDERR_FILENO);
1309 close_nointr_nofail(kmsg_socket_pair[0]);
1310 kmsg_socket_pair[0] = -1;
1312 reset_all_signal_handlers();
1314 assert_se(sigemptyset(&mask) == 0);
1315 assert_se(sigprocmask(SIG_SETMASK, &mask, NULL) == 0);
/* stdin was closed above, so opening the console is expected to yield fd 0;
 * it is then duplicated onto stdout and stderr. */
1317 k = open_terminal(console, O_RDWR);
1318 if (k != STDIN_FILENO) {
1320 close_nointr_nofail(k);
1324 log_error("Failed to open console: %s", strerror(-k));
1328 if (dup2(STDIN_FILENO, STDOUT_FILENO) != STDOUT_FILENO ||
1329 dup2(STDIN_FILENO, STDERR_FILENO) != STDERR_FILENO) {
1330 log_error("Failed to duplicate console: %m");
1335 log_error("setsid() failed: %m");
1339 if (prctl(PR_SET_PDEATHSIG, SIGKILL) < 0) {
1340 log_error("PR_SET_PDEATHSIG failed: %m");
1344 /* Mark everything as slave, so that we still
1345 * receive mounts from the real root, but don't
1346 * propagate mounts to the real root. */
1347 if (mount(NULL, "/", NULL, MS_SLAVE|MS_REC, NULL) < 0) {
1348 log_error("MS_SLAVE|MS_REC failed: %m");
1352 /* Turn directory into bind mount */
1353 if (mount(arg_directory, arg_directory, "bind", MS_BIND|MS_REC, NULL) < 0) {
1354 log_error("Failed to make bind mount.");
1359 if (mount(arg_directory, arg_directory, "bind", MS_BIND|MS_REMOUNT|MS_RDONLY|MS_REC, NULL) < 0) {
1360 log_error("Failed to make read-only.");
/* Populate the container tree; each helper logs its own errors and reports
 * failure with a negative return value. */
1364 if (mount_all(arg_directory) < 0)
1367 if (copy_devnodes(arg_directory) < 0)
1370 if (setup_ptmx(arg_directory) < 0)
1373 dev_setup(arg_directory);
1375 if (setup_dev_console(arg_directory, console) < 0)
1378 if (setup_kmsg(arg_directory, kmsg_socket_pair[1]) < 0)
1381 close_nointr_nofail(kmsg_socket_pair[1]);
1382 kmsg_socket_pair[1] = -1;
1384 if (setup_boot_id(arg_directory) < 0)
1387 if (setup_timezone(arg_directory) < 0)
1390 if (setup_resolv_conf(arg_directory) < 0)
1393 if (setup_journal(arg_directory) < 0)
1396 if (mount_binds(arg_directory, arg_bind, 0) < 0)
1399 if (mount_binds(arg_directory, arg_bind_ro, MS_RDONLY) < 0)
1402 if (setup_kdbus(arg_directory, kdbus_domain) < 0)
/* Switch the root: enter the directory, move its mount onto "/", then
 * chroot() into it and chdir() to the new "/". */
1405 if (chdir(arg_directory) < 0) {
1406 log_error("chdir(%s) failed: %m", arg_directory);
1410 if (mount(arg_directory, "/", NULL, MS_MOVE, NULL) < 0) {
1411 log_error("mount(MS_MOVE) failed: %m");
1415 if (chroot(".") < 0) {
1416 log_error("chroot() failed: %m");
1420 if (chdir("/") < 0) {
1421 log_error("chdir() failed: %m");
1429 if (drop_capabilities() < 0) {
1430 log_error("drop_capabilities() failed: %m");
1436 /* Note that this resolves user names
1437 * inside the container, and hence
1438 * accesses the NSS modules from the
1439 * container and not the host. This is
1442 if (get_user_creds((const char**)&arg_user, &uid, &gid, &home, NULL) < 0) {
1443 log_error("get_user_creds() failed: %m");
1447 if (mkdir_parents_label(home, 0775) < 0) {
1448 log_error("mkdir_parents_label() failed: %m");
1452 if (mkdir_safe_label(home, 0775, uid, gid) < 0) {
1453 log_error("mkdir_safe_label() failed: %m");
1457 if (initgroups((const char*)arg_user, gid) < 0) {
1458 log_error("initgroups() failed: %m");
1462 if (setresgid(gid, gid, gid) < 0) {
1463 log_error("setregid() failed: %m");
1467 if (setresuid(uid, uid, uid) < 0) {
1468 log_error("setreuid() failed: %m");
1472 /* Reset everything fully to 0, just in case */
1474 if (setgroups(0, NULL) < 0) {
1475 log_error("setgroups() failed: %m");
/* NOTE(review): in this branch and in the user-switching branch above the
 * calls are setresgid()/setresuid(), but the error messages name
 * setregid()/setreuid(). */
1479 if (setresgid(0, 0, 0) < 0) {
1480 log_error("setregid() failed: %m");
1484 if (setresuid(0, 0, 0) < 0) {
1485 log_error("setreuid() failed: %m");
/* Fill the remaining envp slots; HOME, USER and LOGNAME fall back to root's
 * values when no user was resolved. */
1490 if ((asprintf((char**)(envp + n_env++), "HOME=%s", home ? home: "/root") < 0) ||
1491 (asprintf((char**)(envp + n_env++), "USER=%s", arg_user ? arg_user : "root") < 0) ||
1492 (asprintf((char**)(envp + n_env++), "LOGNAME=%s", arg_user ? arg_user : "root") < 0)) {
1497 if (!sd_id128_equal(arg_uuid, SD_ID128_NULL)) {
1498 if (asprintf((char**)(envp + n_env++), "container_uuid=" SD_ID128_FORMAT_STR, SD_ID128_FORMAT_VAL(arg_uuid)) < 0) {
/* Passed-in fds have O_CLOEXEC cleared so they survive the exec, and are
 * announced with LISTEN_FDS; LISTEN_PID is fixed to 1. */
1504 if (fdset_size(fds) > 0) {
1505 k = fdset_cloexec(fds, false);
1507 log_error("Failed to unset O_CLOEXEC for file descriptors.");
1511 if ((asprintf((char **)(envp + n_env++), "LISTEN_FDS=%u", n_fd_passed) < 0) ||
1512 (asprintf((char **)(envp + n_env++), "LISTEN_PID=1") < 0)) {
/* Wait on the eventfd; the matching eventfd_write() further down follows
 * the register_machine() call. */
1520 eventfd_read(sync_fd, &x);
1521 close_nointr_nofail(sync_fd);
1524 if (!strv_isempty(arg_setenv)) {
1527 n = strv_env_merge(2, envp, arg_setenv);
1535 env_use = (char**) envp;
1538 if (arg_process_label)
1539 if (setexeccon(arg_process_label) < 0)
1540 log_error("setexeccon(\"%s\") failed: %m", arg_process_label);
1546 /* Automatically search for the init system */
/* Candidate init binaries are tried in order; execve() only returns on
 * failure, so the next candidate runs only if the previous one could not be
 * executed. The remaining command line arguments are passed along. */
1548 l = 1 + argc - optind;
1549 a = newa(char*, l + 1);
1550 memcpy(a + 1, argv + optind, l * sizeof(char*));
1552 a[0] = (char*) "/usr/lib/systemd/systemd";
1553 execve(a[0], a, env_use);
1555 a[0] = (char*) "/lib/systemd/systemd";
1556 execve(a[0], a, env_use);
1558 a[0] = (char*) "/sbin/init";
1559 execve(a[0], a, env_use);
1560 } else if (argc > optind)
1561 execvpe(argv[optind], argv + optind, env_use);
/* No command given: start a login shell (argv[0] "-bash") from the home
 * directory. The chdir() result is not checked. */
1563 chdir(home ? home : "/root");
1564 execle("/bin/bash", "-bash", NULL, env_use);
1567 log_error("execv() failed: %m");
1570 _exit(EXIT_FAILURE);
1576 r = register_machine(pid);
1580 eventfd_write(sync_fd, 1);
1581 close_nointr_nofail(sync_fd);
1584 k = process_pty(master, &mask, arg_boot ? pid : 0, SIGRTMIN+3);
1593 /* Kill if it is not dead yet anyway */
1594 terminate_machine(pid);
1596 /* Redundant, but better safe than sorry */
1599 k = wait_for_terminate(pid, &status);
/* Interpret how the container's first process ended. A normal exit yields
 * its exit code in r. Death by SIGINT is logged as a shutdown and death by
 * SIGHUP as a reboot. Any other signal or a core dump is an error. */
1607 if (status.si_code == CLD_EXITED) {
1608 r = status.si_status;
1609 if (status.si_status != 0) {
1610 log_error("Container %s failed with error code %i.", arg_machine, status.si_status);
1615 log_debug("Container %s exited successfully.", arg_machine);
1617 } else if (status.si_code == CLD_KILLED &&
1618 status.si_status == SIGINT) {
1621 log_info("Container %s has been shut down.", arg_machine);
1624 } else if (status.si_code == CLD_KILLED &&
1625 status.si_status == SIGHUP) {
1628 log_info("Container %s is being rebooted.", arg_machine);
1630 } else if (status.si_code == CLD_KILLED ||
1631 status.si_code == CLD_DUMPED) {
1633 log_error("Container %s terminated by signal %s.", arg_machine, signal_to_string(status.si_status));
1637 log_error("Container %s failed due to unknown reason.", arg_machine);
1647 free(arg_directory);