1 /*-*- Mode: C; c-basic-offset: 8; indent-tabs-mode: nil -*-*/
4 This file is part of systemd.
6 Copyright 2010 Lennart Poettering
8 systemd is free software; you can redistribute it and/or modify it
9 under the terms of the GNU Lesser General Public License as published by
10 the Free Software Foundation; either version 2.1 of the License, or
11 (at your option) any later version.
13 systemd is distributed in the hope that it will be useful, but
14 WITHOUT ANY WARRANTY; without even the implied warranty of
15 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
16 Lesser General Public License for more details.
18 You should have received a copy of the GNU Lesser General Public License
19 along with systemd; If not, see <http://www.gnu.org/licenses/>.
25 #include <sys/types.h>
26 #include <sys/syscall.h>
27 #include <sys/mount.h>
33 #include <sys/prctl.h>
34 #include <sys/capability.h>
37 #include <sys/signalfd.h>
41 #include <sys/socket.h>
42 #include <linux/netlink.h>
43 #include <sys/eventfd.h>
45 #include <selinux/selinux.h>
48 #include "sd-daemon.h"
57 #include "cgroup-util.h"
59 #include "path-util.h"
60 #include "loopback-setup.h"
61 #include "dev-setup.h"
66 #include "bus-error.h"
68 #include "bus-kernel.h"
/* Journal linking mode selected with --link-journal / -j. The enumerators are
 * not visible in this excerpt; LINK_NO, LINK_AUTO, LINK_GUEST and LINK_HOST
 * are the values used further down. */
72 typedef enum LinkJournal {
/* Command line settings: written by parse_argv(), read by main() and the
 * setup_*() helpers. */
79 static char *arg_directory = NULL;
80 static char *arg_user = NULL;
81 static sd_id128_t arg_uuid = {};
82 static char *arg_machine = NULL;
83 static char *process_label = NULL;
84 static char *file_label = NULL;
85 static const char *arg_slice = NULL;
86 static bool arg_private_network = false;
87 static bool arg_read_only = false;
88 static bool arg_boot = false;
89 static LinkJournal arg_link_journal = LINK_AUTO;
/* Bitmask of capabilities to keep, one bit per CAP_* number. --capability sets
 * bits, --drop-capability clears them, and drop_capabilities() passes the
 * complement to capability_bounding_set_drop(). */
90 static uint64_t arg_retain =
92 (1ULL << CAP_DAC_OVERRIDE) |
93 (1ULL << CAP_DAC_READ_SEARCH) |
94 (1ULL << CAP_FOWNER) |
95 (1ULL << CAP_FSETID) |
96 (1ULL << CAP_IPC_OWNER) |
99 (1ULL << CAP_LINUX_IMMUTABLE) |
100 (1ULL << CAP_NET_BIND_SERVICE) |
101 (1ULL << CAP_NET_BROADCAST) |
102 (1ULL << CAP_NET_RAW) |
103 (1ULL << CAP_SETGID) |
104 (1ULL << CAP_SETFCAP) |
105 (1ULL << CAP_SETPCAP) |
106 (1ULL << CAP_SETUID) |
107 (1ULL << CAP_SYS_ADMIN) |
108 (1ULL << CAP_SYS_CHROOT) |
109 (1ULL << CAP_SYS_NICE) |
110 (1ULL << CAP_SYS_PTRACE) |
111 (1ULL << CAP_SYS_TTY_CONFIG) |
112 (1ULL << CAP_SYS_RESOURCE) |
113 (1ULL << CAP_SYS_BOOT) |
114 (1ULL << CAP_AUDIT_WRITE) |
115 (1ULL << CAP_AUDIT_CONTROL) |
/* --bind / --bind-ro: flat lists of (source, destination) path pairs, consumed
 * pairwise by mount_binds(). */
117 static char **arg_bind = NULL;
118 static char **arg_bind_ro = NULL;
/* --setenv assignments; main() merges them into the environment handed to the
 * executed program. */
119 static char **arg_setenv = NULL;
/* Print the usage summary for the program (named via
 * program_invocation_short_name) to stdout.
 * NOTE(review): the return statement is not visible in this excerpt. */
121 static int help(void) {
123 printf("%s [OPTIONS...] [PATH] [ARGUMENTS...]\n\n"
124 "Spawn a minimal namespace container for debugging, testing and building.\n\n"
125 " -h --help Show this help\n"
126 " --version Print version string\n"
127 " -D --directory=NAME Root directory for the container\n"
128 " -b --boot Boot up full system (i.e. invoke init)\n"
129 " -u --user=USER Run the command under specified user or uid\n"
130 " --uuid=UUID Set a specific machine UUID for the container\n"
131 " -M --machine=NAME Set the machine name for the container\n"
132 " -S --slice=SLICE Place the container in the specified slice\n"
133 " -L --file-label=LABEL Set the MAC file label to be used by tmpfs file systems in container\n"
134 " -Z --process-label=LABEL Set the MAC label to be used by processes in container\n"
135 " --private-network Disable network in container\n"
136 " --read-only Mount the root directory read-only\n"
137 " --capability=CAP In addition to the default, retain specified\n"
139 " --drop-capability=CAP Drop the specified capability from the default set\n"
140 " --link-journal=MODE Link up guest journal, one of no, auto, guest, host\n"
/* Keep in sync with parse_argv(): the short journal option selects LINK_GUEST. */
141 " -j Equivalent to --link-journal=guest\n"
142 " --bind=PATH[:PATH] Bind mount a file or directory from the host into\n"
144 " --bind-ro=PATH[:PATH] Similar, but creates a read-only bind mount\n"
145 " --setenv=NAME=VALUE Pass an environment variable to PID 1\n",
146 program_invocation_short_name);
/* Parse the command line into the arg_* / *_label globals above.
 * NOTE(review): several case labels, the error returns and the final return
 * are not visible in this excerpt, so the return convention cannot be stated
 * from here; main() stores the result in k. */
151 static int parse_argv(int argc, char *argv[]) {
166 static const struct option options[] = {
167 { "help", no_argument, NULL, 'h' },
168 { "version", no_argument, NULL, ARG_VERSION },
169 { "directory", required_argument, NULL, 'D' },
170 { "user", required_argument, NULL, 'u' },
171 { "private-network", no_argument, NULL, ARG_PRIVATE_NETWORK },
172 { "boot", no_argument, NULL, 'b' },
173 { "uuid", required_argument, NULL, ARG_UUID },
174 { "read-only", no_argument, NULL, ARG_READ_ONLY },
175 { "capability", required_argument, NULL, ARG_CAPABILITY },
176 { "drop-capability", required_argument, NULL, ARG_DROP_CAPABILITY },
177 { "link-journal", required_argument, NULL, ARG_LINK_JOURNAL },
178 { "bind", required_argument, NULL, ARG_BIND },
179 { "bind-ro", required_argument, NULL, ARG_BIND_RO },
180 { "machine", required_argument, NULL, 'M' },
181 { "slice", required_argument, NULL, 'S' },
182 { "setenv", required_argument, NULL, ARG_SETENV },
183 { "process-label", required_argument, NULL, 'Z' },
184 { "file-label", required_argument, NULL, 'L' },
/* The leading '+' in the option string makes getopt stop at the first
 * non-option argument, so options meant for the command run inside the
 * container are not consumed here. */
193 while ((c = getopt_long(argc, argv, "+hD:u:bL:M:jS:Z:", options, NULL)) >= 0) {
201 puts(PACKAGE_STRING);
202 puts(SYSTEMD_FEATURES);
/* canonicalize_file_name() returns a newly allocated path, or NULL with
 * errno set (reported through %m). */
207 arg_directory = canonicalize_file_name(optarg);
208 if (!arg_directory) {
209 log_error("Invalid root directory: %m");
217 arg_user = strdup(optarg);
223 case ARG_PRIVATE_NETWORK:
224 arg_private_network = true;
232 r = sd_id128_from_string(optarg, &arg_uuid);
234 log_error("Invalid UUID: %s", optarg);
/* NOTE(review): arg_slice is declared 'const char *' but receives a heap
 * copy here. */
240 arg_slice = strdup(optarg);
247 if (!hostname_is_valid(optarg)) {
248 log_error("Invalid machine name: %s", optarg);
253 arg_machine = strdup(optarg);
260 file_label = strdup(optarg);
267 process_label = strdup(optarg);
274 arg_read_only = true;
/* --capability and --drop-capability share this handler (the ARG_CAPABILITY
 * label is presumably on a line not shown): the argument is a comma
 * separated list of capability names. */
278 case ARG_DROP_CAPABILITY: {
282 FOREACH_WORD_SEPARATOR(word, length, optarg, ",", state) {
286 t = strndup(word, length);
290 if (cap_from_name(t, &cap) < 0) {
291 log_error("Failed to parse capability %s.", t);
/* Set the bit for --capability, clear it otherwise. */
298 if (c == ARG_CAPABILITY)
299 arg_retain |= 1ULL << (uint64_t) cap;
301 arg_retain &= ~(1ULL << (uint64_t) cap);
/* NOTE(review): the case label is not visible; presumably this is the 'j'
 * short option from the option string above. */
308 arg_link_journal = LINK_GUEST;
311 case ARG_LINK_JOURNAL:
312 if (streq(optarg, "auto"))
313 arg_link_journal = LINK_AUTO;
314 else if (streq(optarg, "no"))
315 arg_link_journal = LINK_NO;
316 else if (streq(optarg, "guest"))
317 arg_link_journal = LINK_GUEST;
318 else if (streq(optarg, "host"))
319 arg_link_journal = LINK_HOST;
321 log_error("Failed to parse link journal mode %s", optarg);
/* --bind / --bind-ro: the argument is split at the first ':' into source (a)
 * and destination (b); both must be absolute and are appended as a pair to
 * the matching list. How b is set when there is no ':' is not visible here. */
329 _cleanup_free_ char *a = NULL, *b = NULL;
333 x = c == ARG_BIND ? &arg_bind : &arg_bind_ro;
335 e = strchr(optarg, ':');
337 a = strndup(optarg, e - optarg);
347 if (!path_is_absolute(a) || !path_is_absolute(b)) {
348 log_error("Invalid bind mount specification: %s", optarg);
352 r = strv_extend(x, a);
356 r = strv_extend(x, b);
366 if (!env_assignment_is_valid(optarg)) {
367 log_error("Environment variable assignment '%s' is not valid.", optarg);
/* strv_env_set() yields a new list in n; the previous list is released
 * below (the assignment of n to arg_setenv is presumably on a hidden line). */
371 n = strv_env_set(arg_setenv, optarg);
375 strv_free(arg_setenv);
384 assert_not_reached("Unhandled option");
/* Mount the file systems listed in mount_table below the container root
 * 'dest'. Each entry gives source, target path, file system type, mount
 * options, mount flags and a flag saying whether a mount failure is treated
 * as an error (used as .what/.where/.options/.flags/.fatal below; the struct
 * body is not visible in this excerpt). */
391 static int mount_all(const char *dest) {
393 typedef struct MountPoint {
402 static const MountPoint mount_table[] = {
403 { "proc", "/proc", "proc", NULL, MS_NOSUID|MS_NOEXEC|MS_NODEV, true },
404 { "/proc/sys", "/proc/sys", NULL, NULL, MS_BIND, true }, /* Bind mount first */
405 { NULL, "/proc/sys", NULL, NULL, MS_BIND|MS_RDONLY|MS_REMOUNT, true }, /* Then, make it r/o */
406 { "sysfs", "/sys", "sysfs", NULL, MS_RDONLY|MS_NOSUID|MS_NOEXEC|MS_NODEV, true },
407 { "tmpfs", "/dev", "tmpfs", "mode=755", MS_NOSUID|MS_STRICTATIME, true },
408 { "devpts", "/dev/pts", "devpts","newinstance,ptmxmode=0666,mode=620,gid=" STRINGIFY(TTY_GID), MS_NOSUID|MS_NOEXEC, true },
409 { "tmpfs", "/dev/shm", "tmpfs", "mode=1777", MS_NOSUID|MS_NODEV|MS_STRICTATIME, true },
410 { "tmpfs", "/run", "tmpfs", "mode=755", MS_NOSUID|MS_NODEV|MS_STRICTATIME, true },
412 { "/sys/fs/selinux", "/sys/fs/selinux", NULL, NULL, MS_BIND, false }, /* Bind mount first */
413 { NULL, "/sys/fs/selinux", NULL, NULL, MS_BIND|MS_RDONLY|MS_REMOUNT, false }, /* Then, make it r/o */
420 for (k = 0; k < ELEMENTSOF(mount_table); k++) {
421 _cleanup_free_ char *where = NULL;
422 _cleanup_free_ char *options = NULL;
/* Target path inside the container tree. */
425 where = strjoin(dest, "/", mount_table[k].where, NULL);
429 t = path_is_mount_point(where, true);
431 log_error("Failed to detect whether %s is a mount point: %s", where, strerror(-t));
/* Entries with a NULL source are the remount steps; they are still applied
 * when the target already is a mount point. */
439 /* Skip this entry if it is not a remount. */
440 if (mount_table[k].what && t > 0)
/* The result of mkdir_p() is not checked here. */
443 mkdir_p(where, 0755);
/* With --file-label set, tmpfs and devpts mounts get a context="..." option
 * appended to their option string. */
446 if (file_label && (streq_ptr(mount_table[k].what, "tmpfs") ||
447 streq_ptr(mount_table[k].what, "devpts")))
448 options = strjoin(mount_table[k].options, ",context=\"", file_label, "\"", NULL);
451 options = strjoin(mount_table[k].options, NULL);
/* A failed mount is only reported when the entry is marked fatal. */
456 if (mount(mount_table[k].what,
459 mount_table[k].flags,
461 mount_table[k].fatal) {
463 log_error("mount(%s) failed: %m", where);
/* Bind mount every (source, destination) pair of 'l' below the container
 * root 'dest'. When 'flags' is non-zero the new mount is remounted with
 * MS_REMOUNT|MS_BIND|flags; main() passes MS_RDONLY for the --bind-ro list
 * and 0 for the --bind list. */
473 static int mount_binds(const char *dest, char **l, unsigned long flags) {
476 STRV_FOREACH_PAIR(x, y, l) {
478 struct stat source_st, dest_st;
481 if (stat(*x, &source_st) < 0) {
482 log_error("failed to stat %s: %m", *x);
/* Destination path as seen from the host: container root + destination. */
486 where = strappenda(dest, *y);
487 r = stat(where, &dest_st);
/* Existing destination: source and destination must be of the same file type. */
489 if ((source_st.st_mode & S_IFMT) != (dest_st.st_mode & S_IFMT)) {
490 log_error("The file types of %s and %s do not match. Refusing bind mount",
/* Missing destination: create the parent directories first. */
494 } else if (errno == ENOENT) {
495 r = mkdir_parents_label(where, 0755);
497 log_error("Failed to bind mount %s: %s", *x, strerror(-r));
501 log_error("Failed to bind mount %s: %s", *x, strerror(errno));
504 /* Create the mount point, but be conservative -- refuse to create block
505 * and char devices. */
506 if (S_ISDIR(source_st.st_mode))
507 mkdir_label(where, 0755);
508 else if (S_ISFIFO(source_st.st_mode))
510 else if (S_ISSOCK(source_st.st_mode))
511 mknod(where, 0644 | S_IFSOCK, 0);
512 else if (S_ISREG(source_st.st_mode))
515 log_error("Refusing to create mountpoint for file: %s", *x);
519 if (mount(*x, where, "bind", MS_BIND, NULL) < 0) {
520 log_error("mount(%s) failed: %m", where);
/* Extra flags are applied with a second remount call on the bind mount. */
524 if (flags && mount(NULL, where, NULL, MS_REMOUNT|MS_BIND|flags, NULL) < 0) {
525 log_error("mount(%s) failed: %m", where);
/* Make <dest>/etc/localtime a symlink to the same zone the host's
 * /etc/localtime points to. Situations that cannot be handled (host file is
 * not a symlink, link does not point into /usr/share/zoneinfo/, zone missing
 * in the container) only produce a warning.
 * NOTE(review): the return statements are not visible in this excerpt. */
533 static int setup_timezone(const char *dest) {
534 _cleanup_free_ char *where = NULL, *p = NULL, *q = NULL, *check = NULL, *what = NULL;
540 /* Fix the timezone, if possible */
541 r = readlink_malloc("/etc/localtime", &p);
543 log_warning("/etc/localtime is not a symlink, not updating container timezone.");
/* z: zone name of the host, i.e. the part of the link target after the
 * zoneinfo prefix (relative or absolute form). */
547 z = path_startswith(p, "../usr/share/zoneinfo/");
549 z = path_startswith(p, "/usr/share/zoneinfo/");
551 log_warning("/etc/localtime does not point into /usr/share/zoneinfo/, not updating container timezone.");
555 where = strappend(dest, "/etc/localtime");
/* y: zone name currently configured in the container, extracted the same way. */
559 r = readlink_malloc(where, &q);
561 y = path_startswith(q, "../usr/share/zoneinfo/");
563 y = path_startswith(q, "/usr/share/zoneinfo/");
566 /* Already pointing to the right place? Then do nothing .. */
567 if (y && streq(y, z))
/* Only switch when the zone file exists inside the container tree. */
571 check = strjoin(dest, "/usr/share/zoneinfo/", z, NULL);
575 if (access(check, F_OK) < 0) {
576 log_warning("Timezone %s does not exist in container, not updating container timezone.", z);
/* The new link uses the relative "../usr/share/zoneinfo/" form. */
580 what = strappend("../usr/share/zoneinfo/", z);
585 if (symlink(what, where) < 0) {
586 log_error("Failed to correct timezone of container: %m");
/* Copy the host's /etc/resolv.conf to <dest>/etc/resolv.conf. The copy is
 * best effort: its result is ignored. arg_private_network is checked first;
 * the statement guarded by that check is not visible in this excerpt
 * (presumably an early return). */
593 static int setup_resolv_conf(const char *dest) {
594 char _cleanup_free_ *where = NULL;
598 if (arg_private_network)
601 /* Fix resolv.conf, if possible */
602 where = strappend(dest, "/etc/resolv.conf");
606 /* We don't really care for the results of this really. If it
607 * fails, it fails, but meh... */
608 copy_file("/etc/resolv.conf", where, O_TRUNC|O_NOFOLLOW);
/* Give the container its own boot id: write a freshly randomized id to a
 * file below <dest>/dev and bind mount that file over
 * <dest>/proc/sys/kernel/random/boot_id. */
613 static int setup_boot_id(const char *dest) {
614 _cleanup_free_ char *from = NULL, *to = NULL;
621 /* Generate a new randomized boot ID, so that each boot-up of
622 * the container gets a new one */
/* from: backing file holding the id, to: path it is mounted over. */
624 from = strappend(dest, "/dev/proc-sys-kernel-random-boot-id");
625 to = strappend(dest, "/proc/sys/kernel/random/boot_id");
629 r = sd_id128_randomize(&rnd);
631 log_error("Failed to generate random boot id: %s", strerror(-r));
/* Format the 128 bit id as hex digits grouped 8-4-4-4-12. */
635 snprintf(as_uuid, sizeof(as_uuid),
636 "%02x%02x%02x%02x-%02x%02x-%02x%02x-%02x%02x-%02x%02x%02x%02x%02x%02x",
637 SD_ID128_FORMAT_VAL(rnd));
638 char_array_0(as_uuid);
640 r = write_string_file(from, as_uuid);
642 log_error("Failed to write boot id: %s", strerror(-r));
/* Failing to bind mount is logged as an error; failing to make the mount
 * read-only afterwards is only a warning. */
646 if (mount(from, to, "bind", MS_BIND, NULL) < 0) {
647 log_error("Failed to bind mount boot id: %m");
649 } else if (mount(from, to, "bind", MS_BIND|MS_REMOUNT|MS_RDONLY, NULL))
650 log_warning("Failed to make boot id read-only: %m");
/* Recreate a fixed list of host device nodes (the NUL separated list
 * 'devnodes', contents not visible in this excerpt) below <dest>/dev, using
 * the mode and device number of the host node. A node missing on the host
 * (ENOENT) is not reported as an error.
 * NOTE(review): the return statements are not visible in this excerpt. */
656 static int copy_devnodes(const char *dest) {
658 static const char devnodes[] =
668 _cleanup_umask_ mode_t u;
674 NULSTR_FOREACH(d, devnodes) {
675 _cleanup_free_ char *from = NULL, *to = NULL;
/* from: node on the host, to: node to create in the container tree. */
678 from = strappend("/dev/", d);
679 to = strjoin(dest, "/dev/", d, NULL);
683 if (stat(from, &st) < 0) {
685 if (errno != ENOENT) {
686 log_error("Failed to stat %s: %m", from);
690 } else if (!S_ISCHR(st.st_mode) && !S_ISBLK(st.st_mode)) {
692 log_error("%s is not a char or block device, cannot copy", from);
695 } else if (mknod(to, st.st_mode, st.st_rdev) < 0) {
/* Report the node that could not be created, not the container root. */
697 log_error("mknod(%s) failed: %m", to);
/* Create <dest>/dev/ptmx as a symlink to "pts/ptmx", i.e. to the ptmx node
 * of the devpts instance mounted on <dest>/dev/pts by mount_all(). */
705 static int setup_ptmx(const char *dest) {
706 _cleanup_free_ char *p = NULL;
708 p = strappend(dest, "/dev/ptmx");
712 if (symlink("pts/ptmx", p) < 0) {
713 log_error("Failed to create /dev/ptmx symlink: %m");
/* Make the tty 'console' available as <dest>/dev/console: check that it is a
 * character device, set its mode/owner to 0600 root:root, create a device
 * node at the target path and bind mount the tty over it. */
720 static int setup_dev_console(const char *dest, const char *console) {
722 _cleanup_free_ char *to = NULL;
724 _cleanup_umask_ mode_t u;
731 if (stat(console, &st) < 0) {
732 log_error("Failed to stat %s: %m", console);
735 } else if (!S_ISCHR(st.st_mode)) {
736 log_error("/dev/console is not a char device");
740 r = chmod_and_chown(console, 0600, 0, 0);
742 log_error("Failed to correct access mode for TTY: %s", strerror(-r));
746 if (asprintf(&to, "%s/dev/console", dest) < 0)
749 /* We need to bind mount the right tty to /dev/console since
750 * ptys can only exist on pts file systems. To have something
751 * to bind mount things on we create a device node first, that
752 * has the right major/minor (note that the major minor
753 * doesn't actually matter here, since we mount it over
756 if (mknod(to, (st.st_mode & ~07777) | 0600, st.st_rdev) < 0) {
757 log_error("mknod() for /dev/console failed: %m");
761 if (mount(console, to, "bind", MS_BIND, NULL) < 0) {
762 log_error("Bind mount for /dev/console failed: %m");
769 static int setup_kmsg(const char *dest, int kmsg_socket) {
770 _cleanup_free_ char *from = NULL, *to = NULL;
772 _cleanup_umask_ mode_t u;
774 struct cmsghdr cmsghdr;
775 uint8_t buf[CMSG_SPACE(sizeof(int))];
778 .msg_control = &control,
779 .msg_controllen = sizeof(control),
781 struct cmsghdr *cmsg;
784 assert(kmsg_socket >= 0);
788 /* We create the kmsg FIFO as /dev/kmsg, but immediately
789 * delete it after bind mounting it to /proc/kmsg. While FIFOs
790 * on the reading side behave very similar to /proc/kmsg,
791 * their writing side behaves differently from /dev/kmsg in
792 * that writing blocks when nothing is reading. In order to
793 * avoid any problems with containers deadlocking due to this
794 * we simply make /dev/kmsg unavailable to the container. */
/* Summary of this function: create a FIFO at <dest>/dev/kmsg, bind mount it
 * to <dest>/proc/kmsg, open it and pass the open descriptor over
 * 'kmsg_socket' as SCM_RIGHTS ancillary data. */
795 if (asprintf(&from, "%s/dev/kmsg", dest) < 0 ||
796 asprintf(&to, "%s/proc/kmsg", dest) < 0)
799 if (mkfifo(from, 0600) < 0) {
800 log_error("mkfifo() for /dev/kmsg failed: %m");
804 r = chmod_and_chown(from, 0600, 0, 0);
806 log_error("Failed to correct access mode for /dev/kmsg: %s", strerror(-r));
810 if (mount(from, to, "bind", MS_BIND, NULL) < 0) {
811 log_error("Bind mount for /proc/kmsg failed: %m");
815 fd = open(from, O_RDWR|O_NDELAY|O_CLOEXEC);
817 log_error("Failed to open fifo: %m");
/* Build a single SCM_RIGHTS control message carrying fd. */
821 cmsg = CMSG_FIRSTHDR(&mh);
822 cmsg->cmsg_level = SOL_SOCKET;
823 cmsg->cmsg_type = SCM_RIGHTS;
824 cmsg->cmsg_len = CMSG_LEN(sizeof(int));
825 memcpy(CMSG_DATA(cmsg), &fd, sizeof(int));
827 mh.msg_controllen = cmsg->cmsg_len;
829 /* Store away the fd in the socket, so that it stays open as
830 * long as we run the child */
831 k = sendmsg(kmsg_socket, &mh, MSG_DONTWAIT|MSG_NOSIGNAL);
/* The local descriptor is closed right after sending, whatever the result. */
832 close_nointr_nofail(fd);
835 log_error("Failed to send FIFO fd: %m");
839 /* And now make the FIFO unavailable as /dev/kmsg... */
/* Set the host name to arg_machine via sethostname(); main() calls
 * hostname_cleanup() on the name beforehand.
 * NOTE(review): what is returned on failure is not visible in this excerpt. */
844 static int setup_hostname(void) {
846 if (sethostname(arg_machine, strlen(arg_machine)) < 0)
/* Connect the container's journal directory with the host according to
 * arg_link_journal. The container's machine id is read from
 * <directory>/etc/machine-id and names the journal directory on both sides:
 * /var/log/journal/<id> on the host (p) and the same path below 'directory'
 * (q). For LINK_GUEST the host path becomes a symlink to the container
 * directory; the final bind mount goes from the host directory into the
 * container. In LINK_AUTO mode several problems are tolerated instead of
 * being reported as errors.
 * NOTE(review): many conditions and all return statements are missing from
 * this excerpt; the comments below only describe the visible lines. */
852 static int setup_journal(const char *directory) {
853 sd_id128_t machine_id, this_id;
854 _cleanup_free_ char *p = NULL, *b = NULL, *q = NULL, *d = NULL;
858 p = strappend(directory, "/etc/machine-id");
862 r = read_one_line_file(p, &b);
863 if (r == -ENOENT && arg_link_journal == LINK_AUTO)
866 log_error("Failed to read machine ID from %s: %s", p, strerror(-r));
871 if (isempty(id) && arg_link_journal == LINK_AUTO)
874 /* Verify validity */
875 r = sd_id128_from_string(id, &machine_id);
877 log_error("Failed to parse machine ID from %s: %s", p, strerror(-r));
881 r = sd_id128_get_machine(&this_id);
883 log_error("Failed to retrieve machine ID: %s", strerror(-r));
/* Host and container sharing a machine id: logged as a warning in auto mode
 * and as an error otherwise. */
887 if (sd_id128_equal(machine_id, this_id)) {
888 log_full(arg_link_journal == LINK_AUTO ? LOG_WARNING : LOG_ERR,
889 "Host and machine ids are equal (%s): refusing to link journals", id);
890 if (arg_link_journal == LINK_AUTO)
896 if (arg_link_journal == LINK_NO)
/* From here on p is the journal directory on the host and q the one inside
 * the container tree. */
900 p = strappend("/var/log/journal/", id);
901 q = strjoin(directory, "/var/log/journal/", id, NULL);
905 if (path_is_mount_point(p, false) > 0) {
906 if (arg_link_journal != LINK_AUTO) {
907 log_error("%s: already a mount point, refusing to use for journal", p);
914 if (path_is_mount_point(q, false) > 0) {
915 if (arg_link_journal != LINK_AUTO) {
916 log_error("%s: already a mount point, refusing to use for journal", q);
/* Inspect what currently exists at the host path: -EINVAL and -ENOENT from
 * the readlink helper are handled separately below. */
923 r = readlink_and_make_absolute(p, &d);
925 if ((arg_link_journal == LINK_GUEST ||
926 arg_link_journal == LINK_AUTO) &&
/* NOTE(review): the mkdir_p() result is kept in r while the message prints
 * errno through %m -- confirm the two agree. */
929 r = mkdir_p(q, 0755);
931 log_warning("failed to create directory %s: %m", q);
936 log_error("Failed to remove symlink %s: %m", p);
939 } else if (r == -EINVAL) {
941 if (arg_link_journal == LINK_GUEST &&
944 if (errno == ENOTDIR) {
945 log_error("%s already exists and is neither a symlink nor a directory", p);
948 log_error("Failed to remove %s: %m", p);
952 } else if (r != -ENOENT) {
953 log_error("readlink(%s) failed: %m", p);
/* Guest mode: the host path becomes a symlink to the container's directory. */
957 if (arg_link_journal == LINK_GUEST) {
959 if (symlink(q, p) < 0) {
960 log_error("Failed to symlink %s to %s: %m", q, p);
964 r = mkdir_p(q, 0755);
966 log_warning("failed to create directory %s: %m", q);
/* Host mode creates the host directory; otherwise its existence is only
 * probed with access(). */
970 if (arg_link_journal == LINK_HOST) {
971 r = mkdir_p(p, 0755);
973 log_error("Failed to create %s: %m", p);
977 } else if (access(p, F_OK) < 0)
/* dir_is_empty() returning 0 for the container directory is reported as
 * "not empty". */
980 if (dir_is_empty(q) == 0) {
981 log_error("%s not empty.", q);
985 r = mkdir_p(q, 0755);
987 log_error("Failed to create %s: %m", q);
991 if (mount(p, q, "bind", MS_BIND, NULL) < 0) {
992 log_error("Failed to bind mount journal from host into guest: %m");
/* Create the directory <dest>/dev/kdbus and bind mount 'path' onto it;
 * main() passes the kdbus domain path it obtained from
 * bus_kernel_create_domain().
 * NOTE(review): any handling of a NULL 'path' is not visible in this excerpt. */
999 static int setup_kdbus(const char *dest, const char *path) {
1005 p = strappenda(dest, "/dev/kdbus");
1006 if (mkdir(p, 0755) < 0) {
1007 log_error("Failed to create kdbus path: %m");
1011 if (mount(path, p, "bind", MS_BIND, NULL) < 0) {
1012 log_error("Failed to mount kdbus domain path: %m");
/* Drop every capability whose bit is not set in arg_retain by passing the
 * complement of the mask to capability_bounding_set_drop(); returns that
 * helper's result. */
1019 static int drop_capabilities(void) {
1020 return capability_bounding_set_drop(~arg_retain, false);
/* Register the container with the org.freedesktop.machine1 service on the
 * system bus through a method call on its Manager interface. The method
 * name, its signature and the use of 'pid' are on lines not visible in this
 * excerpt; the visible arguments include arg_uuid and arg_directory. */
1023 static int register_machine(pid_t pid) {
1024 _cleanup_bus_error_free_ sd_bus_error error = SD_BUS_ERROR_NULL;
1025 _cleanup_bus_unref_ sd_bus *bus = NULL;
1028 r = sd_bus_open_system(&bus);
1030 log_error("Failed to open system bus: %s", strerror(-r));
1034 r = sd_bus_call_method(
1036 "org.freedesktop.machine1",
1037 "/org/freedesktop/machine1",
1038 "org.freedesktop.machine1.Manager",
1044 SD_BUS_MESSAGE_APPEND_ID128(arg_uuid),
1048 strempty(arg_directory),
/* The leading boolean evaluates to 1 or 0; presumably it is the number of
 * property entries that follow, so "Slice" is only sent when arg_slice is
 * non-empty -- confirm against the hidden signature string. */
1049 !isempty(arg_slice), "Slice", "s", arg_slice);
1051 log_error("Failed to register machine: %s", bus_error_message(&error, r));
/* Ask org.freedesktop.machine1 to terminate the container: the first bus
 * call on the Manager interface yields an object path (read from the reply
 * as "o"), the second call goes to the Machine interface. Failures of the
 * bus calls are logged at debug level only. Method names and the use of
 * 'pid' are on lines not visible in this excerpt. */
1058 static int terminate_machine(pid_t pid) {
1059 _cleanup_bus_error_free_ sd_bus_error error = SD_BUS_ERROR_NULL;
1060 _cleanup_bus_message_unref_ sd_bus_message *reply = NULL;
1061 _cleanup_bus_unref_ sd_bus *bus = NULL;
1065 r = sd_bus_default_system(&bus);
1067 log_error("Failed to open system bus: %s", strerror(-r));
1071 r = sd_bus_call_method(
1073 "org.freedesktop.machine1",
1074 "/org/freedesktop/machine1",
1075 "org.freedesktop.machine1.Manager",
1082 /* Note that the machine might already have been
1083 * cleaned up automatically, hence don't consider it a
1084 * failure if we cannot get the machine object. */
1085 log_debug("Failed to get machine: %s", bus_error_message(&error, r));
/* Object path of the machine, taken from the first reply. */
1089 r = sd_bus_message_read(reply, "o", &path);
1091 return bus_log_parse_error(r);
1093 r = sd_bus_call_method(
1095 "org.freedesktop.machine1",
1097 "org.freedesktop.machine1.Machine",
1103 log_debug("Failed to terminate machine: %s", bus_error_message(&error, r));
/* Probe for the kernel audit subsystem by creating a NETLINK_AUDIT netlink
 * socket, which is closed again right away.
 * NOTE(review): the check of fd and the return statements are not visible
 * in this excerpt. */
1110 static bool audit_enabled(void) {
1113 fd = socket(AF_NETLINK, SOCK_RAW, NETLINK_AUDIT);
1115 close_nointr_nofail(fd);
/* Entry point: parse the command line, validate the container directory,
 * allocate a pty for the container console, clone a child in new namespaces
 * that sets up the container file system and executes the payload, and in
 * the parent register the machine, forward the pty and collect the child's
 * exit status.
 * NOTE(review): many lines (conditions, goto/return statements, the
 * parent/child branch itself) are missing from this excerpt; comments only
 * describe what the visible lines establish. */
1121 int main(int argc, char *argv[]) {
1123 int r = EXIT_FAILURE, k;
1124 _cleanup_close_ int master = -1, kdbus_fd = -1, sync_fd = -1;
1126 const char *console = NULL;
1128 _cleanup_close_pipe_ int kmsg_socket_pair[2] = { -1, -1 };
1129 _cleanup_fdset_free_ FDSet *fds = NULL;
1130 _cleanup_free_ char *kdbus_domain = NULL;
1133 log_parse_environment();
1136 k = parse_argv(argc, argv);
/* Container root: an explicit directory is made absolute, otherwise the
 * current working directory is used. */
1144 if (arg_directory) {
1147 p = path_make_absolute_cwd(arg_directory);
1148 free(arg_directory);
1151 arg_directory = get_current_dir_name();
1153 if (!arg_directory) {
1154 log_error("Failed to determine path, please use -D.");
1158 path_kill_slashes(arg_directory);
/* Machine name derived from the last component of the directory path. */
1161 arg_machine = strdup(basename(arg_directory));
1167 hostname_cleanup(arg_machine, false);
1168 if (isempty(arg_machine)) {
1169 log_error("Failed to determine machine name automatically, please use -M.");
/* Preconditions: root privileges, a systemd host, and a directory that is
 * not "/" and looks like an OS tree. */
1174 if (geteuid() != 0) {
1175 log_error("Need to be root.");
1179 if (sd_booted() <= 0) {
1180 log_error("Not running on a systemd system.");
1184 if (arg_boot && audit_enabled()) {
1185 log_warning("The kernel auditing subsystem is known to be incompatible with containers.\n"
1186 "Please make sure to turn off auditing with 'audit=0' on the kernel command\n"
1187 "line before using systemd-nspawn. Sleeping for 5s...\n");
1191 if (path_equal(arg_directory, "/")) {
1192 log_error("Spawning container on root directory not supported.");
1196 if (path_is_os_tree(arg_directory) <= 0) {
1197 log_error("Directory %s doesn't look like an OS root directory (/etc/os-release is missing). Refusing.", arg_directory);
/* File descriptors received via socket activation are collected in fds. */
1202 n_fd_passed = sd_listen_fds(false);
1203 if (n_fd_passed > 0) {
1204 k = fdset_new_listen_fds(&fds, false);
1206 log_error("Failed to collect file descriptors: %s", strerror(-k));
1210 fdset_close_others(fds);
/* Allocate the pty whose slave side becomes the container's console. */
1213 master = posix_openpt(O_RDWR|O_NOCTTY|O_CLOEXEC|O_NDELAY);
1215 log_error("Failed to acquire pseudo tty: %m");
1219 console = ptsname(master);
1221 log_error("Failed to determine tty name: %m");
1225 log_info("Spawning container %s on %s. Press ^] three times within 1s to abort execution.", arg_machine, arg_directory);
1227 if (unlockpt(master) < 0) {
1228 log_error("Failed to unlock tty: %m");
/* kdbus domain named "machine-<name>"; the outcome is only logged at debug
 * level. The result of the call lives in kdbus_fd (r still holds
 * EXIT_FAILURE here), so that is the value to format. */
1232 ns = strappenda("machine-", arg_machine);
1233 kdbus_fd = bus_kernel_create_domain(ns, &kdbus_domain);
1235 log_debug("Failed to create kdbus domain: %s", strerror(-kdbus_fd));
1237 log_debug("Successfully created kdbus domain as %s", kdbus_domain);
/* Socket pair used by setup_kmsg() to hand the kmsg FIFO descriptor over. */
1239 if (socketpair(AF_UNIX, SOCK_DGRAM|SOCK_NONBLOCK|SOCK_CLOEXEC, 0, kmsg_socket_pair) < 0) {
1240 log_error("Failed to create kmsg socket pair: %m");
1244 sd_notify(0, "READY=1");
/* Block the signals listed here before creating the child; the mask is
 * later handed to process_pty(). */
1246 assert_se(sigemptyset(&mask) == 0);
1247 sigset_add_many(&mask, SIGCHLD, SIGWINCH, SIGTERM, SIGINT, -1);
1248 assert_se(sigprocmask(SIG_BLOCK, &mask, NULL) == 0);
/* eventfd used for synchronisation: one side reads it, the other writes it
 * after register_machine() (see the eventfd_read/eventfd_write calls below). */
1253 sync_fd = eventfd(0, EFD_CLOEXEC);
1255 log_error("Failed to create event fd: %m");
/* Raw clone() with new IPC, mount, PID and UTS namespaces, plus a network
 * namespace when --private-network is set. */
1259 pid = syscall(__NR_clone, SIGCHLD|CLONE_NEWIPC|CLONE_NEWNS|CLONE_NEWPID|CLONE_NEWUTS|(arg_private_network ? CLONE_NEWNET : 0), NULL);
1261 if (errno == EINVAL)
1262 log_error("clone() failed, do you have namespace support enabled in your kernel? (You need UTS, IPC, PID and NET namespacing built in): %m");
1264 log_error("clone() failed: %m");
/* The following section, up to _exit(), presumably runs in the cloned child
 * (the branch line itself is not visible): it prepares the environment and
 * file system and finally executes the payload. */
1271 const char *home = NULL;
1272 uid_t uid = (uid_t) -1;
1273 gid_t gid = (gid_t) -1;
1275 const char *envp[] = {
1276 "PATH=" DEFAULT_PATH_SPLIT_USR,
1277 "container=systemd-nspawn", /* LXC sets container=lxc, so follow the scheme here */
1282 NULL, /* container_uuid */
1283 NULL, /* LISTEN_FDS */
1284 NULL, /* LISTEN_PID */
1290 envp[n_env] = strv_find_prefix(environ, "TERM=");
1294 close_nointr_nofail(master);
/* Close the inherited stdio so that the console opened below can take over
 * descriptors 0-2. */
1297 close_nointr(STDIN_FILENO);
1298 close_nointr(STDOUT_FILENO);
1299 close_nointr(STDERR_FILENO);
1301 close_nointr_nofail(kmsg_socket_pair[0]);
1302 kmsg_socket_pair[0] = -1;
1304 reset_all_signal_handlers();
1306 assert_se(sigemptyset(&mask) == 0);
1307 assert_se(sigprocmask(SIG_SETMASK, &mask, NULL) == 0);
/* With stdin closed above, the console is expected to be opened as fd 0. */
1309 k = open_terminal(console, O_RDWR);
1310 if (k != STDIN_FILENO) {
1312 close_nointr_nofail(k);
1316 log_error("Failed to open console: %s", strerror(-k));
1320 if (dup2(STDIN_FILENO, STDOUT_FILENO) != STDOUT_FILENO ||
1321 dup2(STDIN_FILENO, STDERR_FILENO) != STDERR_FILENO) {
1322 log_error("Failed to duplicate console: %m");
1327 log_error("setsid() failed: %m");
/* Have the kernel send SIGKILL to this process when its parent dies. */
1331 if (prctl(PR_SET_PDEATHSIG, SIGKILL) < 0) {
1332 log_error("PR_SET_PDEATHSIG failed: %m");
1336 /* Mark everything as slave, so that we still
1337 * receive mounts from the real root, but don't
1338 * propagate mounts to the real root. */
1339 if (mount(NULL, "/", NULL, MS_SLAVE|MS_REC, NULL) < 0) {
1340 log_error("MS_SLAVE|MS_REC failed: %m");
1344 /* Turn directory into bind mount */
1345 if (mount(arg_directory, arg_directory, "bind", MS_BIND|MS_REC, NULL) < 0) {
1346 log_error("Failed to make bind mount.");
1351 if (mount(arg_directory, arg_directory, "bind", MS_BIND|MS_REMOUNT|MS_RDONLY|MS_REC, NULL) < 0) {
1352 log_error("Failed to make read-only.");
/* Populate the container tree; each helper logs its own errors. */
1356 if (mount_all(arg_directory) < 0)
1359 if (copy_devnodes(arg_directory) < 0)
1362 if (setup_ptmx(arg_directory) < 0)
1365 dev_setup(arg_directory);
1367 if (setup_dev_console(arg_directory, console) < 0)
1370 if (setup_kmsg(arg_directory, kmsg_socket_pair[1]) < 0)
1373 close_nointr_nofail(kmsg_socket_pair[1]);
1374 kmsg_socket_pair[1] = -1;
1376 if (setup_boot_id(arg_directory) < 0)
1379 if (setup_timezone(arg_directory) < 0)
1382 if (setup_resolv_conf(arg_directory) < 0)
1385 if (setup_journal(arg_directory) < 0)
1388 if (mount_binds(arg_directory, arg_bind, 0) < 0)
1391 if (mount_binds(arg_directory, arg_bind_ro, MS_RDONLY) < 0)
1394 if (setup_kdbus(arg_directory, kdbus_domain) < 0)
/* Switch into the container: move the mount to "/", chroot and chdir. */
1397 if (chdir(arg_directory) < 0) {
1398 log_error("chdir(%s) failed: %m", arg_directory);
1402 if (mount(arg_directory, "/", NULL, MS_MOVE, NULL) < 0) {
1403 log_error("mount(MS_MOVE) failed: %m");
1407 if (chroot(".") < 0) {
1408 log_error("chroot() failed: %m");
1412 if (chdir("/") < 0) {
1413 log_error("chdir() failed: %m");
1421 if (drop_capabilities() < 0) {
1422 log_error("drop_capabilities() failed: %m");
1428 /* Note that this resolves user names
1429 * inside the container, and hence
1430 * accesses the NSS modules from the
1431 * container and not the host. This is
1434 if (get_user_creds((const char**)&arg_user, &uid, &gid, &home, NULL) < 0) {
1435 log_error("get_user_creds() failed: %m");
1439 if (mkdir_parents_label(home, 0775) < 0) {
1440 log_error("mkdir_parents_label() failed: %m");
1444 if (mkdir_safe_label(home, 0775, uid, gid) < 0) {
1445 log_error("mkdir_safe_label() failed: %m");
1449 if (initgroups((const char*)arg_user, gid) < 0) {
1450 log_error("initgroups() failed: %m");
1454 if (setresgid(gid, gid, gid) < 0) {
1455 log_error("setresgid() failed: %m");
1459 if (setresuid(uid, uid, uid) < 0) {
1460 log_error("setresuid() failed: %m");
1464 /* Reset everything fully to 0, just in case */
1466 if (setgroups(0, NULL) < 0) {
1467 log_error("setgroups() failed: %m");
1471 if (setresgid(0, 0, 0) < 0) {
1472 log_error("setresgid() failed: %m");
1476 if (setresuid(0, 0, 0) < 0) {
1477 log_error("setresuid() failed: %m");
/* Fill the remaining envp slots; n_env advances with every entry. */
1482 if ((asprintf((char**)(envp + n_env++), "HOME=%s", home ? home: "/root") < 0) ||
1483 (asprintf((char**)(envp + n_env++), "USER=%s", arg_user ? arg_user : "root") < 0) ||
1484 (asprintf((char**)(envp + n_env++), "LOGNAME=%s", arg_user ? arg_user : "root") < 0)) {
1489 if (!sd_id128_equal(arg_uuid, SD_ID128_NULL)) {
1490 if (asprintf((char**)(envp + n_env++), "container_uuid=" SD_ID128_FORMAT_STR, SD_ID128_FORMAT_VAL(arg_uuid)) < 0) {
/* Passed-in descriptors lose O_CLOEXEC and are announced via LISTEN_FDS,
 * with LISTEN_PID fixed to 1. */
1496 if (fdset_size(fds) > 0) {
1497 k = fdset_cloexec(fds, false);
1499 log_error("Failed to unset O_CLOEXEC for file descriptors.");
1503 if ((asprintf((char **)(envp + n_env++), "LISTEN_FDS=%u", n_fd_passed) < 0) ||
1504 (asprintf((char **)(envp + n_env++), "LISTEN_PID=1") < 0)) {
/* Wait on the eventfd; it is written after register_machine() below. */
1512 eventfd_read(sync_fd, &x);
1513 close_nointr_nofail(sync_fd);
/* --setenv assignments are merged with envp; otherwise envp is used as is. */
1516 if (!strv_isempty(arg_setenv)) {
1519 n = strv_env_merge(2, envp, arg_setenv);
1527 env_use = (char**) envp;
/* A failing setexeccon() is only logged here. */
1531 if (setexeccon(process_label) < 0)
1532 log_error("setexeccon(\"%s\") failed: %m", process_label);
1538 /* Automatically search for the init system */
/* a[0] is reserved for the init path; the remaining command line arguments
 * follow from a[1] on. */
1540 l = 1 + argc - optind;
1541 a = newa(char*, l + 1);
1542 memcpy(a + 1, argv + optind, l * sizeof(char*));
/* Each execve() only returns on failure, so the candidates are tried in
 * order. */
1544 a[0] = (char*) "/usr/lib/systemd/systemd";
1545 execve(a[0], a, env_use);
1547 a[0] = (char*) "/lib/systemd/systemd";
1548 execve(a[0], a, env_use);
1550 a[0] = (char*) "/sbin/init";
1551 execve(a[0], a, env_use);
1552 } else if (argc > optind)
1553 execvpe(argv[optind], argv + optind, env_use);
/* No command given: start bash with argv[0] "-bash"; the chdir() result is
 * not checked. */
1555 chdir(home ? home : "/root");
1556 execle("/bin/bash", "-bash", NULL, env_use);
1559 log_error("execv() failed: %m");
1562 _exit(EXIT_FAILURE);
/* Parent side: register the machine, then write the eventfd the child is
 * waiting on. */
1568 r = register_machine(pid);
1572 eventfd_write(sync_fd, 1);
1573 close_nointr_nofail(sync_fd);
1576 k = process_pty(master, &mask, arg_boot ? pid : 0, SIGRTMIN+3);
1584 /* Kill if it is not dead yet anyway */
1585 terminate_machine(pid);
1587 /* Redundant, but better safe than sorry */
1590 k = wait_for_terminate(pid, &status);
/* Report how the container ended; a normal exit hands the container's exit
 * status to r. */
1598 if (status.si_code == CLD_EXITED) {
1599 r = status.si_status;
1600 if (status.si_status != 0) {
1601 log_error("Container %s failed with error code %i.", arg_machine, status.si_status);
1605 log_debug("Container %s exited successfully.", arg_machine);
1607 } else if (status.si_code == CLD_KILLED &&
1608 status.si_status == SIGINT) {
1609 log_info("Container %s has been shut down.", arg_machine);
1612 } else if (status.si_code == CLD_KILLED &&
1613 status.si_status == SIGHUP) {
1614 log_info("Container %s is being rebooted.", arg_machine);
1616 } else if (status.si_code == CLD_KILLED ||
1617 status.si_code == CLD_DUMPED) {
1619 log_error("Container %s terminated by signal %s.", arg_machine, signal_to_string(status.si_status));
1623 log_error("Container %s failed due to unknown reason.", arg_machine);
1633 free(arg_directory);