1 /*-*- Mode: C; c-basic-offset: 8; indent-tabs-mode: nil -*-*/
4 This file is part of systemd.
6 Copyright 2010 Lennart Poettering
8 systemd is free software; you can redistribute it and/or modify it
9 under the terms of the GNU Lesser General Public License as published by
10 the Free Software Foundation; either version 2.1 of the License, or
11 (at your option) any later version.
13 systemd is distributed in the hope that it will be useful, but
14 WITHOUT ANY WARRANTY; without even the implied warranty of
15 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
16 Lesser General Public License for more details.
18 You should have received a copy of the GNU Lesser General Public License
19 along with systemd; If not, see <http://www.gnu.org/licenses/>.
25 #include <sys/types.h>
26 #include <sys/syscall.h>
27 #include <sys/mount.h>
33 #include <sys/prctl.h>
34 #include <sys/capability.h>
37 #include <sys/signalfd.h>
41 #include <sys/socket.h>
42 #include <linux/netlink.h>
43 #include <sys/eventfd.h>
45 #include "sd-daemon.h"
54 #include "cgroup-util.h"
56 #include "path-util.h"
57 #include "loopback-setup.h"
58 #include "dev-setup.h"
63 #include "bus-error.h"
65 #include "bus-kernel.h"
/* How the container journal is linked with the host journal. The values used
 * in this file are LINK_NO, LINK_AUTO, LINK_GUEST and LINK_HOST; the mode is
 * chosen in parse_argv() and acted upon in setup_journal(). */
72 typedef enum LinkJournal {
/* Command line configuration. These file-scope variables are filled in by
 * parse_argv() and read by the setup helpers below. */
79 static char *arg_directory = NULL;
80 static char *arg_user = NULL;
81 static sd_id128_t arg_uuid = {};
82 static char *arg_machine = NULL;
83 static const char *arg_slice = NULL;
84 static bool arg_private_network = false;
85 static bool arg_read_only = false;
86 static bool arg_boot = false;
87 static LinkJournal arg_link_journal = LINK_AUTO;
/* Bit mask of capabilities the container keeps, one bit per CAP_* number.
 * --capability= sets bits, --drop-capability= clears them, and
 * drop_capabilities() passes the complement of this mask on for dropping. */
88 static uint64_t arg_retain =
90 (1ULL << CAP_DAC_OVERRIDE) |
91 (1ULL << CAP_DAC_READ_SEARCH) |
92 (1ULL << CAP_FOWNER) |
93 (1ULL << CAP_FSETID) |
94 (1ULL << CAP_IPC_OWNER) |
97 (1ULL << CAP_LINUX_IMMUTABLE) |
98 (1ULL << CAP_NET_BIND_SERVICE) |
99 (1ULL << CAP_NET_BROADCAST) |
100 (1ULL << CAP_NET_RAW) |
101 (1ULL << CAP_SETGID) |
102 (1ULL << CAP_SETFCAP) |
103 (1ULL << CAP_SETPCAP) |
104 (1ULL << CAP_SETUID) |
105 (1ULL << CAP_SYS_ADMIN) |
106 (1ULL << CAP_SYS_CHROOT) |
107 (1ULL << CAP_SYS_NICE) |
108 (1ULL << CAP_SYS_PTRACE) |
109 (1ULL << CAP_SYS_TTY_CONFIG) |
110 (1ULL << CAP_SYS_RESOURCE) |
111 (1ULL << CAP_SYS_BOOT) |
112 (1ULL << CAP_AUDIT_WRITE) |
113 (1ULL << CAP_AUDIT_CONTROL);
/* String lists. arg_bind and arg_bind_ro receive two entries per option
 * (source path, then destination path) and are walked pairwise by
 * mount_binds(); arg_setenv holds the NAME=VALUE assignments from --setenv=. */
114 static char **arg_bind = NULL;
115 static char **arg_bind_ro = NULL;
116 static char **arg_setenv = NULL;
/* Print the usage summary for all supported options to stdout, prefixed with
 * the short program name.
 *
 * The option descriptions have to agree with parse_argv(): the -j switch
 * selects LINK_GUEST there, so it is documented here as the short form of
 * --link-journal=guest. */
118 static int help(void) {
120 printf("%s [OPTIONS...] [PATH] [ARGUMENTS...]\n\n"
121 "Spawn a minimal namespace container for debugging, testing and building.\n\n"
122 " -h --help Show this help\n"
123 " --version Print version string\n"
124 " -D --directory=NAME Root directory for the container\n"
125 " -b --boot Boot up full system (i.e. invoke init)\n"
126 " -u --user=USER Run the command under specified user or uid\n"
127 " --uuid=UUID Set a specific machine UUID for the container\n"
128 " -M --machine=NAME Set the machine name for the container\n"
129 " -S --slice=SLICE Place the container in the specified slice\n"
130 " --private-network Disable network in container\n"
131 " --read-only Mount the root directory read-only\n"
132 " --capability=CAP In addition to the default, retain specified\n"
134 " --drop-capability=CAP Drop the specified capability from the default set\n"
135 " --link-journal=MODE Link up guest journal, one of no, auto, guest, host\n"
136 " -j Equivalent to --link-journal=guest\n"
137 " --bind=PATH[:PATH] Bind mount a file or directory from the host into\n"
139 " --bind-ro=PATH[:PATH] Similar, but creates a read-only bind mount\n"
140 " --setenv=NAME=VALUE Pass an environment variable to PID 1\n",
141 program_invocation_short_name);
/* Parse the command line with getopt_long() and store the results in the
 * arg_* variables at the top of this file.
 *
 * The short option string starts with a + sign, which makes getopt stop at
 * the first non-option argument, so options that belong to the command run
 * inside the container are not consumed here.
 *
 * Invalid values are reported with log_error(). */
146 static int parse_argv(int argc, char *argv[]) {
161 static const struct option options[] = {
162 { "help", no_argument, NULL, 'h' },
163 { "version", no_argument, NULL, ARG_VERSION },
164 { "directory", required_argument, NULL, 'D' },
165 { "user", required_argument, NULL, 'u' },
166 { "private-network", no_argument, NULL, ARG_PRIVATE_NETWORK },
167 { "boot", no_argument, NULL, 'b' },
168 { "uuid", required_argument, NULL, ARG_UUID },
169 { "read-only", no_argument, NULL, ARG_READ_ONLY },
170 { "capability", required_argument, NULL, ARG_CAPABILITY },
171 { "drop-capability", required_argument, NULL, ARG_DROP_CAPABILITY },
172 { "link-journal", required_argument, NULL, ARG_LINK_JOURNAL },
173 { "bind", required_argument, NULL, ARG_BIND },
174 { "bind-ro", required_argument, NULL, ARG_BIND_RO },
175 { "machine", required_argument, NULL, 'M' },
176 { "slice", required_argument, NULL, 'S' },
177 { "setenv", required_argument, NULL, ARG_SETENV },
186 while ((c = getopt_long(argc, argv, "+hD:u:bM:jS:", options, NULL)) >= 0) {
/* Version output: package string followed by the compiled-in feature list. */
194 puts(PACKAGE_STRING);
195 puts(SYSTEMD_FEATURES);
/* Root directory: stored in canonicalized form; a path that cannot be
 * canonicalized is rejected. */
200 arg_directory = canonicalize_file_name(optarg);
201 if (!arg_directory) {
202 log_error("Invalid root directory: %m");
210 arg_user = strdup(optarg);
216 case ARG_PRIVATE_NETWORK:
217 arg_private_network = true;
/* Machine UUID: parsed straight into arg_uuid. */
225 r = sd_id128_from_string(optarg, &arg_uuid);
227 log_error("Invalid UUID: %s", optarg);
233 arg_slice = strdup(optarg);
/* Machine name: has to pass hostname_is_valid() before it is stored. */
240 if (!hostname_is_valid(optarg)) {
241 log_error("Invalid machine name: %s", optarg);
246 arg_machine = strdup(optarg);
253 arg_read_only = true;
/* Capability options take a comma separated list of capability names. Each
 * name is translated with cap_from_name() and its bit is set in arg_retain
 * for ARG_CAPABILITY or cleared otherwise. */
257 case ARG_DROP_CAPABILITY: {
261 FOREACH_WORD_SEPARATOR(word, length, optarg, ",", state) {
265 t = strndup(word, length);
269 if (cap_from_name(t, &cap) < 0) {
270 log_error("Failed to parse capability %s.", t);
277 if (c == ARG_CAPABILITY)
278 arg_retain |= 1ULL << (uint64_t) cap;
280 arg_retain &= ~(1ULL << (uint64_t) cap);
/* NOTE(review): presumably the handler of the -j short option (its case label
 * is not visible here); it selects LINK_GUEST. */
287 arg_link_journal = LINK_GUEST;
290 case ARG_LINK_JOURNAL:
291 if (streq(optarg, "auto"))
292 arg_link_journal = LINK_AUTO;
293 else if (streq(optarg, "no"))
294 arg_link_journal = LINK_NO;
295 else if (streq(optarg, "guest"))
296 arg_link_journal = LINK_GUEST;
297 else if (streq(optarg, "host"))
298 arg_link_journal = LINK_HOST;
300 log_error("Failed to parse link journal mode %s", optarg);
/* Bind mount options: a is the source and b the destination path. The
 * argument is searched for a colon to separate the two. Both paths must be
 * absolute; they are appended, source first, to arg_bind or arg_bind_ro. */
308 _cleanup_free_ char *a = NULL, *b = NULL;
312 x = c == ARG_BIND ? &arg_bind : &arg_bind_ro;
314 e = strchr(optarg, ':');
316 a = strndup(optarg, e - optarg);
326 if (!path_is_absolute(a) || !path_is_absolute(b)) {
327 log_error("Invalid bind mount specification: %s", optarg);
331 r = strv_extend(x, a);
335 r = strv_extend(x, b);
/* Environment assignment: validated first, then applied with strv_env_set(),
 * whose result is kept in n; the previous arg_setenv list is released. */
345 if (!env_assignment_is_valid(optarg)) {
346 log_error("Environment variable assignment '%s' is not valid.", optarg);
350 n = strv_env_set(arg_setenv, optarg);
354 strv_free(arg_setenv);
363 assert_not_reached("Unhandled option");
/* Mount the API file systems listed in mount_table below the container root
 * dest.
 *
 * Table entries are processed in order. For each one the target path is
 * built from dest and the where field, checked with path_is_mount_point(),
 * created with mkdir_p() and then mounted. Entries whose what field is NULL
 * are the remount steps that turn the bind mount made by the preceding entry
 * read-only. A failing mount() is only logged when the fatal field of the
 * entry is true. */
370 static int mount_all(const char *dest) {
372 typedef struct MountPoint {
381 static const MountPoint mount_table[] = {
382 { "proc", "/proc", "proc", NULL, MS_NOSUID|MS_NOEXEC|MS_NODEV, true },
383 { "/proc/sys", "/proc/sys", NULL, NULL, MS_BIND, true }, /* Bind mount first */
384 { NULL, "/proc/sys", NULL, NULL, MS_BIND|MS_RDONLY|MS_REMOUNT, true }, /* Then, make it r/o */
385 { "sysfs", "/sys", "sysfs", NULL, MS_RDONLY|MS_NOSUID|MS_NOEXEC|MS_NODEV, true },
386 { "tmpfs", "/dev", "tmpfs", "mode=755", MS_NOSUID|MS_STRICTATIME, true },
387 { "devpts", "/dev/pts", "devpts","newinstance,ptmxmode=0666,mode=620,gid=" STRINGIFY(TTY_GID), MS_NOSUID|MS_NOEXEC, true },
388 { "tmpfs", "/dev/shm", "tmpfs", "mode=1777", MS_NOSUID|MS_NODEV|MS_STRICTATIME, true },
389 { "tmpfs", "/run", "tmpfs", "mode=755", MS_NOSUID|MS_NODEV|MS_STRICTATIME, true },
391 { "/sys/fs/selinux", "/sys/fs/selinux", NULL, NULL, MS_BIND, false }, /* Bind mount first */
392 { NULL, "/sys/fs/selinux", NULL, NULL, MS_BIND|MS_RDONLY|MS_REMOUNT, false }, /* Then, make it r/o */
399 for (k = 0; k < ELEMENTSOF(mount_table); k++) {
400 _cleanup_free_ char *where = NULL;
403 where = strjoin(dest, "/", mount_table[k].where, NULL);
/* t is negative on error and positive when where already is a mount point. */
407 t = path_is_mount_point(where, true);
409 log_error("Failed to detect whether %s is a mount point: %s", where, strerror(-t));
417 /* Skip this entry if it is not a remount. */
418 if (mount_table[k].what && t > 0)
/* Best effort: the result of mkdir_p() is not checked, a missing directory
 * shows up as a mount() failure below. */
421 mkdir_p(where, 0755);
423 if (mount(mount_table[k].what,
426 mount_table[k].flags,
427 mount_table[k].options) < 0 &&
428 mount_table[k].fatal) {
430 log_error("mount(%s) failed: %m", where);
/* Bind mount every (source, destination) pair of the string list l into the
 * container tree rooted at dest.
 *
 * dest:  container root; each destination path is appended to it.
 * l:     list holding source and destination paths alternately.
 * flags: extra mount flags; when non-zero the fresh bind mount is remounted
 *        with MS_REMOUNT|MS_BIND|flags (main() passes MS_RDONLY for the
 *        read-only list and 0 otherwise).
 *
 * An existing destination must have the same file type as the source. A
 * missing destination is created to match the source type. */
440 static int mount_binds(const char *dest, char **l, unsigned long flags) {
443 STRV_FOREACH_PAIR(x, y, l) {
445 struct stat source_st, dest_st;
448 if (stat(*x, &source_st) < 0) {
449 log_error("failed to stat %s: %m", *x);
/* where is the destination path as seen from the host. */
453 where = strappenda(dest, *y);
454 r = stat(where, &dest_st);
/* Compare only the file type bits of both modes. */
456 if ((source_st.st_mode & S_IFMT) != (dest_st.st_mode & S_IFMT)) {
457 log_error("The file types of %s and %s do not match. Refusing bind mount",
/* Destination does not exist yet: create its parent directories first. */
461 } else if (errno == ENOENT) {
462 r = mkdir_parents_label(where, 0755);
464 log_error("Failed to bind mount %s: %s", *x, strerror(-r));
468 log_error("Failed to bind mount %s: %s", *x, strerror(errno));
471 /* Create the mount point, but be conservative -- refuse to create block
472 * and char devices. */
473 if (S_ISDIR(source_st.st_mode))
474 mkdir_label(where, 0755);
475 else if (S_ISFIFO(source_st.st_mode))
477 else if (S_ISSOCK(source_st.st_mode))
478 mknod(where, 0644 | S_IFSOCK, 0);
479 else if (S_ISREG(source_st.st_mode))
482 log_error("Refusing to create mountpoint for file: %s", *x);
486 if (mount(*x, where, "bind", MS_BIND, NULL) < 0) {
487 log_error("mount(%s) failed: %m", where);
/* Second step: apply the extra flags through a bind remount. */
491 if (flags && mount(NULL, where, NULL, MS_REMOUNT|MS_BIND|flags, NULL) < 0) {
492 log_error("mount(%s) failed: %m", where);
/* Make /etc/localtime inside the container tree dest name the same time zone
 * as /etc/localtime on the host.
 *
 * The host link is read first and must lead into /usr/share/zoneinfo/
 * (relative or absolute form); z is the zone name behind that prefix. The
 * link in the container is read the same way into y. When both zones already
 * match nothing is changed. Otherwise the zone file has to exist inside the
 * container before a relative symlink to it is created. Conditions that
 * prevent an update are logged as warnings. */
500 static int setup_timezone(const char *dest) {
501 _cleanup_free_ char *where = NULL, *p = NULL, *q = NULL, *check = NULL, *what = NULL;
507 /* Fix the timezone, if possible */
508 r = readlink_malloc("/etc/localtime", &p);
510 log_warning("/etc/localtime is not a symlink, not updating container timezone.");
514 z = path_startswith(p, "../usr/share/zoneinfo/");
516 z = path_startswith(p, "/usr/share/zoneinfo/");
518 log_warning("/etc/localtime does not point into /usr/share/zoneinfo/, not updating container timezone.");
/* where: /etc/localtime of the container as seen from the host. */
522 where = strappend(dest, "/etc/localtime");
526 r = readlink_malloc(where, &q);
528 y = path_startswith(q, "../usr/share/zoneinfo/");
530 y = path_startswith(q, "/usr/share/zoneinfo/");
533 /* Already pointing to the right place? Then do nothing .. */
534 if (y && streq(y, z))
/* check: the zone file inside the container tree. */
538 check = strjoin(dest, "/usr/share/zoneinfo/", z, NULL);
542 if (access(check, F_OK) < 0) {
543 log_warning("Timezone %s does not exist in container, not updating container timezone.", z);
/* what: link target, relative to /etc so that it resolves inside the
 * container. */
547 what = strappend("../usr/share/zoneinfo/", z);
552 if (symlink(what, where) < 0) {
553 log_error("Failed to correct timezone of container: %m");
/* Copy /etc/resolv.conf of the host to the same path below the container
 * root dest. The copy is best effort, its result is deliberately ignored.
 *
 * arg_private_network is tested before anything else is done.
 * NOTE(review): presumably the function returns early in that case, since a
 * container without network access cannot use the host resolver setup; the
 * statement following the test is not visible here. */
560 static int setup_resolv_conf(const char *dest) {
561 char _cleanup_free_ *where = NULL;
565 if (arg_private_network)
568 /* Fix resolv.conf, if possible */
569 where = strappend(dest, "/etc/resolv.conf");
573 /* We don't really care for the results of this really. If it
574 * fails, it fails, but meh... */
575 copy_file("/etc/resolv.conf", where, O_TRUNC|O_NOFOLLOW);
/* Give the container its own boot ID.
 *
 * A random 128 bit ID is generated, formatted in the dashed UUID notation and
 * written to a helper file below dest/dev. That file is then bind mounted
 * over proc/sys/kernel/random/boot_id of the container and remounted
 * read-only. Failing to make it read-only only produces a warning. */
580 static int setup_boot_id(const char *dest) {
581 _cleanup_free_ char *from = NULL, *to = NULL;
588 /* Generate a new randomized boot ID, so that each boot-up of
589 * the container gets a new one */
/* from: helper file holding the ID, to: the proc file it is mounted over. */
591 from = strappend(dest, "/dev/proc-sys-kernel-random-boot-id");
592 to = strappend(dest, "/proc/sys/kernel/random/boot_id");
596 r = sd_id128_randomize(&rnd);
598 log_error("Failed to generate random boot id: %s", strerror(-r));
/* Format the 16 bytes as 8-4-4-4-12 hex digits. */
602 snprintf(as_uuid, sizeof(as_uuid),
603 "%02x%02x%02x%02x-%02x%02x-%02x%02x-%02x%02x-%02x%02x%02x%02x%02x%02x",
604 SD_ID128_FORMAT_VAL(rnd));
605 char_array_0(as_uuid);
607 r = write_string_file(from, as_uuid);
609 log_error("Failed to write boot id: %s", strerror(-r));
613 if (mount(from, to, "bind", MS_BIND, NULL) < 0) {
614 log_error("Failed to bind mount boot id: %m");
616 } else if (mount(from, to, "bind", MS_BIND|MS_REMOUNT|MS_RDONLY, NULL))
617 log_warning("Failed to make boot id read-only: %m");
/* Recreate a fixed set of host device nodes, named in devnodes, below
 * dest/dev.
 *
 * For every name the host node /dev/NAME is inspected with stat(). A node
 * that does not exist on the host (ENOENT) is not reported. A node that is
 * neither a character nor a block device is refused. Everything else is
 * recreated with mknod() using the mode and device number of the host node.
 *
 * dest: root directory of the container tree. */
623 static int copy_devnodes(const char *dest) {
625 static const char devnodes[] =
635 _cleanup_umask_ mode_t u;
641 NULSTR_FOREACH(d, devnodes) {
643 _cleanup_free_ char *from = NULL, *to = NULL;
/* from: node on the host, to: node to create inside the container tree. */
645 asprintf(&from, "/dev/%s", d);
646 asprintf(&to, "%s/dev/%s", dest, d);
657 if (stat(from, &st) < 0) {
659 if (errno != ENOENT) {
660 log_error("Failed to stat %s: %m", from);
665 } else if (!S_ISCHR(st.st_mode) && !S_ISBLK(st.st_mode)) {
667 log_error("%s is not a char or block device, cannot copy", from);
671 } else if (mknod(to, st.st_mode, st.st_rdev) < 0) {
/* Name the node that could not be created rather than the container root,
 * so the message identifies the failing device. */
673 log_error("mknod(%s) failed: %m", to);
/* Create dest/dev/ptmx as a relative symlink to pts/ptmx, so that the name
 * resolves to the ptmx node below the /dev/pts directory of the container. */
682 static int setup_ptmx(const char *dest) {
683 _cleanup_free_ char *p = NULL;
685 p = strappend(dest, "/dev/ptmx");
689 if (symlink("pts/ptmx", p) < 0) {
690 log_error("Failed to create /dev/ptmx symlink: %m");
/* Make the terminal console available as dest/dev/console.
 *
 * dest:    root directory of the container tree.
 * console: path of the terminal device on the host; main() passes the slave
 *          side name of the pseudo terminal it allocated.
 *
 * The device must be a character device. Its access mode is set to 0600 and
 * its owner to uid 0 and gid 0, then a device node is created at
 * dest/dev/console and the terminal is bind mounted on top of it. */
697 static int setup_dev_console(const char *dest, const char *console) {
699 _cleanup_free_ char *to = NULL;
701 _cleanup_umask_ mode_t u;
708 if (stat(console, &st) < 0) {
709 log_error("Failed to stat %s: %m", console);
712 } else if (!S_ISCHR(st.st_mode)) {
713 log_error("/dev/console is not a char device");
717 r = chmod_and_chown(console, 0600, 0, 0);
719 log_error("Failed to correct access mode for TTY: %s", strerror(-r));
/* The node below keeps the file type bits of the terminal and gets 0600 as
 * permission bits. */
723 if (asprintf(&to, "%s/dev/console", dest) < 0)
726 /* We need to bind mount the right tty to /dev/console since
727 * ptys can only exist on pts file systems. To have something
728 * to bind mount things on we create a device node first, that
729 * has the right major/minor (note that the major minor
730 * doesn't actually matter here, since we mount it over
733 if (mknod(to, (st.st_mode & ~07777) | 0600, st.st_rdev) < 0) {
734 log_error("mknod() for /dev/console failed: %m");
738 if (mount(console, to, "bind", MS_BIND, NULL) < 0) {
739 log_error("Bind mount for /dev/console failed: %m");
// Provide a stand-in for the kernel log inside the container.
//
// A FIFO is created at dest/dev/kmsg, bind mounted onto dest/proc/kmsg and
// opened; the open descriptor is then sent over kmsg_socket as SCM_RIGHTS
// ancillary data so that it stays referenced while the container runs.
//
// dest:        root directory of the container tree.
// kmsg_socket: socket the descriptor is sent on, must be a valid descriptor.
746 static int setup_kmsg(const char *dest, int kmsg_socket) {
747 _cleanup_free_ char *from = NULL, *to = NULL;
749 _cleanup_umask_ mode_t u;
751 struct cmsghdr cmsghdr;
752 uint8_t buf[CMSG_SPACE(sizeof(int))];
755 .msg_control = &control,
756 .msg_controllen = sizeof(control),
758 struct cmsghdr *cmsg;
761 assert(kmsg_socket >= 0);
765 /* We create the kmsg FIFO as /dev/kmsg, but immediately
766 * delete it after bind mounting it to /proc/kmsg. While FIFOs
767 * on the reading side behave very similar to /proc/kmsg,
768 * their writing side behaves differently from /dev/kmsg in
769 * that writing blocks when nothing is reading. In order to
770 * avoid any problems with containers deadlocking due to this
771 * we simply make /dev/kmsg unavailable to the container. */
772 if (asprintf(&from, "%s/dev/kmsg", dest) < 0 ||
773 asprintf(&to, "%s/proc/kmsg", dest) < 0)
776 if (mkfifo(from, 0600) < 0) {
777 log_error("mkfifo() for /dev/kmsg failed: %m");
781 r = chmod_and_chown(from, 0600, 0, 0);
783 log_error("Failed to correct access mode for /dev/kmsg: %s", strerror(-r));
787 if (mount(from, to, "bind", MS_BIND, NULL) < 0) {
788 log_error("Bind mount for /proc/kmsg failed: %m");
/* Opened for reading and writing in non-blocking mode. */
792 fd = open(from, O_RDWR|O_NDELAY|O_CLOEXEC);
794 log_error("Failed to open fifo: %m");
/* Build one SCM_RIGHTS control message whose payload is fd. */
798 cmsg = CMSG_FIRSTHDR(&mh);
799 cmsg->cmsg_level = SOL_SOCKET;
800 cmsg->cmsg_type = SCM_RIGHTS;
801 cmsg->cmsg_len = CMSG_LEN(sizeof(int));
802 memcpy(CMSG_DATA(cmsg), &fd, sizeof(int));
/* Shrink the control length to the part that is actually used. */
804 mh.msg_controllen = cmsg->cmsg_len;
806 /* Store away the fd in the socket, so that it stays open as
807 * long as we run the child */
808 k = sendmsg(kmsg_socket, &mh, MSG_DONTWAIT|MSG_NOSIGNAL);
/* The local copy is closed whether or not sending worked. */
809 close_nointr_nofail(fd);
812 log_error("Failed to send FIFO fd: %m");
816 /* And now make the FIFO unavailable as /dev/kmsg... */
/* Set the host name of the calling process context to arg_machine by means
 * of sethostname(). */
821 static int setup_hostname(void) {
823 if (sethostname(arg_machine, strlen(arg_machine)) < 0)
/* Connect the journal directory of the container with the host as requested
 * by arg_link_journal.
 *
 * directory: root directory of the container tree.
 *
 * The machine ID is read from etc/machine-id of the container and parsed; it
 * must differ from the machine ID of the host. From then on p is the journal
 * directory on the host (/var/log/journal/ID) and q the same path inside the
 * container tree. Neither of them may already be a mount point.
 *
 *  - LINK_GUEST: p becomes a symlink to q, and q is created.
 *  - LINK_HOST:  p is created; q must be empty and gets p bind mounted on it.
 *  - LINK_AUTO:  problems are tolerated more often (missing or empty machine
 *                ID, equal IDs, existing mount points are not errors).
 *
 * NOTE(review): several messages below use %m after helpers whose result is
 * tested through r; confirm that errno is still meaningful at those points. */
829 static int setup_journal(const char *directory) {
830 sd_id128_t machine_id, this_id;
831 _cleanup_free_ char *p = NULL, *b = NULL, *q = NULL, *d = NULL;
835 p = strappend(directory, "/etc/machine-id");
839 r = read_one_line_file(p, &b);
840 if (r == -ENOENT && arg_link_journal == LINK_AUTO)
843 log_error("Failed to read machine ID from %s: %s", p, strerror(-r));
848 if (isempty(id) && arg_link_journal == LINK_AUTO)
851 /* Verify validity */
852 r = sd_id128_from_string(id, &machine_id);
854 log_error("Failed to parse machine ID from %s: %s", p, strerror(-r));
858 r = sd_id128_get_machine(&this_id);
860 log_error("Failed to retrieve machine ID: %s", strerror(-r));
/* Equal IDs: logged as a warning in automatic mode, as an error otherwise. */
864 if (sd_id128_equal(machine_id, this_id)) {
865 log_full(arg_link_journal == LINK_AUTO ? LOG_WARNING : LOG_ERR,
866 "Host and machine ids are equal (%s): refusing to link journals", id);
867 if (arg_link_journal == LINK_AUTO)
873 if (arg_link_journal == LINK_NO)
/* p: journal directory on the host, q: journal directory in the container. */
877 p = strappend("/var/log/journal/", id);
878 q = strjoin(directory, "/var/log/journal/", id, NULL);
882 if (path_is_mount_point(p, false) > 0) {
883 if (arg_link_journal != LINK_AUTO) {
884 log_error("%s: already a mount point, refusing to use for journal", p);
891 if (path_is_mount_point(q, false) > 0) {
892 if (arg_link_journal != LINK_AUTO) {
893 log_error("%s: already a mount point, refusing to use for journal", q);
/* Inspect what currently exists at p: a symlink (target stored in d), a
 * non-symlink (-EINVAL), nothing (-ENOENT) or some other error. */
900 r = readlink_and_make_absolute(p, &d);
902 if ((arg_link_journal == LINK_GUEST ||
903 arg_link_journal == LINK_AUTO) &&
906 r = mkdir_p(q, 0755);
908 log_warning("failed to create directory %s: %m", q);
913 log_error("Failed to remove symlink %s: %m", p);
916 } else if (r == -EINVAL) {
918 if (arg_link_journal == LINK_GUEST &&
921 if (errno == ENOTDIR) {
922 log_error("%s already exists and is neither a symlink nor a directory", p);
925 log_error("Failed to remove %s: %m", p);
929 } else if (r != -ENOENT) {
930 log_error("readlink(%s) failed: %m", p);
/* Guest mode: the host path becomes a symlink into the container tree. */
934 if (arg_link_journal == LINK_GUEST) {
936 if (symlink(q, p) < 0) {
937 log_error("Failed to symlink %s to %s: %m", q, p);
941 r = mkdir_p(q, 0755);
943 log_warning("failed to create directory %s: %m", q);
/* Host mode creates the host directory; the other modes only test whether it
 * exists. */
947 if (arg_link_journal == LINK_HOST) {
948 r = mkdir_p(p, 0755);
950 log_error("Failed to create %s: %m", p);
954 } else if (access(p, F_OK) < 0)
/* An existing, empty container directory makes dir_is_empty() return a
 * positive value; 0 means there is content that would be hidden. */
957 if (dir_is_empty(q) == 0) {
958 log_error("%s not empty.", q);
962 r = mkdir_p(q, 0755);
964 log_error("Failed to create %s: %m", q);
968 if (mount(p, q, "bind", MS_BIND, NULL) < 0) {
969 log_error("Failed to bind mount journal from host into guest: %m");
/* Create the directory dest/dev/kdbus and bind mount path onto it.
 *
 * dest: root directory of the container tree.
 * path: host path to mount; main() passes the kdbus namespace path it
 *       obtained earlier. */
976 static int setup_kdbus(const char *dest, const char *path) {
982 p = strappenda(dest, "/dev/kdbus");
983 if (mkdir(p, 0755) < 0) {
984 log_error("Failed to create kdbus path: %m");
988 if (mount(path, p, "bind", MS_BIND, NULL) < 0) {
989 log_error("Failed to mount kdbus namespace path: %m");
/* Drop every capability that is not set in arg_retain from the capability
 * bounding set; returns the result of capability_bounding_set_drop(). */
996 static int drop_capabilities(void) {
997 return capability_bounding_set_drop(~arg_retain, false);
/* Register the container with the org.freedesktop.machine1 manager.
 *
 * A connection to the system bus is opened and a method of the Manager
 * interface is called. Among the arguments are arg_uuid, arg_directory and,
 * when arg_slice is set, one property named Slice.
 *
 * pid: process ID of the container leader as passed in by main().
 * NOTE(review): presumably forwarded as a call argument; the line doing so
 * is not visible here. */
1000 static int register_machine(pid_t pid) {
1001 _cleanup_bus_error_free_ sd_bus_error error = SD_BUS_ERROR_NULL;
1002 _cleanup_bus_unref_ sd_bus *bus = NULL;
1005 r = sd_bus_open_system(&bus);
1007 log_error("Failed to open system bus: %s", strerror(-r));
1011 r = sd_bus_call_method(
1013 "org.freedesktop.machine1",
1014 "/org/freedesktop/machine1",
1015 "org.freedesktop.machine1.Manager",
1021 SD_BUS_MESSAGE_APPEND_ID128(arg_uuid),
1025 strempty(arg_directory),
1026 !isempty(arg_slice), "Slice", "s", arg_slice);
1028 log_error("Failed to register machine: %s", bus_error_message(&error, r));
/* Ask the org.freedesktop.machine1 service to terminate the machine.
 *
 * Two bus calls are made on the system bus: the first goes to the Manager
 * interface and yields an object path, the second goes to the Machine
 * interface. Failures of either call are only logged at debug level, because
 * the machine may already be gone.
 *
 * pid: process ID of the container leader as passed in by main().
 * NOTE(review): presumably used to look up the machine object; the call
 * arguments are not visible here. */
1035 static int terminate_machine(pid_t pid) {
1036 _cleanup_bus_error_free_ sd_bus_error error = SD_BUS_ERROR_NULL;
1037 _cleanup_bus_message_unref_ sd_bus_message *reply = NULL;
1038 _cleanup_bus_unref_ sd_bus *bus = NULL;
1042 r = sd_bus_default_system(&bus);
1044 log_error("Failed to open system bus: %s", strerror(-r));
1048 r = sd_bus_call_method(
1050 "org.freedesktop.machine1",
1051 "/org/freedesktop/machine1",
1052 "org.freedesktop.machine1.Manager",
1059 /* Note that the machine might already have been
1060 * cleaned up automatically, hence don't consider it a
1061 * failure if we cannot get the machine object. */
1062 log_debug("Failed to get machine: %s", bus_error_message(&error, r));
/* The reply carries the object path of the machine. */
1066 r = sd_bus_message_read(reply, "o", &path);
1068 return bus_log_parse_error(r);
1070 r = sd_bus_call_method(
1072 "org.freedesktop.machine1",
1074 "org.freedesktop.machine1.Machine",
1080 log_debug("Failed to terminate machine: %s", bus_error_message(&error, r));
/* Probe for the kernel audit subsystem by opening a raw NETLINK_AUDIT
 * socket; the socket is closed again right away.
 * NOTE(review): presumably the result reflects whether the socket could be
 * opened; the return statements are not visible here. */
1087 static bool audit_enabled(void) {
1090 fd = socket(AF_NETLINK, SOCK_RAW, NETLINK_AUDIT);
1092 close_nointr_nofail(fd);
1098 int main(int argc, char *argv[]) {
1100 int r = EXIT_FAILURE, k;
1101 _cleanup_close_ int master = -1, kdbus_fd = -1, sync_fd = -1;
1103 const char *console = NULL;
1105 _cleanup_close_pipe_ int kmsg_socket_pair[2] = { -1, -1 };
1106 _cleanup_fdset_free_ FDSet *fds = NULL;
1107 _cleanup_free_ char *kdbus_namespace = NULL;
1110 log_parse_environment();
1113 k = parse_argv(argc, argv);
1121 if (arg_directory) {
1124 p = path_make_absolute_cwd(arg_directory);
1125 free(arg_directory);
1128 arg_directory = get_current_dir_name();
1130 if (!arg_directory) {
1131 log_error("Failed to determine path, please use -D.");
1135 path_kill_slashes(arg_directory);
1138 arg_machine = strdup(basename(arg_directory));
1144 hostname_cleanup(arg_machine, false);
1145 if (isempty(arg_machine)) {
1146 log_error("Failed to determine machine name automatically, please use -M.");
1151 if (geteuid() != 0) {
1152 log_error("Need to be root.");
1156 if (sd_booted() <= 0) {
1157 log_error("Not running on a systemd system.");
1161 if (arg_boot && audit_enabled()) {
1162 log_warning("The kernel auditing subsystem is known to be incompatible with containers.\n"
1163 "Please make sure to turn off auditing with 'audit=0' on the kernel command\n"
1164 "line before using systemd-nspawn. Sleeping for 5s...\n");
1168 if (path_equal(arg_directory, "/")) {
1169 log_error("Spawning container on root directory not supported.");
1173 if (path_is_os_tree(arg_directory) <= 0) {
1174 log_error("Directory %s doesn't look like an OS root directory (/etc/os-release is missing). Refusing.", arg_directory);
1179 n_fd_passed = sd_listen_fds(false);
1180 if (n_fd_passed > 0) {
1181 k = fdset_new_listen_fds(&fds, false);
1183 log_error("Failed to collect file descriptors: %s", strerror(-k));
1187 fdset_close_others(fds);
1190 master = posix_openpt(O_RDWR|O_NOCTTY|O_CLOEXEC|O_NDELAY);
1192 log_error("Failed to acquire pseudo tty: %m");
1196 console = ptsname(master);
1198 log_error("Failed to determine tty name: %m");
1202 log_info("Spawning container %s on %s. Press ^] three times within 1s to abort execution.", arg_machine, arg_directory);
1204 if (unlockpt(master) < 0) {
1205 log_error("Failed to unlock tty: %m");
1209 ns = strappenda("machine-", arg_machine);
1210 kdbus_fd = bus_kernel_create_namespace(ns, &kdbus_namespace);
1212 log_debug("Failed to create kdbus namespace: %s", strerror(-r));
1214 log_debug("Successfully created kdbus namespace as %s", kdbus_namespace);
1216 if (socketpair(AF_UNIX, SOCK_DGRAM|SOCK_NONBLOCK|SOCK_CLOEXEC, 0, kmsg_socket_pair) < 0) {
1217 log_error("Failed to create kmsg socket pair: %m");
1221 sync_fd = eventfd(0, EFD_CLOEXEC);
1223 log_error("Failed to create event fd: %m");
1227 sd_notify(0, "READY=1");
1229 assert_se(sigemptyset(&mask) == 0);
1230 sigset_add_many(&mask, SIGCHLD, SIGWINCH, SIGTERM, SIGINT, -1);
1231 assert_se(sigprocmask(SIG_BLOCK, &mask, NULL) == 0);
1236 pid = syscall(__NR_clone, SIGCHLD|CLONE_NEWIPC|CLONE_NEWNS|CLONE_NEWPID|CLONE_NEWUTS|(arg_private_network ? CLONE_NEWNET : 0), NULL);
1238 if (errno == EINVAL)
1239 log_error("clone() failed, do you have namespace support enabled in your kernel? (You need UTS, IPC, PID and NET namespacing built in): %m");
1241 log_error("clone() failed: %m");
1248 const char *home = NULL;
1249 uid_t uid = (uid_t) -1;
1250 gid_t gid = (gid_t) -1;
1252 const char *envp[] = {
1253 "PATH=" DEFAULT_PATH_SPLIT_USR,
1254 "container=systemd-nspawn", /* LXC sets container=lxc, so follow the scheme here */
1259 NULL, /* container_uuid */
1260 NULL, /* LISTEN_FDS */
1261 NULL, /* LISTEN_PID */
1267 envp[n_env] = strv_find_prefix(environ, "TERM=");
1271 close_nointr_nofail(master);
1274 close_nointr(STDIN_FILENO);
1275 close_nointr(STDOUT_FILENO);
1276 close_nointr(STDERR_FILENO);
1278 close_nointr_nofail(kmsg_socket_pair[0]);
1279 kmsg_socket_pair[0] = -1;
1281 reset_all_signal_handlers();
1283 assert_se(sigemptyset(&mask) == 0);
1284 assert_se(sigprocmask(SIG_SETMASK, &mask, NULL) == 0);
1286 k = open_terminal(console, O_RDWR);
1287 if (k != STDIN_FILENO) {
1289 close_nointr_nofail(k);
1293 log_error("Failed to open console: %s", strerror(-k));
1297 if (dup2(STDIN_FILENO, STDOUT_FILENO) != STDOUT_FILENO ||
1298 dup2(STDIN_FILENO, STDERR_FILENO) != STDERR_FILENO) {
1299 log_error("Failed to duplicate console: %m");
1304 log_error("setsid() failed: %m");
1308 if (prctl(PR_SET_PDEATHSIG, SIGKILL) < 0) {
1309 log_error("PR_SET_PDEATHSIG failed: %m");
1313 /* Mark everything as slave, so that we still
1314 * receive mounts from the real root, but don't
1315 * propagate mounts to the real root. */
1316 if (mount(NULL, "/", NULL, MS_SLAVE|MS_REC, NULL) < 0) {
1317 log_error("MS_SLAVE|MS_REC failed: %m");
1321 /* Turn directory into bind mount */
1322 if (mount(arg_directory, arg_directory, "bind", MS_BIND|MS_REC, NULL) < 0) {
1323 log_error("Failed to make bind mount.");
1328 if (mount(arg_directory, arg_directory, "bind", MS_BIND|MS_REMOUNT|MS_RDONLY|MS_REC, NULL) < 0) {
1329 log_error("Failed to make read-only.");
1333 if (mount_all(arg_directory) < 0)
1336 if (copy_devnodes(arg_directory) < 0)
1339 if (setup_ptmx(arg_directory) < 0)
1342 dev_setup(arg_directory);
1344 if (setup_dev_console(arg_directory, console) < 0)
1347 if (setup_kmsg(arg_directory, kmsg_socket_pair[1]) < 0)
1350 close_nointr_nofail(kmsg_socket_pair[1]);
1351 kmsg_socket_pair[1] = -1;
1353 if (setup_boot_id(arg_directory) < 0)
1356 if (setup_timezone(arg_directory) < 0)
1359 if (setup_resolv_conf(arg_directory) < 0)
1362 if (setup_journal(arg_directory) < 0)
1365 if (mount_binds(arg_directory, arg_bind, 0) < 0)
1368 if (mount_binds(arg_directory, arg_bind_ro, MS_RDONLY) < 0)
1371 if (setup_kdbus(arg_directory, kdbus_namespace) < 0)
1374 if (chdir(arg_directory) < 0) {
1375 log_error("chdir(%s) failed: %m", arg_directory);
1379 if (mount(arg_directory, "/", NULL, MS_MOVE, NULL) < 0) {
1380 log_error("mount(MS_MOVE) failed: %m");
1384 if (chroot(".") < 0) {
1385 log_error("chroot() failed: %m");
1389 if (chdir("/") < 0) {
1390 log_error("chdir() failed: %m");
1398 if (drop_capabilities() < 0) {
1399 log_error("drop_capabilities() failed: %m");
1405 /* Note that this resolves user names
1406 * inside the container, and hence
1407 * accesses the NSS modules from the
1408 * container and not the host. This is
1411 if (get_user_creds((const char**)&arg_user, &uid, &gid, &home, NULL) < 0) {
1412 log_error("get_user_creds() failed: %m");
1416 if (mkdir_parents_label(home, 0775) < 0) {
1417 log_error("mkdir_parents_label() failed: %m");
1421 if (mkdir_safe_label(home, 0775, uid, gid) < 0) {
1422 log_error("mkdir_safe_label() failed: %m");
1426 if (initgroups((const char*)arg_user, gid) < 0) {
1427 log_error("initgroups() failed: %m");
1431 if (setresgid(gid, gid, gid) < 0) {
1432 log_error("setregid() failed: %m");
1436 if (setresuid(uid, uid, uid) < 0) {
1437 log_error("setreuid() failed: %m");
1441 /* Reset everything fully to 0, just in case */
1443 if (setgroups(0, NULL) < 0) {
1444 log_error("setgroups() failed: %m");
1448 if (setresgid(0, 0, 0) < 0) {
1449 log_error("setregid() failed: %m");
1453 if (setresuid(0, 0, 0) < 0) {
1454 log_error("setreuid() failed: %m");
1459 if ((asprintf((char**)(envp + n_env++), "HOME=%s", home ? home: "/root") < 0) ||
1460 (asprintf((char**)(envp + n_env++), "USER=%s", arg_user ? arg_user : "root") < 0) ||
1461 (asprintf((char**)(envp + n_env++), "LOGNAME=%s", arg_user ? arg_user : "root") < 0)) {
1466 if (!sd_id128_equal(arg_uuid, SD_ID128_NULL)) {
1467 if (asprintf((char**)(envp + n_env++), "container_uuid=" SD_ID128_FORMAT_STR, SD_ID128_FORMAT_VAL(arg_uuid)) < 0) {
1473 if (fdset_size(fds) > 0) {
1474 k = fdset_cloexec(fds, false);
1476 log_error("Failed to unset O_CLOEXEC for file descriptors.");
1480 if ((asprintf((char **)(envp + n_env++), "LISTEN_FDS=%u", n_fd_passed) < 0) ||
1481 (asprintf((char **)(envp + n_env++), "LISTEN_PID=1") < 0)) {
1489 eventfd_read(sync_fd, &x);
1490 close_nointr_nofail(sync_fd);
1493 if (!strv_isempty(arg_setenv)) {
1496 n = strv_env_merge(2, envp, arg_setenv);
1504 env_use = (char**) envp;
1510 /* Automatically search for the init system */
1512 l = 1 + argc - optind;
1513 a = newa(char*, l + 1);
1514 memcpy(a + 1, argv + optind, l * sizeof(char*));
1516 a[0] = (char*) "/usr/lib/systemd/systemd";
1517 execve(a[0], a, env_use);
1519 a[0] = (char*) "/lib/systemd/systemd";
1520 execve(a[0], a, env_use);
1522 a[0] = (char*) "/sbin/init";
1523 execve(a[0], a, env_use);
1524 } else if (argc > optind)
1525 execvpe(argv[optind], argv + optind, env_use);
1527 chdir(home ? home : "/root");
1528 execle("/bin/bash", "-bash", NULL, env_use);
1531 log_error("execv() failed: %m");
1534 _exit(EXIT_FAILURE);
1540 r = register_machine(pid);
1544 eventfd_write(sync_fd, 1);
1545 close_nointr_nofail(sync_fd);
1548 k = process_pty(master, &mask, arg_boot ? pid : 0, SIGRTMIN+3);
1556 /* Kill if it is not dead yet anyway */
1557 terminate_machine(pid);
1559 /* Redundant, but better safe than sorry */
1562 k = wait_for_terminate(pid, &status);
1570 if (status.si_code == CLD_EXITED) {
1571 r = status.si_status;
1572 if (status.si_status != 0) {
1573 log_error("Container %s failed with error code %i.", arg_machine, status.si_status);
1577 log_debug("Container %s exited successfully.", arg_machine);
1579 } else if (status.si_code == CLD_KILLED &&
1580 status.si_status == SIGINT) {
1581 log_info("Container %s has been shut down.", arg_machine);
1584 } else if (status.si_code == CLD_KILLED &&
1585 status.si_status == SIGHUP) {
1586 log_info("Container %s is being rebooted.", arg_machine);
1588 } else if (status.si_code == CLD_KILLED ||
1589 status.si_code == CLD_DUMPED) {
1591 log_error("Container %s terminated by signal %s.", arg_machine, signal_to_string(status.si_status));
1595 log_error("Container %s failed due to unknown reason.", arg_machine);
1605 free(arg_directory);