1 /*-*- Mode: C; c-basic-offset: 8; indent-tabs-mode: nil -*-*/
4 This file is part of systemd.
6 Copyright 2010 Lennart Poettering
8 systemd is free software; you can redistribute it and/or modify it
9 under the terms of the GNU Lesser General Public License as published by
10 the Free Software Foundation; either version 2.1 of the License, or
11 (at your option) any later version.
13 systemd is distributed in the hope that it will be useful, but
14 WITHOUT ANY WARRANTY; without even the implied warranty of
15 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
16 Lesser General Public License for more details.
18 You should have received a copy of the GNU Lesser General Public License
19 along with systemd; If not, see <http://www.gnu.org/licenses/>.
25 #include <sys/types.h>
26 #include <sys/syscall.h>
27 #include <sys/mount.h>
33 #include <sys/prctl.h>
34 #include <sys/capability.h>
37 #include <sys/signalfd.h>
41 #include <sys/socket.h>
42 #include <linux/netlink.h>
44 #include "sd-daemon.h"
53 #include "cgroup-util.h"
55 #include "path-util.h"
56 #include "loopback-setup.h"
57 #include "dev-setup.h"
62 #include "bus-error.h"
/* Journal linking mode selected with --link-journal / -j. The enumerator list
 * is not visible in this excerpt; the code in this file uses LINK_NO,
 * LINK_AUTO, LINK_GUEST and LINK_HOST. */
69 typedef enum LinkJournal {
/* Command line settings. They are written by parse_argv() and, for
 * arg_directory and arg_machine, defaulted in main(). */
/* Container root directory (-D); main() falls back to the current directory. */
76 static char *arg_directory = NULL;
/* User to run the payload as (-u); resolved via get_user_creds() in main(). */
77 static char *arg_user = NULL;
/* Machine UUID (--uuid); exported as container_uuid= when not the null ID. */
78 static sd_id128_t arg_uuid = {};
/* Machine name (-M); main() derives it from the directory name when unset,
 * and setup_hostname() passes it to sethostname(). */
79 static char *arg_machine = NULL;
/* Slice name (-S); handed to the machine registration call. */
80 static const char *arg_slice = NULL;
/* --private-network: adds CLONE_NEWNET to the clone flags and skips the
 * resolv.conf copy. */
81 static bool arg_private_network = false;
/* --read-only: main() remounts the root bind mount with MS_RDONLY. */
82 static bool arg_read_only = false;
/* -b: search for and execute an init binary instead of a command or shell. */
83 static bool arg_boot = false;
84 static LinkJournal arg_link_journal = LINK_AUTO;
/* Bitmask (1ULL << CAP_*) of capabilities the container keeps.
 * drop_capabilities() passes the complement of this mask to
 * capability_bounding_set_drop(); --capability sets bits and
 * --drop-capability clears them. Some terms of the default mask are not
 * visible in this excerpt. */
85 static uint64_t arg_retain =
87 (1ULL << CAP_DAC_OVERRIDE) |
88 (1ULL << CAP_DAC_READ_SEARCH) |
89 (1ULL << CAP_FOWNER) |
90 (1ULL << CAP_FSETID) |
91 (1ULL << CAP_IPC_OWNER) |
94 (1ULL << CAP_LINUX_IMMUTABLE) |
95 (1ULL << CAP_NET_BIND_SERVICE) |
96 (1ULL << CAP_NET_BROADCAST) |
97 (1ULL << CAP_NET_RAW) |
98 (1ULL << CAP_SETGID) |
99 (1ULL << CAP_SETFCAP) |
100 (1ULL << CAP_SETPCAP) |
101 (1ULL << CAP_SETUID) |
102 (1ULL << CAP_SYS_ADMIN) |
103 (1ULL << CAP_SYS_CHROOT) |
104 (1ULL << CAP_SYS_NICE) |
105 (1ULL << CAP_SYS_PTRACE) |
106 (1ULL << CAP_SYS_TTY_CONFIG) |
107 (1ULL << CAP_SYS_RESOURCE) |
108 (1ULL << CAP_SYS_BOOT) |
109 (1ULL << CAP_AUDIT_WRITE) |
110 (1ULL << CAP_AUDIT_CONTROL);
/* Bind mount requests (--bind / --bind-ro). parse_argv() appends entries two
 * at a time (source, then destination) and mount_binds() walks them as pairs. */
111 static char **arg_bind = NULL;
112 static char **arg_bind_ro = NULL;
/* Print the usage summary to stdout. The single %s in the format is filled
 * with program_invocation_short_name. Some continuation lines of the usage
 * text and the return statement are not visible in this excerpt. */
114 static int help(void) {
116 printf("%s [OPTIONS...] [PATH] [ARGUMENTS...]\n\n"
117 "Spawn a minimal namespace container for debugging, testing and building.\n\n"
118 " -h --help Show this help\n"
119 " --version Print version string\n"
120 " -D --directory=NAME Root directory for the container\n"
121 " -b --boot Boot up full system (i.e. invoke init)\n"
122 " -u --user=USER Run the command under specified user or uid\n"
123 " --uuid=UUID Set a specific machine UUID for the container\n"
124 " -M --machine=NAME Set the machine name for the container\n"
125 " -S --slice=SLICE Place the container in the specified slice\n"
126 " --private-network Disable network in container\n"
127 " --read-only Mount the root directory read-only\n"
128 " --capability=CAP In addition to the default, retain specified\n"
130 " --drop-capability=CAP Drop the specified capability from the default set\n"
131 " --link-journal=MODE Link up guest journal, one of no, auto, guest, host\n"
132 " -j Equivalent to --link-journal=host\n"
133 " --bind=PATH[:PATH] Bind mount a file or directory from the host into\n"
135 " --bind-ro=PATH[:PATH] Similar, but creates a read-only bind mount\n",
136 program_invocation_short_name);
/* Parse the command line into the arg_* globals using getopt_long().
 * The case labels, break statements and return statements of the option
 * switch are not visible in this excerpt, so the per-option comments below
 * are matched to options by the variables they assign. */
141 static int parse_argv(int argc, char *argv[]) {
/* Long option table; entries ending in an ARG_* constant have no short form. */
155 static const struct option options[] = {
156 { "help", no_argument, NULL, 'h' },
157 { "version", no_argument, NULL, ARG_VERSION },
158 { "directory", required_argument, NULL, 'D' },
159 { "user", required_argument, NULL, 'u' },
160 { "private-network", no_argument, NULL, ARG_PRIVATE_NETWORK },
161 { "boot", no_argument, NULL, 'b' },
162 { "uuid", required_argument, NULL, ARG_UUID },
163 { "read-only", no_argument, NULL, ARG_READ_ONLY },
164 { "capability", required_argument, NULL, ARG_CAPABILITY },
165 { "drop-capability", required_argument, NULL, ARG_DROP_CAPABILITY },
166 { "link-journal", required_argument, NULL, ARG_LINK_JOURNAL },
167 { "bind", required_argument, NULL, ARG_BIND },
168 { "bind-ro", required_argument, NULL, ARG_BIND_RO },
169 { "machine", required_argument, NULL, 'M' },
170 { "slice", required_argument, NULL, 'S' },
/* With glibc, the leading '+' in the option string stops parsing at the
 * first non-option argument, so the arguments of the command to run inside
 * the container are not interpreted as options of this program. */
179 while ((c = getopt_long(argc, argv, "+hD:u:bM:jS:", options, NULL)) >= 0) {
/* Version output. */
187 puts(PACKAGE_STRING);
188 puts(SYSTEMD_FEATURES);
/* Root directory: canonicalize_file_name() returns NULL on failure, which is
 * reported with the errno text via %m. */
193 arg_directory = canonicalize_file_name(optarg);
194 if (!arg_directory) {
195 log_error("Invalid root directory: %m");
/* User name or uid; stored as a private copy. */
203 arg_user = strdup(optarg);
209 case ARG_PRIVATE_NETWORK:
210 arg_private_network = true;
/* UUID: parsed straight into arg_uuid; a parse failure is reported. */
218 r = sd_id128_from_string(optarg, &arg_uuid);
220 log_error("Invalid UUID: %s", optarg);
/* Slice name; stored as a private copy. */
226 arg_slice = strdup(optarg);
/* Machine name: rejected unless hostname_is_valid() accepts it. */
233 if (!hostname_is_valid(optarg)) {
234 log_error("Invalid machine name: %s", optarg);
239 arg_machine = strdup(optarg);
246 arg_read_only = true;
/* --capability and --drop-capability share this code: the argument is a
 * comma-separated list of capability names, each set in or cleared from
 * arg_retain depending on which option was given. */
250 case ARG_DROP_CAPABILITY: {
254 FOREACH_WORD_SEPARATOR(word, length, optarg, ",", state) {
258 t = strndup(word, length);
262 if (cap_from_name(t, &cap) < 0) {
263 log_error("Failed to parse capability %s.", t);
270 if (c == ARG_CAPABILITY)
271 arg_retain |= 1ULL << (uint64_t) cap;
273 arg_retain &= ~(1ULL << (uint64_t) cap);
/* NOTE(review): presumably the -j case (label not visible). It selects
 * LINK_GUEST, whereas the help() text describes -j as equivalent to
 * --link-journal=host -- confirm which one is intended. */
280 arg_link_journal = LINK_GUEST;
283 case ARG_LINK_JOURNAL:
284 if (streq(optarg, "auto"))
285 arg_link_journal = LINK_AUTO;
286 else if (streq(optarg, "no"))
287 arg_link_journal = LINK_NO;
288 else if (streq(optarg, "guest"))
289 arg_link_journal = LINK_GUEST;
290 else if (streq(optarg, "host"))
291 arg_link_journal = LINK_HOST;
293 log_error("Failed to parse link journal mode %s", optarg);
/* --bind / --bind-ro: the argument is split at the first ':' into a and b
 * (how b is set when there is no ':' is not visible here). Both must be
 * absolute paths; they are appended, a then b, to the vector chosen by the
 * option. */
301 _cleanup_free_ char *a = NULL, *b = NULL;
305 x = c == ARG_BIND ? &arg_bind : &arg_bind_ro;
307 e = strchr(optarg, ':');
309 a = strndup(optarg, e - optarg);
319 if (!path_is_absolute(a) || !path_is_absolute(b)) {
320 log_error("Invalid bind mount specification: %s", optarg);
324 r = strv_extend(x, a);
328 r = strv_extend(x, b);
339 assert_not_reached("Unhandled option");
/* Mount the file systems listed in mount_table beneath the container root
 * `dest`. The MountPoint struct definition and part of the mount() argument
 * list are not visible in this excerpt; the fields used below are what,
 * where, options, flags and fatal. */
346 static int mount_all(const char *dest) {
348 typedef struct MountPoint {
/* Order matters: several entries are a bind mount immediately followed by a
 * read-only remount of the same target (entries with a NULL first field). */
357 static const MountPoint mount_table[] = {
358 { "proc", "/proc", "proc", NULL, MS_NOSUID|MS_NOEXEC|MS_NODEV, true },
359 { "/proc/sys", "/proc/sys", NULL, NULL, MS_BIND, true }, /* Bind mount first */
360 { NULL, "/proc/sys", NULL, NULL, MS_BIND|MS_RDONLY|MS_REMOUNT, true }, /* Then, make it r/o */
361 { "sysfs", "/sys", "sysfs", NULL, MS_RDONLY|MS_NOSUID|MS_NOEXEC|MS_NODEV, true },
362 { "tmpfs", "/dev", "tmpfs", "mode=755", MS_NOSUID|MS_STRICTATIME, true },
363 { "devpts", "/dev/pts", "devpts","newinstance,ptmxmode=0666,mode=620,gid=" STRINGIFY(TTY_GID), MS_NOSUID|MS_NOEXEC, true },
364 { "tmpfs", "/dev/shm", "tmpfs", "mode=1777", MS_NOSUID|MS_NODEV|MS_STRICTATIME, true },
365 { "tmpfs", "/run", "tmpfs", "mode=755", MS_NOSUID|MS_NODEV|MS_STRICTATIME, true },
367 { "/sys/fs/selinux", "/sys/fs/selinux", NULL, NULL, MS_BIND, false }, /* Bind mount first */
368 { NULL, "/sys/fs/selinux", NULL, NULL, MS_BIND|MS_RDONLY|MS_REMOUNT, false }, /* Then, make it r/o */
375 for (k = 0; k < ELEMENTSOF(mount_table); k++) {
376 _cleanup_free_ char *where = NULL;
/* Target path inside the container: dest + "/" + entry path. */
379 where = strjoin(dest, "/", mount_table[k].where, NULL);
/* t < 0 is an error (negative errno-style value), t > 0 means the target is
 * already a mount point. */
383 t = path_is_mount_point(where, true);
385 log_error("Failed to detect whether %s is a mount point: %s", where, strerror(-t));
393 /* Skip this entry if it is not a remount. */
394 if (mount_table[k].what && t > 0)
/* Best effort: the result of mkdir_p() is not checked. */
397 mkdir_p(where, 0755);
/* A failed mount is only reported for entries whose fatal flag is set. */
399 if (mount(mount_table[k].what,
402 mount_table[k].flags,
403 mount_table[k].options) < 0 &&
404 mount_table[k].fatal) {
406 log_error("mount(%s) failed: %m", where);
/* Bind mount each (source, destination) pair from `l` into the container
 * rooted at `dest`. `flags` holds extra mount flags; when non-zero they are
 * applied in a second remount step. The bodies of two branches and the
 * return statements are not visible in this excerpt. */
416 static int mount_binds(const char *dest, char **l, unsigned long flags) {
/* *x is the host path, *y the path inside the container. */
419 STRV_FOREACH_PAIR(x, y, l) {
420 _cleanup_free_ char *where = NULL;
421 struct stat source_st, dest_st;
423 if (stat(*x, &source_st) < 0) {
424 log_error("failed to stat %s: %m", *x);
428 where = strjoin(dest, "/", *y, NULL);
/* If the destination already exists it must have the same file type as the
 * source; otherwise a mount point of the matching type is created. */
432 if (stat(where, &dest_st) == 0) {
433 if ((source_st.st_mode & S_IFMT) != (dest_st.st_mode & S_IFMT)) {
434 log_error("The file types of %s and %s do not match. Refusing bind mount",
439 /* Create the mount point, but be conservative -- refuse to create block
440 * and char devices. */
441 if (S_ISDIR(source_st.st_mode))
442 mkdir_p_label(where, 0755);
443 else if (S_ISFIFO(source_st.st_mode))
445 else if (S_ISSOCK(source_st.st_mode))
446 mknod(where, 0644 | S_IFSOCK, 0);
447 else if (S_ISREG(source_st.st_mode))
450 log_error("Refusing to create mountpoint for file: %s", *x);
/* Plain bind mount first ... */
455 if (mount(*x, where, "bind", MS_BIND, NULL) < 0) {
456 log_error("mount(%s) failed: %m", where);
/* ... then, only if extra flags were requested, remount the bind mount with
 * them (the caller passes MS_RDONLY for --bind-ro). */
460 if (flags && mount(NULL, where, NULL, MS_REMOUNT|MS_BIND|flags, NULL) < 0) {
461 log_error("mount(%s) failed: %m", where);
469 static int setup_timezone(const char *dest) {
470 _cleanup_free_ char *where = NULL, *p = NULL, *q = NULL, *check = NULL, *what = NULL;
476 /* Fix the timezone, if possible */
477 r = readlink_malloc("/etc/localtime", &p);
479 log_warning("/etc/localtime is not a symlink, not updating container timezone.");
483 z = path_startswith(p, "../usr/share/zoneinfo/");
485 z = path_startswith(p, "/usr/share/zoneinfo/");
487 log_warning("/etc/localtime does not point into /usr/share/zoneinfo/, not updating container timezone.");
491 where = strappend(dest, "/etc/localtime");
495 r = readlink_malloc(where, &q);
497 y = path_startswith(q, "../usr/share/zoneinfo/");
499 y = path_startswith(q, "/usr/share/zoneinfo/");
502 /* Already pointing to the right place? Then do nothing .. */
503 if (y && streq(y, z))
507 check = strjoin(dest, "/usr/share/zoneinfo/", z, NULL);
511 if (access(check, F_OK) < 0) {
512 log_warning("Timezone %s does not exist in container, not updating container timezone.", z);
516 what = strappend("../usr/share/zoneinfo/", z);
521 if (symlink(what, where) < 0) {
522 log_error("Failed to correct timezone of container: %m");
529 static int setup_resolv_conf(const char *dest) {
530 char _cleanup_free_ *where = NULL;
534 if (arg_private_network)
537 /* Fix resolv.conf, if possible */
538 where = strappend(dest, "/etc/resolv.conf");
542 /* We don't really care for the results of this really. If it
543 * fails, it fails, but meh... */
544 copy_file("/etc/resolv.conf", where, O_TRUNC|O_NOFOLLOW);
549 static int setup_boot_id(const char *dest) {
550 _cleanup_free_ char *from = NULL, *to = NULL;
557 /* Generate a new randomized boot ID, so that each boot-up of
558 * the container gets a new one */
560 from = strappend(dest, "/dev/proc-sys-kernel-random-boot-id");
561 to = strappend(dest, "/proc/sys/kernel/random/boot_id");
565 r = sd_id128_randomize(&rnd);
567 log_error("Failed to generate random boot id: %s", strerror(-r));
571 snprintf(as_uuid, sizeof(as_uuid),
572 "%02x%02x%02x%02x-%02x%02x-%02x%02x-%02x%02x-%02x%02x%02x%02x%02x%02x",
573 SD_ID128_FORMAT_VAL(rnd));
574 char_array_0(as_uuid);
576 r = write_string_file(from, as_uuid);
578 log_error("Failed to write boot id: %s", strerror(-r));
582 if (mount(from, to, "bind", MS_BIND, NULL) < 0) {
583 log_error("Failed to bind mount boot id: %m");
585 } else if (mount(from, to, "bind", MS_BIND|MS_REMOUNT|MS_RDONLY, NULL))
586 log_warning("Failed to make boot id read-only: %m");
592 static int copy_devnodes(const char *dest) {
594 static const char devnodes[] =
604 _cleanup_umask_ mode_t u;
610 NULSTR_FOREACH(d, devnodes) {
612 _cleanup_free_ char *from = NULL, *to = NULL;
614 asprintf(&from, "/dev/%s", d);
615 asprintf(&to, "%s/dev/%s", dest, d);
626 if (stat(from, &st) < 0) {
628 if (errno != ENOENT) {
629 log_error("Failed to stat %s: %m", from);
634 } else if (!S_ISCHR(st.st_mode) && !S_ISBLK(st.st_mode)) {
636 log_error("%s is not a char or block device, cannot copy", from);
640 } else if (mknod(to, st.st_mode, st.st_rdev) < 0) {
642 log_error("mknod(%s) failed: %m", dest);
651 static int setup_ptmx(const char *dest) {
652 _cleanup_free_ char *p = NULL;
654 p = strappend(dest, "/dev/ptmx");
658 if (symlink("pts/ptmx", p) < 0) {
659 log_error("Failed to create /dev/ptmx symlink: %m");
666 static int setup_dev_console(const char *dest, const char *console) {
668 _cleanup_free_ char *to = NULL;
670 _cleanup_umask_ mode_t u;
677 if (stat(console, &st) < 0) {
678 log_error("Failed to stat %s: %m", console);
681 } else if (!S_ISCHR(st.st_mode)) {
682 log_error("/dev/console is not a char device");
686 r = chmod_and_chown(console, 0600, 0, 0);
688 log_error("Failed to correct access mode for TTY: %s", strerror(-r));
692 if (asprintf(&to, "%s/dev/console", dest) < 0)
695 /* We need to bind mount the right tty to /dev/console since
696 * ptys can only exist on pts file systems. To have something
697 * to bind mount things on we create a device node first, that
698 * has the right major/minor (note that the major minor
699 * doesn't actually matter here, since we mount it over
702 if (mknod(to, (st.st_mode & ~07777) | 0600, st.st_rdev) < 0) {
703 log_error("mknod() for /dev/console failed: %m");
707 if (mount(console, to, "bind", MS_BIND, NULL) < 0) {
708 log_error("Bind mount for /dev/console failed: %m");
/* Provide a /proc/kmsg for the container that is backed by a FIFO.
 *
 * The FIFO is created as <dest>/dev/kmsg, bind mounted onto
 * <dest>/proc/kmsg, opened, and the open descriptor is sent as an SCM_RIGHTS
 * control message over `kmsg_socket`. The declaration of the control buffer,
 * the error-path returns and the final statements are not visible in this
 * excerpt. */
715 static int setup_kmsg(const char *dest, int kmsg_socket) {
716 _cleanup_free_ char *from = NULL, *to = NULL;
718 _cleanup_umask_ mode_t u;
/* Control buffer sized for exactly one file descriptor. */
720 struct cmsghdr cmsghdr;
721 uint8_t buf[CMSG_SPACE(sizeof(int))];
724 .msg_control = &control,
725 .msg_controllen = sizeof(control),
727 struct cmsghdr *cmsg;
730 assert(kmsg_socket >= 0);
734 /* We create the kmsg FIFO as /dev/kmsg, but immediately
735 * delete it after bind mounting it to /proc/kmsg. While FIFOs
736 * on the reading side behave very similar to /proc/kmsg,
737 * their writing side behaves differently from /dev/kmsg in
738 * that writing blocks when nothing is reading. In order to
739 * avoid any problems with containers deadlocking due to this
740 * we simply make /dev/kmsg unavailable to the container. */
741 if (asprintf(&from, "%s/dev/kmsg", dest) < 0 ||
742 asprintf(&to, "%s/proc/kmsg", dest) < 0)
745 if (mkfifo(from, 0600) < 0) {
746 log_error("mkfifo() for /dev/kmsg failed: %m");
750 r = chmod_and_chown(from, 0600, 0, 0);
752 log_error("Failed to correct access mode for /dev/kmsg: %s", strerror(-r));
756 if (mount(from, to, "bind", MS_BIND, NULL) < 0) {
757 log_error("Bind mount for /proc/kmsg failed: %m");
/* Opened read-write and non-blocking (O_NDELAY). */
761 fd = open(from, O_RDWR|O_NDELAY|O_CLOEXEC);
763 log_error("Failed to open fifo: %m");
/* Build a single SCM_RIGHTS control message carrying fd. */
767 cmsg = CMSG_FIRSTHDR(&mh);
768 cmsg->cmsg_level = SOL_SOCKET;
769 cmsg->cmsg_type = SCM_RIGHTS;
770 cmsg->cmsg_len = CMSG_LEN(sizeof(int));
771 memcpy(CMSG_DATA(cmsg), &fd, sizeof(int));
/* Shrink the control length to the bytes actually used. */
773 mh.msg_controllen = cmsg->cmsg_len;
775 /* Store away the fd in the socket, so that it stays open as
776 * long as we run the child */
777 k = sendmsg(kmsg_socket, &mh, MSG_DONTWAIT|MSG_NOSIGNAL);
/* The local descriptor is closed whether or not sending succeeded. */
778 close_nointr_nofail(fd);
781 log_error("Failed to send FIFO fd: %m");
785 /* And now make the FIFO unavailable as /dev/kmsg... */
790 static int setup_hostname(void) {
792 if (sethostname(arg_machine, strlen(arg_machine)) < 0)
/* Connect the journal directory of the container with the host according to
 * arg_link_journal.
 *
 * The machine ID is read from <directory>/etc/machine-id and used to form two
 * paths: p, /var/log/journal/<id> on the host, and q, the same path below the
 * container root. Depending on the mode, p becomes a symlink to q, or p is
 * bind mounted onto q. Many branch conditions, the derivation of `id` and all
 * return statements are not visible in this excerpt. */
798 static int setup_journal(const char *directory) {
799 sd_id128_t machine_id;
800 _cleanup_free_ char *p = NULL, *b = NULL, *q = NULL, *d = NULL;
/* Journal linking disabled. */
804 if (arg_link_journal == LINK_NO)
807 p = strappend(directory, "/etc/machine-id");
/* In LINK_AUTO mode a missing machine-id file is handled separately from
 * other read errors (the branch body is not visible). */
811 r = read_one_line_file(p, &b);
812 if (r == -ENOENT && arg_link_journal == LINK_AUTO)
815 log_error("Failed to read machine ID from %s: %s", p, strerror(-r));
/* `id` is presumably derived from the line read into b -- the assignment is
 * not visible here. */
820 if (isempty(id) && arg_link_journal == LINK_AUTO)
823 /* Verify validity */
824 r = sd_id128_from_string(id, &machine_id);
826 log_error("Failed to parse machine ID from %s: %s", p, strerror(-r));
/* From here on p is the host-side journal path and q the guest-side one. */
831 p = strappend("/var/log/journal/", id);
832 q = strjoin(directory, "/var/log/journal/", id, NULL);
/* Either path already being a mount point is only reported as an error when
 * a mode other than LINK_AUTO was selected. */
836 if (path_is_mount_point(p, false) > 0) {
837 if (arg_link_journal != LINK_AUTO) {
838 log_error("%s: already a mount point, refusing to use for journal", p);
845 if (path_is_mount_point(q, false) > 0) {
846 if (arg_link_journal != LINK_AUTO) {
847 log_error("%s: already a mount point, refusing to use for journal", q);
/* Inspect what currently exists at the host path: the result distinguishes a
 * symlink (r >= 0), something that is not a symlink (-EINVAL) and a missing
 * entry (-ENOENT). */
854 r = readlink_and_make_absolute(p, &d);
856 if ((arg_link_journal == LINK_GUEST ||
857 arg_link_journal == LINK_AUTO) &&
860 r = mkdir_p(q, 0755);
862 log_warning("failed to create directory %s: %m", q);
867 log_error("Failed to remove symlink %s: %m", p);
870 } else if (r == -EINVAL) {
872 if (arg_link_journal == LINK_GUEST &&
875 if (errno == ENOTDIR) {
876 log_error("%s already exists and is neither a symlink nor a directory", p);
879 log_error("Failed to remove %s: %m", p);
/* NOTE(review): r holds the error here while the message prints errno via
 * %m -- confirm that readlink_and_make_absolute() leaves errno set. */
883 } else if (r != -ENOENT) {
884 log_error("readlink(%s) failed: %m", p);
/* LINK_GUEST: the host path becomes a symlink to the guest directory. */
888 if (arg_link_journal == LINK_GUEST) {
890 if (symlink(q, p) < 0) {
891 log_error("Failed to symlink %s to %s: %m", q, p);
895 r = mkdir_p(q, 0755);
897 log_warning("failed to create directory %s: %m", q);
/* LINK_HOST: make sure the host directory exists; in the remaining mode the
 * host path is only probed with access(). */
901 if (arg_link_journal == LINK_HOST) {
902 r = mkdir_p(p, 0755);
904 log_error("Failed to create %s: %m", p);
908 } else if (access(p, F_OK) < 0)
/* dir_is_empty() returning 0 means the guest directory has entries. */
911 if (dir_is_empty(q) == 0) {
912 log_error("%s not empty.", q);
916 r = mkdir_p(q, 0755);
918 log_error("Failed to create %s: %m", q);
/* Bind mount the host journal directory onto the guest one. */
922 if (mount(p, q, "bind", MS_BIND, NULL) < 0) {
923 log_error("Failed to bind mount journal from host into guest: %m");
930 static int drop_capabilities(void) {
931 return capability_bounding_set_drop(~arg_retain, false);
/* Announce this container to the org.freedesktop.machine1 service on the
 * system bus. The method name, the signature string and several arguments of
 * the call are not visible in this excerpt. */
934 static int register_machine(void) {
935 _cleanup_bus_error_free_ sd_bus_error error = SD_BUS_ERROR_NULL;
936 _cleanup_bus_unref_ sd_bus *bus = NULL;
939 r = sd_bus_open_system(&bus);
941 log_error("Failed to open system bus: %s", strerror(-r));
/* Call on the Manager interface of the /org/freedesktop/machine1 object. */
945 r = sd_bus_call_method(
947 "org.freedesktop.machine1",
948 "/org/freedesktop/machine1",
949 "org.freedesktop.machine1.Manager",
955 SD_BUS_MESSAGE_APPEND_ID128(arg_uuid),
/* An unset directory is sent as the empty string. */
959 strempty(arg_directory),
/* The first argument is 1 when a slice was configured and 0 otherwise;
 * presumably it is the number of (name, type, value) property entries that
 * follow -- confirm against the hidden signature string. */
960 !isempty(arg_slice), "Slice", "s", arg_slice);
962 log_error("Failed to register machine: %s", bus_error_message(&error, r));
/* Ask the org.freedesktop.machine1 service to terminate the machine that
 * belongs to `pid`. Two bus calls are made: one on the Manager interface
 * whose reply contains an object path, and one on the Machine interface.
 * Failures of either call are only logged at debug level. The method names
 * and call arguments are not visible in this excerpt. */
969 static int terminate_machine(pid_t pid) {
970 _cleanup_bus_error_free_ sd_bus_error error = SD_BUS_ERROR_NULL;
971 _cleanup_bus_message_unref_ sd_bus_message *reply = NULL;
972 _cleanup_bus_unref_ sd_bus *bus = NULL;
976 r = sd_bus_default_system(&bus);
978 log_error("Failed to open system bus: %s", strerror(-r));
/* First call: look the machine up via the Manager interface. */
982 r = sd_bus_call_method(
984 "org.freedesktop.machine1",
985 "/org/freedesktop/machine1",
986 "org.freedesktop.machine1.Manager",
993 /* Note that the machine might already have been
994 * cleaned up automatically, hence don't consider it a
995 * failure if we cannot get the machine object. */
996 log_debug("Failed to get machine: %s", bus_error_message(&error, r));
/* The reply carries a single object path ("o"). */
1000 r = sd_bus_message_read(reply, "o", &path);
1002 return bus_log_parse_error(r);
/* Second call: invoked on the Machine interface. */
1004 r = sd_bus_call_method(
1006 "org.freedesktop.machine1",
1008 "org.freedesktop.machine1.Machine",
1014 log_debug("Failed to terminate machine: %s", bus_error_message(&error, r));
/* Probe for kernel audit support by trying to open a NETLINK_AUDIT socket.
 * Returns true if the socket could be created, false otherwise; the probe
 * socket is closed again immediately. */
static bool audit_enabled(void) {
        int probe;

        probe = socket(AF_NETLINK, SOCK_RAW, NETLINK_AUDIT);
        if (probe < 0)
                return false;

        close_nointr_nofail(probe);
        return true;
}
/* Program entry point.
 *
 * Outline of what is visible in this excerpt: parse the command line, settle
 * on a root directory and machine name, run sanity checks, allocate a pseudo
 * terminal, then clone() a child into new namespaces. The child builds the
 * container file system view, switches root, adjusts credentials and
 * environment and executes init, the requested command or a shell. The
 * parent forwards the pty, waits for the child and reports how it ended.
 * Many lines (labels, gotos, closing braces, some declarations) are missing
 * from this excerpt. */
1032 int main(int argc, char *argv[]) {
1034 int r = EXIT_FAILURE, k;
1035 _cleanup_close_ int master = -1;
1037 const char *console = NULL;
1039 _cleanup_close_pipe_ int kmsg_socket_pair[2] = { -1, -1 };
1040 _cleanup_fdset_free_ FDSet *fds = NULL;
1042 log_parse_environment();
1045 k = parse_argv(argc, argv);
/* Root directory: make a given path absolute, otherwise use the current
 * working directory. */
1053 if (arg_directory) {
1056 p = path_make_absolute_cwd(arg_directory);
1057 free(arg_directory);
1060 arg_directory = get_current_dir_name();
1062 if (!arg_directory) {
1063 log_error("Failed to determine path, please use -D.");
1067 path_kill_slashes(arg_directory);
/* Machine name derived from the last component of the directory and then
 * cleaned up; an empty result is an error. */
1070 arg_machine = strdup(path_get_file_name(arg_directory));
1076 hostname_cleanup(arg_machine, false);
1077 if (isempty(arg_machine)) {
1078 log_error("Failed to determine machine name automatically, please use -M.");
/* Preconditions: root privileges and a systemd-booted host. */
1083 if (geteuid() != 0) {
1084 log_error("Need to be root.");
1088 if (sd_booted() <= 0) {
1089 log_error("Not running on a systemd system.");
1093 if (arg_boot && audit_enabled()) {
1094 log_warning("The kernel auditing subsystem is known to be incompatible with containers.\n"
1095 "Please make sure to turn off auditing with 'audit=0' on the kernel command\n"
1096 "line before using systemd-nspawn. Sleeping for 5s...\n");
1100 if (path_equal(arg_directory, "/")) {
1101 log_error("Spawning container on root directory not supported.");
1105 if (path_is_os_tree(arg_directory) <= 0) {
1106 log_error("Directory %s doesn't look like an OS root directory (/etc/os-release is missing). Refusing.", arg_directory);
/* File descriptors passed in by socket activation are collected in fds. */
1111 n_fd_passed = sd_listen_fds(false);
1112 if (n_fd_passed > 0) {
1113 k = fdset_new_listen_fds(&fds, false);
1115 log_error("Failed to collect file descriptors: %s", strerror(-k));
1119 fdset_close_others(fds);
/* Pseudo terminal: master stays with the parent, console is the name of the
 * slave side that the child opens. */
1122 master = posix_openpt(O_RDWR|O_NOCTTY|O_CLOEXEC|O_NDELAY);
1124 log_error("Failed to acquire pseudo tty: %m");
1128 console = ptsname(master);
1130 log_error("Failed to determine tty name: %m");
1134 log_info("Spawning container %s on %s. Press ^] three times within 1s to abort execution.", arg_machine, arg_directory);
1136 if (unlockpt(master) < 0) {
1137 log_error("Failed to unlock tty: %m");
/* Socket pair over which the child hands the kmsg FIFO descriptor to the
 * parent (see setup_kmsg()). */
1141 if (socketpair(AF_UNIX, SOCK_DGRAM|SOCK_NONBLOCK|SOCK_CLOEXEC, 0, kmsg_socket_pair) < 0) {
1142 log_error("Failed to create kmsg socket pair.");
1146 sd_notify(0, "READY=1");
/* Block the signals listed here before cloning. */
1148 assert_se(sigemptyset(&mask) == 0);
1149 sigset_add_many(&mask, SIGCHLD, SIGWINCH, SIGTERM, SIGINT, -1);
1150 assert_se(sigprocmask(SIG_BLOCK, &mask, NULL) == 0);
/* Raw clone() with new IPC, mount, PID and UTS namespaces, plus a network
 * namespace when --private-network was given. */
1155 pid = syscall(__NR_clone, SIGCHLD|CLONE_NEWIPC|CLONE_NEWNS|CLONE_NEWPID|CLONE_NEWUTS|(arg_private_network ? CLONE_NEWNET : 0), NULL);
1157 if (errno == EINVAL)
1158 log_error("clone() failed, do you have namespace support enabled in your kernel? (You need UTS, IPC, PID and NET namespacing built in): %m");
1160 log_error("clone() failed: %m");
/* ---- code that runs in the cloned child starts here ---- */
1167 const char *home = NULL;
1168 uid_t uid = (uid_t) -1;
1169 gid_t gid = (gid_t) -1;
/* Environment for the payload; the NULL slots are filled in below through
 * the n_env index. */
1171 const char *envp[] = {
1172 "PATH=/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin",
1173 "container=systemd-nspawn", /* LXC sets container=lxc, so follow the scheme here */
1178 NULL, /* container_uuid */
1179 NULL, /* LISTEN_FDS */
1180 NULL, /* LISTEN_PID */
/* TERM= is taken over from our own environment. */
1184 envp[n_env] = strv_find_prefix(environ, "TERM=");
1188 close_nointr_nofail(master);
/* Drop the inherited stdio so the console can take its place. */
1191 close_nointr(STDIN_FILENO);
1192 close_nointr(STDOUT_FILENO);
1193 close_nointr(STDERR_FILENO);
1195 close_nointr_nofail(kmsg_socket_pair[0]);
1196 kmsg_socket_pair[0] = -1;
1198 reset_all_signal_handlers();
1200 assert_se(sigemptyset(&mask) == 0);
1201 assert_se(sigprocmask(SIG_SETMASK, &mask, NULL) == 0);
/* The console must end up as fd 0; it is then duplicated to fds 1 and 2. */
1203 k = open_terminal(console, O_RDWR);
1204 if (k != STDIN_FILENO) {
1206 close_nointr_nofail(k);
1210 log_error("Failed to open console: %s", strerror(-k));
1214 if (dup2(STDIN_FILENO, STDOUT_FILENO) != STDOUT_FILENO ||
1215 dup2(STDIN_FILENO, STDERR_FILENO) != STDERR_FILENO) {
1216 log_error("Failed to duplicate console: %m");
1221 log_error("setsid() failed: %m");
/* Have the kernel send SIGKILL to the child when the parent dies. */
1225 if (prctl(PR_SET_PDEATHSIG, SIGKILL) < 0) {
1226 log_error("PR_SET_PDEATHSIG failed: %m");
1230 r = register_machine();
1234 /* Mark everything as slave, so that we still
1235 * receive mounts from the real root, but don't
1236 * propagate mounts to the real root. */
1237 if (mount(NULL, "/", NULL, MS_SLAVE|MS_REC, NULL) < 0) {
1238 log_error("MS_SLAVE|MS_REC failed: %m");
1242 /* Turn directory into bind mount */
1243 if (mount(arg_directory, arg_directory, "bind", MS_BIND|MS_REC, NULL) < 0) {
1244 log_error("Failed to make bind mount.");
/* Read-only remount of the root; presumably guarded by arg_read_only (the
 * condition is not visible). */
1249 if (mount(arg_directory, arg_directory, "bind", MS_BIND|MS_REMOUNT|MS_RDONLY|MS_REC, NULL) < 0) {
1250 log_error("Failed to make read-only.");
/* Populate the container's file system view; each step aborts the child on
 * a negative result. */
1254 if (mount_all(arg_directory) < 0)
1257 if (copy_devnodes(arg_directory) < 0)
1260 if (setup_ptmx(arg_directory) < 0)
1263 dev_setup(arg_directory);
1265 if (setup_dev_console(arg_directory, console) < 0)
1268 if (setup_kmsg(arg_directory, kmsg_socket_pair[1]) < 0)
1271 close_nointr_nofail(kmsg_socket_pair[1]);
1272 kmsg_socket_pair[1] = -1;
1274 if (setup_boot_id(arg_directory) < 0)
1277 if (setup_timezone(arg_directory) < 0)
1280 if (setup_resolv_conf(arg_directory) < 0)
1283 if (setup_journal(arg_directory) < 0)
1286 if (mount_binds(arg_directory, arg_bind, 0) < 0)
1289 if (mount_binds(arg_directory, arg_bind_ro, MS_RDONLY) < 0)
/* Switch root: enter the directory, move its mount onto "/", chroot into it
 * and reset the working directory. */
1292 if (chdir(arg_directory) < 0) {
1293 log_error("chdir(%s) failed: %m", arg_directory);
1297 if (mount(arg_directory, "/", NULL, MS_MOVE, NULL) < 0) {
1298 log_error("mount(MS_MOVE) failed: %m");
1302 if (chroot(".") < 0) {
1303 log_error("chroot() failed: %m");
1307 if (chdir("/") < 0) {
1308 log_error("chdir() failed: %m");
1316 if (drop_capabilities() < 0) {
1317 log_error("drop_capabilities() failed: %m");
1323 /* Note that this resolves user names
1324 * inside the container, and hence
1325 * accesses the NSS modules from the
1326 * container and not the host. This is
1329 if (get_user_creds((const char**)&arg_user, &uid, &gid, &home, NULL) < 0) {
1330 log_error("get_user_creds() failed: %m");
1334 if (mkdir_parents_label(home, 0775) < 0) {
1335 log_error("mkdir_parents_label() failed: %m");
1339 if (mkdir_safe_label(home, 0775, uid, gid) < 0) {
1340 log_error("mkdir_safe_label() failed: %m");
1344 if (initgroups((const char*)arg_user, gid) < 0) {
1345 log_error("initgroups() failed: %m");
/* NOTE(review): the two messages below name setregid()/setreuid() although
 * the calls are setresgid()/setresuid(). */
1349 if (setresgid(gid, gid, gid) < 0) {
1350 log_error("setregid() failed: %m");
1354 if (setresuid(uid, uid, uid) < 0) {
1355 log_error("setreuid() failed: %m");
1359 /* Reset everything fully to 0, just in case */
1361 if (setgroups(0, NULL) < 0) {
1362 log_error("setgroups() failed: %m");
/* NOTE(review): same message/call mismatch as above. */
1366 if (setresgid(0, 0, 0) < 0) {
1367 log_error("setregid() failed: %m");
1371 if (setresuid(0, 0, 0) < 0) {
1372 log_error("setreuid() failed: %m");
/* Fill the remaining environment slots; defaults are /root and root. */
1377 if ((asprintf((char**)(envp + n_env++), "HOME=%s", home ? home: "/root") < 0) ||
1378 (asprintf((char**)(envp + n_env++), "USER=%s", arg_user ? arg_user : "root") < 0) ||
1379 (asprintf((char**)(envp + n_env++), "LOGNAME=%s", arg_user ? arg_user : "root") < 0)) {
1384 if (!sd_id128_equal(arg_uuid, SD_ID128_NULL)) {
1385 if (asprintf((char**)(envp + n_env++), "container_uuid=" SD_ID128_FORMAT_STR, SD_ID128_FORMAT_VAL(arg_uuid)) < 0) {
/* Passed-in descriptors must survive exec, so O_CLOEXEC is cleared; the
 * payload is told about them with LISTEN_PID fixed to 1. */
1391 if (fdset_size(fds) > 0) {
1392 k = fdset_cloexec(fds, false);
1394 log_error("Failed to unset O_CLOEXEC for file descriptors.");
1398 if ((asprintf((char **)(envp + n_env++), "LISTEN_FDS=%u", n_fd_passed) < 0) ||
1399 (asprintf((char **)(envp + n_env++), "LISTEN_PID=1") < 0)) {
1411 /* Automatically search for the init system */
/* Argument vector for init: slot 0 is the binary path, followed by the
 * remaining command line arguments; each candidate is tried in turn because
 * execve() only returns on failure. */
1413 l = 1 + argc - optind;
1414 a = newa(char*, l + 1);
1415 memcpy(a + 1, argv + optind, l * sizeof(char*));
1417 a[0] = (char*) "/usr/lib/systemd/systemd";
1418 execve(a[0], a, (char**) envp);
1420 a[0] = (char*) "/lib/systemd/systemd";
1421 execve(a[0], a, (char**) envp);
1423 a[0] = (char*) "/sbin/init";
1424 execve(a[0], a, (char**) envp);
/* Otherwise run the command from the command line, or a login shell in the
 * home directory. */
1425 } else if (argc > optind)
1426 execvpe(argv[optind], argv + optind, (char**) envp);
1428 chdir(home ? home : "/root");
1429 execle("/bin/bash", "-bash", NULL, (char**) envp);
1432 log_error("execv() failed: %m");
1435 _exit(EXIT_FAILURE);
/* ---- parent side: pass data between our terminal and the pty ---- */
1441 k = process_pty(master, &mask, arg_boot ? pid : 0, SIGRTMIN+3);
1449 /* Kill if it is not dead yet anyway */
1450 terminate_machine(pid);
1452 /* Redundant, but better safe than sorry */
1455 k = wait_for_terminate(pid, &status);
/* Interpret how the child ended. */
1463 if (status.si_code == CLD_EXITED) {
1464 r = status.si_status;
1465 if (status.si_status != 0) {
1466 log_error("Container %s failed with error code %i.", arg_machine, status.si_status);
1470 log_debug("Container %s exited successfully.", arg_machine);
/* Death by SIGINT is reported as a shutdown, death by SIGHUP as a reboot. */
1472 } else if (status.si_code == CLD_KILLED &&
1473 status.si_status == SIGINT) {
1474 log_info("Container %s has been shut down.", arg_machine);
1477 } else if (status.si_code == CLD_KILLED &&
1478 status.si_status == SIGHUP) {
1479 log_info("Container %s is being rebooted.", arg_machine);
1481 } else if (status.si_code == CLD_KILLED ||
1482 status.si_code == CLD_DUMPED) {
1484 log_error("Container %s terminated by signal %s.", arg_machine, signal_to_string(status.si_status));
1488 log_error("Container %s failed due to unknown reason.", arg_machine);
1498 free(arg_directory);