1 /*-*- Mode: C; c-basic-offset: 8; indent-tabs-mode: nil -*-*/
4 This file is part of systemd.
6 Copyright 2010 Lennart Poettering
8 systemd is free software; you can redistribute it and/or modify it
9 under the terms of the GNU Lesser General Public License as published by
10 the Free Software Foundation; either version 2.1 of the License, or
11 (at your option) any later version.
13 systemd is distributed in the hope that it will be useful, but
14 WITHOUT ANY WARRANTY; without even the implied warranty of
15 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
16 Lesser General Public License for more details.
18 You should have received a copy of the GNU Lesser General Public License
19 along with systemd; If not, see <http://www.gnu.org/licenses/>.
25 #include <sys/types.h>
26 #include <sys/syscall.h>
27 #include <sys/mount.h>
33 #include <sys/prctl.h>
34 #include <sys/capability.h>
37 #include <sys/signalfd.h>
41 #include <sys/socket.h>
42 #include <linux/netlink.h>
44 #include "sd-daemon.h"
53 #include "cgroup-util.h"
55 #include "path-util.h"
56 #include "loopback-setup.h"
57 #include "dev-setup.h"
62 #include "bus-error.h"
69 typedef enum LinkJournal {
/* Settings collected from the command line. parse_argv() assigns them and
 * the remaining functions in this file read them. */
76 static char *arg_directory = NULL;
77 static char *arg_user = NULL;
78 static sd_id128_t arg_uuid = {};
79 static char *arg_machine = NULL;
80 static const char *arg_slice = NULL;
81 static bool arg_private_network = false;
82 static bool arg_read_only = false;
83 static bool arg_boot = false;
84 static LinkJournal arg_link_journal = LINK_AUTO;
/* Bit mask, one bit per CAP_* number, of the capabilities to retain.
 * The --capability= handler ORs further bits into it, and
 * drop_capabilities() passes the complement of this mask to
 * capability_bounding_set_drop(). */
85 static uint64_t arg_retain =
87 (1ULL << CAP_DAC_OVERRIDE) |
88 (1ULL << CAP_DAC_READ_SEARCH) |
89 (1ULL << CAP_FOWNER) |
90 (1ULL << CAP_FSETID) |
91 (1ULL << CAP_IPC_OWNER) |
94 (1ULL << CAP_LINUX_IMMUTABLE) |
95 (1ULL << CAP_NET_BIND_SERVICE) |
96 (1ULL << CAP_NET_BROADCAST) |
97 (1ULL << CAP_NET_RAW) |
98 (1ULL << CAP_SETGID) |
99 (1ULL << CAP_SETFCAP) |
100 (1ULL << CAP_SETPCAP) |
101 (1ULL << CAP_SETUID) |
102 (1ULL << CAP_SYS_ADMIN) |
103 (1ULL << CAP_SYS_CHROOT) |
104 (1ULL << CAP_SYS_NICE) |
105 (1ULL << CAP_SYS_PTRACE) |
106 (1ULL << CAP_SYS_TTY_CONFIG) |
107 (1ULL << CAP_SYS_RESOURCE) |
108 (1ULL << CAP_SYS_BOOT) |
109 (1ULL << CAP_AUDIT_WRITE) |
110 (1ULL << CAP_AUDIT_CONTROL);
/* Bind mount requests from --bind= and --bind-ro=. parse_argv() appends a
 * source string followed by a destination string for each request, and
 * mount_binds() walks the list pairwise. */
111 static char **arg_bind = NULL;
112 static char **arg_bind_ro = NULL;
/* Prints the usage summary to stdout; the leading %s is filled in with
 * program_invocation_short_name.
 *
 * NOTE(review): the "-j" entry says it is equivalent to --link-journal=host,
 * while parse_argv() contains a bare assignment of LINK_GUEST outside the
 * --link-journal= string matching, which looks like the -j handler. Confirm
 * which of the two is intended and align the other. */
114 static int help(void) {
116 printf("%s [OPTIONS...] [PATH] [ARGUMENTS...]\n\n"
117 "Spawn a minimal namespace container for debugging, testing and building.\n\n"
118 " -h --help Show this help\n"
119 " --version Print version string\n"
120 " -D --directory=NAME Root directory for the container\n"
121 " -b --boot Boot up full system (i.e. invoke init)\n"
122 " -u --user=USER Run the command under specified user or uid\n"
123 " --uuid=UUID Set a specific machine UUID for the container\n"
124 " -M --machine=NAME Set the machine name for the container\n"
125 " -S --slice=SLICE Place the container in the specified slice\n"
126 " --private-network Disable network in container\n"
127 " --read-only Mount the root directory read-only\n"
128 " --capability=CAP In addition to the default, retain specified\n"
130 " --link-journal=MODE Link up guest journal, one of no, auto, guest, host\n"
131 " -j Equivalent to --link-journal=host\n"
132 " --bind=PATH[:PATH] Bind mount a file or directory from the host into\n"
134 " --bind-ro=PATH[:PATH] Similar, but creates a read-only bind mount\n",
135 program_invocation_short_name);
/* Parses the command line with getopt_long() and stores the results in the
 * arg_* globals.
 *
 * Handling visible here:
 *  - the directory is canonicalized with canonicalize_file_name();
 *  - user, slice and machine name are duplicated with strdup(); the machine
 *    name must pass hostname_is_valid();
 *  - the UUID is parsed with sd_id128_from_string();
 *  - --capability= takes a comma separated list; each name is resolved with
 *    cap_from_name() and its bit is ORed into arg_retain;
 *  - --link-journal= accepts exactly "auto", "no", "guest" and "host";
 *  - --bind=/--bind-ro= split the argument at the first ':', require both
 *    paths to be absolute and append them, source first, to arg_bind or
 *    arg_bind_ro.
 *
 * The leading '+' in the short option string asks getopt_long() to stop at
 * the first non-option argument, per the getopt(3) documentation. */
140 static int parse_argv(int argc, char *argv[]) {
153 static const struct option options[] = {
154 { "help", no_argument, NULL, 'h' },
155 { "version", no_argument, NULL, ARG_VERSION },
156 { "directory", required_argument, NULL, 'D' },
157 { "user", required_argument, NULL, 'u' },
158 { "private-network", no_argument, NULL, ARG_PRIVATE_NETWORK },
159 { "boot", no_argument, NULL, 'b' },
160 { "uuid", required_argument, NULL, ARG_UUID },
161 { "read-only", no_argument, NULL, ARG_READ_ONLY },
162 { "capability", required_argument, NULL, ARG_CAPABILITY },
163 { "link-journal", required_argument, NULL, ARG_LINK_JOURNAL },
164 { "bind", required_argument, NULL, ARG_BIND },
165 { "bind-ro", required_argument, NULL, ARG_BIND_RO },
166 { "machine", required_argument, NULL, 'M' },
167 { "slice", required_argument, NULL, 'S' },
176 while ((c = getopt_long(argc, argv, "+hD:u:bM:jS:", options, NULL)) >= 0) {
184 puts(PACKAGE_STRING);
185 puts(SYSTEMD_FEATURES);
190 arg_directory = canonicalize_file_name(optarg);
191 if (!arg_directory) {
192 log_error("Failed to canonicalize root directory.");
200 arg_user = strdup(optarg);
206 case ARG_PRIVATE_NETWORK:
207 arg_private_network = true;
215 r = sd_id128_from_string(optarg, &arg_uuid);
217 log_error("Invalid UUID: %s", optarg);
223 arg_slice = strdup(optarg);
230 if (!hostname_is_valid(optarg)) {
231 log_error("Invalid machine name: %s", optarg);
236 arg_machine = strdup(optarg);
243 arg_read_only = true;
246 case ARG_CAPABILITY: {
250 FOREACH_WORD_SEPARATOR(word, length, optarg, ",", state) {
254 t = strndup(word, length);
258 if (cap_from_name(t, &cap) < 0) {
259 log_error("Failed to parse capability %s.", t);
265 arg_retain |= 1ULL << (uint64_t) cap;
/* NOTE(review): presumably the -j handler; the help text describes -j as
 * equivalent to --link-journal=host, not guest -- confirm. */
272 arg_link_journal = LINK_GUEST;
275 case ARG_LINK_JOURNAL:
276 if (streq(optarg, "auto"))
277 arg_link_journal = LINK_AUTO;
278 else if (streq(optarg, "no"))
279 arg_link_journal = LINK_NO;
280 else if (streq(optarg, "guest"))
281 arg_link_journal = LINK_GUEST;
282 else if (streq(optarg, "host"))
283 arg_link_journal = LINK_HOST;
285 log_error("Failed to parse link journal mode %s", optarg);
293 _cleanup_free_ char *a = NULL, *b = NULL;
297 x = c == ARG_BIND ? &arg_bind : &arg_bind_ro;
299 e = strchr(optarg, ':');
301 a = strndup(optarg, e - optarg);
311 if (!path_is_absolute(a) || !path_is_absolute(b)) {
312 log_error("Invalid bind mount specification: %s", optarg);
316 r = strv_extend(x, a);
320 r = strv_extend(x, b);
331 assert_not_reached("Unhandled option");
/* Mounts the API file systems listed in mount_table below the container
 * root.
 *
 * dest: path of the container root; every table target is joined onto it.
 *
 * For each entry the target is checked with path_is_mount_point(); an entry
 * that names a source and whose target already is a mount point is skipped,
 * so only the source-less remount entries are applied on top of existing
 * mounts. The target directory is created with mkdir_p() before mount() is
 * called. A failing mount() is logged only for entries whose last field
 * (fatal) is true. main() treats a negative return value as failure. */
338 static int mount_all(const char *dest) {
340 typedef struct MountPoint {
349 static const MountPoint mount_table[] = {
350 { "proc", "/proc", "proc", NULL, MS_NOSUID|MS_NOEXEC|MS_NODEV, true },
351 { "/proc/sys", "/proc/sys", NULL, NULL, MS_BIND, true }, /* Bind mount first */
352 { NULL, "/proc/sys", NULL, NULL, MS_BIND|MS_RDONLY|MS_REMOUNT, true }, /* Then, make it r/o */
353 { "sysfs", "/sys", "sysfs", NULL, MS_RDONLY|MS_NOSUID|MS_NOEXEC|MS_NODEV, true },
354 { "tmpfs", "/dev", "tmpfs", "mode=755", MS_NOSUID|MS_STRICTATIME, true },
355 { "devpts", "/dev/pts", "devpts","newinstance,ptmxmode=0666,mode=620,gid=" STRINGIFY(TTY_GID), MS_NOSUID|MS_NOEXEC, true },
356 { "tmpfs", "/dev/shm", "tmpfs", "mode=1777", MS_NOSUID|MS_NODEV|MS_STRICTATIME, true },
357 { "tmpfs", "/run", "tmpfs", "mode=755", MS_NOSUID|MS_NODEV|MS_STRICTATIME, true },
359 { "/sys/fs/selinux", "/sys/fs/selinux", NULL, NULL, MS_BIND, false }, /* Bind mount first */
360 { NULL, "/sys/fs/selinux", NULL, NULL, MS_BIND|MS_RDONLY|MS_REMOUNT, false }, /* Then, make it r/o */
367 for (k = 0; k < ELEMENTSOF(mount_table); k++) {
368 _cleanup_free_ char *where = NULL;
371 where = strjoin(dest, "/", mount_table[k].where, NULL);
375 t = path_is_mount_point(where, true);
377 log_error("Failed to detect whether %s is a mount point: %s", where, strerror(-t));
385 /* Skip this entry if it is not a remount. */
386 if (mount_table[k].what && t > 0)
389 mkdir_p(where, 0755);
391 if (mount(mount_table[k].what,
394 mount_table[k].flags,
395 mount_table[k].options) < 0 &&
396 mount_table[k].fatal) {
398 log_error("mount(%s) failed: %m", where);
/* Applies the user-requested bind mounts.
 *
 * dest:  path of the container root.
 * l:     string list read pairwise as (host source, container destination).
 * flags: extra mount flags; main() passes 0 for --bind= entries and
 *        MS_RDONLY for --bind-ro= entries.
 *
 * The source must be stat()-able. If the destination below dest already
 * exists, its file type must equal the source's; the mount point creation
 * code (directory via mkdir_p_label(), socket via mknod(), refusal with an
 * error for unsupported types) presumably runs only when it does not exist.
 * The source is then bind mounted, and when flags is non-zero a second
 * mount() call remounts the bind mount with those flags added. main() treats
 * a negative return value as failure. */
408 static int mount_binds(const char *dest, char **l, unsigned long flags) {
411 STRV_FOREACH_PAIR(x, y, l) {
412 _cleanup_free_ char *where = NULL;
413 struct stat source_st, dest_st;
415 if (stat(*x, &source_st) < 0) {
416 log_error("failed to stat %s: %m", *x);
420 where = strjoin(dest, "/", *y, NULL);
424 if (stat(where, &dest_st) == 0) {
425 if ((source_st.st_mode & S_IFMT) != (dest_st.st_mode & S_IFMT)) {
426 log_error("The file types of %s and %s do not match. Refusing bind mount",
431 /* Create the mount point, but be conservative -- refuse to create block
432 * and char devices. */
433 if (S_ISDIR(source_st.st_mode))
434 mkdir_p_label(where, 0755);
435 else if (S_ISFIFO(source_st.st_mode))
437 else if (S_ISSOCK(source_st.st_mode))
438 mknod(where, 0644 | S_IFSOCK, 0);
439 else if (S_ISREG(source_st.st_mode))
442 log_error("Refusing to create mountpoint for file: %s", *x);
447 if (mount(*x, where, "bind", MS_BIND, NULL) < 0) {
448 log_error("mount(%s) failed: %m", where);
452 if (flags && mount(NULL, where, NULL, MS_REMOUNT|MS_BIND|flags, NULL) < 0) {
453 log_error("mount(%s) failed: %m", where);
/* Tries to make the container's /etc/localtime name the same zone as the
 * host's.
 *
 * dest: path of the container root.
 *
 * The host's /etc/localtime is read as a symlink and must point into
 * /usr/share/zoneinfo/ (relative "../usr/share/zoneinfo/" or absolute form);
 * otherwise a warning is logged. The container's existing link is read the
 * same way and compared against the host's zone name. The zone file must
 * exist below dest, which is checked with access(); then a relative symlink
 * "../usr/share/zoneinfo/<zone>" is created at <dest>/etc/localtime. */
461 static int setup_timezone(const char *dest) {
462 _cleanup_free_ char *where = NULL, *p = NULL, *q = NULL, *check = NULL, *what = NULL;
468 /* Fix the timezone, if possible */
469 r = readlink_malloc("/etc/localtime", &p);
471 log_warning("/etc/localtime is not a symlink, not updating container timezone.");
475 z = path_startswith(p, "../usr/share/zoneinfo/");
477 z = path_startswith(p, "/usr/share/zoneinfo/");
479 log_warning("/etc/localtime does not point into /usr/share/zoneinfo/, not updating container timezone.");
483 where = strappend(dest, "/etc/localtime");
487 r = readlink_malloc(where, &q);
489 y = path_startswith(q, "../usr/share/zoneinfo/");
491 y = path_startswith(q, "/usr/share/zoneinfo/");
494 /* Already pointing to the right place? Then do nothing .. */
495 if (y && streq(y, z))
499 check = strjoin(dest, "/usr/share/zoneinfo/", z, NULL);
503 if (access(check, F_OK) < 0) {
504 log_warning("Timezone %s does not exist in container, not updating container timezone.", z);
508 what = strappend("../usr/share/zoneinfo/", z);
513 if (symlink(what, where) < 0) {
514 log_error("Failed to correct timezone of container: %m");
/* Copies the host's /etc/resolv.conf to <dest>/etc/resolv.conf on a
 * best-effort basis: the result of copy_file() is deliberately ignored.
 *
 * dest: path of the container root.
 *
 * arg_private_network is tested before any work is done; presumably the copy
 * is skipped when the container has no network -- confirm against the branch
 * body. */
521 static int setup_resolv_conf(const char *dest) {
522 char _cleanup_free_ *where = NULL;
526 if (arg_private_network)
529 /* Fix resolv.conf, if possible */
530 where = strappend(dest, "/etc/resolv.conf");
534 /* We don't really care for the results of this really. If it
535 * fails, it fails, but meh... */
536 copy_file("/etc/resolv.conf", where, O_TRUNC|O_NOFOLLOW);
/* Gives the container its own boot ID.
 *
 * dest: path of the container root.
 *
 * A random 128 bit ID is obtained from sd_id128_randomize(), formatted in
 * the dashed 8-4-4-4-12 hex layout, written to the file
 * <dest>/dev/proc-sys-kernel-random-boot-id and that file is bind mounted
 * over <dest>/proc/sys/kernel/random/boot_id. A second mount() call then
 * tries to make the bind mount read-only; if that fails only a warning is
 * logged. */
541 static int setup_boot_id(const char *dest) {
542 _cleanup_free_ char *from = NULL, *to = NULL;
549 /* Generate a new randomized boot ID, so that each boot-up of
550 * the container gets a new one */
552 from = strappend(dest, "/dev/proc-sys-kernel-random-boot-id");
553 to = strappend(dest, "/proc/sys/kernel/random/boot_id");
557 r = sd_id128_randomize(&rnd);
559 log_error("Failed to generate random boot id: %s", strerror(-r));
563 snprintf(as_uuid, sizeof(as_uuid),
564 "%02x%02x%02x%02x-%02x%02x-%02x%02x-%02x%02x-%02x%02x%02x%02x%02x%02x",
565 SD_ID128_FORMAT_VAL(rnd));
566 char_array_0(as_uuid);
568 r = write_string_file(from, as_uuid);
570 log_error("Failed to write boot id: %s", strerror(-r));
574 if (mount(from, to, "bind", MS_BIND, NULL) < 0) {
575 log_error("Failed to bind mount boot id: %m");
577 } else if (mount(from, to, "bind", MS_BIND|MS_REMOUNT|MS_RDONLY, NULL))
578 log_warning("Failed to make boot id read-only: %m");
/* Recreates a fixed set of host device nodes inside the container's /dev.
 *
 * dest: path of the container root.
 *
 * For every name in the NUL-separated devnodes list the host node /dev/<name>
 * is stat()ed. A stat() failure is logged unless errno is ENOENT. The node
 * must be a character or block device and is recreated at <dest>/dev/<name>
 * with the same st_mode and st_rdev via mknod().
 *
 * NOTE(review): the mknod() failure message prints dest (the container
 * root) although the path passed to mknod() is "to"; logging "to" would be
 * more useful -- confirm and adjust. */
584 static int copy_devnodes(const char *dest) {
586 static const char devnodes[] =
596 _cleanup_umask_ mode_t u;
602 NULSTR_FOREACH(d, devnodes) {
604 _cleanup_free_ char *from = NULL, *to = NULL;
606 asprintf(&from, "/dev/%s", d);
607 asprintf(&to, "%s/dev/%s", dest, d);
618 if (stat(from, &st) < 0) {
620 if (errno != ENOENT) {
621 log_error("Failed to stat %s: %m", from);
626 } else if (!S_ISCHR(st.st_mode) && !S_ISBLK(st.st_mode)) {
628 log_error("%s is not a char or block device, cannot copy", from);
632 } else if (mknod(to, st.st_mode, st.st_rdev) < 0) {
634 log_error("mknod(%s) failed: %m", dest);
/* Creates <dest>/dev/ptmx as a relative symlink to "pts/ptmx", i.e. to the
 * ptmx node of the devpts instance that mount_all() mounts on /dev/pts.
 *
 * dest: path of the container root. */
643 static int setup_ptmx(const char *dest) {
644 _cleanup_free_ char *p = NULL;
646 p = strappend(dest, "/dev/ptmx");
650 if (symlink("pts/ptmx", p) < 0) {
651 log_error("Failed to create /dev/ptmx symlink: %m");
/* Makes the given terminal appear as /dev/console inside the container.
 *
 * dest:    path of the container root.
 * console: path of the terminal device to expose; main() passes the
 *          ptsname() of the pty master it allocated.
 *
 * The terminal must be a character device. Its access mode and ownership
 * are set through chmod_and_chown(console, 0600, 0, 0). A device node with
 * the same file type bits and st_rdev, but permissions 0600, is created at
 * <dest>/dev/console and the terminal is bind mounted on top of it. */
658 static int setup_dev_console(const char *dest, const char *console) {
660 _cleanup_free_ char *to = NULL;
662 _cleanup_umask_ mode_t u;
669 if (stat(console, &st) < 0) {
670 log_error("Failed to stat %s: %m", console);
673 } else if (!S_ISCHR(st.st_mode)) {
674 log_error("/dev/console is not a char device");
678 r = chmod_and_chown(console, 0600, 0, 0);
680 log_error("Failed to correct access mode for TTY: %s", strerror(-r));
684 if (asprintf(&to, "%s/dev/console", dest) < 0)
687 /* We need to bind mount the right tty to /dev/console since
688 * ptys can only exist on pts file systems. To have something
689 * to bind mount things on we create a device node first, that
690 * has the right major/minor (note that the major minor
691 * doesn't actually matter here, since we mount it over
694 if (mknod(to, (st.st_mode & ~07777) | 0600, st.st_rdev) < 0) {
695 log_error("mknod() for /dev/console failed: %m");
699 if (mount(console, to, "bind", MS_BIND, NULL) < 0) {
700 log_error("Bind mount for /dev/console failed: %m");
/* Provides a FIFO-backed /proc/kmsg for the container.
 *
 * dest:        path of the container root.
 * kmsg_socket: socket over which an open descriptor of the FIFO is sent;
 *              must be >= 0 (asserted). main() passes one end of its
 *              kmsg_socket_pair.
 *
 * A FIFO is created at <dest>/dev/kmsg, its mode and ownership are set with
 * chmod_and_chown(from, 0600, 0, 0), and it is bind mounted on
 * <dest>/proc/kmsg. The FIFO is then opened read/write and non-blocking and
 * the descriptor is sent as an SCM_RIGHTS control message with sendmsg();
 * the local copy is closed right after sending. */
707 static int setup_kmsg(const char *dest, int kmsg_socket) {
708 _cleanup_free_ char *from = NULL, *to = NULL;
710 _cleanup_umask_ mode_t u;
712 struct cmsghdr cmsghdr;
713 uint8_t buf[CMSG_SPACE(sizeof(int))];
716 .msg_control = &control,
717 .msg_controllen = sizeof(control),
719 struct cmsghdr *cmsg;
722 assert(kmsg_socket >= 0);
726 /* We create the kmsg FIFO as /dev/kmsg, but immediately
727 * delete it after bind mounting it to /proc/kmsg. While FIFOs
728 * on the reading side behave very similar to /proc/kmsg,
729 * their writing side behaves differently from /dev/kmsg in
730 * that writing blocks when nothing is reading. In order to
731 * avoid any problems with containers deadlocking due to this
732 * we simply make /dev/kmsg unavailable to the container. */
733 if (asprintf(&from, "%s/dev/kmsg", dest) < 0 ||
734 asprintf(&to, "%s/proc/kmsg", dest) < 0)
737 if (mkfifo(from, 0600) < 0) {
738 log_error("mkfifo() for /dev/kmsg failed: %m");
742 r = chmod_and_chown(from, 0600, 0, 0);
744 log_error("Failed to correct access mode for /dev/kmsg: %s", strerror(-r));
748 if (mount(from, to, "bind", MS_BIND, NULL) < 0) {
749 log_error("Bind mount for /proc/kmsg failed: %m");
753 fd = open(from, O_RDWR|O_NDELAY|O_CLOEXEC);
755 log_error("Failed to open fifo: %m");
/* Build a single SCM_RIGHTS control message carrying fd. */
759 cmsg = CMSG_FIRSTHDR(&mh);
760 cmsg->cmsg_level = SOL_SOCKET;
761 cmsg->cmsg_type = SCM_RIGHTS;
762 cmsg->cmsg_len = CMSG_LEN(sizeof(int));
763 memcpy(CMSG_DATA(cmsg), &fd, sizeof(int));
765 mh.msg_controllen = cmsg->cmsg_len;
767 /* Store away the fd in the socket, so that it stays open as
768 * long as we run the child */
769 k = sendmsg(kmsg_socket, &mh, MSG_DONTWAIT|MSG_NOSIGNAL);
770 close_nointr_nofail(fd);
773 log_error("Failed to send FIFO fd: %m");
777 /* And now make the FIFO unavailable as /dev/kmsg... */
/* Sets the host name of the calling process's UTS namespace to arg_machine
 * via sethostname(). arg_machine must be non-NULL, since strlen() is applied
 * to it. */
782 static int setup_hostname(void) {
784 if (sethostname(arg_machine, strlen(arg_machine)) < 0)
/* Connects the journal directories of host and container according to
 * arg_link_journal.
 *
 * directory: path of the container root.
 *
 * The machine ID is read from <directory>/etc/machine-id and validated with
 * sd_id128_from_string(). From it two paths are built:
 *   p = /var/log/journal/<id>              (on the host)
 *   q = <directory>/var/log/journal/<id>   (inside the container tree)
 * If either one already is a mount point, an error is logged unless the
 * mode is LINK_AUTO. The current state of p is probed with
 * readlink_and_make_absolute(), and the result selects between handling an
 * existing symlink, an existing non-symlink (-EINVAL) and other errors
 * except -ENOENT.
 *
 * For LINK_GUEST a symlink is created at p pointing to q. For LINK_HOST p is
 * created with mkdir_p(), q must be empty according to dir_is_empty() and is
 * created too, and p is bind mounted onto q.
 *
 * NOTE(review): several messages here use %m after helpers that return a
 * negative errno-style code in r (e.g. mkdir_p()); whether errno still
 * holds the matching value at that point should be confirmed. */
790 static int setup_journal(const char *directory) {
791 sd_id128_t machine_id;
792 _cleanup_free_ char *p = NULL, *b = NULL, *q = NULL, *d = NULL;
796 if (arg_link_journal == LINK_NO)
799 p = strappend(directory, "/etc/machine-id");
803 r = read_one_line_file(p, &b);
804 if (r == -ENOENT && arg_link_journal == LINK_AUTO)
807 log_error("Failed to read machine ID from %s: %s", p, strerror(-r));
812 if (isempty(id) && arg_link_journal == LINK_AUTO)
815 /* Verify validity */
816 r = sd_id128_from_string(id, &machine_id);
818 log_error("Failed to parse machine ID from %s: %s", p, strerror(-r));
823 p = strappend("/var/log/journal/", id);
824 q = strjoin(directory, "/var/log/journal/", id, NULL);
828 if (path_is_mount_point(p, false) > 0) {
829 if (arg_link_journal != LINK_AUTO) {
830 log_error("%s: already a mount point, refusing to use for journal", p);
837 if (path_is_mount_point(q, false) > 0) {
838 if (arg_link_journal != LINK_AUTO) {
839 log_error("%s: already a mount point, refusing to use for journal", q);
846 r = readlink_and_make_absolute(p, &d);
848 if ((arg_link_journal == LINK_GUEST ||
849 arg_link_journal == LINK_AUTO) &&
852 r = mkdir_p(q, 0755);
854 log_warning("failed to create directory %s: %m", q);
859 log_error("Failed to remove symlink %s: %m", p);
862 } else if (r == -EINVAL) {
864 if (arg_link_journal == LINK_GUEST &&
867 if (errno == ENOTDIR) {
868 log_error("%s already exists and is neither a symlink nor a directory", p);
871 log_error("Failed to remove %s: %m", p);
875 } else if (r != -ENOENT) {
876 log_error("readlink(%s) failed: %m", p);
880 if (arg_link_journal == LINK_GUEST) {
882 if (symlink(q, p) < 0) {
883 log_error("Failed to symlink %s to %s: %m", q, p);
887 r = mkdir_p(q, 0755);
889 log_warning("failed to create directory %s: %m", q);
893 if (arg_link_journal == LINK_HOST) {
894 r = mkdir_p(p, 0755);
896 log_error("Failed to create %s: %m", p);
900 } else if (access(p, F_OK) < 0)
903 if (dir_is_empty(q) == 0) {
904 log_error("%s not empty.", q);
908 r = mkdir_p(q, 0755);
910 log_error("Failed to create %s: %m", q);
914 if (mount(p, q, "bind", MS_BIND, NULL) < 0) {
915 log_error("Failed to bind mount journal from host into guest: %m");
/* Drops every capability that is not set in arg_retain by passing the
 * complement of the mask to capability_bounding_set_drop(), and returns
 * that helper's result. main() treats a negative value as failure. */
922 static int drop_capabilities(void) {
923 return capability_bounding_set_drop(~arg_retain, false);
/* Announces the container to the org.freedesktop.machine1 service.
 *
 * Opens a connection to the system bus and calls a method on the
 * org.freedesktop.machine1.Manager interface of the object
 * /org/freedesktop/machine1. Among the arguments are the machine UUID
 * (arg_uuid), the root directory (arg_directory, or "" when unset) and,
 * only when arg_slice is non-empty, a "Slice" string property. Failures are
 * logged as errors. */
926 static int register_machine(void) {
927 _cleanup_bus_error_free_ sd_bus_error error = SD_BUS_ERROR_NULL;
928 _cleanup_bus_unref_ sd_bus *bus = NULL;
931 r = sd_bus_open_system(&bus);
933 log_error("Failed to open system bus: %s", strerror(-r));
937 r = sd_bus_call_method(
939 "org.freedesktop.machine1",
940 "/org/freedesktop/machine1",
941 "org.freedesktop.machine1.Manager",
947 SD_BUS_MESSAGE_APPEND_ID128(arg_uuid),
951 strempty(arg_directory),
952 !isempty(arg_slice), "Slice", "s", arg_slice);
954 log_error("Failed to register machine: %s", bus_error_message(&error, r));
/* Asks the org.freedesktop.machine1 service to terminate the machine that
 * belongs to the given process.
 *
 * pid: process ID of the container's child process, as passed by main().
 *
 * Two bus calls are made over the system bus: the first, on the Manager
 * interface, yields a reply from which an object path is read (the error
 * message names it GetMachineByPID()); the second is issued on the
 * org.freedesktop.machine1.Machine interface. Failure of either call is
 * logged at debug level only, since the machine may already be gone; a
 * malformed reply is logged as an error. */
961 static int terminate_machine(pid_t pid) {
962 _cleanup_bus_error_free_ sd_bus_error error = SD_BUS_ERROR_NULL;
963 _cleanup_bus_message_unref_ sd_bus_message *reply = NULL;
964 _cleanup_bus_unref_ sd_bus *bus = NULL;
968 r = sd_bus_open_system(&bus);
970 log_error("Failed to open system bus: %s", strerror(-r));
974 r = sd_bus_call_method(
976 "org.freedesktop.machine1",
977 "/org/freedesktop/machine1",
978 "org.freedesktop.machine1.Manager",
985 /* Note that the machine might already have been
986 * cleaned up automatically, hence don't consider it a
987 * failure if we cannot get the machine object. */
988 log_debug("Failed to get machine: %s", bus_error_message(&error, r));
992 r = sd_bus_message_read(reply, "o", &path);
994 log_error("Failed to parse GetMachineByPID() reply: %s", bus_error_message(&error, r));
998 r = sd_bus_call_method(
1000 "org.freedesktop.machine1",
1002 "org.freedesktop.machine1.Machine",
1008 log_debug("Failed to terminate machine: %s", bus_error_message(&error, r));
/* Probes for the kernel audit subsystem by creating a raw NETLINK_AUDIT
 * netlink socket and closing it again. Presumably reports true when the
 * socket could be created -- confirm against the return statements. main()
 * uses the result, together with arg_boot, to decide whether to print the
 * audit incompatibility warning. */
1015 static bool audit_enabled(void) {
1018 fd = socket(AF_NETLINK, SOCK_RAW, NETLINK_AUDIT);
1020 close_nointr_nofail(fd);
/* Program entry point.
 *
 * Setup: parses the command line, resolves arg_directory (falling back to
 * the current directory) and derives arg_machine from its last path
 * component when no name was given. It then checks the preconditions: root
 * privileges, a systemd-booted host, a directory that is not "/" and that
 * path_is_os_tree() accepts. Passed-in listen file descriptors are
 * collected, a pseudo terminal master is opened and a socket pair for the
 * kmsg FIFO descriptor is created.
 *
 * Container process: created with a raw clone() call requesting new IPC,
 * mount, PID and UTS namespaces, plus a network namespace when
 * arg_private_network is set. The code up to _exit() attaches the pty slave
 * as stdin/stdout/stderr, registers the machine, and marks all mounts as
 * slave. It populates the tree under arg_directory through the setup_*()
 * and mount_*() helpers, moves the tree onto "/", chroot()s, drops
 * capabilities and switches credentials. It builds the environment and
 * finally executes an init binary, the command from the command line, or
 * /bin/bash.
 *
 * Supervising side: runs process_pty() on the master, calls
 * terminate_machine(), waits for the child with wait_for_terminate() and
 * logs how the container ended based on si_code/si_status.
 *
 * NOTE(review): the error messages after setresgid()/setresuid() name
 * setregid()/setreuid(); consider aligning them with the calls made. */
1026 int main(int argc, char *argv[]) {
1028 int r = EXIT_FAILURE, k;
1029 _cleanup_close_ int master = -1;
1031 const char *console = NULL;
1033 _cleanup_close_pipe_ int kmsg_socket_pair[2] = { -1, -1 };
1034 _cleanup_fdset_free_ FDSet *fds = NULL;
1036 log_parse_environment();
1039 k = parse_argv(argc, argv);
1047 if (arg_directory) {
1050 p = path_make_absolute_cwd(arg_directory);
1051 free(arg_directory);
1054 arg_directory = get_current_dir_name();
1056 if (!arg_directory) {
1057 log_error("Failed to determine path, please use -D.");
1061 path_kill_slashes(arg_directory);
1064 arg_machine = strdup(path_get_file_name(arg_directory));
1070 hostname_cleanup(arg_machine, false);
1071 if (isempty(arg_machine)) {
1072 log_error("Failed to determine machine name automatically, please use -M.");
1077 if (geteuid() != 0) {
1078 log_error("Need to be root.");
1082 if (sd_booted() <= 0) {
1083 log_error("Not running on a systemd system.");
1087 if (arg_boot && audit_enabled()) {
1088 log_warning("The kernel auditing subsystem is known to be incompatible with containers.\n"
1089 "Please make sure to turn off auditing with 'audit=0' on the kernel command\n"
1090 "line before using systemd-nspawn. Sleeping for 5s...\n");
1094 if (path_equal(arg_directory, "/")) {
1095 log_error("Spawning container on root directory not supported.");
1099 if (path_is_os_tree(arg_directory) <= 0) {
1100 log_error("Directory %s doesn't look like an OS root directory (/etc/os-release is missing). Refusing.", arg_directory);
/* Collect any listen file descriptors that were passed to us so that they
 * can be handed on to the container payload further down. */
1105 n_fd_passed = sd_listen_fds(false);
1106 if (n_fd_passed > 0) {
1107 k = fdset_new_listen_fds(&fds, false);
1109 log_error("Failed to collect file descriptors: %s", strerror(-k));
1113 fdset_close_others(fds);
1116 master = posix_openpt(O_RDWR|O_NOCTTY|O_CLOEXEC|O_NDELAY);
1118 log_error("Failed to acquire pseudo tty: %m");
1122 console = ptsname(master);
1124 log_error("Failed to determine tty name: %m");
1128 log_info("Spawning container %s on %s. Press ^] three times within 1s to abort execution.", arg_machine, arg_directory);
1130 if (unlockpt(master) < 0) {
1131 log_error("Failed to unlock tty: %m");
1135 if (socketpair(AF_UNIX, SOCK_DGRAM|SOCK_NONBLOCK|SOCK_CLOEXEC, 0, kmsg_socket_pair) < 0) {
1136 log_error("Failed to create kmsg socket pair.");
1140 sd_notify(0, "READY=1");
1142 assert_se(sigemptyset(&mask) == 0);
1143 sigset_add_many(&mask, SIGCHLD, SIGWINCH, SIGTERM, SIGINT, -1);
1144 assert_se(sigprocmask(SIG_BLOCK, &mask, NULL) == 0);
/* Raw clone() syscall: the new process gets its own IPC, mount, PID and UTS
 * namespaces, and additionally a network namespace when
 * arg_private_network is set. */
1149 pid = syscall(__NR_clone, SIGCHLD|CLONE_NEWIPC|CLONE_NEWNS|CLONE_NEWPID|CLONE_NEWUTS|(arg_private_network ? CLONE_NEWNET : 0), NULL);
1151 if (errno == EINVAL)
1152 log_error("clone() failed, do you have namespace support enabled in your kernel? (You need UTS, IPC, PID and NET namespacing built in): %m");
1154 log_error("clone() failed: %m");
1161 const char *home = NULL;
1162 uid_t uid = (uid_t) -1;
1163 gid_t gid = (gid_t) -1;
1165 const char *envp[] = {
1166 "PATH=/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin",
1167 "container=systemd-nspawn", /* LXC sets container=lxc, so follow the scheme here */
1172 NULL, /* container_uuid */
1173 NULL, /* LISTEN_FDS */
1174 NULL, /* LISTEN_PID */
1178 envp[n_env] = strv_find_prefix(environ, "TERM=");
1182 close_nointr_nofail(master);
1185 close_nointr(STDIN_FILENO);
1186 close_nointr(STDOUT_FILENO);
1187 close_nointr(STDERR_FILENO);
1189 close_nointr_nofail(kmsg_socket_pair[0]);
1190 kmsg_socket_pair[0] = -1;
1192 reset_all_signal_handlers();
1194 assert_se(sigemptyset(&mask) == 0);
1195 assert_se(sigprocmask(SIG_SETMASK, &mask, NULL) == 0);
1197 k = open_terminal(console, O_RDWR);
1198 if (k != STDIN_FILENO) {
1200 close_nointr_nofail(k);
1204 log_error("Failed to open console: %s", strerror(-k));
1208 if (dup2(STDIN_FILENO, STDOUT_FILENO) != STDOUT_FILENO ||
1209 dup2(STDIN_FILENO, STDERR_FILENO) != STDERR_FILENO) {
1210 log_error("Failed to duplicate console: %m");
1215 log_error("setsid() failed: %m");
1219 if (prctl(PR_SET_PDEATHSIG, SIGKILL) < 0) {
1220 log_error("PR_SET_PDEATHSIG failed: %m");
1224 r = register_machine();
1228 /* Mark everything as slave, so that we still
1229 * receive mounts from the real root, but don't
1230 * propagate mounts to the real root. */
1231 if (mount(NULL, "/", NULL, MS_SLAVE|MS_REC, NULL) < 0) {
1232 log_error("MS_SLAVE|MS_REC failed: %m");
1236 /* Turn directory into bind mount */
1237 if (mount(arg_directory, arg_directory, "bind", MS_BIND|MS_REC, NULL) < 0) {
1238 log_error("Failed to make bind mount.");
1243 if (mount(arg_directory, arg_directory, "bind", MS_BIND|MS_REMOUNT|MS_RDONLY|MS_REC, NULL) < 0) {
1244 log_error("Failed to make read-only.");
1248 if (mount_all(arg_directory) < 0)
1251 if (copy_devnodes(arg_directory) < 0)
1254 if (setup_ptmx(arg_directory) < 0)
1257 dev_setup(arg_directory);
1259 if (setup_dev_console(arg_directory, console) < 0)
1262 if (setup_kmsg(arg_directory, kmsg_socket_pair[1]) < 0)
1265 close_nointr_nofail(kmsg_socket_pair[1]);
1266 kmsg_socket_pair[1] = -1;
1268 if (setup_boot_id(arg_directory) < 0)
1271 if (setup_timezone(arg_directory) < 0)
1274 if (setup_resolv_conf(arg_directory) < 0)
1277 if (setup_journal(arg_directory) < 0)
1280 if (mount_binds(arg_directory, arg_bind, 0) < 0)
1283 if (mount_binds(arg_directory, arg_bind_ro, MS_RDONLY) < 0)
1286 if (chdir(arg_directory) < 0) {
1287 log_error("chdir(%s) failed: %m", arg_directory);
1291 if (mount(arg_directory, "/", NULL, MS_MOVE, NULL) < 0) {
1292 log_error("mount(MS_MOVE) failed: %m");
1296 if (chroot(".") < 0) {
1297 log_error("chroot() failed: %m");
1301 if (chdir("/") < 0) {
1302 log_error("chdir() failed: %m");
1310 if (drop_capabilities() < 0) {
1311 log_error("drop_capabilities() failed: %m");
1317 /* Note that this resolves user names
1318 * inside the container, and hence
1319 * accesses the NSS modules from the
1320 * container and not the host. This is
1323 if (get_user_creds((const char**)&arg_user, &uid, &gid, &home, NULL) < 0) {
1324 log_error("get_user_creds() failed: %m");
1328 if (mkdir_parents_label(home, 0775) < 0) {
1329 log_error("mkdir_parents_label() failed: %m");
1333 if (mkdir_safe_label(home, 0775, uid, gid) < 0) {
1334 log_error("mkdir_safe_label() failed: %m");
1338 if (initgroups((const char*)arg_user, gid) < 0) {
1339 log_error("initgroups() failed: %m");
1343 if (setresgid(gid, gid, gid) < 0) {
1344 log_error("setregid() failed: %m");
1348 if (setresuid(uid, uid, uid) < 0) {
1349 log_error("setreuid() failed: %m");
1353 /* Reset everything fully to 0, just in case */
1355 if (setgroups(0, NULL) < 0) {
1356 log_error("setgroups() failed: %m");
1360 if (setresgid(0, 0, 0) < 0) {
1361 log_error("setregid() failed: %m");
1365 if (setresuid(0, 0, 0) < 0) {
1366 log_error("setreuid() failed: %m");
1371 if ((asprintf((char**)(envp + n_env++), "HOME=%s", home ? home: "/root") < 0) ||
1372 (asprintf((char**)(envp + n_env++), "USER=%s", arg_user ? arg_user : "root") < 0) ||
1373 (asprintf((char**)(envp + n_env++), "LOGNAME=%s", arg_user ? arg_user : "root") < 0)) {
1378 if (!sd_id128_equal(arg_uuid, SD_ID128_NULL)) {
1379 if (asprintf((char**)(envp + n_env++), "container_uuid=" SD_ID128_FORMAT_STR, SD_ID128_FORMAT_VAL(arg_uuid)) < 0) {
/* Hand the collected descriptors to the payload: clear their close-on-exec
 * flag and announce them through LISTEN_FDS and LISTEN_PID. */
1385 if (fdset_size(fds) > 0) {
1386 k = fdset_cloexec(fds, false);
1388 log_error("Failed to unset O_CLOEXEC for file descriptors.");
1392 if ((asprintf((char **)(envp + n_env++), "LISTEN_FDS=%u", n_fd_passed) < 0) ||
1393 (asprintf((char **)(envp + n_env++), "LISTEN_PID=1") < 0)) {
1405 /* Automatically search for the init system */
1407 l = 1 + argc - optind;
1408 a = newa(char*, l + 1);
1409 memcpy(a + 1, argv + optind, l * sizeof(char*));
1411 a[0] = (char*) "/usr/lib/systemd/systemd";
1412 execve(a[0], a, (char**) envp);
1414 a[0] = (char*) "/lib/systemd/systemd";
1415 execve(a[0], a, (char**) envp);
1417 a[0] = (char*) "/sbin/init";
1418 execve(a[0], a, (char**) envp);
1419 } else if (argc > optind)
1420 execvpe(argv[optind], argv + optind, (char**) envp);
1422 chdir(home ? home : "/root");
1423 execle("/bin/bash", "-bash", NULL, (char**) envp);
1426 log_error("execv() failed: %m");
1429 _exit(EXIT_FAILURE);
1435 k = process_pty(master, &mask, arg_boot ? pid : 0, SIGRTMIN+3);
1443 /* Kill if it is not dead yet anyway */
1444 terminate_machine(pid);
1446 /* Redundant, but better safe than sorry */
1449 k = wait_for_terminate(pid, &status);
1457 if (status.si_code == CLD_EXITED) {
1458 r = status.si_status;
1459 if (status.si_status != 0) {
1460 log_error("Container %s failed with error code %i.", arg_machine, status.si_status);
1464 log_debug("Container %s exited successfully.", arg_machine);
1466 } else if (status.si_code == CLD_KILLED &&
1467 status.si_status == SIGINT) {
1468 log_info("Container %s has been shut down.", arg_machine);
1471 } else if (status.si_code == CLD_KILLED &&
1472 status.si_status == SIGHUP) {
1473 log_info("Container %s is being rebooted.", arg_machine);
1475 } else if (status.si_code == CLD_KILLED ||
1476 status.si_code == CLD_DUMPED) {
1478 log_error("Container %s terminated by signal %s.", arg_machine, signal_to_string(status.si_status));
1482 log_error("Container %s failed due to unknown reason.", arg_machine);
1492 free(arg_directory);