1 /*-*- Mode: C; c-basic-offset: 8; indent-tabs-mode: nil -*-*/
4 This file is part of systemd.
6 Copyright 2010 Lennart Poettering
8 systemd is free software; you can redistribute it and/or modify it
9 under the terms of the GNU Lesser General Public License as published by
10 the Free Software Foundation; either version 2.1 of the License, or
11 (at your option) any later version.
13 systemd is distributed in the hope that it will be useful, but
14 WITHOUT ANY WARRANTY; without even the implied warranty of
15 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
16 Lesser General Public License for more details.
18 You should have received a copy of the GNU Lesser General Public License
19 along with systemd; If not, see <http://www.gnu.org/licenses/>.
25 #include <sys/types.h>
26 #include <sys/syscall.h>
27 #include <sys/mount.h>
33 #include <sys/prctl.h>
34 #include <sys/capability.h>
37 #include <sys/signalfd.h>
41 #include <sys/socket.h>
42 #include <linux/netlink.h>
44 #include "sd-daemon.h"
53 #include "cgroup-util.h"
55 #include "path-util.h"
56 #include "loopback-setup.h"
57 #include "dev-setup.h"
62 #include "bus-error.h"
/* Journal link mode chosen with --link-journal=. The enumerators are not
 * visible in this listing; LINK_NO, LINK_AUTO, LINK_GUEST and LINK_HOST are
 * the values used further down in this file. */
69 typedef enum LinkJournal {
/* Settings filled in from the command line by parse_argv(). */
76 static char *arg_directory = NULL;
77 static char *arg_user = NULL;
78 static sd_id128_t arg_uuid = {};
79 static char *arg_machine = NULL;
80 static const char *arg_slice = NULL;
81 static bool arg_private_network = false;
82 static bool arg_read_only = false;
83 static bool arg_boot = false;
84 static LinkJournal arg_link_journal = LINK_AUTO;
/* Bit mask of capabilities the container keeps: drop_capabilities() hands the
 * complement of this mask to capability_bounding_set_drop(), and the
 * --capability= handling ORs additional bits in. */
85 static uint64_t arg_retain =
87 (1ULL << CAP_DAC_OVERRIDE) |
88 (1ULL << CAP_DAC_READ_SEARCH) |
89 (1ULL << CAP_FOWNER) |
90 (1ULL << CAP_FSETID) |
91 (1ULL << CAP_IPC_OWNER) |
94 (1ULL << CAP_LINUX_IMMUTABLE) |
95 (1ULL << CAP_NET_BIND_SERVICE) |
96 (1ULL << CAP_NET_BROADCAST) |
97 (1ULL << CAP_NET_RAW) |
98 (1ULL << CAP_SETGID) |
99 (1ULL << CAP_SETFCAP) |
100 (1ULL << CAP_SETPCAP) |
101 (1ULL << CAP_SETUID) |
102 (1ULL << CAP_SYS_ADMIN) |
103 (1ULL << CAP_SYS_CHROOT) |
104 (1ULL << CAP_SYS_NICE) |
105 (1ULL << CAP_SYS_PTRACE) |
106 (1ULL << CAP_SYS_TTY_CONFIG) |
107 (1ULL << CAP_SYS_RESOURCE) |
108 (1ULL << CAP_SYS_BOOT) |
109 (1ULL << CAP_AUDIT_WRITE) |
110 (1ULL << CAP_AUDIT_CONTROL);
/* Flat string lists holding source/destination path pairs. They are appended
 * to by the --bind= and --bind-ro= handling and walked pairwise by
 * mount_binds(). */
111 static char **arg_bind = NULL;
112 static char **arg_bind_ro = NULL;
/* Print the usage summary with printf(); the leading %s is filled from
 * program_invocation_short_name. The return statement is not visible in this
 * listing.
 *
 * NOTE(review): the text below describes -j as equivalent to
 * --link-journal=host, while the assignment that appears to belong to the j
 * option in parse_argv() stores LINK_GUEST. One of the two looks wrong;
 * confirm which mode -j is meant to select. */
114 static int help(void) {
116 printf("%s [OPTIONS...] [PATH] [ARGUMENTS...]\n\n"
117 "Spawn a minimal namespace container for debugging, testing and building.\n\n"
118 " -h --help Show this help\n"
119 " --version Print version string\n"
120 " -D --directory=NAME Root directory for the container\n"
121 " -b --boot Boot up full system (i.e. invoke init)\n"
122 " -u --user=USER Run the command under specified user or uid\n"
123 " --uuid=UUID Set a specific machine UUID for the container\n"
124 " -M --machine=NAME Set the machine name for the container\n"
125 " -S --slice=SLICE Place the container in the specified slice\n"
126 " --private-network Disable network in container\n"
127 " --read-only Mount the root directory read-only\n"
128 " --capability=CAP In addition to the default, retain specified\n"
130 " --link-journal=MODE Link up guest journal, one of no, auto, guest, host\n"
131 " -j Equivalent to --link-journal=host\n"
132 " --bind=PATH[:PATH] Bind mount a file or directory from the host into\n"
134 " --bind-ro=PATH[:PATH] Similar, but creates a read-only bind mount\n",
135 program_invocation_short_name);
/* Parse the command line with getopt_long() and store the results in the
 * arg_* globals. Several case labels, break statements and the return paths
 * are not visible in this listing, so the notes below only describe the
 * statements that are shown. */
140 static int parse_argv(int argc, char *argv[]) {
/* Long option table; options without a short letter map to ARG_* codes. */
153 static const struct option options[] = {
154 { "help", no_argument, NULL, 'h' },
155 { "version", no_argument, NULL, ARG_VERSION },
156 { "directory", required_argument, NULL, 'D' },
157 { "user", required_argument, NULL, 'u' },
158 { "private-network", no_argument, NULL, ARG_PRIVATE_NETWORK },
159 { "boot", no_argument, NULL, 'b' },
160 { "uuid", required_argument, NULL, ARG_UUID },
161 { "read-only", no_argument, NULL, ARG_READ_ONLY },
162 { "capability", required_argument, NULL, ARG_CAPABILITY },
163 { "link-journal", required_argument, NULL, ARG_LINK_JOURNAL },
164 { "bind", required_argument, NULL, ARG_BIND },
165 { "bind-ro", required_argument, NULL, ARG_BIND_RO },
166 { "machine", required_argument, NULL, 'M' },
167 { "slice", required_argument, NULL, 'S' },
/* Process options until getopt_long() reports a negative value. */
176 while ((c = getopt_long(argc, argv, "+hD:u:bM:jS:", options, NULL)) >= 0) {
185 puts(PACKAGE_STRING);
186 puts(SYSTEMD_FEATURES);
/* Root directory: keep the canonicalized form of the argument. */
191 arg_directory = canonicalize_file_name(optarg);
192 if (!arg_directory) {
193 log_error("Failed to canonicalize root directory.");
201 arg_user = strdup(optarg);
207 case ARG_PRIVATE_NETWORK:
208 arg_private_network = true;
216 r = sd_id128_from_string(optarg, &arg_uuid);
218 log_error("Invalid UUID: %s", optarg);
/* NOTE(review): arg_slice is declared const char * but receives a strdup()
 * copy here; confirm the intended ownership. */
224 arg_slice = strdup(optarg);
/* Machine names must pass hostname_is_valid() before being copied. */
231 if (!hostname_is_valid(optarg)) {
232 log_error("Invalid machine name: %s", optarg);
237 arg_machine = strdup(optarg);
244 arg_read_only = true;
247 case ARG_CAPABILITY: {
/* Walk the comma separated capability names and OR each parsed capability
 * bit into arg_retain. */
251 FOREACH_WORD_SEPARATOR(word, length, optarg, ",", state) {
255 t = strndup(word, length);
259 if (cap_from_name(t, &cap) < 0) {
260 log_error("Failed to parse capability %s.", t);
266 arg_retain |= 1ULL << (uint64_t) cap;
/* NOTE(review): presumably the handler of the j short option (its case label
 * is not visible). It selects LINK_GUEST although help() advertises -j as
 * --link-journal=host; confirm which is intended. */
273 arg_link_journal = LINK_GUEST;
276 case ARG_LINK_JOURNAL:
/* Map the textual mode to a LinkJournal value; an unrecognized mode is
 * reported as a parse failure. */
277 if (streq(optarg, "auto"))
278 arg_link_journal = LINK_AUTO;
279 else if (streq(optarg, "no"))
280 arg_link_journal = LINK_NO;
281 else if (streq(optarg, "guest"))
282 arg_link_journal = LINK_GUEST;
283 else if (streq(optarg, "host"))
284 arg_link_journal = LINK_HOST;
286 log_error("Failed to parse link journal mode %s", optarg);
/* --bind= and --bind-ro= share this code; x selects which list is extended. */
294 _cleanup_free_ char *a = NULL, *b = NULL;
298 x = c == ARG_BIND ? &arg_bind : &arg_bind_ro;
/* A colon, when present, separates the source path from the destination. */
300 e = strchr(optarg, ':');
302 a = strndup(optarg, e - optarg);
/* Both paths have to be absolute. */
312 if (!path_is_absolute(a) || !path_is_absolute(b)) {
313 log_error("Invalid bind mount specification: %s", optarg);
/* Append source, then destination, so the list stays organized in pairs. */
317 r = strv_extend(x, a);
321 r = strv_extend(x, b);
332 log_error("Unknown option code %c", c);
/* Set up the file systems listed in mount_table below the container root
 * dest. For every entry the target path is dest + / + the entry where field,
 * and path_is_mount_point() probes whether it is already mounted. Entries
 * that have a source (what) and are already mounted are skipped; otherwise
 * the directory is created with mkdir_p() and mount() is called. A failing
 * mount() is only logged for entries whose fatal flag is set. The return
 * paths are not visible in this listing. */
340 static int mount_all(const char *dest) {
342 typedef struct MountPoint {
/* Columns appear to be: what, where, type, options, flags, fatal (the struct
 * members are not visible here) -- TODO confirm. */
351 static const MountPoint mount_table[] = {
352 { "proc", "/proc", "proc", NULL, MS_NOSUID|MS_NOEXEC|MS_NODEV, true },
353 { "/proc/sys", "/proc/sys", NULL, NULL, MS_BIND, true }, /* Bind mount first */
354 { NULL, "/proc/sys", NULL, NULL, MS_BIND|MS_RDONLY|MS_REMOUNT, true }, /* Then, make it r/o */
355 { "sysfs", "/sys", "sysfs", NULL, MS_RDONLY|MS_NOSUID|MS_NOEXEC|MS_NODEV, true },
356 { "tmpfs", "/dev", "tmpfs", "mode=755", MS_NOSUID|MS_STRICTATIME, true },
357 { "devpts", "/dev/pts", "devpts","newinstance,ptmxmode=0666,mode=620,gid=" STRINGIFY(TTY_GID), MS_NOSUID|MS_NOEXEC, true },
358 { "tmpfs", "/dev/shm", "tmpfs", "mode=1777", MS_NOSUID|MS_NODEV|MS_STRICTATIME, true },
359 { "tmpfs", "/run", "tmpfs", "mode=755", MS_NOSUID|MS_NODEV|MS_STRICTATIME, true },
361 { "/sys/fs/selinux", "/sys/fs/selinux", NULL, NULL, MS_BIND, false }, /* Bind mount first */
362 { NULL, "/sys/fs/selinux", NULL, NULL, MS_BIND|MS_RDONLY|MS_REMOUNT, false }, /* Then, make it r/o */
369 for (k = 0; k < ELEMENTSOF(mount_table); k++) {
370 _cleanup_free_ char *where = NULL;
373 where = strjoin(dest, "/", mount_table[k].where, NULL);
377 t = path_is_mount_point(where, true);
379 log_error("Failed to detect whether %s is a mount point: %s", where, strerror(-t));
/* Entries with a NULL what are the remount steps of the table and are
 * therefore never skipped by this test. */
387 /* Skip this entry if it is not a remount. */
388 if (mount_table[k].what && t > 0)
391 mkdir_p(where, 0755);
393 if (mount(mount_table[k].what,
396 mount_table[k].flags,
397 mount_table[k].options) < 0 &&
398 mount_table[k].fatal) {
400 log_error("mount(%s) failed: %m", where);
/* Bind mount every source/destination pair of l into the container root
 * dest. The source is inspected with stat(); the target is dest + / + the
 * destination path. If the target already exists its file type must match
 * the source. The mount point creation shown afterwards presumably belongs
 * to the branch for a missing target (that branch line is not visible).
 * After the bind mount, a non-zero flags value triggers a remount with
 * MS_REMOUNT|MS_BIND|flags. Return paths are not visible in this listing. */
410 static int mount_binds(const char *dest, char **l, unsigned long flags) {
413 STRV_FOREACH_PAIR(x, y, l) {
414 _cleanup_free_ char *where = NULL;
415 struct stat source_st, dest_st;
417 if (stat(*x, &source_st) < 0) {
418 log_error("failed to stat %s: %m", *x);
422 where = strjoin(dest, "/", *y, NULL);
426 if (stat(where, &dest_st) == 0) {
427 if ((source_st.st_mode & S_IFMT) != (dest_st.st_mode & S_IFMT)) {
428 log_error("The file types of %s and %s do not match. Refusing bind mount",
/* NOTE(review): the results of mkdir_p_label() and mknod() below are not
 * checked; a failure would only surface in the later mount() call. */
433 /* Create the mount point, but be conservative -- refuse to create block
434 * and char devices. */
435 if (S_ISDIR(source_st.st_mode))
436 mkdir_p_label(where, 0755);
437 else if (S_ISFIFO(source_st.st_mode))
439 else if (S_ISSOCK(source_st.st_mode))
440 mknod(where, 0644 | S_IFSOCK, 0);
441 else if (S_ISREG(source_st.st_mode))
444 log_error("Refusing to create mountpoint for file: %s", *x);
449 if (mount(*x, where, "bind", MS_BIND, NULL) < 0) {
450 log_error("mount(%s) failed: %m", where);
/* Second pass applies the extra flags (MS_RDONLY for --bind-ro= callers). */
454 if (flags && mount(NULL, where, NULL, MS_REMOUNT|MS_BIND|flags, NULL) < 0) {
455 log_error("mount(%s) failed: %m", where);
/* Make /etc/localtime below dest point to the same zone as on the host. The
 * host link is read with readlink_malloc() and must lead into
 * /usr/share/zoneinfo/ (relative or absolute form); z is the zone name that
 * follows that prefix. The link inside the container is read the same way
 * into y, and nothing more is needed when both zone names are equal.
 * Otherwise the zone file must exist below dest before the relative symlink
 * is created. The conditions around several of these steps and the return
 * paths are not visible in this listing. */
463 static int setup_timezone(const char *dest) {
464 _cleanup_free_ char *where = NULL, *p = NULL, *q = NULL, *check = NULL, *what = NULL;
470 /* Fix the timezone, if possible */
471 r = readlink_malloc("/etc/localtime", &p);
473 log_warning("/etc/localtime is not a symlink, not updating container timezone.");
477 z = path_startswith(p, "../usr/share/zoneinfo/");
479 z = path_startswith(p, "/usr/share/zoneinfo/");
481 log_warning("/etc/localtime does not point into /usr/share/zoneinfo/, not updating container timezone.");
485 where = strappend(dest, "/etc/localtime");
489 r = readlink_malloc(where, &q);
491 y = path_startswith(q, "../usr/share/zoneinfo/");
493 y = path_startswith(q, "/usr/share/zoneinfo/");
496 /* Already pointing to the right place? Then do nothing .. */
497 if (y && streq(y, z))
/* Only switch to zones that are actually installed in the container. */
501 check = strjoin(dest, "/usr/share/zoneinfo/", z, NULL);
505 if (access(check, F_OK) < 0) {
506 log_warning("Timezone %s does not exist in container, not updating container timezone.", z);
510 what = strappend("../usr/share/zoneinfo/", z);
515 if (symlink(what, where) < 0) {
516 log_error("Failed to correct timezone of container: %m");
/* Copy /etc/resolv.conf of the host to dest + /etc/resolv.conf using
 * copy_file() with O_TRUNC|O_NOFOLLOW; the result of the copy is ignored on
 * purpose. arg_private_network is tested first; the statement it guards is
 * not visible in this listing (presumably an early return).
 *
 * NOTE(review): the attribute placement in the declaration of where differs
 * from the other _cleanup_free_ declarations in this file. */
523 static int setup_resolv_conf(const char *dest) {
524 char _cleanup_free_ *where = NULL;
528 if (arg_private_network)
531 /* Fix resolv.conf, if possible */
532 where = strappend(dest, "/etc/resolv.conf");
536 /* We don't really care for the results of this really. If it
537 * fails, it fails, but meh... */
538 copy_file("/etc/resolv.conf", where, O_TRUNC|O_NOFOLLOW);
/* Give the container its own boot ID. A random 128 bit ID is generated with
 * sd_id128_randomize(), formatted in the dashed UUID layout, written to
 * dest + /dev/proc-sys-kernel-random-boot-id and bind mounted over
 * dest + /proc/sys/kernel/random/boot_id. A second mount() call remounts it
 * read-only; a failure of that step only produces a warning. Return paths
 * are not visible in this listing. */
543 static int setup_boot_id(const char *dest) {
544 _cleanup_free_ char *from = NULL, *to = NULL;
551 /* Generate a new randomized boot ID, so that each boot-up of
552 * the container gets a new one */
554 from = strappend(dest, "/dev/proc-sys-kernel-random-boot-id");
555 to = strappend(dest, "/proc/sys/kernel/random/boot_id");
559 r = sd_id128_randomize(&rnd);
561 log_error("Failed to generate random boot id: %s", strerror(-r));
/* Render the ID as 8-4-4-4-12 hex digits and force NUL termination. */
565 snprintf(as_uuid, sizeof(as_uuid),
566 "%02x%02x%02x%02x-%02x%02x-%02x%02x-%02x%02x-%02x%02x%02x%02x%02x%02x",
567 SD_ID128_FORMAT_VAL(rnd));
568 char_array_0(as_uuid);
570 r = write_string_file(from, as_uuid);
572 log_error("Failed to write boot id: %s", strerror(-r));
576 if (mount(from, to, "bind", MS_BIND, NULL) < 0) {
577 log_error("Failed to bind mount boot id: %m");
579 } else if (mount(from, to, "bind", MS_BIND|MS_REMOUNT|MS_RDONLY, NULL))
580 log_warning("Failed to make boot id read-only: %m");
/* Recreate selected host device nodes below dest + /dev. For each name in
 * the devnodes list (its contents are not visible in this listing) the host
 * node /dev/NAME is inspected with stat(). A stat() failure is only logged
 * when errno is not ENOENT. Anything that is neither a character nor a block
 * device is rejected. The node is then created with mknod() using the mode
 * and device number of the host node. Return paths are not visible here.
 *
 * NOTE(review): the mknod() failure message prints dest rather than the node
 * path held in to; confirm whether that is intended. */
586 static int copy_devnodes(const char *dest) {
588 static const char devnodes[] =
598 _cleanup_umask_ mode_t u;
604 NULSTR_FOREACH(d, devnodes) {
606 _cleanup_free_ char *from = NULL, *to = NULL;
/* The asprintf() results are not checked on these lines; presumably a NULL
 * check of from and to follows (not visible here). */
608 asprintf(&from, "/dev/%s", d);
609 asprintf(&to, "%s/dev/%s", dest, d);
620 if (stat(from, &st) < 0) {
622 if (errno != ENOENT) {
623 log_error("Failed to stat %s: %m", from);
628 } else if (!S_ISCHR(st.st_mode) && !S_ISBLK(st.st_mode)) {
630 log_error("%s is not a char or block device, cannot copy", from);
634 } else if (mknod(to, st.st_mode, st.st_rdev) < 0) {
636 log_error("mknod(%s) failed: %m", dest);
/* Create dest + /dev/ptmx as a symlink to pts/ptmx and log an error if
 * symlink() fails. Return paths are not visible in this listing. */
645 static int setup_ptmx(const char *dest) {
646 _cleanup_free_ char *p = NULL;
648 p = strappend(dest, "/dev/ptmx");
652 if (symlink("pts/ptmx", p) < 0) {
653 log_error("Failed to create /dev/ptmx symlink: %m");
/* Provide /dev/console inside the container, backed by the tty named by
 * console. The tty must be a character device; chmod_and_chown() sets it to
 * mode 0600 with owner and group 0. A device node with the same device
 * number and mode bits 0600 is created at dest + /dev/console, and the tty
 * is then bind mounted on top of that node. Each failing step is logged;
 * the return paths are not visible in this listing. */
660 static int setup_dev_console(const char *dest, const char *console) {
662 _cleanup_free_ char *to = NULL;
664 _cleanup_umask_ mode_t u;
671 if (stat(console, &st) < 0) {
672 log_error("Failed to stat %s: %m", console);
675 } else if (!S_ISCHR(st.st_mode)) {
676 log_error("/dev/console is not a char device");
680 r = chmod_and_chown(console, 0600, 0, 0);
682 log_error("Failed to correct access mode for TTY: %s", strerror(-r));
686 if (asprintf(&to, "%s/dev/console", dest) < 0)
689 /* We need to bind mount the right tty to /dev/console since
690 * ptys can only exist on pts file systems. To have something
691 * to bind mount things on we create a device node first, that
692 * has the right major/minor (note that the major minor
693 * doesn't actually matter here, since we mount it over
696 if (mknod(to, (st.st_mode & ~07777) | 0600, st.st_rdev) < 0) {
697 log_error("mknod() for /dev/console failed: %m");
701 if (mount(console, to, "bind", MS_BIND, NULL) < 0) {
702 log_error("Bind mount for /dev/console failed: %m");
/* Provide /proc/kmsg inside the container as a FIFO. The FIFO is created at
 * dest + /dev/kmsg with mode 0600, bind mounted onto dest + /proc/kmsg and
 * opened. The open descriptor is then sent over kmsg_socket as an SCM_RIGHTS
 * control message so that it stays referenced, and the local copy is closed.
 * The step that makes the FIFO unavailable as /dev/kmsg and the return paths
 * are not visible in this listing. */
709 static int setup_kmsg(const char *dest, int kmsg_socket) {
710 _cleanup_free_ char *from = NULL, *to = NULL;
712 _cleanup_umask_ mode_t u;
714 struct cmsghdr cmsghdr;
715 uint8_t buf[CMSG_SPACE(sizeof(int))];
718 .msg_control = &control,
719 .msg_controllen = sizeof(control),
721 struct cmsghdr *cmsg;
724 assert(kmsg_socket >= 0);
728 /* We create the kmsg FIFO as /dev/kmsg, but immediately
729 * delete it after bind mounting it to /proc/kmsg. While FIFOs
730 * on the reading side behave very similar to /proc/kmsg,
731 * their writing side behaves differently from /dev/kmsg in
732 * that writing blocks when nothing is reading. In order to
733 * avoid any problems with containers deadlocking due to this
734 * we simply make /dev/kmsg unavailable to the container. */
735 if (asprintf(&from, "%s/dev/kmsg", dest) < 0 ||
736 asprintf(&to, "%s/proc/kmsg", dest) < 0)
739 if (mkfifo(from, 0600) < 0) {
740 log_error("mkfifo() for /dev/kmsg failed: %m");
744 r = chmod_and_chown(from, 0600, 0, 0);
746 log_error("Failed to correct access mode for /dev/kmsg: %s", strerror(-r));
750 if (mount(from, to, "bind", MS_BIND, NULL) < 0) {
751 log_error("Bind mount for /proc/kmsg failed: %m");
/* Open both directions without blocking so the FIFO always has a reader and
 * a writer reference while the descriptor is alive. */
755 fd = open(from, O_RDWR|O_NDELAY|O_CLOEXEC);
757 log_error("Failed to open fifo: %m");
/* Build a single SCM_RIGHTS control message that carries fd. */
761 cmsg = CMSG_FIRSTHDR(&mh);
762 cmsg->cmsg_level = SOL_SOCKET;
763 cmsg->cmsg_type = SCM_RIGHTS;
764 cmsg->cmsg_len = CMSG_LEN(sizeof(int));
765 memcpy(CMSG_DATA(cmsg), &fd, sizeof(int));
767 mh.msg_controllen = cmsg->cmsg_len;
769 /* Store away the fd in the socket, so that it stays open as
770 * long as we run the child */
771 k = sendmsg(kmsg_socket, &mh, MSG_DONTWAIT|MSG_NOSIGNAL);
772 close_nointr_nofail(fd);
775 log_error("Failed to send FIFO fd: %m");
779 /* And now make the FIFO unavailable as /dev/kmsg... */
/* Set the host name of the calling process context to arg_machine via
 * sethostname(). What happens on failure and the return value are not
 * visible in this listing. */
784 static int setup_hostname(void) {
786 if (sethostname(arg_machine, strlen(arg_machine)) < 0)
/* Connect the journal directory of the container with the host according to
 * arg_link_journal; LINK_NO is tested first. The machine ID is read from
 * directory + /etc/machine-id and validated with sd_id128_from_string().
 * Afterwards p names /var/log/journal/ID on the host and q names the same
 * path below directory. Existing mount points at p or q are refused unless
 * the mode is LINK_AUTO. Depending on the mode, either a symlink from p to q
 * is created (LINK_GUEST) or p is bind mounted onto q. Many conditions and
 * all return paths are not visible in this listing, so the branch structure
 * described here should be confirmed against the full file.
 *
 * NOTE(review): several messages after mkdir_p() use %m although the result
 * is stored in r; confirm that errno is meaningful at those points. */
792 static int setup_journal(const char *directory) {
793 sd_id128_t machine_id;
794 _cleanup_free_ char *p = NULL, *b = NULL, *q = NULL, *d = NULL;
798 if (arg_link_journal == LINK_NO)
801 p = strappend(directory, "/etc/machine-id");
805 r = read_one_line_file(p, &b);
/* In LINK_AUTO mode a missing or empty machine ID is tolerated. */
806 if (r == -ENOENT && arg_link_journal == LINK_AUTO)
809 log_error("Failed to read machine ID from %s: %s", p, strerror(-r));
814 if (isempty(id) && arg_link_journal == LINK_AUTO)
817 /* Verify validity */
818 r = sd_id128_from_string(id, &machine_id);
820 log_error("Failed to parse machine ID from %s: %s", p, strerror(-r));
/* From here on p is the host side journal path and q the guest side one. */
825 p = strappend("/var/log/journal/", id);
826 q = strjoin(directory, "/var/log/journal/", id, NULL);
830 if (path_is_mount_point(p, false) > 0) {
831 if (arg_link_journal != LINK_AUTO) {
832 log_error("%s: already a mount point, refusing to use for journal", p);
839 if (path_is_mount_point(q, false) > 0) {
840 if (arg_link_journal != LINK_AUTO) {
841 log_error("%s: already a mount point, refusing to use for journal", q);
/* Inspect what currently exists at the host path: a symlink, something that
 * is not a symlink (-EINVAL), or nothing (-ENOENT). */
848 r = readlink_and_make_absolute(p, &d);
850 if ((arg_link_journal == LINK_GUEST ||
851 arg_link_journal == LINK_AUTO) &&
854 r = mkdir_p(q, 0755);
856 log_warning("failed to create directory %s: %m", q);
861 log_error("Failed to remove symlink %s: %m", p);
864 } else if (r == -EINVAL) {
866 if (arg_link_journal == LINK_GUEST &&
869 if (errno == ENOTDIR) {
870 log_error("%s already exists and is neither a symlink nor a directory", p);
873 log_error("Failed to remove %s: %m", p);
877 } else if (r != -ENOENT) {
878 log_error("readlink(%s) failed: %m", p);
/* LINK_GUEST: the host path becomes a symlink to the guest directory. */
882 if (arg_link_journal == LINK_GUEST) {
884 if (symlink(q, p) < 0) {
885 log_error("Failed to symlink %s to %s: %m", q, p);
889 r = mkdir_p(q, 0755);
891 log_warning("failed to create directory %s: %m", q);
/* LINK_HOST creates the host directory; the other branch only checks
 * whether the host path exists. */
895 if (arg_link_journal == LINK_HOST) {
896 r = mkdir_p(p, 0755);
898 log_error("Failed to create %s: %m", p);
902 } else if (access(p, F_OK) < 0)
/* The guest directory must be empty before the host journal is mounted on
 * top of it. */
905 if (dir_is_empty(q) == 0) {
906 log_error("%s not empty.", q);
910 r = mkdir_p(q, 0755);
912 log_error("Failed to create %s: %m", q);
916 if (mount(p, q, "bind", MS_BIND, NULL) < 0) {
917 log_error("Failed to bind mount journal from host into guest: %m");
/* Drop every capability that is not set in arg_retain by passing the
 * complement of the mask to capability_bounding_set_drop(), and return the
 * result of that call. */
924 static int drop_capabilities(void) {
925 return capability_bounding_set_drop(~arg_retain, false);
/* Announce the container on the system bus. A connection is opened with
 * sd_bus_open_system() and a method of the org.freedesktop.machine1.Manager
 * interface is called on /org/freedesktop/machine1 (the method name and part
 * of the arguments are not visible in this listing). The visible arguments
 * include arg_uuid, arg_directory and, when arg_slice is not empty, a Slice
 * string property. Failures are logged; return paths are not visible. */
928 static int register_machine(void) {
929 _cleanup_bus_error_free_ sd_bus_error error = SD_BUS_ERROR_NULL;
930 _cleanup_bus_unref_ sd_bus *bus = NULL;
933 r = sd_bus_open_system(&bus);
935 log_error("Failed to open system bus: %s", strerror(-r));
939 r = sd_bus_call_method(
941 "org.freedesktop.machine1",
942 "/org/freedesktop/machine1",
943 "org.freedesktop.machine1.Manager",
949 SD_BUS_MESSAGE_APPEND_ID128(arg_uuid),
953 strempty(arg_directory),
954 !isempty(arg_slice), "Slice", "s", arg_slice);
956 log_error("Failed to register machine: %s", bus_error_message(&error, r));
/* Ask the machine manager on the system bus to terminate the machine that
 * belongs to pid. A first call on the org.freedesktop.machine1.Manager
 * interface yields a reply from which an object path is read; judging by the
 * error message this is GetMachineByPID(), though the method name itself is
 * not visible in this listing. A second call on the
 * org.freedesktop.machine1.Machine interface follows (method name and target
 * path not visible). Failures of either call are only logged at debug level.
 * Return paths are not visible. */
963 static int terminate_machine(pid_t pid) {
964 _cleanup_bus_error_free_ sd_bus_error error = SD_BUS_ERROR_NULL;
965 _cleanup_bus_message_unref_ sd_bus_message *reply = NULL;
966 _cleanup_bus_unref_ sd_bus *bus = NULL;
970 r = sd_bus_open_system(&bus);
972 log_error("Failed to open system bus: %s", strerror(-r));
976 r = sd_bus_call_method(
978 "org.freedesktop.machine1",
979 "/org/freedesktop/machine1",
980 "org.freedesktop.machine1.Manager",
987 /* Note that the machine might already have been
988 * cleaned up automatically, hence don't consider it a
989 * failure if we cannot get the machine object. */
990 log_debug("Failed to get machine: %s", bus_error_message(&error, r));
/* The reply carries a single object path. */
994 r = sd_bus_message_read(reply, "o", &path);
996 log_error("Failed to parse GetMachineByPID() reply: %s", bus_error_message(&error, r));
1000 r = sd_bus_call_method(
1002 "org.freedesktop.machine1",
1004 "org.freedesktop.machine1.Machine",
1010 log_debug("Failed to terminate machine: %s", bus_error_message(&error, r));
/* Probe for the kernel audit subsystem by opening a raw NETLINK_AUDIT
 * netlink socket, which is closed again right away. The return statements
 * are not visible in this listing; presumably the result reflects whether
 * the socket could be opened -- TODO confirm. */
1017 static bool audit_enabled(void) {
1020 fd = socket(AF_NETLINK, SOCK_RAW, NETLINK_AUDIT);
1022 close_nointr_nofail(fd);
/* Entry point. Parses the options, validates the container directory and
 * the environment, allocates a pseudo terminal and then clones a child into
 * new namespaces. The child prepares the container root and executes the
 * payload. The remaining statements hand the pty master to process_pty(),
 * terminate the machine and turn the wait status of the child into the exit
 * code. Many lines (branch conditions, labels, gotos, loop headers) are not
 * visible in this listing; the notes below only describe what is shown. */
1028 int main(int argc, char *argv[]) {
1030 int r = EXIT_FAILURE, k;
1031 _cleanup_close_ int master = -1;
1033 const char *console = NULL;
1035 _cleanup_close_pipe_ int kmsg_socket_pair[2] = { -1, -1 };
1036 _cleanup_fdset_free_ FDSet *fds = NULL;
1038 log_parse_environment();
1041 k = parse_argv(argc, argv);
/* Make arg_directory absolute; the current working directory is used as a
 * fallback, presumably when no directory was given (the else line is not
 * visible). */
1049 if (arg_directory) {
1052 p = path_make_absolute_cwd(arg_directory);
1053 free(arg_directory);
1056 arg_directory = get_current_dir_name();
1058 if (!arg_directory) {
1059 log_error("Failed to determine path, please use -D.");
1063 path_kill_slashes(arg_directory);
/* Derive the machine name from the last path component of the directory;
 * presumably only when -M was not given (the condition is not visible). */
1066 arg_machine = strdup(path_get_file_name(arg_directory));
1072 hostname_cleanup(arg_machine, false);
1073 if (isempty(arg_machine)) {
1074 log_error("Failed to determine machine name automatically, please use -M.");
/* Preconditions: effective uid 0, sd_booted() positive, the directory is not
 * the root directory and passes path_is_os_tree(). */
1079 if (geteuid() != 0) {
1080 log_error("Need to be root.");
1084 if (sd_booted() <= 0) {
1085 log_error("Not running on a systemd system.");
1089 if (arg_boot && audit_enabled()) {
1090 log_warning("The kernel auditing subsystem is known to be incompatible with containers.\n"
1091 "Please make sure to turn off auditing with 'audit=0' on the kernel command\n"
1092 "line before using systemd-nspawn. Sleeping for 5s...\n");
1096 if (path_equal(arg_directory, "/")) {
1097 log_error("Spawning container on root directory not supported.");
1101 if (path_is_os_tree(arg_directory) <= 0) {
1102 log_error("Directory %s doesn't look like an OS root directory (/etc/os-release is missing). Refusing.", arg_directory);
/* Collect the file descriptors reported by sd_listen_fds() into fds so that
 * they can be handed to the payload (see the LISTEN_FDS handling below). */
1107 n_fd_passed = sd_listen_fds(false);
1108 if (n_fd_passed > 0) {
1109 k = fdset_new_listen_fds(&fds, false);
1111 log_error("Failed to collect file descriptors: %s", strerror(-k));
1115 fdset_close_others(fds);
/* Allocate the pty master; console is the name returned by ptsname(). */
1118 master = posix_openpt(O_RDWR|O_NOCTTY|O_CLOEXEC|O_NDELAY);
1120 log_error("Failed to acquire pseudo tty: %m");
1124 console = ptsname(master);
1126 log_error("Failed to determine tty name: %m");
1130 log_info("Spawning container %s on %s. Press ^] three times within 1s to abort execution.", arg_machine, arg_directory);
1132 if (unlockpt(master) < 0) {
1133 log_error("Failed to unlock tty: %m");
/* Datagram socket pair; one end is given to setup_kmsg() to park the FIFO
 * descriptor. */
1137 if (socketpair(AF_UNIX, SOCK_DGRAM|SOCK_NONBLOCK|SOCK_CLOEXEC, 0, kmsg_socket_pair) < 0) {
1138 log_error("Failed to create kmsg socket pair.");
1142 sd_notify(0, "READY=1");
/* Block SIGCHLD, SIGWINCH, SIGTERM and SIGINT; the same mask is later passed
 * to process_pty(). */
1144 assert_se(sigemptyset(&mask) == 0);
1145 sigset_add_many(&mask, SIGCHLD, SIGWINCH, SIGTERM, SIGINT, -1);
1146 assert_se(sigprocmask(SIG_BLOCK, &mask, NULL) == 0);
/* Raw clone with new IPC, mount, PID and UTS namespaces; a network namespace
 * is added only when arg_private_network is set. */
1151 pid = syscall(__NR_clone, SIGCHLD|CLONE_NEWIPC|CLONE_NEWNS|CLONE_NEWPID|CLONE_NEWUTS|(arg_private_network ? CLONE_NEWNET : 0), NULL);
1153 if (errno == EINVAL)
1154 log_error("clone() failed, do you have namespace support enabled in your kernel? (You need UTS, IPC, PID and NET namespacing built in): %m");
1156 log_error("clone() failed: %m");
/* From here on the statements appear to run in the cloned child (the test
 * of pid is not visible in this listing). */
1163 const char *home = NULL;
1164 uid_t uid = (uid_t) -1;
1165 gid_t gid = (gid_t) -1;
/* Environment handed to the payload; the NULL slots are filled in below,
 * indexed by n_env. */
1167 const char *envp[] = {
1168 "PATH=/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin",
1169 "container=systemd-nspawn", /* LXC sets container=lxc, so follow the scheme here */
1174 NULL, /* container_uuid */
1175 NULL, /* LISTEN_FDS */
1176 NULL, /* LISTEN_PID */
1180 envp[n_env] = strv_find_prefix(environ, "TERM=");
/* Close the pty master, the inherited stdio descriptors and the first end
 * of the kmsg socket pair. */
1184 close_nointr_nofail(master);
1187 close_nointr(STDIN_FILENO);
1188 close_nointr(STDOUT_FILENO);
1189 close_nointr(STDERR_FILENO);
1191 close_nointr_nofail(kmsg_socket_pair[0]);
1192 kmsg_socket_pair[0] = -1;
1194 reset_all_signal_handlers();
1196 assert_se(sigemptyset(&mask) == 0);
1197 assert_se(sigprocmask(SIG_SETMASK, &mask, NULL) == 0);
/* Open the console tty; it is expected to land on stdin and is then
 * duplicated onto stdout and stderr. */
1199 k = open_terminal(console, O_RDWR);
1200 if (k != STDIN_FILENO) {
1202 close_nointr_nofail(k);
1206 log_error("Failed to open console: %s", strerror(-k));
1210 if (dup2(STDIN_FILENO, STDOUT_FILENO) != STDOUT_FILENO ||
1211 dup2(STDIN_FILENO, STDERR_FILENO) != STDERR_FILENO) {
1212 log_error("Failed to duplicate console: %m");
1217 log_error("setsid() failed: %m");
/* Request SIGKILL for this process when the parent dies. */
1221 if (prctl(PR_SET_PDEATHSIG, SIGKILL) < 0) {
1222 log_error("PR_SET_PDEATHSIG failed: %m");
1226 r = register_machine();
1230 /* Mark everything as slave, so that we still
1231 * receive mounts from the real root, but don't
1232 * propagate mounts to the real root. */
1233 if (mount(NULL, "/", NULL, MS_SLAVE|MS_REC, NULL) < 0) {
1234 log_error("MS_SLAVE|MS_REC failed: %m");
1238 /* Turn directory into bind mount */
1239 if (mount(arg_directory, arg_directory, "bind", MS_BIND|MS_REC, NULL) < 0) {
1240 log_error("Failed to make bind mount.");
/* Read-only remount; presumably guarded by arg_read_only (the condition is
 * not visible in this listing). */
1245 if (mount(arg_directory, arg_directory, "bind", MS_BIND|MS_REMOUNT|MS_RDONLY|MS_REC, NULL) < 0) {
1246 log_error("Failed to make read-only.");
/* Populate the container root: table mounts, device nodes, ptmx, console,
 * kmsg, boot id, timezone, resolv.conf, journal and the requested bind
 * mounts (read-write first, then read-only). */
1250 if (mount_all(arg_directory) < 0)
1253 if (copy_devnodes(arg_directory) < 0)
1256 if (setup_ptmx(arg_directory) < 0)
1259 dev_setup(arg_directory);
1261 if (setup_dev_console(arg_directory, console) < 0)
1264 if (setup_kmsg(arg_directory, kmsg_socket_pair[1]) < 0)
1267 close_nointr_nofail(kmsg_socket_pair[1]);
1268 kmsg_socket_pair[1] = -1;
1270 if (setup_boot_id(arg_directory) < 0)
1273 if (setup_timezone(arg_directory) < 0)
1276 if (setup_resolv_conf(arg_directory) < 0)
1279 if (setup_journal(arg_directory) < 0)
1282 if (mount_binds(arg_directory, arg_bind, 0) < 0)
1285 if (mount_binds(arg_directory, arg_bind_ro, MS_RDONLY) < 0)
/* Make the prepared tree the new root: move the mount onto the root
 * directory, chroot into it and change to the new root. */
1288 if (chdir(arg_directory) < 0) {
1289 log_error("chdir(%s) failed: %m", arg_directory);
1293 if (mount(arg_directory, "/", NULL, MS_MOVE, NULL) < 0) {
1294 log_error("mount(MS_MOVE) failed: %m");
1298 if (chroot(".") < 0) {
1299 log_error("chroot() failed: %m");
1303 if (chdir("/") < 0) {
1304 log_error("chdir() failed: %m");
1312 if (drop_capabilities() < 0) {
1313 log_error("drop_capabilities() failed: %m");
1319 /* Note that this resolves user names
1320 * inside the container, and hence
1321 * accesses the NSS modules from the
1322 * container and not the host. This is
1325 if (get_user_creds((const char**)&arg_user, &uid, &gid, &home, NULL) < 0) {
1326 log_error("get_user_creds() failed: %m");
1330 if (mkdir_parents_label(home, 0775) < 0) {
1331 log_error("mkdir_parents_label() failed: %m");
1335 if (mkdir_safe_label(home, 0775, uid, gid) < 0) {
1336 log_error("mkdir_safe_label() failed: %m");
1340 if (initgroups((const char*)arg_user, gid) < 0) {
1341 log_error("initgroups() failed: %m");
1345 if (setresgid(gid, gid, gid) < 0) {
1346 log_error("setregid() failed: %m");
1350 if (setresuid(uid, uid, uid) < 0) {
1351 log_error("setreuid() failed: %m");
1355 /* Reset everything fully to 0, just in case */
1357 if (setgroups(0, NULL) < 0) {
1358 log_error("setgroups() failed: %m");
/* NOTE(review): here and in the user switch above the calls are setresgid()
 * and setresuid(), while the messages name setregid() and setreuid(). */
1362 if (setresgid(0, 0, 0) < 0) {
1363 log_error("setregid() failed: %m");
1367 if (setresuid(0, 0, 0) < 0) {
1368 log_error("setreuid() failed: %m");
/* Fill the reserved envp slots: HOME, USER and LOGNAME, then the optional
 * container_uuid. */
1373 if ((asprintf((char**)(envp + n_env++), "HOME=%s", home ? home: "/root") < 0) ||
1374 (asprintf((char**)(envp + n_env++), "USER=%s", arg_user ? arg_user : "root") < 0) ||
1375 (asprintf((char**)(envp + n_env++), "LOGNAME=%s", arg_user ? arg_user : "root") < 0)) {
1380 if (!sd_id128_equal(arg_uuid, SD_ID128_NULL)) {
1381 if (asprintf((char**)(envp + n_env++), "container_uuid=" SD_ID128_FORMAT_STR, SD_ID128_FORMAT_VAL(arg_uuid)) < 0) {
/* Passed file descriptors get O_CLOEXEC cleared and are announced through
 * LISTEN_FDS and LISTEN_PID=1. */
1387 if (fdset_size(fds) > 0) {
1388 k = fdset_cloexec(fds, false);
1390 log_error("Failed to unset O_CLOEXEC for file descriptors.");
1394 if ((asprintf((char **)(envp + n_env++), "LISTEN_FDS=%u", n_fd_passed) < 0) ||
1395 (asprintf((char **)(envp + n_env++), "LISTEN_PID=1") < 0)) {
/* Try the known init binaries in turn; the remaining command line arguments
 * are copied behind slot 0 of the argument vector. */
1407 /* Automatically search for the init system */
1409 l = 1 + argc - optind;
1410 a = newa(char*, l + 1);
1411 memcpy(a + 1, argv + optind, l * sizeof(char*));
1413 a[0] = (char*) "/usr/lib/systemd/systemd";
1414 execve(a[0], a, (char**) envp);
1416 a[0] = (char*) "/lib/systemd/systemd";
1417 execve(a[0], a, (char**) envp);
1419 a[0] = (char*) "/sbin/init";
1420 execve(a[0], a, (char**) envp);
1421 } else if (argc > optind)
1422 execvpe(argv[optind], argv + optind, (char**) envp);
/* Fallback: change to the home directory and run /bin/bash with -bash as its
 * argument zero. */
1424 chdir(home ? home : "/root");
1425 execle("/bin/bash", "-bash", NULL, (char**) envp);
1428 log_error("execv() failed: %m");
1431 _exit(EXIT_FAILURE);
/* The remaining statements appear to run in the parent: forward the pty,
 * terminate the machine and wait for the child. */
1437 k = process_pty(master, &mask, arg_boot ? pid : 0, SIGRTMIN+3);
1445 /* Kill if it is not dead yet anyway */
1446 terminate_machine(pid);
1448 /* Redundant, but better safe than sorry */
1451 k = wait_for_terminate(pid, &status);
/* Translate the wait status into a log message and the exit code r. A child
 * killed by SIGINT is reported as shut down, one killed by SIGHUP as being
 * rebooted. */
1459 if (status.si_code == CLD_EXITED) {
1460 r = status.si_status;
1461 if (status.si_status != 0) {
1462 log_error("Container %s failed with error code %i.", arg_machine, status.si_status);
1466 log_debug("Container %s exited successfully.", arg_machine);
1468 } else if (status.si_code == CLD_KILLED &&
1469 status.si_status == SIGINT) {
1470 log_info("Container %s has been shut down.", arg_machine);
1473 } else if (status.si_code == CLD_KILLED &&
1474 status.si_status == SIGHUP) {
1475 log_info("Container %s is being rebooted.", arg_machine);
1477 } else if (status.si_code == CLD_KILLED ||
1478 status.si_code == CLD_DUMPED) {
1480 log_error("Container %s terminated by signal %s.", arg_machine, signal_to_string(status.si_status));
1484 log_error("Container %s failed due to unknown reason.", arg_machine);
1494 free(arg_directory);