1 /*-*- Mode: C; c-basic-offset: 8; indent-tabs-mode: nil -*-*/
4 This file is part of systemd.
6 Copyright 2010 Lennart Poettering
8 systemd is free software; you can redistribute it and/or modify it
9 under the terms of the GNU Lesser General Public License as published by
10 the Free Software Foundation; either version 2.1 of the License, or
11 (at your option) any later version.
13 systemd is distributed in the hope that it will be useful, but
14 WITHOUT ANY WARRANTY; without even the implied warranty of
15 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
16 Lesser General Public License for more details.
18 You should have received a copy of the GNU Lesser General Public License
19 along with systemd; If not, see <http://www.gnu.org/licenses/>.
25 #include <sys/types.h>
26 #include <sys/syscall.h>
27 #include <sys/mount.h>
33 #include <sys/prctl.h>
34 #include <sys/capability.h>
37 #include <sys/epoll.h>
39 #include <sys/signalfd.h>
43 #include <sys/socket.h>
44 #include <linux/netlink.h>
46 #include <systemd/sd-daemon.h>
47 #include <systemd/sd-bus.h>
55 #include "cgroup-util.h"
57 #include "path-util.h"
58 #include "loopback-setup.h"
60 #include "dev-setup.h"
/* Journal linking mode stored in arg_link_journal. The enumerator list is not
 * visible in this excerpt; the values referenced elsewhere in the file are
 * LINK_NO, LINK_AUTO, LINK_GUEST and LINK_HOST. */
71 typedef enum LinkJournal {
/* Settings filled in by parse_argv() and consumed by the setup helpers and
 * by main(). */
/* Container root directory (-D). */
78 static char *arg_directory = NULL;
/* User name or uid to run the payload as (-u). */
79 static char *arg_user = NULL;
/* Machine UUID from --uuid=; exported as container_uuid= when non-null. */
80 static sd_id128_t arg_uuid = {};
/* Machine name (-M); also used as the container's host name. */
81 static char *arg_machine = NULL;
/* Slice passed along when registering the machine (-S). */
82 static const char *arg_slice = NULL;
/* --private-network: adds CLONE_NEWNET to the clone() flags in main(). */
83 static bool arg_private_network = false;
84 static bool arg_read_only = false;
85 static bool arg_boot = false;
86 static LinkJournal arg_link_journal = LINK_AUTO;
/* Bit mask, one bit per CAP_* number, of the capabilities retained for the
 * container. --capability= ORs further bits in, and drop_capabilities()
 * passes the complement to capability_bounding_set_drop(). */
87 static uint64_t arg_retain =
89 (1ULL << CAP_DAC_OVERRIDE) |
90 (1ULL << CAP_DAC_READ_SEARCH) |
91 (1ULL << CAP_FOWNER) |
92 (1ULL << CAP_FSETID) |
93 (1ULL << CAP_IPC_OWNER) |
96 (1ULL << CAP_LINUX_IMMUTABLE) |
97 (1ULL << CAP_NET_BIND_SERVICE) |
98 (1ULL << CAP_NET_BROADCAST) |
99 (1ULL << CAP_NET_RAW) |
100 (1ULL << CAP_SETGID) |
101 (1ULL << CAP_SETFCAP) |
102 (1ULL << CAP_SETPCAP) |
103 (1ULL << CAP_SETUID) |
104 (1ULL << CAP_SYS_ADMIN) |
105 (1ULL << CAP_SYS_CHROOT) |
106 (1ULL << CAP_SYS_NICE) |
107 (1ULL << CAP_SYS_PTRACE) |
108 (1ULL << CAP_SYS_TTY_CONFIG) |
109 (1ULL << CAP_SYS_RESOURCE) |
110 (1ULL << CAP_SYS_BOOT) |
111 (1ULL << CAP_AUDIT_WRITE) |
112 (1ULL << CAP_AUDIT_CONTROL);
/* Flat string lists holding consecutive (source, destination) path pairs
 * collected from --bind= and --bind-ro= respectively. */
113 static char **arg_bind = NULL;
114 static char **arg_bind_ro = NULL;
/* Print the command line usage summary to stdout, prefixed with the name the
 * program was invoked as.
 *
 * The option descriptions must stay in sync with parse_argv(): -j is
 * described as --link-journal=guest because the only branch there that sets
 * the journal mode outside of --link-journal= assigns LINK_GUEST. */
116 static int help(void) {
118 printf("%s [OPTIONS...] [PATH] [ARGUMENTS...]\n\n"
119 "Spawn a minimal namespace container for debugging, testing and building.\n\n"
120 " -h --help Show this help\n"
121 " --version Print version string\n"
122 " -D --directory=NAME Root directory for the container\n"
123 " -b --boot Boot up full system (i.e. invoke init)\n"
124 " -u --user=USER Run the command under specified user or uid\n"
125 " --uuid=UUID Set a specific machine UUID for the container\n"
126 " -M --machine=NAME Set the machine name for the container\n"
127 " -S --slice=SLICE Place the container in the specified slice\n"
128 " --private-network Disable network in container\n"
129 " --read-only Mount the root directory read-only\n"
130 " --capability=CAP In addition to the default, retain specified\n"
132 " --link-journal=MODE Link up guest journal, one of no, auto, guest, host\n"
133 " -j Equivalent to --link-journal=guest\n"
134 " --bind=PATH[:PATH] Bind mount a file or directory from the host into\n"
136 " --bind-ro=PATH[:PATH] Similar, but creates a read-only bind mount\n",
137 program_invocation_short_name);
/* Parse the command line into the arg_* globals.
 *
 * Options are read with getopt_long(). The leading '+' in the short option
 * string makes parsing stop at the first non-option argument, so the
 * arguments meant for the container payload are left untouched.
 *
 * Several lines of this function (case labels, breaks, returns) are not
 * visible in this excerpt, so the return convention is not restated here. */
142 static int parse_argv(int argc, char *argv[]) {
155 static const struct option options[] = {
156 { "help", no_argument, NULL, 'h' },
157 { "version", no_argument, NULL, ARG_VERSION },
158 { "directory", required_argument, NULL, 'D' },
159 { "user", required_argument, NULL, 'u' },
160 { "private-network", no_argument, NULL, ARG_PRIVATE_NETWORK },
161 { "boot", no_argument, NULL, 'b' },
162 { "uuid", required_argument, NULL, ARG_UUID },
163 { "read-only", no_argument, NULL, ARG_READ_ONLY },
164 { "capability", required_argument, NULL, ARG_CAPABILITY },
165 { "link-journal", required_argument, NULL, ARG_LINK_JOURNAL },
166 { "bind", required_argument, NULL, ARG_BIND },
167 { "bind-ro", required_argument, NULL, ARG_BIND_RO },
168 { "machine", required_argument, NULL, 'M' },
169 { "slice", required_argument, NULL, 'S' },
178 while ((c = getopt_long(argc, argv, "+hD:u:bM:jS:", options, NULL)) >= 0) {
187 puts(PACKAGE_STRING);
188 puts(SYSTEMD_FEATURES);
/* Store the canonicalized form of the requested root directory. */
193 arg_directory = canonicalize_file_name(optarg);
194 if (!arg_directory) {
195 log_error("Failed to canonicalize root directory.");
203 arg_user = strdup(optarg);
209 case ARG_PRIVATE_NETWORK:
210 arg_private_network = true;
218 r = sd_id128_from_string(optarg, &arg_uuid);
220 log_error("Invalid UUID: %s", optarg);
226 arg_slice = strdup(optarg);
/* The machine name has to pass hostname validation before it is stored. */
233 if (!hostname_is_valid(optarg)) {
234 log_error("Invalid machine name: %s", optarg);
239 arg_machine = strdup(optarg);
246 arg_read_only = true;
249 case ARG_CAPABILITY: {
/* --capability= takes a comma-separated list of capability names. */
253 FOREACH_WORD_SEPARATOR(word, length, optarg, ",", state) {
257 t = strndup(word, length);
261 if (cap_from_name(t, &cap) < 0) {
262 log_error("Failed to parse capability %s.", t);
/* Add the capability's bit to the retained set. */
268 arg_retain |= 1ULL << (uint64_t) cap;
/* NOTE(review): presumably the -j branch; its case label is not visible in
 * this excerpt. */
275 arg_link_journal = LINK_GUEST;
278 case ARG_LINK_JOURNAL:
279 if (streq(optarg, "auto"))
280 arg_link_journal = LINK_AUTO;
281 else if (streq(optarg, "no"))
282 arg_link_journal = LINK_NO;
283 else if (streq(optarg, "guest"))
284 arg_link_journal = LINK_GUEST;
285 else if (streq(optarg, "host"))
286 arg_link_journal = LINK_HOST;
288 log_error("Failed to parse link journal mode %s", optarg);
296 _cleanup_free_ char *a = NULL, *b = NULL;
/* --bind= and --bind-ro= share this code; only the target list differs. */
300 x = c == ARG_BIND ? &arg_bind : &arg_bind_ro;
/* A ':' separates the two paths of a PATH:PATH specification. */
302 e = strchr(optarg, ':');
304 a = strndup(optarg, e - optarg);
/* Both sides of the bind mount must be absolute paths. */
314 if (!path_is_absolute(a) || !path_is_absolute(b)) {
315 log_error("Invalid bind mount specification: %s", optarg);
/* Append source, then destination, so the list holds consecutive pairs. */
319 r = strv_extend(x, a);
323 r = strv_extend(x, b);
334 log_error("Unknown option code %c", c);
/* Mount the file systems listed in mount_table underneath dest.
 *
 * For every entry the target is dest + "/" + where. Entries that name a
 * source are skipped when the target already is a mount point; entries with
 * a NULL source are the follow-up remount steps. The MountPoint field list
 * is not visible in this excerpt. */
342 static int mount_all(const char *dest) {
344 typedef struct MountPoint {
/* NOTE(review): columns appear to be source ("what"), target ("where"),
 * file system type, mount options, mount flags, fatal - confirm against the
 * struct definition. */
353 static const MountPoint mount_table[] = {
354 { "proc", "/proc", "proc", NULL, MS_NOSUID|MS_NOEXEC|MS_NODEV, true },
355 { "/proc/sys", "/proc/sys", NULL, NULL, MS_BIND, true }, /* Bind mount first */
356 { NULL, "/proc/sys", NULL, NULL, MS_BIND|MS_RDONLY|MS_REMOUNT, true }, /* Then, make it r/o */
357 { "sysfs", "/sys", "sysfs", NULL, MS_RDONLY|MS_NOSUID|MS_NOEXEC|MS_NODEV, true },
358 { "tmpfs", "/dev", "tmpfs", "mode=755", MS_NOSUID|MS_STRICTATIME, true },
359 { "devpts", "/dev/pts", "devpts","newinstance,ptmxmode=0666,mode=620,gid=" STRINGIFY(TTY_GID), MS_NOSUID|MS_NOEXEC, true },
360 { "tmpfs", "/dev/shm", "tmpfs", "mode=1777", MS_NOSUID|MS_NODEV|MS_STRICTATIME, true },
361 { "tmpfs", "/run", "tmpfs", "mode=755", MS_NOSUID|MS_NODEV|MS_STRICTATIME, true },
363 { "/sys/fs/selinux", "/sys/fs/selinux", NULL, NULL, MS_BIND, false }, /* Bind mount first */
364 { NULL, "/sys/fs/selinux", NULL, NULL, MS_BIND|MS_RDONLY|MS_REMOUNT, false }, /* Then, make it r/o */
371 for (k = 0; k < ELEMENTSOF(mount_table); k++) {
372 _cleanup_free_ char *where = NULL;
/* Build the target path below the container root. */
375 where = strjoin(dest, "/", mount_table[k].where, NULL);
379 t = path_is_mount_point(where, true);
381 log_error("Failed to detect whether %s is a mount point: %s", where, strerror(-t));
389 /* Skip this entry if it is not a remount. */
390 if (mount_table[k].what && t > 0)
/* Make sure the target directory exists; the result is not checked. */
393 mkdir_p(where, 0755);
/* A failed mount is only reported when the entry is marked fatal. */
395 if (mount(mount_table[k].what,
398 mount_table[k].flags,
399 mount_table[k].options) < 0 &&
400 mount_table[k].fatal) {
402 log_error("mount(%s) failed: %m", where);
/* Bind mount each (source, destination) pair of l into the container.
 *
 * dest:  container root; destinations are resolved as dest + "/" + path.
 * l:     flat list of consecutive source/destination pairs.
 * flags: extra mount flags (e.g. MS_RDONLY); when non-zero they are applied
 *        with a second, remounting mount() call after the bind mount. */
412 static int mount_binds(const char *dest, char **l, unsigned long flags) {
415 STRV_FOREACH_PAIR(x, y, l) {
416 _cleanup_free_ char *where = NULL;
417 struct stat source_st, dest_st;
419 if (stat(*x, &source_st) < 0) {
420 log_error("failed to stat %s: %m", *x);
424 where = strjoin(dest, "/", *y, NULL);
/* If the destination exists already it must have the same file type as
 * the source. */
428 if (stat(where, &dest_st) == 0) {
429 if ((source_st.st_mode & S_IFMT) != (dest_st.st_mode & S_IFMT)) {
430 log_error("The file types of %s and %s do not match. Refusing bind mount",
435 /* Create the mount point, but be conservative -- refuse to create block
436 * and char devices. */
437 if (S_ISDIR(source_st.st_mode))
438 mkdir_p_label(where, 0755);
439 else if (S_ISFIFO(source_st.st_mode))
441 else if (S_ISSOCK(source_st.st_mode))
442 mknod(where, 0644 | S_IFSOCK, 0);
443 else if (S_ISREG(source_st.st_mode))
446 log_error("Refusing to create mountpoint for file: %s", *x);
/* First the plain bind mount ... */
451 if (mount(*x, where, "bind", MS_BIND, NULL) < 0) {
452 log_error("mount(%s) failed: %m", where);
/* ... then, if requested, a remount that applies the extra flags. */
456 if (flags && mount(NULL, where, NULL, MS_REMOUNT|MS_BIND|flags, NULL) < 0) {
457 log_error("mount(%s) failed: %m", where);
/* Make the container's /etc/localtime point at the same zone as the host's.
 *
 * The host zone is taken from the target of the host's /etc/localtime
 * symlink, which must lie below /usr/share/zoneinfo/. The container is left
 * alone when its symlink already names that zone, or when the zone file
 * does not exist inside the container. */
465 static int setup_timezone(const char *dest) {
466 _cleanup_free_ char *where = NULL, *p = NULL, *q = NULL, *check = NULL, *what = NULL;
472 /* Fix the timezone, if possible */
473 r = readlink_malloc("/etc/localtime", &p);
475 log_warning("/etc/localtime is not a symlink, not updating container timezone.");
/* z: zone name of the host, relative to the zoneinfo directory. Both the
 * relative and the absolute link form are accepted. */
479 z = path_startswith(p, "../usr/share/zoneinfo/");
481 z = path_startswith(p, "/usr/share/zoneinfo/");
483 log_warning("/etc/localtime does not point into /usr/share/zoneinfo/, not updating container timezone.");
487 where = strappend(dest, "/etc/localtime");
/* y: zone name the container currently points at, if any. */
491 r = readlink_malloc(where, &q);
493 y = path_startswith(q, "../usr/share/zoneinfo/");
495 y = path_startswith(q, "/usr/share/zoneinfo/");
498 /* Already pointing to the right place? Then do nothing .. */
499 if (y && streq(y, z))
/* Only switch to a zone whose data file exists inside the container. */
503 check = strjoin(dest, "/usr/share/zoneinfo/", z, NULL);
507 if (access(check, F_OK) < 0) {
508 log_warning("Timezone %s does not exist in container, not updating container timezone.", z);
/* The new symlink uses the relative "../usr/share/zoneinfo/" form. */
512 what = strappend("../usr/share/zoneinfo/", z);
517 if (symlink(what, where) < 0) {
518 log_error("Failed to correct timezone of container: %m");
/* Copy the host's /etc/resolv.conf into the container below dest.
 *
 * The arg_private_network check comes first; the statement it guards is
 * not visible in this excerpt. The copy itself is best effort: its result
 * is deliberately ignored. */
525 static int setup_resolv_conf(const char *dest) {
/* Attribute placed before the type, as in every other declaration here. */
526 _cleanup_free_ char *where = NULL;
530 if (arg_private_network)
533 /* Fix resolv.conf, if possible */
534 where = strappend(dest, "/etc/resolv.conf");
538 /* We don't really care for the results of this really. If it
539 * fails, it fails, but meh... */
540 copy_file("/etc/resolv.conf", where, O_TRUNC|O_NOFOLLOW);
/* Give the container its own random boot ID.
 *
 * A freshly generated ID is written, formatted as a UUID string, to a file
 * below the container's /dev and then bind mounted over
 * /proc/sys/kernel/random/boot_id inside the container. Failing to make
 * that mount read-only afterwards only produces a warning. */
545 static int setup_boot_id(const char *dest) {
546 _cleanup_free_ char *from = NULL, *to = NULL;
553 /* Generate a new randomized boot ID, so that each boot-up of
554 * the container gets a new one */
/* from: backing file holding the ID; to: the proc file it is mounted over. */
556 from = strappend(dest, "/dev/proc-sys-kernel-random-boot-id");
557 to = strappend(dest, "/proc/sys/kernel/random/boot_id");
561 r = sd_id128_randomize(&rnd);
563 log_error("Failed to generate random boot id: %s", strerror(-r));
/* Format the 16 bytes with dashes in the 8-4-4-4-12 UUID layout. */
567 snprintf(as_uuid, sizeof(as_uuid),
568 "%02x%02x%02x%02x-%02x%02x-%02x%02x-%02x%02x-%02x%02x%02x%02x%02x%02x",
569 SD_ID128_FORMAT_VAL(rnd));
570 char_array_0(as_uuid);
572 r = write_string_file(from, as_uuid);
574 log_error("Failed to write boot id: %s", strerror(-r));
578 if (mount(from, to, "bind", MS_BIND, NULL) < 0) {
579 log_error("Failed to bind mount boot id: %m");
581 } else if (mount(from, to, "bind", MS_BIND|MS_REMOUNT|MS_RDONLY, NULL))
582 log_warning("Failed to make boot id read-only: %m");
/* Re-create selected host device nodes inside the container's /dev.
 *
 * For every name in the devnodes list (its contents are not visible in this
 * excerpt) the host node /dev/<name> is stat()ed and a node with the same
 * mode and device number is created at <dest>/dev/<name>. A host node that
 * does not exist (ENOENT) is not reported as an error; anything that is
 * neither a character nor a block device is rejected. */
588 static int copy_devnodes(const char *dest) {
590 static const char devnodes[] =
600 _cleanup_umask_ mode_t u;
606 NULSTR_FOREACH(d, devnodes) {
608 _cleanup_free_ char *from = NULL, *to = NULL;
/* from: node on the host; to: node to create below the container root. */
610 asprintf(&from, "/dev/%s", d);
611 asprintf(&to, "%s/dev/%s", dest, d);
622 if (stat(from, &st) < 0) {
624 if (errno != ENOENT) {
625 log_error("Failed to stat %s: %m", from);
630 } else if (!S_ISCHR(st.st_mode) && !S_ISBLK(st.st_mode)) {
632 log_error("%s is not a char or block device, cannot copy", from);
636 } else if (mknod(to, st.st_mode, st.st_rdev) < 0) {
/* Report the node that could not be created, not the container root. */
638 log_error("mknod(%s) failed: %m", to);
/* Create <dest>/dev/ptmx as a relative symlink to pts/ptmx. */
647 static int setup_ptmx(const char *dest) {
648 _cleanup_free_ char *p = NULL;
650 p = strappend(dest, "/dev/ptmx");
654 if (symlink("pts/ptmx", p) < 0) {
655 log_error("Failed to create /dev/ptmx symlink: %m");
/* Make the pty named by console available as <dest>/dev/console.
 *
 * console must be a character device. Its access mode and ownership are
 * reset to 0600, uid 0, gid 0. A device node is then created at
 * <dest>/dev/console and the pty is bind mounted on top of it. */
662 static int setup_dev_console(const char *dest, const char *console) {
664 _cleanup_free_ char *to = NULL;
666 _cleanup_umask_ mode_t u;
673 if (stat(console, &st) < 0) {
674 log_error("Failed to stat %s: %m", console);
677 } else if (!S_ISCHR(st.st_mode)) {
678 log_error("/dev/console is not a char device");
682 r = chmod_and_chown(console, 0600, 0, 0);
684 log_error("Failed to correct access mode for TTY: %s", strerror(-r));
688 if (asprintf(&to, "%s/dev/console", dest) < 0)
691 /* We need to bind mount the right tty to /dev/console since
692 * ptys can only exist on pts file systems. To have something
693 * to bind mount things on we create a device node first, that
694 * has the right major/minor (note that the major minor
695 * doesn't actually matter here, since we mount it over
698 if (mknod(to, (st.st_mode & ~07777) | 0600, st.st_rdev) < 0) {
699 log_error("mknod() for /dev/console failed: %m");
703 if (mount(console, to, "bind", MS_BIND, NULL) < 0) {
704 log_error("Bind mount for /dev/console failed: %m");
/* Provide /proc/kmsg inside the container, backed by a FIFO.
 *
 * dest:        container root directory.
 * kmsg_socket: socket the FIFO's file descriptor is sent over (as an
 *              SCM_RIGHTS message) so that it stays open while the
 *              container runs. */
711 static int setup_kmsg(const char *dest, int kmsg_socket) {
712 _cleanup_free_ char *from = NULL, *to = NULL;
714 _cleanup_umask_ mode_t u;
716 struct cmsghdr cmsghdr;
717 uint8_t buf[CMSG_SPACE(sizeof(int))];
720 .msg_control = &control,
721 .msg_controllen = sizeof(control),
723 struct cmsghdr *cmsg;
726 assert(kmsg_socket >= 0);
730 /* We create the kmsg FIFO as /dev/kmsg, but immediately
731 * delete it after bind mounting it to /proc/kmsg. While FIFOs
732 * on the reading side behave very similar to /proc/kmsg,
733 * their writing side behaves differently from /dev/kmsg in
734 * that writing blocks when nothing is reading. In order to
735 * avoid any problems with containers deadlocking due to this
736 * we simply make /dev/kmsg unavailable to the container. */
737 if (asprintf(&from, "%s/dev/kmsg", dest) < 0 ||
738 asprintf(&to, "%s/proc/kmsg", dest) < 0)
741 if (mkfifo(from, 0600) < 0) {
742 log_error("mkfifo() for /dev/kmsg failed: %m");
746 r = chmod_and_chown(from, 0600, 0, 0);
748 log_error("Failed to correct access mode for /dev/kmsg: %s", strerror(-r));
752 if (mount(from, to, "bind", MS_BIND, NULL) < 0) {
753 log_error("Bind mount for /proc/kmsg failed: %m");
/* Open the FIFO read/write, non-blocking and close-on-exec. */
757 fd = open(from, O_RDWR|O_NDELAY|O_CLOEXEC);
759 log_error("Failed to open fifo: %m");
/* Pack the descriptor into a single SCM_RIGHTS control message. */
763 cmsg = CMSG_FIRSTHDR(&mh);
764 cmsg->cmsg_level = SOL_SOCKET;
765 cmsg->cmsg_type = SCM_RIGHTS;
766 cmsg->cmsg_len = CMSG_LEN(sizeof(int));
767 memcpy(CMSG_DATA(cmsg), &fd, sizeof(int));
769 mh.msg_controllen = cmsg->cmsg_len;
771 /* Store away the fd in the socket, so that it stays open as
772 * long as we run the child */
773 k = sendmsg(kmsg_socket, &mh, MSG_DONTWAIT|MSG_NOSIGNAL);
/* The local copy of the descriptor is closed right after sendmsg(). */
774 close_nointr_nofail(fd);
777 log_error("Failed to send FIFO fd: %m");
781 /* And now make the FIFO unavailable as /dev/kmsg... */
/* Set the host name to the machine name (arg_machine). */
786 static int setup_hostname(void) {
788 if (sethostname(arg_machine, strlen(arg_machine)) < 0)
/* Link the container's journal directory with the host, as selected by
 * arg_link_journal.
 *
 * The machine ID is read from <directory>/etc/machine-id. From it two paths
 * are derived: p, the host-side /var/log/journal/<id>, and q, the same path
 * below the container root. In LINK_GUEST mode p is made a symlink to q;
 * the last visible step bind mounts p onto q.
 *
 * Many control-flow lines are missing from this excerpt, so the exact
 * conditions and return values are not restated here. */
794 static int setup_journal(const char *directory) {
795 sd_id128_t machine_id;
796 _cleanup_free_ char *p = NULL, *b = NULL, *q = NULL, *d = NULL;
800 if (arg_link_journal == LINK_NO)
803 p = strappend(directory, "/etc/machine-id");
807 r = read_one_line_file(p, &b);
808 if (r == -ENOENT && arg_link_journal == LINK_AUTO)
811 log_error("Failed to read machine ID from %s: %s", p, strerror(-r));
816 if (isempty(id) && arg_link_journal == LINK_AUTO)
819 /* Verify validity */
820 r = sd_id128_from_string(id, &machine_id);
822 log_error("Failed to parse machine ID from %s: %s", p, strerror(-r));
/* From here on p is the host-side journal directory and q the guest-side one. */
827 p = strappend("/var/log/journal/", id);
828 q = strjoin(directory, "/var/log/journal/", id, NULL);
/* Neither side may already be a mount point; outside of LINK_AUTO that is
 * logged as an error. */
832 if (path_is_mount_point(p, false) > 0) {
833 if (arg_link_journal != LINK_AUTO) {
834 log_error("%s: already a mount point, refusing to use for journal", p);
841 if (path_is_mount_point(q, false) > 0) {
842 if (arg_link_journal != LINK_AUTO) {
843 log_error("%s: already a mount point, refusing to use for journal", q);
/* Inspect what currently exists at the host-side path. */
850 r = readlink_and_make_absolute(p, &d);
852 if ((arg_link_journal == LINK_GUEST ||
853 arg_link_journal == LINK_AUTO) &&
856 r = mkdir_p(q, 0755);
858 log_warning("failed to create directory %s: %m", q);
863 log_error("Failed to remove symlink %s: %m", p);
866 } else if (r == -EINVAL) {
868 if (arg_link_journal == LINK_GUEST &&
871 if (errno == ENOTDIR) {
872 log_error("%s already exists and is neither a symlink nor a directory", p);
875 log_error("Failed to remove %s: %m", p);
879 } else if (r != -ENOENT) {
880 log_error("readlink(%s) failed: %m", p);
/* LINK_GUEST: the host path becomes a symlink to the guest directory. */
884 if (arg_link_journal == LINK_GUEST) {
886 if (symlink(q, p) < 0) {
887 log_error("Failed to symlink %s to %s: %m", q, p);
891 r = mkdir_p(q, 0755);
893 log_warning("failed to create directory %s: %m", q);
/* LINK_HOST: make sure the host-side directory exists. */
897 if (arg_link_journal == LINK_HOST) {
898 r = mkdir_p(p, 0755);
900 log_error("Failed to create %s: %m", p);
904 } else if (access(p, F_OK) < 0)
907 if (dir_is_empty(q) == 0) {
908 log_error("%s not empty.", q);
912 r = mkdir_p(q, 0755);
914 log_error("Failed to create %s: %m", q);
/* Make the host directory visible at the guest path. */
918 if (mount(p, q, "bind", MS_BIND, NULL) < 0) {
919 log_error("Failed to bind mount journal from host into guest: %m");
/* Drop every capability that is not in arg_retain from the bounding set,
 * returning the helper's result. */
926 static int drop_capabilities(void) {
927 return capability_bounding_set_drop(~arg_retain, false);
/* Register the container with the machine manager over the system bus.
 *
 * A method on interface org.freedesktop.machine1.Manager (object
 * /org/freedesktop/machine1, service org.freedesktop.machine1) is called;
 * the method name and part of the argument list are not visible in this
 * excerpt. Visible arguments include the machine UUID, the root directory
 * and, when arg_slice is non-empty, a "Slice" string property. */
930 static int register_machine(void) {
931 _cleanup_bus_error_free_ sd_bus_error error = SD_BUS_ERROR_NULL;
932 _cleanup_bus_unref_ sd_bus *bus = NULL;
935 r = sd_bus_open_system(&bus);
937 log_error("Failed to open system bus: %s", strerror(-r));
941 r = sd_bus_call_method(
943 "org.freedesktop.machine1",
944 "/org/freedesktop/machine1",
945 "org.freedesktop.machine1.Manager",
951 SD_BUS_MESSAGE_APPEND_ID128(arg_uuid),
955 strempty(arg_directory),
956 !isempty(arg_slice), "Slice", "s", arg_slice);
/* Prefer the bus error's message, falling back to the errno text. */
958 log_error("Failed to register machine: %s", error.message ? error.message : strerror(-r));
/* Probe for the kernel audit subsystem by trying to open a NETLINK_AUDIT
 * socket; the socket is closed again straight away. The return statements
 * are not visible in this excerpt. */
965 static bool audit_enabled(void) {
968 fd = socket(AF_NETLINK, SOCK_RAW, NETLINK_AUDIT);
970 close_nointr_nofail(fd);
/* Program entry point.
 *
 * Parses the options, validates the environment (root privileges, systemd
 * as init, a plausible OS tree), allocates a pty for the container console
 * and clone()s a child into new namespaces. The child prepares the
 * container root (API mounts, device nodes, console, kmsg, boot ID,
 * timezone, resolv.conf, journal, bind mounts), moves into it, adjusts
 * credentials and environment, and finally executes an init binary, the
 * requested command or /bin/bash. The parent relays the pty through
 * process_pty() and translates the child's termination status into log
 * messages and its own result.
 *
 * Numerous short lines (braces, gotos, returns) are absent from this
 * excerpt; the comments below only describe the visible statements. */
976 int main(int argc, char *argv[]) {
978 int r = EXIT_FAILURE, k;
979 _cleanup_close_ int master = -1;
981 const char *console = NULL;
983 _cleanup_close_pipe_ int kmsg_socket_pair[2] = { -1, -1 };
984 _cleanup_fdset_free_ FDSet *fds = NULL;
986 log_parse_environment();
989 k = parse_argv(argc, argv);
/* Determine the container root: the -D argument made absolute, or the
 * current working directory. */
1000 p = path_make_absolute_cwd(arg_directory);
1001 free(arg_directory);
1004 arg_directory = get_current_dir_name();
1006 if (!arg_directory) {
1007 log_error("Failed to determine path, please use -D.");
1011 path_kill_slashes(arg_directory);
/* Derive the machine name from the last component of the root directory. */
1014 arg_machine = strdup(path_get_file_name(arg_directory));
1020 hostname_cleanup(arg_machine, false);
1021 if (isempty(arg_machine)) {
1022 log_error("Failed to determine machine name automatically, please use -M.");
1027 if (geteuid() != 0) {
1028 log_error("Need to be root.");
1032 if (sd_booted() <= 0) {
1033 log_error("Not running on a systemd system.");
1037 if (arg_boot && audit_enabled()) {
1038 log_warning("The kernel auditing subsystem is known to be incompatible with containers.\n"
1039 "Please make sure to turn off auditing with 'audit=0' on the kernel command\n"
1040 "line before using systemd-nspawn. Sleeping for 5s...\n");
1044 if (path_equal(arg_directory, "/")) {
1045 log_error("Spawning container on root directory not supported.");
1049 if (path_is_os_tree(arg_directory) <= 0) {
1050 log_error("Directory %s doesn't look like an OS root directory (/etc/os-release is missing). Refusing.", arg_directory);
/* Collect socket-activation file descriptors passed to us, if any. */
1055 n_fd_passed = sd_listen_fds(false);
1056 if (n_fd_passed > 0) {
1057 k = fdset_new_listen_fds(&fds, false);
1059 log_error("Failed to collect file descriptors: %s", strerror(-k));
1063 fdset_close_others(fds);
/* Allocate the pty that becomes the container's console. */
1066 master = posix_openpt(O_RDWR|O_NOCTTY|O_CLOEXEC|O_NDELAY);
1068 log_error("Failed to acquire pseudo tty: %m");
1072 console = ptsname(master);
1074 log_error("Failed to determine tty name: %m");
1078 log_info("Spawning container %s on %s. Press ^] three times within 1s to abort execution.", arg_machine, arg_directory);
1080 if (unlockpt(master) < 0) {
1081 log_error("Failed to unlock tty: %m");
/* Socket pair used by setup_kmsg() to park the kmsg FIFO descriptor. */
1085 if (socketpair(AF_UNIX, SOCK_DGRAM|SOCK_NONBLOCK|SOCK_CLOEXEC, 0, kmsg_socket_pair) < 0) {
1086 log_error("Failed to create kmsg socket pair: %m");
1090 sd_notify(0, "READY=1");
/* Block the signals the parent handles itself before creating the child. */
1092 assert_se(sigemptyset(&mask) == 0);
1093 sigset_add_many(&mask, SIGCHLD, SIGWINCH, SIGTERM, SIGINT, -1);
1094 assert_se(sigprocmask(SIG_BLOCK, &mask, NULL) == 0);
/* Create the child in new IPC, mount, PID and UTS namespaces, plus a new
 * network namespace when --private-network was given. */
1099 pid = syscall(__NR_clone, SIGCHLD|CLONE_NEWIPC|CLONE_NEWNS|CLONE_NEWPID|CLONE_NEWUTS|(arg_private_network ? CLONE_NEWNET : 0), NULL);
1101 if (errno == EINVAL)
1102 log_error("clone() failed, do you have namespace support enabled in your kernel? (You need UTS, IPC, PID and NET namespacing built in): %m");
1104 log_error("clone() failed: %m");
1111 const char *home = NULL;
1112 uid_t uid = (uid_t) -1;
1113 gid_t gid = (gid_t) -1;
/* Environment handed to the payload; the NULL slots are filled in below. */
1115 const char *envp[] = {
1116 "PATH=/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin",
1117 "container=systemd-nspawn", /* LXC sets container=lxc, so follow the scheme here */
1122 NULL, /* container_uuid */
1123 NULL, /* LISTEN_FDS */
1124 NULL, /* LISTEN_PID */
1128 envp[n_env] = strv_find_prefix(environ, "TERM=");
1132 close_nointr_nofail(master);
1135 close_nointr(STDIN_FILENO);
1136 close_nointr(STDOUT_FILENO);
1137 close_nointr(STDERR_FILENO);
1139 close_nointr_nofail(kmsg_socket_pair[0]);
1140 kmsg_socket_pair[0] = -1;
1142 reset_all_signal_handlers();
1144 assert_se(sigemptyset(&mask) == 0);
1145 assert_se(sigprocmask(SIG_SETMASK, &mask, NULL) == 0);
/* Re-open the pty as stdin and duplicate it onto stdout and stderr. */
1147 k = open_terminal(console, O_RDWR);
1148 if (k != STDIN_FILENO) {
1150 close_nointr_nofail(k);
1154 log_error("Failed to open console: %s", strerror(-k));
1158 if (dup2(STDIN_FILENO, STDOUT_FILENO) != STDOUT_FILENO ||
1159 dup2(STDIN_FILENO, STDERR_FILENO) != STDERR_FILENO) {
1160 log_error("Failed to duplicate console: %m");
1165 log_error("setsid() failed: %m");
/* Have the kernel kill the child when the parent dies. */
1169 if (prctl(PR_SET_PDEATHSIG, SIGKILL) < 0) {
1170 log_error("PR_SET_PDEATHSIG failed: %m");
1174 r = register_machine();
1178 /* Mark everything as slave, so that we still
1179 * receive mounts from the real root, but don't
1180 * propagate mounts to the real root. */
1181 if (mount(NULL, "/", NULL, MS_SLAVE|MS_REC, NULL) < 0) {
1182 log_error("MS_SLAVE|MS_REC failed: %m");
1186 /* Turn directory into bind mount */
1187 if (mount(arg_directory, arg_directory, "bind", MS_BIND|MS_REC, NULL) < 0) {
1188 log_error("Failed to make bind mount: %m");
1193 if (mount(arg_directory, arg_directory, "bind", MS_BIND|MS_REMOUNT|MS_RDONLY|MS_REC, NULL) < 0) {
1194 log_error("Failed to make read-only: %m");
1198 if (mount_all(arg_directory) < 0)
1201 if (copy_devnodes(arg_directory) < 0)
1204 if (setup_ptmx(arg_directory) < 0)
1207 dev_setup(arg_directory);
1209 if (setup_dev_console(arg_directory, console) < 0)
1212 if (setup_kmsg(arg_directory, kmsg_socket_pair[1]) < 0)
1215 close_nointr_nofail(kmsg_socket_pair[1]);
1216 kmsg_socket_pair[1] = -1;
1218 if (setup_boot_id(arg_directory) < 0)
1221 if (setup_timezone(arg_directory) < 0)
1224 if (setup_resolv_conf(arg_directory) < 0)
1227 if (setup_journal(arg_directory) < 0)
1230 if (mount_binds(arg_directory, arg_bind, 0) < 0)
1233 if (mount_binds(arg_directory, arg_bind_ro, MS_RDONLY) < 0)
1236 if (chdir(arg_directory) < 0) {
1237 log_error("chdir(%s) failed: %m", arg_directory);
1241 if (mount(arg_directory, "/", NULL, MS_MOVE, NULL) < 0) {
1242 log_error("mount(MS_MOVE) failed: %m");
1246 if (chroot(".") < 0) {
1247 log_error("chroot() failed: %m");
1251 if (chdir("/") < 0) {
1252 log_error("chdir() failed: %m");
1260 if (drop_capabilities() < 0) {
1261 log_error("drop_capabilities() failed: %m");
1267 /* Note that this resolves user names
1268 * inside the container, and hence
1269 * accesses the NSS modules from the
1270 * container and not the host. This is
1273 if (get_user_creds((const char**)&arg_user, &uid, &gid, &home, NULL) < 0) {
1274 log_error("get_user_creds() failed: %m");
1278 if (mkdir_parents_label(home, 0775) < 0) {
1279 log_error("mkdir_parents_label() failed: %m");
1283 if (mkdir_safe_label(home, 0775, uid, gid) < 0) {
1284 log_error("mkdir_safe_label() failed: %m");
1288 if (initgroups((const char*)arg_user, gid) < 0) {
1289 log_error("initgroups() failed: %m");
1293 if (setresgid(gid, gid, gid) < 0) {
1294 log_error("setresgid() failed: %m");
1298 if (setresuid(uid, uid, uid) < 0) {
1299 log_error("setresuid() failed: %m");
1303 /* Reset everything fully to 0, just in case */
1305 if (setgroups(0, NULL) < 0) {
1306 log_error("setgroups() failed: %m");
1310 if (setresgid(0, 0, 0) < 0) {
1311 log_error("setresgid() failed: %m");
1315 if (setresuid(0, 0, 0) < 0) {
1316 log_error("setresuid() failed: %m");
1321 if ((asprintf((char**)(envp + n_env++), "HOME=%s", home ? home: "/root") < 0) ||
1322 (asprintf((char**)(envp + n_env++), "USER=%s", arg_user ? arg_user : "root") < 0) ||
1323 (asprintf((char**)(envp + n_env++), "LOGNAME=%s", arg_user ? arg_user : "root") < 0)) {
1328 if (!sd_id128_equal(arg_uuid, SD_ID128_NULL)) {
1329 if (asprintf((char**)(envp + n_env++), "container_uuid=" SD_ID128_FORMAT_STR, SD_ID128_FORMAT_VAL(arg_uuid)) < 0) {
1335 if (fdset_size(fds) > 0) {
1336 k = fdset_cloexec(fds, false);
1338 log_error("Failed to unset O_CLOEXEC for file descriptors.");
1342 if ((asprintf((char **)(envp + n_env++), "LISTEN_FDS=%u", n_fd_passed) < 0) ||
1343 (asprintf((char **)(envp + n_env++), "LISTEN_PID=1") < 0)) {
1355 /* Automatically search for the init system */
1357 l = 1 + argc - optind;
1358 a = newa(char*, l + 1);
1359 memcpy(a + 1, argv + optind, l * sizeof(char*));
1361 a[0] = (char*) "/usr/lib/systemd/systemd";
1362 execve(a[0], a, (char**) envp);
1364 a[0] = (char*) "/lib/systemd/systemd";
1365 execve(a[0], a, (char**) envp);
1367 a[0] = (char*) "/sbin/init";
1368 execve(a[0], a, (char**) envp);
1369 } else if (argc > optind)
1370 execvpe(argv[optind], argv + optind, (char**) envp);
1372 chdir(home ? home : "/root");
1373 execle("/bin/bash", "-bash", NULL, (char**) envp);
1376 log_error("execv() failed: %m");
1379 _exit(EXIT_FAILURE);
/* Parent side: relay the pty until the container is done. */
1385 k = process_pty(master, &mask, arg_boot ? pid : 0, SIGRTMIN+3);
1393 /* Kill if it is not dead yet anyway */
1396 k = wait_for_terminate(pid, &status);
/* Translate how the child terminated into a log message and result. */
1404 if (status.si_code == CLD_EXITED) {
1405 r = status.si_status;
1406 if (status.si_status != 0) {
1407 log_error("Container %s failed with error code %i.", arg_machine, status.si_status);
1411 log_debug("Container %s exited successfully.", arg_machine);
1413 } else if (status.si_code == CLD_KILLED &&
1414 status.si_status == SIGINT) {
1415 log_info("Container %s has been shut down.", arg_machine);
1418 } else if (status.si_code == CLD_KILLED &&
1419 status.si_status == SIGHUP) {
1420 log_info("Container %s is being rebooted.", arg_machine);
1422 } else if (status.si_code == CLD_KILLED ||
1423 status.si_code == CLD_DUMPED) {
1425 log_error("Container %s terminated by signal %s.", arg_machine, signal_to_string(status.si_status));
1429 log_error("Container %s failed due to unknown reason.", arg_machine);
1439 free(arg_directory);