1 /*-*- Mode: C; c-basic-offset: 8; indent-tabs-mode: nil -*-*/
4 This file is part of systemd.
6 Copyright 2010 Lennart Poettering
8 systemd is free software; you can redistribute it and/or modify it
9 under the terms of the GNU Lesser General Public License as published by
10 the Free Software Foundation; either version 2.1 of the License, or
11 (at your option) any later version.
13 systemd is distributed in the hope that it will be useful, but
14 WITHOUT ANY WARRANTY; without even the implied warranty of
15 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
16 Lesser General Public License for more details.
18 You should have received a copy of the GNU Lesser General Public License
19 along with systemd; If not, see <http://www.gnu.org/licenses/>.
25 #include <sys/types.h>
26 #include <sys/syscall.h>
27 #include <sys/mount.h>
33 #include <sys/prctl.h>
34 #include <sys/capability.h>
36 #include <sys/epoll.h>
38 #include <sys/signalfd.h>
42 #include <sys/socket.h>
44 #include <systemd/sd-daemon.h>
51 #include "cgroup-util.h"
53 #include "path-util.h"
54 #include "loopback-setup.h"
56 #include "dev-setup.h"
58 typedef enum LinkJournal {
/* Command line settings. parse_argv() replaces the defaults below for
 * the options it is given. */
65 static char *arg_directory = NULL;
66 static char *arg_user = NULL;
67 static char **arg_controllers = NULL;
68 static char *arg_uuid = NULL;
69 static bool arg_private_network = false;
70 static bool arg_read_only = false;
71 static bool arg_boot = false;
72 static LinkJournal arg_link_journal = LINK_AUTO;
/* Bit mask with one bit per CAP_* number. --capability= ORs further
 * bits into it, and drop_capabilities() hands the complement to
 * capability_bounding_set_drop(). */
73 static uint64_t arg_retain =
75 (1ULL << CAP_DAC_OVERRIDE) |
76 (1ULL << CAP_DAC_READ_SEARCH) |
77 (1ULL << CAP_FOWNER) |
78 (1ULL << CAP_FSETID) |
79 (1ULL << CAP_IPC_OWNER) |
82 (1ULL << CAP_LINUX_IMMUTABLE) |
83 (1ULL << CAP_NET_BIND_SERVICE) |
84 (1ULL << CAP_NET_BROADCAST) |
85 (1ULL << CAP_NET_RAW) |
86 (1ULL << CAP_SETGID) |
87 (1ULL << CAP_SETFCAP) |
88 (1ULL << CAP_SETPCAP) |
89 (1ULL << CAP_SETUID) |
90 (1ULL << CAP_SYS_ADMIN) |
91 (1ULL << CAP_SYS_CHROOT) |
92 (1ULL << CAP_SYS_NICE) |
93 (1ULL << CAP_SYS_PTRACE) |
94 (1ULL << CAP_SYS_TTY_CONFIG) |
95 (1ULL << CAP_SYS_RESOURCE);
/* Print the command line synopsis and the option list to stdout.
 * The -j entry matches parse_argv(), which stores LINK_GUEST for the
 * short option. */
97 static int help(void) {
99 printf("%s [OPTIONS...] [PATH] [ARGUMENTS...]\n\n"
100 "Spawn a minimal namespace container for debugging, testing and building.\n\n"
101 " -h --help Show this help\n"
102 " -D --directory=NAME Root directory for the container\n"
103 " -b --boot Boot up full system (i.e. invoke init)\n"
104 " -u --user=USER Run the command under specified user or uid\n"
105 " -C --controllers=LIST Put the container in specified comma-separated cgroup hierarchies\n"
106 " --uuid=UUID Set a specific machine UUID for the container\n"
107 " --private-network Disable network in container\n"
108 " --read-only Mount the root directory read-only\n"
109 " --capability=CAP In addition to the default, retain specified capability\n"
110 " --link-journal=MODE Link up guest journal, one of no, auto, guest, host\n"
111 " -j Equivalent to --link-journal=guest\n",
112 program_invocation_short_name);
/* Parse the command line with getopt_long() and store the result in
 * the arg_* globals. Problems are reported through log_error(). */
117 static int parse_argv(int argc, char *argv[]) {
/* Codes for options that only have a long form; they start above the
 * range of single characters. */
120 ARG_PRIVATE_NETWORK = 0x100,
127 static const struct option options[] = {
128 { "help", no_argument, NULL, 'h' },
129 { "directory", required_argument, NULL, 'D' },
130 { "user", required_argument, NULL, 'u' },
131 { "controllers", required_argument, NULL, 'C' },
132 { "private-network", no_argument, NULL, ARG_PRIVATE_NETWORK },
133 { "boot", no_argument, NULL, 'b' },
134 { "uuid", required_argument, NULL, ARG_UUID },
135 { "read-only", no_argument, NULL, ARG_READ_ONLY },
136 { "capability", required_argument, NULL, ARG_CAPABILITY },
137 { "link-journal", required_argument, NULL, ARG_LINK_JOURNAL },
/* The leading + makes getopt_long() stop at the first non-option, so
 * options of the command to run in the container are left alone. */
146 while ((c = getopt_long(argc, argv, "+hD:u:C:bj", options, NULL)) >= 0) {
156 arg_directory = canonicalize_file_name(optarg);
157 if (!arg_directory) {
158 log_error("Failed to canonicalize root directory.");
166 if (!(arg_user = strdup(optarg))) {
167 log_error("Failed to duplicate user name.");
/* A repeated controller list replaces the previous one; duplicates
 * within the list are removed. */
174 strv_free(arg_controllers);
175 arg_controllers = strv_split(optarg, ",");
176 if (!arg_controllers) {
177 log_error("Failed to split controllers list.");
180 strv_uniq(arg_controllers);
184 case ARG_PRIVATE_NETWORK:
185 arg_private_network = true;
197 arg_read_only = true;
/* Each comma separated word is looked up with cap_from_name() and the
 * matching bit is added to arg_retain. */
200 case ARG_CAPABILITY: {
204 FOREACH_WORD_SEPARATOR(word, length, optarg, ",", state) {
208 t = strndup(word, length);
212 if (cap_from_name(t, &cap) < 0) {
213 log_error("Failed to parse capability %s.", t);
219 arg_retain |= 1ULL << (uint64_t) cap;
/* NOTE(review): presumably the -j case. It stores LINK_GUEST, so the
 * help text for -j has to name the guest mode as well. */
226 arg_link_journal = LINK_GUEST;
229 case ARG_LINK_JOURNAL:
230 if (streq(optarg, "auto"))
231 arg_link_journal = LINK_AUTO;
232 else if (streq(optarg, "no"))
233 arg_link_journal = LINK_NO;
234 else if (streq(optarg, "guest"))
235 arg_link_journal = LINK_GUEST;
236 else if (streq(optarg, "host"))
237 arg_link_journal = LINK_HOST;
239 log_error("Failed to parse link journal mode %s", optarg);
249 log_error("Unknown option code %c", c);
/* Walk mount_table and mount every entry below the container root
 * dest. */
257 static int mount_all(const char *dest) {
259 typedef struct MountPoint {
/* NOTE(review): the columns look like what, where, type, options,
 * flags, fatal - confirm against the MountPoint definition. */
268 static const MountPoint mount_table[] = {
269 { "proc", "/proc", "proc", NULL, MS_NOSUID|MS_NOEXEC|MS_NODEV, true },
270 { "/proc/sys", "/proc/sys", NULL, NULL, MS_BIND, true }, /* Bind mount first */
271 { NULL, "/proc/sys", NULL, NULL, MS_BIND|MS_RDONLY|MS_REMOUNT, true }, /* Then, make it r/o */
272 { "sysfs", "/sys", "sysfs", NULL, MS_RDONLY|MS_NOSUID|MS_NOEXEC|MS_NODEV, true },
273 { "tmpfs", "/dev", "tmpfs", "mode=755", MS_NOSUID|MS_STRICTATIME, true },
274 { "/dev/pts", "/dev/pts", NULL, NULL, MS_BIND, true },
275 { "tmpfs", "/run", "tmpfs", "mode=755", MS_NOSUID|MS_NODEV|MS_STRICTATIME, true },
277 { "/sys/fs/selinux", "/sys/fs/selinux", NULL, NULL, MS_BIND, false }, /* Bind mount first */
278 { NULL, "/sys/fs/selinux", NULL, NULL, MS_BIND|MS_RDONLY|MS_REMOUNT, false }, /* Then, make it r/o */
286 for (k = 0; k < ELEMENTSOF(mount_table); k++) {
/* Target path inside the container tree. */
289 if (asprintf(&where, "%s/%s", dest, mount_table[k].where) < 0) {
298 t = path_is_mount_point(where, true);
300 log_error("Failed to detect whether %s is a mount point: %s", where, strerror(-t));
/* The result of mkdir_p_label() is ignored; a missing directory shows
 * up as a failure of the mount() call that follows. */
312 mkdir_p_label(where, 0755);
/* A failed mount is only treated as an error for entries whose fatal
 * field is true. */
314 if (mount(mount_table[k].what,
317 mount_table[k].flags,
318 mount_table[k].options) < 0 &&
319 mount_table[k].fatal) {
321 log_error("mount(%s) failed: %m", where);
/* Bind mount the /etc/localtime and /etc/timezone files of the host
 * over the same paths below dest. After each successful bind mount a
 * second mount() call remounts it read-only; the result of that second
 * call is ignored. */
333 static int setup_timezone(const char *dest) {
338 /* Fix the timezone, if possible */
339 if (asprintf(&where, "%s/etc/localtime", dest) < 0)
342 if (mount("/etc/localtime", where, "bind", MS_BIND, NULL) >= 0)
343 mount("/etc/localtime", where, "bind", MS_BIND|MS_REMOUNT|MS_RDONLY, NULL);
347 if (asprintf(&where, "%s/etc/timezone", dest) < 0)
350 if (mount("/etc/timezone", where, "bind", MS_BIND, NULL) >= 0)
351 mount("/etc/timezone", where, "bind", MS_BIND|MS_REMOUNT|MS_RDONLY, NULL);
/* Bind mount /etc/resolv.conf of the host over dest/etc/resolv.conf
 * and, if that worked, remount it read-only (result ignored). */
358 static int setup_resolv_conf(const char *dest) {
/* NOTE(review): presumably returns early here, since a container with
 * a private network cannot reach the resolvers of the host - confirm. */
363 if (arg_private_network)
366 /* Fix resolv.conf, if possible */
367 if (asprintf(&where, "%s/etc/resolv.conf", dest) < 0) {
371 if (mount("/etc/resolv.conf", where, "bind", MS_BIND, NULL) >= 0)
372 mount("/etc/resolv.conf", where, "bind", MS_BIND|MS_REMOUNT|MS_RDONLY, NULL);
/* For every name in the devnodes list, stat /dev/NAME on the host and
 * create a node with the same mode and device number at
 * dest/dev/NAME. */
379 static int copy_devnodes(const char *dest) {
381 static const char devnodes[] =
399 NULSTR_FOREACH(d, devnodes) {
401 char *from = NULL, *to = NULL;
403 asprintf(&from, "/dev/%s", d);
404 asprintf(&to, "%s/dev/%s", dest, d);
407 log_error("Failed to allocate devnode path");
420 if (stat(from, &st) < 0) {
/* A node that does not exist on the host is not an error. */
422 if (errno != ENOENT) {
423 log_error("Failed to stat %s: %m", from);
428 } else if (!S_ISCHR(st.st_mode) && !S_ISBLK(st.st_mode)) {
430 log_error("%s is not a char or block device, cannot copy.", from);
434 } else if (mknod(to, st.st_mode, st.st_rdev) < 0) {
/* Report the node that could not be created, not the container root. */
436 log_error("mknod(%s) failed: %m", to);
/* Make the terminal named by console show up as dest/dev/console: a
 * device node is created at that path and console is bind mounted on
 * top of it. */
450 static int setup_dev_console(const char *dest, const char *console) {
461 if (stat(console, &st) < 0) {
462 log_error("Failed to stat %s: %m", console);
466 } else if (!S_ISCHR(st.st_mode)) {
467 log_error("/dev/console is not a char device.");
/* Restrict the terminal to mode 0600, owner 0 and group 0. */
472 r = chmod_and_chown(console, 0600, 0, 0);
474 log_error("Failed to correct access mode for TTY: %s", strerror(-r));
478 if (asprintf(&to, "%s/dev/console", dest) < 0) {
483 /* We need to bind mount the right tty to /dev/console since
484 * ptys can only exist on pts file systems. To have something
485 * to bind mount things on we create a device node first, that
486 * has the right major/minor (note that the major minor
487 * doesn't actually matter here, since we mount it over
490 if (mknod(to, (st.st_mode & ~07777) | 0600, st.st_rdev) < 0) {
491 log_error("mknod() for /dev/console failed: %m");
496 if (mount(console, to, "bind", MS_BIND, NULL) < 0) {
497 log_error("Bind mount for /dev/console failed: %m");
/* Create a FIFO at dest/dev/kmsg, bind mount it to dest/proc/kmsg and
 * send an open file descriptor of the FIFO over kmsg_socket as an
 * SCM_RIGHTS control message. */
509 static int setup_kmsg(const char *dest, int kmsg_socket) {
510 char *from = NULL, *to = NULL;
/* Buffer sized for a control message that carries exactly one int. */
514 struct cmsghdr cmsghdr;
515 uint8_t buf[CMSG_SPACE(sizeof(int))];
518 struct cmsghdr *cmsg;
521 assert(kmsg_socket >= 0);
525 /* We create the kmsg FIFO as /dev/kmsg, but immediately
526 * delete it after bind mounting it to /proc/kmsg. While FIFOs
527 * on the reading side behave very similar to /proc/kmsg,
528 * their writing side behaves differently from /dev/kmsg in
529 * that writing blocks when nothing is reading. In order to
530 * avoid any problems with containers deadlocking due to this
531 * we simply make /dev/kmsg unavailable to the container. */
532 if (asprintf(&from, "%s/dev/kmsg", dest) < 0) {
537 if (asprintf(&to, "%s/proc/kmsg", dest) < 0) {
542 if (mkfifo(from, 0600) < 0) {
543 log_error("mkfifo() for /dev/kmsg failed: %m");
548 r = chmod_and_chown(from, 0600, 0, 0);
550 log_error("Failed to correct access mode for /dev/kmsg: %s", strerror(-r));
554 if (mount(from, to, "bind", MS_BIND, NULL) < 0) {
555 log_error("Bind mount for /proc/kmsg failed: %m");
560 fd = open(from, O_RDWR|O_NDELAY|O_CLOEXEC);
562 log_error("Failed to open fifo: %m");
/* Fill in the one control message that carries fd. */
570 mh.msg_control = &control;
571 mh.msg_controllen = sizeof(control);
573 cmsg = CMSG_FIRSTHDR(&mh);
574 cmsg->cmsg_level = SOL_SOCKET;
575 cmsg->cmsg_type = SCM_RIGHTS;
576 cmsg->cmsg_len = CMSG_LEN(sizeof(int));
577 memcpy(CMSG_DATA(cmsg), &fd, sizeof(int));
/* Trim the control length to the length of that single message. */
579 mh.msg_controllen = cmsg->cmsg_len;
581 /* Store away the fd in the socket, so that it stays open as
582 * long as we run the child */
583 k = sendmsg(kmsg_socket, &mh, MSG_DONTWAIT|MSG_NOSIGNAL);
/* The local descriptor is closed right after sendmsg(), before the
 * result k is looked at. */
584 close_nointr_nofail(fd);
587 log_error("Failed to send FIFO fd: %m");
592 /* And now make the FIFO unavailable as /dev/kmsg... */
/* Set the host name from arg_directory: the value that
 * path_get_file_name() yields (presumably the last path component) is
 * passed through hostname_cleanup() and then to sethostname(). */
603 static int setup_hostname(void) {
607 hn = path_get_file_name(arg_directory);
613 hostname_cleanup(hn);
616 if (sethostname(hn, strlen(hn)) < 0)
/* Connect the journal directory of the container with the host
 * according to arg_link_journal. The machine ID read from
 * directory/etc/machine-id names both directories:
 * /var/log/journal/ID on the host and the same path below directory. */
625 static int setup_journal(const char *directory) {
626 sd_id128_t machine_id;
627 char *p = NULL, *b = NULL, *l, *q = NULL, *d = NULL;
630 if (arg_link_journal == LINK_NO)
633 p = strappend(directory, "/etc/machine-id");
639 r = read_one_line_file(p, &b);
640 if (r == -ENOENT && arg_link_journal == LINK_AUTO) {
644 log_error("Failed to read machine ID: %s", strerror(-r));
649 if (isempty(l) && arg_link_journal == LINK_AUTO) {
654 /* Verify validity */
655 r = sd_id128_from_string(l, &machine_id);
657 log_error("Failed to parse machine ID: %s", strerror(-r));
/* From here on p is the journal path on the host and q the journal
 * path inside the container tree. */
662 p = strappend("/var/log/journal/", l);
663 q = strjoin(directory, "/var/log/journal/", l, NULL);
669 if (path_is_mount_point(p, false) > 0 ||
670 path_is_mount_point(q, false) > 0) {
671 if (arg_link_journal != LINK_AUTO) {
672 log_error("Journal already a mount point, refusing.");
/* Look at what currently sits at the host path: a symlink, something
 * that is not a symlink (-EINVAL), or nothing (-ENOENT). */
681 r = readlink_and_make_absolute(p, &d);
683 if ((arg_link_journal == LINK_GUEST ||
684 arg_link_journal == LINK_AUTO) &&
694 log_error("Failed to remove symlink %s: %m", p);
698 } else if (r == -EINVAL) {
700 if (arg_link_journal == LINK_GUEST &&
703 if (errno == ENOTDIR)
704 log_error("%s already exists and is neither symlink nor directory.", p);
706 log_error("Failed to remove %s: %m", p);
712 } else if (r != -ENOENT) {
713 log_error("readlink(%s) failed: %m", p);
/* Guest mode: the host path becomes a symlink to the container
 * journal. */
717 if (arg_link_journal == LINK_GUEST) {
719 if (symlink(q, p) < 0) {
720 log_error("Failed to symlink %s to %s: %m", q, p);
731 if (arg_link_journal == LINK_HOST) {
/* NOTE(review): the result is kept in r but the message prints errno
 * through %m - confirm that mkdir_p() leaves errno set on failure. */
732 r = mkdir_p(p, 0755);
734 log_error("Failed to create %s: %m", p);
738 } else if (access(p, F_OK) < 0) {
743 if (dir_is_empty(q) == 0) {
744 log_error("%s not empty.", q);
749 r = mkdir_p(q, 0755);
751 log_error("Failed to create %s: %m", q);
/* The host journal directory is bind mounted into the container. */
755 if (mount(p, q, "bind", MS_BIND, NULL) < 0) {
756 log_error("Failed to bind mount journal from host into guest: %m");
/* Pass the complement of arg_retain, i.e. every capability that is not
 * to be retained, to capability_bounding_set_drop() and return its
 * result. */
772 static int drop_capabilities(void) {
773 return capability_bounding_set_drop(~arg_retain, false);
/* Check whether path looks like the root of an OS tree by testing
 * path/bin/sh. Yields 1 when the test result r is not negative and 0
 * otherwise. */
776 static int is_os_tree(const char *path) {
779 /* We use /bin/sh as flag file if something is an OS */
781 if (asprintf(&p, "%s/bin/sh", path) < 0)
787 return r < 0 ? 0 : 1;
/* Relay data between the terminal of the caller and the pty master:
 * stdin is copied to master and master is copied to stdout, each
 * through a LINE_MAX sized buffer. The signals in mask are received
 * through a signalfd; SIGWINCH makes the window size of stdin get
 * copied to master. */
790 static int process_pty(int master, sigset_t *mask) {
792 char in_buffer[LINE_MAX], out_buffer[LINE_MAX];
793 size_t in_buffer_full = 0, out_buffer_full = 0;
794 struct epoll_event stdin_ev, stdout_ev, master_ev, signal_ev;
795 bool stdin_readable = false, stdout_writable = false, master_readable = false, master_writable = false;
796 int ep = -1, signal_fd = -1, r;
798 fd_nonblock(STDIN_FILENO, 1);
799 fd_nonblock(STDOUT_FILENO, 1);
800 fd_nonblock(master, 1);
802 signal_fd = signalfd(-1, mask, SFD_NONBLOCK|SFD_CLOEXEC);
804 log_error("signalfd(): %m");
809 ep = epoll_create1(EPOLL_CLOEXEC);
811 log_error("Failed to create epoll: %m");
/* The three data descriptors are registered edge-triggered (EPOLLET).
 * The *_readable and *_writable flags remember a reported edge until
 * a read() or write() on that descriptor fails, e.g. with EAGAIN. */
817 stdin_ev.events = EPOLLIN|EPOLLET;
818 stdin_ev.data.fd = STDIN_FILENO;
821 stdout_ev.events = EPOLLOUT|EPOLLET;
822 stdout_ev.data.fd = STDOUT_FILENO;
825 master_ev.events = EPOLLIN|EPOLLOUT|EPOLLET;
826 master_ev.data.fd = master;
829 signal_ev.events = EPOLLIN;
830 signal_ev.data.fd = signal_fd;
832 if (epoll_ctl(ep, EPOLL_CTL_ADD, STDIN_FILENO, &stdin_ev) < 0 ||
833 epoll_ctl(ep, EPOLL_CTL_ADD, STDOUT_FILENO, &stdout_ev) < 0 ||
834 epoll_ctl(ep, EPOLL_CTL_ADD, master, &master_ev) < 0 ||
835 epoll_ctl(ep, EPOLL_CTL_ADD, signal_fd, &signal_ev) < 0) {
836 log_error("Failed to register fds in epoll: %m");
842 struct epoll_event ev[16];
846 nfds = epoll_wait(ep, ev, ELEMENTSOF(ev), -1);
849 if (errno == EINTR || errno == EAGAIN)
852 log_error("epoll_wait(): %m");
/* Translate the reported events into the readiness flags. EPOLLHUP
 * counts as ready too, so the next read() or write() sees the
 * condition. */
859 for (i = 0; i < nfds; i++) {
860 if (ev[i].data.fd == STDIN_FILENO) {
862 if (ev[i].events & (EPOLLIN|EPOLLHUP))
863 stdin_readable = true;
865 } else if (ev[i].data.fd == STDOUT_FILENO) {
867 if (ev[i].events & (EPOLLOUT|EPOLLHUP))
868 stdout_writable = true;
870 } else if (ev[i].data.fd == master) {
872 if (ev[i].events & (EPOLLIN|EPOLLHUP))
873 master_readable = true;
875 if (ev[i].events & (EPOLLOUT|EPOLLHUP))
876 master_writable = true;
878 } else if (ev[i].data.fd == signal_fd) {
879 struct signalfd_siginfo sfsi;
882 n = read(signal_fd, &sfsi, sizeof(sfsi));
883 if (n != sizeof(sfsi)) {
886 log_error("Failed to read from signalfd: invalid block size");
891 if (errno != EINTR && errno != EAGAIN) {
892 log_error("Failed to read from signalfd: %m");
898 if (sfsi.ssi_signo == SIGWINCH) {
901 /* The window size changed, let's forward that. */
902 if (ioctl(STDIN_FILENO, TIOCGWINSZ, &ws) >= 0)
903 ioctl(master, TIOCSWINSZ, &ws);
/* Keep copying while at least one direction can make progress: an
 * empty buffer with a readable source, or a filled buffer with a
 * writable sink. */
912 while ((stdin_readable && in_buffer_full <= 0) ||
913 (master_writable && in_buffer_full > 0) ||
914 (master_readable && out_buffer_full <= 0) ||
915 (stdout_writable && out_buffer_full > 0)) {
917 if (stdin_readable && in_buffer_full < LINE_MAX) {
919 k = read(STDIN_FILENO, in_buffer + in_buffer_full, LINE_MAX - in_buffer_full);
922 if (errno == EAGAIN || errno == EPIPE || errno == ECONNRESET || errno == EIO)
923 stdin_readable = false;
925 log_error("read(): %m");
930 in_buffer_full += (size_t) k;
933 if (master_writable && in_buffer_full > 0) {
935 k = write(master, in_buffer, in_buffer_full);
938 if (errno == EAGAIN || errno == EPIPE || errno == ECONNRESET || errno == EIO)
939 master_writable = false;
941 log_error("write(): %m");
/* Move the bytes that were not written yet to the buffer start. */
947 assert(in_buffer_full >= (size_t) k);
948 memmove(in_buffer, in_buffer + k, in_buffer_full - k);
953 if (master_readable && out_buffer_full < LINE_MAX) {
955 k = read(master, out_buffer + out_buffer_full, LINE_MAX - out_buffer_full);
958 if (errno == EAGAIN || errno == EPIPE || errno == ECONNRESET || errno == EIO)
959 master_readable = false;
961 log_error("read(): %m");
966 out_buffer_full += (size_t) k;
969 if (stdout_writable && out_buffer_full > 0) {
971 k = write(STDOUT_FILENO, out_buffer, out_buffer_full);
974 if (errno == EAGAIN || errno == EPIPE || errno == ECONNRESET || errno == EIO)
975 stdout_writable = false;
977 log_error("write(): %m");
983 assert(out_buffer_full >= (size_t) k);
984 memmove(out_buffer, out_buffer + k, out_buffer_full - k);
985 out_buffer_full -= k;
993 close_nointr_nofail(ep);
996 close_nointr_nofail(signal_fd);
/* Entry point. Checks the preconditions (root, systemd host, usable OS
 * tree), moves into a fresh cgroup, allocates a pty and clones a
 * process into new namespaces. That process prepares the container
 * root, changes into it and executes init or the requested command,
 * while the caller relays the pty through process_pty() and waits for
 * the clone to terminate. */
1001 int main(int argc, char *argv[]) {
1003 int r = EXIT_FAILURE, k;
1004 char *oldcg = NULL, *newcg = NULL;
1005 char **controller = NULL;
1007 const char *console = NULL;
1008 struct termios saved_attr, raw_attr;
1010 bool saved_attr_valid = false;
1012 int kmsg_socket_pair[2] = { -1, -1 };
1014 log_parse_environment();
1017 r = parse_argv(argc, argv);
/* The container root is the -D argument made absolute, or the current
 * directory when no -D was given. */
1021 if (arg_directory) {
1024 p = path_make_absolute_cwd(arg_directory);
1025 free(arg_directory);
1028 arg_directory = get_current_dir_name();
1030 if (!arg_directory) {
1031 log_error("Failed to determine path");
1035 path_kill_slashes(arg_directory);
1037 if (geteuid() != 0) {
1038 log_error("Need to be root.");
1042 if (sd_booted() <= 0) {
1043 log_error("Not running on a systemd system.");
1047 if (path_equal(arg_directory, "/")) {
1048 log_error("Spawning container on root directory not supported.");
1052 if (is_os_tree(arg_directory) <= 0) {
1053 log_error("Directory %s doesn't look like an OS root directory. Refusing.", arg_directory);
/* Create a child cgroup named after our PID below the current one and
 * move into it; the additional controllers from -C are best effort. */
1057 k = cg_get_by_pid(SYSTEMD_CGROUP_CONTROLLER, 0, &oldcg);
1059 log_error("Failed to determine current cgroup: %s", strerror(-k));
1063 if (asprintf(&newcg, "%s/nspawn-%lu", oldcg, (unsigned long) getpid()) < 0) {
1064 log_error("Failed to allocate cgroup path.");
1068 k = cg_create_and_attach(SYSTEMD_CGROUP_CONTROLLER, newcg, 0);
1070 log_error("Failed to create cgroup: %s", strerror(-k));
1074 STRV_FOREACH(controller, arg_controllers) {
1075 k = cg_create_and_attach(*controller, newcg, 0);
1077 log_warning("Failed to create cgroup in controller %s: %s", *controller, strerror(-k));
1080 master = posix_openpt(O_RDWR|O_NOCTTY|O_CLOEXEC|O_NDELAY);
1082 log_error("Failed to acquire pseudo tty: %m");
1086 console = ptsname(master);
1088 log_error("Failed to determine tty name: %m");
1092 log_info("Spawning namespace container on %s (console is %s).", arg_directory, console);
/* Give the pty the window size of our own terminal. */
1094 if (ioctl(STDIN_FILENO, TIOCGWINSZ, &ws) >= 0)
1095 ioctl(master, TIOCSWINSZ, &ws);
1097 if (unlockpt(master) < 0) {
1098 log_error("Failed to unlock tty: %m");
/* Remember the terminal settings so they can be restored on exit, then
 * switch our terminal to raw mode without echo. */
1102 if (tcgetattr(STDIN_FILENO, &saved_attr) < 0) {
1103 log_error("Failed to get terminal attributes: %m");
1107 saved_attr_valid = true;
1109 raw_attr = saved_attr;
1110 cfmakeraw(&raw_attr);
1111 raw_attr.c_lflag &= ~ECHO;
1113 if (tcsetattr(STDIN_FILENO, TCSANOW, &raw_attr) < 0) {
1114 log_error("Failed to set terminal attributes: %m");
1118 if (socketpair(AF_UNIX, SOCK_DGRAM|SOCK_NONBLOCK|SOCK_CLOEXEC, 0, kmsg_socket_pair) < 0) {
1119 log_error("Failed to create kmsg socket pair: %m");
/* Block the signals that process_pty() later reads through its
 * signalfd. */
1123 assert_se(sigemptyset(&mask) == 0);
1124 sigset_add_many(&mask, SIGCHLD, SIGWINCH, SIGTERM, SIGINT, -1);
1125 assert_se(sigprocmask(SIG_BLOCK, &mask, NULL) == 0);
/* The clone gets new IPC, mount, PID and UTS namespaces, plus a new
 * network namespace when --private-network was given. */
1127 pid = syscall(__NR_clone, SIGCHLD|CLONE_NEWIPC|CLONE_NEWNS|CLONE_NEWPID|CLONE_NEWUTS|(arg_private_network ? CLONE_NEWNET : 0), NULL);
1129 if (errno == EINVAL)
1130 log_error("clone() failed, do you have namespace support enabled in your kernel? (You need UTS, IPC, PID and NET namespacing built in): %m");
1132 log_error("clone() failed: %m");
/* NOTE(review): everything from here to _exit() is presumably the
 * branch that runs in the cloned process. */
1140 const char *home = NULL;
1141 uid_t uid = (uid_t) -1;
1142 gid_t gid = (gid_t) -1;
1143 const char *envp[] = {
1144 "PATH=/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin",
1145 "container=systemd-nspawn", /* LXC sets container=lxc, so follow the scheme here */
1150 NULL, /* container_uuid */
1154 envp[2] = strv_find_prefix(environ, "TERM=");
1156 close_nointr_nofail(master);
1158 close_nointr(STDIN_FILENO);
1159 close_nointr(STDOUT_FILENO);
1160 close_nointr(STDERR_FILENO);
/* Close every descriptor except the kmsg socket end used below. */
1162 close_all_fds(&kmsg_socket_pair[1], 1);
1164 reset_all_signal_handlers();
1166 assert_se(sigemptyset(&mask) == 0);
1167 assert_se(sigprocmask(SIG_SETMASK, &mask, NULL) == 0);
/* Reopen the pty slave as stdin and duplicate it to stdout and
 * stderr. */
1169 if (open_terminal(console, O_RDWR) != STDIN_FILENO ||
1170 dup2(STDIN_FILENO, STDOUT_FILENO) != STDOUT_FILENO ||
1171 dup2(STDIN_FILENO, STDERR_FILENO) != STDERR_FILENO)
1175 log_error("setsid() failed: %m");
1179 if (prctl(PR_SET_PDEATHSIG, SIGKILL) < 0) {
1180 log_error("PR_SET_PDEATHSIG failed: %m");
1184 /* Mark everything as slave, so that we still
1185 * receive mounts from the real root, but don't
1186 * propagate mounts to the real root. */
1187 if (mount(NULL, "/", NULL, MS_SLAVE|MS_REC, NULL) < 0) {
1188 log_error("MS_SLAVE|MS_REC failed: %m");
1192 /* Turn directory into bind mount */
1193 if (mount(arg_directory, arg_directory, "bind", MS_BIND|MS_REC, NULL) < 0) {
1194 log_error("Failed to make bind mount.");
1199 if (mount(arg_directory, arg_directory, "bind", MS_BIND|MS_REMOUNT|MS_RDONLY|MS_REC, NULL) < 0) {
1200 log_error("Failed to make read-only.");
1204 if (mount_all(arg_directory) < 0)
1207 if (copy_devnodes(arg_directory) < 0)
1210 dev_setup(arg_directory);
1212 if (setup_dev_console(arg_directory, console) < 0)
1215 if (setup_kmsg(arg_directory, kmsg_socket_pair[1]) < 0)
1218 close_nointr_nofail(kmsg_socket_pair[1]);
1220 if (setup_timezone(arg_directory) < 0)
1223 if (setup_resolv_conf(arg_directory) < 0)
1226 if (setup_journal(arg_directory) < 0)
/* Make the prepared tree the new root: move the mount over /, then
 * chroot into it. */
1229 if (chdir(arg_directory) < 0) {
1230 log_error("chdir(%s) failed: %m", arg_directory);
1234 if (mount(arg_directory, "/", NULL, MS_MOVE, NULL) < 0) {
1235 log_error("mount(MS_MOVE) failed: %m");
1239 if (chroot(".") < 0) {
1240 log_error("chroot() failed: %m");
1244 if (chdir("/") < 0) {
1245 log_error("chdir() failed: %m");
1253 if (drop_capabilities() < 0) {
1254 log_error("drop_capabilities() failed: %m");
1260 if (get_user_creds((const char**)&arg_user, &uid, &gid, &home, NULL) < 0) {
1261 log_error("get_user_creds() failed: %m");
1265 if (mkdir_parents_label(home, 0775) < 0) {
1266 log_error("mkdir_parents_label() failed: %m");
1270 if (mkdir_safe_label(home, 0775, uid, gid) < 0) {
1271 log_error("mkdir_safe_label() failed: %m");
1275 if (initgroups((const char*)arg_user, gid) < 0) {
1276 log_error("initgroups() failed: %m");
/* The messages name the calls that are actually made. */
1280 if (setresgid(gid, gid, gid) < 0) {
1281 log_error("setresgid() failed: %m");
1285 if (setresuid(uid, uid, uid) < 0) {
1286 log_error("setresuid() failed: %m");
/* Fill the remaining slots of envp. */
1291 if ((asprintf((char**)(envp + 3), "HOME=%s", home ? home: "/root") < 0) ||
1292 (asprintf((char**)(envp + 4), "USER=%s", arg_user ? arg_user : "root") < 0) ||
1293 (asprintf((char**)(envp + 5), "LOGNAME=%s", arg_user ? arg_user : "root") < 0)) {
1299 if (asprintf((char**)(envp + 6), "container_uuid=%s", arg_uuid) < 0) {
1311 /* Automatically search for the init system */
1313 l = 1 + argc - optind;
1314 a = newa(char*, l + 1);
1315 memcpy(a + 1, argv + optind, l * sizeof(char*));
/* Each execve() only returns on failure, so the candidates are tried
 * in order. */
1317 a[0] = (char*) "/usr/lib/systemd/systemd";
1318 execve(a[0], a, (char**) envp);
1320 a[0] = (char*) "/lib/systemd/systemd";
1321 execve(a[0], a, (char**) envp);
1323 a[0] = (char*) "/sbin/init";
1324 execve(a[0], a, (char**) envp);
1325 } else if (argc > optind)
1326 execvpe(argv[optind], argv + optind, (char**) envp);
1328 chdir(home ? home : "/root");
1329 execle("/bin/bash", "-bash", NULL, (char**) envp);
1332 log_error("execv() failed: %m");
1335 _exit(EXIT_FAILURE);
1338 if (process_pty(master, &mask) < 0)
/* Restore the terminal before waiting, and clear the flag so the
 * cleanup code does not restore it a second time. */
1341 if (saved_attr_valid) {
1342 tcsetattr(STDIN_FILENO, TCSANOW, &saved_attr);
1343 saved_attr_valid = false;
1346 r = wait_for_terminate_and_warn(argc > optind ? argv[optind] : "bash", pid);
1352 if (saved_attr_valid)
1353 tcsetattr(STDIN_FILENO, TCSANOW, &saved_attr);
1356 close_nointr_nofail(master);
1358 close_pipe(kmsg_socket_pair);
/* Move back into the original cgroup, then kill whatever is left in
 * the cgroup that was created for the container. */
1361 cg_attach(SYSTEMD_CGROUP_CONTROLLER, oldcg, 0);
1364 cg_kill_recursive_and_wait(SYSTEMD_CGROUP_CONTROLLER, newcg, true);
1366 free(arg_directory);
1367 strv_free(arg_controllers);