1 /*-*- Mode: C; c-basic-offset: 8; indent-tabs-mode: nil -*-*/
4 This file is part of systemd.
6 Copyright 2010 Lennart Poettering
8 systemd is free software; you can redistribute it and/or modify it
9 under the terms of the GNU Lesser General Public License as published by
10 the Free Software Foundation; either version 2.1 of the License, or
11 (at your option) any later version.
13 systemd is distributed in the hope that it will be useful, but
14 WITHOUT ANY WARRANTY; without even the implied warranty of
15 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
16 Lesser General Public License for more details.
18 You should have received a copy of the GNU Lesser General Public License
19 along with systemd; If not, see <http://www.gnu.org/licenses/>.
25 #include <sys/types.h>
26 #include <sys/syscall.h>
27 #include <sys/mount.h>
33 #include <sys/prctl.h>
34 #include <sys/capability.h>
36 #include <sys/epoll.h>
38 #include <sys/signalfd.h>
42 #include <sys/socket.h>
44 #include <systemd/sd-daemon.h>
51 #include "cgroup-util.h"
53 #include "path-util.h"
54 #include "loopback-setup.h"
/* Journal linking mode, selected with --link-journal=. The enumerators
 * referenced in this file are LINK_NO, LINK_AUTO, LINK_GUEST and LINK_HOST. */
57 typedef enum LinkJournal {
/* Settings collected from the command line; each one starts out at the
 * default given in its initializer. */
64 static char *arg_directory = NULL;
65 static char *arg_user = NULL;
66 static char **arg_controllers = NULL;
67 static char *arg_uuid = NULL;
68 static bool arg_private_network = false;
69 static bool arg_read_only = false;
70 static bool arg_boot = false;
71 static LinkJournal arg_link_journal = LINK_AUTO;
/* Bit mask (1ULL << CAP_*) of capabilities to retain. --capability= ORs
 * further bits in, and drop_capabilities() hands the complement of this mask
 * to capability_bounding_set_drop(). */
72 static uint64_t arg_retain =
74 (1ULL << CAP_DAC_OVERRIDE) |
75 (1ULL << CAP_DAC_READ_SEARCH) |
76 (1ULL << CAP_FOWNER) |
77 (1ULL << CAP_FSETID) |
78 (1ULL << CAP_IPC_OWNER) |
81 (1ULL << CAP_LINUX_IMMUTABLE) |
82 (1ULL << CAP_NET_BIND_SERVICE) |
83 (1ULL << CAP_NET_BROADCAST) |
84 (1ULL << CAP_NET_RAW) |
85 (1ULL << CAP_SETGID) |
86 (1ULL << CAP_SETFCAP) |
87 (1ULL << CAP_SETPCAP) |
88 (1ULL << CAP_SETUID) |
89 (1ULL << CAP_SYS_ADMIN) |
90 (1ULL << CAP_SYS_CHROOT) |
91 (1ULL << CAP_SYS_NICE) |
92 (1ULL << CAP_SYS_PTRACE) |
93 (1ULL << CAP_SYS_TTY_CONFIG) |
94 (1ULL << CAP_SYS_RESOURCE);
/* Print the usage synopsis and the option summary to stdout, substituting
 * program_invocation_short_name for the program name.
 *
 * NOTE(review): the text below describes -j as --link-journal=host, while the
 * one option-handling assignment in parse_argv() that has no visible case
 * label sets LINK_GUEST. Confirm which of the two is intended. */
96 static int help(void) {
98 printf("%s [OPTIONS...] [PATH] [ARGUMENTS...]\n\n"
99 "Spawn a minimal namespace container for debugging, testing and building.\n\n"
100 " -h --help Show this help\n"
101 " -D --directory=NAME Root directory for the container\n"
102 " -b --boot Boot up full system (i.e. invoke init)\n"
103 " -u --user=USER Run the command under specified user or uid\n"
104 " -C --controllers=LIST Put the container in specified comma-separated cgroup hierarchies\n"
105 " --uuid=UUID Set a specific machine UUID for the container\n"
106 " --private-network Disable network in container\n"
107 " --read-only Mount the root directory read-only\n"
108 " --capability=CAP In addition to the default, retain specified capability\n"
109 " --link-journal=MODE Link up guest journal, one of no, auto, guest, host\n"
110 " -j Equivalent to --link-journal=host\n",
111 program_invocation_short_name);
/* Parse the command line with getopt_long() and record the results in the
 * arg_* globals. Failures are reported through log_error(). */
116 static int parse_argv(int argc, char *argv[]) {
/* Long-only options get codes starting at 0x100, above any single character
 * used as a short option. */
119 ARG_PRIVATE_NETWORK = 0x100,
126 static const struct option options[] = {
127 { "help", no_argument, NULL, 'h' },
128 { "directory", required_argument, NULL, 'D' },
129 { "user", required_argument, NULL, 'u' },
130 { "controllers", required_argument, NULL, 'C' },
131 { "private-network", no_argument, NULL, ARG_PRIVATE_NETWORK },
132 { "boot", no_argument, NULL, 'b' },
133 { "uuid", required_argument, NULL, ARG_UUID },
134 { "read-only", no_argument, NULL, ARG_READ_ONLY },
135 { "capability", required_argument, NULL, ARG_CAPABILITY },
136 { "link-journal", required_argument, NULL, ARG_LINK_JOURNAL },
/* The leading '+' makes getopt stop at the first non-option argument, so
 * options belonging to the command run inside the container are left alone. */
145 while ((c = getopt_long(argc, argv, "+hD:u:C:bj", options, NULL)) >= 0) {
155 arg_directory = canonicalize_file_name(optarg);
156 if (!arg_directory) {
157 log_error("Failed to canonicalize root directory.");
165 if (!(arg_user = strdup(optarg))) {
166 log_error("Failed to duplicate user name.");
/* A later --controllers= replaces the earlier list; the new list is split on
 * commas and then passed through strv_uniq(). */
173 strv_free(arg_controllers);
174 arg_controllers = strv_split(optarg, ",");
175 if (!arg_controllers) {
176 log_error("Failed to split controllers list.");
179 strv_uniq(arg_controllers);
183 case ARG_PRIVATE_NETWORK:
184 arg_private_network = true;
196 arg_read_only = true;
199 case ARG_CAPABILITY: {
/* Each comma-separated name is resolved with cap_from_name() and its bit is
 * added to the arg_retain mask. */
203 FOREACH_WORD_SEPARATOR(word, length, optarg, ",", state) {
207 t = strndup(word, length);
209 log_error("Out of memory.");
213 if (cap_from_name(t, &cap) < 0) {
214 log_error("Failed to parse capability %s.", t);
220 arg_retain |= 1ULL << (uint64_t) cap;
/* NOTE(review): presumably the -j handler (its case label is not visible
 * here). It selects LINK_GUEST although help() documents -j as
 * --link-journal=host -- confirm which is intended. */
227 arg_link_journal = LINK_GUEST;
230 case ARG_LINK_JOURNAL:
231 if (streq(optarg, "auto"))
232 arg_link_journal = LINK_AUTO;
233 else if (streq(optarg, "no"))
234 arg_link_journal = LINK_NO;
235 else if (streq(optarg, "guest"))
236 arg_link_journal = LINK_GUEST;
237 else if (streq(optarg, "host"))
238 arg_link_journal = LINK_HOST;
240 log_error("Failed to parse link journal mode %s", optarg);
250 log_error("Unknown option code %c", c);
/* Establish the API file systems of the container: for every entry of
 * mount_table the target path is built below dest and the entry is mounted
 * there. A failing mount() is only reported when the entry is marked fatal. */
258 static int mount_all(const char *dest) {
260 typedef struct MountPoint {
/* Initializer order as used below: what, where, type, options, flags, fatal.
 * Read-only bind mounts take two entries: a plain bind mount first, then a
 * remount that adds MS_RDONLY. */
269 static const MountPoint mount_table[] = {
270 { "proc", "/proc", "proc", NULL, MS_NOSUID|MS_NOEXEC|MS_NODEV, true },
271 { "/proc/sys", "/proc/sys", "bind", NULL, MS_BIND, true }, /* Bind mount first */
272 { "/proc/sys", "/proc/sys", "bind", NULL, MS_BIND|MS_RDONLY|MS_REMOUNT, true }, /* Then, make it r/o */
273 { "/sys", "/sys", "bind", NULL, MS_BIND, true }, /* Bind mount first */
274 { "/sys", "/sys", "bind", NULL, MS_BIND|MS_RDONLY|MS_REMOUNT, true }, /* Then, make it r/o */
275 { "tmpfs", "/dev", "tmpfs", "mode=755", MS_NOSUID|MS_STRICTATIME, true },
276 { "/dev/pts", "/dev/pts", "bind", NULL, MS_BIND, true },
277 { "tmpfs", "/run", "tmpfs", "mode=755", MS_NOSUID|MS_NODEV|MS_STRICTATIME, true },
279 { "/sys/fs/selinux", "/sys/fs/selinux", "bind", NULL, MS_BIND, false }, /* Bind mount first */
280 { "/sys/fs/selinux", "/sys/fs/selinux", "bind", NULL, MS_BIND|MS_RDONLY|MS_REMOUNT, false }, /* Then, make it r/o */
288 for (k = 0; k < ELEMENTSOF(mount_table); k++) {
/* The table's "where" values already start with '/', so the joined path
 * contains a doubled slash. */
291 if (asprintf(&where, "%s/%s", dest, mount_table[k].where) < 0) {
292 log_error("Out of memory");
/* Probe whether the target is already a mount point; t < 0 is an error
 * carrying a negative errno value. */
300 t = path_is_mount_point(where, false);
302 log_error("Failed to detect whether %s is a mount point: %s", where, strerror(-t));
/* The result of creating the mount point directory is not checked. */
311 mkdir_p_label(where, 0755);
313 if (mount(mount_table[k].what,
316 mount_table[k].flags,
317 mount_table[k].options) < 0 &&
318 mount_table[k].fatal) {
320 log_error("mount(%s) failed: %m", where);
/* Make the host's time zone configuration visible in the container by bind
 * mounting /etc/localtime and /etc/timezone over the same paths below dest.
 * Each bind mount that succeeds is remounted read-only; the results of the
 * remounts are not checked. */
332 static int setup_timezone(const char *dest) {
337 /* Fix the timezone, if possible */
338 if (asprintf(&where, "%s/etc/localtime", dest) < 0) {
339 log_error("Out of memory");
343 if (mount("/etc/localtime", where, "bind", MS_BIND, NULL) >= 0)
344 mount("/etc/localtime", where, "bind", MS_BIND|MS_REMOUNT|MS_RDONLY, NULL);
348 if (asprintf(&where, "%s/etc/timezone", dest) < 0) {
349 log_error("Out of memory");
353 if (mount("/etc/timezone", where, "bind", MS_BIND, NULL) >= 0)
354 mount("/etc/timezone", where, "bind", MS_BIND|MS_REMOUNT|MS_RDONLY, NULL);
/* Bind mount the host's /etc/resolv.conf over <dest>/etc/resolv.conf and, if
 * that succeeds, remount it read-only (the remount result is not checked).
 * arg_private_network is consulted first; the body of that branch is not
 * visible here. */
361 static int setup_resolv_conf(const char *dest) {
366 if (arg_private_network)
369 /* Fix resolv.conf, if possible */
370 if (asprintf(&where, "%s/etc/resolv.conf", dest) < 0) {
371 log_error("Out of memory");
375 if (mount("/etc/resolv.conf", where, "bind", MS_BIND, NULL) >= 0)
376 mount("/etc/resolv.conf", where, "bind", MS_BIND|MS_REMOUNT|MS_RDONLY, NULL);
/* Recreate the device nodes named in devnodes below <dest>/dev. Each node is
 * looked up as /dev/<name> on the host and created with the mode and device
 * number that stat() reported. A host node that does not exist (ENOENT) is
 * not logged as an error; a host entry that is neither a character nor a
 * block device is refused. */
383 static int copy_devnodes(const char *dest) {
385 static const char devnodes[] =
403 NULSTR_FOREACH(d, devnodes) {
405 char *from = NULL, *to = NULL;
/* from is the host path, to is the path of the node to create. The return
 * values of these two asprintf() calls are not checked directly. */
407 asprintf(&from, "/dev/%s", d);
408 asprintf(&to, "%s/dev/%s", dest, d);
411 log_error("Failed to allocate devnode path");
424 if (stat(from, &st) < 0) {
426 if (errno != ENOENT) {
427 log_error("Failed to stat %s: %m", from);
432 } else if (!S_ISCHR(st.st_mode) && !S_ISBLK(st.st_mode)) {
434 log_error("%s is not a char or block device, cannot copy.", from);
438 } else if (mknod(to, st.st_mode, st.st_rdev) < 0) {
/* Name the node mknod() was asked to create rather than the container root,
 * so the message identifies which device failed. */
440 log_error("mknod(%s) failed: %m", to);
/* Provide <dest>/dev/console for the container, backed by the tty whose path
 * is given in console: verify it is a character device, set its mode and
 * ownership to 0600 root:root, create a device node at <dest>/dev/console and
 * bind mount the tty over that node. */
454 static int setup_dev_console(const char *dest, const char *console) {
465 if (stat(console, &st) < 0) {
466 log_error("Failed to stat %s: %m", console);
470 } else if (!S_ISCHR(st.st_mode)) {
471 log_error("/dev/console is not a char device.");
476 r = chmod_and_chown(console, 0600, 0, 0);
478 log_error("Failed to correct access mode for TTY: %s", strerror(-r));
482 if (asprintf(&to, "%s/dev/console", dest) < 0) {
483 log_error("Out of memory");
488 /* We need to bind mount the right tty to /dev/console since
489 * ptys can only exist on pts file systems. To have something
490 * to bind mount things on we create a device node first, that
491 * has the right major/minor (note that the major minor
492 * doesn't actually matter here, since we mount it over
495 if (mknod(to, (st.st_mode & ~07777) | 0600, st.st_rdev) < 0) {
496 log_error("mknod() for /dev/console failed: %m");
501 if (mount(console, to, "bind", MS_BIND, NULL) < 0) {
502 log_error("Bind mount for /dev/console failed: %m");
/* Give the container a /proc/kmsg backed by a FIFO: the FIFO is created at
 * <dest>/dev/kmsg, bind mounted onto <dest>/proc/kmsg, and an open file
 * descriptor for it is sent over kmsg_socket as SCM_RIGHTS ancillary data. */
514 static int setup_kmsg(const char *dest, int kmsg_socket) {
515 char *from = NULL, *to = NULL;
/* Control buffer with room for exactly one file descriptor. */
519 struct cmsghdr cmsghdr;
520 uint8_t buf[CMSG_SPACE(sizeof(int))];
523 struct cmsghdr *cmsg;
526 assert(kmsg_socket >= 0);
530 /* We create the kmsg FIFO as /dev/kmsg, but immediately
531 * delete it after bind mounting it to /proc/kmsg. While FIFOs
532 * on the reading side behave very similar to /proc/kmsg,
533 * their writing side behaves differently from /dev/kmsg in
534 * that writing blocks when nothing is reading. In order to
535 * avoid any problems with containers deadlocking due to this
536 * we simply make /dev/kmsg unavailable to the container. */
537 if (asprintf(&from, "%s/dev/kmsg", dest) < 0) {
538 log_error("Out of memory");
543 if (asprintf(&to, "%s/proc/kmsg", dest) < 0) {
544 log_error("Out of memory");
549 if (mkfifo(from, 0600) < 0) {
550 log_error("mkfifo() for /dev/kmsg failed: %m");
555 r = chmod_and_chown(from, 0600, 0, 0);
557 log_error("Failed to correct access mode for /dev/kmsg: %s", strerror(-r));
561 if (mount(from, to, "bind", MS_BIND, NULL) < 0) {
562 log_error("Bind mount for /proc/kmsg failed: %m");
567 fd = open(from, O_RDWR|O_NDELAY|O_CLOEXEC);
569 log_error("Failed to open fifo: %m");
/* Build a message whose only payload is one SCM_RIGHTS control message
 * carrying fd. */
577 mh.msg_control = &control;
578 mh.msg_controllen = sizeof(control);
580 cmsg = CMSG_FIRSTHDR(&mh);
581 cmsg->cmsg_level = SOL_SOCKET;
582 cmsg->cmsg_type = SCM_RIGHTS;
583 cmsg->cmsg_len = CMSG_LEN(sizeof(int));
584 memcpy(CMSG_DATA(cmsg), &fd, sizeof(int));
586 mh.msg_controllen = cmsg->cmsg_len;
588 /* Store away the fd in the socket, so that it stays open as
589 * long as we run the child */
590 k = sendmsg(kmsg_socket, &mh, MSG_DONTWAIT|MSG_NOSIGNAL);
/* The local descriptor is closed whether or not sendmsg() succeeded; k is
 * examined afterwards. */
591 close_nointr_nofail(fd);
594 log_error("Failed to send FIFO fd: %m");
599 /* And now make the FIFO unavailable as /dev/kmsg... */
/* Set the host name from the name that path_get_file_name() yields for
 * arg_directory, after passing it through hostname_cleanup(). */
610 static int setup_hostname(void) {
614 hn = path_get_file_name(arg_directory);
620 hostname_cleanup(hn);
623 if (sethostname(hn, strlen(hn)) < 0)
/* Connect the journal directories of host and container according to
 * arg_link_journal. The container's machine ID is read from
 * <directory>/etc/machine-id and names the per-machine directory
 * /var/log/journal/<id> on both sides. LINK_GUEST creates a symlink on the
 * host that points into the container; the bind mount at the end of the
 * function maps the host directory into the container. */
632 static int setup_journal(const char *directory) {
633 sd_id128_t machine_id;
634 char *p = NULL, *b = NULL, *l, *q = NULL, *d = NULL;
637 if (arg_link_journal == LINK_NO)
640 p = strappend(directory, "/etc/machine-id");
642 log_error("Out of memory");
647 r = read_one_line_file(p, &b);
/* A missing or empty machine-id file gets special treatment in LINK_AUTO
 * mode; in the other modes the read error is reported. */
648 if (r == -ENOENT && arg_link_journal == LINK_AUTO) {
652 log_error("Failed to read machine ID: %s", strerror(-r));
657 if (isempty(l) && arg_link_journal == LINK_AUTO) {
662 /* Verify validity */
663 r = sd_id128_from_string(l, &machine_id);
665 log_error("Failed to parse machine ID: %s", strerror(-r));
/* From here on p is the journal directory on the host and q is the same
 * directory inside the container tree. */
670 p = strappend("/var/log/journal/", l);
671 q = strjoin(directory, "/var/log/journal/", l, NULL);
673 log_error("Out of memory");
678 if (path_is_mount_point(p, false) > 0 ||
679 path_is_mount_point(q, false) > 0) {
680 if (arg_link_journal != LINK_AUTO) {
681 log_error("Journal already a mount point, refusing.");
/* Inspect what currently exists at the host path. The result is examined as
 * a negative errno value: -EINVAL and -ENOENT get their own branches below. */
690 r = readlink_and_make_absolute(p, &d);
692 if ((arg_link_journal == LINK_GUEST ||
693 arg_link_journal == LINK_AUTO) &&
703 log_error("Failed to remove symlink %s: %m", p);
707 } else if (r == -EINVAL) {
709 if (arg_link_journal == LINK_GUEST &&
712 if (errno == ENOTDIR)
713 log_error("%s already exists and is neither symlink nor directory.", p);
715 log_error("Failed to remove %s: %m", p);
721 } else if (r != -ENOENT) {
/* NOTE(review): the failure is carried in r, but the message prints errno via
 * %m -- confirm errno still holds the same error at this point. */
722 log_error("readlink(%s) failed: %m", p);
726 if (arg_link_journal == LINK_GUEST) {
728 if (symlink(q, p) < 0) {
729 log_error("Failed to symlink %s to %s: %m", q, p);
740 if (arg_link_journal == LINK_HOST) {
/* NOTE(review): as above, mkdir_p() reports through r while the message uses
 * %m; the same applies to the mkdir_p(q) call further down. */
741 r = mkdir_p(p, 0755);
743 log_error("Failed to create %s: %m", p);
747 } else if (access(p, F_OK) < 0) {
752 if (dir_is_empty(q) == 0) {
753 log_error("%s not empty.", q);
758 r = mkdir_p(q, 0755);
760 log_error("Failed to create %s: %m", q);
764 if (mount(p, q, "bind", MS_BIND, NULL) < 0) {
765 log_error("Failed to bind mount journal from host into guest: %m");
/* Restrict the capability bounding set to arg_retain by asking
 * capability_bounding_set_drop() to drop the complement of that mask.
 * Returns whatever that helper returns. */
781 static int drop_capabilities(void) {
782 return capability_bounding_set_drop(~arg_retain, false);
/* Heuristic test whether path is the root of an OS tree, keyed on
 * <path>/bin/sh. The final result is 1 when the probe stored in r did not
 * fail and 0 when it did. */
785 static int is_os_tree(const char *path) {
788 /* We use /bin/sh as flag file if something is an OS */
790 if (asprintf(&p, "%s/bin/sh", path) < 0)
/* Collapse the probe result to a boolean. */
796 return r < 0 ? 0 : 1;
/* Relay terminal traffic between the caller's stdin/stdout and the pty
 * master: bytes read from stdin are written to master, bytes read from master
 * are written to stdout. The three descriptors are switched to non-blocking
 * mode and watched with edge-triggered epoll. The signals in mask are
 * received through a signalfd; SIGWINCH causes the window size of stdin to be
 * copied to master. */
799 static int process_pty(int master, sigset_t *mask) {
/* One LINE_MAX sized staging buffer per direction, plus its fill level. */
801 char in_buffer[LINE_MAX], out_buffer[LINE_MAX];
802 size_t in_buffer_full = 0, out_buffer_full = 0;
803 struct epoll_event stdin_ev, stdout_ev, master_ev, signal_ev;
/* Readiness flags: set from epoll events, cleared again when a read() or
 * write() reports that the descriptor is not ready. */
804 bool stdin_readable = false, stdout_writable = false, master_readable = false, master_writable = false;
805 int ep = -1, signal_fd = -1, r;
807 fd_nonblock(STDIN_FILENO, 1);
808 fd_nonblock(STDOUT_FILENO, 1);
809 fd_nonblock(master, 1);
811 signal_fd = signalfd(-1, mask, SFD_NONBLOCK|SFD_CLOEXEC);
813 log_error("signalfd(): %m");
818 ep = epoll_create1(EPOLL_CLOEXEC);
820 log_error("Failed to create epoll: %m");
/* The data descriptors are registered edge-triggered (EPOLLET); the signalfd
 * is registered without EPOLLET. */
826 stdin_ev.events = EPOLLIN|EPOLLET;
827 stdin_ev.data.fd = STDIN_FILENO;
830 stdout_ev.events = EPOLLOUT|EPOLLET;
831 stdout_ev.data.fd = STDOUT_FILENO;
834 master_ev.events = EPOLLIN|EPOLLOUT|EPOLLET;
835 master_ev.data.fd = master;
838 signal_ev.events = EPOLLIN;
839 signal_ev.data.fd = signal_fd;
841 if (epoll_ctl(ep, EPOLL_CTL_ADD, STDIN_FILENO, &stdin_ev) < 0 ||
842 epoll_ctl(ep, EPOLL_CTL_ADD, STDOUT_FILENO, &stdout_ev) < 0 ||
843 epoll_ctl(ep, EPOLL_CTL_ADD, master, &master_ev) < 0 ||
844 epoll_ctl(ep, EPOLL_CTL_ADD, signal_fd, &signal_ev) < 0) {
/* Spelling of "register" corrected in this message. */
845 log_error("Failed to register fds in epoll: %m");
851 struct epoll_event ev[16];
855 nfds = epoll_wait(ep, ev, ELEMENTSOF(ev), -1);
858 if (errno == EINTR || errno == EAGAIN)
861 log_error("epoll_wait(): %m");
/* Translate the reported events into readiness flags; EPOLLHUP counts as
 * ready so that the following read()/write() observes the hangup. */
868 for (i = 0; i < nfds; i++) {
869 if (ev[i].data.fd == STDIN_FILENO) {
871 if (ev[i].events & (EPOLLIN|EPOLLHUP))
872 stdin_readable = true;
874 } else if (ev[i].data.fd == STDOUT_FILENO) {
876 if (ev[i].events & (EPOLLOUT|EPOLLHUP))
877 stdout_writable = true;
879 } else if (ev[i].data.fd == master) {
881 if (ev[i].events & (EPOLLIN|EPOLLHUP))
882 master_readable = true;
884 if (ev[i].events & (EPOLLOUT|EPOLLHUP))
885 master_writable = true;
887 } else if (ev[i].data.fd == signal_fd) {
888 struct signalfd_siginfo sfsi;
891 n = read(signal_fd, &sfsi, sizeof(sfsi));
892 if (n != sizeof(sfsi)) {
895 log_error("Failed to read from signalfd: invalid block size");
900 if (errno != EINTR && errno != EAGAIN) {
901 log_error("Failed to read from signalfd: %m");
907 if (sfsi.ssi_signo == SIGWINCH) {
910 /* The window size changed, let's forward that. */
911 if (ioctl(STDIN_FILENO, TIOCGWINSZ, &ws) >= 0)
912 ioctl(master, TIOCSWINSZ, &ws);
/* Keep moving data for as long as some direction can make progress: there is
 * input to fetch into an empty buffer, or buffered data whose destination is
 * writable. */
921 while ((stdin_readable && in_buffer_full <= 0) ||
922 (master_writable && in_buffer_full > 0) ||
923 (master_readable && out_buffer_full <= 0) ||
924 (stdout_writable && out_buffer_full > 0)) {
926 if (stdin_readable && in_buffer_full < LINE_MAX) {
928 k = read(STDIN_FILENO, in_buffer + in_buffer_full, LINE_MAX - in_buffer_full);
/* In all four transfer steps these errno values only clear the readiness
 * flag of the descriptor concerned. */
931 if (errno == EAGAIN || errno == EPIPE || errno == ECONNRESET || errno == EIO)
932 stdin_readable = false;
934 log_error("read(): %m");
939 in_buffer_full += (size_t) k;
942 if (master_writable && in_buffer_full > 0) {
944 k = write(master, in_buffer, in_buffer_full);
947 if (errno == EAGAIN || errno == EPIPE || errno == ECONNRESET || errno == EIO)
948 master_writable = false;
950 log_error("write(): %m");
/* After a partial write the unwritten remainder is moved to the front. */
956 assert(in_buffer_full >= (size_t) k);
957 memmove(in_buffer, in_buffer + k, in_buffer_full - k);
962 if (master_readable && out_buffer_full < LINE_MAX) {
964 k = read(master, out_buffer + out_buffer_full, LINE_MAX - out_buffer_full);
967 if (errno == EAGAIN || errno == EPIPE || errno == ECONNRESET || errno == EIO)
968 master_readable = false;
970 log_error("read(): %m");
975 out_buffer_full += (size_t) k;
978 if (stdout_writable && out_buffer_full > 0) {
980 k = write(STDOUT_FILENO, out_buffer, out_buffer_full);
983 if (errno == EAGAIN || errno == EPIPE || errno == ECONNRESET || errno == EIO)
984 stdout_writable = false;
986 log_error("write(): %m");
992 assert(out_buffer_full >= (size_t) k);
993 memmove(out_buffer, out_buffer + k, out_buffer_full - k);
994 out_buffer_full -= k;
/* Release the epoll instance and the signalfd. */
1002 close_nointr_nofail(ep);
1005 close_nointr_nofail(signal_fd);
/* Entry point. Parses the command line, validates the environment (root
 * privileges, systemd host, plausible OS tree that is not "/"), creates a
 * cgroup and a pseudo tty for the container and clones a process into new
 * namespaces. The code between clone() and _exit() prepares the container
 * root and executes the payload; it appears to be the child branch, although
 * the pid test itself is not visible here. The code after it relays the
 * terminal through process_pty(), waits for the child and cleans up. */
1010 int main(int argc, char *argv[]) {
1012 int r = EXIT_FAILURE, k;
1013 char *oldcg = NULL, *newcg = NULL;
1014 char **controller = NULL;
1016 const char *console = NULL;
1017 struct termios saved_attr, raw_attr;
1019 bool saved_attr_valid = false;
1021 int kmsg_socket_pair[2] = { -1, -1 };
1023 log_parse_environment();
1026 r = parse_argv(argc, argv);
/* Resolve the container directory to an absolute path; get_current_dir_name()
 * supplies the current directory for the case where none was given. */
1030 if (arg_directory) {
1033 p = path_make_absolute_cwd(arg_directory);
1034 free(arg_directory);
1037 arg_directory = get_current_dir_name();
1039 if (!arg_directory) {
1040 log_error("Failed to determine path");
1044 path_kill_slashes(arg_directory);
1046 if (geteuid() != 0) {
1047 log_error("Need to be root.");
1051 if (sd_booted() <= 0) {
1052 log_error("Not running on a systemd system.");
1056 if (path_equal(arg_directory, "/")) {
1057 log_error("Spawning container on root directory not supported.");
1061 if (is_os_tree(arg_directory) <= 0) {
1062 log_error("Directory %s doesn't look like an OS root directory. Refusing.", arg_directory);
/* Create a cgroup named nspawn-<pid> below the current one and attach to it;
 * oldcg is kept so the process can be moved back during cleanup. */
1066 k = cg_get_by_pid(SYSTEMD_CGROUP_CONTROLLER, 0, &oldcg);
1068 log_error("Failed to determine current cgroup: %s", strerror(-k));
1072 if (asprintf(&newcg, "%s/nspawn-%lu", oldcg, (unsigned long) getpid()) < 0) {
1073 log_error("Failed to allocate cgroup path.");
1077 k = cg_create_and_attach(SYSTEMD_CGROUP_CONTROLLER, newcg, 0);
1079 log_error("Failed to create cgroup: %s", strerror(-k));
/* Failures in the controllers requested with --controllers= only produce a
 * warning. */
1083 STRV_FOREACH(controller, arg_controllers) {
1084 k = cg_create_and_attach(*controller, newcg, 0);
1086 log_warning("Failed to create cgroup in controller %s: %s", *controller, strerror(-k));
/* Allocate the pty whose slave side becomes the container's console. */
1089 master = posix_openpt(O_RDWR|O_NOCTTY|O_CLOEXEC|O_NDELAY);
1091 log_error("Failed to acquire pseudo tty: %m");
1095 console = ptsname(master);
1097 log_error("Failed to determine tty name: %m");
1101 log_info("Spawning namespace container on %s (console is %s).", arg_directory, console);
1103 if (ioctl(STDIN_FILENO, TIOCGWINSZ, &ws) >= 0)
1104 ioctl(master, TIOCSWINSZ, &ws);
1106 if (unlockpt(master) < 0) {
1107 log_error("Failed to unlock tty: %m");
/* Put the invoking terminal into raw mode with echo disabled; saved_attr is
 * restored before exit while saved_attr_valid is set. */
1111 if (tcgetattr(STDIN_FILENO, &saved_attr) < 0) {
1112 log_error("Failed to get terminal attributes: %m");
1116 saved_attr_valid = true;
1118 raw_attr = saved_attr;
1119 cfmakeraw(&raw_attr);
1120 raw_attr.c_lflag &= ~ECHO;
1122 if (tcsetattr(STDIN_FILENO, TCSANOW, &raw_attr) < 0) {
1123 log_error("Failed to set terminal attributes: %m");
/* Socket pair handed to setup_kmsg(), which sends the kmsg FIFO descriptor
 * over it. */
1127 if (socketpair(AF_UNIX, SOCK_DGRAM|SOCK_NONBLOCK|SOCK_CLOEXEC, 0, kmsg_socket_pair) < 0) {
1128 log_error("Failed to create kmsg socket pair");
/* These signals are blocked here and later consumed through the signalfd in
 * process_pty(). */
1132 assert_se(sigemptyset(&mask) == 0);
1133 sigset_add_many(&mask, SIGCHLD, SIGWINCH, SIGTERM, SIGINT, -1);
1134 assert_se(sigprocmask(SIG_BLOCK, &mask, NULL) == 0);
/* Raw clone() into new IPC, mount, PID and UTS namespaces, plus a new network
 * namespace when --private-network was given. */
1136 pid = syscall(__NR_clone, SIGCHLD|CLONE_NEWIPC|CLONE_NEWNS|CLONE_NEWPID|CLONE_NEWUTS|(arg_private_network ? CLONE_NEWNET : 0), NULL);
1138 if (errno == EINVAL)
1139 log_error("clone() failed, do you have namespace support enabled in your kernel? (You need UTS, IPC, PID and NET namespacing built in): %m");
1141 log_error("clone() failed: %m");
1149 const char *home = NULL;
1150 uid_t uid = (uid_t) -1;
1151 gid_t gid = (gid_t) -1;
/* Environment for the payload. Slot 2 receives TERM, slots 3 to 5 receive
 * HOME, USER and LOGNAME, and slot 6 receives container_uuid. */
1152 const char *envp[] = {
1153 "PATH=/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin",
1154 "container=systemd-nspawn", /* LXC sets container=lxc, so follow the scheme here */
1159 NULL, /* container_uuid */
1163 envp[2] = strv_find_prefix(environ, "TERM=");
/* Detach from the descriptors of the invoking process; only the second kmsg
 * socket is passed to close_all_fds() as an exception. */
1165 close_nointr_nofail(master);
1167 close_nointr(STDIN_FILENO);
1168 close_nointr(STDOUT_FILENO);
1169 close_nointr(STDERR_FILENO);
1171 close_all_fds(&kmsg_socket_pair[1], 1);
1173 reset_all_signal_handlers();
1175 assert_se(sigemptyset(&mask) == 0);
1176 assert_se(sigprocmask(SIG_SETMASK, &mask, NULL) == 0);
/* Open the pty slave; it has to land on fd 0 and is then duplicated onto
 * stdout and stderr. */
1178 if (open_terminal(console, O_RDWR) != STDIN_FILENO ||
1179 dup2(STDIN_FILENO, STDOUT_FILENO) != STDOUT_FILENO ||
1180 dup2(STDIN_FILENO, STDERR_FILENO) != STDERR_FILENO)
1184 log_error("setsid() failed: %m");
/* Request SIGKILL for this process when its parent dies. */
1188 if (prctl(PR_SET_PDEATHSIG, SIGKILL) < 0) {
1189 log_error("PR_SET_PDEATHSIG failed: %m");
1193 /* Mark / as private, in case somebody marked it shared */
1194 if (mount(NULL, "/", NULL, MS_PRIVATE|MS_REC, NULL) < 0) {
1195 log_error("MS_PRIVATE|MS_REC failed: %m");
1199 /* Turn directory into bind mount */
1200 if (mount(arg_directory, arg_directory, "bind", MS_BIND, NULL) < 0) {
1201 log_error("Failed to make bind mount.");
1206 if (mount(arg_directory, arg_directory, "bind", MS_BIND|MS_REMOUNT|MS_RDONLY, NULL) < 0) {
1207 log_error("Failed to make read-only.");
/* Populate the container root. */
1211 if (mount_all(arg_directory) < 0)
1214 if (copy_devnodes(arg_directory) < 0)
1217 if (setup_dev_console(arg_directory, console) < 0)
1220 if (setup_kmsg(arg_directory, kmsg_socket_pair[1]) < 0)
1223 close_nointr_nofail(kmsg_socket_pair[1]);
1225 if (setup_timezone(arg_directory) < 0)
1228 if (setup_resolv_conf(arg_directory) < 0)
1231 if (setup_journal(arg_directory) < 0)
/* Switch the root: enter the directory, move its mount onto "/" and chroot
 * into it. */
1234 if (chdir(arg_directory) < 0) {
1235 log_error("chdir(%s) failed: %m", arg_directory);
1239 if (mount(arg_directory, "/", "bind", MS_MOVE, NULL) < 0) {
/* The message names the flag actually passed to mount() above. */
1240 log_error("mount(MS_MOVE) failed: %m");
1244 if (chroot(".") < 0) {
1245 log_error("chroot() failed: %m");
1249 if (chdir("/") < 0) {
1250 log_error("chdir() failed: %m");
1258 if (drop_capabilities() < 0) {
1259 log_error("drop_capabilities() failed: %m");
/* Resolve the requested user, create its home directory and switch group and
 * user IDs.
 * NOTE(review): whether get_user_creds() and the mkdir helpers leave errno
 * set for %m cannot be told from here -- verify. */
1265 if (get_user_creds((const char**)&arg_user, &uid, &gid, &home, NULL) < 0) {
1266 log_error("get_user_creds() failed: %m");
1270 if (mkdir_parents_label(home, 0775) < 0) {
1271 log_error("mkdir_parents_label() failed: %m");
1275 if (mkdir_safe_label(home, 0775, uid, gid) < 0) {
1276 log_error("mkdir_safe_label() failed: %m");
1280 if (initgroups((const char*)arg_user, gid) < 0) {
1281 log_error("initgroups() failed: %m");
1285 if (setresgid(gid, gid, gid) < 0) {
/* The messages below name the calls that actually failed. */
1286 log_error("setresgid() failed: %m");
1290 if (setresuid(uid, uid, uid) < 0) {
1291 log_error("setresuid() failed: %m");
1296 if ((asprintf((char**)(envp + 3), "HOME=%s", home ? home: "/root") < 0) ||
1297 (asprintf((char**)(envp + 4), "USER=%s", arg_user ? arg_user : "root") < 0) ||
1298 (asprintf((char**)(envp + 5), "LOGNAME=%s", arg_user ? arg_user : "root") < 0)) {
1299 log_error("Out of memory");
1304 if (asprintf((char**)(envp + 6), "container_uuid=%s", arg_uuid) < 0) {
1305 log_error("Out of memory");
1316 /* Automatically search for the init system */
/* Build init's argv: slot 0 is the init binary, followed by the remaining
 * command-line arguments. l counts one extra entry so that argv's terminating
 * NULL is copied as well. */
1318 l = 1 + argc - optind;
1319 a = newa(char*, l + 1);
1320 memcpy(a + 1, argv + optind, l * sizeof(char*));
/* Try the init locations in turn; execve() only returns on failure. */
1322 a[0] = (char*) "/usr/lib/systemd/systemd";
1323 execve(a[0], a, (char**) envp);
1325 a[0] = (char*) "/lib/systemd/systemd";
1326 execve(a[0], a, (char**) envp);
1328 a[0] = (char*) "/sbin/init";
1329 execve(a[0], a, (char**) envp);
1330 } else if (argc > optind)
1331 execvpe(argv[optind], argv + optind, (char**) envp);
/* No command given: start bash from the home directory. The argv[0] of
 * "-bash" has a leading dash. */
1333 chdir(home ? home : "/root");
1334 execle("/bin/bash", "-bash", NULL, (char**) envp);
1337 log_error("execv() failed: %m");
1340 _exit(EXIT_FAILURE);
/* Relay the terminal, then restore the terminal attributes before waiting for
 * the child. */
1343 if (process_pty(master, &mask) < 0)
1346 if (saved_attr_valid) {
1347 tcsetattr(STDIN_FILENO, TCSANOW, &saved_attr);
1348 saved_attr_valid = false;
1351 r = wait_for_terminate_and_warn(argc > optind ? argv[optind] : "bash", pid);
/* Cleanup: restore the terminal if still pending, close descriptors, move
 * back to the original cgroup and kill whatever is left in the container's
 * cgroup. */
1357 if (saved_attr_valid)
1358 tcsetattr(STDIN_FILENO, TCSANOW, &saved_attr);
1361 close_nointr_nofail(master);
1363 close_pipe(kmsg_socket_pair);
1366 cg_attach(SYSTEMD_CGROUP_CONTROLLER, oldcg, 0);
1369 cg_kill_recursive_and_wait(SYSTEMD_CGROUP_CONTROLLER, newcg, true);
1371 free(arg_directory);
1372 strv_free(arg_controllers);