1 /*-*- Mode: C; c-basic-offset: 8; indent-tabs-mode: nil -*-*/
4 This file is part of systemd.
6 Copyright 2010 Lennart Poettering
8 systemd is free software; you can redistribute it and/or modify it
9 under the terms of the GNU Lesser General Public License as published by
10 the Free Software Foundation; either version 2.1 of the License, or
11 (at your option) any later version.
13 systemd is distributed in the hope that it will be useful, but
14 WITHOUT ANY WARRANTY; without even the implied warranty of
15 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
16 Lesser General Public License for more details.
18 You should have received a copy of the GNU Lesser General Public License
19 along with systemd; If not, see <http://www.gnu.org/licenses/>.
25 #include <sys/types.h>
26 #include <sys/syscall.h>
27 #include <sys/mount.h>
33 #include <sys/prctl.h>
34 #include <sys/capability.h>
36 #include <sys/epoll.h>
38 #include <sys/signalfd.h>
42 #include <sys/socket.h>
44 #include <systemd/sd-daemon.h>
51 #include "cgroup-util.h"
53 #include "path-util.h"
54 #include "loopback-setup.h"
56 #include "dev-setup.h"
/* Journal linking mode, selected with --link-journal= in parse_argv().
 * The enumerators referenced in this file are LINK_NO, LINK_AUTO,
 * LINK_GUEST and LINK_HOST (the enumerator list itself is not visible
 * in this part of the file). */
58 typedef enum LinkJournal {
/* Command line settings. They are filled in by parse_argv() and read by
 * the setup helpers and main(). */
/* Container root directory (-D, --directory=). */
65 static char *arg_directory = NULL;
/* User name or uid to run the payload as (-u, --user=). */
66 static char *arg_user = NULL;
/* Additional cgroup hierarchies to join (-C, --controllers=). */
67 static char **arg_controllers = NULL;
/* Exported to the payload as container_uuid= in main(); presumably set
 * from --uuid= (the assignment is not visible here). */
68 static char *arg_uuid = NULL;
/* --private-network: main() adds CLONE_NEWNET when this is set. */
69 static bool arg_private_network = false;
/* --read-only. */
70 static bool arg_read_only = false;
/* Presumably set by -b, --boot (the assignment is not visible here). */
71 static bool arg_boot = false;
/* Defaults to LINK_AUTO unless overridden on the command line. */
72 static LinkJournal arg_link_journal = LINK_AUTO;
/* Bit mask of capabilities to keep, one bit per CAP_* number.
 * --capability= ORs further bits into it, and drop_capabilities() passes
 * the complement to capability_bounding_set_drop(). */
73 static uint64_t arg_retain =
75 (1ULL << CAP_DAC_OVERRIDE) |
76 (1ULL << CAP_DAC_READ_SEARCH) |
77 (1ULL << CAP_FOWNER) |
78 (1ULL << CAP_FSETID) |
79 (1ULL << CAP_IPC_OWNER) |
82 (1ULL << CAP_LINUX_IMMUTABLE) |
83 (1ULL << CAP_NET_BIND_SERVICE) |
84 (1ULL << CAP_NET_BROADCAST) |
85 (1ULL << CAP_NET_RAW) |
86 (1ULL << CAP_SETGID) |
87 (1ULL << CAP_SETFCAP) |
88 (1ULL << CAP_SETPCAP) |
89 (1ULL << CAP_SETUID) |
90 (1ULL << CAP_SYS_ADMIN) |
91 (1ULL << CAP_SYS_CHROOT) |
92 (1ULL << CAP_SYS_NICE) |
93 (1ULL << CAP_SYS_PTRACE) |
94 (1ULL << CAP_SYS_TTY_CONFIG) |
95 (1ULL << CAP_SYS_RESOURCE);
/* Print the command line usage summary to stdout.
 *
 * The program name in the first line comes from
 * program_invocation_short_name. The -j entry is documented as
 * --link-journal=guest so that it agrees with the option parser in
 * parse_argv(), which assigns LINK_GUEST for the short journal option.
 *
 * Always returns 0. */
static int help(void) {

        printf("%s [OPTIONS...] [PATH] [ARGUMENTS...]\n\n"
               "Spawn a minimal namespace container for debugging, testing and building.\n\n"
               " -h --help Show this help\n"
               " -D --directory=NAME Root directory for the container\n"
               " -b --boot Boot up full system (i.e. invoke init)\n"
               " -u --user=USER Run the command under specified user or uid\n"
               " -C --controllers=LIST Put the container in specified comma-separated cgroup hierarchies\n"
               " --uuid=UUID Set a specific machine UUID for the container\n"
               " --private-network Disable network in container\n"
               " --read-only Mount the root directory read-only\n"
               " --capability=CAP In addition to the default, retain specified capability\n"
               " --link-journal=MODE Link up guest journal, one of no, auto, guest, host\n"
               " -j Equivalent to --link-journal=guest\n",
               program_invocation_short_name);

        return 0;
}
/* Parse the command line with getopt_long() and store the results in the
 * arg_* globals.
 *
 * argc, argv: the argument vector as passed to main().
 *
 * The return statements are not visible in this part of the file; main()
 * stores the result in r. */
117 static int parse_argv(int argc, char *argv[]) {
/* Codes for options without a short form start at 0x100, which lies
 * above every single-character option code. */
120 ARG_PRIVATE_NETWORK = 0x100,
127 static const struct option options[] = {
128 { "help", no_argument, NULL, 'h' },
129 { "directory", required_argument, NULL, 'D' },
130 { "user", required_argument, NULL, 'u' },
131 { "controllers", required_argument, NULL, 'C' },
132 { "private-network", no_argument, NULL, ARG_PRIVATE_NETWORK },
133 { "boot", no_argument, NULL, 'b' },
134 { "uuid", required_argument, NULL, ARG_UUID },
135 { "read-only", no_argument, NULL, ARG_READ_ONLY },
136 { "capability", required_argument, NULL, ARG_CAPABILITY },
137 { "link-journal", required_argument, NULL, ARG_LINK_JOURNAL },
/* The leading '+' in the option string makes getopt_long() stop at the
 * first non-option argument, so options that belong to the command to
 * run are left alone. */
146 while ((c = getopt_long(argc, argv, "+hD:u:C:bj", options, NULL)) >= 0) {
/* Root directory: stored in canonicalized form. */
156 arg_directory = canonicalize_file_name(optarg);
157 if (!arg_directory) {
158 log_error("Failed to canonicalize root directory.");
/* User: keep a private copy of the argument. */
166 if (!(arg_user = strdup(optarg))) {
167 log_error("Failed to duplicate user name.");
/* Controllers: a later occurrence replaces an earlier list. */
174 strv_free(arg_controllers);
175 arg_controllers = strv_split(optarg, ",");
176 if (!arg_controllers) {
177 log_error("Failed to split controllers list.");
/* Presumably removes duplicate entries - confirm against strv_uniq(). */
180 strv_uniq(arg_controllers);
184 case ARG_PRIVATE_NETWORK:
185 arg_private_network = true;
197 arg_read_only = true;
/* --capability= takes a comma-separated list of capability names. */
200 case ARG_CAPABILITY: {
204 FOREACH_WORD_SEPARATOR(word, length, optarg, ",", state) {
/* word is not NUL-terminated, so work on a copy of exactly length bytes. */
208 t = strndup(word, length);
212 if (cap_from_name(t, &cap) < 0) {
213 log_error("Failed to parse capability %s.", t);
/* Add the capability's bit to the set that is retained. */
219 arg_retain |= 1ULL << (uint64_t) cap;
/* Presumably the handler of the -j short option; its case label is not
 * visible here. */
226 arg_link_journal = LINK_GUEST;
/* --link-journal= accepts exactly: auto, no, guest, host. */
229 case ARG_LINK_JOURNAL:
230 if (streq(optarg, "auto"))
231 arg_link_journal = LINK_AUTO;
232 else if (streq(optarg, "no"))
233 arg_link_journal = LINK_NO;
234 else if (streq(optarg, "guest"))
235 arg_link_journal = LINK_GUEST;
236 else if (streq(optarg, "host"))
237 arg_link_journal = LINK_HOST;
239 log_error("Failed to parse link journal mode %s", optarg);
249 log_error("Unknown option code %c", c);
/* Mount the file systems listed in mount_table below the container root.
 *
 * dest: path of the container root; each entry's mount point is appended
 *       to it.
 *
 * The return statements are not visible in this part of the file; main()
 * treats a negative result as failure. */
257 static int mount_all(const char *dest) {
259 typedef struct MountPoint {
/* Judging by the field accesses and the mount() call further down, the
 * columns are: source (what), mount point (where), file system type,
 * options, flags, fatal. The struct definition is not visible here -
 * TODO confirm the field order.
 *
 * Read-only bind mounts take two entries: the bind mount itself, then a
 * remount entry with a NULL source that adds MS_RDONLY. */
268 static const MountPoint mount_table[] = {
269 { "proc", "/proc", "proc", NULL, MS_NOSUID|MS_NOEXEC|MS_NODEV, true },
270 { "/proc/sys", "/proc/sys", NULL, NULL, MS_BIND, true }, /* Bind mount first */
271 { NULL, "/proc/sys", NULL, NULL, MS_BIND|MS_RDONLY|MS_REMOUNT, true }, /* Then, make it r/o */
272 { "sysfs", "/sys", "sysfs", NULL, MS_RDONLY|MS_NOSUID|MS_NOEXEC|MS_NODEV, true },
273 { "tmpfs", "/dev", "tmpfs", "mode=755", MS_NOSUID|MS_STRICTATIME, true },
274 { "/dev/pts", "/dev/pts", NULL, NULL, MS_BIND, true },
275 { "tmpfs", "/run", "tmpfs", "mode=755", MS_NOSUID|MS_NODEV|MS_STRICTATIME, true },
277 { "/sys/fs/selinux", "/sys/fs/selinux", NULL, NULL, MS_BIND, false }, /* Bind mount first */
278 { NULL, "/sys/fs/selinux", NULL, NULL, MS_BIND|MS_RDONLY|MS_REMOUNT, false }, /* Then, make it r/o */
286 for (k = 0; k < ELEMENTSOF(mount_table); k++) {
/* where = dest + "/" + the entry's mount point. */
289 if (asprintf(&where, "%s/%s", dest, mount_table[k].where) < 0) {
/* t > 0 means where already is a mount point; a negative t is an error
 * code. */
298 t = path_is_mount_point(where, true);
300 log_error("Failed to detect whether %s is a mount point: %s", where, strerror(-t));
309 /* Skip this entry if it is not a remount. */
/* Remount entries have a NULL source, so they never match this test and
 * are always applied. */
310 if (mount_table[k].what && t > 0)
/* The result of creating the mount point directory is not checked. */
313 mkdir_p_label(where, 0755);
/* A failing mount() is only reported when the entry is marked fatal. */
315 if (mount(mount_table[k].what,
318 mount_table[k].flags,
319 mount_table[k].options) < 0 &&
320 mount_table[k].fatal) {
322 log_error("mount(%s) failed: %m", where);
/* Bind mount the host's /etc/localtime and /etc/timezone over the files
 * of the same name below dest.
 *
 * dest: path of the container root.
 *
 * In the lines visible here a failing mount() is neither logged nor
 * turned into an error ("if possible"). */
334 static int setup_timezone(const char *dest) {
339 /* Fix the timezone, if possible */
340 where = strappend(dest, "/etc/localtime");
/* Only when the bind mount worked, remount it read-only. */
344 if (mount("/etc/localtime", where, "bind", MS_BIND, NULL) >= 0)
345 mount("/etc/localtime", where, "bind", MS_BIND|MS_REMOUNT|MS_RDONLY, NULL);
/* Same two steps for /etc/timezone. */
349 where = strappend(dest, "/etc/timezone");
353 if (mount("/etc/timezone", where, "bind", MS_BIND, NULL) >= 0)
354 mount("/etc/timezone", where, "bind", MS_BIND|MS_REMOUNT|MS_RDONLY, NULL);
/* Bind mount the host's /etc/resolv.conf over the file of the same name
 * below dest.
 *
 * dest: path of the container root. */
361 static int setup_resolv_conf(const char *dest) {
/* Special-cased when the container gets its own network namespace;
 * presumably nothing is set up then (the branch body is not visible
 * here). */
366 if (arg_private_network)
369 /* Fix resolv.conf, if possible */
370 where = strappend(dest, "/etc/resolv.conf");
/* Only when the bind mount worked, remount it read-only. */
374 if (mount("/etc/resolv.conf", where, "bind", MS_BIND, NULL) >= 0)
375 mount("/etc/resolv.conf", where, "bind", MS_BIND|MS_REMOUNT|MS_RDONLY, NULL);
/* Give the container its own boot ID: write a random ID to a file below
 * dest and bind mount that file over proc's boot_id file inside the
 * container tree.
 *
 * dest: path of the container root. */
382 static int setup_boot_id(const char *dest) {
383 char *from = NULL, *to = NULL;
390 /* Generate a new randomized boot ID, so that each boot-up of
391 * the container gets a new one */
/* from: the file that holds the generated ID. */
393 from = strappend(dest, "/dev/proc-sys-kernel-random-boot-id");
/* to: the file the ID is mounted over. */
399 to = strappend(dest, "/proc/sys/kernel/random/boot_id");
405 r = sd_id128_randomize(&rnd);
407 log_error("Failed to generate random boot id: %s", strerror(-r));
/* Format the ID as hex digits in 8-4-4-4-12 groups. */
411 snprintf(as_uuid, sizeof(as_uuid),
412 "%02x%02x%02x%02x-%02x%02x-%02x%02x-%02x%02x-%02x%02x%02x%02x%02x%02x",
413 SD_ID128_FORMAT_VAL(rnd));
414 char_array_0(as_uuid);
416 r = write_one_line_file(from, as_uuid);
418 log_error("Failed to write boot id: %s", strerror(-r));
422 if (mount(from, to, "bind", MS_BIND, NULL) < 0) {
423 log_error("Failed to bind mount boot id: %m");
/* Read-only remount; its result is not checked. */
426 mount(from, to, "bind", MS_BIND|MS_REMOUNT|MS_RDONLY, NULL);
/* Recreate a fixed set of host device nodes below the container's /dev.
 *
 * dest: path of the container root.
 *
 * For every name in devnodes the host node is stat()ed and a node with
 * the same mode and device number is created with mknod(). */
437 static int copy_devnodes(const char *dest) {
/* NUL-separated list of node names; the entries are not visible in this
 * part of the file. */
439 static const char devnodes[] =
457 NULSTR_FOREACH(d, devnodes) {
459 char *from = NULL, *to = NULL;
/* from: node on the host, to: node to create in the container. The
 * asprintf() results are not checked directly. */
461 asprintf(&from, "/dev/%s", d);
462 asprintf(&to, "%s/dev/%s", dest, d);
465 log_error("Failed to allocate devnode path");
478 if (stat(from, &st) < 0) {
/* A node that does not exist on the host is not logged as an error. */
480 if (errno != ENOENT) {
481 log_error("Failed to stat %s: %m", from);
486 } else if (!S_ISCHR(st.st_mode) && !S_ISBLK(st.st_mode)) {
488 log_error("%s is not a char or block device, cannot copy.", from);
492 } else if (mknod(to, st.st_mode, st.st_rdev) < 0) {
/* NOTE(review): the message prints dest, not the path "to" that was
 * passed to mknod() - confirm whether "to" was intended. */
494 log_error("mknod(%s) failed: %m", dest);
/* Make the given tty available as /dev/console inside the container.
 *
 * dest:    path of the container root.
 * console: path of the tty device to use.
 *
 * Steps in the order they appear below: stat() the tty and require a
 * character device; set its mode to 0600 and owner to 0:0; create a
 * device node at dest + "/dev/console" that keeps the tty's file type
 * bits and device number but uses mode 0600; bind mount the tty over
 * that node. */
508 static int setup_dev_console(const char *dest, const char *console) {
519 if (stat(console, &st) < 0) {
520 log_error("Failed to stat %s: %m", console);
524 } else if (!S_ISCHR(st.st_mode)) {
525 log_error("/dev/console is not a char device.");
530 r = chmod_and_chown(console, 0600, 0, 0);
532 log_error("Failed to correct access mode for TTY: %s", strerror(-r));
536 if (asprintf(&to, "%s/dev/console", dest) < 0) {
/* NOTE(review): the block comment that starts on the next line has no
 * closing delimiter in this copy of the file - restore it. */
541 /* We need to bind mount the right tty to /dev/console since
542 * ptys can only exist on pts file systems. To have something
543 * to bind mount things on we create a device node first, that
544 * has the right major/minor (note that the major minor
545 * doesn't actually matter here, since we mount it over
548 if (mknod(to, (st.st_mode & ~07777) | 0600, st.st_rdev) < 0) {
549 log_error("mknod() for /dev/console failed: %m");
554 if (mount(console, to, "bind", MS_BIND, NULL) < 0) {
555 log_error("Bind mount for /dev/console failed: %m");
/* Provide a FIFO as /proc/kmsg inside the container and hand an open
 * descriptor for it to the other end of kmsg_socket.
 *
 * dest:        path of the container root.
 * kmsg_socket: socket the FIFO descriptor is sent over; must be >= 0. */
567 static int setup_kmsg(const char *dest, int kmsg_socket) {
568 char *from = NULL, *to = NULL;
/* Control message storage, sized for a payload of one int (the
 * descriptor). */
572 struct cmsghdr cmsghdr;
573 uint8_t buf[CMSG_SPACE(sizeof(int))];
576 struct cmsghdr *cmsg;
579 assert(kmsg_socket >= 0);
583 /* We create the kmsg FIFO as /dev/kmsg, but immediately
584 * delete it after bind mounting it to /proc/kmsg. While FIFOs
585 * on the reading side behave very similar to /proc/kmsg,
586 * their writing side behaves differently from /dev/kmsg in
587 * that writing blocks when nothing is reading. In order to
588 * avoid any problems with containers deadlocking due to this
589 * we simply make /dev/kmsg unavailable to the container. */
/* from: where the FIFO is created, to: where it is bind mounted. */
590 if (asprintf(&from, "%s/dev/kmsg", dest) < 0) {
595 if (asprintf(&to, "%s/proc/kmsg", dest) < 0) {
600 if (mkfifo(from, 0600) < 0) {
601 log_error("mkfifo() for /dev/kmsg failed: %m");
606 r = chmod_and_chown(from, 0600, 0, 0);
608 log_error("Failed to correct access mode for /dev/kmsg: %s", strerror(-r));
612 if (mount(from, to, "bind", MS_BIND, NULL) < 0) {
613 log_error("Bind mount for /proc/kmsg failed: %m");
/* Opened read-write and non-blocking. */
618 fd = open(from, O_RDWR|O_NDELAY|O_CLOEXEC);
620 log_error("Failed to open fifo: %m");
/* Build a single SCM_RIGHTS control message whose payload is fd. */
628 mh.msg_control = &control;
629 mh.msg_controllen = sizeof(control);
631 cmsg = CMSG_FIRSTHDR(&mh);
632 cmsg->cmsg_level = SOL_SOCKET;
633 cmsg->cmsg_type = SCM_RIGHTS;
634 cmsg->cmsg_len = CMSG_LEN(sizeof(int));
635 memcpy(CMSG_DATA(cmsg), &fd, sizeof(int));
/* Shrink the control length to the one message that was filled in. */
637 mh.msg_controllen = cmsg->cmsg_len;
639 /* Store away the fd in the socket, so that it stays open as
640 * long as we run the child */
641 k = sendmsg(kmsg_socket, &mh, MSG_DONTWAIT|MSG_NOSIGNAL);
/* Our own descriptor is closed right after the send, whatever its
 * result. */
642 close_nointr_nofail(fd);
645 log_error("Failed to send FIFO fd: %m");
650 /* And now make the FIFO unavailable as /dev/kmsg... */
/* Set the hostname from the name of the container directory.
 *
 * Takes no arguments; reads arg_directory. */
661 static int setup_hostname(void) {
/* Presumably the last path component of arg_directory - confirm against
 * path_get_file_name(). */
665 hn = path_get_file_name(arg_directory);
/* Presumably strips characters that are not valid in a hostname -
 * confirm against hostname_cleanup(). */
671 hostname_cleanup(hn);
674 if (sethostname(hn, strlen(hn)) < 0)
/* Connect the journal directory of the container with the one on the
 * host, according to arg_link_journal.
 *
 * directory: path of the container root.
 *
 * The machine ID is read from the container's /etc/machine-id. After
 * that, p is /var/log/journal/ID on the host and q is the same path
 * below directory. With LINK_GUEST, p becomes a symlink to q; otherwise
 * p is bind mounted onto q. */
683 static int setup_journal(const char *directory) {
684 sd_id128_t machine_id;
685 char *p = NULL, *b = NULL, *l, *q = NULL, *d = NULL;
688 if (arg_link_journal == LINK_NO)
691 p = strappend(directory, "/etc/machine-id");
697 r = read_one_line_file(p, &b);
/* In LINK_AUTO mode a missing machine-id file is handled separately
 * from other read errors. */
698 if (r == -ENOENT && arg_link_journal == LINK_AUTO) {
702 log_error("Failed to read machine ID: %s", strerror(-r));
/* The same goes for an empty ID in LINK_AUTO mode. */
707 if (isempty(l) && arg_link_journal == LINK_AUTO) {
712 /* Verify validity */
713 r = sd_id128_from_string(l, &machine_id);
715 log_error("Failed to parse machine ID: %s", strerror(-r));
/* From here on p is the host path and q the path inside the container
 * tree. */
720 p = strappend("/var/log/journal/", l);
721 q = strjoin(directory, "/var/log/journal/", l, NULL);
/* An existing mount on either side is only an error when a mode was
 * requested explicitly. */
727 if (path_is_mount_point(p, false) > 0 ||
728 path_is_mount_point(q, false) > 0) {
729 if (arg_link_journal != LINK_AUTO) {
730 log_error("Journal already a mount point, refusing.");
/* Find out what currently exists at p. The branches below compare r
 * with -EINVAL and -ENOENT. */
739 r = readlink_and_make_absolute(p, &d);
741 if ((arg_link_journal == LINK_GUEST ||
742 arg_link_journal == LINK_AUTO) &&
752 log_error("Failed to remove symlink %s: %m", p);
756 } else if (r == -EINVAL) {
758 if (arg_link_journal == LINK_GUEST &&
761 if (errno == ENOTDIR)
762 log_error("%s already exists and is neither symlink nor directory.", p);
764 log_error("Failed to remove %s: %m", p);
770 } else if (r != -ENOENT) {
/* NOTE(review): the error is in r here, but %m prints errno - confirm
 * that the two agree. */
771 log_error("readlink(%s) failed: %m", p);
775 if (arg_link_journal == LINK_GUEST) {
777 if (symlink(q, p) < 0) {
778 log_error("Failed to symlink %s to %s: %m", q, p);
/* Only LINK_HOST creates the host-side directory. */
789 if (arg_link_journal == LINK_HOST) {
790 r = mkdir_p(p, 0755);
792 log_error("Failed to create %s: %m", p);
796 } else if (access(p, F_OK) < 0) {
/* A non-empty directory in the container is reported as an error. */
801 if (dir_is_empty(q) == 0) {
802 log_error("%s not empty.", q);
807 r = mkdir_p(q, 0755);
809 log_error("Failed to create %s: %m", q);
813 if (mount(p, q, "bind", MS_BIND, NULL) < 0) {
814 log_error("Failed to bind mount journal from host into guest: %m");
830 static int drop_capabilities(void) {
831 return capability_bounding_set_drop(~arg_retain, false);
/* Heuristic check whether path looks like the root of an OS tree.
 *
 * path: directory to check.
 *
 * Returns 1 or 0 depending on r; a failing asprintf() is handled
 * separately. The statement that computes r is not visible in this part
 * of the file. */
834 static int is_os_tree(const char *path) {
837 /* We use /bin/sh as flag file if something is an OS */
/* p = path + "/bin/sh". */
839 if (asprintf(&p, "%s/bin/sh", path) < 0)
/* A negative r maps to 0, anything else to 1. */
845 return r < 0 ? 0 : 1;
/* Forward data between our stdin/stdout and the pty master, and handle
 * the signals in mask through a signalfd.
 *
 * master: descriptor of the pty master.
 * mask:   signal set that is handed to signalfd().
 *
 * The return statements are not visible in this part of the file; main()
 * treats a negative result as failure. */
848 static int process_pty(int master, sigset_t *mask) {
/* in_buffer carries stdin to master, out_buffer carries master to
 * stdout. The *_full counters hold the number of bytes queued. */
850 char in_buffer[LINE_MAX], out_buffer[LINE_MAX];
851 size_t in_buffer_full = 0, out_buffer_full = 0;
852 struct epoll_event stdin_ev, stdout_ev, master_ev, signal_ev;
/* The descriptors are registered edge-triggered below, so readiness is
 * remembered in these flags. A flag is cleared again when a read() or
 * write() fails with one of the errno values checked further down. */
853 bool stdin_readable = false, stdout_writable = false, master_readable = false, master_writable = false;
854 int ep = -1, signal_fd = -1, r;
/* Presumably switches the three descriptors to non-blocking mode -
 * confirm against fd_nonblock(). */
856 fd_nonblock(STDIN_FILENO, 1);
857 fd_nonblock(STDOUT_FILENO, 1);
858 fd_nonblock(master, 1);
860 signal_fd = signalfd(-1, mask, SFD_NONBLOCK|SFD_CLOEXEC);
862 log_error("signalfd(): %m");
867 ep = epoll_create1(EPOLL_CLOEXEC);
869 log_error("Failed to create epoll: %m");
/* stdin, stdout and master use EPOLLET; the signal descriptor does not. */
875 stdin_ev.events = EPOLLIN|EPOLLET;
876 stdin_ev.data.fd = STDIN_FILENO;
879 stdout_ev.events = EPOLLOUT|EPOLLET;
880 stdout_ev.data.fd = STDOUT_FILENO;
883 master_ev.events = EPOLLIN|EPOLLOUT|EPOLLET;
884 master_ev.data.fd = master;
887 signal_ev.events = EPOLLIN;
888 signal_ev.data.fd = signal_fd;
890 if (epoll_ctl(ep, EPOLL_CTL_ADD, STDIN_FILENO, &stdin_ev) < 0 ||
891 epoll_ctl(ep, EPOLL_CTL_ADD, STDOUT_FILENO, &stdout_ev) < 0 ||
892 epoll_ctl(ep, EPOLL_CTL_ADD, master, &master_ev) < 0 ||
893 epoll_ctl(ep, EPOLL_CTL_ADD, signal_fd, &signal_ev) < 0) {
/* NOTE(review): "regiser" in the message below looks like a typo for
 * "register". */
894 log_error("Failed to regiser fds in epoll: %m");
900 struct epoll_event ev[16];
/* Block without timeout until at least one descriptor has an event. */
904 nfds = epoll_wait(ep, ev, ELEMENTSOF(ev), -1);
/* EINTR and EAGAIN are not logged as errors. */
907 if (errno == EINTR || errno == EAGAIN)
910 log_error("epoll_wait(): %m");
/* First pass: record which descriptors became ready. EPOLLHUP also sets
 * the flag. */
917 for (i = 0; i < nfds; i++) {
918 if (ev[i].data.fd == STDIN_FILENO) {
920 if (ev[i].events & (EPOLLIN|EPOLLHUP))
921 stdin_readable = true;
923 } else if (ev[i].data.fd == STDOUT_FILENO) {
925 if (ev[i].events & (EPOLLOUT|EPOLLHUP))
926 stdout_writable = true;
928 } else if (ev[i].data.fd == master) {
930 if (ev[i].events & (EPOLLIN|EPOLLHUP))
931 master_readable = true;
933 if (ev[i].events & (EPOLLOUT|EPOLLHUP))
934 master_writable = true;
936 } else if (ev[i].data.fd == signal_fd) {
937 struct signalfd_siginfo sfsi;
/* One read() must deliver exactly one signalfd_siginfo record. */
940 n = read(signal_fd, &sfsi, sizeof(sfsi));
941 if (n != sizeof(sfsi)) {
944 log_error("Failed to read from signalfd: invalid block size");
949 if (errno != EINTR && errno != EAGAIN) {
950 log_error("Failed to read from signalfd: %m");
956 if (sfsi.ssi_signo == SIGWINCH) {
959 /* The window size changed, let's forward that. */
960 if (ioctl(STDIN_FILENO, TIOCGWINSZ, &ws) >= 0)
961 ioctl(master, TIOCSWINSZ, &ws);
/* Second pass: keep copying while some direction can make progress,
 * i.e. a readable source with an empty buffer or a writable sink with
 * queued data. */
970 while ((stdin_readable && in_buffer_full <= 0) ||
971 (master_writable && in_buffer_full > 0) ||
972 (master_readable && out_buffer_full <= 0) ||
973 (stdout_writable && out_buffer_full > 0)) {
/* stdin into in_buffer, appended after what is already queued. */
975 if (stdin_readable && in_buffer_full < LINE_MAX) {
977 k = read(STDIN_FILENO, in_buffer + in_buffer_full, LINE_MAX - in_buffer_full);
980 if (errno == EAGAIN || errno == EPIPE || errno == ECONNRESET || errno == EIO)
981 stdin_readable = false;
983 log_error("read(): %m");
988 in_buffer_full += (size_t) k;
/* in_buffer into master. */
991 if (master_writable && in_buffer_full > 0) {
993 k = write(master, in_buffer, in_buffer_full);
996 if (errno == EAGAIN || errno == EPIPE || errno == ECONNRESET || errno == EIO)
997 master_writable = false;
999 log_error("write(): %m");
/* Remove the k bytes just written from the front of the buffer. */
1005 assert(in_buffer_full >= (size_t) k);
1006 memmove(in_buffer, in_buffer + k, in_buffer_full - k);
1007 in_buffer_full -= k;
/* master into out_buffer. */
1011 if (master_readable && out_buffer_full < LINE_MAX) {
1013 k = read(master, out_buffer + out_buffer_full, LINE_MAX - out_buffer_full);
1016 if (errno == EAGAIN || errno == EPIPE || errno == ECONNRESET || errno == EIO)
1017 master_readable = false;
1019 log_error("read(): %m");
1024 out_buffer_full += (size_t) k;
/* out_buffer into stdout. */
1027 if (stdout_writable && out_buffer_full > 0) {
1029 k = write(STDOUT_FILENO, out_buffer, out_buffer_full);
1032 if (errno == EAGAIN || errno == EPIPE || errno == ECONNRESET || errno == EIO)
1033 stdout_writable = false;
1035 log_error("write(): %m");
/* Remove the k bytes just written from the front of the buffer. */
1041 assert(out_buffer_full >= (size_t) k);
1042 memmove(out_buffer, out_buffer + k, out_buffer_full - k);
1043 out_buffer_full -= k;
/* Cleanup: release the epoll and signalfd descriptors. */
1051 close_nointr_nofail(ep);
1054 close_nointr_nofail(signal_fd);
/* Program entry point.
 *
 * Parses the command line, checks the preconditions, moves into a fresh
 * cgroup, allocates a pty and clones a child into new namespaces. The
 * child prepares the container root and executes the payload. The parent
 * forwards terminal traffic with process_pty(), waits for the child and
 * cleans up.
 *
 * argc, argv: the process argument vector. */
1059 int main(int argc, char *argv[]) {
1061 int r = EXIT_FAILURE, k;
1062 char *oldcg = NULL, *newcg = NULL;
1063 char **controller = NULL;
1065 const char *console = NULL;
1066 struct termios saved_attr, raw_attr;
/* Set once saved_attr holds the terminal state to restore on exit. */
1068 bool saved_attr_valid = false;
/* Socket pair over which setup_kmsg() sends the kmsg FIFO descriptor. */
1070 int kmsg_socket_pair[2] = { -1, -1 };
1072 log_parse_environment();
1075 r = parse_argv(argc, argv);
/* Determine the container directory: an explicitly given one is made
 * absolute; the current directory is presumably the fallback (the else
 * is not visible here). */
1079 if (arg_directory) {
1082 p = path_make_absolute_cwd(arg_directory);
1083 free(arg_directory);
1086 arg_directory = get_current_dir_name();
1088 if (!arg_directory) {
1089 log_error("Failed to determine path");
1093 path_kill_slashes(arg_directory);
/* Preconditions. */
1095 if (geteuid() != 0) {
1096 log_error("Need to be root.");
1100 if (sd_booted() <= 0) {
1101 log_error("Not running on a systemd system.");
1105 if (path_equal(arg_directory, "/")) {
1106 log_error("Spawning container on root directory not supported.");
1110 if (is_os_tree(arg_directory) <= 0) {
1111 log_error("Directory %s doesn't look like an OS root directory. Refusing.", arg_directory);
/* Create the cgroup OLDCG/nspawn-PID and attach to it. */
1115 k = cg_get_by_pid(SYSTEMD_CGROUP_CONTROLLER, 0, &oldcg);
1117 log_error("Failed to determine current cgroup: %s", strerror(-k));
1121 if (asprintf(&newcg, "%s/nspawn-%lu", oldcg, (unsigned long) getpid()) < 0) {
1122 log_error("Failed to allocate cgroup path.");
1126 k = cg_create_and_attach(SYSTEMD_CGROUP_CONTROLLER, newcg, 0);
1128 log_error("Failed to create cgroup: %s", strerror(-k));
/* Controllers requested with -C only produce a warning on failure. */
1132 STRV_FOREACH(controller, arg_controllers) {
1133 k = cg_create_and_attach(*controller, newcg, 0);
1135 log_warning("Failed to create cgroup in controller %s: %s", *controller, strerror(-k));
/* Allocate the pty whose slave side becomes the container's console. */
1138 master = posix_openpt(O_RDWR|O_NOCTTY|O_CLOEXEC|O_NDELAY);
1140 log_error("Failed to acquire pseudo tty: %m");
1144 console = ptsname(master);
1146 log_error("Failed to determine tty name: %m");
1150 log_info("Spawning namespace container on %s (console is %s).", arg_directory, console);
/* Copy our window size to the pty if it can be queried. */
1152 if (ioctl(STDIN_FILENO, TIOCGWINSZ, &ws) >= 0)
1153 ioctl(master, TIOCSWINSZ, &ws);
1155 if (unlockpt(master) < 0) {
1156 log_error("Failed to unlock tty: %m");
/* Save the terminal settings, then switch stdin to raw mode with echo
 * off. */
1160 if (tcgetattr(STDIN_FILENO, &saved_attr) < 0) {
1161 log_error("Failed to get terminal attributes: %m");
1165 saved_attr_valid = true;
1167 raw_attr = saved_attr;
1168 cfmakeraw(&raw_attr);
1169 raw_attr.c_lflag &= ~ECHO;
1171 if (tcsetattr(STDIN_FILENO, TCSANOW, &raw_attr) < 0) {
1172 log_error("Failed to set terminal attributes: %m");
1176 if (socketpair(AF_UNIX, SOCK_DGRAM|SOCK_NONBLOCK|SOCK_CLOEXEC, 0, kmsg_socket_pair) < 0) {
1177 log_error("Failed to create kmsg socket pair");
/* Block these signals; process_pty() gets the same mask and reads the
 * signals through a signalfd. */
1181 assert_se(sigemptyset(&mask) == 0);
1182 sigset_add_many(&mask, SIGCHLD, SIGWINCH, SIGTERM, SIGINT, -1);
1183 assert_se(sigprocmask(SIG_BLOCK, &mask, NULL) == 0);
/* Raw clone() with new IPC, mount, PID and UTS namespaces. A network
 * namespace is added only when arg_private_network is set. */
1185 pid = syscall(__NR_clone, SIGCHLD|CLONE_NEWIPC|CLONE_NEWNS|CLONE_NEWPID|CLONE_NEWUTS|(arg_private_network ? CLONE_NEWNET : 0), NULL);
1187 if (errno == EINVAL)
1188 log_error("clone() failed, do you have namespace support enabled in your kernel? (You need UTS, IPC, PID and NET namespacing built in): %m");
1190 log_error("clone() failed: %m");
/* Child side from here on. */
1198 const char *home = NULL;
1199 uid_t uid = (uid_t) -1;
1200 gid_t gid = (gid_t) -1;
/* Environment for the payload. The code below fills slot 2 (TERM),
 * slots 3 to 5 (HOME, USER, LOGNAME) and slot 6 (container_uuid). */
1201 const char *envp[] = {
1202 "PATH=/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin",
1203 "container=systemd-nspawn", /* LXC sets container=lxc, so follow the scheme here */
1208 NULL, /* container_uuid */
1212 envp[2] = strv_find_prefix(environ, "TERM=");
/* Release the pty master and the inherited stdio descriptors. */
1214 close_nointr_nofail(master);
1216 close_nointr(STDIN_FILENO);
1217 close_nointr(STDOUT_FILENO);
1218 close_nointr(STDERR_FILENO);
/* Presumably closes every descriptor except kmsg_socket_pair[1] -
 * confirm against close_all_fds(). */
1220 close_all_fds(&kmsg_socket_pair[1], 1);
1222 reset_all_signal_handlers();
/* Unblock all signals again. */
1224 assert_se(sigemptyset(&mask) == 0);
1225 assert_se(sigprocmask(SIG_SETMASK, &mask, NULL) == 0);
/* Open the console; it must land on descriptor 0 and is then duplicated
 * to 1 and 2. */
1227 if (open_terminal(console, O_RDWR) != STDIN_FILENO ||
1228 dup2(STDIN_FILENO, STDOUT_FILENO) != STDOUT_FILENO ||
1229 dup2(STDIN_FILENO, STDERR_FILENO) != STDERR_FILENO)
1233 log_error("setsid() failed: %m");
/* Ask the kernel to send SIGKILL to this process when its parent dies. */
1237 if (prctl(PR_SET_PDEATHSIG, SIGKILL) < 0) {
1238 log_error("PR_SET_PDEATHSIG failed: %m");
1242 /* Mark everything as slave, so that we still
1243 * receive mounts from the real root, but don't
1244 * propagate mounts to the real root. */
1245 if (mount(NULL, "/", NULL, MS_SLAVE|MS_REC, NULL) < 0) {
1246 log_error("MS_SLAVE|MS_REC failed: %m");
1250 /* Turn directory into bind mount */
1251 if (mount(arg_directory, arg_directory, "bind", MS_BIND|MS_REC, NULL) < 0) {
1252 log_error("Failed to make bind mount.");
/* Read-only remount of the root; presumably only done for --read-only
 * (the condition is not visible here). */
1257 if (mount(arg_directory, arg_directory, "bind", MS_BIND|MS_REMOUNT|MS_RDONLY|MS_REC, NULL) < 0) {
1258 log_error("Failed to make read-only.");
/* Populate the container tree. */
1262 if (mount_all(arg_directory) < 0)
1265 if (copy_devnodes(arg_directory) < 0)
1268 dev_setup(arg_directory);
1270 if (setup_dev_console(arg_directory, console) < 0)
1273 if (setup_kmsg(arg_directory, kmsg_socket_pair[1]) < 0)
/* The FIFO descriptor has been sent; this end is no longer needed. */
1276 close_nointr_nofail(kmsg_socket_pair[1]);
1278 if (setup_boot_id(arg_directory) < 0)
1281 if (setup_timezone(arg_directory) < 0)
1284 if (setup_resolv_conf(arg_directory) < 0)
1287 if (setup_journal(arg_directory) < 0)
/* Switch root: enter the directory, move its mount onto /, chroot into
 * it and reset the working directory. */
1290 if (chdir(arg_directory) < 0) {
1291 log_error("chdir(%s) failed: %m", arg_directory);
1295 if (mount(arg_directory, "/", NULL, MS_MOVE, NULL) < 0) {
1296 log_error("mount(MS_MOVE) failed: %m");
1300 if (chroot(".") < 0) {
1301 log_error("chroot() failed: %m");
1305 if (chdir("/") < 0) {
1306 log_error("chdir() failed: %m");
1314 if (drop_capabilities() < 0) {
1315 log_error("drop_capabilities() failed: %m");
/* Switch to the requested user: resolve credentials, create the home
 * directory, then set groups, gid and uid in that order. */
1321 if (get_user_creds((const char**)&arg_user, &uid, &gid, &home, NULL) < 0) {
1322 log_error("get_user_creds() failed: %m");
1326 if (mkdir_parents_label(home, 0775) < 0) {
1327 log_error("mkdir_parents_label() failed: %m");
1331 if (mkdir_safe_label(home, 0775, uid, gid) < 0) {
1332 log_error("mkdir_safe_label() failed: %m");
1336 if (initgroups((const char*)arg_user, gid) < 0) {
1337 log_error("initgroups() failed: %m");
/* NOTE(review): the next two messages name setregid() and setreuid(),
 * but the calls are setresgid() and setresuid(). */
1341 if (setresgid(gid, gid, gid) < 0) {
1342 log_error("setregid() failed: %m");
1346 if (setresuid(uid, uid, uid) < 0) {
1347 log_error("setreuid() failed: %m");
/* HOME, USER and LOGNAME fall back to /root and root. */
1352 if ((asprintf((char**)(envp + 3), "HOME=%s", home ? home: "/root") < 0) ||
1353 (asprintf((char**)(envp + 4), "USER=%s", arg_user ? arg_user : "root") < 0) ||
1354 (asprintf((char**)(envp + 5), "LOGNAME=%s", arg_user ? arg_user : "root") < 0)) {
1360 if (asprintf((char**)(envp + 6), "container_uuid=%s", arg_uuid) < 0) {
1372 /* Automatically search for the init system */
/* l counts the remaining arguments plus the terminating NULL of argv.
 * a[0] is reserved for the init binary path. */
1374 l = 1 + argc - optind;
1375 a = newa(char*, l + 1);
1376 memcpy(a + 1, argv + optind, l * sizeof(char*));
/* Each execve() only returns on failure, so the candidates are tried in
 * order. */
1378 a[0] = (char*) "/usr/lib/systemd/systemd";
1379 execve(a[0], a, (char**) envp);
1381 a[0] = (char*) "/lib/systemd/systemd";
1382 execve(a[0], a, (char**) envp);
1384 a[0] = (char*) "/sbin/init";
1385 execve(a[0], a, (char**) envp);
/* A command given on the command line is executed as is. */
1386 } else if (argc > optind)
1387 execvpe(argv[optind], argv + optind, (char**) envp);
/* Login shell: argv[0] is "-bash". */
1389 chdir(home ? home : "/root");
1390 execle("/bin/bash", "-bash", NULL, (char**) envp);
/* Only reached when the exec calls above did not replace the process. */
1393 log_error("execv() failed: %m");
1396 _exit(EXIT_FAILURE);
/* Parent side: forward terminal traffic. */
1399 if (process_pty(master, &mask) < 0)
/* Restore the terminal before waiting for the child. */
1402 if (saved_attr_valid) {
1403 tcsetattr(STDIN_FILENO, TCSANOW, &saved_attr);
1404 saved_attr_valid = false;
1407 r = wait_for_terminate_and_warn(argc > optind ? argv[optind] : "bash", pid);
/* Cleanup: restore the terminal if that is still pending. */
1413 if (saved_attr_valid)
1414 tcsetattr(STDIN_FILENO, TCSANOW, &saved_attr);
1417 close_nointr_nofail(master);
1419 close_pipe(kmsg_socket_pair);
/* Move back to the original cgroup, then kill whatever is left in the
 * container's cgroup. */
1422 cg_attach(SYSTEMD_CGROUP_CONTROLLER, oldcg, 0);
1425 cg_kill_recursive_and_wait(SYSTEMD_CGROUP_CONTROLLER, newcg, true);
1427 free(arg_directory);
1428 strv_free(arg_controllers);