1 /*-*- Mode: C; c-basic-offset: 8; indent-tabs-mode: nil -*-*/
4 This file is part of systemd.
6 Copyright 2010 Lennart Poettering
8 systemd is free software; you can redistribute it and/or modify it
9 under the terms of the GNU Lesser General Public License as published by
10 the Free Software Foundation; either version 2.1 of the License, or
11 (at your option) any later version.
13 systemd is distributed in the hope that it will be useful, but
14 WITHOUT ANY WARRANTY; without even the implied warranty of
15 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
16 Lesser General Public License for more details.
18 You should have received a copy of the GNU Lesser General Public License
19 along with systemd; If not, see <http://www.gnu.org/licenses/>.
25 #include <sys/types.h>
26 #include <sys/syscall.h>
27 #include <sys/mount.h>
33 #include <sys/prctl.h>
34 #include <sys/capability.h>
36 #include <sys/epoll.h>
38 #include <sys/signalfd.h>
42 #include <sys/socket.h>
44 #include <systemd/sd-daemon.h>
52 #include "cgroup-util.h"
54 #include "path-util.h"
55 #include "loopback-setup.h"
57 #include "dev-setup.h"
59 typedef enum LinkJournal {
/* Container settings kept in file-scope variables. parse_argv() overwrites
 * several of them from the command line; the initializers below are the
 * values in effect when the matching option is absent. */
66 static char *arg_directory = NULL;
67 static char *arg_user = NULL;
68 static char **arg_controllers = NULL;
69 static char *arg_uuid = NULL;
70 static bool arg_private_network = false;
71 static bool arg_read_only = false;
72 static bool arg_boot = false;
73 static LinkJournal arg_link_journal = LINK_AUTO;
/* Bit mask with one bit per capability number. drop_capabilities() hands the
 * complement of this mask to capability_bounding_set_drop(), and the
 * capability option handling in parse_argv() ORs further bits into it. */
74 static uint64_t arg_retain =
76 (1ULL << CAP_DAC_OVERRIDE) |
77 (1ULL << CAP_DAC_READ_SEARCH) |
78 (1ULL << CAP_FOWNER) |
79 (1ULL << CAP_FSETID) |
80 (1ULL << CAP_IPC_OWNER) |
83 (1ULL << CAP_LINUX_IMMUTABLE) |
84 (1ULL << CAP_NET_BIND_SERVICE) |
85 (1ULL << CAP_NET_BROADCAST) |
86 (1ULL << CAP_NET_RAW) |
87 (1ULL << CAP_SETGID) |
88 (1ULL << CAP_SETFCAP) |
89 (1ULL << CAP_SETPCAP) |
90 (1ULL << CAP_SETUID) |
91 (1ULL << CAP_SYS_ADMIN) |
92 (1ULL << CAP_SYS_CHROOT) |
93 (1ULL << CAP_SYS_NICE) |
94 (1ULL << CAP_SYS_PTRACE) |
95 (1ULL << CAP_SYS_TTY_CONFIG) |
96 (1ULL << CAP_SYS_RESOURCE) |
97 (1ULL << CAP_SYS_BOOT);
/* Prints the usage text to standard output; the program name shown in the
 * first line comes from program_invocation_short_name.
 *
 * The -j line names the "guest" mode: the short switch handling in
 * parse_argv() stores LINK_GUEST, so describing -j as the "host" mode
 * misleads the user about what the switch does. */
99 static int help(void) {
101 printf("%s [OPTIONS...] [PATH] [ARGUMENTS...]\n\n"
102 "Spawn a minimal namespace container for debugging, testing and building.\n\n"
103 " -h --help Show this help\n"
104 " -D --directory=NAME Root directory for the container\n"
105 " -b --boot Boot up full system (i.e. invoke init)\n"
106 " -u --user=USER Run the command under specified user or uid\n"
107 " -C --controllers=LIST Put the container in specified comma-separated cgroup hierarchies\n"
108 " --uuid=UUID Set a specific machine UUID for the container\n"
109 " --private-network Disable network in container\n"
110 " --read-only Mount the root directory read-only\n"
111 " --capability=CAP In addition to the default, retain specified capability\n"
112 " --link-journal=MODE Link up guest journal, one of no, auto, guest, host\n"
113 " -j Equivalent to --link-journal=guest\n",
114 program_invocation_short_name);
/* Parses the command line with getopt_long() and records the result in the
 * arg_* file-scope variables. Problems detected in the lines visible here
 * are reported through log_error(). */
119 static int parse_argv(int argc, char *argv[]) {
122 ARG_PRIVATE_NETWORK = 0x100,
129 static const struct option options[] = {
130 { "help", no_argument, NULL, 'h' },
131 { "directory", required_argument, NULL, 'D' },
132 { "user", required_argument, NULL, 'u' },
133 { "controllers", required_argument, NULL, 'C' },
134 { "private-network", no_argument, NULL, ARG_PRIVATE_NETWORK },
135 { "boot", no_argument, NULL, 'b' },
136 { "uuid", required_argument, NULL, ARG_UUID },
137 { "read-only", no_argument, NULL, ARG_READ_ONLY },
138 { "capability", required_argument, NULL, ARG_CAPABILITY },
139 { "link-journal", required_argument, NULL, ARG_LINK_JOURNAL },
/* The leading '+' in the option string makes getopt stop at the first
 * non-option argument instead of permuting argv, so switches that follow
 * the command to run are not consumed here. */
148 while ((c = getopt_long(argc, argv, "+hD:u:C:bj", options, NULL)) >= 0) {
158 arg_directory = canonicalize_file_name(optarg);
159 if (!arg_directory) {
160 log_error("Failed to canonicalize root directory.");
168 if (!(arg_user = strdup(optarg))) {
169 log_error("Failed to duplicate user name.");
/* A repeated controllers option replaces the previously stored list. */
176 strv_free(arg_controllers);
177 arg_controllers = strv_split(optarg, ",");
178 if (!arg_controllers) {
179 log_error("Failed to split controllers list.");
182 strv_uniq(arg_controllers);
186 case ARG_PRIVATE_NETWORK:
187 arg_private_network = true;
199 arg_read_only = true;
202 case ARG_CAPABILITY: {
/* Every comma-separated word is looked up with cap_from_name(); on
 * success the matching bit is added to the arg_retain mask. */
206 FOREACH_WORD_SEPARATOR(word, length, optarg, ",", state) {
210 t = strndup(word, length);
214 if (cap_from_name(t, &cap) < 0) {
215 log_error("Failed to parse capability %s.", t);
221 arg_retain |= 1ULL << (uint64_t) cap;
/* NOTE(review): presumably the handler of the -j switch (its case label is
 * not visible here). Make sure the usage text for -j names the same mode
 * that is stored by this assignment. */
228 arg_link_journal = LINK_GUEST;
231 case ARG_LINK_JOURNAL:
232 if (streq(optarg, "auto"))
233 arg_link_journal = LINK_AUTO;
234 else if (streq(optarg, "no"))
235 arg_link_journal = LINK_NO;
236 else if (streq(optarg, "guest"))
237 arg_link_journal = LINK_GUEST;
238 else if (streq(optarg, "host"))
239 arg_link_journal = LINK_HOST;
241 log_error("Failed to parse link journal mode %s", optarg);
251 log_error("Unknown option code %c", c);
/* Walks mount_table and mounts each entry below the container root dest. */
259 static int mount_all(const char *dest) {
261 typedef struct MountPoint {
/* Initializer order, as far as the loop below shows: what, where, a third
 * string (presumably the file system type), options, flags, fatal. Entries
 * with what == NULL are remounts of the entry listed right before them. */
270 static const MountPoint mount_table[] = {
271 { "proc", "/proc", "proc", NULL, MS_NOSUID|MS_NOEXEC|MS_NODEV, true },
272 { "/proc/sys", "/proc/sys", NULL, NULL, MS_BIND, true }, /* Bind mount first */
273 { NULL, "/proc/sys", NULL, NULL, MS_BIND|MS_RDONLY|MS_REMOUNT, true }, /* Then, make it r/o */
274 { "sysfs", "/sys", "sysfs", NULL, MS_RDONLY|MS_NOSUID|MS_NOEXEC|MS_NODEV, true },
275 { "tmpfs", "/dev", "tmpfs", "mode=755", MS_NOSUID|MS_STRICTATIME, true },
276 { "/dev/pts", "/dev/pts", NULL, NULL, MS_BIND, true },
277 { "tmpfs", "/dev/shm", "tmpfs", "mode=1777", MS_NOSUID|MS_NODEV|MS_STRICTATIME, true },
278 { "tmpfs", "/run", "tmpfs", "mode=755", MS_NOSUID|MS_NODEV|MS_STRICTATIME, true },
280 { "/sys/fs/selinux", "/sys/fs/selinux", NULL, NULL, MS_BIND, false }, /* Bind mount first */
281 { NULL, "/sys/fs/selinux", NULL, NULL, MS_BIND|MS_RDONLY|MS_REMOUNT, false }, /* Then, make it r/o */
288 for (k = 0; k < ELEMENTSOF(mount_table); k++) {
289 char _cleanup_free_ *where = NULL;
/* Target path: dest joined with the entry's where field. */
292 if (asprintf(&where, "%s/%s", dest, mount_table[k].where) < 0) {
301 t = path_is_mount_point(where, true);
303 log_error("Failed to detect whether %s is a mount point: %s", where, strerror(-t));
311 /* Skip entries whose target already is a mount point, unless the
 * entry is a remount (remount entries have no source, what == NULL). */
312 if (mount_table[k].what && t > 0)
315 mkdir_p_label(where, 0755);
/* A failing mount() is only treated as an error for entries marked fatal. */
317 if (mount(mount_table[k].what,
320 mount_table[k].flags,
321 mount_table[k].options) < 0 &&
322 mount_table[k].fatal) {
324 log_error("mount(%s) failed: %m", where);
/* Bind mounts the host's /etc/localtime and /etc/timezone over the files of
 * the same name below dest. In the lines visible here a failing mount() is
 * not reported. */
334 static int setup_timezone(const char *dest) {
339 /* Fix the timezone, if possible */
340 where = strappend(dest, "/etc/localtime");
/* The read-only remount is attempted only when the bind mount succeeded. */
344 if (mount("/etc/localtime", where, "bind", MS_BIND, NULL) >= 0)
345 mount("/etc/localtime", where, "bind", MS_BIND|MS_REMOUNT|MS_RDONLY, NULL);
349 where = strappend(dest, "/etc/timezone");
353 if (mount("/etc/timezone", where, "bind", MS_BIND, NULL) >= 0)
354 mount("/etc/timezone", where, "bind", MS_BIND|MS_REMOUNT|MS_RDONLY, NULL);
/* Bind mounts the host's /etc/resolv.conf over dest/etc/resolv.conf and, if
 * that worked, remounts it read-only. */
361 static int setup_resolv_conf(const char *dest) {
/* NOTE(review): the statement guarded by this check is not visible here;
 * presumably the function returns early when the container gets a private
 * network -- confirm. */
366 if (arg_private_network)
369 /* Fix resolv.conf, if possible */
370 where = strappend(dest, "/etc/resolv.conf");
374 if (mount("/etc/resolv.conf", where, "bind", MS_BIND, NULL) >= 0)
375 mount("/etc/resolv.conf", where, "bind", MS_BIND|MS_REMOUNT|MS_RDONLY, NULL);
/* Gives the container a boot ID of its own: a freshly randomized ID is
 * written, formatted as a UUID string, to a file below dest/dev and that file
 * is bind mounted over dest/proc/sys/kernel/random/boot_id. */
382 static int setup_boot_id(const char *dest) {
383 char _cleanup_free_ *from = NULL, *to = NULL;
390 /* Generate a new randomized boot ID, so that each boot-up of
391 * the container gets a new one */
393 from = strappend(dest, "/dev/proc-sys-kernel-random-boot-id");
394 to = strappend(dest, "/proc/sys/kernel/random/boot_id");
398 r = sd_id128_randomize(&rnd);
400 log_error("Failed to generate random boot id: %s", strerror(-r));
/* Render the 128 bit ID with dashes in the 8-4-4-4-12 UUID layout. */
404 snprintf(as_uuid, sizeof(as_uuid),
405 "%02x%02x%02x%02x-%02x%02x-%02x%02x-%02x%02x-%02x%02x%02x%02x%02x%02x",
406 SD_ID128_FORMAT_VAL(rnd));
407 char_array_0(as_uuid);
409 r = write_one_line_file(from, as_uuid);
411 log_error("Failed to write boot id: %s", strerror(-r));
415 if (mount(from, to, "bind", MS_BIND, NULL) < 0) {
416 log_error("Failed to bind mount boot id: %m");
/* The result of the read-only remount is not checked. */
419 mount(from, to, "bind", MS_BIND|MS_REMOUNT|MS_RDONLY, NULL);
/* For every name in devnodes, creates dest/dev/<name> with mknod(), using the
 * mode and device number that stat() reports for the host's /dev/<name>. */
425 static int copy_devnodes(const char *dest) {
427 static const char devnodes[] =
438 mode_t _cleanup_umask_ u;
444 NULSTR_FOREACH(d, devnodes) {
446 char _cleanup_free_ *from = NULL, *to = NULL;
448 asprintf(&from, "/dev/%s", d);
449 asprintf(&to, "%s/dev/%s", dest, d);
460 if (stat(from, &st) < 0) {
/* A node that is missing on the host is not reported as an error. */
462 if (errno != ENOENT) {
463 log_error("Failed to stat %s: %m", from);
468 } else if (!S_ISCHR(st.st_mode) && !S_ISBLK(st.st_mode)) {
470 log_error("%s is not a char or block device, cannot copy", from);
474 } else if (mknod(to, st.st_mode, st.st_rdev) < 0) {
/* Name the node that could not be created, not the container root. */
476 log_error("mknod(%s) failed: %m", to);
/* Makes the tty named by console show up as dest/dev/console: a device node
 * is created at that path and console is bind mounted on top of it. */
485 static int setup_dev_console(const char *dest, const char *console) {
487 char _cleanup_free_ *to = NULL;
489 mode_t _cleanup_umask_ u;
496 if (stat(console, &st) < 0) {
497 log_error("Failed to stat %s: %m", console);
500 } else if (!S_ISCHR(st.st_mode)) {
501 log_error("/dev/console is not a char device");
/* Fix up mode (0600) and ownership (0, 0) of the tty itself first. */
505 r = chmod_and_chown(console, 0600, 0, 0);
507 log_error("Failed to correct access mode for TTY: %s", strerror(-r));
511 if (asprintf(&to, "%s/dev/console", dest) < 0)
514 /* We need to bind mount the right tty to /dev/console since
515 * ptys can only exist on pts file systems. To have something
516 * to bind mount things on we create a device node first, that
517 * has the right major/minor (note that the major minor
518 * doesn't actually matter here, since we mount it over
521 if (mknod(to, (st.st_mode & ~07777) | 0600, st.st_rdev) < 0) {
522 log_error("mknod() for /dev/console failed: %m");
526 if (mount(console, to, "bind", MS_BIND, NULL) < 0) {
527 log_error("Bind mount for /dev/console failed: %m");
// Creates a FIFO at dest/dev/kmsg, bind mounts it over dest/proc/kmsg and
// passes an open file descriptor of the FIFO through kmsg_socket as
// SCM_RIGHTS ancillary data.
534 static int setup_kmsg(const char *dest, int kmsg_socket) {
535 char _cleanup_free_ *from = NULL, *to = NULL;
537 mode_t _cleanup_umask_ u;
539 struct cmsghdr cmsghdr;
540 uint8_t buf[CMSG_SPACE(sizeof(int))];
543 struct cmsghdr *cmsg;
546 assert(kmsg_socket >= 0);
550 /* We create the kmsg FIFO as /dev/kmsg, but immediately
551 * delete it after bind mounting it to /proc/kmsg. While FIFOs
552 * on the reading side behave very similar to /proc/kmsg,
553 * their writing side behaves differently from /dev/kmsg in
554 * that writing blocks when nothing is reading. In order to
555 * avoid any problems with containers deadlocking due to this
556 * we simply make /dev/kmsg unavailable to the container. */
557 if (asprintf(&from, "%s/dev/kmsg", dest) < 0 ||
558 asprintf(&to, "%s/proc/kmsg", dest) < 0)
561 if (mkfifo(from, 0600) < 0) {
562 log_error("mkfifo() for /dev/kmsg failed: %m");
566 r = chmod_and_chown(from, 0600, 0, 0);
568 log_error("Failed to correct access mode for /dev/kmsg: %s", strerror(-r));
572 if (mount(from, to, "bind", MS_BIND, NULL) < 0) {
573 log_error("Bind mount for /proc/kmsg failed: %m");
577 fd = open(from, O_RDWR|O_NDELAY|O_CLOEXEC);
579 log_error("Failed to open fifo: %m");
/* Build one control message that carries fd as SCM_RIGHTS payload. */
586 mh.msg_control = &control;
587 mh.msg_controllen = sizeof(control);
589 cmsg = CMSG_FIRSTHDR(&mh);
590 cmsg->cmsg_level = SOL_SOCKET;
591 cmsg->cmsg_type = SCM_RIGHTS;
592 cmsg->cmsg_len = CMSG_LEN(sizeof(int));
593 memcpy(CMSG_DATA(cmsg), &fd, sizeof(int));
595 mh.msg_controllen = cmsg->cmsg_len;
597 /* Store away the fd in the socket, so that it stays open as
598 * long as we run the child */
599 k = sendmsg(kmsg_socket, &mh, MSG_DONTWAIT|MSG_NOSIGNAL);
/* Our own copy of the descriptor is closed whether or not sending worked. */
600 close_nointr_nofail(fd);
603 log_error("Failed to send FIFO fd: %m");
607 /* And now make the FIFO unavailable as /dev/kmsg... */
/* Sets the hostname to the file name part of arg_directory (as returned by
 * path_get_file_name()) after passing it through hostname_cleanup(). */
612 static int setup_hostname(void) {
616 hn = path_get_file_name(arg_directory);
622 hostname_cleanup(hn);
625 if (sethostname(hn, strlen(hn)) < 0)
/* Connects the journal directories of host and container as selected by
 * arg_link_journal. The machine ID read from directory/etc/machine-id names
 * the directories: p becomes /var/log/journal/<id> on the host and q the same
 * path below directory. With LINK_GUEST p is made a symlink to q; otherwise
 * the code further down bind mounts p onto q. */
634 static int setup_journal(const char *directory) {
635 sd_id128_t machine_id;
636 char *p = NULL, *b = NULL, *l, *q = NULL, *d = NULL;
639 if (arg_link_journal == LINK_NO)
642 p = strappend(directory, "/etc/machine-id");
648 r = read_one_line_file(p, &b);
/* In LINK_AUTO mode a missing or empty machine-id file gets separate
 * handling from the error paths below (the guarded statements are not
 * visible here). */
649 if (r == -ENOENT && arg_link_journal == LINK_AUTO) {
653 log_error("Failed to read machine ID: %s", strerror(-r));
658 if (isempty(l) && arg_link_journal == LINK_AUTO) {
663 /* Verify validity */
664 r = sd_id128_from_string(l, &machine_id);
666 log_error("Failed to parse machine ID: %s", strerror(-r));
671 p = strappend("/var/log/journal/", l);
672 q = strjoin(directory, "/var/log/journal/", l, NULL);
678 if (path_is_mount_point(p, false) > 0 ||
679 path_is_mount_point(q, false) > 0) {
680 if (arg_link_journal != LINK_AUTO) {
681 log_error("Journal already a mount point, refusing.");
/* Find out what currently sits at the host path p. */
690 r = readlink_and_make_absolute(p, &d);
692 if ((arg_link_journal == LINK_GUEST ||
693 arg_link_journal == LINK_AUTO) &&
703 log_error("Failed to remove symlink %s: %m", p);
707 } else if (r == -EINVAL) {
709 if (arg_link_journal == LINK_GUEST &&
712 if (errno == ENOTDIR)
713 log_error("%s already exists and is neither symlink nor directory.", p);
715 log_error("Failed to remove %s: %m", p);
721 } else if (r != -ENOENT) {
722 log_error("readlink(%s) failed: %m", p);
726 if (arg_link_journal == LINK_GUEST) {
/* Host path p becomes a symlink pointing at the guest directory q. */
728 if (symlink(q, p) < 0) {
729 log_error("Failed to symlink %s to %s: %m", q, p);
740 if (arg_link_journal == LINK_HOST) {
741 r = mkdir_p(p, 0755);
/* NOTE(review): the result is kept in r but the message prints errno
 * through %m -- confirm that mkdir_p() leaves errno set on failure. */
743 log_error("Failed to create %s: %m", p);
747 } else if (access(p, F_OK) < 0) {
752 if (dir_is_empty(q) == 0) {
753 log_error("%s not empty.", q);
758 r = mkdir_p(q, 0755);
760 log_error("Failed to create %s: %m", q);
764 if (mount(p, q, "bind", MS_BIND, NULL) < 0) {
765 log_error("Failed to bind mount journal from host into guest: %m");
/* Passes the complement of arg_retain to capability_bounding_set_drop() and
 * returns its result, i.e. everything not marked in arg_retain is requested
 * to be dropped. The meaning of the second argument (false) is defined by
 * that helper and not visible here. */
781 static int drop_capabilities(void) {
782 return capability_bounding_set_drop(~arg_retain, false);
/* Heuristic test whether path holds an OS tree, based on path/bin/sh.
 * Returns 0 when r (assigned on a line not visible here) is negative and 1
 * otherwise. */
785 static int is_os_tree(const char *path) {
788 /* We use /bin/sh as flag file if something is an OS */
790 if (asprintf(&p, "%s/bin/sh", path) < 0)
796 return r < 0 ? 0 : 1;
/* Relays data between our stdin/stdout and the pty master: bytes read from
 * stdin go through in_buffer to master, bytes read from master go through
 * out_buffer to stdout. The signals in mask are received through a signalfd;
 * SIGWINCH causes the window size of stdin to be copied to master. */
799 static int process_pty(int master, sigset_t *mask) {
801 char in_buffer[LINE_MAX], out_buffer[LINE_MAX];
802 size_t in_buffer_full = 0, out_buffer_full = 0;
803 struct epoll_event stdin_ev, stdout_ev, master_ev, signal_ev;
804 bool stdin_readable = false, stdout_writable = false, master_readable = false, master_writable = false;
805 int ep = -1, signal_fd = -1, r;
807 fd_nonblock(STDIN_FILENO, 1);
808 fd_nonblock(STDOUT_FILENO, 1);
809 fd_nonblock(master, 1);
811 signal_fd = signalfd(-1, mask, SFD_NONBLOCK|SFD_CLOEXEC);
813 log_error("signalfd(): %m");
818 ep = epoll_create1(EPOLL_CLOEXEC);
820 log_error("Failed to create epoll: %m");
/* The three tty descriptors are registered edge-triggered (EPOLLET). That
 * is why readiness is remembered in the *_readable / *_writable flags
 * and only cleared again when a read() or write() fails with one of the
 * errno values checked further down. */
826 stdin_ev.events = EPOLLIN|EPOLLET;
827 stdin_ev.data.fd = STDIN_FILENO;
830 stdout_ev.events = EPOLLOUT|EPOLLET;
831 stdout_ev.data.fd = STDOUT_FILENO;
834 master_ev.events = EPOLLIN|EPOLLOUT|EPOLLET;
835 master_ev.data.fd = master;
838 signal_ev.events = EPOLLIN;
839 signal_ev.data.fd = signal_fd;
841 if (epoll_ctl(ep, EPOLL_CTL_ADD, STDIN_FILENO, &stdin_ev) < 0 ||
842 epoll_ctl(ep, EPOLL_CTL_ADD, STDOUT_FILENO, &stdout_ev) < 0 ||
843 epoll_ctl(ep, EPOLL_CTL_ADD, master, &master_ev) < 0 ||
844 epoll_ctl(ep, EPOLL_CTL_ADD, signal_fd, &signal_ev) < 0) {
/* NOTE(review): "regiser" in the message below is a typo for "register". */
845 log_error("Failed to regiser fds in epoll: %m");
851 struct epoll_event ev[16];
855 nfds = epoll_wait(ep, ev, ELEMENTSOF(ev), -1);
858 if (errno == EINTR || errno == EAGAIN)
861 log_error("epoll_wait(): %m");
/* Translate the reported events into the readiness flags; EPOLLHUP counts
 * as ready so that the following read()/write() observes the hangup. */
868 for (i = 0; i < nfds; i++) {
869 if (ev[i].data.fd == STDIN_FILENO) {
871 if (ev[i].events & (EPOLLIN|EPOLLHUP))
872 stdin_readable = true;
874 } else if (ev[i].data.fd == STDOUT_FILENO) {
876 if (ev[i].events & (EPOLLOUT|EPOLLHUP))
877 stdout_writable = true;
879 } else if (ev[i].data.fd == master) {
881 if (ev[i].events & (EPOLLIN|EPOLLHUP))
882 master_readable = true;
884 if (ev[i].events & (EPOLLOUT|EPOLLHUP))
885 master_writable = true;
887 } else if (ev[i].data.fd == signal_fd) {
888 struct signalfd_siginfo sfsi;
891 n = read(signal_fd, &sfsi, sizeof(sfsi));
892 if (n != sizeof(sfsi)) {
895 log_error("Failed to read from signalfd: invalid block size");
900 if (errno != EINTR && errno != EAGAIN) {
901 log_error("Failed to read from signalfd: %m");
907 if (sfsi.ssi_signo == SIGWINCH) {
910 /* The window size changed, let's forward that. */
911 if (ioctl(STDIN_FILENO, TIOCGWINSZ, &ws) >= 0)
912 ioctl(master, TIOCSWINSZ, &ws);
/* Keep copying for as long as some direction can make progress: an empty
 * buffer with a readable source, or a filled buffer with a writable sink. */
921 while ((stdin_readable && in_buffer_full <= 0) ||
922 (master_writable && in_buffer_full > 0) ||
923 (master_readable && out_buffer_full <= 0) ||
924 (stdout_writable && out_buffer_full > 0)) {
926 if (stdin_readable && in_buffer_full < LINE_MAX) {
928 k = read(STDIN_FILENO, in_buffer + in_buffer_full, LINE_MAX - in_buffer_full);
931 if (errno == EAGAIN || errno == EPIPE || errno == ECONNRESET || errno == EIO)
932 stdin_readable = false;
934 log_error("read(): %m");
939 in_buffer_full += (size_t) k;
942 if (master_writable && in_buffer_full > 0) {
944 k = write(master, in_buffer, in_buffer_full);
947 if (errno == EAGAIN || errno == EPIPE || errno == ECONNRESET || errno == EIO)
948 master_writable = false;
950 log_error("write(): %m");
/* Move the bytes that were not written to the front of the buffer. */
956 assert(in_buffer_full >= (size_t) k);
957 memmove(in_buffer, in_buffer + k, in_buffer_full - k);
962 if (master_readable && out_buffer_full < LINE_MAX) {
964 k = read(master, out_buffer + out_buffer_full, LINE_MAX - out_buffer_full);
967 if (errno == EAGAIN || errno == EPIPE || errno == ECONNRESET || errno == EIO)
968 master_readable = false;
970 log_error("read(): %m");
975 out_buffer_full += (size_t) k;
978 if (stdout_writable && out_buffer_full > 0) {
980 k = write(STDOUT_FILENO, out_buffer, out_buffer_full);
983 if (errno == EAGAIN || errno == EPIPE || errno == ECONNRESET || errno == EIO)
984 stdout_writable = false;
986 log_error("write(): %m");
992 assert(out_buffer_full >= (size_t) k);
993 memmove(out_buffer, out_buffer + k, out_buffer_full - k);
994 out_buffer_full -= k;
1002 close_nointr_nofail(ep);
1005 close_nointr_nofail(signal_fd);
/* Program entry point. Checks the preconditions (root, systemd host, usable
 * root directory), creates a cgroup and a pty, then clones a child into new
 * namespaces. The child prepares the container root and executes the payload;
 * the parent relays the pty through process_pty(), waits for the child and
 * reports how the container ended.
 *
 * The error messages for setresgid()/setresuid() name exactly those calls,
 * so that a failure is not attributed to setregid()/setreuid(). */
1010 int main(int argc, char *argv[]) {
1012 int r = EXIT_FAILURE, k;
1013 char *oldcg = NULL, *newcg = NULL;
1014 char **controller = NULL;
1016 const char *console = NULL;
1017 struct termios saved_attr, raw_attr;
1019 bool saved_attr_valid = false;
1021 int kmsg_socket_pair[2] = { -1, -1 };
1023 log_parse_environment();
1026 r = parse_argv(argc, argv);
/* An explicitly given directory is made absolute; get_current_dir_name()
 * supplies the directory on the other path. */
1030 if (arg_directory) {
1033 p = path_make_absolute_cwd(arg_directory);
1034 free(arg_directory);
1037 arg_directory = get_current_dir_name();
1039 if (!arg_directory) {
1040 log_error("Failed to determine path");
1044 path_kill_slashes(arg_directory);
1046 if (geteuid() != 0) {
1047 log_error("Need to be root.");
1051 if (sd_booted() <= 0) {
1052 log_error("Not running on a systemd system.");
1056 if (path_equal(arg_directory, "/")) {
1057 log_error("Spawning container on root directory not supported.");
1061 if (is_os_tree(arg_directory) <= 0) {
1062 log_error("Directory %s doesn't look like an OS root directory. Refusing.", arg_directory);
/* Create a cgroup named nspawn-<pid> below our current one and attach to it. */
1066 k = cg_get_by_pid(SYSTEMD_CGROUP_CONTROLLER, 0, &oldcg);
1068 log_error("Failed to determine current cgroup: %s", strerror(-k));
1072 if (asprintf(&newcg, "%s/nspawn-%lu", oldcg, (unsigned long) getpid()) < 0) {
1073 log_error("Failed to allocate cgroup path.");
1077 k = cg_create_and_attach(SYSTEMD_CGROUP_CONTROLLER, newcg, 0);
1079 log_error("Failed to create cgroup: %s", strerror(-k));
/* Failures in the additionally requested controllers are only warned about. */
1083 STRV_FOREACH(controller, arg_controllers) {
1084 k = cg_create_and_attach(*controller, newcg, 0);
1086 log_warning("Failed to create cgroup in controller %s: %s", *controller, strerror(-k));
/* Allocate the pty whose slave side becomes the container's console. */
1089 master = posix_openpt(O_RDWR|O_NOCTTY|O_CLOEXEC|O_NDELAY);
1091 log_error("Failed to acquire pseudo tty: %m");
1095 console = ptsname(master);
1097 log_error("Failed to determine tty name: %m");
1101 log_info("Spawning namespace container on %s (console is %s).", arg_directory, console);
1103 if (ioctl(STDIN_FILENO, TIOCGWINSZ, &ws) >= 0)
1104 ioctl(master, TIOCSWINSZ, &ws);
1106 if (unlockpt(master) < 0) {
1107 log_error("Failed to unlock tty: %m");
/* Remember the terminal settings so they can be restored on the way out. */
1111 if (tcgetattr(STDIN_FILENO, &saved_attr) < 0) {
1112 log_error("Failed to get terminal attributes: %m");
1116 saved_attr_valid = true;
1118 raw_attr = saved_attr;
1119 cfmakeraw(&raw_attr);
1120 raw_attr.c_lflag &= ~ECHO;
1122 if (socketpair(AF_UNIX, SOCK_DGRAM|SOCK_NONBLOCK|SOCK_CLOEXEC, 0, kmsg_socket_pair) < 0) {
1123 log_error("Failed to create kmsg socket pair");
/* These signals are blocked here; the same mask is later handed to
 * process_pty(). */
1127 assert_se(sigemptyset(&mask) == 0);
1128 sigset_add_many(&mask, SIGCHLD, SIGWINCH, SIGTERM, SIGINT, -1);
1129 assert_se(sigprocmask(SIG_BLOCK, &mask, NULL) == 0);
1134 if (tcsetattr(STDIN_FILENO, TCSANOW, &raw_attr) < 0) {
1135 log_error("Failed to set terminal attributes: %m");
/* New IPC, mount, PID and UTS namespaces always; a network namespace only
 * when a private network was requested. */
1139 pid = syscall(__NR_clone, SIGCHLD|CLONE_NEWIPC|CLONE_NEWNS|CLONE_NEWPID|CLONE_NEWUTS|(arg_private_network ? CLONE_NEWNET : 0), NULL);
1141 if (errno == EINVAL)
1142 log_error("clone() failed, do you have namespace support enabled in your kernel? (You need UTS, IPC, PID and NET namespacing built in): %m");
1144 log_error("clone() failed: %m");
1152 const char *home = NULL;
1153 uid_t uid = (uid_t) -1;
1154 gid_t gid = (gid_t) -1;
/* Environment for the payload; slots 2 to 6 of the array are filled in
 * by the assignments further down. */
1155 const char *envp[] = {
1156 "PATH=/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin",
1157 "container=systemd-nspawn", /* LXC sets container=lxc, so follow the scheme here */
1162 NULL, /* container_uuid */
1166 envp[2] = strv_find_prefix(environ, "TERM=");
1168 close_nointr_nofail(master);
1170 close_nointr(STDIN_FILENO);
1171 close_nointr(STDOUT_FILENO);
1172 close_nointr(STDERR_FILENO);
1174 close_all_fds(&kmsg_socket_pair[1], 1);
1176 reset_all_signal_handlers();
1178 assert_se(sigemptyset(&mask) == 0);
1179 assert_se(sigprocmask(SIG_SETMASK, &mask, NULL) == 0);
/* With fds 0 to 2 closed above, the console must come back as fd 0; it
 * is then duplicated onto stdout and stderr. */
1181 if (open_terminal(console, O_RDWR) != STDIN_FILENO ||
1182 dup2(STDIN_FILENO, STDOUT_FILENO) != STDOUT_FILENO ||
1183 dup2(STDIN_FILENO, STDERR_FILENO) != STDERR_FILENO)
1187 log_error("setsid() failed: %m");
1191 if (prctl(PR_SET_PDEATHSIG, SIGKILL) < 0) {
1192 log_error("PR_SET_PDEATHSIG failed: %m");
1196 /* Mark everything as slave, so that we still
1197 * receive mounts from the real root, but don't
1198 * propagate mounts to the real root. */
1199 if (mount(NULL, "/", NULL, MS_SLAVE|MS_REC, NULL) < 0) {
1200 log_error("MS_SLAVE|MS_REC failed: %m");
1204 /* Turn directory into bind mount */
1205 if (mount(arg_directory, arg_directory, "bind", MS_BIND|MS_REC, NULL) < 0) {
1206 log_error("Failed to make bind mount.");
1211 if (mount(arg_directory, arg_directory, "bind", MS_BIND|MS_REMOUNT|MS_RDONLY|MS_REC, NULL) < 0) {
1212 log_error("Failed to make read-only.");
/* Populate the container root: API file systems, device nodes, console,
 * kmsg, boot ID, timezone, resolv.conf and journal, in that order. */
1216 if (mount_all(arg_directory) < 0)
1219 if (copy_devnodes(arg_directory) < 0)
1222 dev_setup(arg_directory);
1224 if (setup_dev_console(arg_directory, console) < 0)
1227 if (setup_kmsg(arg_directory, kmsg_socket_pair[1]) < 0)
1230 close_nointr_nofail(kmsg_socket_pair[1]);
1232 if (setup_boot_id(arg_directory) < 0)
1235 if (setup_timezone(arg_directory) < 0)
1238 if (setup_resolv_conf(arg_directory) < 0)
1241 if (setup_journal(arg_directory) < 0)
/* Switch root: move the prepared tree onto / and chroot into it. */
1244 if (chdir(arg_directory) < 0) {
1245 log_error("chdir(%s) failed: %m", arg_directory);
1249 if (mount(arg_directory, "/", NULL, MS_MOVE, NULL) < 0) {
1250 log_error("mount(MS_MOVE) failed: %m");
1254 if (chroot(".") < 0) {
1255 log_error("chroot() failed: %m");
1259 if (chdir("/") < 0) {
1260 log_error("chdir() failed: %m");
1268 if (drop_capabilities() < 0) {
1269 log_error("drop_capabilities() failed: %m");
/* Resolve the requested user inside the container and switch to it. */
1275 if (get_user_creds((const char**)&arg_user, &uid, &gid, &home, NULL) < 0) {
1276 log_error("get_user_creds() failed: %m");
1280 if (mkdir_parents_label(home, 0775) < 0) {
1281 log_error("mkdir_parents_label() failed: %m");
1285 if (mkdir_safe_label(home, 0775, uid, gid) < 0) {
1286 log_error("mkdir_safe_label() failed: %m");
1290 if (initgroups((const char*)arg_user, gid) < 0) {
1291 log_error("initgroups() failed: %m");
/* Group IDs are changed before user IDs. */
1295 if (setresgid(gid, gid, gid) < 0) {
1296 log_error("setresgid() failed: %m");
1300 if (setresuid(uid, uid, uid) < 0) {
1301 log_error("setresuid() failed: %m");
1306 if ((asprintf((char**)(envp + 3), "HOME=%s", home ? home: "/root") < 0) ||
1307 (asprintf((char**)(envp + 4), "USER=%s", arg_user ? arg_user : "root") < 0) ||
1308 (asprintf((char**)(envp + 5), "LOGNAME=%s", arg_user ? arg_user : "root") < 0)) {
1314 if (asprintf((char**)(envp + 6), "container_uuid=%s", arg_uuid) < 0) {
1326 /* Automatically search for the init system */
1328 l = 1 + argc - optind;
1329 a = newa(char*, l + 1);
1330 memcpy(a + 1, argv + optind, l * sizeof(char*));
/* execve() only returns on failure, so each call below is a fallback
 * for the one before it. */
1332 a[0] = (char*) "/usr/lib/systemd/systemd";
1333 execve(a[0], a, (char**) envp);
1335 a[0] = (char*) "/lib/systemd/systemd";
1336 execve(a[0], a, (char**) envp);
1338 a[0] = (char*) "/sbin/init";
1339 execve(a[0], a, (char**) envp);
1340 } else if (argc > optind)
1341 execvpe(argv[optind], argv + optind, (char**) envp);
1343 chdir(home ? home : "/root");
1344 execle("/bin/bash", "-bash", NULL, (char**) envp);
1347 log_error("execv() failed: %m");
1350 _exit(EXIT_FAILURE);
/* Parent: relay the pty, then restore the terminal and collect the child. */
1353 if (process_pty(master, &mask) < 0)
1357 if (saved_attr_valid)
1358 tcsetattr(STDIN_FILENO, TCSANOW, &saved_attr);
1360 r = wait_for_terminate(pid, &status);
1366 if (status.si_code == CLD_EXITED) {
1367 if (status.si_status != 0) {
1368 log_error("Container failed with error code %i.", status.si_status);
1369 r = status.si_status;
1373 log_debug("Container exited successfully.");
/* Termination by SIGINT is reported as shutdown, by SIGHUP as reboot. */
1375 } else if (status.si_code == CLD_KILLED &&
1376 status.si_status == SIGINT) {
1377 log_info("Container has been shut down.");
1380 } else if (status.si_code == CLD_KILLED &&
1381 status.si_status == SIGHUP) {
1382 log_info("Container is being rebooted.");
1384 } else if (status.si_code == CLD_KILLED ||
1385 status.si_code == CLD_DUMPED) {
1387 log_error("Container terminated by signal %s.", signal_to_string(status.si_status));
1391 log_error("Container failed due to unknown reason.");
/* Cleanup: terminal settings, descriptors, cgroup membership, memory. */
1398 if (saved_attr_valid)
1399 tcsetattr(STDIN_FILENO, TCSANOW, &saved_attr);
1402 close_nointr_nofail(master);
1404 close_pipe(kmsg_socket_pair);
1407 cg_attach(SYSTEMD_CGROUP_CONTROLLER, oldcg, 0);
1410 cg_kill_recursive_and_wait(SYSTEMD_CGROUP_CONTROLLER, newcg, true);
1412 free(arg_directory);
1413 strv_free(arg_controllers);