1 /*-*- Mode: C; c-basic-offset: 8; indent-tabs-mode: nil -*-*/
4 This file is part of systemd.
6 Copyright 2010 Lennart Poettering
8 systemd is free software; you can redistribute it and/or modify it
9 under the terms of the GNU Lesser General Public License as published by
10 the Free Software Foundation; either version 2.1 of the License, or
11 (at your option) any later version.
13 systemd is distributed in the hope that it will be useful, but
14 WITHOUT ANY WARRANTY; without even the implied warranty of
15 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
16 Lesser General Public License for more details.
18 You should have received a copy of the GNU Lesser General Public License
19 along with systemd; If not, see <http://www.gnu.org/licenses/>.
25 #include <sys/types.h>
26 #include <sys/syscall.h>
27 #include <sys/mount.h>
33 #include <sys/prctl.h>
34 #include <sys/capability.h>
36 #include <sys/epoll.h>
38 #include <sys/signalfd.h>
42 #include <sys/socket.h>
44 #include <systemd/sd-daemon.h>
51 #include "cgroup-util.h"
53 #include "path-util.h"
54 #include "loopback-setup.h"
56 #include "dev-setup.h"
/* Journal link mode selected with --link-journal= / -j. Only the opening line
 * of the typedef is part of this view; the enumerators referenced elsewhere in
 * this file are LINK_NO, LINK_AUTO, LINK_GUEST and LINK_HOST. */
58 typedef enum LinkJournal {
/* Command line settings; parse_argv() fills them in and the rest of the file
 * reads them. */
65 static char *arg_directory = NULL;
66 static char *arg_user = NULL;
67 static char **arg_controllers = NULL;
68 static char *arg_uuid = NULL;
69 static bool arg_private_network = false;
70 static bool arg_read_only = false;
71 static bool arg_boot = false;
72 static LinkJournal arg_link_journal = LINK_AUTO;
/* Bit mask with one bit per CAP_* number. drop_capabilities() passes the
 * complement of this mask to capability_bounding_set_drop(), and
 * --capability= ORs additional bits into it. */
73 static uint64_t arg_retain =
75 (1ULL << CAP_DAC_OVERRIDE) |
76 (1ULL << CAP_DAC_READ_SEARCH) |
77 (1ULL << CAP_FOWNER) |
78 (1ULL << CAP_FSETID) |
79 (1ULL << CAP_IPC_OWNER) |
82 (1ULL << CAP_LINUX_IMMUTABLE) |
83 (1ULL << CAP_NET_BIND_SERVICE) |
84 (1ULL << CAP_NET_BROADCAST) |
85 (1ULL << CAP_NET_RAW) |
86 (1ULL << CAP_SETGID) |
87 (1ULL << CAP_SETFCAP) |
88 (1ULL << CAP_SETPCAP) |
89 (1ULL << CAP_SETUID) |
90 (1ULL << CAP_SYS_ADMIN) |
91 (1ULL << CAP_SYS_CHROOT) |
92 (1ULL << CAP_SYS_NICE) |
93 (1ULL << CAP_SYS_PTRACE) |
94 (1ULL << CAP_SYS_TTY_CONFIG) |
95 (1ULL << CAP_SYS_RESOURCE) |
96 (1ULL << CAP_SYS_BOOT);
/* Prints the usage summary with printf(), substituting
 * program_invocation_short_name for the program name.
 *
 * NOTE(review): the text below says -j is equivalent to --link-journal=host,
 * while parse_argv() has an assignment of LINK_GUEST immediately before its
 * ARG_LINK_JOURNAL case, which looks like the -j handler. Confirm which of the
 * two is intended and align them. */
98 static int help(void) {
100 printf("%s [OPTIONS...] [PATH] [ARGUMENTS...]\n\n"
101 "Spawn a minimal namespace container for debugging, testing and building.\n\n"
102 " -h --help Show this help\n"
103 " -D --directory=NAME Root directory for the container\n"
104 " -b --boot Boot up full system (i.e. invoke init)\n"
105 " -u --user=USER Run the command under specified user or uid\n"
106 " -C --controllers=LIST Put the container in specified comma-separated cgroup hierarchies\n"
107 " --uuid=UUID Set a specific machine UUID for the container\n"
108 " --private-network Disable network in container\n"
109 " --read-only Mount the root directory read-only\n"
110 " --capability=CAP In addition to the default, retain specified capability\n"
111 " --link-journal=MODE Link up guest journal, one of no, auto, guest, host\n"
112 " -j Equivalent to --link-journal=host\n",
113 program_invocation_short_name);
/* Parses the command line with getopt_long() and stores the result in the
 * file-scope arg_* variables.
 *
 * argc, argv: the process arguments as received by main().
 *
 * The case labels and return statements are not part of this view; main()
 * assigns the return value to r directly after the call. */
118 static int parse_argv(int argc, char *argv[]) {
/* Codes for options that only have a long form. They start at 0x100, which is
 * outside the range of the single-character option codes used below. */
121 ARG_PRIVATE_NETWORK = 0x100,
128 static const struct option options[] = {
129 { "help", no_argument, NULL, 'h' },
130 { "directory", required_argument, NULL, 'D' },
131 { "user", required_argument, NULL, 'u' },
132 { "controllers", required_argument, NULL, 'C' },
133 { "private-network", no_argument, NULL, ARG_PRIVATE_NETWORK },
134 { "boot", no_argument, NULL, 'b' },
135 { "uuid", required_argument, NULL, ARG_UUID },
136 { "read-only", no_argument, NULL, ARG_READ_ONLY },
137 { "capability", required_argument, NULL, ARG_CAPABILITY },
138 { "link-journal", required_argument, NULL, ARG_LINK_JOURNAL },
/* The leading '+' in the option string makes getopt_long() stop at the first
 * non-option argument, so the arguments that main() later takes from
 * argv + optind are not scanned for options. */
147 while ((c = getopt_long(argc, argv, "+hD:u:C:bj", options, NULL)) >= 0) {
/* Root directory: stored in canonicalized form. */
157 arg_directory = canonicalize_file_name(optarg);
158 if (!arg_directory) {
159 log_error("Failed to canonicalize root directory.");
/* User: a private copy of the option argument is kept. */
167 if (!(arg_user = strdup(optarg))) {
168 log_error("Failed to duplicate user name.");
/* Controllers: any list from an earlier occurrence is freed and replaced by
 * the comma-separated list given now; strv_uniq() presumably removes
 * duplicate entries. */
175 strv_free(arg_controllers);
176 arg_controllers = strv_split(optarg, ",");
177 if (!arg_controllers) {
178 log_error("Failed to split controllers list.");
181 strv_uniq(arg_controllers);
185 case ARG_PRIVATE_NETWORK:
186 arg_private_network = true;
198 arg_read_only = true;
/* Capabilities: every comma-separated word is copied, resolved with
 * cap_from_name(), and its bit is added to arg_retain. */
201 case ARG_CAPABILITY: {
205 FOREACH_WORD_SEPARATOR(word, length, optarg, ",", state) {
209 t = strndup(word, length);
213 if (cap_from_name(t, &cap) < 0) {
214 log_error("Failed to parse capability %s.", t);
220 arg_retain |= 1ULL << (uint64_t) cap;
/* NOTE(review): presumably the body of the -j case (its label is not in this
 * view). It selects LINK_GUEST although help() documents -j as equivalent to
 * --link-journal=host. Confirm which is intended. */
227 arg_link_journal = LINK_GUEST;
230 case ARG_LINK_JOURNAL:
231 if (streq(optarg, "auto"))
232 arg_link_journal = LINK_AUTO;
233 else if (streq(optarg, "no"))
234 arg_link_journal = LINK_NO;
235 else if (streq(optarg, "guest"))
236 arg_link_journal = LINK_GUEST;
237 else if (streq(optarg, "host"))
238 arg_link_journal = LINK_HOST;
240 log_error("Failed to parse link journal mode %s", optarg);
250 log_error("Unknown option code %c", c);
/* Mounts the file systems listed in mount_table below the directory dest.
 *
 * dest: path that is prepended to every entry's mount point.
 *
 * The return statements are not part of this view; main() treats a negative
 * result as failure. */
258 static int mount_all(const char *dest) {
260 typedef struct MountPoint {
/* The struct members are not visible here. Judging from the accesses further
 * down (.what, .where, .flags, .options, .fatal) the columns are presumably:
 * what, where, type, options, flags, fatal. TODO confirm against the struct. */
269 static const MountPoint mount_table[] = {
270 { "proc", "/proc", "proc", NULL, MS_NOSUID|MS_NOEXEC|MS_NODEV, true },
271 { "/proc/sys", "/proc/sys", NULL, NULL, MS_BIND, true }, /* Bind mount first */
272 { NULL, "/proc/sys", NULL, NULL, MS_BIND|MS_RDONLY|MS_REMOUNT, true }, /* Then, make it r/o */
273 { "sysfs", "/sys", "sysfs", NULL, MS_RDONLY|MS_NOSUID|MS_NOEXEC|MS_NODEV, true },
274 { "tmpfs", "/dev", "tmpfs", "mode=755", MS_NOSUID|MS_STRICTATIME, true },
275 { "/dev/pts", "/dev/pts", NULL, NULL, MS_BIND, true },
276 { "tmpfs", "/run", "tmpfs", "mode=755", MS_NOSUID|MS_NODEV|MS_STRICTATIME, true },
278 { "/sys/fs/selinux", "/sys/fs/selinux", NULL, NULL, MS_BIND, false }, /* Bind mount first */
279 { NULL, "/sys/fs/selinux", NULL, NULL, MS_BIND|MS_RDONLY|MS_REMOUNT, false }, /* Then, make it r/o */
287 for (k = 0; k < ELEMENTSOF(mount_table); k++) {
/* Full path of the mount point inside the container tree. */
290 if (asprintf(&where, "%s/%s", dest, mount_table[k].where) < 0) {
299 t = path_is_mount_point(where, true);
301 log_error("Failed to detect whether %s is a mount point: %s", where, strerror(-t));
/* Entries without a source (what == NULL) are the remount steps of the table;
 * they are the only ones still processed when the path is already mounted. */
310 /* Skip this entry if it is not a remount. */
311 if (mount_table[k].what && t > 0)
314 mkdir_p_label(where, 0755);
/* A failing mount() is only reported for entries flagged as fatal. */
316 if (mount(mount_table[k].what,
319 mount_table[k].flags,
320 mount_table[k].options) < 0 &&
321 mount_table[k].fatal) {
323 log_error("mount(%s) failed: %m", where);
/* Makes the host's time zone files visible in the container tree.
 *
 * dest: root of the container tree.
 *
 * /etc/localtime and /etc/timezone of the host are bind mounted over the same
 * paths below dest. The read-only remount is only attempted when the first
 * mount succeeded, and a failing mount is not reported in the lines visible
 * here ("if possible"). */
335 static int setup_timezone(const char *dest) {
340 /* Fix the timezone, if possible */
341 where = strappend(dest, "/etc/localtime");
345 if (mount("/etc/localtime", where, "bind", MS_BIND, NULL) >= 0)
346 mount("/etc/localtime", where, "bind", MS_BIND|MS_REMOUNT|MS_RDONLY, NULL);
350 where = strappend(dest, "/etc/timezone");
354 if (mount("/etc/timezone", where, "bind", MS_BIND, NULL) >= 0)
355 mount("/etc/timezone", where, "bind", MS_BIND|MS_REMOUNT|MS_RDONLY, NULL);
/* Bind mounts the host's /etc/resolv.conf over the same path below dest and
 * then tries to remount it read-only.
 *
 * dest: root of the container tree.
 *
 * The statement guarded by the arg_private_network test is not part of this
 * view; presumably the function returns early in that case - TODO confirm. */
362 static int setup_resolv_conf(const char *dest) {
367 if (arg_private_network)
370 /* Fix resolv.conf, if possible */
371 where = strappend(dest, "/etc/resolv.conf");
375 if (mount("/etc/resolv.conf", where, "bind", MS_BIND, NULL) >= 0)
376 mount("/etc/resolv.conf", where, "bind", MS_BIND|MS_REMOUNT|MS_RDONLY, NULL);
/* Gives the container its own boot ID.
 *
 * dest: root of the container tree.
 *
 * A random 128-bit ID is formatted as a UUID string, written to the file
 * <dest>/dev/proc-sys-kernel-random-boot-id, and that file is bind mounted
 * over <dest>/proc/sys/kernel/random/boot_id. Cleanup and return statements
 * are not part of this view. */
383 static int setup_boot_id(const char *dest) {
384 char *from = NULL, *to = NULL;
391 /* Generate a new randomized boot ID, so that each boot-up of
392 * the container gets a new one */
394 from = strappend(dest, "/dev/proc-sys-kernel-random-boot-id");
400 to = strappend(dest, "/proc/sys/kernel/random/boot_id");
406 r = sd_id128_randomize(&rnd);
408 log_error("Failed to generate random boot id: %s", strerror(-r));
/* Format the 16 ID bytes in the 8-4-4-4-12 hex digit UUID layout. */
412 snprintf(as_uuid, sizeof(as_uuid),
413 "%02x%02x%02x%02x-%02x%02x-%02x%02x-%02x%02x-%02x%02x%02x%02x%02x%02x",
414 SD_ID128_FORMAT_VAL(rnd));
415 char_array_0(as_uuid);
417 r = write_one_line_file(from, as_uuid);
419 log_error("Failed to write boot id: %s", strerror(-r));
423 if (mount(from, to, "bind", MS_BIND, NULL) < 0) {
424 log_error("Failed to bind mount boot id: %m");
/* The result of the read-only remount is not checked. */
427 mount(from, to, "bind", MS_BIND|MS_REMOUNT|MS_RDONLY, NULL);
/* Recreates a fixed list of host device nodes below <dest>/dev.
 *
 * dest: root of the container tree.
 *
 * For every name in devnodes, the host node /dev/<name> is examined with
 * stat() and a node with the same mode and device number is created at
 * <dest>/dev/<name> with mknod(). A missing host node (ENOENT) does not
 * produce an error message. The contents of the devnodes list and the return
 * statements are not part of this view. */
438 static int copy_devnodes(const char *dest) {
440 static const char devnodes[] =
457 NULSTR_FOREACH(d, devnodes) {
459 char *from = NULL, *to = NULL;
461 asprintf(&from, "/dev/%s", d);
462 asprintf(&to, "%s/dev/%s", dest, d);
465 log_error("Failed to allocate devnode path");
478 if (stat(from, &st) < 0) {
480 if (errno != ENOENT) {
481 log_error("Failed to stat %s: %m", from);
486 } else if (!S_ISCHR(st.st_mode) && !S_ISBLK(st.st_mode)) {
488 log_error("%s is not a char or block device, cannot copy.", from);
492 } else if (mknod(to, st.st_mode, st.st_rdev) < 0) {
/* NOTE(review): the message prints dest (the container root) although the
 * path handed to mknod() is to; the message probably should name to. */
494 log_error("mknod(%s) failed: %m", dest);
/* Provides <dest>/dev/console, backed by the terminal named by console.
 *
 * dest:    root of the container tree.
 * console: path of the terminal device; main() passes the ptsname() of the
 *          pty master it opened.
 *
 * The terminal must be a character device. Its access mode and ownership are
 * set with chmod_and_chown(console, 0600, 0, 0), a device node is created at
 * <dest>/dev/console, and console is bind mounted on top of that node.
 *
 * NOTE(review): in this listing the long comment inside the function has lost
 * its terminating line; restore it from the upstream file before compiling. */
508 static int setup_dev_console(const char *dest, const char *console) {
519 if (stat(console, &st) < 0) {
520 log_error("Failed to stat %s: %m", console);
524 } else if (!S_ISCHR(st.st_mode)) {
525 log_error("/dev/console is not a char device.");
530 r = chmod_and_chown(console, 0600, 0, 0);
532 log_error("Failed to correct access mode for TTY: %s", strerror(-r));
536 if (asprintf(&to, "%s/dev/console", dest) < 0) {
541 /* We need to bind mount the right tty to /dev/console since
542 * ptys can only exist on pts file systems. To have something
543 * to bind mount things on we create a device node first, that
544 * has the right major/minor (note that the major minor
545 * doesn't actually matter here, since we mount it over
548 if (mknod(to, (st.st_mode & ~07777) | 0600, st.st_rdev) < 0) {
549 log_error("mknod() for /dev/console failed: %m");
554 if (mount(console, to, "bind", MS_BIND, NULL) < 0) {
555 log_error("Bind mount for /dev/console failed: %m");
/* Sets up a FIFO that stands in for the kernel log inside the container.
 *
 * dest:        root of the container tree.
 * kmsg_socket: socket the FIFO file descriptor is sent to; must be >= 0.
 *
 * A FIFO is created at <dest>/dev/kmsg, bind mounted to <dest>/proc/kmsg,
 * opened, and the resulting descriptor is passed over kmsg_socket as
 * SCM_RIGHTS ancillary data. Cleanup and return statements are not part of
 * this view. */
567 static int setup_kmsg(const char *dest, int kmsg_socket) {
568 char *from = NULL, *to = NULL;
572 struct cmsghdr cmsghdr;
573 uint8_t buf[CMSG_SPACE(sizeof(int))];
576 struct cmsghdr *cmsg;
579 assert(kmsg_socket >= 0);
583 /* We create the kmsg FIFO as /dev/kmsg, but immediately
584 * delete it after bind mounting it to /proc/kmsg. While FIFOs
585 * on the reading side behave very similar to /proc/kmsg,
586 * their writing side behaves differently from /dev/kmsg in
587 * that writing blocks when nothing is reading. In order to
588 * avoid any problems with containers deadlocking due to this
589 * we simply make /dev/kmsg unavailable to the container. */
590 if (asprintf(&from, "%s/dev/kmsg", dest) < 0) {
595 if (asprintf(&to, "%s/proc/kmsg", dest) < 0) {
600 if (mkfifo(from, 0600) < 0) {
601 log_error("mkfifo() for /dev/kmsg failed: %m");
606 r = chmod_and_chown(from, 0600, 0, 0);
608 log_error("Failed to correct access mode for /dev/kmsg: %s", strerror(-r));
612 if (mount(from, to, "bind", MS_BIND, NULL) < 0) {
613 log_error("Bind mount for /proc/kmsg failed: %m");
/* Opened for reading and writing, non-blocking. */
618 fd = open(from, O_RDWR|O_NDELAY|O_CLOEXEC);
620 log_error("Failed to open fifo: %m");
/* Build a message whose only content is one control header of type
 * SCM_RIGHTS carrying fd. */
628 mh.msg_control = &control;
629 mh.msg_controllen = sizeof(control);
631 cmsg = CMSG_FIRSTHDR(&mh);
632 cmsg->cmsg_level = SOL_SOCKET;
633 cmsg->cmsg_type = SCM_RIGHTS;
634 cmsg->cmsg_len = CMSG_LEN(sizeof(int));
635 memcpy(CMSG_DATA(cmsg), &fd, sizeof(int));
637 mh.msg_controllen = cmsg->cmsg_len;
639 /* Store away the fd in the socket, so that it stays open as
640 * long as we run the child */
641 k = sendmsg(kmsg_socket, &mh, MSG_DONTWAIT|MSG_NOSIGNAL);
/* The local descriptor is closed right after the send attempt, before the
 * result k is examined. */
642 close_nointr_nofail(fd);
645 log_error("Failed to send FIFO fd: %m");
650 /* And now make the FIFO unavailable as /dev/kmsg... */
/* Sets the host name from the last path component of arg_directory, after
 * passing it through hostname_cleanup(). main() calls this from the code that
 * runs after clone(). Only part of the body is visible in this view. */
661 static int setup_hostname(void) {
665 hn = path_get_file_name(arg_directory);
671 hostname_cleanup(hn);
674 if (sethostname(hn, strlen(hn)) < 0)
/* Connects the journal directories of host and container as selected by
 * arg_link_journal.
 *
 * directory: root of the container tree.
 *
 * The machine ID is read from <directory>/etc/machine-id. After that
 *   p = /var/log/journal/<machine-id>              (host side)
 *   q = <directory>/var/log/journal/<machine-id>   (container side)
 * LINK_GUEST creates p as a symlink that points to q. Otherwise p is bind
 * mounted onto q; with LINK_HOST the directory p is created first.
 * Several branches, the cleanup code and the return statements are not part
 * of this view. */
683 static int setup_journal(const char *directory) {
684 sd_id128_t machine_id;
685 char *p = NULL, *b = NULL, *l, *q = NULL, *d = NULL;
688 if (arg_link_journal == LINK_NO)
691 p = strappend(directory, "/etc/machine-id");
697 r = read_one_line_file(p, &b);
/* In LINK_AUTO mode a missing or empty machine-id file gets its own branch,
 * separate from the error reports below (branch bodies not visible here). */
698 if (r == -ENOENT && arg_link_journal == LINK_AUTO) {
702 log_error("Failed to read machine ID: %s", strerror(-r));
707 if (isempty(l) && arg_link_journal == LINK_AUTO) {
712 /* Verify validity */
713 r = sd_id128_from_string(l, &machine_id);
715 log_error("Failed to parse machine ID: %s", strerror(-r));
720 p = strappend("/var/log/journal/", l);
721 q = strjoin(directory, "/var/log/journal/", l, NULL);
/* An existing mount on either side is only reported as an error when a mode
 * other than LINK_AUTO was selected. */
727 if (path_is_mount_point(p, false) > 0 ||
728 path_is_mount_point(q, false) > 0) {
729 if (arg_link_journal != LINK_AUTO) {
730 log_error("Journal already a mount point, refusing.");
/* Find out what currently exists at the host path: a symlink, something that
 * is not a symlink (-EINVAL), or nothing (-ENOENT). */
739 r = readlink_and_make_absolute(p, &d);
741 if ((arg_link_journal == LINK_GUEST ||
742 arg_link_journal == LINK_AUTO) &&
752 log_error("Failed to remove symlink %s: %m", p);
756 } else if (r == -EINVAL) {
758 if (arg_link_journal == LINK_GUEST &&
761 if (errno == ENOTDIR)
762 log_error("%s already exists and is neither symlink nor directory.", p);
764 log_error("Failed to remove %s: %m", p);
770 } else if (r != -ENOENT) {
/* NOTE(review): the error is held in r here; %m prints errno, which may not
 * match. Other helper failures in this function use strerror(-r). */
771 log_error("readlink(%s) failed: %m", p);
775 if (arg_link_journal == LINK_GUEST) {
/* symlink(q, p) creates the host path p pointing at the container path q. */
777 if (symlink(q, p) < 0) {
778 log_error("Failed to symlink %s to %s: %m", q, p);
789 if (arg_link_journal == LINK_HOST) {
/* NOTE(review): as above, the mkdir_p() result is in r while the messages for
 * p and q below use %m - confirm that errno is set by the helper. */
790 r = mkdir_p(p, 0755);
792 log_error("Failed to create %s: %m", p);
796 } else if (access(p, F_OK) < 0) {
/* The container-side directory must be empty before it is mounted over. */
801 if (dir_is_empty(q) == 0) {
802 log_error("%s not empty.", q);
807 r = mkdir_p(q, 0755);
809 log_error("Failed to create %s: %m", q);
813 if (mount(p, q, "bind", MS_BIND, NULL) < 0) {
814 log_error("Failed to bind mount journal from host into guest: %m");
/* Drops every capability whose bit is not set in arg_retain from the bounding
 * set by handing the inverted mask to capability_bounding_set_drop(), and
 * returns that function's result. */
830 static int drop_capabilities(void) {
831 return capability_bounding_set_drop(~arg_retain, false);
/* Heuristic test whether path is the root of an OS tree.
 *
 * path: directory to examine.
 *
 * Builds <path>/bin/sh and returns 1 when the check on it succeeded (r >= 0),
 * otherwise 0. The check that assigns r is not part of this view; the
 * asprintf() failure branch is not visible either. */
834 static int is_os_tree(const char *path) {
837 /* We use /bin/sh as flag file if something is an OS */
839 if (asprintf(&p, "%s/bin/sh", path) < 0)
845 return r < 0 ? 0 : 1;
/* Forwards data between the caller's terminal and the container's pty.
 *
 * master: file descriptor of the pty master.
 * mask:   signal set that is received through a signalfd.
 *
 * Bytes read from stdin are written to master and bytes read from master are
 * written to stdout, each direction through its own LINE_MAX sized buffer.
 * A SIGWINCH received on the signalfd causes the window size of stdin to be
 * copied to master. The handling of other signals, the loop exit and the
 * return statements are not part of this view; main() treats a negative
 * result as failure. */
848 static int process_pty(int master, sigset_t *mask) {
850 char in_buffer[LINE_MAX], out_buffer[LINE_MAX];
851 size_t in_buffer_full = 0, out_buffer_full = 0;
852 struct epoll_event stdin_ev, stdout_ev, master_ev, signal_ev;
853 bool stdin_readable = false, stdout_writable = false, master_readable = false, master_writable = false;
854 int ep = -1, signal_fd = -1, r;
/* All three data descriptors are switched to non-blocking mode; the loop
 * below relies on EAGAIN to learn that a descriptor is drained or full. */
856 fd_nonblock(STDIN_FILENO, 1);
857 fd_nonblock(STDOUT_FILENO, 1);
858 fd_nonblock(master, 1);
860 signal_fd = signalfd(-1, mask, SFD_NONBLOCK|SFD_CLOEXEC);
862 log_error("signalfd(): %m");
867 ep = epoll_create1(EPOLL_CLOEXEC);
869 log_error("Failed to create epoll: %m");
/* The data descriptors are registered edge-triggered (EPOLLET), so readiness
 * is remembered in the *_readable and *_writable flags until a read or write
 * fails. The signalfd is registered without EPOLLET. */
875 stdin_ev.events = EPOLLIN|EPOLLET;
876 stdin_ev.data.fd = STDIN_FILENO;
879 stdout_ev.events = EPOLLOUT|EPOLLET;
880 stdout_ev.data.fd = STDOUT_FILENO;
883 master_ev.events = EPOLLIN|EPOLLOUT|EPOLLET;
884 master_ev.data.fd = master;
887 signal_ev.events = EPOLLIN;
888 signal_ev.data.fd = signal_fd;
/* NOTE(review): "regiser" in the message below is a typo for "register"; it
 * is a runtime string and is left unchanged here. */
890 if (epoll_ctl(ep, EPOLL_CTL_ADD, STDIN_FILENO, &stdin_ev) < 0 ||
891 epoll_ctl(ep, EPOLL_CTL_ADD, STDOUT_FILENO, &stdout_ev) < 0 ||
892 epoll_ctl(ep, EPOLL_CTL_ADD, master, &master_ev) < 0 ||
893 epoll_ctl(ep, EPOLL_CTL_ADD, signal_fd, &signal_ev) < 0) {
894 log_error("Failed to regiser fds in epoll: %m");
900 struct epoll_event ev[16];
904 nfds = epoll_wait(ep, ev, ELEMENTSOF(ev), -1);
907 if (errno == EINTR || errno == EAGAIN)
910 log_error("epoll_wait(): %m");
/* Translate the reported events into the readiness flags. EPOLLHUP also sets
 * the flag, so the next read or write on that descriptor is still attempted. */
917 for (i = 0; i < nfds; i++) {
918 if (ev[i].data.fd == STDIN_FILENO) {
920 if (ev[i].events & (EPOLLIN|EPOLLHUP))
921 stdin_readable = true;
923 } else if (ev[i].data.fd == STDOUT_FILENO) {
925 if (ev[i].events & (EPOLLOUT|EPOLLHUP))
926 stdout_writable = true;
928 } else if (ev[i].data.fd == master) {
930 if (ev[i].events & (EPOLLIN|EPOLLHUP))
931 master_readable = true;
933 if (ev[i].events & (EPOLLOUT|EPOLLHUP))
934 master_writable = true;
936 } else if (ev[i].data.fd == signal_fd) {
937 struct signalfd_siginfo sfsi;
940 n = read(signal_fd, &sfsi, sizeof(sfsi));
941 if (n != sizeof(sfsi)) {
944 log_error("Failed to read from signalfd: invalid block size");
949 if (errno != EINTR && errno != EAGAIN) {
950 log_error("Failed to read from signalfd: %m");
956 if (sfsi.ssi_signo == SIGWINCH) {
959 /* The window size changed, let's forward that. */
960 if (ioctl(STDIN_FILENO, TIOCGWINSZ, &ws) >= 0)
961 ioctl(master, TIOCSWINSZ, &ws);
/* Keep transferring while at least one step can make progress: an empty
 * buffer whose source is readable, or a non-empty buffer whose sink is
 * writable. */
970 while ((stdin_readable && in_buffer_full <= 0) ||
971 (master_writable && in_buffer_full > 0) ||
972 (master_readable && out_buffer_full <= 0) ||
973 (stdout_writable && out_buffer_full > 0)) {
/* stdin -> in_buffer */
975 if (stdin_readable && in_buffer_full < LINE_MAX) {
977 k = read(STDIN_FILENO, in_buffer + in_buffer_full, LINE_MAX - in_buffer_full);
980 if (errno == EAGAIN || errno == EPIPE || errno == ECONNRESET || errno == EIO)
981 stdin_readable = false;
983 log_error("read(): %m");
988 in_buffer_full += (size_t) k;
/* in_buffer -> master; after a partial write the unwritten tail is moved to
 * the front of the buffer. */
991 if (master_writable && in_buffer_full > 0) {
993 k = write(master, in_buffer, in_buffer_full);
996 if (errno == EAGAIN || errno == EPIPE || errno == ECONNRESET || errno == EIO)
997 master_writable = false;
999 log_error("write(): %m");
1005 assert(in_buffer_full >= (size_t) k);
1006 memmove(in_buffer, in_buffer + k, in_buffer_full - k);
1007 in_buffer_full -= k;
/* master -> out_buffer */
1011 if (master_readable && out_buffer_full < LINE_MAX) {
1013 k = read(master, out_buffer + out_buffer_full, LINE_MAX - out_buffer_full);
1016 if (errno == EAGAIN || errno == EPIPE || errno == ECONNRESET || errno == EIO)
1017 master_readable = false;
1019 log_error("read(): %m");
1024 out_buffer_full += (size_t) k;
/* out_buffer -> stdout, with the same handling of partial writes. */
1027 if (stdout_writable && out_buffer_full > 0) {
1029 k = write(STDOUT_FILENO, out_buffer, out_buffer_full);
1032 if (errno == EAGAIN || errno == EPIPE || errno == ECONNRESET || errno == EIO)
1033 stdout_writable = false;
1035 log_error("write(): %m");
1041 assert(out_buffer_full >= (size_t) k);
1042 memmove(out_buffer, out_buffer + k, out_buffer_full - k);
1043 out_buffer_full -= k;
/* Release the epoll instance and the signalfd. */
1051 close_nointr_nofail(ep);
1054 close_nointr_nofail(signal_fd);
/* Program entry point: starts a container on arg_directory and relays its
 * console.
 *
 * Visible sequence: parse the command line, check the preconditions (root,
 * systemd booted, directory is not "/" and looks like an OS tree), create a
 * cgroup for the container, open a pty, switch the local terminal to raw
 * mode, clone() a process into new namespaces that prepares the tree and
 * executes the payload, relay terminal data with process_pty(), then wait for
 * the process and report how it ended. Many error branches, jumps and the
 * final return are not part of this view. */
1059 int main(int argc, char *argv[]) {
1061 int r = EXIT_FAILURE, k;
1062 char *oldcg = NULL, *newcg = NULL;
1063 char **controller = NULL;
1065 const char *console = NULL;
1066 struct termios saved_attr, raw_attr;
1068 bool saved_attr_valid = false;
1070 int kmsg_socket_pair[2] = { -1, -1 };
1072 log_parse_environment();
1075 r = parse_argv(argc, argv);
/* Use the directory given on the command line, made absolute; the other
 * branch takes the current working directory. */
1079 if (arg_directory) {
1082 p = path_make_absolute_cwd(arg_directory);
1083 free(arg_directory);
1086 arg_directory = get_current_dir_name();
1088 if (!arg_directory) {
1089 log_error("Failed to determine path");
1093 path_kill_slashes(arg_directory);
/* Preconditions. */
1095 if (geteuid() != 0) {
1096 log_error("Need to be root.");
1100 if (sd_booted() <= 0) {
1101 log_error("Not running on a systemd system.");
1105 if (path_equal(arg_directory, "/")) {
1106 log_error("Spawning container on root directory not supported.");
1110 if (is_os_tree(arg_directory) <= 0) {
1111 log_error("Directory %s doesn't look like an OS root directory. Refusing.", arg_directory);
/* Create the cgroup <current cgroup>/nspawn-<pid> and attach to it. Failures
 * in the additional controllers from -C only produce a warning. */
1115 k = cg_get_by_pid(SYSTEMD_CGROUP_CONTROLLER, 0, &oldcg);
1117 log_error("Failed to determine current cgroup: %s", strerror(-k));
1121 if (asprintf(&newcg, "%s/nspawn-%lu", oldcg, (unsigned long) getpid()) < 0) {
1122 log_error("Failed to allocate cgroup path.");
1126 k = cg_create_and_attach(SYSTEMD_CGROUP_CONTROLLER, newcg, 0);
1128 log_error("Failed to create cgroup: %s", strerror(-k));
1132 STRV_FOREACH(controller, arg_controllers) {
1133 k = cg_create_and_attach(*controller, newcg, 0);
1135 log_warning("Failed to create cgroup in controller %s: %s", *controller, strerror(-k));
/* Open the pty whose slave side becomes the container console. */
1138 master = posix_openpt(O_RDWR|O_NOCTTY|O_CLOEXEC|O_NDELAY);
1140 log_error("Failed to acquire pseudo tty: %m");
1144 console = ptsname(master);
1146 log_error("Failed to determine tty name: %m");
1150 log_info("Spawning namespace container on %s (console is %s).", arg_directory, console);
/* Copy the current window size of stdin to the pty. */
1152 if (ioctl(STDIN_FILENO, TIOCGWINSZ, &ws) >= 0)
1153 ioctl(master, TIOCSWINSZ, &ws);
1155 if (unlockpt(master) < 0) {
1156 log_error("Failed to unlock tty: %m");
/* Remember the terminal settings so they can be restored later, and derive a
 * raw mode variant with echo disabled. */
1160 if (tcgetattr(STDIN_FILENO, &saved_attr) < 0) {
1161 log_error("Failed to get terminal attributes: %m");
1165 saved_attr_valid = true;
1167 raw_attr = saved_attr;
1168 cfmakeraw(&raw_attr);
1169 raw_attr.c_lflag &= ~ECHO;
/* Socket pair that setup_kmsg() uses to hand over the kmsg FIFO descriptor. */
1171 if (socketpair(AF_UNIX, SOCK_DGRAM|SOCK_NONBLOCK|SOCK_CLOEXEC, 0, kmsg_socket_pair) < 0) {
1172 log_error("Failed to create kmsg socket pair");
/* Block the signals that process_pty() later receives through its signalfd. */
1176 assert_se(sigemptyset(&mask) == 0);
1177 sigset_add_many(&mask, SIGCHLD, SIGWINCH, SIGTERM, SIGINT, -1);
1178 assert_se(sigprocmask(SIG_BLOCK, &mask, NULL) == 0);
1183 if (tcsetattr(STDIN_FILENO, TCSANOW, &raw_attr) < 0) {
1184 log_error("Failed to set terminal attributes: %m");
/* Raw clone() system call: new IPC, mount, PID and UTS namespaces, plus a new
 * network namespace when --private-network was given. */
1188 pid = syscall(__NR_clone, SIGCHLD|CLONE_NEWIPC|CLONE_NEWNS|CLONE_NEWPID|CLONE_NEWUTS|(arg_private_network ? CLONE_NEWNET : 0), NULL);
1190 if (errno == EINVAL)
1191 log_error("clone() failed, do you have namespace support enabled in your kernel? (You need UTS, IPC, PID and NET namespacing built in): %m");
1193 log_error("clone() failed: %m");
/* The statements from here up to _exit() appear to be the child branch (the
 * pid test itself is not part of this view). envp is the environment for the
 * payload; the slots assigned below are 2 (TERM), 3 (HOME), 4 (USER),
 * 5 (LOGNAME) and 6 (container_uuid). */
1201 const char *home = NULL;
1202 uid_t uid = (uid_t) -1;
1203 gid_t gid = (gid_t) -1;
1204 const char *envp[] = {
1205 "PATH=/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin",
1206 "container=systemd-nspawn", /* LXC sets container=lxc, so follow the scheme here */
1211 NULL, /* container_uuid */
1215 envp[2] = strv_find_prefix(environ, "TERM=");
/* Close the pty master and the standard descriptors, then everything else
 * except kmsg_socket_pair[1]. */
1217 close_nointr_nofail(master);
1219 close_nointr(STDIN_FILENO);
1220 close_nointr(STDOUT_FILENO);
1221 close_nointr(STDERR_FILENO);
1223 close_all_fds(&kmsg_socket_pair[1], 1);
1225 reset_all_signal_handlers();
1227 assert_se(sigemptyset(&mask) == 0);
1228 assert_se(sigprocmask(SIG_SETMASK, &mask, NULL) == 0);
/* Reopen the console as stdin and duplicate it to stdout and stderr. */
1230 if (open_terminal(console, O_RDWR) != STDIN_FILENO ||
1231 dup2(STDIN_FILENO, STDOUT_FILENO) != STDOUT_FILENO ||
1232 dup2(STDIN_FILENO, STDERR_FILENO) != STDERR_FILENO)
1236 log_error("setsid() failed: %m");
/* Ask the kernel to send SIGKILL to this process when its parent dies. */
1240 if (prctl(PR_SET_PDEATHSIG, SIGKILL) < 0) {
1241 log_error("PR_SET_PDEATHSIG failed: %m");
1245 /* Mark everything as slave, so that we still
1246 * receive mounts from the real root, but don't
1247 * propagate mounts to the real root. */
1248 if (mount(NULL, "/", NULL, MS_SLAVE|MS_REC, NULL) < 0) {
1249 log_error("MS_SLAVE|MS_REC failed: %m");
1253 /* Turn directory into bind mount */
1254 if (mount(arg_directory, arg_directory, "bind", MS_BIND|MS_REC, NULL) < 0) {
1255 log_error("Failed to make bind mount.");
/* Read-only remount of the tree; presumably guarded by arg_read_only (the
 * condition is not part of this view). */
1260 if (mount(arg_directory, arg_directory, "bind", MS_BIND|MS_REMOUNT|MS_RDONLY|MS_REC, NULL) < 0) {
1261 log_error("Failed to make read-only.");
/* Populate the tree: API mounts, device nodes, console, kmsg, boot id, time
 * zone, resolv.conf and journal. */
1265 if (mount_all(arg_directory) < 0)
1268 if (copy_devnodes(arg_directory) < 0)
1271 dev_setup(arg_directory);
1273 if (setup_dev_console(arg_directory, console) < 0)
1276 if (setup_kmsg(arg_directory, kmsg_socket_pair[1]) < 0)
1279 close_nointr_nofail(kmsg_socket_pair[1]);
1281 if (setup_boot_id(arg_directory) < 0)
1284 if (setup_timezone(arg_directory) < 0)
1287 if (setup_resolv_conf(arg_directory) < 0)
1290 if (setup_journal(arg_directory) < 0)
/* Make the tree the new root: enter it, move its mount onto "/", then
 * chroot(".") and chdir("/"). */
1293 if (chdir(arg_directory) < 0) {
1294 log_error("chdir(%s) failed: %m", arg_directory);
1298 if (mount(arg_directory, "/", NULL, MS_MOVE, NULL) < 0) {
1299 log_error("mount(MS_MOVE) failed: %m");
1303 if (chroot(".") < 0) {
1304 log_error("chroot() failed: %m");
1308 if (chdir("/") < 0) {
1309 log_error("chdir() failed: %m");
1317 if (drop_capabilities() < 0) {
1318 log_error("drop_capabilities() failed: %m");
/* Resolve the requested user and switch to it: home directory, supplementary
 * groups, then group and user IDs. */
1324 if (get_user_creds((const char**)&arg_user, &uid, &gid, &home, NULL) < 0) {
1325 log_error("get_user_creds() failed: %m");
1329 if (mkdir_parents_label(home, 0775) < 0) {
1330 log_error("mkdir_parents_label() failed: %m");
1334 if (mkdir_safe_label(home, 0775, uid, gid) < 0) {
1335 log_error("mkdir_safe_label() failed: %m");
1339 if (initgroups((const char*)arg_user, gid) < 0) {
1340 log_error("initgroups() failed: %m");
/* NOTE(review): the two messages below name setregid() and setreuid() while
 * the calls are setresgid() and setresuid(). */
1344 if (setresgid(gid, gid, gid) < 0) {
1345 log_error("setregid() failed: %m");
1349 if (setresuid(uid, uid, uid) < 0) {
1350 log_error("setreuid() failed: %m");
/* HOME falls back to "/root"; USER and LOGNAME fall back to "root". */
1355 if ((asprintf((char**)(envp + 3), "HOME=%s", home ? home: "/root") < 0) ||
1356 (asprintf((char**)(envp + 4), "USER=%s", arg_user ? arg_user : "root") < 0) ||
1357 (asprintf((char**)(envp + 5), "LOGNAME=%s", arg_user ? arg_user : "root") < 0)) {
1363 if (asprintf((char**)(envp + 6), "container_uuid=%s", arg_uuid) < 0) {
1375 /* Automatically search for the init system */
/* Build the argument vector for init: slot 0 is reserved for the program
 * path, and the copy of l = argc - optind + 1 pointers takes the remaining
 * arguments together with the terminating NULL of argv. */
1377 l = 1 + argc - optind;
1378 a = newa(char*, l + 1);
1379 memcpy(a + 1, argv + optind, l * sizeof(char*));
/* execve() only returns on failure, so each following candidate is tried
 * only when the previous one could not be executed. */
1381 a[0] = (char*) "/usr/lib/systemd/systemd";
1382 execve(a[0], a, (char**) envp);
1384 a[0] = (char*) "/lib/systemd/systemd";
1385 execve(a[0], a, (char**) envp);
1387 a[0] = (char*) "/sbin/init";
1388 execve(a[0], a, (char**) envp);
1389 } else if (argc > optind)
1390 execvpe(argv[optind], argv + optind, (char**) envp);
/* Without a command: start a login shell ("-bash") in the home directory. */
1392 chdir(home ? home : "/root");
1393 execle("/bin/bash", "-bash", NULL, (char**) envp);
1396 log_error("execv() failed: %m");
1399 _exit(EXIT_FAILURE);
/* From here on: the process that called clone(). Relay terminal data until
 * process_pty() returns. */
1402 if (process_pty(master, &mask) < 0)
1406 if (saved_attr_valid)
1407 tcsetattr(STDIN_FILENO, TCSANOW, &saved_attr);
1409 r = wait_for_terminate(pid, &status);
/* Report how the container process ended. A non-zero exit status becomes the
 * value of r; termination by SIGINT is reported as shutdown and by SIGHUP as
 * reboot. */
1415 if (status.si_code == CLD_EXITED) {
1416 if (status.si_status != 0) {
1417 log_error("Container failed with error code %i.", status.si_status);
1418 r = status.si_status;
1422 log_debug("Container exited successfully.");
1424 } else if (status.si_code == CLD_KILLED &&
1425 status.si_status == SIGINT) {
1426 log_info("Container has been shut down.");
1429 } else if (status.si_code == CLD_KILLED &&
1430 status.si_status == SIGHUP) {
1431 log_info("Container is being rebooted.");
1433 } else if (status.si_code == CLD_KILLED ||
1434 status.si_code == CLD_DUMPED) {
1436 log_error("Container terminated by signal %s.", signal_to_string(status.si_status));
1440 log_error("Container failed due to unknown reason.");
/* Cleanup: restore the terminal, close the pty master and the socket pair,
 * move back to the original cgroup, kill whatever is left in the container
 * cgroup, and free the settings. */
1447 if (saved_attr_valid)
1448 tcsetattr(STDIN_FILENO, TCSANOW, &saved_attr);
1451 close_nointr_nofail(master);
1453 close_pipe(kmsg_socket_pair);
1456 cg_attach(SYSTEMD_CGROUP_CONTROLLER, oldcg, 0);
1459 cg_kill_recursive_and_wait(SYSTEMD_CGROUP_CONTROLLER, newcg, true);
1461 free(arg_directory);
1462 strv_free(arg_controllers);