1 /*-*- Mode: C; c-basic-offset: 8; indent-tabs-mode: nil -*-*/
4 This file is part of systemd.
6 Copyright 2010 Lennart Poettering
8 systemd is free software; you can redistribute it and/or modify it
9 under the terms of the GNU Lesser General Public License as published by
10 the Free Software Foundation; either version 2.1 of the License, or
11 (at your option) any later version.
13 systemd is distributed in the hope that it will be useful, but
14 WITHOUT ANY WARRANTY; without even the implied warranty of
15 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
16 Lesser General Public License for more details.
18 You should have received a copy of the GNU Lesser General Public License
19 along with systemd; If not, see <http://www.gnu.org/licenses/>.
25 #include <sys/types.h>
26 #include <sys/syscall.h>
27 #include <sys/mount.h>
33 #include <sys/prctl.h>
34 #include <sys/capability.h>
36 #include <sys/epoll.h>
38 #include <sys/signalfd.h>
42 #include <sys/socket.h>
44 #include <systemd/sd-daemon.h>
52 #include "cgroup-util.h"
54 #include "path-util.h"
55 #include "loopback-setup.h"
57 #include "dev-setup.h"
59 typedef enum LinkJournal {
/* Command line settings. parse_argv() assigns the ones whose assignment is
 * visible in this file; every setting starts out at the default given here. */
66 static char *arg_directory = NULL;
67 static char *arg_user = NULL;
68 static char **arg_controllers = NULL;
69 static char *arg_uuid = NULL;
70 static bool arg_private_network = false;
71 static bool arg_read_only = false;
72 static bool arg_boot = false;
73 static LinkJournal arg_link_journal = LINK_AUTO;
/* Bit mask with one bit per CAP_* number. parse_argv() ORs in further bits
 * for --capability=, and drop_capabilities() passes the complement of this
 * mask to capability_bounding_set_drop(). */
74 static uint64_t arg_retain =
76 (1ULL << CAP_DAC_OVERRIDE) |
77 (1ULL << CAP_DAC_READ_SEARCH) |
78 (1ULL << CAP_FOWNER) |
79 (1ULL << CAP_FSETID) |
80 (1ULL << CAP_IPC_OWNER) |
83 (1ULL << CAP_LINUX_IMMUTABLE) |
84 (1ULL << CAP_NET_BIND_SERVICE) |
85 (1ULL << CAP_NET_BROADCAST) |
86 (1ULL << CAP_NET_RAW) |
87 (1ULL << CAP_SETGID) |
88 (1ULL << CAP_SETFCAP) |
89 (1ULL << CAP_SETPCAP) |
90 (1ULL << CAP_SETUID) |
91 (1ULL << CAP_SYS_ADMIN) |
92 (1ULL << CAP_SYS_CHROOT) |
93 (1ULL << CAP_SYS_NICE) |
94 (1ULL << CAP_SYS_PTRACE) |
95 (1ULL << CAP_SYS_TTY_CONFIG) |
96 (1ULL << CAP_SYS_RESOURCE) |
97 (1ULL << CAP_SYS_BOOT);
/* Prints the usage summary with printf(); program_invocation_short_name is
 * substituted for the leading %s. */
99 static int help(void) {
101 printf("%s [OPTIONS...] [PATH] [ARGUMENTS...]\n\n"
102 "Spawn a minimal namespace container for debugging, testing and building.\n\n"
103 " -h --help Show this help\n"
104 " -D --directory=NAME Root directory for the container\n"
105 " -b --boot Boot up full system (i.e. invoke init)\n"
106 " -u --user=USER Run the command under specified user or uid\n"
107 " -C --controllers=LIST Put the container in specified comma-separated cgroup hierarchies\n"
108 " --uuid=UUID Set a specific machine UUID for the container\n"
109 " --private-network Disable network in container\n"
110 " --read-only Mount the root directory read-only\n"
111 " --capability=CAP In addition to the default, retain specified capability\n"
112 " --link-journal=MODE Link up guest journal, one of no, auto, guest, host\n"
113 " -j Equivalent to --link-journal=host\n",
114 program_invocation_short_name);
/* Parses the command line with getopt_long() and records the result in the
 * arg_* globals; problems are reported with log_error(). The return
 * statements are not visible in this view, so the exact return contract
 * should be confirmed against the full file. */
119 static int parse_argv(int argc, char *argv[]) {
/* The first long-only option code is 0x100, above any single-character
 * option value; the other ARG_* codes are presumably the enumerators that
 * follow it. */
122 ARG_PRIVATE_NETWORK = 0x100,
129 static const struct option options[] = {
130 { "help", no_argument, NULL, 'h' },
131 { "directory", required_argument, NULL, 'D' },
132 { "user", required_argument, NULL, 'u' },
133 { "controllers", required_argument, NULL, 'C' },
134 { "private-network", no_argument, NULL, ARG_PRIVATE_NETWORK },
135 { "boot", no_argument, NULL, 'b' },
136 { "uuid", required_argument, NULL, ARG_UUID },
137 { "read-only", no_argument, NULL, ARG_READ_ONLY },
138 { "capability", required_argument, NULL, ARG_CAPABILITY },
139 { "link-journal", required_argument, NULL, ARG_LINK_JOURNAL },
/* Per getopt(3), the leading '+' in the short option string stops option
 * parsing at the first non-option argument. */
148 while ((c = getopt_long(argc, argv, "+hD:u:C:bj", options, NULL)) >= 0) {
/* The root directory is stored in canonicalized form. */
158 arg_directory = canonicalize_file_name(optarg);
159 if (!arg_directory) {
160 log_error("Failed to canonicalize root directory.");
168 if (!(arg_user = strdup(optarg))) {
169 log_error("Failed to duplicate user name.");
/* Release any previously parsed controller list before storing the new one. */
176 strv_free(arg_controllers);
177 arg_controllers = strv_split(optarg, ",");
178 if (!arg_controllers) {
179 log_error("Failed to split controllers list.");
182 strv_uniq(arg_controllers);
186 case ARG_PRIVATE_NETWORK:
187 arg_private_network = true;
199 arg_read_only = true;
202 case ARG_CAPABILITY: {
/* optarg is split on commas; each word is parsed as a capability name and
 * the resulting capability number becomes a bit in arg_retain. */
206 FOREACH_WORD_SEPARATOR(word, length, optarg, ",", state) {
210 t = strndup(word, length);
214 if (cap_from_name(t, &cap) < 0) {
215 log_error("Failed to parse capability %s.", t);
221 arg_retain |= 1ULL << (uint64_t) cap;
/* NOTE(review): presumably the handler for -j (its case label is not visible
 * here). help() describes -j as equivalent to --link-journal=host, while
 * this assigns LINK_GUEST; confirm which of the two is intended. */
228 arg_link_journal = LINK_GUEST;
231 case ARG_LINK_JOURNAL:
232 if (streq(optarg, "auto"))
233 arg_link_journal = LINK_AUTO;
234 else if (streq(optarg, "no"))
235 arg_link_journal = LINK_NO;
236 else if (streq(optarg, "guest"))
237 arg_link_journal = LINK_GUEST;
238 else if (streq(optarg, "host"))
239 arg_link_journal = LINK_HOST;
241 log_error("Failed to parse link journal mode %s", optarg);
251 log_error("Unknown option code %c", c);
/* Walks mount_table and performs each mount below dest. For every entry the
 * target path is built as dest + "/" + entry.where, checked with
 * path_is_mount_point(), created with mkdir_p_label() and then passed to
 * mount(). A failing mount() is only logged for entries marked fatal. */
259 static int mount_all(const char *dest) {
261 typedef struct MountPoint {
/* Columns as used below: what, where, a third field (presumably the file
 * system type; the struct definition is not visible here), options, flags
 * and fatal. */
270 static const MountPoint mount_table[] = {
271 { "proc", "/proc", "proc", NULL, MS_NOSUID|MS_NOEXEC|MS_NODEV, true },
272 { "/proc/sys", "/proc/sys", NULL, NULL, MS_BIND, true }, /* Bind mount first */
273 { NULL, "/proc/sys", NULL, NULL, MS_BIND|MS_RDONLY|MS_REMOUNT, true }, /* Then, make it r/o */
274 { "sysfs", "/sys", "sysfs", NULL, MS_RDONLY|MS_NOSUID|MS_NOEXEC|MS_NODEV, true },
275 { "tmpfs", "/dev", "tmpfs", "mode=755", MS_NOSUID|MS_STRICTATIME, true },
276 { "/dev/pts", "/dev/pts", NULL, NULL, MS_BIND, true },
277 { "tmpfs", "/dev/shm", "tmpfs", "mode=1777", MS_NOSUID|MS_NODEV|MS_STRICTATIME, true },
278 { "tmpfs", "/run", "tmpfs", "mode=755", MS_NOSUID|MS_NODEV|MS_STRICTATIME, true },
280 { "/sys/fs/selinux", "/sys/fs/selinux", NULL, NULL, MS_BIND, false }, /* Bind mount first */
281 { NULL, "/sys/fs/selinux", NULL, NULL, MS_BIND|MS_RDONLY|MS_REMOUNT, false }, /* Then, make it r/o */
/* NOTE(review): where is declared once, ahead of the loop, and asprintf()
 * stores a new buffer in it on every iteration; no release of the previous
 * buffer is visible in this view. Confirm that it is freed per iteration. */
287 char _cleanup_free_ *where = NULL;
289 for (k = 0; k < ELEMENTSOF(mount_table); k++) {
292 if (asprintf(&where, "%s/%s", dest, mount_table[k].where) < 0) {
301 t = path_is_mount_point(where, true);
303 log_error("Failed to detect whether %s is a mount point: %s", where, strerror(-t));
/* Entries with a NULL "what" are the remount steps of the table; they are
 * not skipped by the check below even if the target is already mounted. */
311 /* Skip this entry if it is not a remount. */
312 if (mount_table[k].what && t > 0)
315 mkdir_p_label(where, 0755);
317 if (mount(mount_table[k].what,
320 mount_table[k].flags,
321 mount_table[k].options) < 0 &&
322 mount_table[k].fatal) {
324 log_error("mount(%s) failed: %m", where);
/* Bind mounts the host's /etc/localtime and /etc/timezone onto the same
 * paths below dest. When a bind mount succeeds it is followed by a
 * read-only remount; the result of that remount is not checked, and a
 * failed bind mount is not reported on the lines visible here. */
334 static int setup_timezone(const char *dest) {
339 /* Fix the timezone, if possible */
340 where = strappend(dest, "/etc/localtime");
344 if (mount("/etc/localtime", where, "bind", MS_BIND, NULL) >= 0)
345 mount("/etc/localtime", where, "bind", MS_BIND|MS_REMOUNT|MS_RDONLY, NULL);
349 where = strappend(dest, "/etc/timezone");
353 if (mount("/etc/timezone", where, "bind", MS_BIND, NULL) >= 0)
354 mount("/etc/timezone", where, "bind", MS_BIND|MS_REMOUNT|MS_RDONLY, NULL);
/* Bind mounts the host's /etc/resolv.conf onto dest/etc/resolv.conf and,
 * if that works, remounts it read-only (remount result unchecked). */
361 static int setup_resolv_conf(const char *dest) {
/* The branch taken for arg_private_network is not visible in this view;
 * presumably the function returns early then. Confirm against the full file. */
366 if (arg_private_network)
369 /* Fix resolv.conf, if possible */
370 where = strappend(dest, "/etc/resolv.conf");
374 if (mount("/etc/resolv.conf", where, "bind", MS_BIND, NULL) >= 0)
375 mount("/etc/resolv.conf", where, "bind", MS_BIND|MS_REMOUNT|MS_RDONLY, NULL);
/* Gives the container its own boot ID: a random 128 bit ID is formatted as
 * a UUID string, written to a file below dest/dev and that file is bind
 * mounted over dest/proc/sys/kernel/random/boot_id. */
382 static int setup_boot_id(const char *dest) {
383 char _cleanup_free_ *from = NULL, *to = NULL;
390 /* Generate a new randomized boot ID, so that each boot-up of
391 * the container gets a new one */
/* from is the backing file that receives the ID, to is the mount target. */
393 from = strappend(dest, "/dev/proc-sys-kernel-random-boot-id");
394 to = strappend(dest, "/proc/sys/kernel/random/boot_id");
398 r = sd_id128_randomize(&rnd);
400 log_error("Failed to generate random boot id: %s", strerror(-r));
/* Format the 16 bytes in the 8-4-4-4-12 hex digit UUID layout. */
404 snprintf(as_uuid, sizeof(as_uuid),
405 "%02x%02x%02x%02x-%02x%02x-%02x%02x-%02x%02x-%02x%02x%02x%02x%02x%02x",
406 SD_ID128_FORMAT_VAL(rnd));
407 char_array_0(as_uuid);
409 r = write_one_line_file(from, as_uuid);
411 log_error("Failed to write boot id: %s", strerror(-r));
415 if (mount(from, to, "bind", MS_BIND, NULL) < 0) {
416 log_error("Failed to bind mount boot id: %m");
/* Read-only remount of the bind mount; its result is not checked. */
419 mount(from, to, "bind", MS_BIND|MS_REMOUNT|MS_RDONLY, NULL);
/* For every name in the devnodes list, looks at /dev/NAME on the host and
 * creates a node with the same mode and device number at dest/dev/NAME.
 * A host node that does not exist (ENOENT) does not trigger the stat error
 * message; a node that is neither a char nor a block device is reported as
 * an error. */
425 static int copy_devnodes(const char *dest) {
427 static const char devnodes[] =
444 NULSTR_FOREACH(d, devnodes) {
446 char _cleanup_free_ *from = NULL, *to = NULL;
/* The asprintf() results are not checked on these two lines; a check may
 * exist on lines that are not visible in this view. */
448 asprintf(&from, "/dev/%s", d);
449 asprintf(&to, "%s/dev/%s", dest, d);
460 if (stat(from, &st) < 0) {
462 if (errno != ENOENT) {
463 log_error("Failed to stat %s: %m", from);
468 } else if (!S_ISCHR(st.st_mode) && !S_ISBLK(st.st_mode)) {
470 log_error("%s is not a char or block device, cannot copy", from);
474 } else if (mknod(to, st.st_mode, st.st_rdev) < 0) {
/* NOTE(review): the message prints dest although the mknod() call above
 * operates on to; this looks unintended, confirm. */
476 log_error("mknod(%s) failed: %m", dest);
/* Makes the tty named by console available as dest/dev/console: the tty
 * must be a character device, its access mode and ownership are set to
 * 0600 and 0:0, a device node is created at dest/dev/console and the tty is
 * bind mounted on top of that node.
 * NOTE(review): the explanatory comment in the middle of this function has
 * no terminator in this view (its last line appears to be missing), so no
 * further comments are added after it. */
487 static int setup_dev_console(const char *dest, const char *console) {
489 char _cleanup_free_ *to = NULL;
498 if (stat(console, &st) < 0) {
499 log_error("Failed to stat %s: %m", console);
503 } else if (!S_ISCHR(st.st_mode)) {
504 log_error("/dev/console is not a char device.");
509 r = chmod_and_chown(console, 0600, 0, 0);
511 log_error("Failed to correct access mode for TTY: %s", strerror(-r));
515 if (asprintf(&to, "%s/dev/console", dest) < 0) {
520 /* We need to bind mount the right tty to /dev/console since
521 * ptys can only exist on pts file systems. To have something
522 * to bind mount things on we create a device node first, that
523 * has the right major/minor (note that the major minor
524 * doesn't actually matter here, since we mount it over
527 if (mknod(to, (st.st_mode & ~07777) | 0600, st.st_rdev) < 0) {
528 log_error("mknod() for /dev/console failed: %m");
533 if (mount(console, to, "bind", MS_BIND, NULL) < 0) {
534 log_error("Bind mount for /dev/console failed: %m");
545 static int setup_kmsg(const char *dest, int kmsg_socket) {
546 char _cleanup_free_ *from = NULL, *to = NULL;
550 struct cmsghdr cmsghdr;
551 uint8_t buf[CMSG_SPACE(sizeof(int))];
554 struct cmsghdr *cmsg;
557 assert(kmsg_socket >= 0);
561 /* Purpose of this function: create a FIFO at dest/dev/kmsg, bind
 * mount it to dest/proc/kmsg, open it and pass the open file
 * descriptor over kmsg_socket.
 *
 * We create the kmsg FIFO as /dev/kmsg, but immediately
562 * delete it after bind mounting it to /proc/kmsg. While FIFOs
563 * on the reading side behave very similarly to /proc/kmsg,
564 * their writing side behaves differently from /dev/kmsg in
565 * that writing blocks when nothing is reading. In order to
566 * avoid any problems with containers deadlocking due to this
567 * we simply make /dev/kmsg unavailable to the container. */
568 if (asprintf(&from, "%s/dev/kmsg", dest) < 0) {
573 if (asprintf(&to, "%s/proc/kmsg", dest) < 0) {
578 if (mkfifo(from, 0600) < 0) {
579 log_error("mkfifo() for /dev/kmsg failed: %m");
584 r = chmod_and_chown(from, 0600, 0, 0);
586 log_error("Failed to correct access mode for /dev/kmsg: %s", strerror(-r));
590 if (mount(from, to, "bind", MS_BIND, NULL) < 0) {
591 log_error("Bind mount for /proc/kmsg failed: %m");
/* Open the FIFO for reading and writing, non-blocking and close-on-exec. */
596 fd = open(from, O_RDWR|O_NDELAY|O_CLOEXEC);
598 log_error("Failed to open fifo: %m");
/* Build a single SCM_RIGHTS control message whose payload is fd. */
606 mh.msg_control = &control;
607 mh.msg_controllen = sizeof(control);
609 cmsg = CMSG_FIRSTHDR(&mh);
610 cmsg->cmsg_level = SOL_SOCKET;
611 cmsg->cmsg_type = SCM_RIGHTS;
612 cmsg->cmsg_len = CMSG_LEN(sizeof(int));
613 memcpy(CMSG_DATA(cmsg), &fd, sizeof(int));
615 mh.msg_controllen = cmsg->cmsg_len;
617 /* Store away the fd in the socket, so that it stays open as
618 * long as we run the child */
619 k = sendmsg(kmsg_socket, &mh, MSG_DONTWAIT|MSG_NOSIGNAL);
/* The local descriptor is closed right after sendmsg(), whatever k is. */
620 close_nointr_nofail(fd);
623 log_error("Failed to send FIFO fd: %m");
628 /* And now make the FIFO unavailable as /dev/kmsg... */
/* Sets the host name to the file name component of arg_directory, after
 * passing it through hostname_cleanup(). Lines between these steps (for
 * example a copy of the string) are not visible in this view. */
637 static int setup_hostname(void) {
641 hn = path_get_file_name(arg_directory);
647 hostname_cleanup(hn);
650 if (sethostname(hn, strlen(hn)) < 0)
/* Connects the journal directory of the container with the host according
 * to arg_link_journal. The machine ID is read from
 * directory/etc/machine-id; the host side path is /var/log/journal/ID and
 * the guest side path is directory/var/log/journal/ID. For LINK_GUEST a
 * symlink is created at the host path pointing to the guest path; the final
 * step visible here bind mounts the host path onto the guest path. Nothing
 * is done for LINK_NO. */
659 static int setup_journal(const char *directory) {
660 sd_id128_t machine_id;
661 char *p = NULL, *b = NULL, *l, *q = NULL, *d = NULL;
664 if (arg_link_journal == LINK_NO)
667 p = strappend(directory, "/etc/machine-id");
673 r = read_one_line_file(p, &b);
/* In LINK_AUTO mode a missing machine-id file is handled separately from
 * other read errors (the branch body is not visible here). */
674 if (r == -ENOENT && arg_link_journal == LINK_AUTO) {
678 log_error("Failed to read machine ID: %s", strerror(-r));
/* l is presumably the trimmed content of b; its assignment is not visible
 * in this view. */
683 if (isempty(l) && arg_link_journal == LINK_AUTO) {
688 /* Verify validity */
689 r = sd_id128_from_string(l, &machine_id);
691 log_error("Failed to parse machine ID: %s", strerror(-r));
/* p: journal directory on the host, q: journal directory in the guest tree. */
696 p = strappend("/var/log/journal/", l);
697 q = strjoin(directory, "/var/log/journal/", l, NULL);
703 if (path_is_mount_point(p, false) > 0 ||
704 path_is_mount_point(q, false) > 0) {
705 if (arg_link_journal != LINK_AUTO) {
706 log_error("Journal already a mount point, refusing.");
/* Inspect what currently exists at the host path: a symlink, something that
 * is not a symlink (-EINVAL), or nothing (-ENOENT). */
715 r = readlink_and_make_absolute(p, &d);
717 if ((arg_link_journal == LINK_GUEST ||
718 arg_link_journal == LINK_AUTO) &&
728 log_error("Failed to remove symlink %s: %m", p);
732 } else if (r == -EINVAL) {
734 if (arg_link_journal == LINK_GUEST &&
737 if (errno == ENOTDIR)
738 log_error("%s already exists and is neither symlink nor directory.", p);
740 log_error("Failed to remove %s: %m", p);
746 } else if (r != -ENOENT) {
747 log_error("readlink(%s) failed: %m", p);
751 if (arg_link_journal == LINK_GUEST) {
/* symlink(q, p) creates the link at p (host path) with q as its target. */
753 if (symlink(q, p) < 0) {
754 log_error("Failed to symlink %s to %s: %m", q, p);
765 if (arg_link_journal == LINK_HOST) {
766 r = mkdir_p(p, 0755);
768 log_error("Failed to create %s: %m", p);
772 } else if (access(p, F_OK) < 0) {
/* A guest journal directory that already has content is reported as an
 * error. */
777 if (dir_is_empty(q) == 0) {
778 log_error("%s not empty.", q);
783 r = mkdir_p(q, 0755);
785 log_error("Failed to create %s: %m", q);
789 if (mount(p, q, "bind", MS_BIND, NULL) < 0) {
790 log_error("Failed to bind mount journal from host into guest: %m");
/* Calls capability_bounding_set_drop() with the complement of arg_retain,
 * i.e. with every capability bit that is not marked for retention, and
 * returns its result. */
806 static int drop_capabilities(void) {
807 return capability_bounding_set_drop(~arg_retain, false);
/* Heuristic check whether path holds an OS tree, based on path/bin/sh.
 * Returns 1 when r is non-negative and 0 otherwise on the line visible
 * here; the call that produces r, and the result of the failed-allocation
 * branch, are not visible in this view. */
810 static int is_os_tree(const char *path) {
813 /* We use /bin/sh as flag file if something is an OS */
815 if (asprintf(&p, "%s/bin/sh", path) < 0)
821 return r < 0 ? 0 : 1;
/* Relays data between the caller's terminal and the pty master: bytes read
 * from stdin are written to master, bytes read from master are written to
 * stdout. Readiness is tracked with epoll, and a signalfd created from mask
 * is watched in the same loop. The loop exit conditions and return values
 * are not visible in this view. */
824 static int process_pty(int master, sigset_t *mask) {
/* One staging buffer per direction, each LINE_MAX bytes, together with the
 * number of bytes currently held. */
826 char in_buffer[LINE_MAX], out_buffer[LINE_MAX];
827 size_t in_buffer_full = 0, out_buffer_full = 0;
828 struct epoll_event stdin_ev, stdout_ev, master_ev, signal_ev;
829 bool stdin_readable = false, stdout_writable = false, master_readable = false, master_writable = false;
830 int ep = -1, signal_fd = -1, r;
832 fd_nonblock(STDIN_FILENO, 1);
833 fd_nonblock(STDOUT_FILENO, 1);
834 fd_nonblock(master, 1);
836 signal_fd = signalfd(-1, mask, SFD_NONBLOCK|SFD_CLOEXEC);
838 log_error("signalfd(): %m");
843 ep = epoll_create1(EPOLL_CLOEXEC);
845 log_error("Failed to create epoll: %m");
/* stdin, stdout and master are registered with EPOLLET; the readiness
 * reported for them is remembered in the *_readable and *_writable flags
 * until a read or write fails. signal_fd is registered without EPOLLET. */
851 stdin_ev.events = EPOLLIN|EPOLLET;
852 stdin_ev.data.fd = STDIN_FILENO;
855 stdout_ev.events = EPOLLOUT|EPOLLET;
856 stdout_ev.data.fd = STDOUT_FILENO;
859 master_ev.events = EPOLLIN|EPOLLOUT|EPOLLET;
860 master_ev.data.fd = master;
863 signal_ev.events = EPOLLIN;
864 signal_ev.data.fd = signal_fd;
866 if (epoll_ctl(ep, EPOLL_CTL_ADD, STDIN_FILENO, &stdin_ev) < 0 ||
867 epoll_ctl(ep, EPOLL_CTL_ADD, STDOUT_FILENO, &stdout_ev) < 0 ||
868 epoll_ctl(ep, EPOLL_CTL_ADD, master, &master_ev) < 0 ||
869 epoll_ctl(ep, EPOLL_CTL_ADD, signal_fd, &signal_ev) < 0) {
870 log_error("Failed to regiser fds in epoll: %m");
876 struct epoll_event ev[16];
/* Wait without timeout; EINTR and EAGAIN are singled out from other errors. */
880 nfds = epoll_wait(ep, ev, ELEMENTSOF(ev), -1);
883 if (errno == EINTR || errno == EAGAIN)
886 log_error("epoll_wait(): %m");
/* Translate the reported events into readiness flags; EPOLLHUP also sets
 * the flag. */
893 for (i = 0; i < nfds; i++) {
894 if (ev[i].data.fd == STDIN_FILENO) {
896 if (ev[i].events & (EPOLLIN|EPOLLHUP))
897 stdin_readable = true;
899 } else if (ev[i].data.fd == STDOUT_FILENO) {
901 if (ev[i].events & (EPOLLOUT|EPOLLHUP))
902 stdout_writable = true;
904 } else if (ev[i].data.fd == master) {
906 if (ev[i].events & (EPOLLIN|EPOLLHUP))
907 master_readable = true;
909 if (ev[i].events & (EPOLLOUT|EPOLLHUP))
910 master_writable = true;
912 } else if (ev[i].data.fd == signal_fd) {
913 struct signalfd_siginfo sfsi;
916 n = read(signal_fd, &sfsi, sizeof(sfsi));
917 if (n != sizeof(sfsi)) {
920 log_error("Failed to read from signalfd: invalid block size");
925 if (errno != EINTR && errno != EAGAIN) {
926 log_error("Failed to read from signalfd: %m");
932 if (sfsi.ssi_signo == SIGWINCH) {
935 /* The window size changed, let's forward that. */
936 if (ioctl(STDIN_FILENO, TIOCGWINSZ, &ws) >= 0)
937 ioctl(master, TIOCSWINSZ, &ws);
/* Keep transferring while some step can make progress: a readable source
 * with an empty buffer, or a writable sink with a non-empty buffer. */
946 while ((stdin_readable && in_buffer_full <= 0) ||
947 (master_writable && in_buffer_full > 0) ||
948 (master_readable && out_buffer_full <= 0) ||
949 (stdout_writable && out_buffer_full > 0)) {
/* stdin -> in_buffer */
951 if (stdin_readable && in_buffer_full < LINE_MAX) {
953 k = read(STDIN_FILENO, in_buffer + in_buffer_full, LINE_MAX - in_buffer_full);
/* These four errno values only clear the readiness flag; the same pattern
 * is used for the other three transfers below. */
956 if (errno == EAGAIN || errno == EPIPE || errno == ECONNRESET || errno == EIO)
957 stdin_readable = false;
959 log_error("read(): %m");
964 in_buffer_full += (size_t) k;
/* in_buffer -> master */
967 if (master_writable && in_buffer_full > 0) {
969 k = write(master, in_buffer, in_buffer_full);
972 if (errno == EAGAIN || errno == EPIPE || errno == ECONNRESET || errno == EIO)
973 master_writable = false;
975 log_error("write(): %m");
/* After a partial write the unwritten tail moves to the buffer start. */
981 assert(in_buffer_full >= (size_t) k);
982 memmove(in_buffer, in_buffer + k, in_buffer_full - k);
/* master -> out_buffer */
987 if (master_readable && out_buffer_full < LINE_MAX) {
989 k = read(master, out_buffer + out_buffer_full, LINE_MAX - out_buffer_full);
992 if (errno == EAGAIN || errno == EPIPE || errno == ECONNRESET || errno == EIO)
993 master_readable = false;
995 log_error("read(): %m");
1000 out_buffer_full += (size_t) k;
/* out_buffer -> stdout */
1003 if (stdout_writable && out_buffer_full > 0) {
1005 k = write(STDOUT_FILENO, out_buffer, out_buffer_full);
1008 if (errno == EAGAIN || errno == EPIPE || errno == ECONNRESET || errno == EIO)
1009 stdout_writable = false;
1011 log_error("write(): %m");
1017 assert(out_buffer_full >= (size_t) k);
1018 memmove(out_buffer, out_buffer + k, out_buffer_full - k);
1019 out_buffer_full -= k;
/* Cleanup of the two descriptors created in this function. */
1027 close_nointr_nofail(ep);
1030 close_nointr_nofail(signal_fd);
/* Entry point. In the order visible here: parse the command line, resolve
 * and validate the container directory, create a cgroup for the container,
 * allocate a pty, switch the terminal to raw mode, clone a child into new
 * namespaces, have the child set up the container tree and exec the
 * payload, relay the pty in the parent, then evaluate the child's exit
 * status and clean up. Many branch bodies, labels and return statements are
 * not visible in this view. */
1035 int main(int argc, char *argv[]) {
1037 int r = EXIT_FAILURE, k;
1038 char *oldcg = NULL, *newcg = NULL;
1039 char **controller = NULL;
1041 const char *console = NULL;
1042 struct termios saved_attr, raw_attr;
1044 bool saved_attr_valid = false;
1046 int kmsg_socket_pair[2] = { -1, -1 };
1048 log_parse_environment();
1051 r = parse_argv(argc, argv);
/* Use the directory from the command line, made absolute, or fall back to
 * the current working directory. */
1055 if (arg_directory) {
1058 p = path_make_absolute_cwd(arg_directory);
1059 free(arg_directory);
1062 arg_directory = get_current_dir_name();
1064 if (!arg_directory) {
1065 log_error("Failed to determine path");
1069 path_kill_slashes(arg_directory);
/* Preconditions: effective uid 0, a systemd-booted host, a directory other
 * than "/" and one that passes is_os_tree(). */
1071 if (geteuid() != 0) {
1072 log_error("Need to be root.");
1076 if (sd_booted() <= 0) {
1077 log_error("Not running on a systemd system.");
1081 if (path_equal(arg_directory, "/")) {
1082 log_error("Spawning container on root directory not supported.");
1086 if (is_os_tree(arg_directory) <= 0) {
1087 log_error("Directory %s doesn't look like an OS root directory. Refusing.", arg_directory);
/* Create the cgroup OLDCG/nspawn-PID and attach to it, first in the systemd
 * hierarchy and then in each controller from arg_controllers; a failure for
 * one of those controllers is only logged as a warning. */
1091 k = cg_get_by_pid(SYSTEMD_CGROUP_CONTROLLER, 0, &oldcg);
1093 log_error("Failed to determine current cgroup: %s", strerror(-k));
1097 if (asprintf(&newcg, "%s/nspawn-%lu", oldcg, (unsigned long) getpid()) < 0) {
1098 log_error("Failed to allocate cgroup path.");
1102 k = cg_create_and_attach(SYSTEMD_CGROUP_CONTROLLER, newcg, 0);
1104 log_error("Failed to create cgroup: %s", strerror(-k));
1108 STRV_FOREACH(controller, arg_controllers) {
1109 k = cg_create_and_attach(*controller, newcg, 0);
1111 log_warning("Failed to create cgroup in controller %s: %s", *controller, strerror(-k));
/* Allocate the pty; console is the name of its slave side as returned by
 * ptsname(). */
1114 master = posix_openpt(O_RDWR|O_NOCTTY|O_CLOEXEC|O_NDELAY);
1116 log_error("Failed to acquire pseudo tty: %m");
1120 console = ptsname(master);
1122 log_error("Failed to determine tty name: %m");
1126 log_info("Spawning namespace container on %s (console is %s).", arg_directory, console);
/* Copy the current window size of stdin to the pty, if it can be queried. */
1128 if (ioctl(STDIN_FILENO, TIOCGWINSZ, &ws) >= 0)
1129 ioctl(master, TIOCSWINSZ, &ws);
1131 if (unlockpt(master) < 0) {
1132 log_error("Failed to unlock tty: %m");
/* Remember the terminal settings so they can be restored later, and derive
 * a raw mode copy with echo disabled. */
1136 if (tcgetattr(STDIN_FILENO, &saved_attr) < 0) {
1137 log_error("Failed to get terminal attributes: %m");
1141 saved_attr_valid = true;
1143 raw_attr = saved_attr;
1144 cfmakeraw(&raw_attr);
1145 raw_attr.c_lflag &= ~ECHO;
/* Socket pair used by setup_kmsg() in the child to pass the kmsg FIFO
 * descriptor. */
1147 if (socketpair(AF_UNIX, SOCK_DGRAM|SOCK_NONBLOCK|SOCK_CLOEXEC, 0, kmsg_socket_pair) < 0) {
1148 log_error("Failed to create kmsg socket pair");
/* Block the signals that process_pty() receives through its signalfd. */
1152 assert_se(sigemptyset(&mask) == 0);
1153 sigset_add_many(&mask, SIGCHLD, SIGWINCH, SIGTERM, SIGINT, -1);
1154 assert_se(sigprocmask(SIG_BLOCK, &mask, NULL) == 0);
1159 if (tcsetattr(STDIN_FILENO, TCSANOW, &raw_attr) < 0) {
1160 log_error("Failed to set terminal attributes: %m");
/* Raw clone() system call: the child gets new IPC, mount, PID and UTS
 * namespaces, plus a new network namespace when arg_private_network is set. */
1164 pid = syscall(__NR_clone, SIGCHLD|CLONE_NEWIPC|CLONE_NEWNS|CLONE_NEWPID|CLONE_NEWUTS|(arg_private_network ? CLONE_NEWNET : 0), NULL);
1166 if (errno == EINVAL)
1167 log_error("clone() failed, do you have namespace support enabled in your kernel? (You need UTS, IPC, PID and NET namespacing built in): %m");
1169 log_error("clone() failed: %m");
/* Child side starts here (presumably inside a pid == 0 branch; the condition
 * itself is not visible in this view). */
1177 const char *home = NULL;
1178 uid_t uid = (uid_t) -1;
1179 gid_t gid = (gid_t) -1;
/* Environment for the payload. Slots 2 to 6 are filled in further down:
 * TERM, HOME, USER, LOGNAME and container_uuid. */
1180 const char *envp[] = {
1181 "PATH=/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin",
1182 "container=systemd-nspawn", /* LXC sets container=lxc, so follow the scheme here */
1187 NULL, /* container_uuid */
1191 envp[2] = strv_find_prefix(environ, "TERM=");
/* Drop the descriptors inherited from the parent; kmsg_socket_pair[1] is
 * handed to close_all_fds() as the exception list (presumably the set of
 * descriptors to keep, confirm against the helper). */
1193 close_nointr_nofail(master);
1195 close_nointr(STDIN_FILENO);
1196 close_nointr(STDOUT_FILENO);
1197 close_nointr(STDERR_FILENO);
1199 close_all_fds(&kmsg_socket_pair[1], 1);
1201 reset_all_signal_handlers();
1203 assert_se(sigemptyset(&mask) == 0);
1204 assert_se(sigprocmask(SIG_SETMASK, &mask, NULL) == 0);
/* Open the pty slave; it must land on fd 0 and is then duplicated onto
 * fds 1 and 2. */
1206 if (open_terminal(console, O_RDWR) != STDIN_FILENO ||
1207 dup2(STDIN_FILENO, STDOUT_FILENO) != STDOUT_FILENO ||
1208 dup2(STDIN_FILENO, STDERR_FILENO) != STDERR_FILENO)
1212 log_error("setsid() failed: %m");
/* Request SIGKILL for this process when its parent dies. */
1216 if (prctl(PR_SET_PDEATHSIG, SIGKILL) < 0) {
1217 log_error("PR_SET_PDEATHSIG failed: %m");
1221 /* Mark everything as slave, so that we still
1222 * receive mounts from the real root, but don't
1223 * propagate mounts to the real root. */
1224 if (mount(NULL, "/", NULL, MS_SLAVE|MS_REC, NULL) < 0) {
1225 log_error("MS_SLAVE|MS_REC failed: %m");
1229 /* Turn directory into bind mount */
1230 if (mount(arg_directory, arg_directory, "bind", MS_BIND|MS_REC, NULL) < 0) {
1231 log_error("Failed to make bind mount.");
/* Read-only remount of the container root; presumably guarded by
 * arg_read_only (the condition is not visible in this view). */
1236 if (mount(arg_directory, arg_directory, "bind", MS_BIND|MS_REMOUNT|MS_RDONLY|MS_REC, NULL) < 0) {
1237 log_error("Failed to make read-only.");
/* Populate the container tree with the helper functions of this file. */
1241 if (mount_all(arg_directory) < 0)
1244 if (copy_devnodes(arg_directory) < 0)
1247 dev_setup(arg_directory);
1249 if (setup_dev_console(arg_directory, console) < 0)
1252 if (setup_kmsg(arg_directory, kmsg_socket_pair[1]) < 0)
1255 close_nointr_nofail(kmsg_socket_pair[1]);
1257 if (setup_boot_id(arg_directory) < 0)
1260 if (setup_timezone(arg_directory) < 0)
1263 if (setup_resolv_conf(arg_directory) < 0)
1266 if (setup_journal(arg_directory) < 0)
/* Switch the root: enter the directory, move its mount onto "/", chroot
 * into it and change to the new "/". */
1269 if (chdir(arg_directory) < 0) {
1270 log_error("chdir(%s) failed: %m", arg_directory);
1274 if (mount(arg_directory, "/", NULL, MS_MOVE, NULL) < 0) {
1275 log_error("mount(MS_MOVE) failed: %m");
1279 if (chroot(".") < 0) {
1280 log_error("chroot() failed: %m");
1284 if (chdir("/") < 0) {
1285 log_error("chdir() failed: %m");
1293 if (drop_capabilities() < 0) {
1294 log_error("drop_capabilities() failed: %m");
/* Resolve the requested user, create its home directory and switch group
 * and user IDs; presumably only done when arg_user is set (the condition is
 * not visible in this view). */
1300 if (get_user_creds((const char**)&arg_user, &uid, &gid, &home, NULL) < 0) {
1301 log_error("get_user_creds() failed: %m");
1305 if (mkdir_parents_label(home, 0775) < 0) {
1306 log_error("mkdir_parents_label() failed: %m");
1310 if (mkdir_safe_label(home, 0775, uid, gid) < 0) {
1311 log_error("mkdir_safe_label() failed: %m");
1315 if (initgroups((const char*)arg_user, gid) < 0) {
1316 log_error("initgroups() failed: %m");
/* NOTE(review): the two messages below name setregid() and setreuid() while
 * the calls are setresgid() and setresuid(); the messages look outdated. */
1320 if (setresgid(gid, gid, gid) < 0) {
1321 log_error("setregid() failed: %m");
1325 if (setresuid(uid, uid, uid) < 0) {
1326 log_error("setreuid() failed: %m");
/* Fill the remaining environment slots; "/root" and "root" are the
 * fallbacks when no home directory or user is known. */
1331 if ((asprintf((char**)(envp + 3), "HOME=%s", home ? home: "/root") < 0) ||
1332 (asprintf((char**)(envp + 4), "USER=%s", arg_user ? arg_user : "root") < 0) ||
1333 (asprintf((char**)(envp + 5), "LOGNAME=%s", arg_user ? arg_user : "root") < 0)) {
1339 if (asprintf((char**)(envp + 6), "container_uuid=%s", arg_uuid) < 0) {
1351 /* Automatically search for the init system */
/* a[] holds one slot for the init path followed by the remaining command
 * line arguments; the three candidate paths are tried in order, and a later
 * execve() is only reached when the earlier one has failed. */
1353 l = 1 + argc - optind;
1354 a = newa(char*, l + 1);
1355 memcpy(a + 1, argv + optind, l * sizeof(char*));
1357 a[0] = (char*) "/usr/lib/systemd/systemd";
1358 execve(a[0], a, (char**) envp);
1360 a[0] = (char*) "/lib/systemd/systemd";
1361 execve(a[0], a, (char**) envp);
1363 a[0] = (char*) "/sbin/init";
1364 execve(a[0], a, (char**) envp);
/* Otherwise run the command given on the command line, or, without one,
 * /bin/bash with argv[0] set to "-bash" from the home directory. */
1365 } else if (argc > optind)
1366 execvpe(argv[optind], argv + optind, (char**) envp);
1368 chdir(home ? home : "/root");
1369 execle("/bin/bash", "-bash", NULL, (char**) envp);
1372 log_error("execv() failed: %m");
1375 _exit(EXIT_FAILURE);
/* Parent side: relay terminal traffic until process_pty() returns, restore
 * the terminal settings and collect the child's exit status. */
1378 if (process_pty(master, &mask) < 0)
1382 if (saved_attr_valid)
1383 tcsetattr(STDIN_FILENO, TCSANOW, &saved_attr);
1385 r = wait_for_terminate(pid, &status);
/* Interpret how the child ended: normal exit (a non-zero code becomes r),
 * killed by SIGINT (reported as shutdown), killed by SIGHUP (reported as
 * reboot), any other signal or core dump, or an unknown reason. */
1391 if (status.si_code == CLD_EXITED) {
1392 if (status.si_status != 0) {
1393 log_error("Container failed with error code %i.", status.si_status);
1394 r = status.si_status;
1398 log_debug("Container exited successfully.");
1400 } else if (status.si_code == CLD_KILLED &&
1401 status.si_status == SIGINT) {
1402 log_info("Container has been shut down.");
1405 } else if (status.si_code == CLD_KILLED &&
1406 status.si_status == SIGHUP) {
1407 log_info("Container is being rebooted.");
1409 } else if (status.si_code == CLD_KILLED ||
1410 status.si_code == CLD_DUMPED) {
1412 log_error("Container terminated by signal %s.", signal_to_string(status.si_status));
1416 log_error("Container failed due to unknown reason.");
/* Common cleanup: restore the terminal, close descriptors, move back to the
 * original cgroup, kill what is left in the container cgroup and free the
 * option strings. */
1423 if (saved_attr_valid)
1424 tcsetattr(STDIN_FILENO, TCSANOW, &saved_attr);
1427 close_nointr_nofail(master);
1429 close_pipe(kmsg_socket_pair);
1432 cg_attach(SYSTEMD_CGROUP_CONTROLLER, oldcg, 0);
1435 cg_kill_recursive_and_wait(SYSTEMD_CGROUP_CONTROLLER, newcg, true);
1437 free(arg_directory);
1438 strv_free(arg_controllers);