1 /*-*- Mode: C; c-basic-offset: 8; indent-tabs-mode: nil -*-*/
4 This file is part of systemd.
6 Copyright 2010 Lennart Poettering
8 systemd is free software; you can redistribute it and/or modify it
9 under the terms of the GNU Lesser General Public License as published by
10 the Free Software Foundation; either version 2.1 of the License, or
11 (at your option) any later version.
13 systemd is distributed in the hope that it will be useful, but
14 WITHOUT ANY WARRANTY; without even the implied warranty of
15 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
16 Lesser General Public License for more details.
18 You should have received a copy of the GNU Lesser General Public License
19 along with systemd; If not, see <http://www.gnu.org/licenses/>.
25 #include <sys/types.h>
26 #include <sys/syscall.h>
27 #include <sys/mount.h>
33 #include <sys/prctl.h>
34 #include <sys/capability.h>
36 #include <sys/epoll.h>
38 #include <sys/signalfd.h>
42 #include <sys/socket.h>
44 #include <systemd/sd-daemon.h>
51 #include "cgroup-util.h"
53 #include "path-util.h"
54 #include "loopback-setup.h"
56 #include "dev-setup.h"
/* Journal linking mode. Only the opening line of the enum is present in this
 * excerpt; the values used elsewhere in the file are LINK_NO, LINK_AUTO,
 * LINK_GUEST and LINK_HOST. */
58 typedef enum LinkJournal {
/* Command line settings; parse_argv() fills these in. */
65 static char *arg_directory = NULL;
66 static char *arg_user = NULL;
67 static char **arg_controllers = NULL;
68 static char *arg_uuid = NULL;
69 static bool arg_private_network = false;
70 static bool arg_read_only = false;
71 static bool arg_boot = false;
72 static LinkJournal arg_link_journal = LINK_AUTO;
/* Bit mask with one bit per CAP_* number. drop_capabilities() passes the
 * complement of this mask to capability_bounding_set_drop(), so the bits set
 * here are the capabilities that are kept. --capability= ORs further bits in. */
73 static uint64_t arg_retain =
75 (1ULL << CAP_DAC_OVERRIDE) |
76 (1ULL << CAP_DAC_READ_SEARCH) |
77 (1ULL << CAP_FOWNER) |
78 (1ULL << CAP_FSETID) |
79 (1ULL << CAP_IPC_OWNER) |
82 (1ULL << CAP_LINUX_IMMUTABLE) |
83 (1ULL << CAP_NET_BIND_SERVICE) |
84 (1ULL << CAP_NET_BROADCAST) |
85 (1ULL << CAP_NET_RAW) |
86 (1ULL << CAP_SETGID) |
87 (1ULL << CAP_SETFCAP) |
88 (1ULL << CAP_SETPCAP) |
89 (1ULL << CAP_SETUID) |
90 (1ULL << CAP_SYS_ADMIN) |
91 (1ULL << CAP_SYS_CHROOT) |
92 (1ULL << CAP_SYS_NICE) |
93 (1ULL << CAP_SYS_PTRACE) |
94 (1ULL << CAP_SYS_TTY_CONFIG) |
95 (1ULL << CAP_SYS_RESOURCE) |
96 (1ULL << CAP_SYS_BOOT);
/* Print the usage summary to standard output. The single %s placeholder is
 * filled with program_invocation_short_name. The return statement is not part
 * of this excerpt.
 *
 * NOTE(review): the "-j" line below claims equivalence with
 * --link-journal=host, while parse_argv() assigns LINK_GUEST immediately ahead
 * of its --link-journal case (presumably the -j case; the label is not
 * visible). Confirm which one is intended and align text and code. */
98 static int help(void) {
100 printf("%s [OPTIONS...] [PATH] [ARGUMENTS...]\n\n"
101 "Spawn a minimal namespace container for debugging, testing and building.\n\n"
102 " -h --help Show this help\n"
103 " -D --directory=NAME Root directory for the container\n"
104 " -b --boot Boot up full system (i.e. invoke init)\n"
105 " -u --user=USER Run the command under specified user or uid\n"
106 " -C --controllers=LIST Put the container in specified comma-separated cgroup hierarchies\n"
107 " --uuid=UUID Set a specific machine UUID for the container\n"
108 " --private-network Disable network in container\n"
109 " --read-only Mount the root directory read-only\n"
110 " --capability=CAP In addition to the default, retain specified capability\n"
111 " --link-journal=MODE Link up guest journal, one of no, auto, guest, host\n"
112 " -j Equivalent to --link-journal=host\n",
113 program_invocation_short_name);
/* Parse the command line with getopt_long() and store the results in the
 * arg_* globals. Several case labels, error branches and the return value are
 * not part of this excerpt. */
118 static int parse_argv(int argc, char *argv[]) {
/* Codes for options that only have a long form start at 0x100, which is
 * above every single-character option value. */
121 ARG_PRIVATE_NETWORK = 0x100,
128 static const struct option options[] = {
129 { "help", no_argument, NULL, 'h' },
130 { "directory", required_argument, NULL, 'D' },
131 { "user", required_argument, NULL, 'u' },
132 { "controllers", required_argument, NULL, 'C' },
133 { "private-network", no_argument, NULL, ARG_PRIVATE_NETWORK },
134 { "boot", no_argument, NULL, 'b' },
135 { "uuid", required_argument, NULL, ARG_UUID },
136 { "read-only", no_argument, NULL, ARG_READ_ONLY },
137 { "capability", required_argument, NULL, ARG_CAPABILITY },
138 { "link-journal", required_argument, NULL, ARG_LINK_JOURNAL },
/* The leading '+' in the option string makes GNU getopt stop at the first
 * non-option argument, so options that belong to the command run inside the
 * container are not consumed here. */
147 while ((c = getopt_long(argc, argv, "+hD:u:C:bj", options, NULL)) >= 0) {
/* Root directory: stored in canonicalized form (newly allocated string). */
157 arg_directory = canonicalize_file_name(optarg);
158 if (!arg_directory) {
159 log_error("Failed to canonicalize root directory.");
/* User name: stored as a private copy of the argument. */
167 if (!(arg_user = strdup(optarg))) {
168 log_error("Failed to duplicate user name.");
/* Controller list: a previously parsed list is released first, then the
 * argument is split at commas and passed through strv_uniq(). */
175 strv_free(arg_controllers);
176 arg_controllers = strv_split(optarg, ",");
177 if (!arg_controllers) {
178 log_error("Failed to split controllers list.");
181 strv_uniq(arg_controllers);
185 case ARG_PRIVATE_NETWORK:
186 arg_private_network = true;
198 arg_read_only = true;
/* Additional capabilities: each comma-separated word is copied, resolved
 * with cap_from_name() and its bit is added to arg_retain. */
201 case ARG_CAPABILITY: {
205 FOREACH_WORD_SEPARATOR(word, length, optarg, ",", state) {
209 t = strndup(word, length);
213 if (cap_from_name(t, &cap) < 0) {
214 log_error("Failed to parse capability %s.", t);
220 arg_retain |= 1ULL << (uint64_t) cap;
/* NOTE(review): this assignment sits directly ahead of the --link-journal
 * case and is presumably the body of the -j case (label not visible). It
 * selects LINK_GUEST, whereas help() documents -j as equivalent to
 * --link-journal=host. Confirm the intended mode. */
227 arg_link_journal = LINK_GUEST;
/* --link-journal=MODE: accepts exactly "auto", "no", "guest" or "host";
 * anything else is reported as a parse failure. */
230 case ARG_LINK_JOURNAL:
231 if (streq(optarg, "auto"))
232 arg_link_journal = LINK_AUTO;
233 else if (streq(optarg, "no"))
234 arg_link_journal = LINK_NO;
235 else if (streq(optarg, "guest"))
236 arg_link_journal = LINK_GUEST;
237 else if (streq(optarg, "host"))
238 arg_link_journal = LINK_HOST;
240 log_error("Failed to parse link journal mode %s", optarg);
250 log_error("Unknown option code %c", c);
/* Mount the API file systems listed in mount_table below the container root
 * given in dest. The struct definition, several error branches and the return
 * value are not part of this excerpt. */
258 static int mount_all(const char *dest) {
260 typedef struct MountPoint {
/* Columns, as far as the visible uses show: what (mount source, NULL for a
 * remount entry), where (path relative to dest), a file system type
 * (presumably; the field name is not visible), options, flags, fatal. */
269 static const MountPoint mount_table[] = {
270 { "proc", "/proc", "proc", NULL, MS_NOSUID|MS_NOEXEC|MS_NODEV, true },
271 { "/proc/sys", "/proc/sys", NULL, NULL, MS_BIND, true }, /* Bind mount first */
272 { NULL, "/proc/sys", NULL, NULL, MS_BIND|MS_RDONLY|MS_REMOUNT, true }, /* Then, make it r/o */
273 { "sysfs", "/sys", "sysfs", NULL, MS_RDONLY|MS_NOSUID|MS_NOEXEC|MS_NODEV, true },
274 { "tmpfs", "/dev", "tmpfs", "mode=755", MS_NOSUID|MS_STRICTATIME, true },
275 { "/dev/pts", "/dev/pts", NULL, NULL, MS_BIND, true },
276 { "tmpfs", "/dev/shm", "tmpfs", "mode=1777", MS_NOSUID|MS_NODEV|MS_STRICTATIME, true },
277 { "tmpfs", "/run", "tmpfs", "mode=755", MS_NOSUID|MS_NODEV|MS_STRICTATIME, true },
279 { "/sys/fs/selinux", "/sys/fs/selinux", NULL, NULL, MS_BIND, false }, /* Bind mount first */
280 { NULL, "/sys/fs/selinux", NULL, NULL, MS_BIND|MS_RDONLY|MS_REMOUNT, false }, /* Then, make it r/o */
288 for (k = 0; k < ELEMENTSOF(mount_table); k++) {
/* Target path inside the container: "<dest>/<where>". */
291 if (asprintf(&where, "%s/%s", dest, mount_table[k].where) < 0) {
/* A negative result from path_is_mount_point() is formatted as an errno
 * value in the message below. */
300 t = path_is_mount_point(where, true);
302 log_error("Failed to detect whether %s is a mount point: %s", where, strerror(-t));
311 /* Skip this entry if it is not a remount. */
/* Entries with a NULL source (the remount rows) pass this test even when the
 * target is already a mount point. */
312 if (mount_table[k].what && t > 0)
/* The result of creating the mount point directory is not checked here. */
315 mkdir_p_label(where, 0755);
/* A failing mount() is only reported when the table marks the entry fatal. */
317 if (mount(mount_table[k].what,
320 mount_table[k].flags,
321 mount_table[k].options) < 0 &&
322 mount_table[k].fatal) {
324 log_error("mount(%s) failed: %m", where);
/* Make the host's time zone files visible inside the container by bind
 * mounting /etc/localtime and /etc/timezone over the paths of the same name
 * below dest. In the visible code a failing first mount() is not reported;
 * only on success is the bind mount remounted read-only, and the result of
 * that remount is not checked. Allocation checks and the return value are not
 * part of this excerpt. */
336 static int setup_timezone(const char *dest) {
341 /* Fix the timezone, if possible */
342 where = strappend(dest, "/etc/localtime");
346 if (mount("/etc/localtime", where, "bind", MS_BIND, NULL) >= 0)
347 mount("/etc/localtime", where, "bind", MS_BIND|MS_REMOUNT|MS_RDONLY, NULL);
351 where = strappend(dest, "/etc/timezone");
355 if (mount("/etc/timezone", where, "bind", MS_BIND, NULL) >= 0)
356 mount("/etc/timezone", where, "bind", MS_BIND|MS_REMOUNT|MS_RDONLY, NULL);
/* Bind mount the host's /etc/resolv.conf over the same path below dest and,
 * if that worked, remount it read-only. A failing bind mount is not reported
 * in the visible code. */
363 static int setup_resolv_conf(const char *dest) {
/* With --private-network a separate branch is taken; its body is not visible
 * here (presumably an early return, since the host resolver configuration is
 * of no use without network access; confirm). */
368 if (arg_private_network)
371 /* Fix resolv.conf, if possible */
372 where = strappend(dest, "/etc/resolv.conf");
376 if (mount("/etc/resolv.conf", where, "bind", MS_BIND, NULL) >= 0)
377 mount("/etc/resolv.conf", where, "bind", MS_BIND|MS_REMOUNT|MS_RDONLY, NULL);
/* Give the container its own boot ID: a random 128 bit ID is written in UUID
 * notation to a file below <dest>/dev, and that file is bind mounted over
 * <dest>/proc/sys/kernel/random/boot_id. Error branches, cleanup and the
 * return value are not part of this excerpt. */
384 static int setup_boot_id(const char *dest) {
385 char *from = NULL, *to = NULL;
392 /* Generate a new randomized boot ID, so that each boot-up of
393 * the container gets a new one */
/* from: file holding the generated ID; to: the path it is mounted over. */
395 from = strappend(dest, "/dev/proc-sys-kernel-random-boot-id");
401 to = strappend(dest, "/proc/sys/kernel/random/boot_id");
407 r = sd_id128_randomize(&rnd);
409 log_error("Failed to generate random boot id: %s", strerror(-r));
/* Format the 16 bytes as 8-4-4-4-12 hex digits; char_array_0() is applied
 * to the buffer afterwards (presumably forcing NUL termination). */
413 snprintf(as_uuid, sizeof(as_uuid),
414 "%02x%02x%02x%02x-%02x%02x-%02x%02x-%02x%02x-%02x%02x%02x%02x%02x%02x",
415 SD_ID128_FORMAT_VAL(rnd));
416 char_array_0(as_uuid);
418 r = write_one_line_file(from, as_uuid);
420 log_error("Failed to write boot id: %s", strerror(-r));
/* A failing bind mount is reported; the result of the read-only remount that
 * follows is not checked. */
424 if (mount(from, to, "bind", MS_BIND, NULL) < 0) {
425 log_error("Failed to bind mount boot id: %m");
428 mount(from, to, "bind", MS_BIND|MS_REMOUNT|MS_RDONLY, NULL);
/* Recreate a fixed list of host device nodes below <dest>/dev. The list
 * itself, the cleanup code and the return value are not part of this
 * excerpt. */
439 static int copy_devnodes(const char *dest) {
441 static const char devnodes[] =
458 NULSTR_FOREACH(d, devnodes) {
460 char *from = NULL, *to = NULL;
/* from: node on the host; to: node to create inside the container.
 * NOTE(review): the asprintf() return values are not checked on these two
 * lines; the allocation failure message below presumably belongs to a test
 * of the pointers (the condition is not visible). */
462 asprintf(&from, "/dev/%s", d);
463 asprintf(&to, "%s/dev/%s", dest, d);
466 log_error("Failed to allocate devnode path");
/* A stat() failure is only logged when errno is something other than ENOENT. */
479 if (stat(from, &st) < 0) {
481 if (errno != ENOENT) {
482 log_error("Failed to stat %s: %m", from);
487 } else if (!S_ISCHR(st.st_mode) && !S_ISBLK(st.st_mode)) {
489 log_error("%s is not a char or block device, cannot copy.", from);
/* The new node takes mode and device number from the host node.
 * NOTE(review): the failure message prints dest, although the path handed to
 * mknod() is to; confirm whether the message should name to. */
493 } else if (mknod(to, st.st_mode, st.st_rdev) < 0) {
495 log_error("mknod(%s) failed: %m", dest);
/* Provide <dest>/dev/console by creating a device node there and bind
 * mounting the tty named by console over it. The tty is first verified to be
 * a character device, and chmod_and_chown() is called on it with mode 0600
 * and uid/gid 0. The node is created with the tty's file type bits, access
 * mode 0600 and the tty's device number. Failures of mknod() and of the bind
 * mount are logged. Cleanup and the return value are not part of this
 * excerpt, and the explanatory comment inside the body has lost its final
 * line in this excerpt. */
509 static int setup_dev_console(const char *dest, const char *console) {
520 if (stat(console, &st) < 0) {
521 log_error("Failed to stat %s: %m", console);
525 } else if (!S_ISCHR(st.st_mode)) {
526 log_error("/dev/console is not a char device.");
531 r = chmod_and_chown(console, 0600, 0, 0);
533 log_error("Failed to correct access mode for TTY: %s", strerror(-r));
537 if (asprintf(&to, "%s/dev/console", dest) < 0) {
542 /* We need to bind mount the right tty to /dev/console since
543 * ptys can only exist on pts file systems. To have something
544 * to bind mount things on we create a device node first, that
545 * has the right major/minor (note that the major minor
546 * doesn't actually matter here, since we mount it over
549 if (mknod(to, (st.st_mode & ~07777) | 0600, st.st_rdev) < 0) {
550 log_error("mknod() for /dev/console failed: %m");
555 if (mount(console, to, "bind", MS_BIND, NULL) < 0) {
556 log_error("Bind mount for /dev/console failed: %m");
568 static int setup_kmsg(const char *dest, int kmsg_socket) {
569 char *from = NULL, *to = NULL;
573 struct cmsghdr cmsghdr;
574 uint8_t buf[CMSG_SPACE(sizeof(int))];
577 struct cmsghdr *cmsg;
580 assert(kmsg_socket >= 0);
584 /* We create the kmsg FIFO as /dev/kmsg, but immediately
585 * delete it after bind mounting it to /proc/kmsg. While FIFOs
586 * on the reading side behave very similar to /proc/kmsg,
587 * their writing side behaves differently from /dev/kmsg in
588 * that writing blocks when nothing is reading. In order to
589 * avoid any problems with containers deadlocking due to this
590 * we simply make /dev/kmsg unavailable to the container. */
/* Summary of setup_kmsg(): a FIFO is created at <dest>/dev/kmsg, bind mounted
 * over <dest>/proc/kmsg, opened, and the open descriptor is sent over
 * kmsg_socket as SCM_RIGHTS ancillary data. Cleanup and the return value are
 * not part of this excerpt. */
591 if (asprintf(&from, "%s/dev/kmsg", dest) < 0) {
596 if (asprintf(&to, "%s/proc/kmsg", dest) < 0) {
601 if (mkfifo(from, 0600) < 0) {
602 log_error("mkfifo() for /dev/kmsg failed: %m");
607 r = chmod_and_chown(from, 0600, 0, 0);
609 log_error("Failed to correct access mode for /dev/kmsg: %s", strerror(-r));
613 if (mount(from, to, "bind", MS_BIND, NULL) < 0) {
614 log_error("Bind mount for /proc/kmsg failed: %m");
/* The FIFO is opened for reading and writing in non-blocking mode. */
619 fd = open(from, O_RDWR|O_NDELAY|O_CLOEXEC);
621 log_error("Failed to open fifo: %m");
/* Build a message whose control data is a single SCM_RIGHTS header carrying
 * fd; msg_controllen is then narrowed to that header's length. */
629 mh.msg_control = &control;
630 mh.msg_controllen = sizeof(control);
632 cmsg = CMSG_FIRSTHDR(&mh);
633 cmsg->cmsg_level = SOL_SOCKET;
634 cmsg->cmsg_type = SCM_RIGHTS;
635 cmsg->cmsg_len = CMSG_LEN(sizeof(int));
636 memcpy(CMSG_DATA(cmsg), &fd, sizeof(int));
638 mh.msg_controllen = cmsg->cmsg_len;
640 /* Store away the fd in the socket, so that it stays open as
641 * long as we run the child */
/* The local descriptor is closed right after sendmsg(), before the send
 * result is evaluated. */
642 k = sendmsg(kmsg_socket, &mh, MSG_DONTWAIT|MSG_NOSIGNAL);
643 close_nointr_nofail(fd);
646 log_error("Failed to send FIFO fd: %m");
651 /* And now make the FIFO unavailable as /dev/kmsg... */
/* Set the host name from arg_directory: path_get_file_name() is applied to
 * the directory (presumably yielding its final path component), the result is
 * passed through hostname_cleanup() and then handed to sethostname(). How hn
 * is copied or freed, the handling of a sethostname() failure and the return
 * value are not part of this excerpt. */
662 static int setup_hostname(void) {
666 hn = path_get_file_name(arg_directory);
672 hostname_cleanup(hn);
675 if (sethostname(hn, strlen(hn)) < 0)
/* Connect the journal directory of the container with the one of the host,
 * depending on arg_link_journal. The directory name is the machine ID read
 * from <directory>/etc/machine-id. Many conditions, the cleanup code and the
 * return value are not part of this excerpt. */
684 static int setup_journal(const char *directory) {
685 sd_id128_t machine_id;
686 char *p = NULL, *b = NULL, *l, *q = NULL, *d = NULL;
/* LINK_NO takes a separate branch (body not visible; presumably returns
 * without doing anything). */
689 if (arg_link_journal == LINK_NO)
692 p = strappend(directory, "/etc/machine-id");
/* In LINK_AUTO mode a missing or empty machine-id file is treated separately
 * from the error paths below (the branch bodies are not visible). */
698 r = read_one_line_file(p, &b);
699 if (r == -ENOENT && arg_link_journal == LINK_AUTO) {
703 log_error("Failed to read machine ID: %s", strerror(-r));
708 if (isempty(l) && arg_link_journal == LINK_AUTO) {
713 /* Verify validity */
714 r = sd_id128_from_string(l, &machine_id);
716 log_error("Failed to parse machine ID: %s", strerror(-r));
/* p: journal directory for this machine ID on the host.
 * q: the same directory below the container root. */
721 p = strappend("/var/log/journal/", l);
722 q = strjoin(directory, "/var/log/journal/", l, NULL);
/* If either path already is a mount point, only LINK_AUTO avoids the error. */
728 if (path_is_mount_point(p, false) > 0 ||
729 path_is_mount_point(q, false) > 0) {
730 if (arg_link_journal != LINK_AUTO) {
731 log_error("Journal already a mount point, refusing.");
/* Examine what currently exists at the host path: the result of
 * readlink_and_make_absolute() selects between the branches below, with
 * -EINVAL and -ENOENT handled specially. */
740 r = readlink_and_make_absolute(p, &d);
742 if ((arg_link_journal == LINK_GUEST ||
743 arg_link_journal == LINK_AUTO) &&
753 log_error("Failed to remove symlink %s: %m", p);
757 } else if (r == -EINVAL) {
759 if (arg_link_journal == LINK_GUEST &&
762 if (errno == ENOTDIR)
763 log_error("%s already exists and is neither symlink nor directory.", p);
765 log_error("Failed to remove %s: %m", p);
/* NOTE(review): the message uses %m (errno) although the result is held in
 * r, which is compared against negative errno constants above; confirm that
 * errno is still meaningful at this point. */
771 } else if (r != -ENOENT) {
772 log_error("readlink(%s) failed: %m", p);
/* LINK_GUEST: the host path becomes a symlink pointing at the directory
 * inside the container. */
776 if (arg_link_journal == LINK_GUEST) {
778 if (symlink(q, p) < 0) {
779 log_error("Failed to symlink %s to %s: %m", q, p);
/* LINK_HOST: the host directory is created if needed. Otherwise a separate
 * branch is taken when the host path does not exist (body not visible).
 * NOTE(review): mkdir_p() stores its result in r, yet the message uses %m;
 * presumably r is a negative errno value, in which case strerror(-r) would
 * be the matching form. Confirm. The same applies to the second mkdir_p()
 * call further down. */
790 if (arg_link_journal == LINK_HOST) {
791 r = mkdir_p(p, 0755);
793 log_error("Failed to create %s: %m", p);
797 } else if (access(p, F_OK) < 0) {
/* A result of 0 from dir_is_empty() is treated as "not empty". */
802 if (dir_is_empty(q) == 0) {
803 log_error("%s not empty.", q);
808 r = mkdir_p(q, 0755);
810 log_error("Failed to create %s: %m", q);
/* Finally the host directory is bind mounted onto the container directory. */
814 if (mount(p, q, "bind", MS_BIND, NULL) < 0) {
815 log_error("Failed to bind mount journal from host into guest: %m");
/* Drop every capability whose bit is not set in arg_retain by passing the
 * complement of arg_retain to capability_bounding_set_drop(); the result of
 * that call is returned unchanged. */
831 static int drop_capabilities(void) {
832 return capability_bounding_set_drop(~arg_retain, false);
/* Heuristic test whether path is the root of an operating system tree, based
 * on <path>/bin/sh. Returns 1 when the check stored in r did not fail and 0
 * otherwise. The check itself and the handling of an allocation failure are
 * not part of this excerpt. */
835 static int is_os_tree(const char *path) {
838 /* We use /bin/sh as flag file if something is an OS */
840 if (asprintf(&p, "%s/bin/sh", path) < 0)
846 return r < 0 ? 0 : 1;
/* Relay data between the caller's terminal and the pty master until the loop
 * is left: stdin is copied to master and master is copied to stdout, each
 * through a buffer of LINE_MAX bytes. Signals in mask are received through a
 * signalfd. The loop exit conditions, the error branches and the return
 * value are not part of this excerpt. */
849 static int process_pty(int master, sigset_t *mask) {
851 char in_buffer[LINE_MAX], out_buffer[LINE_MAX];
852 size_t in_buffer_full = 0, out_buffer_full = 0;
853 struct epoll_event stdin_ev, stdout_ev, master_ev, signal_ev;
/* Readiness flags: set when epoll reports an event, cleared again when the
 * corresponding read() or write() fails with one of the tolerated errors. */
854 bool stdin_readable = false, stdout_writable = false, master_readable = false, master_writable = false;
855 int ep = -1, signal_fd = -1, r;
/* All three descriptors are switched to non-blocking mode. */
857 fd_nonblock(STDIN_FILENO, 1);
858 fd_nonblock(STDOUT_FILENO, 1);
859 fd_nonblock(master, 1);
861 signal_fd = signalfd(-1, mask, SFD_NONBLOCK|SFD_CLOEXEC);
863 log_error("signalfd(): %m");
868 ep = epoll_create1(EPOLL_CLOEXEC);
870 log_error("Failed to create epoll: %m");
/* stdin, stdout and master are registered with EPOLLET (edge-triggered);
 * the signalfd is registered without it. */
876 stdin_ev.events = EPOLLIN|EPOLLET;
877 stdin_ev.data.fd = STDIN_FILENO;
880 stdout_ev.events = EPOLLOUT|EPOLLET;
881 stdout_ev.data.fd = STDOUT_FILENO;
884 master_ev.events = EPOLLIN|EPOLLOUT|EPOLLET;
885 master_ev.data.fd = master;
888 signal_ev.events = EPOLLIN;
889 signal_ev.data.fd = signal_fd;
/* NOTE(review): the message below spells "regiser"; it is a runtime string
 * and is left untouched here. */
891 if (epoll_ctl(ep, EPOLL_CTL_ADD, STDIN_FILENO, &stdin_ev) < 0 ||
892 epoll_ctl(ep, EPOLL_CTL_ADD, STDOUT_FILENO, &stdout_ev) < 0 ||
893 epoll_ctl(ep, EPOLL_CTL_ADD, master, &master_ev) < 0 ||
894 epoll_ctl(ep, EPOLL_CTL_ADD, signal_fd, &signal_ev) < 0) {
895 log_error("Failed to regiser fds in epoll: %m");
901 struct epoll_event ev[16];
/* Wait without timeout. EINTR and EAGAIN are handled separately from the
 * error that is logged below. */
905 nfds = epoll_wait(ep, ev, ELEMENTSOF(ev), -1);
908 if (errno == EINTR || errno == EAGAIN)
911 log_error("epoll_wait(): %m");
/* Translate the reported events into readiness flags. EPOLLHUP also sets
 * the flag for the affected descriptor. */
918 for (i = 0; i < nfds; i++) {
919 if (ev[i].data.fd == STDIN_FILENO) {
921 if (ev[i].events & (EPOLLIN|EPOLLHUP))
922 stdin_readable = true;
924 } else if (ev[i].data.fd == STDOUT_FILENO) {
926 if (ev[i].events & (EPOLLOUT|EPOLLHUP))
927 stdout_writable = true;
929 } else if (ev[i].data.fd == master) {
931 if (ev[i].events & (EPOLLIN|EPOLLHUP))
932 master_readable = true;
934 if (ev[i].events & (EPOLLOUT|EPOLLHUP))
935 master_writable = true;
937 } else if (ev[i].data.fd == signal_fd) {
938 struct signalfd_siginfo sfsi;
/* One signalfd_siginfo record is read per event. A short read is reported
 * as an invalid block size; a failed read is only reported when errno is
 * neither EINTR nor EAGAIN. */
941 n = read(signal_fd, &sfsi, sizeof(sfsi));
942 if (n != sizeof(sfsi)) {
945 log_error("Failed to read from signalfd: invalid block size");
950 if (errno != EINTR && errno != EAGAIN) {
951 log_error("Failed to read from signalfd: %m");
/* Handling of signals other than SIGWINCH is not visible in this excerpt. */
957 if (sfsi.ssi_signo == SIGWINCH) {
960 /* The window size changed, let's forward that. */
961 if (ioctl(STDIN_FILENO, TIOCGWINSZ, &ws) >= 0)
962 ioctl(master, TIOCSWINSZ, &ws);
/* Keep shuffling while progress is possible: an empty buffer with a readable
 * source, or a non-empty buffer with a writable sink. */
971 while ((stdin_readable && in_buffer_full <= 0) ||
972 (master_writable && in_buffer_full > 0) ||
973 (master_readable && out_buffer_full <= 0) ||
974 (stdout_writable && out_buffer_full > 0)) {
/* stdin -> in_buffer: append behind the bytes already buffered. */
976 if (stdin_readable && in_buffer_full < LINE_MAX) {
978 k = read(STDIN_FILENO, in_buffer + in_buffer_full, LINE_MAX - in_buffer_full);
981 if (errno == EAGAIN || errno == EPIPE || errno == ECONNRESET || errno == EIO)
982 stdin_readable = false;
984 log_error("read(): %m");
989 in_buffer_full += (size_t) k;
/* in_buffer -> master: after a partial write the remaining bytes are moved
 * to the front of the buffer. */
992 if (master_writable && in_buffer_full > 0) {
994 k = write(master, in_buffer, in_buffer_full);
997 if (errno == EAGAIN || errno == EPIPE || errno == ECONNRESET || errno == EIO)
998 master_writable = false;
1000 log_error("write(): %m");
1006 assert(in_buffer_full >= (size_t) k);
1007 memmove(in_buffer, in_buffer + k, in_buffer_full - k);
1008 in_buffer_full -= k;
/* master -> out_buffer. */
1012 if (master_readable && out_buffer_full < LINE_MAX) {
1014 k = read(master, out_buffer + out_buffer_full, LINE_MAX - out_buffer_full);
1017 if (errno == EAGAIN || errno == EPIPE || errno == ECONNRESET || errno == EIO)
1018 master_readable = false;
1020 log_error("read(): %m");
1025 out_buffer_full += (size_t) k;
/* out_buffer -> stdout, with the same compaction as above. */
1028 if (stdout_writable && out_buffer_full > 0) {
1030 k = write(STDOUT_FILENO, out_buffer, out_buffer_full);
1033 if (errno == EAGAIN || errno == EPIPE || errno == ECONNRESET || errno == EIO)
1034 stdout_writable = false;
1036 log_error("write(): %m");
1042 assert(out_buffer_full >= (size_t) k);
1043 memmove(out_buffer, out_buffer + k, out_buffer_full - k);
1044 out_buffer_full -= k;
/* Cleanup: the epoll descriptor and the signalfd are closed (the guarding
 * conditions are not visible). */
1052 close_nointr_nofail(ep);
1055 close_nointr_nofail(signal_fd);
/* Program entry point. Visible steps: parse the command line, validate the
 * environment, create a cgroup for the container, allocate a pty, clone a
 * child into new namespaces that prepares the container root and executes
 * the payload, relay the pty in the parent, and finally evaluate the child's
 * exit status. Many lines (error branches, goto targets, closing braces and
 * the branch that separates child from parent) are not part of this
 * excerpt. */
1060 int main(int argc, char *argv[]) {
1062 int r = EXIT_FAILURE, k;
1063 char *oldcg = NULL, *newcg = NULL;
1064 char **controller = NULL;
1066 const char *console = NULL;
1067 struct termios saved_attr, raw_attr;
1069 bool saved_attr_valid = false;
1071 int kmsg_socket_pair[2] = { -1, -1 };
1073 log_parse_environment();
1076 r = parse_argv(argc, argv);
/* Container directory: a directory given on the command line is made
 * absolute relative to the current directory; otherwise the current
 * directory itself is used. */
1080 if (arg_directory) {
1083 p = path_make_absolute_cwd(arg_directory);
1084 free(arg_directory);
1087 arg_directory = get_current_dir_name();
1089 if (!arg_directory) {
1090 log_error("Failed to determine path");
1094 path_kill_slashes(arg_directory);
/* Preconditions: effective uid 0, sd_booted() reporting a systemd system, a
 * directory other than "/", and a directory accepted by is_os_tree(). */
1096 if (geteuid() != 0) {
1097 log_error("Need to be root.");
1101 if (sd_booted() <= 0) {
1102 log_error("Not running on a systemd system.");
1106 if (path_equal(arg_directory, "/")) {
1107 log_error("Spawning container on root directory not supported.");
1111 if (is_os_tree(arg_directory) <= 0) {
1112 log_error("Directory %s doesn't look like an OS root directory. Refusing.", arg_directory);
/* Control group: a group named "nspawn-<pid>" is created below the current
 * group and this process is attached to it. For the additional controllers
 * a failure is only logged as a warning. */
1116 k = cg_get_by_pid(SYSTEMD_CGROUP_CONTROLLER, 0, &oldcg);
1118 log_error("Failed to determine current cgroup: %s", strerror(-k));
1122 if (asprintf(&newcg, "%s/nspawn-%lu", oldcg, (unsigned long) getpid()) < 0) {
1123 log_error("Failed to allocate cgroup path.");
1127 k = cg_create_and_attach(SYSTEMD_CGROUP_CONTROLLER, newcg, 0);
1129 log_error("Failed to create cgroup: %s", strerror(-k));
1133 STRV_FOREACH(controller, arg_controllers) {
1134 k = cg_create_and_attach(*controller, newcg, 0);
1136 log_warning("Failed to create cgroup in controller %s: %s", *controller, strerror(-k));
/* Pseudo terminal: master stays with this process, the slave name in console
 * is later used for the container's /dev/console. */
1139 master = posix_openpt(O_RDWR|O_NOCTTY|O_CLOEXEC|O_NDELAY);
1141 log_error("Failed to acquire pseudo tty: %m");
1145 console = ptsname(master);
1147 log_error("Failed to determine tty name: %m");
1151 log_info("Spawning namespace container on %s (console is %s).", arg_directory, console);
/* Copy the current window size of stdin to the pty; failures are ignored. */
1153 if (ioctl(STDIN_FILENO, TIOCGWINSZ, &ws) >= 0)
1154 ioctl(master, TIOCSWINSZ, &ws);
1156 if (unlockpt(master) < 0) {
1157 log_error("Failed to unlock tty: %m");
/* Terminal attributes are saved so that they can be restored later; the raw
 * copy additionally has ECHO cleared. */
1161 if (tcgetattr(STDIN_FILENO, &saved_attr) < 0) {
1162 log_error("Failed to get terminal attributes: %m");
1166 saved_attr_valid = true;
1168 raw_attr = saved_attr;
1169 cfmakeraw(&raw_attr);
1170 raw_attr.c_lflag &= ~ECHO;
/* Socket pair used by setup_kmsg() to park the kmsg FIFO descriptor.
 * NOTE(review): unlike the neighbouring messages this one carries no %m. */
1172 if (socketpair(AF_UNIX, SOCK_DGRAM|SOCK_NONBLOCK|SOCK_CLOEXEC, 0, kmsg_socket_pair) < 0) {
1173 log_error("Failed to create kmsg socket pair");
/* These signals are blocked here; the same mask is later handed to
 * process_pty(), which reads them through a signalfd. */
1177 assert_se(sigemptyset(&mask) == 0);
1178 sigset_add_many(&mask, SIGCHLD, SIGWINCH, SIGTERM, SIGINT, -1);
1179 assert_se(sigprocmask(SIG_BLOCK, &mask, NULL) == 0);
1184 if (tcsetattr(STDIN_FILENO, TCSANOW, &raw_attr) < 0) {
1185 log_error("Failed to set terminal attributes: %m");
/* Create the child with new IPC, mount, PID and UTS namespaces, plus a new
 * network namespace when --private-network was given. */
1189 pid = syscall(__NR_clone, SIGCHLD|CLONE_NEWIPC|CLONE_NEWNS|CLONE_NEWPID|CLONE_NEWUTS|(arg_private_network ? CLONE_NEWNET : 0), NULL);
1191 if (errno == EINVAL)
1192 log_error("clone() failed, do you have namespace support enabled in your kernel? (You need UTS, IPC, PID and NET namespacing built in): %m");
1194 log_error("clone() failed: %m");
/* From here up to the _exit() call the code apparently runs in the child
 * (the branch line itself is not visible). */
1202 const char *home = NULL;
1203 uid_t uid = (uid_t) -1;
1204 gid_t gid = (gid_t) -1;
/* Environment for the payload. Slot 2 is set to the caller's TERM entry,
 * slots 3 to 5 receive HOME, USER and LOGNAME, slot 6 the container UUID. */
1205 const char *envp[] = {
1206 "PATH=/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin",
1207 "container=systemd-nspawn", /* LXC sets container=lxc, so follow the scheme here */
1212 NULL, /* container_uuid */
1216 envp[2] = strv_find_prefix(environ, "TERM=");
/* Close the pty master and the inherited standard descriptors, then every
 * other descriptor except kmsg_socket_pair[1]. */
1218 close_nointr_nofail(master);
1220 close_nointr(STDIN_FILENO);
1221 close_nointr(STDOUT_FILENO);
1222 close_nointr(STDERR_FILENO);
1224 close_all_fds(&kmsg_socket_pair[1], 1);
1226 reset_all_signal_handlers();
/* Unblock all signals again for the child. */
1228 assert_se(sigemptyset(&mask) == 0);
1229 assert_se(sigprocmask(SIG_SETMASK, &mask, NULL) == 0);
/* The console pty must come back as descriptor 0 and is then duplicated onto
 * descriptors 1 and 2. */
1231 if (open_terminal(console, O_RDWR) != STDIN_FILENO ||
1232 dup2(STDIN_FILENO, STDOUT_FILENO) != STDOUT_FILENO ||
1233 dup2(STDIN_FILENO, STDERR_FILENO) != STDERR_FILENO)
1237 log_error("setsid() failed: %m");
/* Ask the kernel to send SIGKILL to this child when its parent dies. */
1241 if (prctl(PR_SET_PDEATHSIG, SIGKILL) < 0) {
1242 log_error("PR_SET_PDEATHSIG failed: %m");
1246 /* Mark everything as slave, so that we still
1247 * receive mounts from the real root, but don't
1248 * propagate mounts to the real root. */
1249 if (mount(NULL, "/", NULL, MS_SLAVE|MS_REC, NULL) < 0) {
1250 log_error("MS_SLAVE|MS_REC failed: %m");
1254 /* Turn directory into bind mount */
1255 if (mount(arg_directory, arg_directory, "bind", MS_BIND|MS_REC, NULL) < 0) {
1256 log_error("Failed to make bind mount.");
/* Read-only remount of the container root (presumably guarded by
 * arg_read_only; the condition is not visible). */
1261 if (mount(arg_directory, arg_directory, "bind", MS_BIND|MS_REMOUNT|MS_RDONLY|MS_REC, NULL) < 0) {
1262 log_error("Failed to make read-only.");
/* Populate the container root; each failing step takes an error branch that
 * is not visible here. The result of dev_setup() is not checked. */
1266 if (mount_all(arg_directory) < 0)
1269 if (copy_devnodes(arg_directory) < 0)
1272 dev_setup(arg_directory);
1274 if (setup_dev_console(arg_directory, console) < 0)
1277 if (setup_kmsg(arg_directory, kmsg_socket_pair[1]) < 0)
1280 close_nointr_nofail(kmsg_socket_pair[1]);
1282 if (setup_boot_id(arg_directory) < 0)
1285 if (setup_timezone(arg_directory) < 0)
1288 if (setup_resolv_conf(arg_directory) < 0)
1291 if (setup_journal(arg_directory) < 0)
/* Switch the root: enter the directory, move its mount onto "/", chroot into
 * it and change to the new "/". */
1294 if (chdir(arg_directory) < 0) {
1295 log_error("chdir(%s) failed: %m", arg_directory);
1299 if (mount(arg_directory, "/", NULL, MS_MOVE, NULL) < 0) {
1300 log_error("mount(MS_MOVE) failed: %m");
1304 if (chroot(".") < 0) {
1305 log_error("chroot() failed: %m");
1309 if (chdir("/") < 0) {
1310 log_error("chdir() failed: %m");
1318 if (drop_capabilities() < 0) {
1319 log_error("drop_capabilities() failed: %m");
/* User switch (presumably only when --user was given; the guard is not
 * visible): resolve the user, create the home directory, then set groups,
 * gid and uid in that order. */
1325 if (get_user_creds((const char**)&arg_user, &uid, &gid, &home, NULL) < 0) {
1326 log_error("get_user_creds() failed: %m");
1330 if (mkdir_parents_label(home, 0775) < 0) {
1331 log_error("mkdir_parents_label() failed: %m");
1335 if (mkdir_safe_label(home, 0775, uid, gid) < 0) {
1336 log_error("mkdir_safe_label() failed: %m");
1340 if (initgroups((const char*)arg_user, gid) < 0) {
1341 log_error("initgroups() failed: %m");
/* NOTE(review): the two messages below name setregid() and setreuid(), while
 * the calls made are setresgid() and setresuid(). */
1345 if (setresgid(gid, gid, gid) < 0) {
1346 log_error("setregid() failed: %m");
1350 if (setresuid(uid, uid, uid) < 0) {
1351 log_error("setreuid() failed: %m");
/* Fill the remaining environment slots; without a resolved user the values
 * fall back to /root and root. */
1356 if ((asprintf((char**)(envp + 3), "HOME=%s", home ? home: "/root") < 0) ||
1357 (asprintf((char**)(envp + 4), "USER=%s", arg_user ? arg_user : "root") < 0) ||
1358 (asprintf((char**)(envp + 5), "LOGNAME=%s", arg_user ? arg_user : "root") < 0)) {
1364 if (asprintf((char**)(envp + 6), "container_uuid=%s", arg_uuid) < 0) {
1376 /* Automatically search for the init system */
/* Build the argument vector for init: slot 0 is the init path; the remaining
 * command line arguments follow. l counts one element more than the number
 * of remaining arguments, so the NULL terminator of argv is copied too. */
1378 l = 1 + argc - optind;
1379 a = newa(char*, l + 1);
1380 memcpy(a + 1, argv + optind, l * sizeof(char*));
/* The candidates are tried in order; execve() only returns on failure. */
1382 a[0] = (char*) "/usr/lib/systemd/systemd";
1383 execve(a[0], a, (char**) envp);
1385 a[0] = (char*) "/lib/systemd/systemd";
1386 execve(a[0], a, (char**) envp);
1388 a[0] = (char*) "/sbin/init";
1389 execve(a[0], a, (char**) envp);
/* Without init mode: run the given command, or fall back to /bin/bash with
 * argv[0] set to "-bash" after changing to the home directory (the chdir()
 * result is not checked). */
1390 } else if (argc > optind)
1391 execvpe(argv[optind], argv + optind, (char**) envp);
1393 chdir(home ? home : "/root");
1394 execle("/bin/bash", "-bash", NULL, (char**) envp);
1397 log_error("execv() failed: %m");
1400 _exit(EXIT_FAILURE);
/* Parent side: relay terminal traffic until process_pty() returns, restore
 * the terminal attributes and wait for the child. */
1403 if (process_pty(master, &mask) < 0)
1407 if (saved_attr_valid)
1408 tcsetattr(STDIN_FILENO, TCSANOW, &saved_attr);
1410 r = wait_for_terminate(pid, &status);
/* Interpret the child's fate: a non-zero exit code becomes the result r;
 * termination by SIGINT is logged as shutdown and by SIGHUP as reboot; any
 * other signal or a core dump is reported as an error. */
1416 if (status.si_code == CLD_EXITED) {
1417 if (status.si_status != 0) {
1418 log_error("Container failed with error code %i.", status.si_status);
1419 r = status.si_status;
1423 log_debug("Container exited successfully.");
1425 } else if (status.si_code == CLD_KILLED &&
1426 status.si_status == SIGINT) {
1427 log_info("Container has been shut down.");
1430 } else if (status.si_code == CLD_KILLED &&
1431 status.si_status == SIGHUP) {
1432 log_info("Container is being rebooted.");
1434 } else if (status.si_code == CLD_KILLED ||
1435 status.si_code == CLD_DUMPED) {
1437 log_error("Container terminated by signal %s.", signal_to_string(status.si_status));
1441 log_error("Container failed due to unknown reason.");
/* Common cleanup: restore the terminal, close descriptors, move this process
 * back to its original cgroup and kill whatever is left in the container's
 * cgroup, then release the option strings. */
1448 if (saved_attr_valid)
1449 tcsetattr(STDIN_FILENO, TCSANOW, &saved_attr);
1452 close_nointr_nofail(master);
1454 close_pipe(kmsg_socket_pair);
1457 cg_attach(SYSTEMD_CGROUP_CONTROLLER, oldcg, 0);
1460 cg_kill_recursive_and_wait(SYSTEMD_CGROUP_CONTROLLER, newcg, true);
1462 free(arg_directory);
1463 strv_free(arg_controllers);