1 /*-*- Mode: C; c-basic-offset: 8; indent-tabs-mode: nil -*-*/
4 This file is part of systemd.
6 Copyright 2010 Lennart Poettering
8 systemd is free software; you can redistribute it and/or modify it
9 under the terms of the GNU Lesser General Public License as published by
10 the Free Software Foundation; either version 2.1 of the License, or
11 (at your option) any later version.
13 systemd is distributed in the hope that it will be useful, but
14 WITHOUT ANY WARRANTY; without even the implied warranty of
15 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
16 Lesser General Public License for more details.
18 You should have received a copy of the GNU Lesser General Public License
19 along with systemd; If not, see <http://www.gnu.org/licenses/>.
25 #include <sys/types.h>
26 #include <sys/syscall.h>
27 #include <sys/mount.h>
33 #include <sys/prctl.h>
34 #include <sys/capability.h>
36 #include <sys/epoll.h>
38 #include <sys/signalfd.h>
42 #include <sys/socket.h>
44 #include <systemd/sd-daemon.h>
52 #include "cgroup-util.h"
54 #include "path-util.h"
55 #include "loopback-setup.h"
57 #include "dev-setup.h"
/* Journal linking mode selected with --link-journal. The values referenced
 * elsewhere in this file are LINK_NO, LINK_AUTO, LINK_GUEST and LINK_HOST. */
59 typedef enum LinkJournal {
/* Command line settings. parse_argv() fills these in; the defaults below
 * apply when the corresponding option is not given. */

/* Container root directory (-D); parse_argv() stores a canonicalized copy. */
66 static char *arg_directory = NULL;
/* User to run as inside the container (-u); NULL means "root" is used for
 * the USER/LOGNAME environment variables. */
67 static char *arg_user = NULL;
/* Additional cgroup controllers (-C), a NULL-terminated string list. */
68 static char **arg_controllers = NULL;
/* Value exported to the container as container_uuid= (--uuid). */
69 static char *arg_uuid = NULL;
/* When set, the container is cloned with CLONE_NEWNET and resolv.conf is
 * not set up. */
70 static bool arg_private_network = false;
71 static bool arg_read_only = false;
72 static bool arg_boot = false;
73 static LinkJournal arg_link_journal = LINK_AUTO;
/* Bit mask of capabilities to keep, one bit per capability number.
 * drop_capabilities() passes the complement of this mask to
 * capability_bounding_set_drop(); --capability ORs further bits in. */
74 static uint64_t arg_retain =
76 (1ULL << CAP_DAC_OVERRIDE) |
77 (1ULL << CAP_DAC_READ_SEARCH) |
78 (1ULL << CAP_FOWNER) |
79 (1ULL << CAP_FSETID) |
80 (1ULL << CAP_IPC_OWNER) |
83 (1ULL << CAP_LINUX_IMMUTABLE) |
84 (1ULL << CAP_NET_BIND_SERVICE) |
85 (1ULL << CAP_NET_BROADCAST) |
86 (1ULL << CAP_NET_RAW) |
87 (1ULL << CAP_SETGID) |
88 (1ULL << CAP_SETFCAP) |
89 (1ULL << CAP_SETPCAP) |
90 (1ULL << CAP_SETUID) |
91 (1ULL << CAP_SYS_ADMIN) |
92 (1ULL << CAP_SYS_CHROOT) |
93 (1ULL << CAP_SYS_NICE) |
94 (1ULL << CAP_SYS_PTRACE) |
95 (1ULL << CAP_SYS_TTY_CONFIG) |
96 (1ULL << CAP_SYS_RESOURCE) |
97 (1ULL << CAP_SYS_BOOT);
/* Prints the usage summary to stdout, using program_invocation_short_name
 * as the program name. The -j line names --link-journal=guest so that the
 * text agrees with the LINK_GUEST mode the option parser selects for it. */
99 static int help(void) {
101 printf("%s [OPTIONS...] [PATH] [ARGUMENTS...]\n\n"
102 "Spawn a minimal namespace container for debugging, testing and building.\n\n"
103 " -h --help Show this help\n"
104 " -D --directory=NAME Root directory for the container\n"
105 " -b --boot Boot up full system (i.e. invoke init)\n"
106 " -u --user=USER Run the command under specified user or uid\n"
107 " -C --controllers=LIST Put the container in specified comma-separated cgroup hierarchies\n"
108 " --uuid=UUID Set a specific machine UUID for the container\n"
109 " --private-network Disable network in container\n"
110 " --read-only Mount the root directory read-only\n"
111 " --capability=CAP In addition to the default, retain specified capability\n"
112 " --link-journal=MODE Link up guest journal, one of no, auto, guest, host\n"
113 " -j Equivalent to --link-journal=guest\n",
114 program_invocation_short_name);
/* Parses the command line into the arg_* globals using getopt_long(). The
 * leading '+' in the short-option string makes getopt stop at the first
 * non-option argument, so the options of the command to run in the
 * container are not interpreted here. */
119 static int parse_argv(int argc, char *argv[]) {
/* Codes for options that have no short form; presumably started at 0x100
 * so they cannot collide with any option character. */
122 ARG_PRIVATE_NETWORK = 0x100,
129 static const struct option options[] = {
130 { "help", no_argument, NULL, 'h' },
131 { "directory", required_argument, NULL, 'D' },
132 { "user", required_argument, NULL, 'u' },
133 { "controllers", required_argument, NULL, 'C' },
134 { "private-network", no_argument, NULL, ARG_PRIVATE_NETWORK },
135 { "boot", no_argument, NULL, 'b' },
136 { "uuid", required_argument, NULL, ARG_UUID },
137 { "read-only", no_argument, NULL, ARG_READ_ONLY },
138 { "capability", required_argument, NULL, ARG_CAPABILITY },
139 { "link-journal", required_argument, NULL, ARG_LINK_JOURNAL },
148 while ((c = getopt_long(argc, argv, "+hD:u:C:bj", options, NULL)) >= 0) {
/* The root directory is stored in canonical form. */
158 arg_directory = canonicalize_file_name(optarg);
159 if (!arg_directory) {
160 log_error("Failed to canonicalize root directory.");
168 if (!(arg_user = strdup(optarg))) {
169 log_error("Failed to duplicate user name.");
/* Drop any list from an earlier occurrence of the option, then split the
 * comma-separated argument and remove duplicates. */
176 strv_free(arg_controllers);
177 arg_controllers = strv_split(optarg, ",");
178 if (!arg_controllers) {
179 log_error("Failed to split controllers list.");
182 strv_uniq(arg_controllers);
186 case ARG_PRIVATE_NETWORK:
187 arg_private_network = true;
199 arg_read_only = true;
/* Each comma-separated capability name is resolved with cap_from_name()
 * and its bit added to the retain mask. */
202 case ARG_CAPABILITY: {
206 FOREACH_WORD_SEPARATOR(word, length, optarg, ",", state) {
210 t = strndup(word, length);
214 if (cap_from_name(t, &cap) < 0) {
215 log_error("Failed to parse capability %s.", t);
221 arg_retain |= 1ULL << (uint64_t) cap;
/* NOTE(review): presumably the handler for the -j short option - confirm. */
228 arg_link_journal = LINK_GUEST;
/* Maps the textual mode onto the LinkJournal value; any other string is
 * reported as an error. */
231 case ARG_LINK_JOURNAL:
232 if (streq(optarg, "auto"))
233 arg_link_journal = LINK_AUTO;
234 else if (streq(optarg, "no"))
235 arg_link_journal = LINK_NO;
236 else if (streq(optarg, "guest"))
237 arg_link_journal = LINK_GUEST;
238 else if (streq(optarg, "host"))
239 arg_link_journal = LINK_HOST;
241 log_error("Failed to parse link journal mode %s", optarg);
251 log_error("Unknown option code %c", c);
/* Walks mount_table and performs each mount below dest. Judging by the
 * mount() call further down, every entry carries what/where/flags/options
 * plus a `fatal` flag; the third initializer is presumably the file system
 * type. Entries whose `what` is NULL are the read-only remounts of the bind
 * mount listed just before them. */
259 static int mount_all(const char *dest) {
261 typedef struct MountPoint {
270 static const MountPoint mount_table[] = {
271 { "proc", "/proc", "proc", NULL, MS_NOSUID|MS_NOEXEC|MS_NODEV, true },
272 { "/proc/sys", "/proc/sys", NULL, NULL, MS_BIND, true }, /* Bind mount first */
273 { NULL, "/proc/sys", NULL, NULL, MS_BIND|MS_RDONLY|MS_REMOUNT, true }, /* Then, make it r/o */
274 { "sysfs", "/sys", "sysfs", NULL, MS_RDONLY|MS_NOSUID|MS_NOEXEC|MS_NODEV, true },
275 { "tmpfs", "/dev", "tmpfs", "mode=755", MS_NOSUID|MS_STRICTATIME, true },
276 { "/dev/pts", "/dev/pts", NULL, NULL, MS_BIND, true },
277 { "tmpfs", "/dev/shm", "tmpfs", "mode=1777", MS_NOSUID|MS_NODEV|MS_STRICTATIME, true },
278 { "tmpfs", "/run", "tmpfs", "mode=755", MS_NOSUID|MS_NODEV|MS_STRICTATIME, true },
280 { "/sys/fs/selinux", "/sys/fs/selinux", NULL, NULL, MS_BIND, false }, /* Bind mount first */
281 { NULL, "/sys/fs/selinux", NULL, NULL, MS_BIND|MS_RDONLY|MS_REMOUNT, false }, /* Then, make it r/o */
287 char _cleanup_free_ *where = NULL;
289 for (k = 0; k < ELEMENTSOF(mount_table); k++) {
/* Target path is the table's `where` placed below the container root. */
292 if (asprintf(&where, "%s/%s", dest, mount_table[k].where) < 0) {
301 t = path_is_mount_point(where, true);
303 log_error("Failed to detect whether %s is a mount point: %s", where, strerror(-t));
311 /* Skip this entry if it is not a remount. */
/* That is: an entry with a source is left alone when its target already is
 * a mount point; remount entries (what == NULL) are always attempted. */
312 if (mount_table[k].what && t > 0)
/* The result of creating the mount point directory is not checked. */
315 mkdir_p_label(where, 0755);
317 if (mount(mount_table[k].what,
320 mount_table[k].flags,
/* A failed mount is only reported when the entry is marked fatal. */
321 mount_table[k].options) < 0 &&
322 mount_table[k].fatal) {
324 log_error("mount(%s) failed: %m", where);
/* Makes the host's time zone files visible in the container: /etc/localtime
 * and /etc/timezone are each bind mounted onto the same path below dest and,
 * when that bind mount succeeds, remounted read-only. The result of the
 * read-only remount is not checked. */
334 static int setup_timezone(const char *dest) {
339 /* Fix the timezone, if possible */
340 where = strappend(dest, "/etc/localtime");
344 if (mount("/etc/localtime", where, "bind", MS_BIND, NULL) >= 0)
345 mount("/etc/localtime", where, "bind", MS_BIND|MS_REMOUNT|MS_RDONLY, NULL);
349 where = strappend(dest, "/etc/timezone");
353 if (mount("/etc/timezone", where, "bind", MS_BIND, NULL) >= 0)
354 mount("/etc/timezone", where, "bind", MS_BIND|MS_REMOUNT|MS_RDONLY, NULL);
/* Bind mounts the host's /etc/resolv.conf onto the same path below dest and,
 * when that succeeds, remounts it read-only (result not checked). */
361 static int setup_resolv_conf(const char *dest) {
/* NOTE(review): presumably returns early here, since a container with a
 * private network has no use for the host's resolver - confirm. */
366 if (arg_private_network)
369 /* Fix resolv.conf, if possible */
370 where = strappend(dest, "/etc/resolv.conf");
374 if (mount("/etc/resolv.conf", where, "bind", MS_BIND, NULL) >= 0)
375 mount("/etc/resolv.conf", where, "bind", MS_BIND|MS_REMOUNT|MS_RDONLY, NULL);
/* Gives the container its own boot ID: a random 128-bit ID is written in
 * UUID notation to a file below dest's /dev and that file is bind mounted
 * over dest's /proc/sys/kernel/random/boot_id. */
382 static int setup_boot_id(const char *dest) {
383 char *from = NULL, *to = NULL;
390 /* Generate a new randomized boot ID, so that each boot-up of
391 * the container gets a new one */
/* `from` is the file that holds the generated ID, `to` the path it is
 * mounted over. */
393 from = strappend(dest, "/dev/proc-sys-kernel-random-boot-id");
399 to = strappend(dest, "/proc/sys/kernel/random/boot_id");
405 r = sd_id128_randomize(&rnd);
407 log_error("Failed to generate random boot id: %s", strerror(-r));
/* Format the 16 bytes as 8-4-4-4-12 hex digit groups. */
411 snprintf(as_uuid, sizeof(as_uuid),
412 "%02x%02x%02x%02x-%02x%02x-%02x%02x-%02x%02x-%02x%02x%02x%02x%02x%02x",
413 SD_ID128_FORMAT_VAL(rnd));
/* presumably forces NUL termination of the buffer - TODO confirm */
414 char_array_0(as_uuid);
416 r = write_one_line_file(from, as_uuid);
418 log_error("Failed to write boot id: %s", strerror(-r));
422 if (mount(from, to, "bind", MS_BIND, NULL) < 0) {
423 log_error("Failed to bind mount boot id: %m");
/* Read-only remount; its result is not checked. */
426 mount(from, to, "bind", MS_BIND|MS_REMOUNT|MS_RDONLY, NULL);
/* Recreates a fixed set of host device nodes inside the container: for each
 * name in the devnodes list, /dev/<name> is stat()ed on the host and a node
 * with the same mode and device number is created at <dest>/dev/<name>.
 * Host nodes that do not exist (ENOENT) are not reported as errors; nodes
 * that are neither character nor block devices are rejected. */
437 static int copy_devnodes(const char *dest) {
439 static const char devnodes[] =
456 NULSTR_FOREACH(d, devnodes) {
458 char *from = NULL, *to = NULL;
/* `from` is the host node, `to` the node to create in the container. */
460 asprintf(&from, "/dev/%s", d);
461 asprintf(&to, "%s/dev/%s", dest, d);
464 log_error("Failed to allocate devnode path");
477 if (stat(from, &st) < 0) {
479 if (errno != ENOENT) {
480 log_error("Failed to stat %s: %m", from);
485 } else if (!S_ISCHR(st.st_mode) && !S_ISBLK(st.st_mode)) {
487 log_error("%s is not a char or block device, cannot copy.", from);
491 } else if (mknod(to, st.st_mode, st.st_rdev) < 0) {
/* Report the path mknod() was actually called with, not the container
 * root, so the message identifies the node that could not be created. */
493 log_error("mknod(%s) failed: %m", to);
/* Makes the terminal `console` available as <dest>/dev/console. The steps
 * visible here: check that `console` is a character device, set its mode to
 * 0600 and owner to 0:0, create a device node at <dest>/dev/console with the
 * same device number and mode bits 0600, then bind mount `console` over
 * that node. */
507 static int setup_dev_console(const char *dest, const char *console) {
518 if (stat(console, &st) < 0) {
519 log_error("Failed to stat %s: %m", console);
523 } else if (!S_ISCHR(st.st_mode)) {
524 log_error("/dev/console is not a char device.");
529 r = chmod_and_chown(console, 0600, 0, 0);
531 log_error("Failed to correct access mode for TTY: %s", strerror(-r));
535 if (asprintf(&to, "%s/dev/console", dest) < 0) {
540 /* We need to bind mount the right tty to /dev/console since
541 * ptys can only exist on pts file systems. To have something
542 * to bind mount things on we create a device node first, that
543 * has the right major/minor (note that the major minor
544 * doesn't actually matter here, since we mount it over
547 if (mknod(to, (st.st_mode & ~07777) | 0600, st.st_rdev) < 0) {
548 log_error("mknod() for /dev/console failed: %m");
553 if (mount(console, to, "bind", MS_BIND, NULL) < 0) {
554 log_error("Bind mount for /dev/console failed: %m");
/* Provides a FIFO as the container's /proc/kmsg. A FIFO is created at
 * <dest>/dev/kmsg, bind mounted onto <dest>/proc/kmsg, opened, and the open
 * file descriptor is sent over kmsg_socket as SCM_RIGHTS ancillary data so
 * that a reference to it is kept outside this function. */
566 static int setup_kmsg(const char *dest, int kmsg_socket) {
567 char *from = NULL, *to = NULL;
/* Control message buffer sized for exactly one file descriptor. */
571 struct cmsghdr cmsghdr;
572 uint8_t buf[CMSG_SPACE(sizeof(int))];
575 struct cmsghdr *cmsg;
578 assert(kmsg_socket >= 0);
582 /* We create the kmsg FIFO as /dev/kmsg, but immediately
583 * delete it after bind mounting it to /proc/kmsg. While FIFOs
584 * on the reading side behave very similar to /proc/kmsg,
585 * their writing side behaves differently from /dev/kmsg in
586 * that writing blocks when nothing is reading. In order to
587 * avoid any problems with containers deadlocking due to this
588 * we simply make /dev/kmsg unavailable to the container. */
589 if (asprintf(&from, "%s/dev/kmsg", dest) < 0) {
594 if (asprintf(&to, "%s/proc/kmsg", dest) < 0) {
599 if (mkfifo(from, 0600) < 0) {
600 log_error("mkfifo() for /dev/kmsg failed: %m");
605 r = chmod_and_chown(from, 0600, 0, 0);
607 log_error("Failed to correct access mode for /dev/kmsg: %s", strerror(-r));
611 if (mount(from, to, "bind", MS_BIND, NULL) < 0) {
612 log_error("Bind mount for /proc/kmsg failed: %m");
/* Opened read-write and non-blocking, so the open itself does not wait for
 * a peer on the FIFO. */
617 fd = open(from, O_RDWR|O_NDELAY|O_CLOEXEC);
619 log_error("Failed to open fifo: %m");
/* Build a single SCM_RIGHTS control message carrying fd. */
627 mh.msg_control = &control;
628 mh.msg_controllen = sizeof(control);
630 cmsg = CMSG_FIRSTHDR(&mh);
631 cmsg->cmsg_level = SOL_SOCKET;
632 cmsg->cmsg_type = SCM_RIGHTS;
633 cmsg->cmsg_len = CMSG_LEN(sizeof(int));
634 memcpy(CMSG_DATA(cmsg), &fd, sizeof(int));
636 mh.msg_controllen = cmsg->cmsg_len;
638 /* Store away the fd in the socket, so that it stays open as
639 * long as we run the child */
640 k = sendmsg(kmsg_socket, &mh, MSG_DONTWAIT|MSG_NOSIGNAL);
/* The local descriptor is closed whether or not the send succeeded; the
 * send result is examined afterwards. */
641 close_nointr_nofail(fd);
644 log_error("Failed to send FIFO fd: %m");
649 /* And now make the FIFO unavailable as /dev/kmsg... */
/* Sets the host name from the last path component of arg_directory, i.e.
 * the container is named after its root directory. */
660 static int setup_hostname(void) {
664 hn = path_get_file_name(arg_directory);
/* presumably strips characters that are not valid in a host name -
 * TODO confirm against hostname_cleanup() */
670 hostname_cleanup(hn);
673 if (sethostname(hn, strlen(hn)) < 0)
/* Connects the journal directory of the container with that of the host,
 * according to arg_link_journal. The machine ID is read from
 * <directory>/etc/machine-id; after that `p` is the host-side path
 * /var/log/journal/<id> and `q` the same path below `directory`.
 *   LINK_NO    - nothing is done.
 *   LINK_GUEST - `p` becomes a symlink pointing at `q`.
 *   LINK_HOST  - `p` is created if needed and bind mounted onto `q`.
 *   LINK_AUTO  - several failure cases are tolerated instead of being
 *                treated as errors (see the checks below). */
682 static int setup_journal(const char *directory) {
683 sd_id128_t machine_id;
684 char *p = NULL, *b = NULL, *l, *q = NULL, *d = NULL;
687 if (arg_link_journal == LINK_NO)
690 p = strappend(directory, "/etc/machine-id");
696 r = read_one_line_file(p, &b);
/* A missing machine-id file is singled out in auto mode. */
697 if (r == -ENOENT && arg_link_journal == LINK_AUTO) {
701 log_error("Failed to read machine ID: %s", strerror(-r));
/* NOTE(review): `l` presumably points at the contents of `b`; its
 * assignment is not shown here - confirm. */
706 if (isempty(l) && arg_link_journal == LINK_AUTO) {
711 /* Verify validity */
712 r = sd_id128_from_string(l, &machine_id);
714 log_error("Failed to parse machine ID: %s", strerror(-r));
/* From here on `p` is the host path and `q` the path inside the container. */
719 p = strappend("/var/log/journal/", l);
720 q = strjoin(directory, "/var/log/journal/", l, NULL);
/* If either side already is a mount point, any mode other than auto
 * refuses to continue. */
726 if (path_is_mount_point(p, false) > 0 ||
727 path_is_mount_point(q, false) > 0) {
728 if (arg_link_journal != LINK_AUTO) {
729 log_error("Journal already a mount point, refusing.");
/* Inspect what currently exists at the host path: a symlink (r >= 0),
 * something that is not a symlink (-EINVAL), nothing (-ENOENT), or an
 * error. */
738 r = readlink_and_make_absolute(p, &d);
740 if ((arg_link_journal == LINK_GUEST ||
741 arg_link_journal == LINK_AUTO) &&
751 log_error("Failed to remove symlink %s: %m", p);
755 } else if (r == -EINVAL) {
757 if (arg_link_journal == LINK_GUEST &&
760 if (errno == ENOTDIR)
761 log_error("%s already exists and is neither symlink nor directory.", p);
763 log_error("Failed to remove %s: %m", p);
769 } else if (r != -ENOENT) {
770 log_error("readlink(%s) failed: %m", p);
774 if (arg_link_journal == LINK_GUEST) {
/* symlink(target, linkpath): the host path becomes a link to the
 * container's directory. */
776 if (symlink(q, p) < 0) {
777 log_error("Failed to symlink %s to %s: %m", q, p);
788 if (arg_link_journal == LINK_HOST) {
789 r = mkdir_p(p, 0755);
791 log_error("Failed to create %s: %m", p);
795 } else if (access(p, F_OK) < 0) {
/* dir_is_empty() returning 0 means the container-side directory has
 * entries, which is reported before the bind mount would hide them. */
800 if (dir_is_empty(q) == 0) {
801 log_error("%s not empty.", q);
806 r = mkdir_p(q, 0755);
808 log_error("Failed to create %s: %m", q);
812 if (mount(p, q, "bind", MS_BIND, NULL) < 0) {
813 log_error("Failed to bind mount journal from host into guest: %m");
/* Drops every capability that is not set in arg_retain by handing the
 * complement of the retain mask to capability_bounding_set_drop(), and
 * returns that function's result. */
829 static int drop_capabilities(void) {
830 return capability_bounding_set_drop(~arg_retain, false);
/* Heuristic check whether `path` is the root of an OS installation: it
 * probes <path>/bin/sh and yields 1 when the probe's result `r` is
 * non-negative, 0 otherwise. */
833 static int is_os_tree(const char *path) {
836 /* We use /bin/sh as flag file if something is an OS */
838 if (asprintf(&p, "%s/bin/sh", path) < 0)
844 return r < 0 ? 0 : 1;
/* Shovels data between this process's stdin/stdout and the pty `master`
 * while also watching the signals in `mask` through a signalfd.
 *
 * stdin, stdout and master are put into non-blocking mode and registered
 * edge-triggered (EPOLLET). Because an edge is reported only once, the
 * readiness of each direction is remembered in the *_readable/*_writable
 * flags and only cleared again when a read()/write() fails with
 * EAGAIN, EPIPE, ECONNRESET or EIO. Two LINE_MAX-sized buffers hold data in
 * flight: in_buffer (stdin -> master) and out_buffer (master -> stdout). */
847 static int process_pty(int master, sigset_t *mask) {
849 char in_buffer[LINE_MAX], out_buffer[LINE_MAX];
850 size_t in_buffer_full = 0, out_buffer_full = 0;
851 struct epoll_event stdin_ev, stdout_ev, master_ev, signal_ev;
852 bool stdin_readable = false, stdout_writable = false, master_readable = false, master_writable = false;
853 int ep = -1, signal_fd = -1, r;
855 fd_nonblock(STDIN_FILENO, 1);
856 fd_nonblock(STDOUT_FILENO, 1);
857 fd_nonblock(master, 1);
859 signal_fd = signalfd(-1, mask, SFD_NONBLOCK|SFD_CLOEXEC);
861 log_error("signalfd(): %m");
866 ep = epoll_create1(EPOLL_CLOEXEC);
868 log_error("Failed to create epoll: %m");
874 stdin_ev.events = EPOLLIN|EPOLLET;
875 stdin_ev.data.fd = STDIN_FILENO;
878 stdout_ev.events = EPOLLOUT|EPOLLET;
879 stdout_ev.data.fd = STDOUT_FILENO;
882 master_ev.events = EPOLLIN|EPOLLOUT|EPOLLET;
883 master_ev.data.fd = master;
/* The signal fd is the only level-triggered registration. */
886 signal_ev.events = EPOLLIN;
887 signal_ev.data.fd = signal_fd;
889 if (epoll_ctl(ep, EPOLL_CTL_ADD, STDIN_FILENO, &stdin_ev) < 0 ||
890 epoll_ctl(ep, EPOLL_CTL_ADD, STDOUT_FILENO, &stdout_ev) < 0 ||
891 epoll_ctl(ep, EPOLL_CTL_ADD, master, &master_ev) < 0 ||
892 epoll_ctl(ep, EPOLL_CTL_ADD, signal_fd, &signal_ev) < 0) {
/* NOTE(review): "regiser" in the message below is a typo for "register". */
893 log_error("Failed to regiser fds in epoll: %m");
899 struct epoll_event ev[16];
/* Block without timeout until at least one descriptor has an event. */
903 nfds = epoll_wait(ep, ev, ELEMENTSOF(ev), -1);
906 if (errno == EINTR || errno == EAGAIN)
909 log_error("epoll_wait(): %m");
/* First pass: record which directions became ready. A hang-up counts as
 * ready as well, so the following read()/write() gets to see the error. */
916 for (i = 0; i < nfds; i++) {
917 if (ev[i].data.fd == STDIN_FILENO) {
919 if (ev[i].events & (EPOLLIN|EPOLLHUP))
920 stdin_readable = true;
922 } else if (ev[i].data.fd == STDOUT_FILENO) {
924 if (ev[i].events & (EPOLLOUT|EPOLLHUP))
925 stdout_writable = true;
927 } else if (ev[i].data.fd == master) {
929 if (ev[i].events & (EPOLLIN|EPOLLHUP))
930 master_readable = true;
932 if (ev[i].events & (EPOLLOUT|EPOLLHUP))
933 master_writable = true;
935 } else if (ev[i].data.fd == signal_fd) {
936 struct signalfd_siginfo sfsi;
/* A signalfd read delivers whole signalfd_siginfo records; any other
 * size is an error unless the read merely failed with EINTR/EAGAIN. */
939 n = read(signal_fd, &sfsi, sizeof(sfsi));
940 if (n != sizeof(sfsi)) {
943 log_error("Failed to read from signalfd: invalid block size");
948 if (errno != EINTR && errno != EAGAIN) {
949 log_error("Failed to read from signalfd: %m");
955 if (sfsi.ssi_signo == SIGWINCH) {
958 /* The window size changed, let's forward that. */
959 if (ioctl(STDIN_FILENO, TIOCGWINSZ, &ws) >= 0)
960 ioctl(master, TIOCSWINSZ, &ws);
/* Second pass: keep moving data for as long as some direction can make
 * progress, i.e. a source is readable while its buffer is empty, or a
 * sink is writable while its buffer holds data. */
969 while ((stdin_readable && in_buffer_full <= 0) ||
970 (master_writable && in_buffer_full > 0) ||
971 (master_readable && out_buffer_full <= 0) ||
972 (stdout_writable && out_buffer_full > 0)) {
/* stdin -> in_buffer */
974 if (stdin_readable && in_buffer_full < LINE_MAX) {
976 k = read(STDIN_FILENO, in_buffer + in_buffer_full, LINE_MAX - in_buffer_full);
979 if (errno == EAGAIN || errno == EPIPE || errno == ECONNRESET || errno == EIO)
980 stdin_readable = false;
982 log_error("read(): %m");
987 in_buffer_full += (size_t) k;
/* in_buffer -> master; after a partial write the unwritten tail is moved
 * to the front of the buffer. */
990 if (master_writable && in_buffer_full > 0) {
992 k = write(master, in_buffer, in_buffer_full);
995 if (errno == EAGAIN || errno == EPIPE || errno == ECONNRESET || errno == EIO)
996 master_writable = false;
998 log_error("write(): %m");
1004 assert(in_buffer_full >= (size_t) k);
1005 memmove(in_buffer, in_buffer + k, in_buffer_full - k);
1006 in_buffer_full -= k;
/* master -> out_buffer */
1010 if (master_readable && out_buffer_full < LINE_MAX) {
1012 k = read(master, out_buffer + out_buffer_full, LINE_MAX - out_buffer_full);
1015 if (errno == EAGAIN || errno == EPIPE || errno == ECONNRESET || errno == EIO)
1016 master_readable = false;
1018 log_error("read(): %m");
1023 out_buffer_full += (size_t) k;
/* out_buffer -> stdout, with the same tail handling as above. */
1026 if (stdout_writable && out_buffer_full > 0) {
1028 k = write(STDOUT_FILENO, out_buffer, out_buffer_full);
1031 if (errno == EAGAIN || errno == EPIPE || errno == ECONNRESET || errno == EIO)
1032 stdout_writable = false;
1034 log_error("write(): %m");
1040 assert(out_buffer_full >= (size_t) k);
1041 memmove(out_buffer, out_buffer + k, out_buffer_full - k);
1042 out_buffer_full -= k;
/* Cleanup: release the epoll instance and the signalfd. */
1050 close_nointr_nofail(ep);
1053 close_nointr_nofail(signal_fd);
1058 int main(int argc, char *argv[]) {
1060 int r = EXIT_FAILURE, k;
1061 char *oldcg = NULL, *newcg = NULL;
1062 char **controller = NULL;
1064 const char *console = NULL;
1065 struct termios saved_attr, raw_attr;
1067 bool saved_attr_valid = false;
1069 int kmsg_socket_pair[2] = { -1, -1 };
1071 log_parse_environment();
1074 r = parse_argv(argc, argv);
1078 if (arg_directory) {
1081 p = path_make_absolute_cwd(arg_directory);
1082 free(arg_directory);
1085 arg_directory = get_current_dir_name();
1087 if (!arg_directory) {
1088 log_error("Failed to determine path");
1092 path_kill_slashes(arg_directory);
1094 if (geteuid() != 0) {
1095 log_error("Need to be root.");
1099 if (sd_booted() <= 0) {
1100 log_error("Not running on a systemd system.");
1104 if (path_equal(arg_directory, "/")) {
1105 log_error("Spawning container on root directory not supported.");
1109 if (is_os_tree(arg_directory) <= 0) {
1110 log_error("Directory %s doesn't look like an OS root directory. Refusing.", arg_directory);
1114 k = cg_get_by_pid(SYSTEMD_CGROUP_CONTROLLER, 0, &oldcg);
1116 log_error("Failed to determine current cgroup: %s", strerror(-k));
1120 if (asprintf(&newcg, "%s/nspawn-%lu", oldcg, (unsigned long) getpid()) < 0) {
1121 log_error("Failed to allocate cgroup path.");
1125 k = cg_create_and_attach(SYSTEMD_CGROUP_CONTROLLER, newcg, 0);
1127 log_error("Failed to create cgroup: %s", strerror(-k));
1131 STRV_FOREACH(controller, arg_controllers) {
1132 k = cg_create_and_attach(*controller, newcg, 0);
1134 log_warning("Failed to create cgroup in controller %s: %s", *controller, strerror(-k));
1137 master = posix_openpt(O_RDWR|O_NOCTTY|O_CLOEXEC|O_NDELAY);
1139 log_error("Failed to acquire pseudo tty: %m");
1143 console = ptsname(master);
1145 log_error("Failed to determine tty name: %m");
1149 log_info("Spawning namespace container on %s (console is %s).", arg_directory, console);
1151 if (ioctl(STDIN_FILENO, TIOCGWINSZ, &ws) >= 0)
1152 ioctl(master, TIOCSWINSZ, &ws);
1154 if (unlockpt(master) < 0) {
1155 log_error("Failed to unlock tty: %m");
1159 if (tcgetattr(STDIN_FILENO, &saved_attr) < 0) {
1160 log_error("Failed to get terminal attributes: %m");
1164 saved_attr_valid = true;
1166 raw_attr = saved_attr;
1167 cfmakeraw(&raw_attr);
1168 raw_attr.c_lflag &= ~ECHO;
1170 if (socketpair(AF_UNIX, SOCK_DGRAM|SOCK_NONBLOCK|SOCK_CLOEXEC, 0, kmsg_socket_pair) < 0) {
1171 log_error("Failed to create kmsg socket pair");
1175 assert_se(sigemptyset(&mask) == 0);
1176 sigset_add_many(&mask, SIGCHLD, SIGWINCH, SIGTERM, SIGINT, -1);
1177 assert_se(sigprocmask(SIG_BLOCK, &mask, NULL) == 0);
1182 if (tcsetattr(STDIN_FILENO, TCSANOW, &raw_attr) < 0) {
1183 log_error("Failed to set terminal attributes: %m");
1187 pid = syscall(__NR_clone, SIGCHLD|CLONE_NEWIPC|CLONE_NEWNS|CLONE_NEWPID|CLONE_NEWUTS|(arg_private_network ? CLONE_NEWNET : 0), NULL);
1189 if (errno == EINVAL)
1190 log_error("clone() failed, do you have namespace support enabled in your kernel? (You need UTS, IPC, PID and NET namespacing built in): %m");
1192 log_error("clone() failed: %m");
1200 const char *home = NULL;
1201 uid_t uid = (uid_t) -1;
1202 gid_t gid = (gid_t) -1;
1203 const char *envp[] = {
1204 "PATH=/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin",
1205 "container=systemd-nspawn", /* LXC sets container=lxc, so follow the scheme here */
1210 NULL, /* container_uuid */
1214 envp[2] = strv_find_prefix(environ, "TERM=");
1216 close_nointr_nofail(master);
1218 close_nointr(STDIN_FILENO);
1219 close_nointr(STDOUT_FILENO);
1220 close_nointr(STDERR_FILENO);
1222 close_all_fds(&kmsg_socket_pair[1], 1);
1224 reset_all_signal_handlers();
1226 assert_se(sigemptyset(&mask) == 0);
1227 assert_se(sigprocmask(SIG_SETMASK, &mask, NULL) == 0);
1229 if (open_terminal(console, O_RDWR) != STDIN_FILENO ||
1230 dup2(STDIN_FILENO, STDOUT_FILENO) != STDOUT_FILENO ||
1231 dup2(STDIN_FILENO, STDERR_FILENO) != STDERR_FILENO)
1235 log_error("setsid() failed: %m");
1239 if (prctl(PR_SET_PDEATHSIG, SIGKILL) < 0) {
1240 log_error("PR_SET_PDEATHSIG failed: %m");
1244 /* Mark everything as slave, so that we still
1245 * receive mounts from the real root, but don't
1246 * propagate mounts to the real root. */
1247 if (mount(NULL, "/", NULL, MS_SLAVE|MS_REC, NULL) < 0) {
1248 log_error("MS_SLAVE|MS_REC failed: %m");
1252 /* Turn directory into bind mount */
1253 if (mount(arg_directory, arg_directory, "bind", MS_BIND|MS_REC, NULL) < 0) {
1254 log_error("Failed to make bind mount.");
1259 if (mount(arg_directory, arg_directory, "bind", MS_BIND|MS_REMOUNT|MS_RDONLY|MS_REC, NULL) < 0) {
1260 log_error("Failed to make read-only.");
1264 if (mount_all(arg_directory) < 0)
1267 if (copy_devnodes(arg_directory) < 0)
1270 dev_setup(arg_directory);
1272 if (setup_dev_console(arg_directory, console) < 0)
1275 if (setup_kmsg(arg_directory, kmsg_socket_pair[1]) < 0)
1278 close_nointr_nofail(kmsg_socket_pair[1]);
1280 if (setup_boot_id(arg_directory) < 0)
1283 if (setup_timezone(arg_directory) < 0)
1286 if (setup_resolv_conf(arg_directory) < 0)
1289 if (setup_journal(arg_directory) < 0)
1292 if (chdir(arg_directory) < 0) {
1293 log_error("chdir(%s) failed: %m", arg_directory);
1297 if (mount(arg_directory, "/", NULL, MS_MOVE, NULL) < 0) {
1298 log_error("mount(MS_MOVE) failed: %m");
1302 if (chroot(".") < 0) {
1303 log_error("chroot() failed: %m");
1307 if (chdir("/") < 0) {
1308 log_error("chdir() failed: %m");
1316 if (drop_capabilities() < 0) {
1317 log_error("drop_capabilities() failed: %m");
1323 if (get_user_creds((const char**)&arg_user, &uid, &gid, &home, NULL) < 0) {
1324 log_error("get_user_creds() failed: %m");
1328 if (mkdir_parents_label(home, 0775) < 0) {
1329 log_error("mkdir_parents_label() failed: %m");
1333 if (mkdir_safe_label(home, 0775, uid, gid) < 0) {
1334 log_error("mkdir_safe_label() failed: %m");
1338 if (initgroups((const char*)arg_user, gid) < 0) {
1339 log_error("initgroups() failed: %m");
1343 if (setresgid(gid, gid, gid) < 0) {
1344 log_error("setregid() failed: %m");
1348 if (setresuid(uid, uid, uid) < 0) {
1349 log_error("setreuid() failed: %m");
1354 if ((asprintf((char**)(envp + 3), "HOME=%s", home ? home: "/root") < 0) ||
1355 (asprintf((char**)(envp + 4), "USER=%s", arg_user ? arg_user : "root") < 0) ||
1356 (asprintf((char**)(envp + 5), "LOGNAME=%s", arg_user ? arg_user : "root") < 0)) {
1362 if (asprintf((char**)(envp + 6), "container_uuid=%s", arg_uuid) < 0) {
1374 /* Automatically search for the init system */
1376 l = 1 + argc - optind;
1377 a = newa(char*, l + 1);
1378 memcpy(a + 1, argv + optind, l * sizeof(char*));
1380 a[0] = (char*) "/usr/lib/systemd/systemd";
1381 execve(a[0], a, (char**) envp);
1383 a[0] = (char*) "/lib/systemd/systemd";
1384 execve(a[0], a, (char**) envp);
1386 a[0] = (char*) "/sbin/init";
1387 execve(a[0], a, (char**) envp);
1388 } else if (argc > optind)
1389 execvpe(argv[optind], argv + optind, (char**) envp);
1391 chdir(home ? home : "/root");
1392 execle("/bin/bash", "-bash", NULL, (char**) envp);
1395 log_error("execv() failed: %m");
1398 _exit(EXIT_FAILURE);
1401 if (process_pty(master, &mask) < 0)
1405 if (saved_attr_valid)
1406 tcsetattr(STDIN_FILENO, TCSANOW, &saved_attr);
1408 r = wait_for_terminate(pid, &status);
1414 if (status.si_code == CLD_EXITED) {
1415 if (status.si_status != 0) {
1416 log_error("Container failed with error code %i.", status.si_status);
1417 r = status.si_status;
1421 log_debug("Container exited successfully.");
1423 } else if (status.si_code == CLD_KILLED &&
1424 status.si_status == SIGINT) {
1425 log_info("Container has been shut down.");
1428 } else if (status.si_code == CLD_KILLED &&
1429 status.si_status == SIGHUP) {
1430 log_info("Container is being rebooted.");
1432 } else if (status.si_code == CLD_KILLED ||
1433 status.si_code == CLD_DUMPED) {
1435 log_error("Container terminated by signal %s.", signal_to_string(status.si_status));
1439 log_error("Container failed due to unknown reason.");
1446 if (saved_attr_valid)
1447 tcsetattr(STDIN_FILENO, TCSANOW, &saved_attr);
1450 close_nointr_nofail(master);
1452 close_pipe(kmsg_socket_pair);
1455 cg_attach(SYSTEMD_CGROUP_CONTROLLER, oldcg, 0);
1458 cg_kill_recursive_and_wait(SYSTEMD_CGROUP_CONTROLLER, newcg, true);
1460 free(arg_directory);
1461 strv_free(arg_controllers);