1 /*-*- Mode: C; c-basic-offset: 8; indent-tabs-mode: nil -*-*/
4 This file is part of systemd.
6 Copyright 2010 Lennart Poettering
8 systemd is free software; you can redistribute it and/or modify it
9 under the terms of the GNU Lesser General Public License as published by
10 the Free Software Foundation; either version 2.1 of the License, or
11 (at your option) any later version.
13 systemd is distributed in the hope that it will be useful, but
14 WITHOUT ANY WARRANTY; without even the implied warranty of
15 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
16 Lesser General Public License for more details.
18 You should have received a copy of the GNU Lesser General Public License
19 along with systemd; If not, see <http://www.gnu.org/licenses/>.
25 #include <sys/types.h>
26 #include <sys/syscall.h>
27 #include <sys/mount.h>
33 #include <sys/prctl.h>
34 #include <sys/capability.h>
37 #include <sys/epoll.h>
39 #include <sys/signalfd.h>
43 #include <sys/socket.h>
45 #include <systemd/sd-daemon.h>
53 #include "cgroup-util.h"
55 #include "path-util.h"
56 #include "loopback-setup.h"
58 #include "dev-setup.h"
/* Journal linking mode selected on the command line; the values
 * LINK_NO, LINK_AUTO, LINK_GUEST and LINK_HOST are used further down
 * in this file. */
67 typedef enum LinkJournal {
/* Option state for the command line switches described in help(). */
74 static char *arg_directory = NULL;
75 static char *arg_user = NULL;
76 static char **arg_controllers = NULL;
77 static char *arg_uuid = NULL;
78 static bool arg_private_network = false;
79 static bool arg_read_only = false;
80 static bool arg_boot = false;
81 static LinkJournal arg_link_journal = LINK_AUTO;
/* Capability bit mask, one bit per CAP_* number. --capability= ORs
 * further bits into it, and drop_capabilities() hands the complement
 * of this mask to capability_bounding_set_drop(). */
82 static uint64_t arg_retain =
84 (1ULL << CAP_DAC_OVERRIDE) |
85 (1ULL << CAP_DAC_READ_SEARCH) |
86 (1ULL << CAP_FOWNER) |
87 (1ULL << CAP_FSETID) |
88 (1ULL << CAP_IPC_OWNER) |
91 (1ULL << CAP_LINUX_IMMUTABLE) |
92 (1ULL << CAP_NET_BIND_SERVICE) |
93 (1ULL << CAP_NET_BROADCAST) |
94 (1ULL << CAP_NET_RAW) |
95 (1ULL << CAP_SETGID) |
96 (1ULL << CAP_SETFCAP) |
97 (1ULL << CAP_SETPCAP) |
98 (1ULL << CAP_SETUID) |
99 (1ULL << CAP_SYS_ADMIN) |
100 (1ULL << CAP_SYS_CHROOT) |
101 (1ULL << CAP_SYS_NICE) |
102 (1ULL << CAP_SYS_PTRACE) |
103 (1ULL << CAP_SYS_TTY_CONFIG) |
104 (1ULL << CAP_SYS_RESOURCE) |
105 (1ULL << CAP_SYS_BOOT) |
106 (1ULL << CAP_AUDIT_WRITE) |
107 (1ULL << CAP_AUDIT_CONTROL);
/* Bind mount lists: parse_argv() appends two strings (source and
 * destination) per --bind= / --bind-ro= option, and mount_binds()
 * walks them pairwise. */
108 static char **arg_bind = NULL;
109 static char **arg_bind_ro = NULL;
/* Print the usage summary and the list of supported options to stdout.
 * The program name in the first line is taken from
 * program_invocation_short_name.
 *
 * -j is documented as --link-journal=guest because the option parser
 * assigns LINK_GUEST for it. */
111 static int help(void) {
113 printf("%s [OPTIONS...] [PATH] [ARGUMENTS...]\n\n"
114 "Spawn a minimal namespace container for debugging, testing and building.\n\n"
115 " -h --help Show this help\n"
116 " --version Print version string\n"
117 " -D --directory=NAME Root directory for the container\n"
118 " -b --boot Boot up full system (i.e. invoke init)\n"
119 " -u --user=USER Run the command under specified user or uid\n"
120 " -C --controllers=LIST Put the container in specified comma-separated\n"
121 " cgroup hierarchies\n"
122 " --uuid=UUID Set a specific machine UUID for the container\n"
123 " --private-network Disable network in container\n"
124 " --read-only Mount the root directory read-only\n"
125 " --capability=CAP In addition to the default, retain specified\n"
127 " --link-journal=MODE Link up guest journal, one of no, auto, guest, host\n"
128 " -j Equivalent to --link-journal=guest\n"
129 " --bind=PATH[:PATH] Bind mount a file or directory from the host into\n"
131 " --bind-ro=PATH[:PATH] Similar, but creates a read-only bind mount\n",
132 program_invocation_short_name);
/* Parse the command line with getopt_long() and store the results in
 * the arg_* variables. Values that cannot be parsed or allocated are
 * reported with log_error(). */
137 static int parse_argv(int argc, char *argv[]) {
150 static const struct option options[] = {
151 { "help", no_argument, NULL, 'h' },
152 { "version", no_argument, NULL, ARG_VERSION },
153 { "directory", required_argument, NULL, 'D' },
154 { "user", required_argument, NULL, 'u' },
155 { "controllers", required_argument, NULL, 'C' },
156 { "private-network", no_argument, NULL, ARG_PRIVATE_NETWORK },
157 { "boot", no_argument, NULL, 'b' },
158 { "uuid", required_argument, NULL, ARG_UUID },
159 { "read-only", no_argument, NULL, ARG_READ_ONLY },
160 { "capability", required_argument, NULL, ARG_CAPABILITY },
161 { "link-journal", required_argument, NULL, ARG_LINK_JOURNAL },
162 { "bind", required_argument, NULL, ARG_BIND },
163 { "bind-ro", required_argument, NULL, ARG_BIND_RO },
/* The leading '+' makes getopt_long() stop at the first non-option
 * argument, so that the trailing [ARGUMENTS...] are not parsed as
 * options of this program. */
172 while ((c = getopt_long(argc, argv, "+hD:u:C:bj", options, NULL)) >= 0) {
181 puts(PACKAGE_STRING);
182 puts(SYSTEMD_FEATURES);
/* Container root: keep a canonicalized copy of the argument. */
187 arg_directory = canonicalize_file_name(optarg);
188 if (!arg_directory) {
189 log_error("Failed to canonicalize root directory.");
197 if (!(arg_user = strdup(optarg))) {
198 log_error("Failed to duplicate user name.");
/* Replace any earlier controller list with the comma-separated one
 * given here and drop duplicate entries. */
205 strv_free(arg_controllers);
206 arg_controllers = strv_split(optarg, ",");
207 if (!arg_controllers) {
208 log_error("Failed to split controllers list.");
211 strv_uniq(arg_controllers);
215 case ARG_PRIVATE_NETWORK:
216 arg_private_network = true;
228 arg_read_only = true;
231 case ARG_CAPABILITY: {
/* Comma-separated capability names: each one is resolved with
 * cap_from_name() and its bit is added to arg_retain. */
235 FOREACH_WORD_SEPARATOR(word, length, optarg, ",", state) {
239 t = strndup(word, length);
243 if (cap_from_name(t, &cap) < 0) {
244 log_error("Failed to parse capability %s.", t);
250 arg_retain |= 1ULL << (uint64_t) cap;
/* NOTE(review): presumably the -j handler (its case label is not
 * visible here); it selects LINK_GUEST. */
257 arg_link_journal = LINK_GUEST;
260 case ARG_LINK_JOURNAL:
261 if (streq(optarg, "auto"))
262 arg_link_journal = LINK_AUTO;
263 else if (streq(optarg, "no"))
264 arg_link_journal = LINK_NO;
265 else if (streq(optarg, "guest"))
266 arg_link_journal = LINK_GUEST;
267 else if (streq(optarg, "host"))
268 arg_link_journal = LINK_HOST;
270 log_error("Failed to parse link journal mode %s", optarg);
/* --bind= / --bind-ro=: the argument is split at the first ':', both
 * paths have to be absolute, and they are appended as a pair to
 * arg_bind or arg_bind_ro respectively. */
278 _cleanup_free_ char *a = NULL, *b = NULL;
283 x = c == ARG_BIND ? &arg_bind : &arg_bind_ro;
285 e = strchr(optarg, ':');
287 a = strndup(optarg, e - optarg);
297 if (!path_is_absolute(a) || !path_is_absolute(b)) {
298 log_error("Invalid bind mount specification: %s", optarg);
302 r = strv_extend(x, a);
306 r = strv_extend(x, b);
317 log_error("Unknown option code %c", c);
/* Mount the file systems listed in mount_table below the container
 * root "dest". For every entry the target path is dest + "/" + where;
 * path_is_mount_point() is consulted first, the target directory is
 * created with mkdir_p() and then mount() is called. A failing mount()
 * is only logged as an error for entries marked fatal. */
325 static int mount_all(const char *dest) {
327 typedef struct MountPoint {
/* NOTE(review): the columns appear to be source, target, file system
 * type, options, mount flags and the fatal marker; the struct
 * definition is not visible here, confirm against it. */
336 static const MountPoint mount_table[] = {
337 { "proc", "/proc", "proc", NULL, MS_NOSUID|MS_NOEXEC|MS_NODEV, true },
338 { "/proc/sys", "/proc/sys", NULL, NULL, MS_BIND, true }, /* Bind mount first */
339 { NULL, "/proc/sys", NULL, NULL, MS_BIND|MS_RDONLY|MS_REMOUNT, true }, /* Then, make it r/o */
340 { "sysfs", "/sys", "sysfs", NULL, MS_RDONLY|MS_NOSUID|MS_NOEXEC|MS_NODEV, true },
341 { "tmpfs", "/dev", "tmpfs", "mode=755", MS_NOSUID|MS_STRICTATIME, true },
342 { "devpts", "/dev/pts", "devpts","newinstance,ptmxmode=0666,mode=620,gid=" STRINGIFY(TTY_GID), MS_NOSUID|MS_NOEXEC, true },
343 { "tmpfs", "/dev/shm", "tmpfs", "mode=1777", MS_NOSUID|MS_NODEV|MS_STRICTATIME, true },
344 { "tmpfs", "/run", "tmpfs", "mode=755", MS_NOSUID|MS_NODEV|MS_STRICTATIME, true },
346 { "/sys/fs/selinux", "/sys/fs/selinux", NULL, NULL, MS_BIND, false }, /* Bind mount first */
347 { NULL, "/sys/fs/selinux", NULL, NULL, MS_BIND|MS_RDONLY|MS_REMOUNT, false }, /* Then, make it r/o */
354 for (k = 0; k < ELEMENTSOF(mount_table); k++) {
355 char _cleanup_free_ *where = NULL;
358 where = strjoin(dest, "/", mount_table[k].where, NULL);
/* t is negative on error and positive if "where" already is a mount
 * point. */
362 t = path_is_mount_point(where, true);
364 log_error("Failed to detect whether %s is a mount point: %s", where, strerror(-t));
372 /* Skip this entry if it is not a remount. */
373 if (mount_table[k].what && t > 0)
376 mkdir_p(where, 0755);
378 if (mount(mount_table[k].what,
381 mount_table[k].flags,
382 mount_table[k].options) < 0 &&
383 mount_table[k].fatal) {
385 log_error("mount(%s) failed: %m", where);
/* Bind mount every (source, destination) pair of the list "l" below
 * the container root "dest". The destination is created with
 * mkdir_p_label() first. If "flags" is non-zero the new bind mount is
 * remounted with MS_REMOUNT|MS_BIND|flags; main() passes MS_RDONLY
 * for the read-only list and 0 for the writable one. */
395 static int mount_binds(const char *dest, char **l, unsigned long flags) {
398 STRV_FOREACH_PAIR(x, y, l) {
399 _cleanup_free_ char *where = NULL;
/* *x is the source path, *y the path inside the container. */
401 where = strjoin(dest, "/", *y, NULL);
405 mkdir_p_label(where, 0755);
407 if (mount(*x, where, "bind", MS_BIND, NULL) < 0) {
408 log_error("mount(%s) failed: %m", where);
412 if (flags && mount(NULL, where, NULL, MS_REMOUNT|MS_BIND|flags, NULL) < 0) {
413 log_error("mount(%s) failed: %m", where);
/* Make /etc/localtime below "dest" refer to the same zone as the
 * /etc/localtime symlink of the host. The host link has to point into
 * /usr/share/zoneinfo/ (relative "../usr/..." or absolute form), the
 * zone file has to exist below dest, and the new link is created as
 * "../usr/share/zoneinfo/<zone>". Unusable setups are reported with
 * log_warning(), a failing symlink() with log_error(). */
421 static int setup_timezone(const char *dest) {
422 _cleanup_free_ char *where = NULL, *p = NULL, *q = NULL, *check = NULL, *what = NULL;
428 /* Fix the timezone, if possible */
429 r = readlink_malloc("/etc/localtime", &p);
431 log_warning("/etc/localtime is not a symlink, not updating container timezone.");
/* z is the zone name of the host, i.e. the link target with the
 * zoneinfo prefix stripped. */
435 z = path_startswith(p, "../usr/share/zoneinfo/");
437 z = path_startswith(p, "/usr/share/zoneinfo/");
439 log_warning("/etc/localtime does not point into /usr/share/zoneinfo/, not updating container timezone.");
443 where = strappend(dest, "/etc/localtime");
/* y is the zone name the container currently links to, if any. */
447 r = readlink_malloc(where, &q);
449 y = path_startswith(q, "../usr/share/zoneinfo/");
451 y = path_startswith(q, "/usr/share/zoneinfo/");
454 /* Already pointing to the right place? Then do nothing .. */
455 if (y && streq(y, z))
459 check = strjoin(dest, "/usr/share/zoneinfo/", z, NULL);
463 if (access(check, F_OK) < 0) {
464 log_warning("Timezone %s does not exist in container, not updating container timezone.", z);
468 what = strappend("../usr/share/zoneinfo/", z);
473 if (symlink(what, where) < 0) {
474 log_error("Failed to correct timezone of container: %m");
/* Bind mount /etc/resolv.conf of the host over <dest>/etc/resolv.conf
 * and, if that worked, try to remount it read-only. The result of the
 * mount() calls is deliberately not treated as an error.
 * NOTE(review): the statement guarded by the arg_private_network check
 * is not visible here; presumably the function returns early in that
 * case. */
481 static int setup_resolv_conf(const char *dest) {
486 if (arg_private_network)
489 /* Fix resolv.conf, if possible */
490 where = strappend(dest, "/etc/resolv.conf");
494 /* We don't really care for the results of this really. If it
495 * fails, it fails, but meh... */
496 if (mount("/etc/resolv.conf", where, "bind", MS_BIND, NULL) >= 0)
497 mount("/etc/resolv.conf", where, "bind", MS_BIND|MS_REMOUNT|MS_RDONLY, NULL);
/* Give the container a boot ID of its own: a random 128 bit ID is
 * generated with sd_id128_randomize(), formatted as a UUID string,
 * written to <dest>/dev/proc-sys-kernel-random-boot-id, and that file
 * is bind mounted over <dest>/proc/sys/kernel/random/boot_id. The
 * final read-only remount is attempted without checking its result. */
504 static int setup_boot_id(const char *dest) {
505 char _cleanup_free_ *from = NULL, *to = NULL;
512 /* Generate a new randomized boot ID, so that each boot-up of
513 * the container gets a new one */
515 from = strappend(dest, "/dev/proc-sys-kernel-random-boot-id");
516 to = strappend(dest, "/proc/sys/kernel/random/boot_id");
520 r = sd_id128_randomize(&rnd);
522 log_error("Failed to generate random boot id: %s", strerror(-r));
/* Format the 16 bytes in the 8-4-4-4-12 UUID notation. */
526 snprintf(as_uuid, sizeof(as_uuid),
527 "%02x%02x%02x%02x-%02x%02x-%02x%02x-%02x%02x-%02x%02x%02x%02x%02x%02x",
528 SD_ID128_FORMAT_VAL(rnd));
529 char_array_0(as_uuid);
531 r = write_one_line_file(from, as_uuid);
533 log_error("Failed to write boot id: %s", strerror(-r));
537 if (mount(from, to, "bind", MS_BIND, NULL) < 0) {
538 log_error("Failed to bind mount boot id: %m");
541 mount(from, to, "bind", MS_BIND|MS_REMOUNT|MS_RDONLY, NULL);
/* Create device nodes below <dest>/dev that mirror the host nodes
 * named in the devnodes list. For every name the host node /dev/<name>
 * is examined with stat(): a stat() failure is only logged when errno
 * is not ENOENT, a node that is neither a character nor a block device
 * is rejected, and otherwise a node with the same mode and device
 * number is created with mknod(). The mknod() error message names the
 * node that could not be created ("to"), not the container root. */
547 static int copy_devnodes(const char *dest) {
549 static const char devnodes[] =
559 mode_t _cleanup_umask_ u;
565 NULSTR_FOREACH(d, devnodes) {
567 char _cleanup_free_ *from = NULL, *to = NULL;
/* from is the node on the host, to the node inside the container. */
569 asprintf(&from, "/dev/%s", d);
570 asprintf(&to, "%s/dev/%s", dest, d);
581 if (stat(from, &st) < 0) {
583 if (errno != ENOENT) {
584 log_error("Failed to stat %s: %m", from);
589 } else if (!S_ISCHR(st.st_mode) && !S_ISBLK(st.st_mode)) {
591 log_error("%s is not a char or block device, cannot copy", from);
595 } else if (mknod(to, st.st_mode, st.st_rdev) < 0) {
597 log_error("mknod(%s) failed: %m", to);
/* Create <dest>/dev/ptmx as a symlink to "pts/ptmx". A failing
 * symlink() is reported with log_error(). */
606 static int setup_ptmx(const char *dest) {
607 _cleanup_free_ char *p = NULL;
609 p = strappend(dest, "/dev/ptmx");
613 if (symlink("pts/ptmx", p) < 0) {
614 log_error("Failed to create /dev/ptmx symlink: %m");
/* Make the terminal "console" available as <dest>/dev/console. The
 * function checks that console is a character device, resets it to
 * mode 0600 with owner and group 0 via chmod_and_chown(), creates a
 * device node with the same device number and mode 0600 at
 * <dest>/dev/console, and finally bind mounts console onto that
 * node. */
621 static int setup_dev_console(const char *dest, const char *console) {
623 char _cleanup_free_ *to = NULL;
625 mode_t _cleanup_umask_ u;
632 if (stat(console, &st) < 0) {
633 log_error("Failed to stat %s: %m", console);
636 } else if (!S_ISCHR(st.st_mode)) {
637 log_error("/dev/console is not a char device");
641 r = chmod_and_chown(console, 0600, 0, 0);
643 log_error("Failed to correct access mode for TTY: %s", strerror(-r));
647 if (asprintf(&to, "%s/dev/console", dest) < 0)
650 /* We need to bind mount the right tty to /dev/console since
651 * ptys can only exist on pts file systems. To have something
652 * to bind mount things on we create a device node first, that
653 * has the right major/minor (note that the major minor
654 * doesn't actually matter here, since we mount it over
657 if (mknod(to, (st.st_mode & ~07777) | 0600, st.st_rdev) < 0) {
658 log_error("mknod() for /dev/console failed: %m");
662 if (mount(console, to, "bind", MS_BIND, NULL) < 0) {
663 log_error("Bind mount for /dev/console failed: %m");
/* Provide /proc/kmsg inside the container. A FIFO is created at
 * <dest>/dev/kmsg, bind mounted onto <dest>/proc/kmsg and opened; the
 * resulting file descriptor is sent over "kmsg_socket" as an
 * SCM_RIGHTS control message and the local copy is closed
 * afterwards. */
670 static int setup_kmsg(const char *dest, int kmsg_socket) {
671 char _cleanup_free_ *from = NULL, *to = NULL;
673 mode_t _cleanup_umask_ u;
675 struct cmsghdr cmsghdr;
676 uint8_t buf[CMSG_SPACE(sizeof(int))];
679 struct cmsghdr *cmsg;
682 assert(kmsg_socket >= 0);
686 /* We create the kmsg FIFO as /dev/kmsg, but immediately
687 * delete it after bind mounting it to /proc/kmsg. While FIFOs
688 * on the reading side behave very similar to /proc/kmsg,
689 * their writing side behaves differently from /dev/kmsg in
690 * that writing blocks when nothing is reading. In order to
691 * avoid any problems with containers deadlocking due to this
692 * we simply make /dev/kmsg unavailable to the container. */
693 if (asprintf(&from, "%s/dev/kmsg", dest) < 0 ||
694 asprintf(&to, "%s/proc/kmsg", dest) < 0)
697 if (mkfifo(from, 0600) < 0) {
698 log_error("mkfifo() for /dev/kmsg failed: %m");
702 r = chmod_and_chown(from, 0600, 0, 0);
704 log_error("Failed to correct access mode for /dev/kmsg: %s", strerror(-r));
708 if (mount(from, to, "bind", MS_BIND, NULL) < 0) {
709 log_error("Bind mount for /proc/kmsg failed: %m");
713 fd = open(from, O_RDWR|O_NDELAY|O_CLOEXEC);
715 log_error("Failed to open fifo: %m");
/* Build a message whose only payload is one control message of type
 * SCM_RIGHTS carrying fd. */
722 mh.msg_control = &control;
723 mh.msg_controllen = sizeof(control);
725 cmsg = CMSG_FIRSTHDR(&mh);
726 cmsg->cmsg_level = SOL_SOCKET;
727 cmsg->cmsg_type = SCM_RIGHTS;
728 cmsg->cmsg_len = CMSG_LEN(sizeof(int));
729 memcpy(CMSG_DATA(cmsg), &fd, sizeof(int));
731 mh.msg_controllen = cmsg->cmsg_len;
733 /* Store away the fd in the socket, so that it stays open as
734 * long as we run the child */
735 k = sendmsg(kmsg_socket, &mh, MSG_DONTWAIT|MSG_NOSIGNAL);
736 close_nointr_nofail(fd);
739 log_error("Failed to send FIFO fd: %m");
743 /* And now make the FIFO unavailable as /dev/kmsg... */
/* Set the host name from the last path component of arg_directory,
 * after passing it through hostname_cleanup(). */
748 static int setup_hostname(void) {
752 hn = path_get_file_name(arg_directory);
758 hostname_cleanup(hn);
761 if (sethostname(hn, strlen(hn)) < 0)
/* Connect the journal directory of the container with the one of the
 * host, depending on arg_link_journal. The machine ID is read from
 * <directory>/etc/machine-id and validated with
 * sd_id128_from_string(). Afterwards p is the host side path
 * /var/log/journal/<id> and q is the same path below "directory". In
 * LINK_GUEST mode p is made a symlink to q. In LINK_HOST mode p is
 * created with mkdir_p(). The bind mount at the end mounts p onto q,
 * and q is refused if it is not empty. Several checks are relaxed when
 * the mode is LINK_AUTO, see the individual conditions. */
770 static int setup_journal(const char *directory) {
771 sd_id128_t machine_id;
772 char _cleanup_free_ *p = NULL, *b = NULL, *q = NULL, *d = NULL;
776 if (arg_link_journal == LINK_NO)
779 p = strappend(directory, "/etc/machine-id");
783 r = read_one_line_file(p, &b);
784 if (r == -ENOENT && arg_link_journal == LINK_AUTO)
787 log_error("Failed to read machine ID from %s: %s", p, strerror(-r));
792 if (isempty(id) && arg_link_journal == LINK_AUTO)
795 /* Verify validity */
796 r = sd_id128_from_string(id, &machine_id);
798 log_error("Failed to parse machine ID from %s: %s", p, strerror(-r));
/* From here on p refers to the host and q to the container. */
803 p = strappend("/var/log/journal/", id);
804 q = strjoin(directory, "/var/log/journal/", id, NULL);
808 if (path_is_mount_point(p, false) > 0) {
809 if (arg_link_journal != LINK_AUTO) {
810 log_error("%s: already a mount point, refusing to use for journal", p);
817 if (path_is_mount_point(q, false) > 0) {
818 if (arg_link_journal != LINK_AUTO) {
819 log_error("%s: already a mount point, refusing to use for journal", q);
/* Find out what currently exists at p: a symlink (target returned in
 * d), something that is not a symlink (-EINVAL), or nothing
 * (-ENOENT). */
826 r = readlink_and_make_absolute(p, &d);
828 if ((arg_link_journal == LINK_GUEST ||
829 arg_link_journal == LINK_AUTO) &&
832 r = mkdir_p(q, 0755);
834 log_warning("failed to create directory %s: %m", q);
839 log_error("Failed to remove symlink %s: %m", p);
842 } else if (r == -EINVAL) {
844 if (arg_link_journal == LINK_GUEST &&
847 if (errno == ENOTDIR) {
848 log_error("%s already exists and is neither a symlink nor a directory", p);
851 log_error("Failed to remove %s: %m", p);
855 } else if (r != -ENOENT) {
856 log_error("readlink(%s) failed: %m", p);
860 if (arg_link_journal == LINK_GUEST) {
862 if (symlink(q, p) < 0) {
863 log_error("Failed to symlink %s to %s: %m", q, p);
867 r = mkdir_p(q, 0755);
869 log_warning("failed to create directory %s: %m", q);
873 if (arg_link_journal == LINK_HOST) {
874 r = mkdir_p(p, 0755);
876 log_error("Failed to create %s: %m", p);
880 } else if (access(p, F_OK) < 0)
883 if (dir_is_empty(q) == 0) {
884 log_error("%s not empty.", q);
888 r = mkdir_p(q, 0755);
890 log_error("Failed to create %s: %m", q);
894 if (mount(p, q, "bind", MS_BIND, NULL) < 0) {
895 log_error("Failed to bind mount journal from host into guest: %m");
/* Hand the complement of arg_retain to capability_bounding_set_drop()
 * and return its result; presumably this drops every capability whose
 * bit is not set in arg_retain. */
902 static int drop_capabilities(void) {
903 return capability_bounding_set_drop(~arg_retain, false);
/* Heuristic check whether "path" is the root of an OS installation,
 * based on <path>/bin/sh. Returns 1 if the check (not visible here)
 * left r non-negative and 0 otherwise. */
906 static int is_os_tree(const char *path) {
909 /* We use /bin/sh as flag file if something is an OS */
911 if (asprintf(&p, "%s/bin/sh", path) < 0)
917 return r < 0 ? 0 : 1;
/* Relay data between our stdin/stdout and the pty "master", and react
 * to signals delivered through a signalfd created from "mask". All
 * descriptors are switched to non-blocking mode and watched with
 * epoll. in_buffer carries data from stdin to master, out_buffer data
 * from master to stdout. SIGWINCH forwards the window size of stdin to
 * master; the first SIGTERM while arg_boot is set sends SIGRTMIN+3 to
 * "pid" instead of terminating right away. */
920 static int process_pty(int master, pid_t pid, sigset_t *mask) {
922 char in_buffer[LINE_MAX], out_buffer[LINE_MAX];
923 size_t in_buffer_full = 0, out_buffer_full = 0;
924 struct epoll_event stdin_ev, stdout_ev, master_ev, signal_ev;
925 bool stdin_readable = false, stdout_writable = false, master_readable = false, master_writable = false;
926 int ep = -1, signal_fd = -1, r;
927 bool tried_orderly_shutdown = false;
933 fd_nonblock(STDIN_FILENO, 1);
934 fd_nonblock(STDOUT_FILENO, 1);
935 fd_nonblock(master, 1);
937 signal_fd = signalfd(-1, mask, SFD_NONBLOCK|SFD_CLOEXEC);
939 log_error("signalfd(): %m");
944 ep = epoll_create1(EPOLL_CLOEXEC);
946 log_error("Failed to create epoll: %m");
951 /* We read from STDIN only if this is actually a TTY,
952 * otherwise we assume non-interactivity. */
953 if (isatty(STDIN_FILENO)) {
955 stdin_ev.events = EPOLLIN|EPOLLET;
956 stdin_ev.data.fd = STDIN_FILENO;
958 if (epoll_ctl(ep, EPOLL_CTL_ADD, STDIN_FILENO, &stdin_ev) < 0) {
959 log_error("Failed to register STDIN in epoll: %m");
966 stdout_ev.events = EPOLLOUT|EPOLLET;
967 stdout_ev.data.fd = STDOUT_FILENO;
970 master_ev.events = EPOLLIN|EPOLLOUT|EPOLLET;
971 master_ev.data.fd = master;
/* The signalfd is the only descriptor registered without EPOLLET. */
974 signal_ev.events = EPOLLIN;
975 signal_ev.data.fd = signal_fd;
977 if (epoll_ctl(ep, EPOLL_CTL_ADD, STDOUT_FILENO, &stdout_ev) < 0) {
978 if (errno != EPERM) {
979 log_error("Failed to register stdout in epoll: %m");
983 /* stdout without epoll support. Likely redirected to regular file. */
984 stdout_writable = true;
987 if (epoll_ctl(ep, EPOLL_CTL_ADD, master, &master_ev) < 0 ||
988 epoll_ctl(ep, EPOLL_CTL_ADD, signal_fd, &signal_ev) < 0) {
989 log_error("Failed to register fds in epoll: %m");
995 struct epoll_event ev[16];
999 nfds = epoll_wait(ep, ev, ELEMENTSOF(ev), -1);
1002 if (errno == EINTR || errno == EAGAIN)
1005 log_error("epoll_wait(): %m");
/* Translate the reported events into the readable/writable flags
 * that drive the copy loop below. */
1012 for (i = 0; i < nfds; i++) {
1013 if (ev[i].data.fd == STDIN_FILENO) {
1015 if (ev[i].events & (EPOLLIN|EPOLLHUP))
1016 stdin_readable = true;
1018 } else if (ev[i].data.fd == STDOUT_FILENO) {
1020 if (ev[i].events & (EPOLLOUT|EPOLLHUP))
1021 stdout_writable = true;
1023 } else if (ev[i].data.fd == master) {
1025 if (ev[i].events & (EPOLLIN|EPOLLHUP))
1026 master_readable = true;
1028 if (ev[i].events & (EPOLLOUT|EPOLLHUP))
1029 master_writable = true;
1031 } else if (ev[i].data.fd == signal_fd) {
1032 struct signalfd_siginfo sfsi;
1035 n = read(signal_fd, &sfsi, sizeof(sfsi));
1036 if (n != sizeof(sfsi)) {
1039 log_error("Failed to read from signalfd: invalid block size");
1044 if (errno != EINTR && errno != EAGAIN) {
1045 log_error("Failed to read from signalfd: %m");
1051 if (sfsi.ssi_signo == SIGWINCH) {
1054 /* The window size changed, let's forward that. */
1055 if (ioctl(STDIN_FILENO, TIOCGWINSZ, &ws) >= 0)
1056 ioctl(master, TIOCSWINSZ, &ws);
1057 } else if (sfsi.ssi_signo == SIGTERM && arg_boot && !tried_orderly_shutdown) {
1059 log_info("Trying to halt container. Send SIGTERM again to trigger immediate termination.");
1061 /* This only works for systemd... */
1062 tried_orderly_shutdown = true;
1063 kill(pid, SIGRTMIN+3);
/* Keep copying for as long as one of the four directions can make
 * progress: a readable source with an empty buffer, or a writable
 * sink with a non-empty buffer. */
1073 while ((stdin_readable && in_buffer_full <= 0) ||
1074 (master_writable && in_buffer_full > 0) ||
1075 (master_readable && out_buffer_full <= 0) ||
1076 (stdout_writable && out_buffer_full > 0)) {
1078 if (stdin_readable && in_buffer_full < LINE_MAX) {
1080 k = read(STDIN_FILENO, in_buffer + in_buffer_full, LINE_MAX - in_buffer_full);
1083 if (errno == EAGAIN || errno == EPIPE || errno == ECONNRESET || errno == EIO)
1084 stdin_readable = false;
1086 log_error("read(): %m");
1091 in_buffer_full += (size_t) k;
1094 if (master_writable && in_buffer_full > 0) {
1096 k = write(master, in_buffer, in_buffer_full);
1099 if (errno == EAGAIN || errno == EPIPE || errno == ECONNRESET || errno == EIO)
1100 master_writable = false;
1102 log_error("write(): %m");
/* Move the part that was not written to the front of the buffer. */
1108 assert(in_buffer_full >= (size_t) k);
1109 memmove(in_buffer, in_buffer + k, in_buffer_full - k);
1110 in_buffer_full -= k;
1114 if (master_readable && out_buffer_full < LINE_MAX) {
1116 k = read(master, out_buffer + out_buffer_full, LINE_MAX - out_buffer_full);
1119 if (errno == EAGAIN || errno == EPIPE || errno == ECONNRESET || errno == EIO)
1120 master_readable = false;
1122 log_error("read(): %m");
1127 out_buffer_full += (size_t) k;
1130 if (stdout_writable && out_buffer_full > 0) {
1132 k = write(STDOUT_FILENO, out_buffer, out_buffer_full);
1135 if (errno == EAGAIN || errno == EPIPE || errno == ECONNRESET || errno == EIO)
1136 stdout_writable = false;
1138 log_error("write(): %m");
1144 assert(out_buffer_full >= (size_t) k);
1145 memmove(out_buffer, out_buffer + k, out_buffer_full - k);
1146 out_buffer_full -= k;
1154 close_nointr_nofail(ep);
1157 close_nointr_nofail(signal_fd);
/* Entry point. Parses the command line, verifies the environment
 * (running as root, on a systemd system, container directory is not
 * "/" and looks like an OS tree), moves itself into a new cgroup,
 * allocates a pseudo terminal and clones a child into new namespaces.
 * The child prepares the mounts and device nodes below arg_directory,
 * changes root, adjusts credentials and capabilities and executes
 * either an init system, the requested command or /bin/bash. The
 * parent relays the terminal with process_pty() and reports how the
 * child terminated.
 *
 * The error messages of the credential calls name the function that
 * is actually invoked (setresgid()/setresuid()). */
1162 int main(int argc, char *argv[]) {
1164 int r = EXIT_FAILURE, k;
1165 char *oldcg = NULL, *newcg = NULL;
1166 char **controller = NULL;
1167 int master = -1, n_fd_passed;
1168 const char *console = NULL;
1169 struct termios saved_attr, raw_attr;
1171 bool saved_attr_valid = false;
1173 int kmsg_socket_pair[2] = { -1, -1 };
1176 log_parse_environment();
1179 r = parse_argv(argc, argv);
1183 if (arg_directory) {
1186 p = path_make_absolute_cwd(arg_directory);
1187 free(arg_directory);
1190 arg_directory = get_current_dir_name();
1192 if (!arg_directory) {
1193 log_error("Failed to determine path");
1197 path_kill_slashes(arg_directory);
1199 if (geteuid() != 0) {
1200 log_error("Need to be root.");
1204 if (sd_booted() <= 0) {
1205 log_error("Not running on a systemd system.");
1209 if (path_equal(arg_directory, "/")) {
1210 log_error("Spawning container on root directory not supported.");
1214 if (is_os_tree(arg_directory) <= 0) {
1215 log_error("Directory %s doesn't look like an OS root directory. Refusing.", arg_directory);
1220 n_fd_passed = sd_listen_fds(false);
1221 if (n_fd_passed > 0) {
1222 k = fdset_new_listen_fds(&fds, false);
1224 log_error("Failed to collect file descriptors: %s", strerror(-k));
1228 fdset_close_others(fds);
/* Create a cgroup named nspawn-<pid> below the current one and move
 * into it, also in every controller requested on the command line. */
1231 k = cg_get_by_pid(SYSTEMD_CGROUP_CONTROLLER, 0, &oldcg);
1233 log_error("Failed to determine current cgroup: %s", strerror(-k));
1237 if (asprintf(&newcg, "%s/nspawn-%lu", oldcg, (unsigned long) getpid()) < 0) {
1238 log_error("Failed to allocate cgroup path.");
1242 k = cg_create_and_attach(SYSTEMD_CGROUP_CONTROLLER, newcg, 0);
1244 log_error("Failed to create cgroup: %s", strerror(-k));
1248 STRV_FOREACH(controller, arg_controllers) {
1249 k = cg_create_and_attach(*controller, newcg, 0);
1251 log_warning("Failed to create cgroup in controller %s: %s", *controller, strerror(-k));
1254 master = posix_openpt(O_RDWR|O_NOCTTY|O_CLOEXEC|O_NDELAY);
1256 log_error("Failed to acquire pseudo tty: %m");
1260 console = ptsname(master);
1262 log_error("Failed to determine tty name: %m");
1266 log_info("Spawning namespace container on %s (console is %s).", arg_directory, console);
1268 if (ioctl(STDIN_FILENO, TIOCGWINSZ, &ws) >= 0)
1269 ioctl(master, TIOCSWINSZ, &ws);
1271 if (unlockpt(master) < 0) {
1272 log_error("Failed to unlock tty: %m");
/* Remember the terminal settings of stdin so that they can be
 * restored later, and derive a raw mode variant without echo. */
1276 if (tcgetattr(STDIN_FILENO, &saved_attr) >= 0) {
1277 saved_attr_valid = true;
1279 raw_attr = saved_attr;
1280 cfmakeraw(&raw_attr);
1281 raw_attr.c_lflag &= ~ECHO;
1284 if (socketpair(AF_UNIX, SOCK_DGRAM|SOCK_NONBLOCK|SOCK_CLOEXEC, 0, kmsg_socket_pair) < 0) {
1285 log_error("Failed to create kmsg socket pair");
1289 assert_se(sigemptyset(&mask) == 0);
1290 sigset_add_many(&mask, SIGCHLD, SIGWINCH, SIGTERM, SIGINT, -1);
1291 assert_se(sigprocmask(SIG_BLOCK, &mask, NULL) == 0);
1297 if (pipe2(pipefd, O_NONBLOCK|O_CLOEXEC) < 0) {
1298 log_error("pipe2(): %m");
/* The child gets its own IPC, mount, PID and UTS namespaces, plus a
 * network namespace if arg_private_network is set. */
1302 pid = syscall(__NR_clone, SIGCHLD|CLONE_NEWIPC|CLONE_NEWNS|CLONE_NEWPID|CLONE_NEWUTS|(arg_private_network ? CLONE_NEWNET : 0), NULL);
1304 if (errno == EINVAL)
1305 log_error("clone() failed, do you have namespace support enabled in your kernel? (You need UTS, IPC, PID and NET namespacing built in): %m");
1307 log_error("clone() failed: %m");
1314 const char *home = NULL;
1315 uid_t uid = (uid_t) -1;
1316 gid_t gid = (gid_t) -1;
1318 const char *envp[] = {
1319 "PATH=/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin",
1320 "container=systemd-nspawn", /* LXC sets container=lxc, so follow the scheme here */
1325 NULL, /* container_uuid */
1326 NULL, /* LISTEN_FDS */
1327 NULL, /* LISTEN_PID */
1331 envp[n_env] = strv_find_prefix(environ, "TERM=");
/* Wait for a hang-up on the pipe; the parent closes both of its ends
 * after it has logged the PID of the child. */
1335 close_nointr_nofail(pipefd[1]);
1336 fd_wait_for_event(pipefd[0], POLLHUP, -1);
1337 close_nointr_nofail(pipefd[0]);
1339 close_nointr_nofail(master);
1342 if (saved_attr_valid) {
1343 if (tcsetattr(STDIN_FILENO, TCSANOW, &raw_attr) < 0) {
1344 log_error("Failed to set terminal attributes: %m");
1349 close_nointr(STDIN_FILENO);
1350 close_nointr(STDOUT_FILENO);
1351 close_nointr(STDERR_FILENO);
1353 close_nointr_nofail(kmsg_socket_pair[0]);
1354 kmsg_socket_pair[0] = -1;
1356 reset_all_signal_handlers();
1358 assert_se(sigemptyset(&mask) == 0);
1359 assert_se(sigprocmask(SIG_SETMASK, &mask, NULL) == 0);
1361 k = open_terminal(console, O_RDWR);
1362 if (k != STDIN_FILENO) {
1364 close_nointr_nofail(k);
1368 log_error("Failed to open console: %s", strerror(-k));
1372 if (dup2(STDIN_FILENO, STDOUT_FILENO) != STDOUT_FILENO ||
1373 dup2(STDIN_FILENO, STDERR_FILENO) != STDERR_FILENO) {
1374 log_error("Failed to duplicate console: %m");
1379 log_error("setsid() failed: %m");
1383 if (prctl(PR_SET_PDEATHSIG, SIGKILL) < 0) {
1384 log_error("PR_SET_PDEATHSIG failed: %m");
1388 /* Mark everything as slave, so that we still
1389 * receive mounts from the real root, but don't
1390 * propagate mounts to the real root. */
1391 if (mount(NULL, "/", NULL, MS_SLAVE|MS_REC, NULL) < 0) {
1392 log_error("MS_SLAVE|MS_REC failed: %m");
1396 /* Turn directory into bind mount */
1397 if (mount(arg_directory, arg_directory, "bind", MS_BIND|MS_REC, NULL) < 0) {
1398 log_error("Failed to make bind mount.");
1403 if (mount(arg_directory, arg_directory, "bind", MS_BIND|MS_REMOUNT|MS_RDONLY|MS_REC, NULL) < 0) {
1404 log_error("Failed to make read-only.");
1408 if (mount_all(arg_directory) < 0)
1411 if (copy_devnodes(arg_directory) < 0)
1414 if (setup_ptmx(arg_directory) < 0)
1417 dev_setup(arg_directory);
1419 if (setup_dev_console(arg_directory, console) < 0)
1422 if (setup_kmsg(arg_directory, kmsg_socket_pair[1]) < 0)
1425 close_nointr_nofail(kmsg_socket_pair[1]);
1426 kmsg_socket_pair[1] = -1;
1428 if (setup_boot_id(arg_directory) < 0)
1431 if (setup_timezone(arg_directory) < 0)
1434 if (setup_resolv_conf(arg_directory) < 0)
1437 if (setup_journal(arg_directory) < 0)
1440 if (mount_binds(arg_directory, arg_bind, 0) < 0)
1443 if (mount_binds(arg_directory, arg_bind_ro, MS_RDONLY) < 0)
1446 if (chdir(arg_directory) < 0) {
1447 log_error("chdir(%s) failed: %m", arg_directory);
1451 if (mount(arg_directory, "/", NULL, MS_MOVE, NULL) < 0) {
1452 log_error("mount(MS_MOVE) failed: %m");
1456 if (chroot(".") < 0) {
1457 log_error("chroot() failed: %m");
1461 if (chdir("/") < 0) {
1462 log_error("chdir() failed: %m");
1470 if (drop_capabilities() < 0) {
1471 log_error("drop_capabilities() failed: %m");
1477 /* Note that this resolves user names
1478 * inside the container, and hence
1479 * accesses the NSS modules from the
1480 * container and not the host. This is
1483 if (get_user_creds((const char**)&arg_user, &uid, &gid, &home, NULL) < 0) {
1484 log_error("get_user_creds() failed: %m");
1488 if (mkdir_parents_label(home, 0775) < 0) {
1489 log_error("mkdir_parents_label() failed: %m");
1493 if (mkdir_safe_label(home, 0775, uid, gid) < 0) {
1494 log_error("mkdir_safe_label() failed: %m");
1498 if (initgroups((const char*)arg_user, gid) < 0) {
1499 log_error("initgroups() failed: %m");
1503 if (setresgid(gid, gid, gid) < 0) {
1504 log_error("setresgid() failed: %m");
1508 if (setresuid(uid, uid, uid) < 0) {
1509 log_error("setresuid() failed: %m");
1513 /* Reset everything fully to 0, just in case */
1515 if (setgroups(0, NULL) < 0) {
1516 log_error("setgroups() failed: %m");
1520 if (setresgid(0, 0, 0) < 0) {
1521 log_error("setresgid() failed: %m");
1525 if (setresuid(0, 0, 0) < 0) {
1526 log_error("setresuid() failed: %m");
1531 if ((asprintf((char**)(envp + n_env++), "HOME=%s", home ? home: "/root") < 0) ||
1532 (asprintf((char**)(envp + n_env++), "USER=%s", arg_user ? arg_user : "root") < 0) ||
1533 (asprintf((char**)(envp + n_env++), "LOGNAME=%s", arg_user ? arg_user : "root") < 0)) {
1539 if (asprintf((char**)(envp + n_env++), "container_uuid=%s", arg_uuid) < 0) {
1545 if (fdset_size(fds) > 0) {
1546 k = fdset_cloexec(fds, false);
1548 log_error("Failed to unset O_CLOEXEC for file descriptors.");
1552 if ((asprintf((char **)(envp + n_env++), "LISTEN_FDS=%u", n_fd_passed) < 0) ||
1553 (asprintf((char **)(envp + n_env++), "LISTEN_PID=%lu", (unsigned long) getpid()) < 0)) {
1565 /* Automatically search for the init system */
1567 l = 1 + argc - optind;
1568 a = newa(char*, l + 1);
1569 memcpy(a + 1, argv + optind, l * sizeof(char*));
1571 a[0] = (char*) "/usr/lib/systemd/systemd";
1572 execve(a[0], a, (char**) envp);
1574 a[0] = (char*) "/lib/systemd/systemd";
1575 execve(a[0], a, (char**) envp);
1577 a[0] = (char*) "/sbin/init";
1578 execve(a[0], a, (char**) envp);
1579 } else if (argc > optind)
1580 execvpe(argv[optind], argv + optind, (char**) envp);
1582 chdir(home ? home : "/root");
1583 execle("/bin/bash", "-bash", NULL, (char**) envp);
1586 log_error("execv() failed: %m");
1589 _exit(EXIT_FAILURE);
1592 log_info("Init process in the container running as PID %lu.", (unsigned long) pid);
1593 close_nointr_nofail(pipefd[0]);
1594 close_nointr_nofail(pipefd[1]);
1599 if (process_pty(master, pid, &mask) < 0)
1602 if (saved_attr_valid)
1603 tcsetattr(STDIN_FILENO, TCSANOW, &saved_attr);
1605 r = wait_for_terminate(pid, &status);
/* Report how the container ended, based on the siginfo of the child. */
1611 if (status.si_code == CLD_EXITED) {
1612 if (status.si_status != 0) {
1613 log_error("Container failed with error code %i.", status.si_status);
1614 r = status.si_status;
1618 log_debug("Container exited successfully.");
1620 } else if (status.si_code == CLD_KILLED &&
1621 status.si_status == SIGINT) {
1622 log_info("Container has been shut down.");
1625 } else if (status.si_code == CLD_KILLED &&
1626 status.si_status == SIGHUP) {
1627 log_info("Container is being rebooted.");
1629 } else if (status.si_code == CLD_KILLED ||
1630 status.si_code == CLD_DUMPED) {
1632 log_error("Container terminated by signal %s.", signal_to_string(status.si_status));
1636 log_error("Container failed due to unknown reason.");
1643 if (saved_attr_valid)
1644 tcsetattr(STDIN_FILENO, TCSANOW, &saved_attr);
1647 close_nointr_nofail(master);
1649 close_pipe(kmsg_socket_pair);
1652 cg_attach(SYSTEMD_CGROUP_CONTROLLER, oldcg, 0);
1655 cg_kill_recursive_and_wait(SYSTEMD_CGROUP_CONTROLLER, newcg, true);
1657 free(arg_directory);
1658 strv_free(arg_controllers);