1 /*-*- Mode: C; c-basic-offset: 8; indent-tabs-mode: nil -*-*/
4 This file is part of systemd.
6 Copyright 2010 Lennart Poettering
8 systemd is free software; you can redistribute it and/or modify it
9 under the terms of the GNU Lesser General Public License as published by
10 the Free Software Foundation; either version 2.1 of the License, or
11 (at your option) any later version.
13 systemd is distributed in the hope that it will be useful, but
14 WITHOUT ANY WARRANTY; without even the implied warranty of
15 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
16 Lesser General Public License for more details.
18 You should have received a copy of the GNU Lesser General Public License
19 along with systemd; If not, see <http://www.gnu.org/licenses/>.
25 #include <sys/types.h>
26 #include <sys/syscall.h>
27 #include <sys/mount.h>
33 #include <sys/prctl.h>
34 #include <sys/capability.h>
37 #include <sys/epoll.h>
39 #include <sys/signalfd.h>
43 #include <sys/socket.h>
45 #include <systemd/sd-daemon.h>
53 #include "cgroup-util.h"
55 #include "path-util.h"
56 #include "loopback-setup.h"
58 #include "dev-setup.h"
63 typedef enum LinkJournal {
/* Command-line configuration. These are filled in by parse_argv() and read by the setup helpers and main(). */
/* Root directory of the container (-D/--directory); parse_argv() stores the canonicalized path. */
70 static char *arg_directory = NULL;
/* User name or uid to run the command as (-u/--user); stays NULL when the option is not given. */
71 static char *arg_user = NULL;
/* Additional cgroup hierarchies for the container (-C/--controllers), split on ',' into a string vector. */
72 static char **arg_controllers = NULL;
/* Machine UUID for the container (--uuid). */
73 static char *arg_uuid = NULL;
/* --private-network: when true the container is cloned with CLONE_NEWNET and resolv.conf is not set up. */
74 static bool arg_private_network = false;
/* --read-only: mount the container root read-only. */
75 static bool arg_read_only = false;
/* -b/--boot: boot a full system (invoke init) instead of running a single command. */
76 static bool arg_boot = false;
/* Journal linking mode (--link-journal); LINK_AUTO unless overridden on the command line. */
77 static LinkJournal arg_link_journal = LINK_AUTO;
/* Bit mask of capabilities the container keeps, one bit per CAP_* number.
 * --capability ORs further bits into it; drop_capabilities() hands the
 * complement of this mask to capability_bounding_set_drop().
 * NOTE(review): this listing skips some original lines inside the
 * initializer, so the default set may contain more entries than shown. */
78 static uint64_t arg_retain =
80 (1ULL << CAP_DAC_OVERRIDE) |
81 (1ULL << CAP_DAC_READ_SEARCH) |
82 (1ULL << CAP_FOWNER) |
83 (1ULL << CAP_FSETID) |
84 (1ULL << CAP_IPC_OWNER) |
87 (1ULL << CAP_LINUX_IMMUTABLE) |
88 (1ULL << CAP_NET_BIND_SERVICE) |
89 (1ULL << CAP_NET_BROADCAST) |
90 (1ULL << CAP_NET_RAW) |
91 (1ULL << CAP_SETGID) |
92 (1ULL << CAP_SETFCAP) |
93 (1ULL << CAP_SETPCAP) |
94 (1ULL << CAP_SETUID) |
95 (1ULL << CAP_SYS_ADMIN) |
96 (1ULL << CAP_SYS_CHROOT) |
97 (1ULL << CAP_SYS_NICE) |
98 (1ULL << CAP_SYS_PTRACE) |
99 (1ULL << CAP_SYS_TTY_CONFIG) |
100 (1ULL << CAP_SYS_RESOURCE) |
101 (1ULL << CAP_SYS_BOOT) |
102 (1ULL << CAP_AUDIT_WRITE) |
103 (1ULL << CAP_AUDIT_CONTROL);
/* Bind mounts requested with --bind (read-write) and --bind-ro (read-only).
 * parse_argv() appends two entries per option, and mount_binds() walks each
 * vector in pairs, using the first entry as the mount source and the second
 * as the path below the container root. */
104 static char **arg_bind = NULL;
105 static char **arg_bind_ro = NULL;
/* Print the usage summary to stdout. The program name shown is
 * program_invocation_short_name.
 *
 * The -j line says "guest" because the option handler that sits between the
 * --capability and --link-journal cases in parse_argv() sets LINK_GUEST; the
 * previous text ("host") contradicted that behaviour.
 * NOTE(review): the case label of that handler is not visible in this
 * listing -- confirm it really is 'j' before merging. */
107 static int help(void) {
109 printf("%s [OPTIONS...] [PATH] [ARGUMENTS...]\n\n"
110 "Spawn a minimal namespace container for debugging, testing and building.\n\n"
111 " -h --help Show this help\n"
112 " --version Print version string\n"
113 " -D --directory=NAME Root directory for the container\n"
114 " -b --boot Boot up full system (i.e. invoke init)\n"
115 " -u --user=USER Run the command under specified user or uid\n"
116 " -C --controllers=LIST Put the container in specified comma-separated\n"
117 " cgroup hierarchies\n"
118 " --uuid=UUID Set a specific machine UUID for the container\n"
119 " --private-network Disable network in container\n"
120 " --read-only Mount the root directory read-only\n"
121 " --capability=CAP In addition to the default, retain specified\n"
123 " --link-journal=MODE Link up guest journal, one of no, auto, guest, host\n"
124 " -j Equivalent to --link-journal=guest\n"
125 " --bind=PATH[:PATH] Bind mount a file or directory from the host into\n"
127 " --bind-ro=PATH[:PATH] Similar, but creates a read-only bind mount\n",
128 program_invocation_short_name);
/* Parse the command line with getopt_long() and store the results in the
 * file-scope arg_* variables.
 *
 * Visible effects per option: -D canonicalizes the directory, -u duplicates
 * the user name, -C replaces and de-duplicates the controller list,
 * --capability ORs named capabilities into arg_retain, --link-journal selects
 * the journal mode, and --bind/--bind-ro append path pairs to arg_bind and
 * arg_bind_ro.
 * NOTE(review): most case labels and all return statements are absent from
 * this listing, so the return convention cannot be stated from here. */
133 static int parse_argv(int argc, char *argv[]) {
146 static const struct option options[] = {
147 { "help", no_argument, NULL, 'h' },
148 { "version", no_argument, NULL, ARG_VERSION },
149 { "directory", required_argument, NULL, 'D' },
150 { "user", required_argument, NULL, 'u' },
151 { "controllers", required_argument, NULL, 'C' },
152 { "private-network", no_argument, NULL, ARG_PRIVATE_NETWORK },
153 { "boot", no_argument, NULL, 'b' },
154 { "uuid", required_argument, NULL, ARG_UUID },
155 { "read-only", no_argument, NULL, ARG_READ_ONLY },
156 { "capability", required_argument, NULL, ARG_CAPABILITY },
157 { "link-journal", required_argument, NULL, ARG_LINK_JOURNAL },
158 { "bind", required_argument, NULL, ARG_BIND },
159 { "bind-ro", required_argument, NULL, ARG_BIND_RO },
/* The leading '+' makes GNU getopt stop at the first non-option argument, so
 * options that belong to the command run inside the container are left alone. */
168 while ((c = getopt_long(argc, argv, "+hD:u:C:bj", options, NULL)) >= 0) {
177 puts(PACKAGE_STRING);
178 puts(SYSTEMD_FEATURES);
183 arg_directory = canonicalize_file_name(optarg);
184 if (!arg_directory) {
185 log_error("Failed to canonicalize root directory.");
193 if (!(arg_user = strdup(optarg))) {
194 log_error("Failed to duplicate user name.");
/* A repeated -C replaces the previous list rather than extending it. */
201 strv_free(arg_controllers);
202 arg_controllers = strv_split(optarg, ",");
203 if (!arg_controllers) {
204 log_error("Failed to split controllers list.");
207 strv_uniq(arg_controllers);
211 case ARG_PRIVATE_NETWORK:
212 arg_private_network = true;
224 arg_read_only = true;
/* --capability takes a comma-separated list of capability names; each one is
 * translated with cap_from_name() and its bit is added to arg_retain. */
227 case ARG_CAPABILITY: {
231 FOREACH_WORD_SEPARATOR(word, length, optarg, ",", state) {
235 t = strndup(word, length);
239 if (cap_from_name(t, &cap) < 0) {
240 log_error("Failed to parse capability %s.", t);
246 arg_retain |= 1ULL << (uint64_t) cap;
/* NOTE(review): presumably the handler for the short option -j (its case
 * label is not part of this listing) -- confirm. */
253 arg_link_journal = LINK_GUEST;
256 case ARG_LINK_JOURNAL:
257 if (streq(optarg, "auto"))
258 arg_link_journal = LINK_AUTO;
259 else if (streq(optarg, "no"))
260 arg_link_journal = LINK_NO;
261 else if (streq(optarg, "guest"))
262 arg_link_journal = LINK_GUEST;
263 else if (streq(optarg, "host"))
264 arg_link_journal = LINK_HOST;
266 log_error("Failed to parse link journal mode %s", optarg);
/* --bind and --bind-ro share this code: x selects the target vector, and the
 * argument is split at the first ':' into a and b. Both paths must be
 * absolute; they are then appended to the vector as a pair, a first.
 * NOTE(review): how b is derived when no ':' is present is not visible here. */
274 _cleanup_free_ char *a = NULL, *b = NULL;
279 x = c == ARG_BIND ? &arg_bind : &arg_bind_ro;
281 e = strchr(optarg, ':');
283 a = strndup(optarg, e - optarg);
293 if (!path_is_absolute(a) || !path_is_absolute(b)) {
294 log_error("Invalid bind mount specification: %s", optarg);
298 r = strv_extend(x, a);
302 r = strv_extend(x, b);
313 log_error("Unknown option code %c", c);
/* Mount the API file systems listed in mount_table below the container root
 * `dest`.
 *
 * For every table entry the target path is dest + "/" + entry.where. The
 * target directory is created with mkdir_p() and then mount() is called.
 * A mount() failure is logged only for entries whose `fatal` field is true.
 * NOTE(review): the MountPoint field declarations and the return statements
 * are not part of this listing. The initializer columns appear to be
 * what, where, type, options, flags, fatal -- confirm against the struct. */
321 static int mount_all(const char *dest) {
323 typedef struct MountPoint {
332 static const MountPoint mount_table[] = {
333 { "proc", "/proc", "proc", NULL, MS_NOSUID|MS_NOEXEC|MS_NODEV, true },
334 { "/proc/sys", "/proc/sys", NULL, NULL, MS_BIND, true }, /* Bind mount first */
335 { NULL, "/proc/sys", NULL, NULL, MS_BIND|MS_RDONLY|MS_REMOUNT, true }, /* Then, make it r/o */
336 { "sysfs", "/sys", "sysfs", NULL, MS_RDONLY|MS_NOSUID|MS_NOEXEC|MS_NODEV, true },
337 { "tmpfs", "/dev", "tmpfs", "mode=755", MS_NOSUID|MS_STRICTATIME, true },
338 { "/dev/pts", "/dev/pts", NULL, NULL, MS_BIND, true },
339 { "tmpfs", "/dev/shm", "tmpfs", "mode=1777", MS_NOSUID|MS_NODEV|MS_STRICTATIME, true },
340 { "tmpfs", "/run", "tmpfs", "mode=755", MS_NOSUID|MS_NODEV|MS_STRICTATIME, true },
342 { "/sys/fs/selinux", "/sys/fs/selinux", NULL, NULL, MS_BIND, false }, /* Bind mount first */
343 { NULL, "/sys/fs/selinux", NULL, NULL, MS_BIND|MS_RDONLY|MS_REMOUNT, false }, /* Then, make it r/o */
350 for (k = 0; k < ELEMENTSOF(mount_table); k++) {
351 char _cleanup_free_ *where = NULL;
354 where = strjoin(dest, "/", mount_table[k].where, NULL);
/* t is the result of the mount point check: negative means the check itself
 * failed, positive means `where` is already a mount point. */
358 t = path_is_mount_point(where, true);
360 log_error("Failed to detect whether %s is a mount point: %s", where, strerror(-t));
368 /* Already a mount point: skip the entry, unless it is a remount entry (those have what == NULL). */
369 if (mount_table[k].what && t > 0)
/* The result of mkdir_p() is not checked; a missing directory shows up as a mount() failure below. */
372 mkdir_p(where, 0755);
374 if (mount(mount_table[k].what,
377 mount_table[k].flags,
378 mount_table[k].options) < 0 &&
379 mount_table[k].fatal) {
381 log_error("mount(%s) failed: %m", where);
/* Bind mount every (source, destination) pair of the string vector `l` into
 * the container rooted at `dest`.
 *
 * dest:  container root directory
 * l:     string vector iterated in pairs; *x is the mount source and *y is
 *        the path below dest
 * flags: extra mount flags; when non-zero a second mount() call remounts the
 *        bind mount with MS_REMOUNT|MS_BIND|flags
 *
 * Both mount() failures are logged with the same message. Return statements
 * are not part of this listing. */
391 static int mount_binds(const char *dest, char **l, unsigned long flags) {
394 STRV_FOREACH_PAIR(x, y, l) {
395 _cleanup_free_ char *where = NULL;
397 where = strjoin(dest, "/", *y, NULL);
/* The result of mkdir_p_label() is not checked here. */
401 mkdir_p_label(where, 0755);
403 if (mount(*x, where, "bind", MS_BIND, NULL) < 0) {
404 log_error("mount(%s) failed: %m", where);
/* Extra flags are applied in a separate remount of the bind mount. */
408 if (flags && mount(NULL, where, NULL, MS_REMOUNT|MS_BIND|flags, NULL) < 0) {
409 log_error("mount(%s) failed: %m", where);
/* Make <dest>/etc/localtime name the same time zone as the host's
 * /etc/localtime.
 *
 * The host link is read with readlink_malloc() and must point into
 * /usr/share/zoneinfo/ (relative "../usr/share/zoneinfo/" or absolute form).
 * If it does not, a warning is logged. Nothing is changed when the
 * container's link already names the same zone. If the zone file is missing
 * below dest, a warning is logged. Otherwise a relative symlink
 * "../usr/share/zoneinfo/<zone>" is created at <dest>/etc/localtime.
 * NOTE(review): return statements, and whatever removes an existing link
 * before symlink(), are not part of this listing. */
417 static int setup_timezone(const char *dest) {
418 _cleanup_free_ char *where = NULL, *p = NULL, *q = NULL, *check = NULL, *what = NULL;
424 /* Fix the timezone, if possible */
425 r = readlink_malloc("/etc/localtime", &p);
427 log_warning("/etc/localtime is not a symlink, not updating container timezone.");
/* z is the zone name, i.e. the part of the host link after the zoneinfo prefix. */
431 z = path_startswith(p, "../usr/share/zoneinfo/");
433 z = path_startswith(p, "/usr/share/zoneinfo/");
435 log_warning("/etc/localtime does not point into /usr/share/zoneinfo/, not updating container timezone.");
439 where = strappend(dest, "/etc/localtime");
/* y is the zone name the container currently links to, if any. */
443 r = readlink_malloc(where, &q);
445 y = path_startswith(q, "../usr/share/zoneinfo/");
447 y = path_startswith(q, "/usr/share/zoneinfo/");
450 /* Already pointing to the right place? Then do nothing .. */
451 if (y && streq(y, z))
/* Only link to the zone if its data file exists inside the container. */
455 check = strjoin(dest, "/usr/share/zoneinfo/", z, NULL);
459 if (access(check, F_OK) < 0) {
460 log_warning("Timezone %s does not exist in container, not updating container timezone.", z);
464 what = strappend("../usr/share/zoneinfo/", z);
469 if (symlink(what, where) < 0) {
470 log_error("Failed to correct timezone of container: %m");
/* Make the host's /etc/resolv.conf visible in the container by bind mounting
 * it over <dest>/etc/resolv.conf and then remounting it read-only.
 *
 * This is best effort: neither mount() result causes an error to be reported.
 * The arg_private_network check comes before any mounting.
 * NOTE(review): the statement guarded by that check is not part of this
 * listing; presumably the function returns early -- confirm. */
477 static int setup_resolv_conf(const char *dest) {
482 if (arg_private_network)
485 /* Fix resolv.conf, if possible */
486 where = strappend(dest, "/etc/resolv.conf");
490 /* We don't really care for the results of this really. If it
491 * fails, it fails, but meh... */
/* The read-only remount is attempted only if the bind mount succeeded. */
492 if (mount("/etc/resolv.conf", where, "bind", MS_BIND, NULL) >= 0)
493 mount("/etc/resolv.conf", where, "bind", MS_BIND|MS_REMOUNT|MS_RDONLY, NULL);
/* Give the container its own boot ID.
 *
 * A random 128-bit ID is generated with sd_id128_randomize(), formatted as a
 * UUID string (8-4-4-4-12 hex digits) and written to
 * <dest>/dev/proc-sys-kernel-random-boot-id. That file is then bind mounted
 * over <dest>/proc/sys/kernel/random/boot_id and remounted read-only. The
 * result of the read-only remount is not checked.
 * NOTE(review): return statements and any cleanup of the temporary file are
 * not part of this listing. */
500 static int setup_boot_id(const char *dest) {
501 char _cleanup_free_ *from = NULL, *to = NULL;
508 /* Generate a new randomized boot ID, so that each boot-up of
509 * the container gets a new one */
511 from = strappend(dest, "/dev/proc-sys-kernel-random-boot-id");
512 to = strappend(dest, "/proc/sys/kernel/random/boot_id");
516 r = sd_id128_randomize(&rnd);
518 log_error("Failed to generate random boot id: %s", strerror(-r));
522 snprintf(as_uuid, sizeof(as_uuid),
523 "%02x%02x%02x%02x-%02x%02x-%02x%02x-%02x%02x-%02x%02x%02x%02x%02x%02x",
524 SD_ID128_FORMAT_VAL(rnd));
525 char_array_0(as_uuid);
527 r = write_one_line_file(from, as_uuid);
529 log_error("Failed to write boot id: %s", strerror(-r));
533 if (mount(from, to, "bind", MS_BIND, NULL) < 0) {
534 log_error("Failed to bind mount boot id: %m");
/* Best effort: make the bind mount read-only. */
537 mount(from, to, "bind", MS_BIND|MS_REMOUNT|MS_RDONLY, NULL);
/* Re-create selected host device nodes inside the container.
 *
 * For every name d in the `devnodes` list, /dev/<d> on the host is stat()ed
 * and a node with the same mode and device number is created with mknod() at
 * <dest>/dev/<d>. A host node that does not exist (ENOENT) is not reported
 * as an error. A host path that is neither a character nor a block device is
 * reported and not copied.
 *
 * The mknod() failure message now prints `to`, the path that was passed to
 * mknod(); it used to print `dest`, the container root, which hid the node
 * that actually failed.
 * NOTE(review): the device list, the handling of the asprintf() results and
 * the return statements are not part of this listing. */
543 static int copy_devnodes(const char *dest) {
545 static const char devnodes[] =
556 mode_t _cleanup_umask_ u;
562 NULSTR_FOREACH(d, devnodes) {
564 char _cleanup_free_ *from = NULL, *to = NULL;
566 asprintf(&from, "/dev/%s", d);
567 asprintf(&to, "%s/dev/%s", dest, d);
578 if (stat(from, &st) < 0) {
/* Only stat() errors other than "no such device node" are logged. */
580 if (errno != ENOENT) {
581 log_error("Failed to stat %s: %m", from);
586 } else if (!S_ISCHR(st.st_mode) && !S_ISBLK(st.st_mode)) {
588 log_error("%s is not a char or block device, cannot copy", from);
592 } else if (mknod(to, st.st_mode, st.st_rdev) < 0) {
594 log_error("mknod(%s) failed: %m", to);
/* Provide <dest>/dev/console inside the container, backed by the TTY
 * `console`.
 *
 * dest:    container root directory
 * console: path of the TTY to expose; it must be a character device
 *
 * Steps visible here: stat() the TTY and reject anything that is not a
 * character device; set its mode to 0600 and its owner to 0:0 with
 * chmod_and_chown(); create a character device node at <dest>/dev/console
 * with the same device number and permission bits 0600; finally bind mount
 * `console` over that node.
 * NOTE(review): return statements are not part of this listing. */
603 static int setup_dev_console(const char *dest, const char *console) {
605 char _cleanup_free_ *to = NULL;
607 mode_t _cleanup_umask_ u;
614 if (stat(console, &st) < 0) {
615 log_error("Failed to stat %s: %m", console);
618 } else if (!S_ISCHR(st.st_mode)) {
619 log_error("/dev/console is not a char device");
623 r = chmod_and_chown(console, 0600, 0, 0);
625 log_error("Failed to correct access mode for TTY: %s", strerror(-r));
629 if (asprintf(&to, "%s/dev/console", dest) < 0)
632 /* We need to bind mount the right tty to /dev/console since
633 * ptys can only exist on pts file systems. To have something
634 * to bind mount things on we create a device node first, that
635 * has the right major/minor (note that the major minor
636 * doesn't actually matter here, since we mount it over
639 if (mknod(to, (st.st_mode & ~07777) | 0600, st.st_rdev) < 0) {
640 log_error("mknod() for /dev/console failed: %m");
644 if (mount(console, to, "bind", MS_BIND, NULL) < 0) {
645 log_error("Bind mount for /dev/console failed: %m");
/* Set up a FIFO that stands in for the kernel log inside the container.
 *
 * dest:        container root directory
 * kmsg_socket: socket over which the opened FIFO fd is sent; it must be >= 0
 *
 * Steps visible here: create a FIFO at <dest>/dev/kmsg (mode 0600, owner
 * 0:0), bind mount it over <dest>/proc/kmsg, open it read-write and
 * non-blocking, and send the fd as SCM_RIGHTS ancillary data over
 * kmsg_socket. The local copy of the fd is closed right after sendmsg().
 * NOTE(review): the declaration of `control`, the initialization of `mh`,
 * the removal of the FIFO announced by the last comment, and the return
 * statements are not part of this listing. */
652 static int setup_kmsg(const char *dest, int kmsg_socket) {
653 char _cleanup_free_ *from = NULL, *to = NULL;
655 mode_t _cleanup_umask_ u;
657 struct cmsghdr cmsghdr;
658 uint8_t buf[CMSG_SPACE(sizeof(int))];
661 struct cmsghdr *cmsg;
664 assert(kmsg_socket >= 0);
668 /* We create the kmsg FIFO as /dev/kmsg, but immediately
669 * delete it after bind mounting it to /proc/kmsg. While FIFOs
670 * on the reading side behave very similar to /proc/kmsg,
671 * their writing side behaves differently from /dev/kmsg in
672 * that writing blocks when nothing is reading. In order to
673 * avoid any problems with containers deadlocking due to this
674 * we simply make /dev/kmsg unavailable to the container. */
675 if (asprintf(&from, "%s/dev/kmsg", dest) < 0 ||
676 asprintf(&to, "%s/proc/kmsg", dest) < 0)
679 if (mkfifo(from, 0600) < 0) {
680 log_error("mkfifo() for /dev/kmsg failed: %m");
684 r = chmod_and_chown(from, 0600, 0, 0);
686 log_error("Failed to correct access mode for /dev/kmsg: %s", strerror(-r));
690 if (mount(from, to, "bind", MS_BIND, NULL) < 0) {
691 log_error("Bind mount for /proc/kmsg failed: %m");
695 fd = open(from, O_RDWR|O_NDELAY|O_CLOEXEC);
697 log_error("Failed to open fifo: %m");
/* Build a message whose only content is one SCM_RIGHTS control message
 * carrying fd. */
704 mh.msg_control = &control;
705 mh.msg_controllen = sizeof(control);
707 cmsg = CMSG_FIRSTHDR(&mh);
708 cmsg->cmsg_level = SOL_SOCKET;
709 cmsg->cmsg_type = SCM_RIGHTS;
710 cmsg->cmsg_len = CMSG_LEN(sizeof(int));
711 memcpy(CMSG_DATA(cmsg), &fd, sizeof(int));
/* Shrink the control length to the part that was actually filled in. */
713 mh.msg_controllen = cmsg->cmsg_len;
715 /* Store away the fd in the socket, so that it stays open as
716 * long as we run the child */
717 k = sendmsg(kmsg_socket, &mh, MSG_DONTWAIT|MSG_NOSIGNAL);
718 close_nointr_nofail(fd);
721 log_error("Failed to send FIFO fd: %m");
725 /* And now make the FIFO unavailable as /dev/kmsg... */
/* Set the host name from the last path component of arg_directory.
 *
 * The name returned by path_get_file_name() is passed through
 * hostname_cleanup() and then to sethostname().
 * NOTE(review): whether hn is a private copy of the path component, and what
 * is returned when sethostname() fails, is not visible in this listing. */
730 static int setup_hostname(void) {
734 hn = path_get_file_name(arg_directory);
740 hostname_cleanup(hn);
743 if (sethostname(hn, strlen(hn)) < 0)
/* Connect the container's journal directory with the host according to
 * arg_link_journal.
 *
 * directory: container root directory
 *
 * The machine ID is read from <directory>/etc/machine-id and validated with
 * sd_id128_from_string(). Two paths are derived from it:
 *   p = /var/log/journal/<id>              on the host
 *   q = <directory>/var/log/journal/<id>   inside the container tree
 * If either path is already a mount point, an error is logged unless the
 * mode is LINK_AUTO. In LINK_GUEST mode p becomes a symlink to q. Otherwise
 * p is bind mounted onto q, after an error is logged if q is not empty. In
 * LINK_HOST mode p is created first if necessary.
 * NOTE(review): many conditions, the derivation of `id`, and all return
 * statements are absent from this listing; read the branch comments below
 * with that in mind. */
752 static int setup_journal(const char *directory) {
753 sd_id128_t machine_id;
754 char _cleanup_free_ *p = NULL, *b = NULL, *q = NULL, *d = NULL;
758 if (arg_link_journal == LINK_NO)
761 p = strappend(directory, "/etc/machine-id");
765 r = read_one_line_file(p, &b);
/* A missing machine-id file gets separate handling in LINK_AUTO mode; the
 * guarded statement is not shown. */
766 if (r == -ENOENT && arg_link_journal == LINK_AUTO)
769 log_error("Failed to read machine ID from %s: %s", p, strerror(-r));
774 if (isempty(id) && arg_link_journal == LINK_AUTO)
777 /* Verify validity */
778 r = sd_id128_from_string(id, &machine_id);
780 log_error("Failed to parse machine ID from %s: %s", p, strerror(-r));
/* From here on p is the host journal directory and q the one inside the
 * container tree. */
785 p = strappend("/var/log/journal/", id);
786 q = strjoin(directory, "/var/log/journal/", id, NULL);
790 if (path_is_mount_point(p, false) > 0) {
791 if (arg_link_journal != LINK_AUTO) {
792 log_error("%s: already a mount point, refusing to use for journal", p);
799 if (path_is_mount_point(q, false) > 0) {
800 if (arg_link_journal != LINK_AUTO) {
801 log_error("%s: already a mount point, refusing to use for journal", q);
/* Inspect what currently exists at p: a symlink (target stored in d),
 * something that is not a symlink (-EINVAL), or nothing (-ENOENT). */
808 r = readlink_and_make_absolute(p, &d);
810 if ((arg_link_journal == LINK_GUEST ||
811 arg_link_journal == LINK_AUTO) &&
814 r = mkdir_p(q, 0755);
816 log_warning("failed to create directory %s: %m", q);
821 log_error("Failed to remove symlink %s: %m", p);
824 } else if (r == -EINVAL) {
826 if (arg_link_journal == LINK_GUEST &&
829 if (errno == ENOTDIR) {
830 log_error("%s already exists and is neither a symlink nor a directory", p);
833 log_error("Failed to remove %s: %m", p);
837 } else if (r != -ENOENT) {
838 log_error("readlink(%s) failed: %m", p);
/* Guest mode: the host path becomes a symlink to the directory inside the
 * container tree. */
842 if (arg_link_journal == LINK_GUEST) {
844 if (symlink(q, p) < 0) {
845 log_error("Failed to symlink %s to %s: %m", q, p);
849 r = mkdir_p(q, 0755);
851 log_warning("failed to create directory %s: %m", q);
855 if (arg_link_journal == LINK_HOST) {
856 r = mkdir_p(p, 0755);
858 log_error("Failed to create %s: %m", p);
862 } else if (access(p, F_OK) < 0)
/* dir_is_empty() returning 0 is reported as "not empty" before the bind
 * mount. */
865 if (dir_is_empty(q) == 0) {
866 log_error("%s not empty.", q);
870 r = mkdir_p(q, 0755);
872 log_error("Failed to create %s: %m", q);
876 if (mount(p, q, "bind", MS_BIND, NULL) < 0) {
877 log_error("Failed to bind mount journal from host into guest: %m");
/* Drop every capability that is not in arg_retain from the bounding set.
 *
 * arg_retain has one bit per capability to keep, so its complement is the set
 * handed to capability_bounding_set_drop(). The result of that call is
 * returned unchanged. */
884 static int drop_capabilities(void) {
885 return capability_bounding_set_drop(~arg_retain, false);
/* Check whether `path` looks like the root of an OS installation.
 *
 * The test uses <path>/bin/sh as a flag file. Returns 1 when the check on
 * that file succeeded (r >= 0) and 0 when it failed.
 * NOTE(review): the call that sets r, and the value returned when asprintf()
 * fails, are not part of this listing. */
888 static int is_os_tree(const char *path) {
891 /* We use /bin/sh as flag file if something is an OS */
893 if (asprintf(&p, "%s/bin/sh", path) < 0)
899 return r < 0 ? 0 : 1;
/* Shuttle data between our stdin/stdout and the pty master of the container,
 * and react to signals, until the loop is left.
 *
 * master: pty master fd connected to the container's console
 * pid:    PID that receives SIGRTMIN+3 on the first SIGTERM when arg_boot is
 *         set
 * mask:   signal set to receive through a signalfd
 *
 * Data flow: stdin -> in_buffer -> master, and master -> out_buffer ->
 * stdout. All three fds are switched to non-blocking mode. They are
 * registered edge-triggered (EPOLLET), so readiness is remembered in the
 * *_readable / *_writable flags and each flag is cleared again only when the
 * corresponding read()/write() reports EAGAIN, EPIPE, ECONNRESET or EIO.
 * NOTE(review): the loop exit conditions, the handling of signals other than
 * SIGWINCH and the first SIGTERM, and the return statements are not part of
 * this listing. */
902 static int process_pty(int master, pid_t pid, sigset_t *mask) {
904 char in_buffer[LINE_MAX], out_buffer[LINE_MAX];
905 size_t in_buffer_full = 0, out_buffer_full = 0;
906 struct epoll_event stdin_ev, stdout_ev, master_ev, signal_ev;
907 bool stdin_readable = false, stdout_writable = false, master_readable = false, master_writable = false;
908 int ep = -1, signal_fd = -1, r;
909 bool tried_orderly_shutdown = false;
915 fd_nonblock(STDIN_FILENO, 1);
916 fd_nonblock(STDOUT_FILENO, 1);
917 fd_nonblock(master, 1);
919 signal_fd = signalfd(-1, mask, SFD_NONBLOCK|SFD_CLOEXEC);
921 log_error("signalfd(): %m");
926 ep = epoll_create1(EPOLL_CLOEXEC);
928 log_error("Failed to create epoll: %m");
933 /* We read from STDIN only if this is actually a TTY,
934 * otherwise we assume non-interactivity. */
935 if (isatty(STDIN_FILENO)) {
937 stdin_ev.events = EPOLLIN|EPOLLET;
938 stdin_ev.data.fd = STDIN_FILENO;
940 if (epoll_ctl(ep, EPOLL_CTL_ADD, STDIN_FILENO, &stdin_ev) < 0) {
941 log_error("Failed to register STDIN in epoll: %m");
948 stdout_ev.events = EPOLLOUT|EPOLLET;
949 stdout_ev.data.fd = STDOUT_FILENO;
952 master_ev.events = EPOLLIN|EPOLLOUT|EPOLLET;
953 master_ev.data.fd = master;
/* The signalfd is the only fd registered without EPOLLET. */
956 signal_ev.events = EPOLLIN;
957 signal_ev.data.fd = signal_fd;
959 if (epoll_ctl(ep, EPOLL_CTL_ADD, STDOUT_FILENO, &stdout_ev) < 0) {
960 if (errno != EPERM) {
961 log_error("Failed to register stdout in epoll: %m");
965 /* stdout without epoll support. Likely redirected to regular file. */
966 stdout_writable = true;
969 if (epoll_ctl(ep, EPOLL_CTL_ADD, master, &master_ev) < 0 ||
970 epoll_ctl(ep, EPOLL_CTL_ADD, signal_fd, &signal_ev) < 0) {
971 log_error("Failed to register fds in epoll: %m");
977 struct epoll_event ev[16];
981 nfds = epoll_wait(ep, ev, ELEMENTSOF(ev), -1);
984 if (errno == EINTR || errno == EAGAIN)
987 log_error("epoll_wait(): %m");
/* First pass: record which fds became ready and handle signals. */
994 for (i = 0; i < nfds; i++) {
995 if (ev[i].data.fd == STDIN_FILENO) {
997 if (ev[i].events & (EPOLLIN|EPOLLHUP))
998 stdin_readable = true;
1000 } else if (ev[i].data.fd == STDOUT_FILENO) {
1002 if (ev[i].events & (EPOLLOUT|EPOLLHUP))
1003 stdout_writable = true;
1005 } else if (ev[i].data.fd == master) {
1007 if (ev[i].events & (EPOLLIN|EPOLLHUP))
1008 master_readable = true;
1010 if (ev[i].events & (EPOLLOUT|EPOLLHUP))
1011 master_writable = true;
1013 } else if (ev[i].data.fd == signal_fd) {
1014 struct signalfd_siginfo sfsi;
1017 n = read(signal_fd, &sfsi, sizeof(sfsi));
1018 if (n != sizeof(sfsi)) {
1021 log_error("Failed to read from signalfd: invalid block size");
1026 if (errno != EINTR && errno != EAGAIN) {
1027 log_error("Failed to read from signalfd: %m");
1033 if (sfsi.ssi_signo == SIGWINCH) {
1036 /* The window size changed, let's forward that. */
1037 if (ioctl(STDIN_FILENO, TIOCGWINSZ, &ws) >= 0)
1038 ioctl(master, TIOCSWINSZ, &ws);
1039 } else if (sfsi.ssi_signo == SIGTERM && arg_boot && !tried_orderly_shutdown) {
1041 log_info("Trying to halt container. Send SIGTERM again to trigger immediate termination.");
1043 /* This only works for systemd... */
1044 tried_orderly_shutdown = true;
1045 kill(pid, SIGRTMIN+3);
/* Second pass: keep moving data for as long as some transfer can make
 * progress, i.e. a source is readable while its buffer is empty, or a sink
 * is writable while its buffer holds data. */
1055 while ((stdin_readable && in_buffer_full <= 0) ||
1056 (master_writable && in_buffer_full > 0) ||
1057 (master_readable && out_buffer_full <= 0) ||
1058 (stdout_writable && out_buffer_full > 0)) {
1060 if (stdin_readable && in_buffer_full < LINE_MAX) {
1062 k = read(STDIN_FILENO, in_buffer + in_buffer_full, LINE_MAX - in_buffer_full);
1065 if (errno == EAGAIN || errno == EPIPE || errno == ECONNRESET || errno == EIO)
1066 stdin_readable = false;
1068 log_error("read(): %m");
1073 in_buffer_full += (size_t) k;
1076 if (master_writable && in_buffer_full > 0) {
1078 k = write(master, in_buffer, in_buffer_full);
1081 if (errno == EAGAIN || errno == EPIPE || errno == ECONNRESET || errno == EIO)
1082 master_writable = false;
1084 log_error("write(): %m");
/* After a partial write, move the unwritten tail to the front of the buffer. */
1090 assert(in_buffer_full >= (size_t) k);
1091 memmove(in_buffer, in_buffer + k, in_buffer_full - k);
1092 in_buffer_full -= k;
1096 if (master_readable && out_buffer_full < LINE_MAX) {
1098 k = read(master, out_buffer + out_buffer_full, LINE_MAX - out_buffer_full);
1101 if (errno == EAGAIN || errno == EPIPE || errno == ECONNRESET || errno == EIO)
1102 master_readable = false;
1104 log_error("read(): %m");
1109 out_buffer_full += (size_t) k;
1112 if (stdout_writable && out_buffer_full > 0) {
1114 k = write(STDOUT_FILENO, out_buffer, out_buffer_full);
1117 if (errno == EAGAIN || errno == EPIPE || errno == ECONNRESET || errno == EIO)
1118 stdout_writable = false;
1120 log_error("write(): %m");
1126 assert(out_buffer_full >= (size_t) k);
1127 memmove(out_buffer, out_buffer + k, out_buffer_full - k);
1128 out_buffer_full -= k;
/* Cleanup: release the epoll instance and the signalfd. */
1136 close_nointr_nofail(ep);
1139 close_nointr_nofail(signal_fd);
1144 int main(int argc, char *argv[]) {
1146 int r = EXIT_FAILURE, k;
1147 char *oldcg = NULL, *newcg = NULL;
1148 char **controller = NULL;
1149 int master = -1, n_fd_passed;
1150 const char *console = NULL;
1151 struct termios saved_attr, raw_attr;
1153 bool saved_attr_valid = false;
1155 int kmsg_socket_pair[2] = { -1, -1 };
1158 log_parse_environment();
1161 r = parse_argv(argc, argv);
1165 if (arg_directory) {
1168 p = path_make_absolute_cwd(arg_directory);
1169 free(arg_directory);
1172 arg_directory = get_current_dir_name();
1174 if (!arg_directory) {
1175 log_error("Failed to determine path");
1179 path_kill_slashes(arg_directory);
1181 if (geteuid() != 0) {
1182 log_error("Need to be root.");
1186 if (sd_booted() <= 0) {
1187 log_error("Not running on a systemd system.");
1191 if (path_equal(arg_directory, "/")) {
1192 log_error("Spawning container on root directory not supported.");
1196 if (is_os_tree(arg_directory) <= 0) {
1197 log_error("Directory %s doesn't look like an OS root directory. Refusing.", arg_directory);
1202 n_fd_passed = sd_listen_fds(false);
1203 if (n_fd_passed > 0) {
1204 k = fdset_new_listen_fds(&fds, false);
1206 log_error("Failed to collect file descriptors: %s", strerror(-k));
1210 fdset_close_others(fds);
1213 k = cg_get_by_pid(SYSTEMD_CGROUP_CONTROLLER, 0, &oldcg);
1215 log_error("Failed to determine current cgroup: %s", strerror(-k));
1219 if (asprintf(&newcg, "%s/nspawn-%lu", oldcg, (unsigned long) getpid()) < 0) {
1220 log_error("Failed to allocate cgroup path.");
1224 k = cg_create_and_attach(SYSTEMD_CGROUP_CONTROLLER, newcg, 0);
1226 log_error("Failed to create cgroup: %s", strerror(-k));
1230 STRV_FOREACH(controller, arg_controllers) {
1231 k = cg_create_and_attach(*controller, newcg, 0);
1233 log_warning("Failed to create cgroup in controller %s: %s", *controller, strerror(-k));
1236 master = posix_openpt(O_RDWR|O_NOCTTY|O_CLOEXEC|O_NDELAY);
1238 log_error("Failed to acquire pseudo tty: %m");
1242 console = ptsname(master);
1244 log_error("Failed to determine tty name: %m");
1248 log_info("Spawning namespace container on %s (console is %s).", arg_directory, console);
1250 if (ioctl(STDIN_FILENO, TIOCGWINSZ, &ws) >= 0)
1251 ioctl(master, TIOCSWINSZ, &ws);
1253 if (unlockpt(master) < 0) {
1254 log_error("Failed to unlock tty: %m");
1258 if (tcgetattr(STDIN_FILENO, &saved_attr) >= 0) {
1259 saved_attr_valid = true;
1261 raw_attr = saved_attr;
1262 cfmakeraw(&raw_attr);
1263 raw_attr.c_lflag &= ~ECHO;
1266 if (socketpair(AF_UNIX, SOCK_DGRAM|SOCK_NONBLOCK|SOCK_CLOEXEC, 0, kmsg_socket_pair) < 0) {
1267 log_error("Failed to create kmsg socket pair");
1271 assert_se(sigemptyset(&mask) == 0);
1272 sigset_add_many(&mask, SIGCHLD, SIGWINCH, SIGTERM, SIGINT, -1);
1273 assert_se(sigprocmask(SIG_BLOCK, &mask, NULL) == 0);
1279 if(pipe2(pipefd, O_NONBLOCK|O_CLOEXEC) < 0) {
1280 log_error("pipe2(): %m");
1284 pid = syscall(__NR_clone, SIGCHLD|CLONE_NEWIPC|CLONE_NEWNS|CLONE_NEWPID|CLONE_NEWUTS|(arg_private_network ? CLONE_NEWNET : 0), NULL);
1286 if (errno == EINVAL)
1287 log_error("clone() failed, do you have namespace support enabled in your kernel? (You need UTS, IPC, PID and NET namespacing built in): %m");
1289 log_error("clone() failed: %m");
1296 const char *home = NULL;
1297 uid_t uid = (uid_t) -1;
1298 gid_t gid = (gid_t) -1;
1300 const char *envp[] = {
1301 "PATH=/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin",
1302 "container=systemd-nspawn", /* LXC sets container=lxc, so follow the scheme here */
1307 NULL, /* container_uuid */
1308 NULL, /* LISTEN_FDS */
1309 NULL, /* LISTEN_PID */
1313 envp[n_env] = strv_find_prefix(environ, "TERM=");
1317 close_nointr_nofail(pipefd[1]);
1318 fd_wait_for_event(pipefd[0], POLLHUP, -1);
1319 close_nointr_nofail(pipefd[0]);
1321 close_nointr_nofail(master);
1324 if (saved_attr_valid) {
1325 if (tcsetattr(STDIN_FILENO, TCSANOW, &raw_attr) < 0) {
1326 log_error("Failed to set terminal attributes: %m");
1331 close_nointr(STDIN_FILENO);
1332 close_nointr(STDOUT_FILENO);
1333 close_nointr(STDERR_FILENO);
1335 close_nointr_nofail(kmsg_socket_pair[0]);
1336 kmsg_socket_pair[0] = -1;
1338 reset_all_signal_handlers();
1340 assert_se(sigemptyset(&mask) == 0);
1341 assert_se(sigprocmask(SIG_SETMASK, &mask, NULL) == 0);
1343 k = open_terminal(console, O_RDWR);
1344 if (k != STDIN_FILENO) {
1346 close_nointr_nofail(k);
1350 log_error("Failed to open console: %s", strerror(-k));
1354 if (dup2(STDIN_FILENO, STDOUT_FILENO) != STDOUT_FILENO ||
1355 dup2(STDIN_FILENO, STDERR_FILENO) != STDERR_FILENO) {
1356 log_error("Failed to duplicate console: %m");
1361 log_error("setsid() failed: %m");
1365 if (prctl(PR_SET_PDEATHSIG, SIGKILL) < 0) {
1366 log_error("PR_SET_PDEATHSIG failed: %m");
1370 /* Mark everything as slave, so that we still
1371 * receive mounts from the real root, but don't
1372 * propagate mounts to the real root. */
1373 if (mount(NULL, "/", NULL, MS_SLAVE|MS_REC, NULL) < 0) {
1374 log_error("MS_SLAVE|MS_REC failed: %m");
1378 /* Turn directory into bind mount */
1379 if (mount(arg_directory, arg_directory, "bind", MS_BIND|MS_REC, NULL) < 0) {
1380 log_error("Failed to make bind mount.");
1385 if (mount(arg_directory, arg_directory, "bind", MS_BIND|MS_REMOUNT|MS_RDONLY|MS_REC, NULL) < 0) {
1386 log_error("Failed to make read-only.");
1390 if (mount_all(arg_directory) < 0)
1393 if (copy_devnodes(arg_directory) < 0)
1396 dev_setup(arg_directory);
1398 if (setup_dev_console(arg_directory, console) < 0)
1401 if (setup_kmsg(arg_directory, kmsg_socket_pair[1]) < 0)
1404 close_nointr_nofail(kmsg_socket_pair[1]);
1405 kmsg_socket_pair[1] = -1;
1407 if (setup_boot_id(arg_directory) < 0)
1410 if (setup_timezone(arg_directory) < 0)
1413 if (setup_resolv_conf(arg_directory) < 0)
1416 if (setup_journal(arg_directory) < 0)
1419 if (mount_binds(arg_directory, arg_bind, 0) < 0)
1422 if (mount_binds(arg_directory, arg_bind_ro, MS_RDONLY) < 0)
1425 if (chdir(arg_directory) < 0) {
1426 log_error("chdir(%s) failed: %m", arg_directory);
1430 if (mount(arg_directory, "/", NULL, MS_MOVE, NULL) < 0) {
1431 log_error("mount(MS_MOVE) failed: %m");
1435 if (chroot(".") < 0) {
1436 log_error("chroot() failed: %m");
1440 if (chdir("/") < 0) {
1441 log_error("chdir() failed: %m");
1449 if (drop_capabilities() < 0) {
1450 log_error("drop_capabilities() failed: %m");
1456 /* Note that this resolves user names
1457 * inside the container, and hence
1458 * accesses the NSS modules from the
1459 * container and not the host. This is
1462 if (get_user_creds((const char**)&arg_user, &uid, &gid, &home, NULL) < 0) {
1463 log_error("get_user_creds() failed: %m");
1467 if (mkdir_parents_label(home, 0775) < 0) {
1468 log_error("mkdir_parents_label() failed: %m");
1472 if (mkdir_safe_label(home, 0775, uid, gid) < 0) {
1473 log_error("mkdir_safe_label() failed: %m");
1477 if (initgroups((const char*)arg_user, gid) < 0) {
1478 log_error("initgroups() failed: %m");
1482 if (setresgid(gid, gid, gid) < 0) {
1483 log_error("setregid() failed: %m");
1487 if (setresuid(uid, uid, uid) < 0) {
1488 log_error("setreuid() failed: %m");
1492 /* Reset everything fully to 0, just in case */
1494 if (setgroups(0, NULL) < 0) {
1495 log_error("setgroups() failed: %m");
1499 if (setresgid(0, 0, 0) < 0) {
1500 log_error("setregid() failed: %m");
1504 if (setresuid(0, 0, 0) < 0) {
1505 log_error("setreuid() failed: %m");
1510 if ((asprintf((char**)(envp + n_env++), "HOME=%s", home ? home: "/root") < 0) ||
1511 (asprintf((char**)(envp + n_env++), "USER=%s", arg_user ? arg_user : "root") < 0) ||
1512 (asprintf((char**)(envp + n_env++), "LOGNAME=%s", arg_user ? arg_user : "root") < 0)) {
1518 if (asprintf((char**)(envp + n_env++), "container_uuid=%s", arg_uuid) < 0) {
1524 if (fdset_size(fds) > 0) {
1525 k = fdset_cloexec(fds, false);
1527 log_error("Failed to unset O_CLOEXEC for file descriptors.");
1531 if ((asprintf((char **)(envp + n_env++), "LISTEN_FDS=%u", n_fd_passed) < 0) ||
1532 (asprintf((char **)(envp + n_env++), "LISTEN_PID=%lu", (unsigned long) getpid()) < 0)) {
1544 /* Automatically search for the init system */
1546 l = 1 + argc - optind;
1547 a = newa(char*, l + 1);
1548 memcpy(a + 1, argv + optind, l * sizeof(char*));
1550 a[0] = (char*) "/usr/lib/systemd/systemd";
1551 execve(a[0], a, (char**) envp);
1553 a[0] = (char*) "/lib/systemd/systemd";
1554 execve(a[0], a, (char**) envp);
1556 a[0] = (char*) "/sbin/init";
1557 execve(a[0], a, (char**) envp);
1558 } else if (argc > optind)
1559 execvpe(argv[optind], argv + optind, (char**) envp);
1561 chdir(home ? home : "/root");
1562 execle("/bin/bash", "-bash", NULL, (char**) envp);
1565 log_error("execv() failed: %m");
1568 _exit(EXIT_FAILURE);
1571 log_info("Init process in the container running as PID %d", pid);
1572 close_nointr_nofail(pipefd[0]);
1573 close_nointr_nofail(pipefd[1]);
1578 if (process_pty(master, pid, &mask) < 0)
1581 if (saved_attr_valid)
1582 tcsetattr(STDIN_FILENO, TCSANOW, &saved_attr);
1584 r = wait_for_terminate(pid, &status);
1590 if (status.si_code == CLD_EXITED) {
1591 if (status.si_status != 0) {
1592 log_error("Container failed with error code %i.", status.si_status);
1593 r = status.si_status;
1597 log_debug("Container exited successfully.");
1599 } else if (status.si_code == CLD_KILLED &&
1600 status.si_status == SIGINT) {
1601 log_info("Container has been shut down.");
1604 } else if (status.si_code == CLD_KILLED &&
1605 status.si_status == SIGHUP) {
1606 log_info("Container is being rebooted.");
1608 } else if (status.si_code == CLD_KILLED ||
1609 status.si_code == CLD_DUMPED) {
1611 log_error("Container terminated by signal %s.", signal_to_string(status.si_status));
1615 log_error("Container failed due to unknown reason.");
1622 if (saved_attr_valid)
1623 tcsetattr(STDIN_FILENO, TCSANOW, &saved_attr);
1626 close_nointr_nofail(master);
1628 close_pipe(kmsg_socket_pair);
1631 cg_attach(SYSTEMD_CGROUP_CONTROLLER, oldcg, 0);
1634 cg_kill_recursive_and_wait(SYSTEMD_CGROUP_CONTROLLER, newcg, true);
1636 free(arg_directory);
1637 strv_free(arg_controllers);