1 /*-*- Mode: C; c-basic-offset: 8; indent-tabs-mode: nil -*-*/
4 This file is part of systemd.
6 Copyright 2010 Lennart Poettering
8 systemd is free software; you can redistribute it and/or modify it
9 under the terms of the GNU Lesser General Public License as published by
10 the Free Software Foundation; either version 2.1 of the License, or
11 (at your option) any later version.
13 systemd is distributed in the hope that it will be useful, but
14 WITHOUT ANY WARRANTY; without even the implied warranty of
15 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
16 Lesser General Public License for more details.
18 You should have received a copy of the GNU Lesser General Public License
19 along with systemd; If not, see <http://www.gnu.org/licenses/>.
25 #include <sys/types.h>
26 #include <sys/syscall.h>
27 #include <sys/mount.h>
33 #include <sys/prctl.h>
34 #include <sys/capability.h>
37 #include <sys/epoll.h>
39 #include <sys/signalfd.h>
43 #include <sys/socket.h>
45 #include <systemd/sd-daemon.h>
53 #include "cgroup-util.h"
55 #include "path-util.h"
56 #include "loopback-setup.h"
58 #include "dev-setup.h"
/* Journal linking mode selected on the command line. The values used in
 * this file are LINK_NO, LINK_AUTO, LINK_GUEST and LINK_HOST. */
63 typedef enum LinkJournal {
/* Command line settings. parse_argv() visibly assigns arg_directory,
 * arg_user, arg_controllers, arg_private_network, arg_read_only,
 * arg_link_journal, arg_retain and the two bind lists; the initializers
 * below are the defaults used when an option is not given. */
70 static char *arg_directory = NULL;
71 static char *arg_user = NULL;
72 static char **arg_controllers = NULL;
73 static char *arg_uuid = NULL;
74 static bool arg_private_network = false;
75 static bool arg_read_only = false;
76 static bool arg_boot = false;
77 static LinkJournal arg_link_journal = LINK_AUTO;
/* Bit mask with one bit per CAP_* number. drop_capabilities() hands the
 * complement of this mask to capability_bounding_set_drop(), and the
 * --capability= option ORs additional bits into it. */
78 static uint64_t arg_retain =
80 (1ULL << CAP_DAC_OVERRIDE) |
81 (1ULL << CAP_DAC_READ_SEARCH) |
82 (1ULL << CAP_FOWNER) |
83 (1ULL << CAP_FSETID) |
84 (1ULL << CAP_IPC_OWNER) |
87 (1ULL << CAP_LINUX_IMMUTABLE) |
88 (1ULL << CAP_NET_BIND_SERVICE) |
89 (1ULL << CAP_NET_BROADCAST) |
90 (1ULL << CAP_NET_RAW) |
91 (1ULL << CAP_SETGID) |
92 (1ULL << CAP_SETFCAP) |
93 (1ULL << CAP_SETPCAP) |
94 (1ULL << CAP_SETUID) |
95 (1ULL << CAP_SYS_ADMIN) |
96 (1ULL << CAP_SYS_CHROOT) |
97 (1ULL << CAP_SYS_NICE) |
98 (1ULL << CAP_SYS_PTRACE) |
99 (1ULL << CAP_SYS_TTY_CONFIG) |
100 (1ULL << CAP_SYS_RESOURCE) |
101 (1ULL << CAP_SYS_BOOT) |
102 (1ULL << CAP_AUDIT_WRITE) |
103 (1ULL << CAP_AUDIT_CONTROL);
/* String vectors filled by --bind= and --bind-ro= (two strv_extend() calls
 * per option, source then destination) and walked pairwise by
 * mount_binds(). */
104 static char **arg_bind = NULL;
105 static char **arg_bind_ro = NULL;
/* Prints the usage summary to stdout; the leading %s is filled with
 * program_invocation_short_name.
 *
 * NOTE(review): the -j line claims equivalence with --link-journal=host,
 * but parse_argv() contains an unguarded arg_link_journal = LINK_GUEST
 * assignment right before its --link-journal case, presumably the -j
 * handler. If so this line should say guest - confirm and fix. */
107 static int help(void) {
109 printf("%s [OPTIONS...] [PATH] [ARGUMENTS...]\n\n"
110 "Spawn a minimal namespace container for debugging, testing and building.\n\n"
111 " -h --help Show this help\n"
112 " --version Print version string\n"
113 " -D --directory=NAME Root directory for the container\n"
114 " -b --boot Boot up full system (i.e. invoke init)\n"
115 " -u --user=USER Run the command under specified user or uid\n"
116 " -C --controllers=LIST Put the container in specified comma-separated\n"
117 " cgroup hierarchies\n"
118 " --uuid=UUID Set a specific machine UUID for the container\n"
119 " --private-network Disable network in container\n"
120 " --read-only Mount the root directory read-only\n"
121 " --capability=CAP In addition to the default, retain specified\n"
123 " --link-journal=MODE Link up guest journal, one of no, auto, guest, host\n"
124 " -j Equivalent to --link-journal=host\n"
125 " --bind=PATH[:PATH] Bind mount a file or directory from the host into\n"
127 " --bind-ro=PATH[:PATH] Similar, but creates a read-only bind mount\n",
128 program_invocation_short_name);
/* Parses the command line with getopt_long() and stores the results in the
 * arg_* globals. Short options are h, D, u, C, b and j; the remaining
 * options are long-only and use the ARG_* codes from the table below.
 * Problems are reported with log_error(). The return value convention is
 * not visible in this block (main() stores the result in r), so treat
 * negative-means-error as an assumption - TODO confirm. */
133 static int parse_argv(int argc, char *argv[]) {
146 static const struct option options[] = {
147 { "help", no_argument, NULL, 'h' },
148 { "version", no_argument, NULL, ARG_VERSION },
149 { "directory", required_argument, NULL, 'D' },
150 { "user", required_argument, NULL, 'u' },
151 { "controllers", required_argument, NULL, 'C' },
152 { "private-network", no_argument, NULL, ARG_PRIVATE_NETWORK },
153 { "boot", no_argument, NULL, 'b' },
154 { "uuid", required_argument, NULL, ARG_UUID },
155 { "read-only", no_argument, NULL, ARG_READ_ONLY },
156 { "capability", required_argument, NULL, ARG_CAPABILITY },
157 { "link-journal", required_argument, NULL, ARG_LINK_JOURNAL },
158 { "bind", required_argument, NULL, ARG_BIND },
159 { "bind-ro", required_argument, NULL, ARG_BIND_RO },
/* NOTE(review): the option string starts with a plus sign; with glibc
 * getopt this is expected to stop option parsing at the first non-option
 * argument, so that arguments of the container command are left alone -
 * confirm against the getopt_long documentation. */
168 while ((c = getopt_long(argc, argv, "+hD:u:C:bj", options, NULL)) >= 0) {
177 puts(PACKAGE_STRING);
178 puts(SYSTEMD_FEATURES);
/* Root directory: keep the canonicalized form of the argument. */
183 arg_directory = canonicalize_file_name(optarg);
184 if (!arg_directory) {
185 log_error("Failed to canonicalize root directory.");
193 if (!(arg_user = strdup(optarg))) {
194 log_error("Failed to duplicate user name.");
/* Controllers: a later option replaces an earlier list. The argument is
 * split on commas and strv_uniq() is applied to the result. */
201 strv_free(arg_controllers);
202 arg_controllers = strv_split(optarg, ",");
203 if (!arg_controllers) {
204 log_error("Failed to split controllers list.");
207 strv_uniq(arg_controllers);
211 case ARG_PRIVATE_NETWORK:
212 arg_private_network = true;
224 arg_read_only = true;
/* Each comma separated word is copied, looked up with cap_from_name() and
 * its bit is ORed into arg_retain. */
227 case ARG_CAPABILITY: {
231 FOREACH_WORD_SEPARATOR(word, length, optarg, ",", state) {
235 t = strndup(word, length);
239 if (cap_from_name(t, &cap) < 0) {
240 log_error("Failed to parse capability %s.", t);
246 arg_retain |= 1ULL << (uint64_t) cap;
/* NOTE(review): unguarded LINK_GUEST assignment, presumably the handler of
 * the j short option. The usage text in help() describes -j as equivalent
 * to --link-journal=host; one of the two looks wrong - confirm. */
253 arg_link_journal = LINK_GUEST;
256 case ARG_LINK_JOURNAL:
257 if (streq(optarg, "auto"))
258 arg_link_journal = LINK_AUTO;
259 else if (streq(optarg, "no"))
260 arg_link_journal = LINK_NO;
261 else if (streq(optarg, "guest"))
262 arg_link_journal = LINK_GUEST;
263 else if (streq(optarg, "host"))
264 arg_link_journal = LINK_HOST;
266 log_error("Failed to parse link journal mode %s", optarg);
/* Bind options: x selects arg_bind or arg_bind_ro from the option code.
 * The text before the first colon is copied into a. Both a and b have to
 * be absolute paths; they are then appended to the list as a pair, a
 * first and b second. */
274 _cleanup_free_ char *a = NULL, *b = NULL;
279 x = c == ARG_BIND ? &arg_bind : &arg_bind_ro;
281 e = strchr(optarg, ':');
283 a = strndup(optarg, e - optarg);
293 if (!path_is_absolute(a) || !path_is_absolute(b)) {
294 log_error("Invalid bind mount specification: %s", optarg);
298 r = strv_extend(x, a);
302 r = strv_extend(x, b);
313 log_error("Unknown option code %c", c);
/* Mounts the entries of mount_table below dest. For every entry the target
 * path is dest, a slash and the where field. path_is_mount_point() is
 * asked about the target first; an entry that has a what field and whose
 * target already is a mount point is skipped. A failing mount() is only
 * logged as an error when the fatal flag of the entry is set. */
321 static int mount_all(const char *dest) {
323 typedef struct MountPoint {
/* Several targets appear twice on purpose: the first entry bind mounts,
 * the second (what is NULL) remounts the same target read-only. */
332 static const MountPoint mount_table[] = {
333 { "proc", "/proc", "proc", NULL, MS_NOSUID|MS_NOEXEC|MS_NODEV, true },
334 { "/proc/sys", "/proc/sys", NULL, NULL, MS_BIND, true }, /* Bind mount first */
335 { NULL, "/proc/sys", NULL, NULL, MS_BIND|MS_RDONLY|MS_REMOUNT, true }, /* Then, make it r/o */
336 { "sysfs", "/sys", "sysfs", NULL, MS_RDONLY|MS_NOSUID|MS_NOEXEC|MS_NODEV, true },
337 { "tmpfs", "/dev", "tmpfs", "mode=755", MS_NOSUID|MS_STRICTATIME, true },
338 { "/dev/pts", "/dev/pts", NULL, NULL, MS_BIND, true },
339 { "tmpfs", "/dev/shm", "tmpfs", "mode=1777", MS_NOSUID|MS_NODEV|MS_STRICTATIME, true },
340 { "tmpfs", "/run", "tmpfs", "mode=755", MS_NOSUID|MS_NODEV|MS_STRICTATIME, true },
342 { "/sys/fs/selinux", "/sys/fs/selinux", NULL, NULL, MS_BIND, false }, /* Bind mount first */
343 { NULL, "/sys/fs/selinux", NULL, NULL, MS_BIND|MS_RDONLY|MS_REMOUNT, false }, /* Then, make it r/o */
350 for (k = 0; k < ELEMENTSOF(mount_table); k++) {
351 char _cleanup_free_ *where = NULL;
354 where = strjoin(dest, "/", mount_table[k].where, NULL);
358 t = path_is_mount_point(where, true);
360 log_error("Failed to detect whether %s is a mount point: %s", where, strerror(-t));
368 /* Skip this entry if it is not a remount. */
369 if (mount_table[k].what && t > 0)
/* The result of mkdir_p() is not checked here. */
372 mkdir_p(where, 0755);
374 if (mount(mount_table[k].what,
377 mount_table[k].flags,
378 mount_table[k].options) < 0 &&
379 mount_table[k].fatal) {
381 log_error("mount(%s) failed: %m", where);
/* Applies the bind mounts requested on the command line. l is walked as
 * pairs (x, y): y is joined onto dest to form the target, the target is
 * created with mkdir_p_label() (result not checked) and x is bind mounted
 * on it. When flags is non-zero the bind mount is remounted with those
 * extra flags; main() passes MS_RDONLY for the read-only list and 0 for
 * the other one. Both mount() failures are logged. */
391 static int mount_binds(const char *dest, char **l, unsigned long flags) {
394 STRV_FOREACH_PAIR(x, y, l) {
395 _cleanup_free_ char *where = NULL;
397 where = strjoin(dest, "/", *y, NULL);
401 mkdir_p_label(where, 0755);
403 if (mount(*x, where, "bind", MS_BIND, NULL) < 0) {
404 log_error("mount(%s) failed: %m", where);
408 if (flags && mount(NULL, where, NULL, MS_REMOUNT|MS_BIND|flags, NULL) < 0) {
409 log_error("mount(%s) failed: %m", where);
/* Tries to point etc/localtime inside dest at the same zone the host uses.
 *
 * The host link /etc/localtime is read into p and the zone name z is what
 * follows the zoneinfo prefix (relative or absolute form). The link inside
 * the container is read into q and compared the same way; if it already
 * names the same zone nothing is done. Otherwise the zone file has to
 * exist below dest before a relative symlink to it is created at where.
 * Situations where the update is not possible are reported with
 * log_warning(); only the failing symlink() is logged as an error. */
417 static int setup_timezone(const char *dest) {
418 _cleanup_free_ char *where = NULL, *p = NULL, *q = NULL, *check = NULL, *what = NULL;
424 /* Fix the timezone, if possible */
425 r = readlink_malloc("/etc/localtime", &p);
427 log_warning("/etc/localtime is not a symlink, not updating container timezone.");
431 z = path_startswith(p, "../usr/share/zoneinfo/");
433 z = path_startswith(p, "/usr/share/zoneinfo/");
435 log_warning("/etc/localtime does not point into /usr/share/zoneinfo/, not updating container timezone.");
439 where = strappend(dest, "/etc/localtime");
443 r = readlink_malloc(where, &q);
445 y = path_startswith(q, "../usr/share/zoneinfo/");
447 y = path_startswith(q, "/usr/share/zoneinfo/");
450 /* Already pointing to the right place? Then do nothing .. */
451 if (y && streq(y, z))
455 check = strjoin(dest, "/usr/share/zoneinfo/", z, NULL);
459 if (access(check, F_OK) < 0) {
460 log_warning("Timezone %s does not exist in container, not updating container timezone.", z);
464 what = strappend("../usr/share/zoneinfo/", z);
469 if (symlink(what, where) < 0) {
470 log_error("Failed to correct timezone of container: %m");
/* Makes the host resolver configuration visible in the container by bind
 * mounting /etc/resolv.conf over the same path below dest. The
 * arg_private_network flag is tested before anything else is done.
 * The mount is best effort: only when the bind mount succeeds is a
 * read-only remount attempted, and the result of that remount is not
 * checked. */
477 static int setup_resolv_conf(const char *dest) {
482 if (arg_private_network)
485 /* Fix resolv.conf, if possible */
486 where = strappend(dest, "/etc/resolv.conf");
490 /* We don't really care for the results of this really. If it
491 * fails, it fails, but meh... */
492 if (mount("/etc/resolv.conf", where, "bind", MS_BIND, NULL) >= 0)
493 mount("/etc/resolv.conf", where, "bind", MS_BIND|MS_REMOUNT|MS_RDONLY, NULL);
/* Gives the container its own boot id. A random 128 bit id is generated,
 * formatted in the dashed UUID layout, written to a helper file below
 * dest/dev and that file is bind mounted over the boot_id file below
 * dest/proc. The final read-only remount is not checked for errors. */
500 static int setup_boot_id(const char *dest) {
501 char _cleanup_free_ *from = NULL, *to = NULL;
508 /* Generate a new randomized boot ID, so that each boot-up of
509 * the container gets a new one */
511 from = strappend(dest, "/dev/proc-sys-kernel-random-boot-id");
512 to = strappend(dest, "/proc/sys/kernel/random/boot_id");
516 r = sd_id128_randomize(&rnd);
518 log_error("Failed to generate random boot id: %s", strerror(-r));
/* 16 bytes printed as hex in groups of 4-2-2-2-6 bytes. */
522 snprintf(as_uuid, sizeof(as_uuid),
523 "%02x%02x%02x%02x-%02x%02x-%02x%02x-%02x%02x-%02x%02x%02x%02x%02x%02x",
524 SD_ID128_FORMAT_VAL(rnd));
525 char_array_0(as_uuid);
527 r = write_one_line_file(from, as_uuid);
529 log_error("Failed to write boot id: %s", strerror(-r));
533 if (mount(from, to, "bind", MS_BIND, NULL) < 0) {
534 log_error("Failed to bind mount boot id: %m");
537 mount(from, to, "bind", MS_BIND|MS_REMOUNT|MS_RDONLY, NULL);
/* Recreates a fixed set of host device nodes below dest/dev. For every
 * name d in the NUL separated devnodes list the host node /dev/d is
 * examined with stat(). A stat() failure is only logged when errno is not
 * ENOENT. A node that is neither a character nor a block device is
 * logged as an error. Otherwise mknod() creates the node at the target
 * with the same mode and device number. */
543 static int copy_devnodes(const char *dest) {
545 static const char devnodes[] =
556 mode_t _cleanup_umask_ u;
562 NULSTR_FOREACH(d, devnodes) {
564 char _cleanup_free_ *from = NULL, *to = NULL;
/* NOTE(review): the asprintf() return values are not checked on these two
 * lines; any validation of from and to is not visible here - confirm. */
566 asprintf(&from, "/dev/%s", d);
567 asprintf(&to, "%s/dev/%s", dest, d);
578 if (stat(from, &st) < 0) {
580 if (errno != ENOENT) {
581 log_error("Failed to stat %s: %m", from);
586 } else if (!S_ISCHR(st.st_mode) && !S_ISBLK(st.st_mode)) {
588 log_error("%s is not a char or block device, cannot copy", from);
592 } else if (mknod(to, st.st_mode, st.st_rdev) < 0) {
/* NOTE(review): the message prints dest although the path handed to
 * mknod() is to; this looks like a mistake in the log argument - confirm
 * and fix. */
594 log_error("mknod(%s) failed: %m", dest);
/* Provides dev/console inside dest, backed by the tty named by console.
 *
 * Steps visible below: console is examined with stat() and has to be a
 * character device; its access mode and owner are set with
 * chmod_and_chown(console, 0600, 0, 0); a node is created at the target
 * path with the file type bits of the tty, permission bits 0600 and the
 * same device number; finally console is bind mounted over that node.
 * Every failing step is logged with log_error(). */
603 static int setup_dev_console(const char *dest, const char *console) {
605 char _cleanup_free_ *to = NULL;
607 mode_t _cleanup_umask_ u;
614 if (stat(console, &st) < 0) {
615 log_error("Failed to stat %s: %m", console);
618 } else if (!S_ISCHR(st.st_mode)) {
619 log_error("/dev/console is not a char device");
623 r = chmod_and_chown(console, 0600, 0, 0);
625 log_error("Failed to correct access mode for TTY: %s", strerror(-r));
629 if (asprintf(&to, "%s/dev/console", dest) < 0)
632 /* We need to bind mount the right tty to /dev/console since
633 * ptys can only exist on pts file systems. To have something
634 * to bind mount things on we create a device node first, that
635 * has the right major/minor (note that the major minor
636 * doesn't actually matter here, since we mount it over
639 if (mknod(to, (st.st_mode & ~07777) | 0600, st.st_rdev) < 0) {
640 log_error("mknod() for /dev/console failed: %m");
644 if (mount(console, to, "bind", MS_BIND, NULL) < 0) {
645 log_error("Bind mount for /dev/console failed: %m");
652 static int setup_kmsg(const char *dest, int kmsg_socket) {
653 char _cleanup_free_ *from = NULL, *to = NULL;
655 mode_t _cleanup_umask_ u;
657 struct cmsghdr cmsghdr;
658 uint8_t buf[CMSG_SPACE(sizeof(int))];
661 struct cmsghdr *cmsg;
664 assert(kmsg_socket >= 0);
668 /* We create the kmsg FIFO as /dev/kmsg, but immediately
669 * delete it after bind mounting it to /proc/kmsg. While FIFOs
670 * on the reading side behave very similar to /proc/kmsg,
671 * their writing side behaves differently from /dev/kmsg in
672 * that writing blocks when nothing is reading. In order to
673 * avoid any problems with containers deadlocking due to this
674 * we simply make /dev/kmsg unavailable to the container. */
/* Summary of the steps below: a FIFO is created at dev/kmsg under dest
 * with mode 0600 and fixed up with chmod_and_chown(); it is bind mounted
 * onto proc/kmsg under dest; the FIFO is opened read-write and
 * non-blocking and the resulting descriptor is sent over kmsg_socket in
 * an SCM_RIGHTS control message, after which the local copy is closed.
 * Every failing step is logged with log_error(). */
675 if (asprintf(&from, "%s/dev/kmsg", dest) < 0 ||
676 asprintf(&to, "%s/proc/kmsg", dest) < 0)
679 if (mkfifo(from, 0600) < 0) {
680 log_error("mkfifo() for /dev/kmsg failed: %m");
684 r = chmod_and_chown(from, 0600, 0, 0);
686 log_error("Failed to correct access mode for /dev/kmsg: %s", strerror(-r));
690 if (mount(from, to, "bind", MS_BIND, NULL) < 0) {
691 log_error("Bind mount for /proc/kmsg failed: %m");
695 fd = open(from, O_RDWR|O_NDELAY|O_CLOEXEC);
697 log_error("Failed to open fifo: %m");
/* Build one control message of level SOL_SOCKET and type SCM_RIGHTS whose
 * payload is the int fd, then shrink msg_controllen to its length. */
704 mh.msg_control = &control;
705 mh.msg_controllen = sizeof(control);
707 cmsg = CMSG_FIRSTHDR(&mh);
708 cmsg->cmsg_level = SOL_SOCKET;
709 cmsg->cmsg_type = SCM_RIGHTS;
710 cmsg->cmsg_len = CMSG_LEN(sizeof(int));
711 memcpy(CMSG_DATA(cmsg), &fd, sizeof(int));
713 mh.msg_controllen = cmsg->cmsg_len;
715 /* Store away the fd in the socket, so that it stays open as
716 * long as we run the child */
717 k = sendmsg(kmsg_socket, &mh, MSG_DONTWAIT|MSG_NOSIGNAL);
718 close_nointr_nofail(fd);
721 log_error("Failed to send FIFO fd: %m");
725 /* And now make the FIFO unavailable as /dev/kmsg... */
/* Sets the host name from the container directory: the name is taken from
 * path_get_file_name(arg_directory) (presumably the last path component -
 * TODO confirm), passed through hostname_cleanup() and applied with
 * sethostname(). */
730 static int setup_hostname(void) {
734 hn = path_get_file_name(arg_directory);
740 hostname_cleanup(hn);
743 if (sethostname(hn, strlen(hn)) < 0)
/* Connects the journal directory of the container with the one of the
 * host according to arg_link_journal.
 *
 * The machine id is read from etc/machine-id below directory and checked
 * with sd_id128_from_string(). Two paths are then built from it: p is
 * the journal directory for that id on the host, q is the same path
 * below directory. If either is already a mount point this is only
 * logged as an error when the mode is not LINK_AUTO.
 *
 * LINK_GUEST: p is created as a symlink pointing to q and q is created.
 * LINK_HOST: p is created as a directory.
 * Afterwards q must not contain entries (dir_is_empty() returning 0 is
 * logged as an error), q is created and p is bind mounted onto q.
 *
 * NOTE(review): several messages below use the errno placeholder after
 * helpers whose result was stored in r (mkdir_p, the readlink helper);
 * whether errno is still meaningful at that point is not visible here -
 * confirm. */
752 static int setup_journal(const char *directory) {
753 sd_id128_t machine_id;
754 char _cleanup_free_ *p = NULL, *b = NULL, *q = NULL, *d = NULL;
758 if (arg_link_journal == LINK_NO)
761 p = strappend(directory, "/etc/machine-id");
765 r = read_one_line_file(p, &b);
/* A missing machine-id file is given its own branch in LINK_AUTO mode. */
766 if (r == -ENOENT && arg_link_journal == LINK_AUTO)
769 log_error("Failed to read machine ID from %s: %s", p, strerror(-r));
774 if (isempty(id) && arg_link_journal == LINK_AUTO)
777 /* Verify validity */
778 r = sd_id128_from_string(id, &machine_id);
780 log_error("Failed to parse machine ID from %s: %s", p, strerror(-r));
/* From here on p names the host side and q the container side. */
785 p = strappend("/var/log/journal/", id);
786 q = strjoin(directory, "/var/log/journal/", id, NULL);
790 if (path_is_mount_point(p, false) > 0) {
791 if (arg_link_journal != LINK_AUTO) {
792 log_error("%s: already a mount point, refusing to use for journal", p);
799 if (path_is_mount_point(q, false) > 0) {
800 if (arg_link_journal != LINK_AUTO) {
801 log_error("%s: already a mount point, refusing to use for journal", q);
/* The result of reading p as a symlink selects one of the branches below;
 * -EINVAL and -ENOENT are told apart from other errors. */
808 r = readlink_and_make_absolute(p, &d);
810 if ((arg_link_journal == LINK_GUEST ||
811 arg_link_journal == LINK_AUTO) &&
814 r = mkdir_p(q, 0755);
816 log_warning("failed to create directory %s: %m", q);
821 log_error("Failed to remove symlink %s: %m", p);
824 } else if (r == -EINVAL) {
826 if (arg_link_journal == LINK_GUEST &&
829 if (errno == ENOTDIR) {
830 log_error("%s already exists and is neither a symlink nor a directory", p);
833 log_error("Failed to remove %s: %m", p);
837 } else if (r != -ENOENT) {
838 log_error("readlink(%s) failed: %m", p);
842 if (arg_link_journal == LINK_GUEST) {
/* symlink(q, p) creates the link at p with q as its target. */
844 if (symlink(q, p) < 0) {
845 log_error("Failed to symlink %s to %s: %m", q, p);
849 r = mkdir_p(q, 0755);
851 log_warning("failed to create directory %s: %m", q);
855 if (arg_link_journal == LINK_HOST) {
856 r = mkdir_p(p, 0755);
858 log_error("Failed to create %s: %m", p);
862 } else if (access(p, F_OK) < 0)
865 if (dir_is_empty(q) == 0) {
866 log_error("%s not empty.", q);
870 r = mkdir_p(q, 0755);
872 log_error("Failed to create %s: %m", q);
876 if (mount(p, q, "bind", MS_BIND, NULL) < 0) {
877 log_error("Failed to bind mount journal from host into guest: %m");
/* Passes the bitwise complement of arg_retain to
 * capability_bounding_set_drop() and returns its result unchanged. main()
 * treats a negative result as failure. */
884 static int drop_capabilities(void) {
885 return capability_bounding_set_drop(~arg_retain, false);
/* Heuristic check whether path looks like the root of an OS installation,
 * keyed on bin/sh below path. Returns 0 when r is negative and 1
 * otherwise; main() refuses to continue on a result <= 0. */
888 static int is_os_tree(const char *path) {
891 /* We use /bin/sh as flag file if something is an OS */
893 if (asprintf(&p, "%s/bin/sh", path) < 0)
899 return r < 0 ? 0 : 1;
/* Relays data between the terminal of the caller and the pty master of
 * the container, and reacts to signals while doing so.
 *
 * master: pty master descriptor.
 * pid:    process that receives SIGRTMIN+3 on the first SIGTERM in boot
 *         mode.
 * mask:   signal set that is turned into a signalfd.
 *
 * Two buffers of LINE_MAX bytes are used: in_buffer carries stdin data
 * towards the master, out_buffer carries master data towards stdout. The
 * four boolean flags remember which descriptors epoll has reported as
 * ready. main() treats a negative return value as failure. */
902 static int process_pty(int master, pid_t pid, sigset_t *mask) {
904 char in_buffer[LINE_MAX], out_buffer[LINE_MAX];
905 size_t in_buffer_full = 0, out_buffer_full = 0;
906 struct epoll_event stdin_ev, stdout_ev, master_ev, signal_ev;
907 bool stdin_readable = false, stdout_writable = false, master_readable = false, master_writable = false;
908 int ep = -1, signal_fd = -1, r;
909 bool tried_orderly_shutdown = false;
915 fd_nonblock(STDIN_FILENO, 1);
916 fd_nonblock(STDOUT_FILENO, 1);
917 fd_nonblock(master, 1);
919 signal_fd = signalfd(-1, mask, SFD_NONBLOCK|SFD_CLOEXEC);
921 log_error("signalfd(): %m");
926 ep = epoll_create1(EPOLL_CLOEXEC);
928 log_error("Failed to create epoll: %m");
933 /* We read from STDIN only if this is actually a TTY,
934 * otherwise we assume non-interactivity. */
935 if (isatty(STDIN_FILENO)) {
937 stdin_ev.events = EPOLLIN|EPOLLET;
938 stdin_ev.data.fd = STDIN_FILENO;
940 if (epoll_ctl(ep, EPOLL_CTL_ADD, STDIN_FILENO, &stdin_ev) < 0) {
941 log_error("Failed to register STDIN in epoll: %m");
/* stdin, stdout and the master are registered with EPOLLET; the signalfd
 * entry uses plain EPOLLIN. */
948 stdout_ev.events = EPOLLOUT|EPOLLET;
949 stdout_ev.data.fd = STDOUT_FILENO;
952 master_ev.events = EPOLLIN|EPOLLOUT|EPOLLET;
953 master_ev.data.fd = master;
956 signal_ev.events = EPOLLIN;
957 signal_ev.data.fd = signal_fd;
959 if (epoll_ctl(ep, EPOLL_CTL_ADD, STDOUT_FILENO, &stdout_ev) < 0) {
960 if (errno != EPERM) {
961 log_error("Failed to register stdout in epoll: %m");
965 /* stdout without epoll support. Likely redirected to regular file. */
966 stdout_writable = true;
969 if (epoll_ctl(ep, EPOLL_CTL_ADD, master, &master_ev) < 0 ||
970 epoll_ctl(ep, EPOLL_CTL_ADD, signal_fd, &signal_ev) < 0) {
971 log_error("Failed to register fds in epoll: %m");
977 struct epoll_event ev[16];
981 nfds = epoll_wait(ep, ev, ELEMENTSOF(ev), -1);
984 if (errno == EINTR || errno == EAGAIN)
987 log_error("epoll_wait(): %m");
/* Translate the reported events into the readiness flags. EPOLLHUP is
 * treated like readiness so that the following read or write can observe
 * the condition. */
994 for (i = 0; i < nfds; i++) {
995 if (ev[i].data.fd == STDIN_FILENO) {
997 if (ev[i].events & (EPOLLIN|EPOLLHUP))
998 stdin_readable = true;
1000 } else if (ev[i].data.fd == STDOUT_FILENO) {
1002 if (ev[i].events & (EPOLLOUT|EPOLLHUP))
1003 stdout_writable = true;
1005 } else if (ev[i].data.fd == master) {
1007 if (ev[i].events & (EPOLLIN|EPOLLHUP))
1008 master_readable = true;
1010 if (ev[i].events & (EPOLLOUT|EPOLLHUP))
1011 master_writable = true;
1013 } else if (ev[i].data.fd == signal_fd) {
1014 struct signalfd_siginfo sfsi;
1017 n = read(signal_fd, &sfsi, sizeof(sfsi));
1018 if (n != sizeof(sfsi)) {
1021 log_error("Failed to read from signalfd: invalid block size");
1026 if (errno != EINTR && errno != EAGAIN) {
1027 log_error("Failed to read from signalfd: %m");
1033 if (sfsi.ssi_signo == SIGWINCH) {
1036 /* The window size changed, let's forward that. */
1037 if (ioctl(STDIN_FILENO, TIOCGWINSZ, &ws) >= 0)
1038 ioctl(master, TIOCSWINSZ, &ws);
/* Only the first SIGTERM in boot mode takes this branch;
 * tried_orderly_shutdown is set so that a repeated SIGTERM does not. */
1039 } else if (sfsi.ssi_signo == SIGTERM && arg_boot && !tried_orderly_shutdown) {
1041 log_info("Trying to halt container. Send SIGTERM again to trigger immediate termination.");
1043 /* This only works for systemd... */
1044 tried_orderly_shutdown = true;
1045 kill(pid, SIGRTMIN+3);
/* Keep moving data while any of the four transfers can make progress. */
1055 while ((stdin_readable && in_buffer_full <= 0) ||
1056 (master_writable && in_buffer_full > 0) ||
1057 (master_readable && out_buffer_full <= 0) ||
1058 (stdout_writable && out_buffer_full > 0)) {
1060 if (stdin_readable && in_buffer_full < LINE_MAX) {
1062 k = read(STDIN_FILENO, in_buffer + in_buffer_full, LINE_MAX - in_buffer_full);
/* In all four transfers EAGAIN, EPIPE, ECONNRESET and EIO merely clear
 * the matching readiness flag; other errors are logged. */
1065 if (errno == EAGAIN || errno == EPIPE || errno == ECONNRESET || errno == EIO)
1066 stdin_readable = false;
1068 log_error("read(): %m");
1073 in_buffer_full += (size_t) k;
1076 if (master_writable && in_buffer_full > 0) {
1078 k = write(master, in_buffer, in_buffer_full);
1081 if (errno == EAGAIN || errno == EPIPE || errno == ECONNRESET || errno == EIO)
1082 master_writable = false;
1084 log_error("write(): %m");
/* A partial write leaves a tail that is moved to the buffer start. */
1090 assert(in_buffer_full >= (size_t) k);
1091 memmove(in_buffer, in_buffer + k, in_buffer_full - k);
1092 in_buffer_full -= k;
1096 if (master_readable && out_buffer_full < LINE_MAX) {
1098 k = read(master, out_buffer + out_buffer_full, LINE_MAX - out_buffer_full);
1101 if (errno == EAGAIN || errno == EPIPE || errno == ECONNRESET || errno == EIO)
1102 master_readable = false;
1104 log_error("read(): %m");
1109 out_buffer_full += (size_t) k;
1112 if (stdout_writable && out_buffer_full > 0) {
1114 k = write(STDOUT_FILENO, out_buffer, out_buffer_full);
1117 if (errno == EAGAIN || errno == EPIPE || errno == ECONNRESET || errno == EIO)
1118 stdout_writable = false;
1120 log_error("write(): %m");
1126 assert(out_buffer_full >= (size_t) k);
1127 memmove(out_buffer, out_buffer + k, out_buffer_full - k);
1128 out_buffer_full -= k;
1136 close_nointr_nofail(ep);
1139 close_nointr_nofail(signal_fd);
/* Program entry point. Visible flow: parse the command line, validate the
 * container directory, create a cgroup for the container, allocate a pty
 * and a kmsg socket pair, then clone a child into new namespaces. The
 * child prepares the container root (mounts, device nodes, console, kmsg,
 * boot id, timezone, resolv.conf, journal, bind mounts), switches root,
 * drops capabilities, adjusts credentials and environment and executes
 * either an init binary or the requested command. The parent relays the
 * pty through process_pty(), waits for the child and logs how it ended.
 * NOTE(review): the end of this function is not visible here. */
1144 int main(int argc, char *argv[]) {
1146 int r = EXIT_FAILURE, k;
1147 char *oldcg = NULL, *newcg = NULL;
1148 char **controller = NULL;
1149 int master = -1, n_fd_passed;
1150 const char *console = NULL;
1151 struct termios saved_attr, raw_attr;
1153 bool saved_attr_valid = false;
1155 int kmsg_socket_pair[2] = { -1, -1 };
1158 log_parse_environment();
1161 r = parse_argv(argc, argv);
/* Without a directory option the current working directory is used. */
1165 if (arg_directory) {
1168 p = path_make_absolute_cwd(arg_directory);
1169 free(arg_directory);
1172 arg_directory = get_current_dir_name();
1174 if (!arg_directory) {
1175 log_error("Failed to determine path");
1179 path_kill_slashes(arg_directory);
/* Preconditions: effective uid 0, sd_booted() positive, directory is not
 * the host root and passes is_os_tree(). */
1181 if (geteuid() != 0) {
1182 log_error("Need to be root.");
1186 if (sd_booted() <= 0) {
1187 log_error("Not running on a systemd system.");
1191 if (path_equal(arg_directory, "/")) {
1192 log_error("Spawning container on root directory not supported.");
1196 if (is_os_tree(arg_directory) <= 0) {
1197 log_error("Directory %s doesn't look like an OS root directory. Refusing.", arg_directory);
/* Descriptors passed in by socket activation are collected in fds; the
 * child later exports them through LISTEN_FDS and LISTEN_PID. */
1202 n_fd_passed = sd_listen_fds(false);
1203 if (n_fd_passed > 0) {
1204 k = fdset_new_listen_fds(&fds, false);
1206 log_error("Failed to collect file descriptors: %s", strerror(-k));
1210 fdset_close_others(fds);
/* The container cgroup is a child of the current one, named after the
 * pid of this process. Failures in the extra controllers are only
 * warnings. */
1213 k = cg_get_by_pid(SYSTEMD_CGROUP_CONTROLLER, 0, &oldcg);
1215 log_error("Failed to determine current cgroup: %s", strerror(-k));
1219 if (asprintf(&newcg, "%s/nspawn-%lu", oldcg, (unsigned long) getpid()) < 0) {
1220 log_error("Failed to allocate cgroup path.");
1224 k = cg_create_and_attach(SYSTEMD_CGROUP_CONTROLLER, newcg, 0);
1226 log_error("Failed to create cgroup: %s", strerror(-k));
1230 STRV_FOREACH(controller, arg_controllers) {
1231 k = cg_create_and_attach(*controller, newcg, 0);
1233 log_warning("Failed to create cgroup in controller %s: %s", *controller, strerror(-k));
1236 master = posix_openpt(O_RDWR|O_NOCTTY|O_CLOEXEC|O_NDELAY);
1238 log_error("Failed to acquire pseudo tty: %m");
1242 console = ptsname(master);
1244 log_error("Failed to determine tty name: %m");
1248 log_info("Spawning namespace container on %s (console is %s).", arg_directory, console);
1250 if (ioctl(STDIN_FILENO, TIOCGWINSZ, &ws) >= 0)
1251 ioctl(master, TIOCSWINSZ, &ws);
1253 if (unlockpt(master) < 0) {
1254 log_error("Failed to unlock tty: %m");
/* Remember the terminal settings so they can be restored later, and
 * prepare a raw variant with echo disabled. */
1258 if (tcgetattr(STDIN_FILENO, &saved_attr) >= 0) {
1259 saved_attr_valid = true;
1261 raw_attr = saved_attr;
1262 cfmakeraw(&raw_attr);
1263 raw_attr.c_lflag &= ~ECHO;
1266 if (socketpair(AF_UNIX, SOCK_DGRAM|SOCK_NONBLOCK|SOCK_CLOEXEC, 0, kmsg_socket_pair) < 0) {
1267 log_error("Failed to create kmsg socket pair");
/* These signals are blocked here; the same mask is handed to
 * process_pty(), which reads them through a signalfd. */
1271 assert_se(sigemptyset(&mask) == 0);
1272 sigset_add_many(&mask, SIGCHLD, SIGWINCH, SIGTERM, SIGINT, -1);
1273 assert_se(sigprocmask(SIG_BLOCK, &mask, NULL) == 0);
1279 if(pipe2(pipefd, O_NONBLOCK|O_CLOEXEC) < 0) {
1280 log_error("pipe2(): %m");
/* Raw clone with new IPC, mount, PID and UTS namespaces, plus a new
 * network namespace when arg_private_network is set. */
1284 pid = syscall(__NR_clone, SIGCHLD|CLONE_NEWIPC|CLONE_NEWNS|CLONE_NEWPID|CLONE_NEWUTS|(arg_private_network ? CLONE_NEWNET : 0), NULL);
1286 if (errno == EINVAL)
1287 log_error("clone() failed, do you have namespace support enabled in your kernel? (You need UTS, IPC, PID and NET namespacing built in): %m");
1289 log_error("clone() failed: %m");
1296 const char *home = NULL;
1297 uid_t uid = (uid_t) -1;
1298 gid_t gid = (gid_t) -1;
/* Environment of the container payload. The NULL slots are filled in
 * further down through envp + n_env++. */
1300 const char *envp[] = {
1301 "PATH=/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin",
1302 "container=systemd-nspawn", /* LXC sets container=lxc, so follow the scheme here */
1307 NULL, /* container_uuid */
1308 NULL, /* LISTEN_FDS */
1309 NULL, /* LISTEN_PID */
1313 envp[2] = strv_find_prefix(environ, "TERM=");
/* The child waits for POLLHUP on the read end of the pipe before it
 * continues; the other process closes both pipe ends further down. */
1316 close_nointr_nofail(pipefd[1]);
1317 fd_wait_for_event(pipefd[0], POLLHUP, -1);
1318 close_nointr_nofail(pipefd[0]);
1320 close_nointr_nofail(master);
1323 if (saved_attr_valid) {
1324 if (tcsetattr(STDIN_FILENO, TCSANOW, &raw_attr) < 0) {
1325 log_error("Failed to set terminal attributes: %m");
1330 close_nointr(STDIN_FILENO);
1331 close_nointr(STDOUT_FILENO);
1332 close_nointr(STDERR_FILENO);
1334 close_nointr_nofail(kmsg_socket_pair[0]);
1335 kmsg_socket_pair[0] = -1;
1337 reset_all_signal_handlers();
1339 assert_se(sigemptyset(&mask) == 0);
1340 assert_se(sigprocmask(SIG_SETMASK, &mask, NULL) == 0);
/* The pty slave has to end up as descriptor 0; it is then duplicated
 * onto stdout and stderr. */
1342 k = open_terminal(console, O_RDWR);
1343 if (k != STDIN_FILENO) {
1345 close_nointr_nofail(k);
1349 log_error("Failed to open console: %s", strerror(-k));
1353 if (dup2(STDIN_FILENO, STDOUT_FILENO) != STDOUT_FILENO ||
1354 dup2(STDIN_FILENO, STDERR_FILENO) != STDERR_FILENO) {
1355 log_error("Failed to duplicate console: %m");
1360 log_error("setsid() failed: %m");
1364 if (prctl(PR_SET_PDEATHSIG, SIGKILL) < 0) {
1365 log_error("PR_SET_PDEATHSIG failed: %m");
1369 /* Mark everything as slave, so that we still
1370 * receive mounts from the real root, but don't
1371 * propagate mounts to the real root. */
1372 if (mount(NULL, "/", NULL, MS_SLAVE|MS_REC, NULL) < 0) {
1373 log_error("MS_SLAVE|MS_REC failed: %m");
1377 /* Turn directory into bind mount */
1378 if (mount(arg_directory, arg_directory, "bind", MS_BIND|MS_REC, NULL) < 0) {
1379 log_error("Failed to make bind mount.");
1384 if (mount(arg_directory, arg_directory, "bind", MS_BIND|MS_REMOUNT|MS_RDONLY|MS_REC, NULL) < 0) {
1385 log_error("Failed to make read-only.");
/* Populate the container root. Each helper is checked for a negative
 * result, except dev_setup() whose result is not checked. */
1389 if (mount_all(arg_directory) < 0)
1392 if (copy_devnodes(arg_directory) < 0)
1395 dev_setup(arg_directory);
1397 if (setup_dev_console(arg_directory, console) < 0)
1400 if (setup_kmsg(arg_directory, kmsg_socket_pair[1]) < 0)
1403 close_nointr_nofail(kmsg_socket_pair[1]);
1404 kmsg_socket_pair[1] = -1;
1406 if (setup_boot_id(arg_directory) < 0)
1409 if (setup_timezone(arg_directory) < 0)
1412 if (setup_resolv_conf(arg_directory) < 0)
1415 if (setup_journal(arg_directory) < 0)
1418 if (mount_binds(arg_directory, arg_bind, 0) < 0)
1421 if (mount_binds(arg_directory, arg_bind_ro, MS_RDONLY) < 0)
/* Switch root: enter the directory, move its mount onto the root, chroot
 * into it and reset the working directory. */
1424 if (chdir(arg_directory) < 0) {
1425 log_error("chdir(%s) failed: %m", arg_directory);
1429 if (mount(arg_directory, "/", NULL, MS_MOVE, NULL) < 0) {
1430 log_error("mount(MS_MOVE) failed: %m");
1434 if (chroot(".") < 0) {
1435 log_error("chroot() failed: %m");
1439 if (chdir("/") < 0) {
1440 log_error("chdir() failed: %m");
1448 if (drop_capabilities() < 0) {
1449 log_error("drop_capabilities() failed: %m");
1455 /* Note that this resolves user names
1456 * inside the container, and hence
1457 * accesses the NSS modules from the
1458 * container and not the host. This is
1461 if (get_user_creds((const char**)&arg_user, &uid, &gid, &home, NULL) < 0) {
1462 log_error("get_user_creds() failed: %m");
1466 if (mkdir_parents_label(home, 0775) < 0) {
1467 log_error("mkdir_parents_label() failed: %m");
1471 if (mkdir_safe_label(home, 0775, uid, gid) < 0) {
1472 log_error("mkdir_safe_label() failed: %m");
1476 if (initgroups((const char*)arg_user, gid) < 0) {
1477 log_error("initgroups() failed: %m");
1481 if (setresgid(gid, gid, gid) < 0) {
1482 log_error("setregid() failed: %m");
1486 if (setresuid(uid, uid, uid) < 0) {
1487 log_error("setreuid() failed: %m");
1491 /* Reset everything fully to 0, just in case */
1493 if (setgroups(0, NULL) < 0) {
1494 log_error("setgroups() failed: %m");
/* NOTE(review): the calls are setresgid() and setresuid(), yet the
 * messages name setregid() and setreuid(); the same mismatch appears in
 * the user branch above - confirm and align the messages. */
1498 if (setresgid(0, 0, 0) < 0) {
1499 log_error("setregid() failed: %m");
1503 if (setresuid(0, 0, 0) < 0) {
1504 log_error("setreuid() failed: %m");
/* HOME, USER and LOGNAME fall back to /root and root when no user was
 * resolved. */
1509 if ((asprintf((char**)(envp + n_env++), "HOME=%s", home ? home: "/root") < 0) ||
1510 (asprintf((char**)(envp + n_env++), "USER=%s", arg_user ? arg_user : "root") < 0) ||
1511 (asprintf((char**)(envp + n_env++), "LOGNAME=%s", arg_user ? arg_user : "root") < 0)) {
1517 if (asprintf((char**)(envp + n_env++), "container_uuid=%s", arg_uuid) < 0) {
1523 if (fdset_size(fds) > 0) {
1524 k = fdset_cloexec(fds, false);
1526 log_error("Failed to unset O_CLOEXEC for file descriptors.");
1530 if ((asprintf((char **)(envp + n_env++), "LISTEN_FDS=%u", n_fd_passed) < 0) ||
1531 (asprintf((char **)(envp + n_env++), "LISTEN_PID=%lu", (unsigned long) getpid()) < 0)) {
1543 /* Automatically search for the init system */
/* a[0] is replaced by each candidate init path in turn; the remaining
 * command line arguments follow it. execve() only returns on failure, so
 * the next candidate is tried. */
1545 l = 1 + argc - optind;
1546 a = newa(char*, l + 1);
1547 memcpy(a + 1, argv + optind, l * sizeof(char*));
1549 a[0] = (char*) "/usr/lib/systemd/systemd";
1550 execve(a[0], a, (char**) envp);
1552 a[0] = (char*) "/lib/systemd/systemd";
1553 execve(a[0], a, (char**) envp);
1555 a[0] = (char*) "/sbin/init";
1556 execve(a[0], a, (char**) envp);
1557 } else if (argc > optind)
1558 execvpe(argv[optind], argv + optind, (char**) envp);
1560 chdir(home ? home : "/root");
1561 execle("/bin/bash", "-bash", NULL, (char**) envp);
1564 log_error("execv() failed: %m");
1567 _exit(EXIT_FAILURE);
1570 log_info("Init process in the container running as PID %d", pid);
1571 close_nointr_nofail(pipefd[0]);
1572 close_nointr_nofail(pipefd[1]);
1577 if (process_pty(master, pid, &mask) < 0)
1580 if (saved_attr_valid)
1581 tcsetattr(STDIN_FILENO, TCSANOW, &saved_attr);
1583 r = wait_for_terminate(pid, &status);
/* Report how the container ended: exit code, SIGINT (logged as shut
 * down), SIGHUP (logged as reboot), any other signal, or unknown. */
1589 if (status.si_code == CLD_EXITED) {
1590 if (status.si_status != 0) {
1591 log_error("Container failed with error code %i.", status.si_status);
1592 r = status.si_status;
1596 log_debug("Container exited successfully.");
1598 } else if (status.si_code == CLD_KILLED &&
1599 status.si_status == SIGINT) {
1600 log_info("Container has been shut down.");
1603 } else if (status.si_code == CLD_KILLED &&
1604 status.si_status == SIGHUP) {
1605 log_info("Container is being rebooted.");
1607 } else if (status.si_code == CLD_KILLED ||
1608 status.si_code == CLD_DUMPED) {
1610 log_error("Container terminated by signal %s.", signal_to_string(status.si_status));
1614 log_error("Container failed due to unknown reason.");
/* Cleanup: restore the terminal, close descriptors, move back to the
 * original cgroup and kill what is left in the container cgroup. */
1621 if (saved_attr_valid)
1622 tcsetattr(STDIN_FILENO, TCSANOW, &saved_attr);
1625 close_nointr_nofail(master);
1627 close_pipe(kmsg_socket_pair);
1630 cg_attach(SYSTEMD_CGROUP_CONTROLLER, oldcg, 0);
1633 cg_kill_recursive_and_wait(SYSTEMD_CGROUP_CONTROLLER, newcg, true);
1635 free(arg_directory);
1636 strv_free(arg_controllers);