1 /*-*- Mode: C; c-basic-offset: 8; indent-tabs-mode: nil -*-*/
4 This file is part of systemd.
6 Copyright 2010 Lennart Poettering
8 systemd is free software; you can redistribute it and/or modify it
9 under the terms of the GNU Lesser General Public License as published by
10 the Free Software Foundation; either version 2.1 of the License, or
11 (at your option) any later version.
13 systemd is distributed in the hope that it will be useful, but
14 WITHOUT ANY WARRANTY; without even the implied warranty of
15 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
16 Lesser General Public License for more details.
18 You should have received a copy of the GNU Lesser General Public License
19 along with systemd; If not, see <http://www.gnu.org/licenses/>.
25 #include <sys/types.h>
26 #include <sys/syscall.h>
27 #include <sys/mount.h>
33 #include <sys/prctl.h>
34 #include <sys/capability.h>
37 #include <sys/epoll.h>
39 #include <sys/signalfd.h>
42 #include <linux/netlink.h>
44 #include <sys/socket.h>
47 #include <attr/xattr.h>
50 #include <systemd/sd-daemon.h>
58 #include "cgroup-util.h"
60 #include "path-util.h"
61 #include "loopback-setup.h"
63 #include "dev-setup.h"
/* NOTE(review): the enumerators of LinkJournal are not visible in this
 * excerpt; code further down uses LINK_NO, LINK_AUTO, LINK_GUEST and
 * LINK_HOST as values of this type. */
72 typedef enum LinkJournal {
/* Command line settings, read by the setup helpers and by main(). */
79 static char *arg_directory = NULL;
80 static char *arg_user = NULL;
81 static char **arg_controllers = NULL;
82 static char *arg_uuid = NULL;
83 static char *arg_machine = NULL;
84 static bool arg_private_network = false;
85 static bool arg_read_only = false;
86 static bool arg_boot = false;
87 static LinkJournal arg_link_journal = LINK_AUTO;
/* Bit mask with one bit per CAP_* number. The --capability= option ORs
 * further bits into it, and drop_capabilities() hands the complement of
 * this mask to capability_bounding_set_drop(). */
88 static uint64_t arg_retain =
90 (1ULL << CAP_DAC_OVERRIDE) |
91 (1ULL << CAP_DAC_READ_SEARCH) |
92 (1ULL << CAP_FOWNER) |
93 (1ULL << CAP_FSETID) |
94 (1ULL << CAP_IPC_OWNER) |
97 (1ULL << CAP_LINUX_IMMUTABLE) |
98 (1ULL << CAP_NET_BIND_SERVICE) |
99 (1ULL << CAP_NET_BROADCAST) |
100 (1ULL << CAP_NET_RAW) |
101 (1ULL << CAP_SETGID) |
102 (1ULL << CAP_SETFCAP) |
103 (1ULL << CAP_SETPCAP) |
104 (1ULL << CAP_SETUID) |
105 (1ULL << CAP_SYS_ADMIN) |
106 (1ULL << CAP_SYS_CHROOT) |
107 (1ULL << CAP_SYS_NICE) |
108 (1ULL << CAP_SYS_PTRACE) |
109 (1ULL << CAP_SYS_TTY_CONFIG) |
110 (1ULL << CAP_SYS_RESOURCE) |
111 (1ULL << CAP_SYS_BOOT) |
112 (1ULL << CAP_AUDIT_WRITE) |
113 (1ULL << CAP_AUDIT_CONTROL);
/* String lists that --bind= and --bind-ro= extend two entries at a time.
 * mount_binds() walks them pairwise, using the first entry of each pair as
 * the mount source and the second as the path below the container root. */
114 static char **arg_bind = NULL;
115 static char **arg_bind_ro = NULL;
/* Prints the usage summary to stdout with printf(), substituting
 * program_invocation_short_name for the leading %s.
 *
 * NOTE(review): the text below describes -j as "--link-journal=host", but
 * the only assignment to arg_link_journal in parse_argv() that is not part
 * of the --link-journal= string comparison stores LINK_GUEST. Confirm which
 * of the two is intended and align the other.
 * NOTE(review): some continuation lines of the help text and the return
 * statement are not visible in this excerpt. */
117 static int help(void) {
119 printf("%s [OPTIONS...] [PATH] [ARGUMENTS...]\n\n"
120 "Spawn a minimal namespace container for debugging, testing and building.\n\n"
121 " -h --help Show this help\n"
122 " --version Print version string\n"
123 " -D --directory=NAME Root directory for the container\n"
124 " -b --boot Boot up full system (i.e. invoke init)\n"
125 " -u --user=USER Run the command under specified user or uid\n"
126 " -C --controllers=LIST Put the container in specified comma-separated\n"
127 " cgroup hierarchies\n"
128 " --uuid=UUID Set a specific machine UUID for the container\n"
129 " -M --machine=NAME Set the machine name for the container\n"
130 " --private-network Disable network in container\n"
131 " --read-only Mount the root directory read-only\n"
132 " --capability=CAP In addition to the default, retain specified\n"
134 " --link-journal=MODE Link up guest journal, one of no, auto, guest, host\n"
135 " -j Equivalent to --link-journal=host\n"
136 " --bind=PATH[:PATH] Bind mount a file or directory from the host into\n"
138 " --bind-ro=PATH[:PATH] Similar, but creates a read-only bind mount\n",
139 program_invocation_short_name);
/* Parses the command line with getopt_long() and records the result in
 * the arg_* globals. Invalid values (UUID, machine name, capability name,
 * link-journal mode, non-absolute bind paths) are reported with
 * log_error().
 *
 * NOTE(review): the case labels for the short options and all return
 * statements are missing from this excerpt, so the mapping of some
 * statements to options below is inferred from the option table. */
144 static int parse_argv(int argc, char *argv[]) {
/* Long option table; the last field is the value getopt_long() returns. */
157 static const struct option options[] = {
158 { "help", no_argument, NULL, 'h' },
159 { "version", no_argument, NULL, ARG_VERSION },
160 { "directory", required_argument, NULL, 'D' },
161 { "user", required_argument, NULL, 'u' },
162 { "controllers", required_argument, NULL, 'C' },
163 { "private-network", no_argument, NULL, ARG_PRIVATE_NETWORK },
164 { "boot", no_argument, NULL, 'b' },
165 { "uuid", required_argument, NULL, ARG_UUID },
166 { "read-only", no_argument, NULL, ARG_READ_ONLY },
167 { "capability", required_argument, NULL, ARG_CAPABILITY },
168 { "link-journal", required_argument, NULL, ARG_LINK_JOURNAL },
169 { "bind", required_argument, NULL, ARG_BIND },
170 { "bind-ro", required_argument, NULL, ARG_BIND_RO },
171 { "machine", required_argument, NULL, 'M' },
/* The leading '+' in the short option string is the glibc convention for
 * stopping at the first non-option argument, which leaves the arguments of
 * the command to run untouched. */
180 while ((c = getopt_long(argc, argv, "+hD:u:C:bM:j", options, NULL)) >= 0) {
189 puts(PACKAGE_STRING);
190 puts(SYSTEMD_FEATURES);
/* The root directory is resolved with canonicalize_file_name(). */
195 arg_directory = canonicalize_file_name(optarg);
196 if (!arg_directory) {
197 log_error("Failed to canonicalize root directory.");
205 arg_user = strdup(optarg);
/* A repeated controller list replaces the previous one. */
212 strv_free(arg_controllers);
213 arg_controllers = strv_split(optarg, ",");
214 if (!arg_controllers)
217 cg_shorten_controllers(arg_controllers);
220 case ARG_PRIVATE_NETWORK:
221 arg_private_network = true;
229 if (!id128_is_valid(optarg)) {
230 log_error("Invalid UUID: %s", optarg);
238 if (!hostname_is_valid(optarg)) {
239 log_error("Invalid machine name: %s", optarg);
244 arg_machine = strdup(optarg);
251 arg_read_only = true;
/* Each comma-separated word is looked up with cap_from_name() and its bit
 * is added to arg_retain. */
254 case ARG_CAPABILITY: {
258 FOREACH_WORD_SEPARATOR(word, length, optarg, ",", state) {
262 t = strndup(word, length);
266 if (cap_from_name(t, &cap) < 0) {
267 log_error("Failed to parse capability %s.", t);
273 arg_retain |= 1ULL << (uint64_t) cap;
/* NOTE(review): presumably the body of the -j case (its label is not
 * visible). It selects LINK_GUEST, while help() documents -j as
 * "--link-journal=host" -- confirm. */
280 arg_link_journal = LINK_GUEST;
283 case ARG_LINK_JOURNAL:
284 if (streq(optarg, "auto"))
285 arg_link_journal = LINK_AUTO;
286 else if (streq(optarg, "no"))
287 arg_link_journal = LINK_NO;
288 else if (streq(optarg, "guest"))
289 arg_link_journal = LINK_GUEST;
290 else if (streq(optarg, "host"))
291 arg_link_journal = LINK_HOST;
293 log_error("Failed to parse link journal mode %s", optarg);
/* --bind= and --bind-ro= share this code; x selects the list to extend.
 * The text before ':' becomes a; both a and b must be absolute paths and
 * are appended to the list in that order. How b is derived is not visible
 * in this excerpt. */
301 _cleanup_free_ char *a = NULL, *b = NULL;
306 x = c == ARG_BIND ? &arg_bind : &arg_bind_ro;
308 e = strchr(optarg, ':');
310 a = strndup(optarg, e - optarg);
320 if (!path_is_absolute(a) || !path_is_absolute(b)) {
321 log_error("Invalid bind mount specification: %s", optarg);
325 r = strv_extend(x, a);
329 r = strv_extend(x, b);
340 log_error("Unknown option code %c", c);
/* Mounts the entries of the static table below underneath dest.
 *
 * For every entry the target path is built as dest + "/" + where and
 * checked with path_is_mount_point(). Entries that name a source (what)
 * and whose target already is a mount point are skipped; entries with a
 * NULL source are the remount steps and are always attempted. A failing
 * mount() is only logged as an error when the entry's fatal flag is set. */
348 static int mount_all(const char *dest) {
350 typedef struct MountPoint {
/* NOTE(review): the struct members are not visible in this excerpt; judging
 * by the uses below the initializers are what, where, a file system type,
 * options, flags and fatal -- confirm against the full definition. */
359 static const MountPoint mount_table[] = {
360 { "proc", "/proc", "proc", NULL, MS_NOSUID|MS_NOEXEC|MS_NODEV, true },
361 { "/proc/sys", "/proc/sys", NULL, NULL, MS_BIND, true }, /* Bind mount first */
362 { NULL, "/proc/sys", NULL, NULL, MS_BIND|MS_RDONLY|MS_REMOUNT, true }, /* Then, make it r/o */
363 { "sysfs", "/sys", "sysfs", NULL, MS_RDONLY|MS_NOSUID|MS_NOEXEC|MS_NODEV, true },
364 { "tmpfs", "/dev", "tmpfs", "mode=755", MS_NOSUID|MS_STRICTATIME, true },
365 { "devpts", "/dev/pts", "devpts","newinstance,ptmxmode=0666,mode=620,gid=" STRINGIFY(TTY_GID), MS_NOSUID|MS_NOEXEC, true },
366 { "tmpfs", "/dev/shm", "tmpfs", "mode=1777", MS_NOSUID|MS_NODEV|MS_STRICTATIME, true },
367 { "tmpfs", "/run", "tmpfs", "mode=755", MS_NOSUID|MS_NODEV|MS_STRICTATIME, true },
369 { "/sys/fs/selinux", "/sys/fs/selinux", NULL, NULL, MS_BIND, false }, /* Bind mount first */
370 { NULL, "/sys/fs/selinux", NULL, NULL, MS_BIND|MS_RDONLY|MS_REMOUNT, false }, /* Then, make it r/o */
377 for (k = 0; k < ELEMENTSOF(mount_table); k++) {
378 _cleanup_free_ char *where = NULL;
381 where = strjoin(dest, "/", mount_table[k].where, NULL);
385 t = path_is_mount_point(where, true);
387 log_error("Failed to detect whether %s is a mount point: %s", where, strerror(-t));
395 /* Skip this entry if it is not a remount. */
396 if (mount_table[k].what && t > 0)
/* The result of mkdir_p() is not checked here; a missing target shows up
 * as a mount() failure below. */
399 mkdir_p(where, 0755);
401 if (mount(mount_table[k].what,
404 mount_table[k].flags,
405 mount_table[k].options) < 0 &&
406 mount_table[k].fatal) {
408 log_error("mount(%s) failed: %m", where);
/* Bind-mounts every (source, target) pair of the string list l into the
 * tree at dest.
 *
 * For each pair the target is dest + "/" + second entry; it is created
 * with mkdir_p_label() and the first entry is bind-mounted onto it. When
 * flags is non-zero the new mount is remounted with
 * MS_REMOUNT|MS_BIND|flags (main() passes MS_RDONLY for --bind-ro=).
 * Both mount failures are logged with log_error(). */
418 static int mount_binds(const char *dest, char **l, unsigned long flags) {
421 STRV_FOREACH_PAIR(x, y, l) {
422 _cleanup_free_ char *where = NULL;
424 where = strjoin(dest, "/", *y, NULL);
428 mkdir_p_label(where, 0755);
430 if (mount(*x, where, "bind", MS_BIND, NULL) < 0) {
431 log_error("mount(%s) failed: %m", where);
435 if (flags && mount(NULL, where, NULL, MS_REMOUNT|MS_BIND|flags, NULL) < 0) {
436 log_error("mount(%s) failed: %m", where);
/* Makes /etc/localtime below dest refer to the same time zone as the
 * /etc/localtime symlink in the calling environment.
 *
 * The symlink read here must point into "../usr/share/zoneinfo/" or
 * "/usr/share/zoneinfo/"; otherwise a warning is logged and the container
 * is left alone. Nothing is changed when the container link already names
 * the same zone, or when the zone file does not exist below dest. */
444 static int setup_timezone(const char *dest) {
445 _cleanup_free_ char *where = NULL, *p = NULL, *q = NULL, *check = NULL, *what = NULL;
451 /* Fix the timezone, if possible */
452 r = readlink_malloc("/etc/localtime", &p);
454 log_warning("/etc/localtime is not a symlink, not updating container timezone.");
/* z is the zone name, i.e. the part of the link target after the zoneinfo
 * prefix. */
458 z = path_startswith(p, "../usr/share/zoneinfo/");
460 z = path_startswith(p, "/usr/share/zoneinfo/");
462 log_warning("/etc/localtime does not point into /usr/share/zoneinfo/, not updating container timezone.");
466 where = strappend(dest, "/etc/localtime");
/* y is the zone name the container currently links to, if any. */
470 r = readlink_malloc(where, &q);
472 y = path_startswith(q, "../usr/share/zoneinfo/");
474 y = path_startswith(q, "/usr/share/zoneinfo/");
477 /* Already pointing to the right place? Then do nothing .. */
478 if (y && streq(y, z))
482 check = strjoin(dest, "/usr/share/zoneinfo/", z, NULL);
486 if (access(check, F_OK) < 0) {
487 log_warning("Timezone %s does not exist in container, not updating container timezone.", z);
/* The new link uses the relative "../usr/share/zoneinfo/" form. */
491 what = strappend("../usr/share/zoneinfo/", z);
496 if (symlink(what, where) < 0) {
497 log_error("Failed to correct timezone of container: %m");
/* Bind-mounts /etc/resolv.conf onto dest + "/etc/resolv.conf" and then
 * remounts it read-only.
 *
 * arg_private_network is tested first (the statement it guards is not
 * visible in this excerpt). The target file is created with
 * O_CREAT|O_EXCL so that a mount point exists. A failing bind mount is
 * only a warning; a failing read-only remount is logged as an error. */
504 static int setup_resolv_conf(const char *dest) {
505 char _cleanup_free_ *where = NULL;
506 _cleanup_close_ int fd = -1;
510 if (arg_private_network)
513 /* Fix resolv.conf, if possible */
514 where = strappend(dest, "/etc/resolv.conf");
518 fd = open(where, O_WRONLY|O_CREAT|O_EXCL|O_CLOEXEC|O_NOCTTY|O_NOFOLLOW, 0644);
520 /* We don't really care for the results of this really. If it
521 * fails, it fails, but meh... */
522 if (mount("/etc/resolv.conf", where, "bind", MS_BIND, NULL) < 0)
523 log_warning("Failed to bind mount /etc/resolv.conf: %m");
525 if (mount("/etc/resolv.conf", where, "bind",
526 MS_BIND|MS_REMOUNT|MS_RDONLY, NULL) < 0) {
527 log_error("Failed to remount /etc/resolv.conf readonly: %m");
/* Gives the container its own boot ID.
 *
 * A random 128 bit ID is formatted as a UUID string, written to the file
 * dest + "/dev/proc-sys-kernel-random-boot-id" and that file is
 * bind-mounted over dest + "/proc/sys/kernel/random/boot_id". The
 * read-only remount afterwards is best effort: its failure is only logged
 * as a warning. */
534 static int setup_boot_id(const char *dest) {
535 _cleanup_free_ char *from = NULL, *to = NULL;
542 /* Generate a new randomized boot ID, so that each boot-up of
543 * the container gets a new one */
545 from = strappend(dest, "/dev/proc-sys-kernel-random-boot-id");
546 to = strappend(dest, "/proc/sys/kernel/random/boot_id");
550 r = sd_id128_randomize(&rnd);
552 log_error("Failed to generate random boot id: %s", strerror(-r));
/* Format the ID in the dashed 8-4-4-4-12 UUID layout. */
556 snprintf(as_uuid, sizeof(as_uuid),
557 "%02x%02x%02x%02x-%02x%02x-%02x%02x-%02x%02x-%02x%02x%02x%02x%02x%02x",
558 SD_ID128_FORMAT_VAL(rnd));
559 char_array_0(as_uuid);
561 r = write_string_file(from, as_uuid);
563 log_error("Failed to write boot id: %s", strerror(-r));
567 if (mount(from, to, "bind", MS_BIND, NULL) < 0) {
568 log_error("Failed to bind mount boot id: %m");
570 } else if (mount(from, to, "bind", MS_BIND|MS_REMOUNT|MS_RDONLY, NULL))
571 log_warning("Failed to make boot id read-only: %m");
/* Recreates a fixed set of device nodes from /dev inside dest + "/dev".
 *
 * For every name in the NUL-separated devnodes list (its entries are not
 * visible in this excerpt) the node in /dev is examined with stat(). A
 * stat() failure other than ENOENT is logged as an error, as is a node
 * that is neither a character nor a block device. Otherwise a node with
 * the same mode and device number is created with mknod(). */
577 static int copy_devnodes(const char *dest) {
579 static const char devnodes[] =
589 _cleanup_umask_ mode_t u;
595 NULSTR_FOREACH(d, devnodes) {
597 _cleanup_free_ char *from = NULL, *to = NULL;
599 asprintf(&from, "/dev/%s", d);
600 asprintf(&to, "%s/dev/%s", dest, d);
611 if (stat(from, &st) < 0) {
613 if (errno != ENOENT) {
614 log_error("Failed to stat %s: %m", from);
619 } else if (!S_ISCHR(st.st_mode) && !S_ISBLK(st.st_mode)) {
621 log_error("%s is not a char or block device, cannot copy", from);
625 } else if (mknod(to, st.st_mode, st.st_rdev) < 0) {
/* NOTE(review): the message prints dest, although the path handed to
 * mknod() is to -- consider logging to instead. */
627 log_error("mknod(%s) failed: %m", dest);
/* Creates dest + "/dev/ptmx" as a symlink to the relative target
 * "pts/ptmx" and logs an error if symlink() fails. */
636 static int setup_ptmx(const char *dest) {
637 _cleanup_free_ char *p = NULL;
639 p = strappend(dest, "/dev/ptmx");
643 if (symlink("pts/ptmx", p) < 0) {
644 log_error("Failed to create /dev/ptmx symlink: %m");
/* Makes the terminal named by console available as dest + "/dev/console".
 *
 * console must be a character device. Its access mode and owner are set
 * with chmod_and_chown(console, 0600, 0, 0), a device node with the same
 * device number and mode 0600 is created at the target path, and console
 * is then bind-mounted over that node. Each failing step is logged with
 * log_error().
 *
 * NOTE(review): the last line of the block comment inside this function
 * (the one carrying its terminator) is missing from this excerpt. */
651 static int setup_dev_console(const char *dest, const char *console) {
653 _cleanup_free_ char *to = NULL;
655 _cleanup_umask_ mode_t u;
662 if (stat(console, &st) < 0) {
663 log_error("Failed to stat %s: %m", console);
666 } else if (!S_ISCHR(st.st_mode)) {
667 log_error("/dev/console is not a char device");
671 r = chmod_and_chown(console, 0600, 0, 0);
673 log_error("Failed to correct access mode for TTY: %s", strerror(-r));
677 if (asprintf(&to, "%s/dev/console", dest) < 0)
680 /* We need to bind mount the right tty to /dev/console since
681 * ptys can only exist on pts file systems. To have something
682 * to bind mount things on we create a device node first, that
683 * has the right major/minor (note that the major minor
684 * doesn't actually matter here, since we mount it over
687 if (mknod(to, (st.st_mode & ~07777) | 0600, st.st_rdev) < 0) {
688 log_error("mknod() for /dev/console failed: %m");
692 if (mount(console, to, "bind", MS_BIND, NULL) < 0) {
693 log_error("Bind mount for /dev/console failed: %m");
/* Provides /proc/kmsg inside the container as a FIFO and hands the FIFO's
 * file descriptor over kmsg_socket.
 *
 * The FIFO is created at dest + "/dev/kmsg" with mode 0600, bind-mounted
 * onto dest + "/proc/kmsg" and opened non-blocking. The descriptor is sent
 * through kmsg_socket as SCM_RIGHTS ancillary data and the local copy is
 * closed; the socket thereby keeps the FIFO open. kmsg_socket must be a
 * valid descriptor (asserted). */
700 static int setup_kmsg(const char *dest, int kmsg_socket) {
701 _cleanup_free_ char *from = NULL, *to = NULL;
703 _cleanup_umask_ mode_t u;
705 struct cmsghdr cmsghdr;
706 uint8_t buf[CMSG_SPACE(sizeof(int))];
709 .msg_control = &control,
710 .msg_controllen = sizeof(control),
712 struct cmsghdr *cmsg;
715 assert(kmsg_socket >= 0);
719 /* We create the kmsg FIFO as /dev/kmsg, but immediately
720 * delete it after bind mounting it to /proc/kmsg. While FIFOs
721 * on the reading side behave very similar to /proc/kmsg,
722 * their writing side behaves differently from /dev/kmsg in
723 * that writing blocks when nothing is reading. In order to
724 * avoid any problems with containers deadlocking due to this
725 * we simply make /dev/kmsg unavailable to the container. */
726 if (asprintf(&from, "%s/dev/kmsg", dest) < 0 ||
727 asprintf(&to, "%s/proc/kmsg", dest) < 0)
730 if (mkfifo(from, 0600) < 0) {
731 log_error("mkfifo() for /dev/kmsg failed: %m");
735 r = chmod_and_chown(from, 0600, 0, 0);
737 log_error("Failed to correct access mode for /dev/kmsg: %s", strerror(-r));
741 if (mount(from, to, "bind", MS_BIND, NULL) < 0) {
742 log_error("Bind mount for /proc/kmsg failed: %m");
746 fd = open(from, O_RDWR|O_NDELAY|O_CLOEXEC);
748 log_error("Failed to open fifo: %m");
/* Build a single SCM_RIGHTS control message that carries fd. */
752 cmsg = CMSG_FIRSTHDR(&mh);
753 cmsg->cmsg_level = SOL_SOCKET;
754 cmsg->cmsg_type = SCM_RIGHTS;
755 cmsg->cmsg_len = CMSG_LEN(sizeof(int));
756 memcpy(CMSG_DATA(cmsg), &fd, sizeof(int));
758 mh.msg_controllen = cmsg->cmsg_len;
760 /* Store away the fd in the socket, so that it stays open as
761 * long as we run the child */
762 k = sendmsg(kmsg_socket, &mh, MSG_DONTWAIT|MSG_NOSIGNAL);
763 close_nointr_nofail(fd);
766 log_error("Failed to send FIFO fd: %m");
770 /* And now make the FIFO unavailable as /dev/kmsg... */
/* Sets the host name to arg_machine with sethostname(). The statements
 * that produce the return value are not visible in this excerpt. */
775 static int setup_hostname(void) {
777 if (sethostname(arg_machine, strlen(arg_machine)) < 0)
/* Connects the journal directory of the container with the one outside,
 * according to arg_link_journal.
 *
 * The machine ID is read from directory + "/etc/machine-id" and parsed
 * with sd_id128_from_string(). Two paths are derived from it:
 *   p = "/var/log/journal/" + id              (outside the container)
 *   q = directory + "/var/log/journal/" + id  (inside the container tree)
 * With LINK_GUEST, p is made a symlink to q. With LINK_HOST, p is created
 * if necessary. At the end p is bind-mounted onto q.
 *
 * Paths that already are mount points are refused with an error unless the
 * mode is LINK_AUTO.
 *
 * NOTE(review): several conditions, return statements and the removal
 * calls that the error messages refer to are missing from this excerpt, so
 * the exact branch structure cannot be confirmed here. */
783 static int setup_journal(const char *directory) {
784 sd_id128_t machine_id;
785 _cleanup_free_ char *p = NULL, *b = NULL, *q = NULL, *d = NULL;
789 if (arg_link_journal == LINK_NO)
792 p = strappend(directory, "/etc/machine-id");
796 r = read_one_line_file(p, &b);
797 if (r == -ENOENT && arg_link_journal == LINK_AUTO)
800 log_error("Failed to read machine ID from %s: %s", p, strerror(-r));
805 if (isempty(id) && arg_link_journal == LINK_AUTO)
808 /* Verify validity */
809 r = sd_id128_from_string(id, &machine_id);
811 log_error("Failed to parse machine ID from %s: %s", p, strerror(-r));
/* From here on p is the journal path outside and q the one inside. */
816 p = strappend("/var/log/journal/", id);
817 q = strjoin(directory, "/var/log/journal/", id, NULL);
821 if (path_is_mount_point(p, false) > 0) {
822 if (arg_link_journal != LINK_AUTO) {
823 log_error("%s: already a mount point, refusing to use for journal", p);
830 if (path_is_mount_point(q, false) > 0) {
831 if (arg_link_journal != LINK_AUTO) {
832 log_error("%s: already a mount point, refusing to use for journal", q);
/* Inspect what currently exists at p: r distinguishes an existing symlink,
 * -EINVAL and -ENOENT below. */
839 r = readlink_and_make_absolute(p, &d);
841 if ((arg_link_journal == LINK_GUEST ||
842 arg_link_journal == LINK_AUTO) &&
845 r = mkdir_p(q, 0755);
847 log_warning("failed to create directory %s: %m", q);
852 log_error("Failed to remove symlink %s: %m", p);
855 } else if (r == -EINVAL) {
857 if (arg_link_journal == LINK_GUEST &&
860 if (errno == ENOTDIR) {
861 log_error("%s already exists and is neither a symlink nor a directory", p);
864 log_error("Failed to remove %s: %m", p);
868 } else if (r != -ENOENT) {
869 log_error("readlink(%s) failed: %m", p);
873 if (arg_link_journal == LINK_GUEST) {
875 if (symlink(q, p) < 0) {
876 log_error("Failed to symlink %s to %s: %m", q, p);
880 r = mkdir_p(q, 0755);
882 log_warning("failed to create directory %s: %m", q);
886 if (arg_link_journal == LINK_HOST) {
887 r = mkdir_p(p, 0755);
889 log_error("Failed to create %s: %m", p);
893 } else if (access(p, F_OK) < 0)
/* A non-empty q is reported as an error before the bind mount. */
896 if (dir_is_empty(q) == 0) {
897 log_error("%s not empty.", q);
901 r = mkdir_p(q, 0755);
903 log_error("Failed to create %s: %m", q);
907 if (mount(p, q, "bind", MS_BIND, NULL) < 0) {
908 log_error("Failed to bind mount journal from host into guest: %m");
/* Creates the cgroup path and attaches to it, first in
 * SYSTEMD_CGROUP_CONTROLLER and then in every controller listed in
 * arg_controllers. A failure in the first is logged as an error; failures
 * in the additional controllers are logged as warnings. */
915 static int setup_cgroup(const char *path) {
919 r = cg_create_and_attach(SYSTEMD_CGROUP_CONTROLLER, path, 1);
921 log_error("Failed to create cgroup: %s", strerror(-r));
925 STRV_FOREACH(c, arg_controllers) {
926 r = cg_create_and_attach(*c, path, 1);
928 log_warning("Failed to create cgroup in controller %s: %s", *c, strerror(-r));
/* Records information about the container as extended attributes on the
 * file system path of its cgroup.
 *
 * The path is resolved with cg_get_path() for SYSTEMD_CGROUP_CONTROLLER.
 * Three attributes are written with XATTR_CREATE:
 *   trusted.init_pid        pid formatted as a decimal string
 *   trusted.machine_id      uuid
 *   trusted.root_directory  directory
 * Failing to resolve the path is logged as an error; failing to set an
 * attribute is logged as a warning.
 *
 * NOTE(review): the visible assertion checks the global arg_directory, not
 * the directory parameter -- confirm that this is intended. */
934 static int save_attributes(const char *cgroup, pid_t pid, const char *uuid, const char *directory) {
936 _cleanup_free_ char *path = NULL;
937 char buf[DECIMAL_STR_MAX(pid_t)];
942 assert(arg_directory);
944 assert_se(snprintf(buf, sizeof(buf), "%lu", (unsigned long) pid) < (int) sizeof(buf));
946 r = cg_get_path(SYSTEMD_CGROUP_CONTROLLER, cgroup, NULL, &path);
948 log_error("Failed to get path: %s", strerror(-r));
952 r = setxattr(path, "trusted.init_pid", buf, strlen(buf), XATTR_CREATE);
954 log_warning("Failed to set %s attribute on %s: %m", "trusted.init_pid", path);
957 k = setxattr(path, "trusted.machine_id", uuid, strlen(uuid), XATTR_CREATE);
959 log_warning("Failed to set %s attribute on %s: %m", "trusted.machine_id", path);
965 k = setxattr(path, "trusted.root_directory", directory, strlen(directory), XATTR_CREATE);
967 log_warning("Failed to set %s attribute on %s: %m", "trusted.root_directory", path);
/* Passes the complement of arg_retain to capability_bounding_set_drop()
 * and returns that function's result. */
977 static int drop_capabilities(void) {
978 return capability_bounding_set_drop(~arg_retain, false);
/* Relays data between the caller's stdin/stdout and the pty master, and
 * reacts to the signals in mask.
 *
 * master  pty master file descriptor
 * pid     process that receives SIGRTMIN+3 on the first SIGTERM when
 *         arg_boot is set
 * mask    signal set that is turned into a signalfd
 *
 * stdin, stdout and master are switched to non-blocking mode and watched
 * with edge-triggered epoll; the signalfd is watched level-triggered.
 * Input travels stdin -> in_buffer -> master, output travels
 * master -> out_buffer -> stdout. Each buffer holds LINE_MAX bytes.
 *
 * NOTE(review): the statements that end the loop and produce the return
 * value are not visible in this excerpt. */
981 static int process_pty(int master, pid_t pid, sigset_t *mask) {
983 char in_buffer[LINE_MAX], out_buffer[LINE_MAX];
984 size_t in_buffer_full = 0, out_buffer_full = 0;
985 struct epoll_event stdin_ev, stdout_ev, master_ev, signal_ev;
986 bool stdin_readable = false, stdout_writable = false, master_readable = false, master_writable = false;
987 int ep = -1, signal_fd = -1, r;
988 bool tried_orderly_shutdown = false;
994 fd_nonblock(STDIN_FILENO, 1);
995 fd_nonblock(STDOUT_FILENO, 1);
996 fd_nonblock(master, 1);
998 signal_fd = signalfd(-1, mask, SFD_NONBLOCK|SFD_CLOEXEC);
1000 log_error("signalfd(): %m");
1005 ep = epoll_create1(EPOLL_CLOEXEC);
1007 log_error("Failed to create epoll: %m");
1012 /* We read from STDIN only if this is actually a TTY,
1013 * otherwise we assume non-interactivity. */
1014 if (isatty(STDIN_FILENO)) {
1016 stdin_ev.events = EPOLLIN|EPOLLET;
1017 stdin_ev.data.fd = STDIN_FILENO;
1019 if (epoll_ctl(ep, EPOLL_CTL_ADD, STDIN_FILENO, &stdin_ev) < 0) {
1020 log_error("Failed to register STDIN in epoll: %m");
1027 stdout_ev.events = EPOLLOUT|EPOLLET;
1028 stdout_ev.data.fd = STDOUT_FILENO;
1031 master_ev.events = EPOLLIN|EPOLLOUT|EPOLLET;
1032 master_ev.data.fd = master;
1035 signal_ev.events = EPOLLIN;
1036 signal_ev.data.fd = signal_fd;
/* EPERM from epoll_ctl() is tolerated for stdout; it is then treated as
 * always writable. */
1038 if (epoll_ctl(ep, EPOLL_CTL_ADD, STDOUT_FILENO, &stdout_ev) < 0) {
1039 if (errno != EPERM) {
1040 log_error("Failed to register stdout in epoll: %m");
1044 /* stdout without epoll support. Likely redirected to regular file. */
1045 stdout_writable = true;
1048 if (epoll_ctl(ep, EPOLL_CTL_ADD, master, &master_ev) < 0 ||
1049 epoll_ctl(ep, EPOLL_CTL_ADD, signal_fd, &signal_ev) < 0) {
1050 log_error("Failed to register fds in epoll: %m");
1056 struct epoll_event ev[16];
1060 nfds = epoll_wait(ep, ev, ELEMENTSOF(ev), -1);
1063 if (errno == EINTR || errno == EAGAIN)
1066 log_error("epoll_wait(): %m");
/* Translate the reported events into the readable/writable flags. With
 * edge-triggered registration a flag stays set until a read() or write()
 * on that descriptor fails with one of the errno values checked below. */
1073 for (i = 0; i < nfds; i++) {
1074 if (ev[i].data.fd == STDIN_FILENO) {
1076 if (ev[i].events & (EPOLLIN|EPOLLHUP))
1077 stdin_readable = true;
1079 } else if (ev[i].data.fd == STDOUT_FILENO) {
1081 if (ev[i].events & (EPOLLOUT|EPOLLHUP))
1082 stdout_writable = true;
1084 } else if (ev[i].data.fd == master) {
1086 if (ev[i].events & (EPOLLIN|EPOLLHUP))
1087 master_readable = true;
1089 if (ev[i].events & (EPOLLOUT|EPOLLHUP))
1090 master_writable = true;
1092 } else if (ev[i].data.fd == signal_fd) {
1093 struct signalfd_siginfo sfsi;
1096 n = read(signal_fd, &sfsi, sizeof(sfsi));
1097 if (n != sizeof(sfsi)) {
1100 log_error("Failed to read from signalfd: invalid block size");
1105 if (errno != EINTR && errno != EAGAIN) {
1106 log_error("Failed to read from signalfd: %m");
1112 if (sfsi.ssi_signo == SIGWINCH) {
1115 /* The window size changed, let's forward that. */
1116 if (ioctl(STDIN_FILENO, TIOCGWINSZ, &ws) >= 0)
1117 ioctl(master, TIOCSWINSZ, &ws);
1118 } else if (sfsi.ssi_signo == SIGTERM && arg_boot && !tried_orderly_shutdown) {
1120 log_info("Trying to halt container. Send SIGTERM again to trigger immediate termination.");
1122 /* This only works for systemd... */
1123 tried_orderly_shutdown = true;
1124 kill(pid, SIGRTMIN+3);
/* Keep shovelling for as long as at least one direction can make
 * progress: something can be read into an empty buffer, or a non-empty
 * buffer can be written out. */
1134 while ((stdin_readable && in_buffer_full <= 0) ||
1135 (master_writable && in_buffer_full > 0) ||
1136 (master_readable && out_buffer_full <= 0) ||
1137 (stdout_writable && out_buffer_full > 0)) {
1139 if (stdin_readable && in_buffer_full < LINE_MAX) {
1141 k = read(STDIN_FILENO, in_buffer + in_buffer_full, LINE_MAX - in_buffer_full);
1144 if (errno == EAGAIN || errno == EPIPE || errno == ECONNRESET || errno == EIO)
1145 stdin_readable = false;
1147 log_error("read(): %m");
1152 in_buffer_full += (size_t) k;
1155 if (master_writable && in_buffer_full > 0) {
1157 k = write(master, in_buffer, in_buffer_full);
1160 if (errno == EAGAIN || errno == EPIPE || errno == ECONNRESET || errno == EIO)
1161 master_writable = false;
1163 log_error("write(): %m");
/* After a partial write, move the unwritten tail to the buffer start. */
1169 assert(in_buffer_full >= (size_t) k);
1170 memmove(in_buffer, in_buffer + k, in_buffer_full - k);
1171 in_buffer_full -= k;
1175 if (master_readable && out_buffer_full < LINE_MAX) {
1177 k = read(master, out_buffer + out_buffer_full, LINE_MAX - out_buffer_full);
1180 if (errno == EAGAIN || errno == EPIPE || errno == ECONNRESET || errno == EIO)
1181 master_readable = false;
1183 log_error("read(): %m");
1188 out_buffer_full += (size_t) k;
1191 if (stdout_writable && out_buffer_full > 0) {
1193 k = write(STDOUT_FILENO, out_buffer, out_buffer_full);
1196 if (errno == EAGAIN || errno == EPIPE || errno == ECONNRESET || errno == EIO)
1197 stdout_writable = false;
1199 log_error("write(): %m");
1205 assert(out_buffer_full >= (size_t) k);
1206 memmove(out_buffer, out_buffer + k, out_buffer_full - k);
1207 out_buffer_full -= k;
1215 close_nointr_nofail(ep);
1218 close_nointr_nofail(signal_fd);
/* Probes for the kernel audit subsystem by opening an AF_NETLINK raw
 * socket for NETLINK_AUDIT and closing it again.
 * NOTE(review): the return statements are not visible in this excerpt;
 * presumably the result reflects whether socket() succeeded -- confirm. */
1223 static bool audit_enabled(void) {
1226 fd = socket(AF_NETLINK, SOCK_RAW, NETLINK_AUDIT);
1228 close_nointr_nofail(fd);
/* Entry point: prepares a container in arg_directory, runs it and relays
 * its console.
 *
 * Visible steps, in order:
 *  - parse the command line; make arg_directory absolute, or use the
 *    current directory; derive arg_machine from the directory's file name
 *  - refuse to run unless root, on a systemd-booted system, with a
 *    directory that is not "/" and passes path_is_os_tree()
 *  - collect file descriptors passed in via sd_listen_fds()
 *  - determine the machine cgroup path; report "Container already running."
 *    based on the result of cg_is_empty_recursive()
 *  - allocate a pty master, copy the window size, prepare raw terminal
 *    attributes, create the kmsg socket pair, block SIGCHLD, SIGWINCH,
 *    SIGTERM and SIGINT
 *  - clone() a child into new IPC, mount, PID and UTS namespaces, plus a
 *    new network namespace when arg_private_network is set
 *  - child: wait on pipefd, make the pty slave its stdin/stdout/stderr,
 *    request SIGKILL on parent death, join the cgroup, build the mount
 *    tree through the setup helpers, move the tree to "/" and chroot,
 *    drop capabilities, switch user or reset to uid/gid 0, assemble the
 *    environment and exec an init binary, the given command or /bin/bash
 *  - parent: log the child PID, wait on pipefd2, save_attributes(),
 *    process_pty(), then wait for the child and translate its exit status
 *    into log messages
 *  - cleanup: restore terminal attributes, close the socket pair, kill the
 *    cgroup and free the argument globals
 *
 * NOTE(review): the error messages after setresgid() and setresuid() name
 * "setregid()" and "setreuid()", and the message after the exec attempts
 * names "execv()"; consider aligning them with the calls actually made.
 * NOTE(review): many conditions, labels and return statements, the
 * terminator of one inner block comment and the end of the function are
 * missing from this excerpt. */
1234 int main(int argc, char *argv[]) {
1236 int r = EXIT_FAILURE, k;
1237 _cleanup_free_ char *newcg = NULL;
1238 _cleanup_close_ int master = -1;
1240 const char *console = NULL;
1241 struct termios saved_attr, raw_attr;
1243 bool saved_attr_valid = false;
1245 int kmsg_socket_pair[2] = { -1, -1 };
1248 log_parse_environment();
1251 k = parse_argv(argc, argv);
1259 if (arg_directory) {
1262 p = path_make_absolute_cwd(arg_directory);
1263 free(arg_directory);
1266 arg_directory = get_current_dir_name();
1268 if (!arg_directory) {
1269 log_error("Failed to determine path, please use -D.");
1273 path_kill_slashes(arg_directory);
1276 arg_machine = strdup(path_get_file_name(arg_directory));
1282 hostname_cleanup(arg_machine, false);
1283 if (isempty(arg_machine)) {
1284 log_error("Failed to determine machine name automatically, please use -M.");
1289 if (geteuid() != 0) {
1290 log_error("Need to be root.");
1294 if (sd_booted() <= 0) {
1295 log_error("Not running on a systemd system.");
1299 if (arg_boot && audit_enabled()) {
1300 log_warning("The kernel auditing subsystem is known to be incompatible with containers.\n"
1301 "Please make sure to turn off auditing with 'audit=0' on the kernel command\n"
1302 "line before using systemd-nspawn. Sleeping for 5s...\n");
1306 if (path_equal(arg_directory, "/")) {
1307 log_error("Spawning container on root directory not supported.");
1311 if (path_is_os_tree(arg_directory) <= 0) {
1312 log_error("Directory %s doesn't look like an OS root directory (/etc/os-release is missing). Refusing.", arg_directory);
1317 n_fd_passed = sd_listen_fds(false);
1318 if (n_fd_passed > 0) {
1319 k = fdset_new_listen_fds(&fds, false);
1321 log_error("Failed to collect file descriptors: %s", strerror(-k));
1325 fdset_close_others(fds);
1328 k = cg_get_machine_path(arg_machine, &newcg);
1330 log_error("Failed to determine machine cgroup path: %s", strerror(-k));
1334 k = cg_is_empty_recursive(SYSTEMD_CGROUP_CONTROLLER, newcg, true);
1335 if (k <= 0 && k != -ENOENT) {
1336 log_error("Container already running.");
1344 master = posix_openpt(O_RDWR|O_NOCTTY|O_CLOEXEC|O_NDELAY);
1346 log_error("Failed to acquire pseudo tty: %m");
1350 console = ptsname(master);
1352 log_error("Failed to determine tty name: %m");
1356 log_info("Spawning namespace container on %s (console is %s).", arg_directory, console);
1358 if (ioctl(STDIN_FILENO, TIOCGWINSZ, &ws) >= 0)
1359 ioctl(master, TIOCSWINSZ, &ws);
1361 if (unlockpt(master) < 0) {
1362 log_error("Failed to unlock tty: %m");
1366 if (tcgetattr(STDIN_FILENO, &saved_attr) >= 0) {
1367 saved_attr_valid = true;
1369 raw_attr = saved_attr;
1370 cfmakeraw(&raw_attr);
1371 raw_attr.c_lflag &= ~ECHO;
1374 if (socketpair(AF_UNIX, SOCK_DGRAM|SOCK_NONBLOCK|SOCK_CLOEXEC, 0, kmsg_socket_pair) < 0) {
1375 log_error("Failed to create kmsg socket pair.");
1379 sd_notify(0, "READY=1");
1381 assert_se(sigemptyset(&mask) == 0);
1382 sigset_add_many(&mask, SIGCHLD, SIGWINCH, SIGTERM, SIGINT, -1);
1383 assert_se(sigprocmask(SIG_BLOCK, &mask, NULL) == 0);
1387 int pipefd[2], pipefd2[2];
1389 if (pipe2(pipefd, O_NONBLOCK|O_CLOEXEC) < 0) {
1390 log_error("pipe2(): %m");
1394 if (pipe2(pipefd2, O_NONBLOCK|O_CLOEXEC) < 0) {
1395 log_error("pipe2(): %m");
1400 pid = syscall(__NR_clone, SIGCHLD|CLONE_NEWIPC|CLONE_NEWNS|CLONE_NEWPID|CLONE_NEWUTS|(arg_private_network ? CLONE_NEWNET : 0), NULL);
1402 if (errno == EINVAL)
1403 log_error("clone() failed, do you have namespace support enabled in your kernel? (You need UTS, IPC, PID and NET namespacing built in): %m");
1405 log_error("clone() failed: %m");
1412 const char *home = NULL;
1413 uid_t uid = (uid_t) -1;
1414 gid_t gid = (gid_t) -1;
1416 const char *envp[] = {
1417 "PATH=/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin",
1418 "container=systemd-nspawn", /* LXC sets container=lxc, so follow the scheme here */
1423 NULL, /* container_uuid */
1424 NULL, /* LISTEN_FDS */
1425 NULL, /* LISTEN_PID */
1429 envp[n_env] = strv_find_prefix(environ, "TERM=");
1433 /* Wait for the parent process to log our PID */
1434 close_nointr_nofail(pipefd[1]);
1435 fd_wait_for_event(pipefd[0], POLLHUP, -1);
1436 close_nointr_nofail(pipefd[0]);
1438 close_nointr_nofail(master);
1441 if (saved_attr_valid) {
1442 if (tcsetattr(STDIN_FILENO, TCSANOW, &raw_attr) < 0) {
1443 log_error("Failed to set terminal attributes: %m");
1448 close_nointr(STDIN_FILENO);
1449 close_nointr(STDOUT_FILENO);
1450 close_nointr(STDERR_FILENO);
1452 close_nointr_nofail(kmsg_socket_pair[0]);
1453 kmsg_socket_pair[0] = -1;
1455 reset_all_signal_handlers();
1457 assert_se(sigemptyset(&mask) == 0);
1458 assert_se(sigprocmask(SIG_SETMASK, &mask, NULL) == 0);
1460 k = open_terminal(console, O_RDWR);
1461 if (k != STDIN_FILENO) {
1463 close_nointr_nofail(k);
1467 log_error("Failed to open console: %s", strerror(-k));
1471 if (dup2(STDIN_FILENO, STDOUT_FILENO) != STDOUT_FILENO ||
1472 dup2(STDIN_FILENO, STDERR_FILENO) != STDERR_FILENO) {
1473 log_error("Failed to duplicate console: %m");
1478 log_error("setsid() failed: %m");
1482 if (prctl(PR_SET_PDEATHSIG, SIGKILL) < 0) {
1483 log_error("PR_SET_PDEATHSIG failed: %m");
1487 if (setup_cgroup(newcg) < 0)
1490 close_pipe(pipefd2);
1492 /* Mark everything as slave, so that we still
1493 * receive mounts from the real root, but don't
1494 * propagate mounts to the real root. */
1495 if (mount(NULL, "/", NULL, MS_SLAVE|MS_REC, NULL) < 0) {
1496 log_error("MS_SLAVE|MS_REC failed: %m");
1500 /* Turn directory into bind mount */
1501 if (mount(arg_directory, arg_directory, "bind", MS_BIND|MS_REC, NULL) < 0) {
1502 log_error("Failed to make bind mount.");
1507 if (mount(arg_directory, arg_directory, "bind", MS_BIND|MS_REMOUNT|MS_RDONLY|MS_REC, NULL) < 0) {
1508 log_error("Failed to make read-only.");
1512 if (mount_all(arg_directory) < 0)
1515 if (copy_devnodes(arg_directory) < 0)
1518 if (setup_ptmx(arg_directory) < 0)
1521 dev_setup(arg_directory);
1523 if (setup_dev_console(arg_directory, console) < 0)
1526 if (setup_kmsg(arg_directory, kmsg_socket_pair[1]) < 0)
1529 close_nointr_nofail(kmsg_socket_pair[1]);
1530 kmsg_socket_pair[1] = -1;
1532 if (setup_boot_id(arg_directory) < 0)
1535 if (setup_timezone(arg_directory) < 0)
1538 if (setup_resolv_conf(arg_directory) < 0)
1541 if (setup_journal(arg_directory) < 0)
1544 if (mount_binds(arg_directory, arg_bind, 0) < 0)
1547 if (mount_binds(arg_directory, arg_bind_ro, MS_RDONLY) < 0)
1550 if (chdir(arg_directory) < 0) {
1551 log_error("chdir(%s) failed: %m", arg_directory);
1555 if (mount(arg_directory, "/", NULL, MS_MOVE, NULL) < 0) {
1556 log_error("mount(MS_MOVE) failed: %m");
1560 if (chroot(".") < 0) {
1561 log_error("chroot() failed: %m");
1565 if (chdir("/") < 0) {
1566 log_error("chdir() failed: %m");
1574 if (drop_capabilities() < 0) {
1575 log_error("drop_capabilities() failed: %m");
1581 /* Note that this resolves user names
1582 * inside the container, and hence
1583 * accesses the NSS modules from the
1584 * container and not the host. This is
1587 if (get_user_creds((const char**)&arg_user, &uid, &gid, &home, NULL) < 0) {
1588 log_error("get_user_creds() failed: %m");
1592 if (mkdir_parents_label(home, 0775) < 0) {
1593 log_error("mkdir_parents_label() failed: %m");
1597 if (mkdir_safe_label(home, 0775, uid, gid) < 0) {
1598 log_error("mkdir_safe_label() failed: %m");
1602 if (initgroups((const char*)arg_user, gid) < 0) {
1603 log_error("initgroups() failed: %m");
1607 if (setresgid(gid, gid, gid) < 0) {
1608 log_error("setregid() failed: %m");
1612 if (setresuid(uid, uid, uid) < 0) {
1613 log_error("setreuid() failed: %m");
1617 /* Reset everything fully to 0, just in case */
1619 if (setgroups(0, NULL) < 0) {
1620 log_error("setgroups() failed: %m");
1624 if (setresgid(0, 0, 0) < 0) {
1625 log_error("setregid() failed: %m");
1629 if (setresuid(0, 0, 0) < 0) {
1630 log_error("setreuid() failed: %m");
1635 if ((asprintf((char**)(envp + n_env++), "HOME=%s", home ? home: "/root") < 0) ||
1636 (asprintf((char**)(envp + n_env++), "USER=%s", arg_user ? arg_user : "root") < 0) ||
1637 (asprintf((char**)(envp + n_env++), "LOGNAME=%s", arg_user ? arg_user : "root") < 0)) {
1643 if (asprintf((char**)(envp + n_env++), "container_uuid=%s", arg_uuid) < 0) {
1649 if (fdset_size(fds) > 0) {
1650 k = fdset_cloexec(fds, false);
1652 log_error("Failed to unset O_CLOEXEC for file descriptors.");
1656 if ((asprintf((char **)(envp + n_env++), "LISTEN_FDS=%u", n_fd_passed) < 0) ||
1657 (asprintf((char **)(envp + n_env++), "LISTEN_PID=%lu", (unsigned long) 1) < 0)) {
1669 /* Automatically search for the init system */
1671 l = 1 + argc - optind;
1672 a = newa(char*, l + 1);
1673 memcpy(a + 1, argv + optind, l * sizeof(char*));
1675 a[0] = (char*) "/usr/lib/systemd/systemd";
1676 execve(a[0], a, (char**) envp);
1678 a[0] = (char*) "/lib/systemd/systemd";
1679 execve(a[0], a, (char**) envp);
1681 a[0] = (char*) "/sbin/init";
1682 execve(a[0], a, (char**) envp);
1683 } else if (argc > optind)
1684 execvpe(argv[optind], argv + optind, (char**) envp);
1686 chdir(home ? home : "/root");
1687 execle("/bin/bash", "-bash", NULL, (char**) envp);
1690 log_error("execv() failed: %m");
1693 _exit(EXIT_FAILURE);
1696 log_info("Init process in the container running as PID %lu.", (unsigned long) pid);
1697 close_nointr_nofail(pipefd[0]);
1698 close_nointr_nofail(pipefd[1]);
1700 /* Wait for the child process to establish cgroup hierarchy */
1701 close_nointr_nofail(pipefd2[1]);
1702 fd_wait_for_event(pipefd2[0], POLLHUP, -1);
1703 close_nointr_nofail(pipefd2[0]);
1705 save_attributes(newcg, pid, arg_uuid, arg_directory);
1710 if (process_pty(master, pid, &mask) < 0)
1713 if (saved_attr_valid)
1714 tcsetattr(STDIN_FILENO, TCSANOW, &saved_attr);
1716 k = wait_for_terminate(pid, &status);
1722 if (status.si_code == CLD_EXITED) {
1723 r = status.si_status;
1724 if (status.si_status != 0) {
1725 log_error("Container failed with error code %i.", status.si_status);
1729 log_debug("Container exited successfully.");
1731 } else if (status.si_code == CLD_KILLED &&
1732 status.si_status == SIGINT) {
1733 log_info("Container has been shut down.");
1736 } else if (status.si_code == CLD_KILLED &&
1737 status.si_status == SIGHUP) {
1738 log_info("Container is being rebooted.");
1740 } else if (status.si_code == CLD_KILLED ||
1741 status.si_code == CLD_DUMPED) {
1743 log_error("Container terminated by signal %s.", signal_to_string(status.si_status));
1747 log_error("Container failed due to unknown reason.");
1754 if (saved_attr_valid)
1755 tcsetattr(STDIN_FILENO, TCSANOW, &saved_attr);
1757 close_pipe(kmsg_socket_pair);
1760 cg_kill_recursive_and_wait(SYSTEMD_CGROUP_CONTROLLER, newcg, true);
1762 free(arg_directory);
1764 strv_free(arg_controllers);