1 /*-*- Mode: C; c-basic-offset: 8; indent-tabs-mode: nil -*-*/
4 This file is part of systemd.
6 Copyright 2010 Lennart Poettering
8 systemd is free software; you can redistribute it and/or modify it
9 under the terms of the GNU Lesser General Public License as published by
10 the Free Software Foundation; either version 2.1 of the License, or
11 (at your option) any later version.
13 systemd is distributed in the hope that it will be useful, but
14 WITHOUT ANY WARRANTY; without even the implied warranty of
15 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
16 Lesser General Public License for more details.
18 You should have received a copy of the GNU Lesser General Public License
19 along with systemd; If not, see <http://www.gnu.org/licenses/>.
25 #include <attr/xattr.h>
26 #include <sys/types.h>
27 #include <sys/syscall.h>
28 #include <sys/mount.h>
34 #include <sys/prctl.h>
35 #include <sys/capability.h>
38 #include <sys/epoll.h>
40 #include <sys/signalfd.h>
44 #include <sys/socket.h>
46 #include <systemd/sd-daemon.h>
54 #include "cgroup-util.h"
56 #include "path-util.h"
57 #include "loopback-setup.h"
59 #include "dev-setup.h"
/* Journal linking mode selected with --link-journal= or -j. The values used
 * further down in this file are LINK_NO, LINK_AUTO, LINK_GUEST and LINK_HOST. */
68 typedef enum LinkJournal {
/* Settings controlled from the command line; see parse_argv(). */
75 static char *arg_directory = NULL;
76 static char *arg_user = NULL;
77 static char **arg_controllers = NULL;
78 static char *arg_uuid = NULL;
79 static char *arg_machine = NULL;
80 static bool arg_private_network = false;
81 static bool arg_read_only = false;
82 static bool arg_boot = false;
83 static LinkJournal arg_link_journal = LINK_AUTO;
/* Bit mask (one bit per CAP_* number) of the capabilities the container keeps:
 * drop_capabilities() drops the complement of this mask, and --capability=
 * ORs additional bits into it. */
84 static uint64_t arg_retain =
86 (1ULL << CAP_DAC_OVERRIDE) |
87 (1ULL << CAP_DAC_READ_SEARCH) |
88 (1ULL << CAP_FOWNER) |
89 (1ULL << CAP_FSETID) |
90 (1ULL << CAP_IPC_OWNER) |
93 (1ULL << CAP_LINUX_IMMUTABLE) |
94 (1ULL << CAP_NET_BIND_SERVICE) |
95 (1ULL << CAP_NET_BROADCAST) |
96 (1ULL << CAP_NET_RAW) |
97 (1ULL << CAP_SETGID) |
98 (1ULL << CAP_SETFCAP) |
99 (1ULL << CAP_SETPCAP) |
100 (1ULL << CAP_SETUID) |
101 (1ULL << CAP_SYS_ADMIN) |
102 (1ULL << CAP_SYS_CHROOT) |
103 (1ULL << CAP_SYS_NICE) |
104 (1ULL << CAP_SYS_PTRACE) |
105 (1ULL << CAP_SYS_TTY_CONFIG) |
106 (1ULL << CAP_SYS_RESOURCE) |
107 (1ULL << CAP_SYS_BOOT) |
108 (1ULL << CAP_AUDIT_WRITE) |
109 (1ULL << CAP_AUDIT_CONTROL);
/* Bind mounts requested with --bind= and --bind-ro=. parse_argv() appends the
 * two paths of each specification one after the other, and mount_binds()
 * walks the lists pairwise. */
110 static char **arg_bind = NULL;
111 static char **arg_bind_ro = NULL;
/* Print the usage summary, prefixed with program_invocation_short_name, to
 * standard output.
 *
 * The -j line describes the mode the option parser actually selects for -j
 * (LINK_GUEST), i.e. --link-journal=guest. */
113 static int help(void) {
115 printf("%s [OPTIONS...] [PATH] [ARGUMENTS...]\n\n"
116 "Spawn a minimal namespace container for debugging, testing and building.\n\n"
117 " -h --help Show this help\n"
118 " --version Print version string\n"
119 " -D --directory=NAME Root directory for the container\n"
120 " -b --boot Boot up full system (i.e. invoke init)\n"
121 " -u --user=USER Run the command under specified user or uid\n"
122 " -C --controllers=LIST Put the container in specified comma-separated\n"
123 " cgroup hierarchies\n"
124 " --uuid=UUID Set a specific machine UUID for the container\n"
125 " -M --machine=NAME Set the machine name for the container\n"
126 " --private-network Disable network in container\n"
127 " --read-only Mount the root directory read-only\n"
128 " --capability=CAP In addition to the default, retain specified\n"
130 " --link-journal=MODE Link up guest journal, one of no, auto, guest, host\n"
131 " -j Equivalent to --link-journal=guest\n"
132 " --bind=PATH[:PATH] Bind mount a file or directory from the host into\n"
134 " --bind-ro=PATH[:PATH] Similar, but creates a read-only bind mount\n",
135 program_invocation_short_name);
/* Parse the command line with getopt_long() and store the result in the
 * file-level arg_* settings. Invalid values are reported with log_error().
 *
 * argc/argv: the process arguments as passed to main(). */
140 static int parse_argv(int argc, char *argv[]) {
/* Long options; those without a short form use ARG_* codes. */
153 static const struct option options[] = {
154 { "help", no_argument, NULL, 'h' },
155 { "version", no_argument, NULL, ARG_VERSION },
156 { "directory", required_argument, NULL, 'D' },
157 { "user", required_argument, NULL, 'u' },
158 { "controllers", required_argument, NULL, 'C' },
159 { "private-network", no_argument, NULL, ARG_PRIVATE_NETWORK },
160 { "boot", no_argument, NULL, 'b' },
161 { "uuid", required_argument, NULL, ARG_UUID },
162 { "read-only", no_argument, NULL, ARG_READ_ONLY },
163 { "capability", required_argument, NULL, ARG_CAPABILITY },
164 { "link-journal", required_argument, NULL, ARG_LINK_JOURNAL },
165 { "bind", required_argument, NULL, ARG_BIND },
166 { "bind-ro", required_argument, NULL, ARG_BIND_RO },
167 { "machine", required_argument, NULL, 'M' },
/* "M:" has to be listed among the short options: options[] maps --machine to
 * 'M' and the help text advertises -M, but getopt_long() only accepts short
 * options that appear in this string. */
176 while ((c = getopt_long(argc, argv, "+hD:u:C:bM:j", options, NULL)) >= 0) {
185 puts(PACKAGE_STRING);
186 puts(SYSTEMD_FEATURES);
/* The container root is stored in canonicalized form. */
191 arg_directory = canonicalize_file_name(optarg);
192 if (!arg_directory) {
193 log_error("Failed to canonicalize root directory.");
201 arg_user = strdup(optarg);
/* A repeated controller list replaces the previous one. */
208 strv_free(arg_controllers);
209 arg_controllers = strv_split(optarg, ",");
210 if (!arg_controllers)
213 cg_shorten_controllers(arg_controllers);
216 case ARG_PRIVATE_NETWORK:
217 arg_private_network = true;
/* The machine name must pass hostname_is_valid() before it is copied. */
229 if (!hostname_is_valid(optarg)) {
230 log_error("Invalid machine name: %s", optarg);
235 arg_machine = strdup(optarg);
242 arg_read_only = true;
/* --capability= takes a comma-separated list of capability names; every
 * name that cap_from_name() resolves is added to the arg_retain mask. */
245 case ARG_CAPABILITY: {
249 FOREACH_WORD_SEPARATOR(word, length, optarg, ",", state) {
253 t = strndup(word, length);
257 if (cap_from_name(t, &cap) < 0) {
258 log_error("Failed to parse capability %s.", t);
264 arg_retain |= 1ULL << (uint64_t) cap;
271 arg_link_journal = LINK_GUEST;
274 case ARG_LINK_JOURNAL:
275 if (streq(optarg, "auto"))
276 arg_link_journal = LINK_AUTO;
277 else if (streq(optarg, "no"))
278 arg_link_journal = LINK_NO;
279 else if (streq(optarg, "guest"))
280 arg_link_journal = LINK_GUEST;
281 else if (streq(optarg, "host"))
282 arg_link_journal = LINK_HOST;
284 log_error("Failed to parse link journal mode %s", optarg);
/* Bind mount specification: x selects the list for --bind or --bind-ro, a is
 * the part of the argument before the ':' separator. Both paths have to be
 * absolute and are appended to the list one after the other. */
292 _cleanup_free_ char *a = NULL, *b = NULL;
297 x = c == ARG_BIND ? &arg_bind : &arg_bind_ro;
299 e = strchr(optarg, ':');
301 a = strndup(optarg, e - optarg);
311 if (!path_is_absolute(a) || !path_is_absolute(b)) {
312 log_error("Invalid bind mount specification: %s", optarg);
316 r = strv_extend(x, a);
320 r = strv_extend(x, b);
331 log_error("Unknown option code %c", c);
/* Mount the entries of mount_table below the directory dest.
 *
 * For every entry the target path is dest + "/" + where. An entry that names
 * a source (what != NULL) is checked against an already existing mount
 * point, per the "skip" comment below; entries without a source are
 * remounts. A failing mount() is only logged as an error for entries whose
 * last column (fatal) is true. */
339 static int mount_all(const char *dest) {
341 typedef struct MountPoint {
/* Columns as used below: what, where, file system type, options, flags, fatal. */
350 static const MountPoint mount_table[] = {
351 { "proc", "/proc", "proc", NULL, MS_NOSUID|MS_NOEXEC|MS_NODEV, true },
352 { "/proc/sys", "/proc/sys", NULL, NULL, MS_BIND, true }, /* Bind mount first */
353 { NULL, "/proc/sys", NULL, NULL, MS_BIND|MS_RDONLY|MS_REMOUNT, true }, /* Then, make it r/o */
354 { "sysfs", "/sys", "sysfs", NULL, MS_RDONLY|MS_NOSUID|MS_NOEXEC|MS_NODEV, true },
355 { "tmpfs", "/dev", "tmpfs", "mode=755", MS_NOSUID|MS_STRICTATIME, true },
356 { "devpts", "/dev/pts", "devpts","newinstance,ptmxmode=0666,mode=620,gid=" STRINGIFY(TTY_GID), MS_NOSUID|MS_NOEXEC, true },
357 { "tmpfs", "/dev/shm", "tmpfs", "mode=1777", MS_NOSUID|MS_NODEV|MS_STRICTATIME, true },
358 { "tmpfs", "/run", "tmpfs", "mode=755", MS_NOSUID|MS_NODEV|MS_STRICTATIME, true },
360 { "/sys/fs/selinux", "/sys/fs/selinux", NULL, NULL, MS_BIND, false }, /* Bind mount first */
361 { NULL, "/sys/fs/selinux", NULL, NULL, MS_BIND|MS_RDONLY|MS_REMOUNT, false }, /* Then, make it r/o */
368 for (k = 0; k < ELEMENTSOF(mount_table); k++) {
369 _cleanup_free_ char *where = NULL;
372 where = strjoin(dest, "/", mount_table[k].where, NULL);
/* t receives the result of the mount point check; negative values are
 * passed to strerror(-t) in the message below. */
376 t = path_is_mount_point(where, true);
378 log_error("Failed to detect whether %s is a mount point: %s", where, strerror(-t));
386 /* Skip this entry if it is not a remount. */
387 if (mount_table[k].what && t > 0)
/* The result of mkdir_p() is not checked here. */
390 mkdir_p(where, 0755);
392 if (mount(mount_table[k].what,
395 mount_table[k].flags,
396 mount_table[k].options) < 0 &&
397 mount_table[k].fatal) {
399 log_error("mount(%s) failed: %m", where);
/* Bind mount the path pairs of the string list l into the tree at dest.
 *
 * For each pair (*x, *y), *x is bind mounted onto dest + "/" + *y. When flags
 * is non-zero the new bind mount is remounted with MS_REMOUNT|MS_BIND|flags;
 * main() passes MS_RDONLY for the --bind-ro list and 0 for --bind. */
409 static int mount_binds(const char *dest, char **l, unsigned long flags) {
412 STRV_FOREACH_PAIR(x, y, l) {
413 _cleanup_free_ char *where = NULL;
415 where = strjoin(dest, "/", *y, NULL);
/* The result of mkdir_p_label() is not checked; a missing target shows up
 * as a mount() failure below. */
419 mkdir_p_label(where, 0755);
421 if (mount(*x, where, "bind", MS_BIND, NULL) < 0) {
422 log_error("mount(%s) failed: %m", where);
426 if (flags && mount(NULL, where, NULL, MS_REMOUNT|MS_BIND|flags, NULL) < 0) {
427 log_error("mount(%s) failed: %m", where);
/* Try to make dest/etc/localtime point at the same zone as /etc/localtime.
 *
 * The zone name z is whatever follows "../usr/share/zoneinfo/" or
 * "/usr/share/zoneinfo/" in the /etc/localtime symlink. If that symlink cannot
 * be used, or the zone file is missing below dest, a warning is logged and
 * the container's timezone is not updated. */
435 static int setup_timezone(const char *dest) {
436 _cleanup_free_ char *where = NULL, *p = NULL, *q = NULL, *check = NULL, *what = NULL;
442 /* Fix the timezone, if possible */
443 r = readlink_malloc("/etc/localtime", &p);
445 log_warning("/etc/localtime is not a symlink, not updating container timezone.");
449 z = path_startswith(p, "../usr/share/zoneinfo/");
451 z = path_startswith(p, "/usr/share/zoneinfo/");
453 log_warning("/etc/localtime does not point into /usr/share/zoneinfo/, not updating container timezone.");
/* y is the zone the container's own symlink names, extracted the same way. */
457 where = strappend(dest, "/etc/localtime");
461 r = readlink_malloc(where, &q);
463 y = path_startswith(q, "../usr/share/zoneinfo/");
465 y = path_startswith(q, "/usr/share/zoneinfo/");
468 /* Already pointing to the right place? Then do nothing .. */
469 if (y && streq(y, z))
/* Only link to a zone file that exists inside the container tree. */
473 check = strjoin(dest, "/usr/share/zoneinfo/", z, NULL);
477 if (access(check, F_OK) < 0) {
478 log_warning("Timezone %s does not exist in container, not updating container timezone.", z);
/* The new link uses the relative "../usr/share/zoneinfo/" form. */
482 what = strappend("../usr/share/zoneinfo/", z);
487 if (symlink(what, where) < 0) {
488 log_error("Failed to correct timezone of container: %m");
/* Bind mount /etc/resolv.conf onto dest/etc/resolv.conf and then remount it
 * read-only. arg_private_network is checked before anything else is done.
 * A failing bind mount is only a warning; a failing read-only remount is
 * logged as an error. */
495 static int setup_resolv_conf(const char *dest) {
496 char _cleanup_free_ *where = NULL;
497 _cleanup_close_ int fd = -1;
501 if (arg_private_network)
504 /* Fix resolv.conf, if possible */
505 where = strappend(dest, "/etc/resolv.conf");
/* O_CREAT|O_EXCL creates the file only if it does not exist yet; presumably
 * this provides a target to mount onto. */
509 fd = open(where, O_WRONLY|O_CREAT|O_EXCL|O_CLOEXEC|O_NOCTTY|O_NOFOLLOW, 0644);
511 /* We don't really care for the results of this really. If it
512 * fails, it fails, but meh... */
513 if (mount("/etc/resolv.conf", where, "bind", MS_BIND, NULL) < 0)
514 log_warning("Failed to bind mount /etc/resolv.conf: %m");
516 if (mount("/etc/resolv.conf", where, "bind",
517 MS_BIND|MS_REMOUNT|MS_RDONLY, NULL) < 0) {
518 log_error("Failed to remount /etc/resolv.conf readonly: %m");
/* Give the container its own boot ID.
 *
 * A random 128 bit ID is formatted as a UUID string, written to
 * dest/dev/proc-sys-kernel-random-boot-id and bind mounted over
 * dest/proc/sys/kernel/random/boot_id. Failing to make that mount read-only
 * afterwards is only a warning. */
525 static int setup_boot_id(const char *dest) {
526 _cleanup_free_ char *from = NULL, *to = NULL;
533 /* Generate a new randomized boot ID, so that each boot-up of
534 * the container gets a new one */
536 from = strappend(dest, "/dev/proc-sys-kernel-random-boot-id");
537 to = strappend(dest, "/proc/sys/kernel/random/boot_id");
541 r = sd_id128_randomize(&rnd);
543 log_error("Failed to generate random boot id: %s", strerror(-r));
/* Format as 8-4-4-4-12 hex digits. */
547 snprintf(as_uuid, sizeof(as_uuid),
548 "%02x%02x%02x%02x-%02x%02x-%02x%02x-%02x%02x-%02x%02x%02x%02x%02x%02x",
549 SD_ID128_FORMAT_VAL(rnd));
550 char_array_0(as_uuid);
552 r = write_string_file(from, as_uuid);
554 log_error("Failed to write boot id: %s", strerror(-r));
558 if (mount(from, to, "bind", MS_BIND, NULL) < 0) {
559 log_error("Failed to bind mount boot id: %m");
561 } else if (mount(from, to, "bind", MS_BIND|MS_REMOUNT|MS_RDONLY, NULL))
562 log_warning("Failed to make boot id read-only: %m");
/* Recreate the device nodes named in devnodes inside the tree at dest.
 *
 * For each name d, /dev/<d> is inspected with stat() and a node with the same
 * mode and device number is created at <dest>/dev/<d>. A stat() failure with
 * ENOENT is not logged; any other stat() failure, a source that is neither a
 * character nor a block device, and a failing mknod() are logged as errors.
 * The mknod() message reports the node path (to) rather than the container
 * root, so the failing device can be identified. */
568 static int copy_devnodes(const char *dest) {
570 static const char devnodes[] =
580 _cleanup_umask_ mode_t u;
586 NULSTR_FOREACH(d, devnodes) {
588 _cleanup_free_ char *from = NULL, *to = NULL;
/* from is the source path, to is the node to create below dest. */
590 asprintf(&from, "/dev/%s", d);
591 asprintf(&to, "%s/dev/%s", dest, d);
602 if (stat(from, &st) < 0) {
604 if (errno != ENOENT) {
605 log_error("Failed to stat %s: %m", from);
610 } else if (!S_ISCHR(st.st_mode) && !S_ISBLK(st.st_mode)) {
612 log_error("%s is not a char or block device, cannot copy", from);
616 } else if (mknod(to, st.st_mode, st.st_rdev) < 0) {
618 log_error("mknod(%s) failed: %m", to);
/* Create dest/dev/ptmx as a symlink to "pts/ptmx" and log an error if
 * symlink() fails. */
627 static int setup_ptmx(const char *dest) {
628 _cleanup_free_ char *p = NULL;
630 p = strappend(dest, "/dev/ptmx");
634 if (symlink("pts/ptmx", p) < 0) {
635 log_error("Failed to create /dev/ptmx symlink: %m");
/* Make the terminal named by console available as dest/dev/console.
 *
 * console must be a character device. Its access mode is set to 0600 with
 * owner 0:0, a node with the same device number is created at
 * dest/dev/console, and console is then bind mounted over that node. */
642 static int setup_dev_console(const char *dest, const char *console) {
644 _cleanup_free_ char *to = NULL;
646 _cleanup_umask_ mode_t u;
653 if (stat(console, &st) < 0) {
654 log_error("Failed to stat %s: %m", console);
657 } else if (!S_ISCHR(st.st_mode)) {
658 log_error("/dev/console is not a char device");
662 r = chmod_and_chown(console, 0600, 0, 0);
664 log_error("Failed to correct access mode for TTY: %s", strerror(-r));
668 if (asprintf(&to, "%s/dev/console", dest) < 0)
671 /* We need to bind mount the right tty to /dev/console since
672 * ptys can only exist on pts file systems. To have something
673 * to bind mount things on we create a device node first, that
674 * has the right major/minor (note that the major minor
675 * doesn't actually matter here, since we mount it over
678 if (mknod(to, (st.st_mode & ~07777) | 0600, st.st_rdev) < 0) {
679 log_error("mknod() for /dev/console failed: %m");
683 if (mount(console, to, "bind", MS_BIND, NULL) < 0) {
684 log_error("Bind mount for /dev/console failed: %m");
/* Provide a FIFO as the container's /proc/kmsg.
 *
 * A FIFO is created at dest/dev/kmsg, bind mounted to dest/proc/kmsg and
 * opened; the open descriptor is then sent over kmsg_socket as an SCM_RIGHTS
 * control message, so that it stays open while the child runs.
 *
 * kmsg_socket: socket to send the descriptor over, must be >= 0. */
691 static int setup_kmsg(const char *dest, int kmsg_socket) {
692 _cleanup_free_ char *from = NULL, *to = NULL;
694 _cleanup_umask_ mode_t u;
/* Control message buffer with room for exactly one file descriptor. */
696 struct cmsghdr cmsghdr;
697 uint8_t buf[CMSG_SPACE(sizeof(int))];
700 .msg_control = &control,
701 .msg_controllen = sizeof(control),
703 struct cmsghdr *cmsg;
706 assert(kmsg_socket >= 0);
710 /* We create the kmsg FIFO as /dev/kmsg, but immediately
711 * delete it after bind mounting it to /proc/kmsg. While FIFOs
712 * on the reading side behave very similar to /proc/kmsg,
713 * their writing side behaves differently from /dev/kmsg in
714 * that writing blocks when nothing is reading. In order to
715 * avoid any problems with containers deadlocking due to this
716 * we simply make /dev/kmsg unavailable to the container. */
717 if (asprintf(&from, "%s/dev/kmsg", dest) < 0 ||
718 asprintf(&to, "%s/proc/kmsg", dest) < 0)
721 if (mkfifo(from, 0600) < 0) {
722 log_error("mkfifo() for /dev/kmsg failed: %m");
726 r = chmod_and_chown(from, 0600, 0, 0);
728 log_error("Failed to correct access mode for /dev/kmsg: %s", strerror(-r));
732 if (mount(from, to, "bind", MS_BIND, NULL) < 0) {
733 log_error("Bind mount for /proc/kmsg failed: %m");
737 fd = open(from, O_RDWR|O_NDELAY|O_CLOEXEC);
739 log_error("Failed to open fifo: %m");
/* Pack fd into a single SCM_RIGHTS control message. */
743 cmsg = CMSG_FIRSTHDR(&mh);
744 cmsg->cmsg_level = SOL_SOCKET;
745 cmsg->cmsg_type = SCM_RIGHTS;
746 cmsg->cmsg_len = CMSG_LEN(sizeof(int));
747 memcpy(CMSG_DATA(cmsg), &fd, sizeof(int));
749 mh.msg_controllen = cmsg->cmsg_len;
751 /* Store away the fd in the socket, so that it stays open as
752 * long as we run the child */
753 k = sendmsg(kmsg_socket, &mh, MSG_DONTWAIT|MSG_NOSIGNAL);
/* The local descriptor is closed once sendmsg() has returned, whatever its
 * result. */
754 close_nointr_nofail(fd);
757 log_error("Failed to send FIFO fd: %m");
761 /* And now make the FIFO unavailable as /dev/kmsg... */
/* Set the hostname to arg_machine using sethostname(). */
766 static int setup_hostname(void) {
768 if (sethostname(arg_machine, strlen(arg_machine)) < 0)
/* Connect the journal directory of the container rooted at directory with
 * the host, as selected by arg_link_journal.
 *
 * The machine ID is read from directory/etc/machine-id. After that, p is the
 * host side path /var/log/journal/<id> and q is the same path below
 * directory. Nothing is done for LINK_NO. In LINK_AUTO mode a missing or
 * empty machine ID is tolerated, and existing mount points are only an error
 * for the other modes. */
774 static int setup_journal(const char *directory) {
775 sd_id128_t machine_id;
776 _cleanup_free_ char *p = NULL, *b = NULL, *q = NULL, *d = NULL;
780 if (arg_link_journal == LINK_NO)
783 p = strappend(directory, "/etc/machine-id");
787 r = read_one_line_file(p, &b);
788 if (r == -ENOENT && arg_link_journal == LINK_AUTO)
791 log_error("Failed to read machine ID from %s: %s", p, strerror(-r));
796 if (isempty(id) && arg_link_journal == LINK_AUTO)
799 /* Verify validity */
800 r = sd_id128_from_string(id, &machine_id);
802 log_error("Failed to parse machine ID from %s: %s", p, strerror(-r));
/* From here on p is the host path and q the path inside the container tree. */
807 p = strappend("/var/log/journal/", id);
808 q = strjoin(directory, "/var/log/journal/", id, NULL);
812 if (path_is_mount_point(p, false) > 0) {
813 if (arg_link_journal != LINK_AUTO) {
814 log_error("%s: already a mount point, refusing to use for journal", p);
821 if (path_is_mount_point(q, false) > 0) {
822 if (arg_link_journal != LINK_AUTO) {
823 log_error("%s: already a mount point, refusing to use for journal", q);
/* Inspect what currently exists at the host path: r tells a symlink apart
 * from a non-symlink (-EINVAL) and from a missing entry (-ENOENT). */
830 r = readlink_and_make_absolute(p, &d);
832 if ((arg_link_journal == LINK_GUEST ||
833 arg_link_journal == LINK_AUTO) &&
836 r = mkdir_p(q, 0755);
838 log_warning("failed to create directory %s: %m", q);
843 log_error("Failed to remove symlink %s: %m", p);
846 } else if (r == -EINVAL) {
848 if (arg_link_journal == LINK_GUEST &&
851 if (errno == ENOTDIR) {
852 log_error("%s already exists and is neither a symlink nor a directory", p);
855 log_error("Failed to remove %s: %m", p);
859 } else if (r != -ENOENT) {
860 log_error("readlink(%s) failed: %m", p);
/* LINK_GUEST: the host path becomes a symlink to the guest's directory. */
864 if (arg_link_journal == LINK_GUEST) {
866 if (symlink(q, p) < 0) {
867 log_error("Failed to symlink %s to %s: %m", q, p);
871 r = mkdir_p(q, 0755);
873 log_warning("failed to create directory %s: %m", q);
/* LINK_HOST: the host directory is created; it is bind mounted into the
 * guest at the end of the function. */
877 if (arg_link_journal == LINK_HOST) {
878 r = mkdir_p(p, 0755);
880 log_error("Failed to create %s: %m", p);
884 } else if (access(p, F_OK) < 0)
887 if (dir_is_empty(q) == 0) {
888 log_error("%s not empty.", q);
892 r = mkdir_p(q, 0755);
894 log_error("Failed to create %s: %m", q);
898 if (mount(p, q, "bind", MS_BIND, NULL) < 0) {
899 log_error("Failed to bind mount journal from host into guest: %m");
/* Create the cgroup path in the SYSTEMD_CGROUP_CONTROLLER hierarchy and in
 * every controller listed in arg_controllers, using cg_create_and_attach().
 * A failure in the systemd hierarchy is logged as an error, a failure in one
 * of the additional controllers only as a warning. */
906 static int setup_cgroup(const char *path) {
910 r = cg_create_and_attach(SYSTEMD_CGROUP_CONTROLLER, path, 1);
912 log_error("Failed to create cgroup: %s", strerror(-r));
916 STRV_FOREACH(c, arg_controllers) {
917 r = cg_create_and_attach(*c, path, 1);
919 log_warning("Failed to create cgroup in controller %s: %s", *c, strerror(-r));
/* Record container metadata as extended attributes on the cgroup directory
 * /sys/fs/cgroup/systemd/<cgroup>.
 *
 * cgroup:    cgroup path, appended to /sys/fs/cgroup/systemd/
 * pid:       stored in decimal as "trusted.init_pid"
 * uuid:      stored as "trusted.machine_id"
 * directory: stored as "trusted.root_directory"
 *
 * All attributes are set with XATTR_CREATE. A failing setxattr() only
 * produces a warning, and each warning names the attribute that could not be
 * set. */
925 static int save_attributes(const char *cgroup, pid_t pid, const char *uuid, const char *directory) {
926 char buf[DECIMAL_STR_MAX(pid_t)], path[PATH_MAX];
931 assert(arg_directory);
934 assert_se(snprintf(buf, sizeof(buf), "%lu", (unsigned long) pid) < (int) sizeof(buf));
/* snprintf() returns the untruncated length, so r >= sizeof(path) means the
 * path did not fit. */
936 r = snprintf(path, sizeof(path), "/sys/fs/cgroup/systemd/%s", cgroup);
937 if (r >= (int) sizeof(path)) {
938 log_error("cgroup name too long");
942 r = setxattr(path, "trusted.init_pid", buf, strlen(buf), XATTR_CREATE);
944 log_warning("Failed to set %s attribute on %s: %m", "trusted.init_pid", path);
947 k = setxattr(path, "trusted.machine_id", uuid, strlen(uuid), XATTR_CREATE);
949 log_warning("Failed to set %s attribute on %s: %m", "trusted.machine_id", path);
955 k = setxattr(path, "trusted.root_directory", directory, strlen(directory), XATTR_CREATE);
957 log_warning("Failed to set %s attribute on %s: %m", "trusted.root_directory", path);
/* Drop every capability that is not set in arg_retain from the bounding set
 * and return the result of capability_bounding_set_drop(). */
965 static int drop_capabilities(void) {
966 return capability_bounding_set_drop(~arg_retain, false);
/* Forward data between this process' stdin/stdout and the pty master, and
 * handle the signals in mask through a signalfd.
 *
 * master: pty master file descriptor
 * pid:    process that receives SIGRTMIN+3 on the first SIGTERM when
 *         arg_boot is set
 * mask:   signal set handed to signalfd()
 *
 * Data read from stdin is buffered in in_buffer and written to master; data
 * read from master is buffered in out_buffer and written to stdout. Both
 * buffers hold up to LINE_MAX bytes. */
969 static int process_pty(int master, pid_t pid, sigset_t *mask) {
971 char in_buffer[LINE_MAX], out_buffer[LINE_MAX];
972 size_t in_buffer_full = 0, out_buffer_full = 0;
973 struct epoll_event stdin_ev, stdout_ev, master_ev, signal_ev;
974 bool stdin_readable = false, stdout_writable = false, master_readable = false, master_writable = false;
975 int ep = -1, signal_fd = -1, r;
976 bool tried_orderly_shutdown = false;
982 fd_nonblock(STDIN_FILENO, 1);
983 fd_nonblock(STDOUT_FILENO, 1);
984 fd_nonblock(master, 1);
986 signal_fd = signalfd(-1, mask, SFD_NONBLOCK|SFD_CLOEXEC);
988 log_error("signalfd(): %m");
993 ep = epoll_create1(EPOLL_CLOEXEC);
995 log_error("Failed to create epoll: %m");
1000 /* We read from STDIN only if this is actually a TTY,
1001 * otherwise we assume non-interactivity. */
1002 if (isatty(STDIN_FILENO)) {
1004 stdin_ev.events = EPOLLIN|EPOLLET;
1005 stdin_ev.data.fd = STDIN_FILENO;
1007 if (epoll_ctl(ep, EPOLL_CTL_ADD, STDIN_FILENO, &stdin_ev) < 0) {
1008 log_error("Failed to register STDIN in epoll: %m");
/* stdin, stdout and master are watched edge-triggered (EPOLLET); the
 * *_readable/*_writable flags remember readiness until a read or write
 * fails with one of the errno values handled in the loop below. */
1015 stdout_ev.events = EPOLLOUT|EPOLLET;
1016 stdout_ev.data.fd = STDOUT_FILENO;
1019 master_ev.events = EPOLLIN|EPOLLOUT|EPOLLET;
1020 master_ev.data.fd = master;
1023 signal_ev.events = EPOLLIN;
1024 signal_ev.data.fd = signal_fd;
1026 if (epoll_ctl(ep, EPOLL_CTL_ADD, STDOUT_FILENO, &stdout_ev) < 0) {
1027 if (errno != EPERM) {
1028 log_error("Failed to register stdout in epoll: %m");
1032 /* stdout without epoll support. Likely redirected to regular file. */
1033 stdout_writable = true;
1036 if (epoll_ctl(ep, EPOLL_CTL_ADD, master, &master_ev) < 0 ||
1037 epoll_ctl(ep, EPOLL_CTL_ADD, signal_fd, &signal_ev) < 0) {
1038 log_error("Failed to register fds in epoll: %m");
1044 struct epoll_event ev[16];
/* Block until at least one descriptor has an event (timeout -1). */
1048 nfds = epoll_wait(ep, ev, ELEMENTSOF(ev), -1);
1051 if (errno == EINTR || errno == EAGAIN)
1054 log_error("epoll_wait(): %m");
/* Translate the reported events into the readiness flags; EPOLLHUP counts
 * as ready. */
1061 for (i = 0; i < nfds; i++) {
1062 if (ev[i].data.fd == STDIN_FILENO) {
1064 if (ev[i].events & (EPOLLIN|EPOLLHUP))
1065 stdin_readable = true;
1067 } else if (ev[i].data.fd == STDOUT_FILENO) {
1069 if (ev[i].events & (EPOLLOUT|EPOLLHUP))
1070 stdout_writable = true;
1072 } else if (ev[i].data.fd == master) {
1074 if (ev[i].events & (EPOLLIN|EPOLLHUP))
1075 master_readable = true;
1077 if (ev[i].events & (EPOLLOUT|EPOLLHUP))
1078 master_writable = true;
1080 } else if (ev[i].data.fd == signal_fd) {
1081 struct signalfd_siginfo sfsi;
1084 n = read(signal_fd, &sfsi, sizeof(sfsi));
1085 if (n != sizeof(sfsi)) {
1088 log_error("Failed to read from signalfd: invalid block size");
1093 if (errno != EINTR && errno != EAGAIN) {
1094 log_error("Failed to read from signalfd: %m");
1100 if (sfsi.ssi_signo == SIGWINCH) {
1103 /* The window size changed, let's forward that. */
1104 if (ioctl(STDIN_FILENO, TIOCGWINSZ, &ws) >= 0)
1105 ioctl(master, TIOCSWINSZ, &ws);
1106 } else if (sfsi.ssi_signo == SIGTERM && arg_boot && !tried_orderly_shutdown) {
1108 log_info("Trying to halt container. Send SIGTERM again to trigger immediate termination.");
1110 /* This only works for systemd... */
1111 tried_orderly_shutdown = true;
1112 kill(pid, SIGRTMIN+3);
/* Copy loop: runs as long as either direction can make progress, i.e. a
 * source is readable while its buffer is empty, or a sink is writable
 * while its buffer holds data. */
1122 while ((stdin_readable && in_buffer_full <= 0) ||
1123 (master_writable && in_buffer_full > 0) ||
1124 (master_readable && out_buffer_full <= 0) ||
1125 (stdout_writable && out_buffer_full > 0)) {
1127 if (stdin_readable && in_buffer_full < LINE_MAX) {
1129 k = read(STDIN_FILENO, in_buffer + in_buffer_full, LINE_MAX - in_buffer_full);
1132 if (errno == EAGAIN || errno == EPIPE || errno == ECONNRESET || errno == EIO)
1133 stdin_readable = false;
1135 log_error("read(): %m");
1140 in_buffer_full += (size_t) k;
1143 if (master_writable && in_buffer_full > 0) {
1145 k = write(master, in_buffer, in_buffer_full);
1148 if (errno == EAGAIN || errno == EPIPE || errno == ECONNRESET || errno == EIO)
1149 master_writable = false;
1151 log_error("write(): %m");
/* Remove the k bytes just written from the front of the buffer. */
1157 assert(in_buffer_full >= (size_t) k);
1158 memmove(in_buffer, in_buffer + k, in_buffer_full - k);
1159 in_buffer_full -= k;
1163 if (master_readable && out_buffer_full < LINE_MAX) {
1165 k = read(master, out_buffer + out_buffer_full, LINE_MAX - out_buffer_full);
1168 if (errno == EAGAIN || errno == EPIPE || errno == ECONNRESET || errno == EIO)
1169 master_readable = false;
1171 log_error("read(): %m");
1176 out_buffer_full += (size_t) k;
1179 if (stdout_writable && out_buffer_full > 0) {
1181 k = write(STDOUT_FILENO, out_buffer, out_buffer_full);
1184 if (errno == EAGAIN || errno == EPIPE || errno == ECONNRESET || errno == EIO)
1185 stdout_writable = false;
1187 log_error("write(): %m");
/* Same front-of-buffer removal for the output direction. */
1193 assert(out_buffer_full >= (size_t) k);
1194 memmove(out_buffer, out_buffer + k, out_buffer_full - k);
1195 out_buffer_full -= k;
1203 close_nointr_nofail(ep);
1206 close_nointr_nofail(signal_fd);
/* Program entry point.
 *
 * Parses the command line, checks the environment (effective uid 0, booted
 * with systemd, the directory is an OS tree other than "/"), allocates a
 * pseudo tty and clones a child into new IPC, mount, PID and UTS namespaces
 * (plus a network namespace when arg_private_network is set). The child sets
 * up the container tree below arg_directory, switches into it, adjusts
 * credentials and environment, and executes an init binary, the given
 * command, or /bin/bash. The parent forwards terminal I/O with process_pty()
 * and evaluates how the child terminated.
 *
 * The error messages of the credential calls name the functions that are
 * actually invoked, setresgid() and setresuid(). */
1211 int main(int argc, char *argv[]) {
1213 int r = EXIT_FAILURE, k;
1214 _cleanup_free_ char *machine_root = NULL, *newcg = NULL;
1215 _cleanup_close_ int master = -1;
1217 const char *console = NULL;
1218 struct termios saved_attr, raw_attr;
1220 bool saved_attr_valid = false;
1222 int kmsg_socket_pair[2] = { -1, -1 };
1225 log_parse_environment();
1228 r = parse_argv(argc, argv);
/* The container root is made absolute; get_current_dir_name() supplies the
 * working directory as an alternative value. */
1232 if (arg_directory) {
1235 p = path_make_absolute_cwd(arg_directory);
1236 free(arg_directory);
1239 arg_directory = get_current_dir_name();
1241 if (!arg_directory) {
1242 log_error("Failed to determine path, please use -D.");
1246 path_kill_slashes(arg_directory);
/* Machine name derived from the last component of the root directory. */
1249 arg_machine = strdup(path_get_file_name(arg_directory));
1255 hostname_cleanup(arg_machine);
1256 if (isempty(arg_machine)) {
1257 log_error("Failed to determine machine name automatically, please use -M.");
1262 if (geteuid() != 0) {
1263 log_error("Need to be root.");
1267 if (sd_booted() <= 0) {
1268 log_error("Not running on a systemd system.");
1272 if (path_equal(arg_directory, "/")) {
1273 log_error("Spawning container on root directory not supported.");
1277 if (path_is_os_tree(arg_directory) <= 0) {
1278 log_error("Directory %s doesn't look like an OS root directory. Refusing.", arg_directory);
/* File descriptors passed in via socket activation are collected in fds. */
1283 n_fd_passed = sd_listen_fds(false);
1284 if (n_fd_passed > 0) {
1285 k = fdset_new_listen_fds(&fds, false);
1287 log_error("Failed to collect file descriptors: %s", strerror(-k));
1291 fdset_close_others(fds);
/* The container's cgroup is <machine root>/<machine name>. */
1294 k = cg_get_machine_path(&machine_root);
1296 log_error("Failed to determine machine cgroup path: %s", strerror(-k));
1300 newcg = strjoin(machine_root, "/", arg_machine, NULL);
1302 log_error("Failed to allocate cgroup path.");
1306 r = cg_is_empty_recursive(SYSTEMD_CGROUP_CONTROLLER, newcg, false);
1307 if (r <= 0 && r != -ENOENT) {
1308 log_error("Container already running.");
/* Allocate the pseudo tty that becomes the container's console. */
1316 master = posix_openpt(O_RDWR|O_NOCTTY|O_CLOEXEC|O_NDELAY);
1318 log_error("Failed to acquire pseudo tty: %m");
1322 console = ptsname(master);
1324 log_error("Failed to determine tty name: %m");
1328 log_info("Spawning namespace container on %s (console is %s).", arg_directory, console);
1330 if (ioctl(STDIN_FILENO, TIOCGWINSZ, &ws) >= 0)
1331 ioctl(master, TIOCSWINSZ, &ws);
1333 if (unlockpt(master) < 0) {
1334 log_error("Failed to unlock tty: %m");
/* Keep the current terminal settings so they can be restored later, and
 * prepare a raw, non-echoing variant. */
1338 if (tcgetattr(STDIN_FILENO, &saved_attr) >= 0) {
1339 saved_attr_valid = true;
1341 raw_attr = saved_attr;
1342 cfmakeraw(&raw_attr);
1343 raw_attr.c_lflag &= ~ECHO;
1346 if (socketpair(AF_UNIX, SOCK_DGRAM|SOCK_NONBLOCK|SOCK_CLOEXEC, 0, kmsg_socket_pair) < 0) {
1347 log_error("Failed to create kmsg socket pair.");
/* These signals are blocked here and read through the signalfd created in
 * process_pty(). */
1351 assert_se(sigemptyset(&mask) == 0);
1352 sigset_add_many(&mask, SIGCHLD, SIGWINCH, SIGTERM, SIGINT, -1);
1353 assert_se(sigprocmask(SIG_BLOCK, &mask, NULL) == 0);
/* Two pipes synchronize parent and child: the child waits on pipefd, the
 * parent waits on pipefd2. */
1357 int pipefd[2], pipefd2[2];
1359 if (pipe2(pipefd, O_NONBLOCK|O_CLOEXEC) < 0) {
1360 log_error("pipe2(): %m");
1364 if (pipe2(pipefd2, O_NONBLOCK|O_CLOEXEC) < 0) {
1365 log_error("pipe2(): %m");
1370 pid = syscall(__NR_clone, SIGCHLD|CLONE_NEWIPC|CLONE_NEWNS|CLONE_NEWPID|CLONE_NEWUTS|(arg_private_network ? CLONE_NEWNET : 0), NULL);
1372 if (errno == EINVAL)
1373 log_error("clone() failed, do you have namespace support enabled in your kernel? (You need UTS, IPC, PID and NET namespacing built in): %m");
1375 log_error("clone() failed: %m");
1382 const char *home = NULL;
1383 uid_t uid = (uid_t) -1;
1384 gid_t gid = (gid_t) -1;
/* Environment for the container payload; the NULL slots are filled in
 * further down via n_env. */
1386 const char *envp[] = {
1387 "PATH=/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin",
1388 "container=systemd-nspawn", /* LXC sets container=lxc, so follow the scheme here */
1393 NULL, /* container_uuid */
1394 NULL, /* LISTEN_FDS */
1395 NULL, /* LISTEN_PID */
1399 envp[n_env] = strv_find_prefix(environ, "TERM=");
1403 /* Wait for the parent process to log our PID */
1404 close_nointr_nofail(pipefd[1]);
1405 fd_wait_for_event(pipefd[0], POLLHUP, -1);
1406 close_nointr_nofail(pipefd[0]);
1408 close_nointr_nofail(master);
1411 if (saved_attr_valid) {
1412 if (tcsetattr(STDIN_FILENO, TCSANOW, &raw_attr) < 0) {
1413 log_error("Failed to set terminal attributes: %m");
/* Replace stdin, stdout and stderr with the container console. */
1418 close_nointr(STDIN_FILENO);
1419 close_nointr(STDOUT_FILENO);
1420 close_nointr(STDERR_FILENO);
1422 close_nointr_nofail(kmsg_socket_pair[0]);
1423 kmsg_socket_pair[0] = -1;
1425 reset_all_signal_handlers();
1427 assert_se(sigemptyset(&mask) == 0);
1428 assert_se(sigprocmask(SIG_SETMASK, &mask, NULL) == 0);
1430 k = open_terminal(console, O_RDWR);
1431 if (k != STDIN_FILENO) {
1433 close_nointr_nofail(k);
1437 log_error("Failed to open console: %s", strerror(-k));
1441 if (dup2(STDIN_FILENO, STDOUT_FILENO) != STDOUT_FILENO ||
1442 dup2(STDIN_FILENO, STDERR_FILENO) != STDERR_FILENO) {
1443 log_error("Failed to duplicate console: %m");
1448 log_error("setsid() failed: %m");
/* The child is killed with SIGKILL when the parent dies. */
1452 if (prctl(PR_SET_PDEATHSIG, SIGKILL) < 0) {
1453 log_error("PR_SET_PDEATHSIG failed: %m");
1457 if (setup_cgroup(newcg) < 0)
/* Closing both ends of pipefd2 lets the parent, which waits for POLLHUP on
 * pipefd2[0], continue. */
1460 close_nointr_nofail(pipefd2[1]);
1461 close_nointr_nofail(pipefd2[0]);
1463 /* Mark everything as slave, so that we still
1464 * receive mounts from the real root, but don't
1465 * propagate mounts to the real root. */
1466 if (mount(NULL, "/", NULL, MS_SLAVE|MS_REC, NULL) < 0) {
1467 log_error("MS_SLAVE|MS_REC failed: %m");
1471 /* Turn directory into bind mount */
1472 if (mount(arg_directory, arg_directory, "bind", MS_BIND|MS_REC, NULL) < 0) {
1473 log_error("Failed to make bind mount.");
1478 if (mount(arg_directory, arg_directory, "bind", MS_BIND|MS_REMOUNT|MS_RDONLY|MS_REC, NULL) < 0) {
1479 log_error("Failed to make read-only.");
/* Populate the container tree. */
1483 if (mount_all(arg_directory) < 0)
1486 if (copy_devnodes(arg_directory) < 0)
1489 if (setup_ptmx(arg_directory) < 0)
1492 dev_setup(arg_directory);
1494 if (setup_dev_console(arg_directory, console) < 0)
1497 if (setup_kmsg(arg_directory, kmsg_socket_pair[1]) < 0)
1500 close_nointr_nofail(kmsg_socket_pair[1]);
1501 kmsg_socket_pair[1] = -1;
1503 if (setup_boot_id(arg_directory) < 0)
1506 if (setup_timezone(arg_directory) < 0)
1509 if (setup_resolv_conf(arg_directory) < 0)
1512 if (setup_journal(arg_directory) < 0)
1515 if (mount_binds(arg_directory, arg_bind, 0) < 0)
1518 if (mount_binds(arg_directory, arg_bind_ro, MS_RDONLY) < 0)
/* Switch the root: move the prepared tree onto "/", then chroot into it. */
1521 if (chdir(arg_directory) < 0) {
1522 log_error("chdir(%s) failed: %m", arg_directory);
1526 if (mount(arg_directory, "/", NULL, MS_MOVE, NULL) < 0) {
1527 log_error("mount(MS_MOVE) failed: %m");
1531 if (chroot(".") < 0) {
1532 log_error("chroot() failed: %m");
1536 if (chdir("/") < 0) {
1537 log_error("chdir() failed: %m");
1545 if (drop_capabilities() < 0) {
1546 log_error("drop_capabilities() failed: %m");
1552 /* Note that this resolves user names
1553 * inside the container, and hence
1554 * accesses the NSS modules from the
1555 * container and not the host. This is
1558 if (get_user_creds((const char**)&arg_user, &uid, &gid, &home, NULL) < 0) {
1559 log_error("get_user_creds() failed: %m");
1563 if (mkdir_parents_label(home, 0775) < 0) {
1564 log_error("mkdir_parents_label() failed: %m");
1568 if (mkdir_safe_label(home, 0775, uid, gid) < 0) {
1569 log_error("mkdir_safe_label() failed: %m");
1573 if (initgroups((const char*)arg_user, gid) < 0) {
1574 log_error("initgroups() failed: %m");
1578 if (setresgid(gid, gid, gid) < 0) {
1579 log_error("setresgid() failed: %m");
1583 if (setresuid(uid, uid, uid) < 0) {
1584 log_error("setresuid() failed: %m");
1588 /* Reset everything fully to 0, just in case */
1590 if (setgroups(0, NULL) < 0) {
1591 log_error("setgroups() failed: %m");
1595 if (setresgid(0, 0, 0) < 0) {
1596 log_error("setresgid() failed: %m");
1600 if (setresuid(0, 0, 0) < 0) {
1601 log_error("setresuid() failed: %m");
/* Fill the remaining environment slots. */
1606 if ((asprintf((char**)(envp + n_env++), "HOME=%s", home ? home: "/root") < 0) ||
1607 (asprintf((char**)(envp + n_env++), "USER=%s", arg_user ? arg_user : "root") < 0) ||
1608 (asprintf((char**)(envp + n_env++), "LOGNAME=%s", arg_user ? arg_user : "root") < 0)) {
1614 if (asprintf((char**)(envp + n_env++), "container_uuid=%s", arg_uuid) < 0) {
/* Passed-in descriptors are kept across exec and announced through
 * LISTEN_FDS/LISTEN_PID, with PID 1 as the receiver. */
1620 if (fdset_size(fds) > 0) {
1621 k = fdset_cloexec(fds, false);
1623 log_error("Failed to unset O_CLOEXEC for file descriptors.");
1627 if ((asprintf((char **)(envp + n_env++), "LISTEN_FDS=%u", n_fd_passed) < 0) ||
1628 (asprintf((char **)(envp + n_env++), "LISTEN_PID=%lu", (unsigned long) 1) < 0)) {
1640 /* Automatically search for the init system */
/* a[0] is set to each candidate in turn; execve() only returns when it
 * failed, so the next candidate is tried. */
1642 l = 1 + argc - optind;
1643 a = newa(char*, l + 1);
1644 memcpy(a + 1, argv + optind, l * sizeof(char*));
1646 a[0] = (char*) "/usr/lib/systemd/systemd";
1647 execve(a[0], a, (char**) envp);
1649 a[0] = (char*) "/lib/systemd/systemd";
1650 execve(a[0], a, (char**) envp);
1652 a[0] = (char*) "/sbin/init";
1653 execve(a[0], a, (char**) envp);
1654 } else if (argc > optind)
1655 execvpe(argv[optind], argv + optind, (char**) envp);
/* A login shell started from the home directory. */
1657 chdir(home ? home : "/root");
1658 execle("/bin/bash", "-bash", NULL, (char**) envp);
1661 log_error("execv() failed: %m");
1664 _exit(EXIT_FAILURE);
/* Parent side: closing pipefd releases the child, which waits on it. */
1667 log_info("Init process in the container running as PID %lu.", (unsigned long) pid);
1668 close_nointr_nofail(pipefd[0]);
1669 close_nointr_nofail(pipefd[1]);
1671 /* Wait for the child process to establish cgroup hierarchy */
1672 close_nointr_nofail(pipefd2[1]);
1673 fd_wait_for_event(pipefd2[0], POLLHUP, -1);
1674 close_nointr_nofail(pipefd2[0]);
1676 save_attributes(newcg, pid, arg_uuid, arg_directory);
1681 if (process_pty(master, pid, &mask) < 0)
1684 if (saved_attr_valid)
1685 tcsetattr(STDIN_FILENO, TCSANOW, &saved_attr);
/* Evaluate how the container's init process terminated. */
1687 r = wait_for_terminate(pid, &status);
1693 if (status.si_code == CLD_EXITED) {
1694 if (status.si_status != 0) {
1695 log_error("Container failed with error code %i.", status.si_status);
1696 r = status.si_status;
1700 log_debug("Container exited successfully.");
1702 } else if (status.si_code == CLD_KILLED &&
1703 status.si_status == SIGINT) {
1704 log_info("Container has been shut down.");
1707 } else if (status.si_code == CLD_KILLED &&
1708 status.si_status == SIGHUP) {
1709 log_info("Container is being rebooted.");
1711 } else if (status.si_code == CLD_KILLED ||
1712 status.si_code == CLD_DUMPED) {
1714 log_error("Container terminated by signal %s.", signal_to_string(status.si_status));
1718 log_error("Container failed due to unknown reason.");
/* Cleanup: restore the terminal, close the kmsg sockets, kill what is left
 * in the container's cgroup and free the settings. */
1725 if (saved_attr_valid)
1726 tcsetattr(STDIN_FILENO, TCSANOW, &saved_attr);
1728 close_pipe(kmsg_socket_pair);
1731 cg_kill_recursive_and_wait(SYSTEMD_CGROUP_CONTROLLER, newcg, true);
1733 free(arg_directory);
1735 strv_free(arg_controllers);