1 /*-*- Mode: C; c-basic-offset: 8; indent-tabs-mode: nil -*-*/
4 This file is part of systemd.
6 Copyright 2010 Lennart Poettering
8 systemd is free software; you can redistribute it and/or modify it
9 under the terms of the GNU Lesser General Public License as published by
10 the Free Software Foundation; either version 2.1 of the License, or
11 (at your option) any later version.
13 systemd is distributed in the hope that it will be useful, but
14 WITHOUT ANY WARRANTY; without even the implied warranty of
15 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
16 Lesser General Public License for more details.
18 You should have received a copy of the GNU Lesser General Public License
19 along with systemd; If not, see <http://www.gnu.org/licenses/>.
25 #include <attr/xattr.h>
26 #include <sys/types.h>
27 #include <sys/syscall.h>
28 #include <sys/mount.h>
34 #include <sys/prctl.h>
35 #include <sys/capability.h>
38 #include <sys/epoll.h>
40 #include <sys/signalfd.h>
44 #include <sys/socket.h>
46 #include <systemd/sd-daemon.h>
54 #include "cgroup-util.h"
56 #include "path-util.h"
57 #include "loopback-setup.h"
59 #include "dev-setup.h"
/* Journal linking mode; the values referenced in this file are LINK_NO,
 * LINK_AUTO, LINK_GUEST and LINK_HOST. */
68 typedef enum LinkJournal {
/* Settings gathered from the command line. Most of them are assigned in
 * parse_argv(); main() additionally assigns arg_directory and arg_machine. */
75 static char *arg_directory = NULL;
76 static char *arg_user = NULL;
77 static char **arg_controllers = NULL;
78 static char *arg_uuid = NULL;
79 static char *arg_machine = NULL;
80 static bool arg_private_network = false;
81 static bool arg_read_only = false;
82 static bool arg_boot = false;
83 static LinkJournal arg_link_journal = LINK_AUTO;
/* Bit mask (1ULL << CAP_*) of the capabilities to keep. drop_capabilities()
 * hands the complement (~arg_retain) to capability_bounding_set_drop(), and
 * --capability= ORs additional bits into this default set. */
84 static uint64_t arg_retain =
86 (1ULL << CAP_DAC_OVERRIDE) |
87 (1ULL << CAP_DAC_READ_SEARCH) |
88 (1ULL << CAP_FOWNER) |
89 (1ULL << CAP_FSETID) |
90 (1ULL << CAP_IPC_OWNER) |
93 (1ULL << CAP_LINUX_IMMUTABLE) |
94 (1ULL << CAP_NET_BIND_SERVICE) |
95 (1ULL << CAP_NET_BROADCAST) |
96 (1ULL << CAP_NET_RAW) |
97 (1ULL << CAP_SETGID) |
98 (1ULL << CAP_SETFCAP) |
99 (1ULL << CAP_SETPCAP) |
100 (1ULL << CAP_SETUID) |
101 (1ULL << CAP_SYS_ADMIN) |
102 (1ULL << CAP_SYS_CHROOT) |
103 (1ULL << CAP_SYS_NICE) |
104 (1ULL << CAP_SYS_PTRACE) |
105 (1ULL << CAP_SYS_TTY_CONFIG) |
106 (1ULL << CAP_SYS_RESOURCE) |
107 (1ULL << CAP_SYS_BOOT) |
108 (1ULL << CAP_AUDIT_WRITE) |
109 (1ULL << CAP_AUDIT_CONTROL);
/* String vectors of bind mounts. parse_argv() appends two entries per
 * --bind=/--bind-ro= option and mount_binds() consumes them pairwise
 * (source, destination). */
110 static char **arg_bind = NULL;
111 static char **arg_bind_ro = NULL;
/* Prints the usage summary with printf(), substituting
 * program_invocation_short_name for the leading %s. Keep the option list in
 * sync with the options[] table and the getopt string in parse_argv(). */
113 static int help(void) {
115 printf("%s [OPTIONS...] [PATH] [ARGUMENTS...]\n\n"
116 "Spawn a minimal namespace container for debugging, testing and building.\n\n"
117 " -h --help Show this help\n"
118 " --version Print version string\n"
119 " -D --directory=NAME Root directory for the container\n"
120 " -b --boot Boot up full system (i.e. invoke init)\n"
121 " -u --user=USER Run the command under specified user or uid\n"
122 " -C --controllers=LIST Put the container in specified comma-separated\n"
123 " cgroup hierarchies\n"
124 " --uuid=UUID Set a specific machine UUID for the container\n"
125 " -M --machine=NAME Set the machine name for the container\n"
126 " --private-network Disable network in container\n"
127 " --read-only Mount the root directory read-only\n"
128 " --capability=CAP In addition to the default, retain specified\n"
130 " --link-journal=MODE Link up guest journal, one of no, auto, guest, host\n"
/* parse_argv() maps -j to LINK_GUEST, so the text has to say "guest". */
131 " -j Equivalent to --link-journal=guest\n"
132 " --bind=PATH[:PATH] Bind mount a file or directory from the host into\n"
134 " --bind-ro=PATH[:PATH] Similar, but creates a read-only bind mount\n",
135 program_invocation_short_name);
/* Parses the command line with getopt_long() and stores the result in the
 * arg_* globals declared at the top of the file.
 *
 * argc/argv are the values main() received. Invalid input is reported with
 * log_error().
 * NOTE(review): the return convention is decided by statements outside the
 * lines documented here; main() stores the result in r. */
140 static int parse_argv(int argc, char *argv[]) {
/* Long options. Entries whose last field is a character share their handler
 * with the short option of the same letter. */
153 static const struct option options[] = {
154 { "help", no_argument, NULL, 'h' },
155 { "version", no_argument, NULL, ARG_VERSION },
156 { "directory", required_argument, NULL, 'D' },
157 { "user", required_argument, NULL, 'u' },
158 { "controllers", required_argument, NULL, 'C' },
159 { "private-network", no_argument, NULL, ARG_PRIVATE_NETWORK },
160 { "boot", no_argument, NULL, 'b' },
161 { "uuid", required_argument, NULL, ARG_UUID },
162 { "read-only", no_argument, NULL, ARG_READ_ONLY },
163 { "capability", required_argument, NULL, ARG_CAPABILITY },
164 { "link-journal", required_argument, NULL, ARG_LINK_JOURNAL },
165 { "bind", required_argument, NULL, ARG_BIND },
166 { "bind-ro", required_argument, NULL, ARG_BIND_RO },
167 { "machine", required_argument, NULL, 'M' },
/* The leading '+' makes getopt stop at the first non-option argument, so the
 * switches of the command to run inside the container are left untouched.
 * "M:" is listed so that the -M short form advertised by help() is accepted;
 * without it only --machine= worked. */
176 while ((c = getopt_long(argc, argv, "+hD:u:C:bM:j", options, NULL)) >= 0) {
185 puts(PACKAGE_STRING);
186 puts(SYSTEMD_FEATURES);
/* -D: the root directory is stored in canonicalized form. */
191 arg_directory = canonicalize_file_name(optarg);
192 if (!arg_directory) {
193 log_error("Failed to canonicalize root directory.");
201 arg_user = strdup(optarg);
/* -C: replace any earlier list with the comma-separated one given now. */
208 strv_free(arg_controllers);
209 arg_controllers = strv_split(optarg, ",");
210 if (!arg_controllers)
213 cg_shorten_controllers(arg_controllers);
216 case ARG_PRIVATE_NETWORK:
217 arg_private_network = true;
/* Machine name: must pass hostname_is_valid() before it is stored. */
229 if (!hostname_is_valid(optarg)) {
230 log_error("Invalid machine name: %s", optarg);
235 arg_machine = strdup(optarg);
242 arg_read_only = true;
/* --capability=: every comma-separated word is resolved with cap_from_name()
 * and its bit is added to the set of capabilities to retain. */
245 case ARG_CAPABILITY: {
249 FOREACH_WORD_SEPARATOR(word, length, optarg, ",", state) {
253 t = strndup(word, length);
257 if (cap_from_name(t, &cap) < 0) {
258 log_error("Failed to parse capability %s.", t);
264 arg_retain |= 1ULL << (uint64_t) cap;
271 arg_link_journal = LINK_GUEST;
274 case ARG_LINK_JOURNAL:
275 if (streq(optarg, "auto"))
276 arg_link_journal = LINK_AUTO;
277 else if (streq(optarg, "no"))
278 arg_link_journal = LINK_NO;
279 else if (streq(optarg, "guest"))
280 arg_link_journal = LINK_GUEST;
281 else if (streq(optarg, "host"))
282 arg_link_journal = LINK_HOST;
284 log_error("Failed to parse link journal mode %s", optarg);
/* --bind=/--bind-ro=: x selects the vector to extend. The text before ':'
 * becomes a; both a and b must be absolute paths and are appended as two
 * consecutive entries.
 * NOTE(review): confirm how b is filled in when no ':' is present. */
292 _cleanup_free_ char *a = NULL, *b = NULL;
297 x = c == ARG_BIND ? &arg_bind : &arg_bind_ro;
299 e = strchr(optarg, ':');
301 a = strndup(optarg, e - optarg);
311 if (!path_is_absolute(a) || !path_is_absolute(b)) {
312 log_error("Invalid bind mount specification: %s", optarg);
316 r = strv_extend(x, a);
320 r = strv_extend(x, b);
331 log_error("Unknown option code %c", c);
/* Mounts the API file systems listed in mount_table below the container root.
 *
 * dest is the container root; every table entry's target is joined onto it
 * with strjoin(dest, "/", where). A failing mount() is only logged when the
 * entry's fatal flag is set. */
339 static int mount_all(const char *dest) {
341 typedef struct MountPoint {
/* The code below reads the fields what, where, options, flags and fatal of
 * each entry; the remaining column is presumably the file system type passed
 * to mount() - confirm against the MountPoint definition. */
350 static const MountPoint mount_table[] = {
351 { "proc", "/proc", "proc", NULL, MS_NOSUID|MS_NOEXEC|MS_NODEV, true },
352 { "/proc/sys", "/proc/sys", NULL, NULL, MS_BIND, true }, /* Bind mount first */
353 { NULL, "/proc/sys", NULL, NULL, MS_BIND|MS_RDONLY|MS_REMOUNT, true }, /* Then, make it r/o */
354 { "sysfs", "/sys", "sysfs", NULL, MS_RDONLY|MS_NOSUID|MS_NOEXEC|MS_NODEV, true },
355 { "tmpfs", "/dev", "tmpfs", "mode=755", MS_NOSUID|MS_STRICTATIME, true },
356 { "devpts", "/dev/pts", "devpts","newinstance,ptmxmode=0666,mode=620,gid=" STRINGIFY(TTY_GID), MS_NOSUID|MS_NOEXEC, true },
357 { "tmpfs", "/dev/shm", "tmpfs", "mode=1777", MS_NOSUID|MS_NODEV|MS_STRICTATIME, true },
358 { "tmpfs", "/run", "tmpfs", "mode=755", MS_NOSUID|MS_NODEV|MS_STRICTATIME, true },
360 { "/sys/fs/selinux", "/sys/fs/selinux", NULL, NULL, MS_BIND, false }, /* Bind mount first */
361 { NULL, "/sys/fs/selinux", NULL, NULL, MS_BIND|MS_RDONLY|MS_REMOUNT, false }, /* Then, make it r/o */
368 for (k = 0; k < ELEMENTSOF(mount_table); k++) {
369 _cleanup_free_ char *where = NULL;
372 where = strjoin(dest, "/", mount_table[k].where, NULL);
/* t > 0 means the target already is a mount point; a negative value is an
 * errno-style error that gets logged. */
376 t = path_is_mount_point(where, true);
378 log_error("Failed to detect whether %s is a mount point: %s", where, strerror(-t));
386 /* Skip this entry if it is not a remount. */
387 if (mount_table[k].what && t > 0)
/* The result of mkdir_p() is not checked here. */
390 mkdir_p(where, 0755);
392 if (mount(mount_table[k].what,
395 mount_table[k].flags,
396 mount_table[k].options) < 0 &&
397 mount_table[k].fatal) {
399 log_error("mount(%s) failed: %m", where);
/* Applies the user-requested bind mounts.
 *
 * dest:  container root that every destination is joined onto.
 * l:     string vector read as (source, destination) pairs.
 * flags: extra mount flags; when non-zero each new bind mount is remounted
 *        with MS_REMOUNT|MS_BIND|flags (main() passes MS_RDONLY for
 *        --bind-ro= and 0 for --bind=). */
409 static int mount_binds(const char *dest, char **l, unsigned long flags) {
412 STRV_FOREACH_PAIR(x, y, l) {
413 _cleanup_free_ char *where = NULL;
415 where = strjoin(dest, "/", *y, NULL);
/* Make sure the mount target exists; the result is not checked here. */
419 mkdir_p_label(where, 0755);
421 if (mount(*x, where, "bind", MS_BIND, NULL) < 0) {
422 log_error("mount(%s) failed: %m", where);
/* A second mount() call is needed to apply the extra flags to the bind mount. */
426 if (flags && mount(NULL, where, NULL, MS_REMOUNT|MS_BIND|flags, NULL) < 0) {
427 log_error("mount(%s) failed: %m", where);
/* Tries to point the container's /etc/localtime symlink at the same zone the
 * host's /etc/localtime symlink refers to.
 *
 * dest is the container root. Conditions that prevent the update (host link is
 * not a symlink, link target outside /usr/share/zoneinfo/, zone missing in the
 * container) are reported with log_warning(); only a failing symlink() is
 * logged as an error. */
435 static int setup_timezone(const char *dest) {
436 _cleanup_free_ char *where = NULL, *p = NULL, *q = NULL, *check = NULL, *what = NULL;
442 /* Fix the timezone, if possible */
443 r = readlink_malloc("/etc/localtime", &p);
445 log_warning("/etc/localtime is not a symlink, not updating container timezone.");
/* z is the zone name: the part of the host link target that follows the
 * zoneinfo directory, accepted in relative or absolute form. */
449 z = path_startswith(p, "../usr/share/zoneinfo/");
451 z = path_startswith(p, "/usr/share/zoneinfo/");
453 log_warning("/etc/localtime does not point into /usr/share/zoneinfo/, not updating container timezone.");
457 where = strappend(dest, "/etc/localtime");
/* y is the zone name the container's link currently refers to, if any. */
461 r = readlink_malloc(where, &q);
463 y = path_startswith(q, "../usr/share/zoneinfo/");
465 y = path_startswith(q, "/usr/share/zoneinfo/");
468 /* Already pointing to the right place? Then do nothing .. */
469 if (y && streq(y, z))
/* Only switch if the zone file is present inside the container tree. */
473 check = strjoin(dest, "/usr/share/zoneinfo/", z, NULL);
477 if (access(check, F_OK) < 0) {
478 log_warning("Timezone %s does not exist in container, not updating container timezone.", z);
/* The new link uses the relative form of the target path. */
482 what = strappend("../usr/share/zoneinfo/", z);
487 if (symlink(what, where) < 0) {
488 log_error("Failed to correct timezone of container: %m");
/* Makes the host's /etc/resolv.conf available inside the container by bind
 * mounting it onto <dest>/etc/resolv.conf and then remounting that bind mount
 * read-only.
 *
 * dest is the container root. The function takes a separate path when
 * arg_private_network is set.
 * NOTE(review): that path presumably returns early because a private network
 * has no use for the host resolver configuration - confirm. */
495 static int setup_resolv_conf(const char *dest) {
496 char _cleanup_free_ *where = NULL;
497 _cleanup_close_ int fd = -1;
501 if (arg_private_network)
504 /* Fix resolv.conf, if possible */
505 where = strappend(dest, "/etc/resolv.conf");
/* O_CREAT|O_EXCL creates the file if it is missing; presumably this only
 * provides a mount target, since nothing visible here writes to fd. */
509 fd = open(where, O_WRONLY|O_CREAT|O_EXCL|O_CLOEXEC|O_NOCTTY|O_NOFOLLOW, 0644);
511 /* We don't really care for the results of this really. If it
512 * fails, it fails, but meh... */
513 if (mount("/etc/resolv.conf", where, "bind", MS_BIND, NULL) < 0)
514 log_warning("Failed to bind mount /etc/resolv.conf: %m");
516 if (mount("/etc/resolv.conf", where, "bind",
517 MS_BIND|MS_REMOUNT|MS_RDONLY, NULL) < 0) {
518 log_error("Failed to remount /etc/resolv.conf readonly: %m");
/* Gives the container its own boot ID: a random 128 bit ID is formatted as a
 * UUID string, written to a file below <dest>/dev and bind mounted over
 * <dest>/proc/sys/kernel/random/boot_id.
 *
 * dest is the container root. Failing to make the bind mount read-only is only
 * a warning; the other failures are logged as errors. */
525 static int setup_boot_id(const char *dest) {
526 _cleanup_free_ char *from = NULL, *to = NULL;
533 /* Generate a new randomized boot ID, so that each boot-up of
534 * the container gets a new one */
536 from = strappend(dest, "/dev/proc-sys-kernel-random-boot-id");
537 to = strappend(dest, "/proc/sys/kernel/random/boot_id");
541 r = sd_id128_randomize(&rnd);
543 log_error("Failed to generate random boot id: %s", strerror(-r));
/* Render the ID in 8-4-4-4-12 hexadecimal UUID notation. */
547 snprintf(as_uuid, sizeof(as_uuid),
548 "%02x%02x%02x%02x-%02x%02x-%02x%02x-%02x%02x-%02x%02x%02x%02x%02x%02x",
549 SD_ID128_FORMAT_VAL(rnd));
550 char_array_0(as_uuid);
552 r = write_string_file(from, as_uuid);
554 log_error("Failed to write boot id: %s", strerror(-r));
558 if (mount(from, to, "bind", MS_BIND, NULL) < 0) {
559 log_error("Failed to bind mount boot id: %m");
561 } else if (mount(from, to, "bind", MS_BIND|MS_REMOUNT|MS_RDONLY, NULL))
562 log_warning("Failed to make boot id read-only: %m");
/* Recreates the device nodes named in devnodes inside the container's /dev.
 *
 * dest is the container root. For each name the host node /dev/<name> is
 * stat()ed and a node with the same mode and device number is created at
 * <dest>/dev/<name>. A host node that does not exist (ENOENT) is not reported
 * as an error; a node that is neither a character nor a block device is. */
568 static int copy_devnodes(const char *dest) {
570 static const char devnodes[] =
580 _cleanup_umask_ mode_t u;
586 NULSTR_FOREACH(d, devnodes) {
588 _cleanup_free_ char *from = NULL, *to = NULL;
590 asprintf(&from, "/dev/%s", d);
591 asprintf(&to, "%s/dev/%s", dest, d);
602 if (stat(from, &st) < 0) {
604 if (errno != ENOENT) {
605 log_error("Failed to stat %s: %m", from);
610 } else if (!S_ISCHR(st.st_mode) && !S_ISBLK(st.st_mode)) {
612 log_error("%s is not a char or block device, cannot copy", from);
616 } else if (mknod(to, st.st_mode, st.st_rdev) < 0) {
/* Report the node that could not be created, not the container root. */
618 log_error("mknod(%s) failed: %m", to);
/* Creates <dest>/dev/ptmx as a relative symlink to "pts/ptmx", i.e. to the
 * ptmx node of the devpts instance that mount_all() mounts at /dev/pts.
 *
 * dest is the container root; a failing symlink() is logged as an error. */
627 static int setup_ptmx(const char *dest) {
628 _cleanup_free_ char *p = NULL;
630 p = strappend(dest, "/dev/ptmx");
634 if (symlink("pts/ptmx", p) < 0) {
635 log_error("Failed to create /dev/ptmx symlink: %m");
/* Provides /dev/console inside the container by bind mounting the given tty
 * onto a freshly created device node.
 *
 * dest:    container root.
 * console: path of the tty to expose (main() passes the pty slave name
 *          returned by ptsname()). It must be a character device; its access
 *          mode and ownership are set to 0600, 0:0 first. */
642 static int setup_dev_console(const char *dest, const char *console) {
644 _cleanup_free_ char *to = NULL;
646 _cleanup_umask_ mode_t u;
653 if (stat(console, &st) < 0) {
654 log_error("Failed to stat %s: %m", console);
657 } else if (!S_ISCHR(st.st_mode)) {
658 log_error("/dev/console is not a char device");
662 r = chmod_and_chown(console, 0600, 0, 0);
664 log_error("Failed to correct access mode for TTY: %s", strerror(-r));
668 if (asprintf(&to, "%s/dev/console", dest) < 0)
671 /* We need to bind mount the right tty to /dev/console since
672 * ptys can only exist on pts file systems. To have something
673 * to bind mount things on we create a device node first, that
674 * has the right major/minor (note that the major minor
675 * doesn't actually matter here, since we mount it over
678 if (mknod(to, (st.st_mode & ~07777) | 0600, st.st_rdev) < 0) {
679 log_error("mknod() for /dev/console failed: %m");
683 if (mount(console, to, "bind", MS_BIND, NULL) < 0) {
684 log_error("Bind mount for /dev/console failed: %m");
// Sets up a FIFO that stands in for /proc/kmsg inside the container.
//
// dest:        container root.
// kmsg_socket: socket (asserted to be >= 0) over which the opened FIFO file
//              descriptor is sent as SCM_RIGHTS ancillary data, so that it
//              stays open for as long as the child runs.
//
// The FIFO is created as <dest>/dev/kmsg, chmod/chowned to 0600 0:0 and bind
// mounted onto <dest>/proc/kmsg.
691 static int setup_kmsg(const char *dest, int kmsg_socket) {
692 _cleanup_free_ char *from = NULL, *to = NULL;
694 _cleanup_umask_ mode_t u;
696 struct cmsghdr cmsghdr;
697 uint8_t buf[CMSG_SPACE(sizeof(int))];
700 .msg_control = &control,
701 .msg_controllen = sizeof(control),
703 struct cmsghdr *cmsg;
706 assert(kmsg_socket >= 0);
710 /* We create the kmsg FIFO as /dev/kmsg, but immediately
711 * delete it after bind mounting it to /proc/kmsg. While FIFOs
712 * on the reading side behave very similar to /proc/kmsg,
713 * their writing side behaves differently from /dev/kmsg in
714 * that writing blocks when nothing is reading. In order to
715 * avoid any problems with containers deadlocking due to this
716 * we simply make /dev/kmsg unavailable to the container. */
717 if (asprintf(&from, "%s/dev/kmsg", dest) < 0 ||
718 asprintf(&to, "%s/proc/kmsg", dest) < 0)
721 if (mkfifo(from, 0600) < 0) {
722 log_error("mkfifo() for /dev/kmsg failed: %m");
726 r = chmod_and_chown(from, 0600, 0, 0);
728 log_error("Failed to correct access mode for /dev/kmsg: %s", strerror(-r));
732 if (mount(from, to, "bind", MS_BIND, NULL) < 0) {
733 log_error("Bind mount for /proc/kmsg failed: %m");
737 fd = open(from, O_RDWR|O_NDELAY|O_CLOEXEC);
739 log_error("Failed to open fifo: %m");
/* Build a single SCM_RIGHTS control message that carries fd. */
743 cmsg = CMSG_FIRSTHDR(&mh);
744 cmsg->cmsg_level = SOL_SOCKET;
745 cmsg->cmsg_type = SCM_RIGHTS;
746 cmsg->cmsg_len = CMSG_LEN(sizeof(int));
747 memcpy(CMSG_DATA(cmsg), &fd, sizeof(int));
749 mh.msg_controllen = cmsg->cmsg_len;
751 /* Store away the fd in the socket, so that it stays open as
752 * long as we run the child */
753 k = sendmsg(kmsg_socket, &mh, MSG_DONTWAIT|MSG_NOSIGNAL);
/* The local copy of the descriptor is closed right after the send attempt. */
754 close_nointr_nofail(fd);
757 log_error("Failed to send FIFO fd: %m");
761 /* And now make the FIFO unavailable as /dev/kmsg... */
/* Sets the host name of the calling process's UTS namespace to arg_machine
 * using sethostname(). main() assigns arg_machine before the child is cloned. */
766 static int setup_hostname(void) {
768 if (sethostname(arg_machine, strlen(arg_machine)) < 0)
/* Links the journal directory of the container with the one of the host
 * according to arg_link_journal.
 *
 * directory is the container root. The machine ID is read from
 * <directory>/etc/machine-id; afterwards
 *   p = /var/log/journal/<id>              (host side)
 *   q = <directory>/var/log/journal/<id>   (guest side)
 * With LINK_GUEST the host path becomes a symlink to the guest directory. With
 * LINK_HOST the host directory is created, and the function ends by bind
 * mounting p onto q.
 * NOTE(review): confirm how id is derived from the line read into b, and
 * which statements the LINK_NO and LINK_AUTO conditions guard. */
774 static int setup_journal(const char *directory) {
775 sd_id128_t machine_id;
776 _cleanup_free_ char *p = NULL, *b = NULL, *q = NULL, *d = NULL;
780 if (arg_link_journal == LINK_NO)
783 p = strappend(directory, "/etc/machine-id");
787 r = read_one_line_file(p, &b);
788 if (r == -ENOENT && arg_link_journal == LINK_AUTO)
791 log_error("Failed to read machine ID from %s: %s", p, strerror(-r));
796 if (isempty(id) && arg_link_journal == LINK_AUTO)
799 /* Verify validity */
800 r = sd_id128_from_string(id, &machine_id);
802 log_error("Failed to parse machine ID from %s: %s", p, strerror(-r));
807 p = strappend("/var/log/journal/", id);
808 q = strjoin(directory, "/var/log/journal/", id, NULL);
/* Neither side may already be a mount point; this is only logged as an error
 * when the mode was requested explicitly (anything but LINK_AUTO). */
812 if (path_is_mount_point(p, false) > 0) {
813 if (arg_link_journal != LINK_AUTO) {
814 log_error("%s: already a mount point, refusing to use for journal", p);
821 if (path_is_mount_point(q, false) > 0) {
822 if (arg_link_journal != LINK_AUTO) {
823 log_error("%s: already a mount point, refusing to use for journal", q);
/* Inspect what currently exists at the host path: r distinguishes a symlink,
 * a non-symlink (-EINVAL) and a missing path (-ENOENT). */
830 r = readlink_and_make_absolute(p, &d);
832 if ((arg_link_journal == LINK_GUEST ||
833 arg_link_journal == LINK_AUTO) &&
836 r = mkdir_p(q, 0755);
838 log_warning("failed to create directory %s: %m", q);
843 log_error("Failed to remove symlink %s: %m", p);
846 } else if (r == -EINVAL) {
848 if (arg_link_journal == LINK_GUEST &&
851 if (errno == ENOTDIR) {
852 log_error("%s already exists and is neither a symlink nor a directory", p);
855 log_error("Failed to remove %s: %m", p);
859 } else if (r != -ENOENT) {
860 log_error("readlink(%s) failed: %m", p);
/* LINK_GUEST: create the symlink p -> q on the host. */
864 if (arg_link_journal == LINK_GUEST) {
866 if (symlink(q, p) < 0) {
867 log_error("Failed to symlink %s to %s: %m", q, p);
871 r = mkdir_p(q, 0755);
873 log_warning("failed to create directory %s: %m", q);
877 if (arg_link_journal == LINK_HOST) {
878 r = mkdir_p(p, 0755);
880 log_error("Failed to create %s: %m", p);
884 } else if (access(p, F_OK) < 0)
/* dir_is_empty() returning 0 is treated as "guest directory has content". */
887 if (dir_is_empty(q) == 0) {
888 log_error("%s not empty.", q);
892 r = mkdir_p(q, 0755);
894 log_error("Failed to create %s: %m", q);
898 if (mount(p, q, "bind", MS_BIND, NULL) < 0) {
899 log_error("Failed to bind mount journal from host into guest: %m");
/* Creates the control group `path` in the systemd hierarchy and in every
 * hierarchy listed in arg_controllers, using cg_create_and_attach().
 *
 * A failure in the systemd hierarchy is logged as an error, a failure in one
 * of the additional controllers only as a warning.
 * NOTE(review): the third argument (1) is presumably the PID to attach -
 * confirm against cg_create_and_attach(). */
906 static int setup_cgroup(const char *path) {
910 r = cg_create_and_attach(SYSTEMD_CGROUP_CONTROLLER, path, 1);
912 log_error("Failed to create cgroup: %s", strerror(-r));
916 STRV_FOREACH(c, arg_controllers) {
917 r = cg_create_and_attach(*c, path, 1);
919 log_warning("Failed to create cgroup in controller %s: %s", *c, strerror(-r));
/* Records container metadata as extended attributes on the cgroup directory.
 *
 * cgroup:    cgroup path; resolved to a file system path with cg_get_path().
 * pid:       stored in decimal as "trusted.init_pid".
 * uuid:      stored as "trusted.machine_id".
 * directory: stored as "trusted.root_directory".
 *
 * XATTR_CREATE makes setxattr() fail if the attribute already exists. A
 * failing setxattr() is reported with log_warning(). */
925 static int save_attributes(const char *cgroup, pid_t pid, const char *uuid, const char *directory) {
926 _cleanup_free_ char *path = NULL;
927 char buf[DECIMAL_STR_MAX(pid_t)];
932 assert(arg_directory);
/* The assertion guards against the formatted PID not fitting into buf. */
935 assert_se(snprintf(buf, sizeof(buf), "%lu", (unsigned long) pid) < (int) sizeof(buf));
937 r = cg_get_path(SYSTEMD_CGROUP_CONTROLLER, cgroup, NULL, &path);
939 log_error("Failed to get path: %s", strerror(-r));
943 r = setxattr(path, "trusted.init_pid", buf, strlen(buf), XATTR_CREATE);
945 log_warning("Failed to set %s attribute on %s: %m", "trusted.init_pid", path);
948 k = setxattr(path, "trusted.machine_id", uuid, strlen(uuid), XATTR_CREATE);
950 log_warning("Failed to set %s attribute on %s: %m", "trusted.machine_id", path);
956 k = setxattr(path, "trusted.root_directory", directory, strlen(directory), XATTR_CREATE);
958 log_warning("Failed to set %s attribute on %s: %m", "trusted.root_directory", path);
/* Drops every capability that is not part of arg_retain by passing the
 * complement of the mask to capability_bounding_set_drop(), whose result is
 * returned unchanged. */
966 static int drop_capabilities(void) {
967 return capability_bounding_set_drop(~arg_retain, false);
/* Relays data between the caller's terminal and the container's pty and
 * reacts to signals while the container runs.
 *
 * master: file descriptor of the pty master.
 * pid:    process that receives SIGRTMIN+3 when an orderly shutdown is
 *         requested.
 * mask:   signal set handed to signalfd(); main() blocks these signals before
 *         calling.
 *
 * Data flows stdin -> in_buffer -> master and master -> out_buffer -> stdout,
 * driven by epoll. SIGWINCH forwards the window size of stdin to the pty. The
 * first SIGTERM in --boot mode asks the container's init to halt.
 * main() treats a negative return value as failure. */
970 static int process_pty(int master, pid_t pid, sigset_t *mask) {
972 char in_buffer[LINE_MAX], out_buffer[LINE_MAX];
973 size_t in_buffer_full = 0, out_buffer_full = 0;
974 struct epoll_event stdin_ev, stdout_ev, master_ev, signal_ev;
975 bool stdin_readable = false, stdout_writable = false, master_readable = false, master_writable = false;
976 int ep = -1, signal_fd = -1, r;
977 bool tried_orderly_shutdown = false;
/* The data descriptors are registered edge-triggered (EPOLLET) below;
 * fd_nonblock(..., 1) presumably switches them to non-blocking mode. */
983 fd_nonblock(STDIN_FILENO, 1);
984 fd_nonblock(STDOUT_FILENO, 1);
985 fd_nonblock(master, 1);
987 signal_fd = signalfd(-1, mask, SFD_NONBLOCK|SFD_CLOEXEC);
989 log_error("signalfd(): %m");
994 ep = epoll_create1(EPOLL_CLOEXEC);
996 log_error("Failed to create epoll: %m");
1001 /* We read from STDIN only if this is actually a TTY,
1002 * otherwise we assume non-interactivity. */
1003 if (isatty(STDIN_FILENO)) {
1005 stdin_ev.events = EPOLLIN|EPOLLET;
1006 stdin_ev.data.fd = STDIN_FILENO;
1008 if (epoll_ctl(ep, EPOLL_CTL_ADD, STDIN_FILENO, &stdin_ev) < 0) {
1009 log_error("Failed to register STDIN in epoll: %m");
1016 stdout_ev.events = EPOLLOUT|EPOLLET;
1017 stdout_ev.data.fd = STDOUT_FILENO;
1020 master_ev.events = EPOLLIN|EPOLLOUT|EPOLLET;
1021 master_ev.data.fd = master;
/* The signal descriptor is the only one registered level-triggered. */
1024 signal_ev.events = EPOLLIN;
1025 signal_ev.data.fd = signal_fd;
1027 if (epoll_ctl(ep, EPOLL_CTL_ADD, STDOUT_FILENO, &stdout_ev) < 0) {
1028 if (errno != EPERM) {
1029 log_error("Failed to register stdout in epoll: %m");
1033 /* stdout without epoll support. Likely redirected to regular file. */
1034 stdout_writable = true;
1037 if (epoll_ctl(ep, EPOLL_CTL_ADD, master, &master_ev) < 0 ||
1038 epoll_ctl(ep, EPOLL_CTL_ADD, signal_fd, &signal_ev) < 0) {
1039 log_error("Failed to register fds in epoll: %m");
1045 struct epoll_event ev[16];
1049 nfds = epoll_wait(ep, ev, ELEMENTSOF(ev), -1);
1052 if (errno == EINTR || errno == EAGAIN)
1055 log_error("epoll_wait(): %m");
/* Translate the reported events into the four readiness flags; EPOLLHUP is
 * counted as ready so that the following read()/write() sees the condition. */
1062 for (i = 0; i < nfds; i++) {
1063 if (ev[i].data.fd == STDIN_FILENO) {
1065 if (ev[i].events & (EPOLLIN|EPOLLHUP))
1066 stdin_readable = true;
1068 } else if (ev[i].data.fd == STDOUT_FILENO) {
1070 if (ev[i].events & (EPOLLOUT|EPOLLHUP))
1071 stdout_writable = true;
1073 } else if (ev[i].data.fd == master) {
1075 if (ev[i].events & (EPOLLIN|EPOLLHUP))
1076 master_readable = true;
1078 if (ev[i].events & (EPOLLOUT|EPOLLHUP))
1079 master_writable = true;
1081 } else if (ev[i].data.fd == signal_fd) {
1082 struct signalfd_siginfo sfsi;
1085 n = read(signal_fd, &sfsi, sizeof(sfsi));
1086 if (n != sizeof(sfsi)) {
1089 log_error("Failed to read from signalfd: invalid block size");
1094 if (errno != EINTR && errno != EAGAIN) {
1095 log_error("Failed to read from signalfd: %m");
1101 if (sfsi.ssi_signo == SIGWINCH) {
1104 /* The window size changed, let's forward that. */
1105 if (ioctl(STDIN_FILENO, TIOCGWINSZ, &ws) >= 0)
1106 ioctl(master, TIOCSWINSZ, &ws);
1107 } else if (sfsi.ssi_signo == SIGTERM && arg_boot && !tried_orderly_shutdown) {
1109 log_info("Trying to halt container. Send SIGTERM again to trigger immediate termination.");
1111 /* This only works for systemd... */
1112 tried_orderly_shutdown = true;
1113 kill(pid, SIGRTMIN+3);
/* Keep pumping while at least one direction can make progress: an empty
 * buffer with a readable source, or a filled buffer with a writable sink. */
1123 while ((stdin_readable && in_buffer_full <= 0) ||
1124 (master_writable && in_buffer_full > 0) ||
1125 (master_readable && out_buffer_full <= 0) ||
1126 (stdout_writable && out_buffer_full > 0)) {
1128 if (stdin_readable && in_buffer_full < LINE_MAX) {
1130 k = read(STDIN_FILENO, in_buffer + in_buffer_full, LINE_MAX - in_buffer_full);
/* EAGAIN, EPIPE, ECONNRESET and EIO only clear the readiness flag; any other
 * error is logged. The same pattern repeats for the three transfers below. */
1133 if (errno == EAGAIN || errno == EPIPE || errno == ECONNRESET || errno == EIO)
1134 stdin_readable = false;
1136 log_error("read(): %m");
1141 in_buffer_full += (size_t) k;
1144 if (master_writable && in_buffer_full > 0) {
1146 k = write(master, in_buffer, in_buffer_full);
1149 if (errno == EAGAIN || errno == EPIPE || errno == ECONNRESET || errno == EIO)
1150 master_writable = false;
1152 log_error("write(): %m");
/* After a partial write, shift the unwritten remainder to the buffer start. */
1158 assert(in_buffer_full >= (size_t) k);
1159 memmove(in_buffer, in_buffer + k, in_buffer_full - k);
1160 in_buffer_full -= k;
1164 if (master_readable && out_buffer_full < LINE_MAX) {
1166 k = read(master, out_buffer + out_buffer_full, LINE_MAX - out_buffer_full);
1169 if (errno == EAGAIN || errno == EPIPE || errno == ECONNRESET || errno == EIO)
1170 master_readable = false;
1172 log_error("read(): %m");
1177 out_buffer_full += (size_t) k;
1180 if (stdout_writable && out_buffer_full > 0) {
1182 k = write(STDOUT_FILENO, out_buffer, out_buffer_full);
1185 if (errno == EAGAIN || errno == EPIPE || errno == ECONNRESET || errno == EIO)
1186 stdout_writable = false;
1188 log_error("write(): %m");
1194 assert(out_buffer_full >= (size_t) k);
1195 memmove(out_buffer, out_buffer + k, out_buffer_full - k);
1196 out_buffer_full -= k;
/* Release the epoll and signal descriptors created above. */
1204 close_nointr_nofail(ep);
1207 close_nointr_nofail(signal_fd);
/* Program entry point.
 *
 * Parses the command line, validates the container directory, allocates a pty
 * and clones a child into new IPC, mount, PID and UTS namespaces (plus a
 * network namespace with --private-network). The child prepares the container
 * file system, changes root, adjusts credentials and the environment and then
 * executes init, the requested command or a shell. The parent relays the pty
 * with process_pty(), waits for the child and reports how the container
 * ended.
 *
 * r holds the result; it starts as EXIT_FAILURE and takes the container's exit
 * status when that is non-zero. */
1212 int main(int argc, char *argv[]) {
1214 int r = EXIT_FAILURE, k;
1215 _cleanup_free_ char *machine_root = NULL, *name = NULL, *escaped = NULL, *newcg = NULL;
1216 _cleanup_close_ int master = -1;
1218 const char *console = NULL;
1219 struct termios saved_attr, raw_attr;
1221 bool saved_attr_valid = false;
1223 int kmsg_socket_pair[2] = { -1, -1 };
1226 log_parse_environment();
1229 r = parse_argv(argc, argv);
/* Normalize the container directory; the current directory is used when none
 * was given. */
1233 if (arg_directory) {
1236 p = path_make_absolute_cwd(arg_directory);
1237 free(arg_directory);
1240 arg_directory = get_current_dir_name();
1242 if (!arg_directory) {
1243 log_error("Failed to determine path, please use -D.");
1247 path_kill_slashes(arg_directory);
/* Derive the machine name from the last component of the directory. */
1250 arg_machine = strdup(path_get_file_name(arg_directory));
1256 hostname_cleanup(arg_machine);
1257 if (isempty(arg_machine)) {
1258 log_error("Failed to determine machine name automatically, please use -M.");
/* Preconditions: root privileges, a systemd host, and a directory that is not
 * "/" and looks like an OS tree. */
1263 if (geteuid() != 0) {
1264 log_error("Need to be root.");
1268 if (sd_booted() <= 0) {
1269 log_error("Not running on a systemd system.");
1273 if (path_equal(arg_directory, "/")) {
1274 log_error("Spawning container on root directory not supported.");
1278 if (path_is_os_tree(arg_directory) <= 0) {
1279 log_error("Directory %s doesn't look like an OS root directory. Refusing.", arg_directory);
/* File descriptors passed in via sd_listen_fds() are collected so they can be
 * handed on to the container (see LISTEN_FDS below). */
1284 n_fd_passed = sd_listen_fds(false);
1285 if (n_fd_passed > 0) {
1286 k = fdset_new_listen_fds(&fds, false);
1288 log_error("Failed to collect file descriptors: %s", strerror(-k));
1292 fdset_close_others(fds);
/* The container's cgroup is <machine root>/<escaped "NAME.nspawn">. */
1295 k = cg_get_machine_path(&machine_root);
1297 log_error("Failed to determine machine cgroup path: %s", strerror(-k));
1301 name = strappend(arg_machine, ".nspawn");
1307 escaped = cg_escape(name);
1313 newcg = strjoin(machine_root, "/", escaped, NULL);
/* Any result <= 0 other than -ENOENT is reported as "already running". */
1319 r = cg_is_empty_recursive(SYSTEMD_CGROUP_CONTROLLER, newcg, false);
1320 if (r <= 0 && r != -ENOENT) {
1321 log_error("Container already running.");
/* Allocate the pty whose slave side becomes the container's console. */
1329 master = posix_openpt(O_RDWR|O_NOCTTY|O_CLOEXEC|O_NDELAY);
1331 log_error("Failed to acquire pseudo tty: %m");
1335 console = ptsname(master);
1337 log_error("Failed to determine tty name: %m");
1341 log_info("Spawning namespace container on %s (console is %s).", arg_directory, console);
1343 if (ioctl(STDIN_FILENO, TIOCGWINSZ, &ws) >= 0)
1344 ioctl(master, TIOCSWINSZ, &ws);
1346 if (unlockpt(master) < 0) {
1347 log_error("Failed to unlock tty: %m");
/* Remember the terminal settings so they can be restored on exit, and
 * prepare a raw, non-echoing variant for the session. */
1351 if (tcgetattr(STDIN_FILENO, &saved_attr) >= 0) {
1352 saved_attr_valid = true;
1354 raw_attr = saved_attr;
1355 cfmakeraw(&raw_attr);
1356 raw_attr.c_lflag &= ~ECHO;
/* Socket pair used by setup_kmsg() to park the kmsg FIFO descriptor. */
1359 if (socketpair(AF_UNIX, SOCK_DGRAM|SOCK_NONBLOCK|SOCK_CLOEXEC, 0, kmsg_socket_pair) < 0) {
1360 log_error("Failed to create kmsg socket pair.");
/* These signals are blocked here and read via signalfd() in process_pty(). */
1364 assert_se(sigemptyset(&mask) == 0);
1365 sigset_add_many(&mask, SIGCHLD, SIGWINCH, SIGTERM, SIGINT, -1);
1366 assert_se(sigprocmask(SIG_BLOCK, &mask, NULL) == 0);
/* pipefd: the parent signals that it has logged the child's PID.
 * pipefd2: the child signals that the cgroup has been set up. */
1370 int pipefd[2], pipefd2[2];
1372 if (pipe2(pipefd, O_NONBLOCK|O_CLOEXEC) < 0) {
1373 log_error("pipe2(): %m");
1377 if (pipe2(pipefd2, O_NONBLOCK|O_CLOEXEC) < 0) {
1378 log_error("pipe2(): %m");
1383 pid = syscall(__NR_clone, SIGCHLD|CLONE_NEWIPC|CLONE_NEWNS|CLONE_NEWPID|CLONE_NEWUTS|(arg_private_network ? CLONE_NEWNET : 0), NULL);
1385 if (errno == EINVAL)
1386 log_error("clone() failed, do you have namespace support enabled in your kernel? (You need UTS, IPC, PID and NET namespacing built in): %m");
1388 log_error("clone() failed: %m");
1395 const char *home = NULL;
1396 uid_t uid = (uid_t) -1;
1397 gid_t gid = (gid_t) -1;
1399 const char *envp[] = {
1400 "PATH=/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin",
1401 "container=systemd-nspawn", /* LXC sets container=lxc, so follow the scheme here */
1406 NULL, /* container_uuid */
1407 NULL, /* LISTEN_FDS */
1408 NULL, /* LISTEN_PID */
1412 envp[n_env] = strv_find_prefix(environ, "TERM=");
1416 /* Wait for the parent process to log our PID */
1417 close_nointr_nofail(pipefd[1]);
1418 fd_wait_for_event(pipefd[0], POLLHUP, -1);
1419 close_nointr_nofail(pipefd[0]);
1421 close_nointr_nofail(master);
1424 if (saved_attr_valid) {
1425 if (tcsetattr(STDIN_FILENO, TCSANOW, &raw_attr) < 0) {
1426 log_error("Failed to set terminal attributes: %m");
/* Replace stdin/stdout/stderr with the container console. */
1431 close_nointr(STDIN_FILENO);
1432 close_nointr(STDOUT_FILENO);
1433 close_nointr(STDERR_FILENO);
1435 close_nointr_nofail(kmsg_socket_pair[0]);
1436 kmsg_socket_pair[0] = -1;
1438 reset_all_signal_handlers();
1440 assert_se(sigemptyset(&mask) == 0);
1441 assert_se(sigprocmask(SIG_SETMASK, &mask, NULL) == 0);
1443 k = open_terminal(console, O_RDWR);
1444 if (k != STDIN_FILENO) {
1446 close_nointr_nofail(k);
1450 log_error("Failed to open console: %s", strerror(-k));
1454 if (dup2(STDIN_FILENO, STDOUT_FILENO) != STDOUT_FILENO ||
1455 dup2(STDIN_FILENO, STDERR_FILENO) != STDERR_FILENO) {
1456 log_error("Failed to duplicate console: %m");
1461 log_error("setsid() failed: %m");
/* Make sure the child is killed if the parent goes away. */
1465 if (prctl(PR_SET_PDEATHSIG, SIGKILL) < 0) {
1466 log_error("PR_SET_PDEATHSIG failed: %m");
1470 if (setup_cgroup(newcg) < 0)
/* Closing pipefd2 tells the waiting parent that the cgroup exists. */
1473 close_pipe(pipefd2);
1475 /* Mark everything as slave, so that we still
1476 * receive mounts from the real root, but don't
1477 * propagate mounts to the real root. */
1478 if (mount(NULL, "/", NULL, MS_SLAVE|MS_REC, NULL) < 0) {
1479 log_error("MS_SLAVE|MS_REC failed: %m");
1483 /* Turn directory into bind mount */
1484 if (mount(arg_directory, arg_directory, "bind", MS_BIND|MS_REC, NULL) < 0) {
1485 log_error("Failed to make bind mount.");
1490 if (mount(arg_directory, arg_directory, "bind", MS_BIND|MS_REMOUNT|MS_RDONLY|MS_REC, NULL) < 0) {
1491 log_error("Failed to make read-only.");
/* Populate the container tree; each helper logs its own errors. */
1495 if (mount_all(arg_directory) < 0)
1498 if (copy_devnodes(arg_directory) < 0)
1501 if (setup_ptmx(arg_directory) < 0)
1504 dev_setup(arg_directory);
1506 if (setup_dev_console(arg_directory, console) < 0)
1509 if (setup_kmsg(arg_directory, kmsg_socket_pair[1]) < 0)
1512 close_nointr_nofail(kmsg_socket_pair[1]);
1513 kmsg_socket_pair[1] = -1;
1515 if (setup_boot_id(arg_directory) < 0)
1518 if (setup_timezone(arg_directory) < 0)
1521 if (setup_resolv_conf(arg_directory) < 0)
1524 if (setup_journal(arg_directory) < 0)
1527 if (mount_binds(arg_directory, arg_bind, 0) < 0)
1530 if (mount_binds(arg_directory, arg_bind_ro, MS_RDONLY) < 0)
/* Switch into the container: move its mount onto "/", then chroot into it. */
1533 if (chdir(arg_directory) < 0) {
1534 log_error("chdir(%s) failed: %m", arg_directory);
1538 if (mount(arg_directory, "/", NULL, MS_MOVE, NULL) < 0) {
1539 log_error("mount(MS_MOVE) failed: %m");
1543 if (chroot(".") < 0) {
1544 log_error("chroot() failed: %m");
1548 if (chdir("/") < 0) {
1549 log_error("chdir() failed: %m");
1557 if (drop_capabilities() < 0) {
1558 log_error("drop_capabilities() failed: %m");
1564 /* Note that this resolves user names
1565 * inside the container, and hence
1566 * accesses the NSS modules from the
1567 * container and not the host. This is
1570 if (get_user_creds((const char**)&arg_user, &uid, &gid, &home, NULL) < 0) {
1571 log_error("get_user_creds() failed: %m");
1575 if (mkdir_parents_label(home, 0775) < 0) {
1576 log_error("mkdir_parents_label() failed: %m");
1580 if (mkdir_safe_label(home, 0775, uid, gid) < 0) {
1581 log_error("mkdir_safe_label() failed: %m");
1585 if (initgroups((const char*)arg_user, gid) < 0) {
1586 log_error("initgroups() failed: %m");
1590 if (setresgid(gid, gid, gid) < 0) {
1591 log_error("setresgid() failed: %m");
1595 if (setresuid(uid, uid, uid) < 0) {
1596 log_error("setresuid() failed: %m");
1600 /* Reset everything fully to 0, just in case */
1602 if (setgroups(0, NULL) < 0) {
1603 log_error("setgroups() failed: %m");
1607 if (setresgid(0, 0, 0) < 0) {
1608 log_error("setresgid() failed: %m");
1612 if (setresuid(0, 0, 0) < 0) {
1613 log_error("setresuid() failed: %m");
/* Fill the remaining environment slots of envp[]. */
1618 if ((asprintf((char**)(envp + n_env++), "HOME=%s", home ? home: "/root") < 0) ||
1619 (asprintf((char**)(envp + n_env++), "USER=%s", arg_user ? arg_user : "root") < 0) ||
1620 (asprintf((char**)(envp + n_env++), "LOGNAME=%s", arg_user ? arg_user : "root") < 0)) {
1626 if (asprintf((char**)(envp + n_env++), "container_uuid=%s", arg_uuid) < 0) {
/* Passed-in descriptors must survive the exec; LISTEN_PID is 1. */
1632 if (fdset_size(fds) > 0) {
1633 k = fdset_cloexec(fds, false);
1635 log_error("Failed to unset O_CLOEXEC for file descriptors.");
1639 if ((asprintf((char **)(envp + n_env++), "LISTEN_FDS=%u", n_fd_passed) < 0) ||
1640 (asprintf((char **)(envp + n_env++), "LISTEN_PID=%lu", (unsigned long) 1) < 0)) {
1652 /* Automatically search for the init system */
1654 l = 1 + argc - optind;
1655 a = newa(char*, l + 1);
1656 memcpy(a + 1, argv + optind, l * sizeof(char*));
/* Each execve() only returns on failure, so the next candidate is tried. */
1658 a[0] = (char*) "/usr/lib/systemd/systemd";
1659 execve(a[0], a, (char**) envp);
1661 a[0] = (char*) "/lib/systemd/systemd";
1662 execve(a[0], a, (char**) envp);
1664 a[0] = (char*) "/sbin/init";
1665 execve(a[0], a, (char**) envp);
1666 } else if (argc > optind)
1667 execvpe(argv[optind], argv + optind, (char**) envp);
1669 chdir(home ? home : "/root");
1670 execle("/bin/bash", "-bash", NULL, (char**) envp);
1673 log_error("execv() failed: %m");
1676 _exit(EXIT_FAILURE);
/* Parent: closing pipefd releases the child that waits for this message. */
1679 log_info("Init process in the container running as PID %lu.", (unsigned long) pid);
1680 close_nointr_nofail(pipefd[0]);
1681 close_nointr_nofail(pipefd[1]);
1683 /* Wait for the child process to establish cgroup hierarchy */
1684 close_nointr_nofail(pipefd2[1]);
1685 fd_wait_for_event(pipefd2[0], POLLHUP, -1);
1686 close_nointr_nofail(pipefd2[0]);
1688 save_attributes(newcg, pid, arg_uuid, arg_directory);
1693 if (process_pty(master, pid, &mask) < 0)
1696 if (saved_attr_valid)
1697 tcsetattr(STDIN_FILENO, TCSANOW, &saved_attr);
1699 r = wait_for_terminate(pid, &status);
/* Report how the container ended. */
1705 if (status.si_code == CLD_EXITED) {
1706 if (status.si_status != 0) {
1707 log_error("Container failed with error code %i.", status.si_status);
1708 r = status.si_status;
1712 log_debug("Container exited successfully.");
1714 } else if (status.si_code == CLD_KILLED &&
1715 status.si_status == SIGINT) {
1716 log_info("Container has been shut down.");
1719 } else if (status.si_code == CLD_KILLED &&
1720 status.si_status == SIGHUP) {
1721 log_info("Container is being rebooted.");
1723 } else if (status.si_code == CLD_KILLED ||
1724 status.si_code == CLD_DUMPED) {
1726 log_error("Container terminated by signal %s.", signal_to_string(status.si_status));
1730 log_error("Container failed due to unknown reason.");
/* Cleanup: restore the terminal, close the kmsg sockets, kill whatever is
 * left in the container's cgroup and free the option storage. */
1737 if (saved_attr_valid)
1738 tcsetattr(STDIN_FILENO, TCSANOW, &saved_attr);
1740 close_pipe(kmsg_socket_pair);
1743 cg_kill_recursive_and_wait(SYSTEMD_CGROUP_CONTROLLER, newcg, true);
1745 free(arg_directory);
1747 strv_free(arg_controllers);