1 /*-*- Mode: C; c-basic-offset: 8; indent-tabs-mode: nil -*-*/
4 This file is part of systemd.
6 Copyright 2010 Lennart Poettering
8 systemd is free software; you can redistribute it and/or modify it
9 under the terms of the GNU Lesser General Public License as published by
10 the Free Software Foundation; either version 2.1 of the License, or
11 (at your option) any later version.
13 systemd is distributed in the hope that it will be useful, but
14 WITHOUT ANY WARRANTY; without even the implied warranty of
15 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
16 Lesser General Public License for more details.
18 You should have received a copy of the GNU Lesser General Public License
19 along with systemd; If not, see <http://www.gnu.org/licenses/>.
25 #include <attr/xattr.h>
26 #include <sys/types.h>
27 #include <sys/syscall.h>
28 #include <sys/mount.h>
34 #include <sys/prctl.h>
35 #include <sys/capability.h>
38 #include <sys/epoll.h>
40 #include <sys/signalfd.h>
44 #include <sys/socket.h>
46 #include <systemd/sd-daemon.h>
54 #include "cgroup-util.h"
56 #include "path-util.h"
57 #include "loopback-setup.h"
59 #include "dev-setup.h"
68 typedef enum LinkJournal {
/* Command-line settings. The assignments visible in parse_argv() fill these in
 * from the options given by the user. */
75 static char *arg_directory = NULL;
76 static char *arg_user = NULL;
77 static char **arg_controllers = NULL;
78 static char *arg_uuid = NULL;
79 static char *arg_machine = NULL;
80 static bool arg_private_network = false;
81 static bool arg_read_only = false;
82 static bool arg_boot = false;
83 static LinkJournal arg_link_journal = LINK_AUTO;
/* Bit mask of capabilities to keep (one bit per CAP_* number).
 * --capability= ORs additional bits into it, and drop_capabilities() passes
 * its complement to capability_bounding_set_drop(). */
84 static uint64_t arg_retain =
86 (1ULL << CAP_DAC_OVERRIDE) |
87 (1ULL << CAP_DAC_READ_SEARCH) |
88 (1ULL << CAP_FOWNER) |
89 (1ULL << CAP_FSETID) |
90 (1ULL << CAP_IPC_OWNER) |
93 (1ULL << CAP_LINUX_IMMUTABLE) |
94 (1ULL << CAP_NET_BIND_SERVICE) |
95 (1ULL << CAP_NET_BROADCAST) |
96 (1ULL << CAP_NET_RAW) |
97 (1ULL << CAP_SETGID) |
98 (1ULL << CAP_SETFCAP) |
99 (1ULL << CAP_SETPCAP) |
100 (1ULL << CAP_SETUID) |
101 (1ULL << CAP_SYS_ADMIN) |
102 (1ULL << CAP_SYS_CHROOT) |
103 (1ULL << CAP_SYS_NICE) |
104 (1ULL << CAP_SYS_PTRACE) |
105 (1ULL << CAP_SYS_TTY_CONFIG) |
106 (1ULL << CAP_SYS_RESOURCE) |
107 (1ULL << CAP_SYS_BOOT) |
108 (1ULL << CAP_AUDIT_WRITE) |
109 (1ULL << CAP_AUDIT_CONTROL);
/* String vectors of path pairs appended by --bind= and --bind-ro=
 * (two entries per option); mount_binds() walks such a list in pairs. */
110 static char **arg_bind = NULL;
111 static char **arg_bind_ro = NULL;
/* Print the usage summary to stdout. The program name shown is
 * program_invocation_short_name. The listed options correspond to the
 * long-option table and short-option string used by parse_argv(). */
113 static int help(void) {
115 printf("%s [OPTIONS...] [PATH] [ARGUMENTS...]\n\n"
116 "Spawn a minimal namespace container for debugging, testing and building.\n\n"
117 " -h --help Show this help\n"
118 " --version Print version string\n"
119 " -D --directory=NAME Root directory for the container\n"
120 " -b --boot Boot up full system (i.e. invoke init)\n"
121 " -u --user=USER Run the command under specified user or uid\n"
122 " -C --controllers=LIST Put the container in specified comma-separated\n"
123 " cgroup hierarchies\n"
124 " --uuid=UUID Set a specific machine UUID for the container\n"
125 " -M --machine=NAME Set the machine name for the container\n"
126 " --private-network Disable network in container\n"
127 " --read-only Mount the root directory read-only\n"
128 " --capability=CAP In addition to the default, retain specified\n"
130 " --link-journal=MODE Link up guest journal, one of no, auto, guest, host\n"
/* The -j handling in parse_argv() selects LINK_GUEST, so the text must say "guest". */
131 " -j Equivalent to --link-journal=guest\n"
132 " --bind=PATH[:PATH] Bind mount a file or directory from the host into\n"
134 " --bind-ro=PATH[:PATH] Similar, but creates a read-only bind mount\n",
135 program_invocation_short_name);
/* Parse the command line into the arg_* globals.
 * Options are read with getopt_long() using the long-option table below and
 * the short-option string "+hD:u:C:bM:j". Most case labels and all return
 * statements are outside the lines visible in this excerpt. */
140 static int parse_argv(int argc, char *argv[]) {
153 static const struct option options[] = {
154 { "help", no_argument, NULL, 'h' },
155 { "version", no_argument, NULL, ARG_VERSION },
156 { "directory", required_argument, NULL, 'D' },
157 { "user", required_argument, NULL, 'u' },
158 { "controllers", required_argument, NULL, 'C' },
159 { "private-network", no_argument, NULL, ARG_PRIVATE_NETWORK },
160 { "boot", no_argument, NULL, 'b' },
161 { "uuid", required_argument, NULL, ARG_UUID },
162 { "read-only", no_argument, NULL, ARG_READ_ONLY },
163 { "capability", required_argument, NULL, ARG_CAPABILITY },
164 { "link-journal", required_argument, NULL, ARG_LINK_JOURNAL },
165 { "bind", required_argument, NULL, ARG_BIND },
166 { "bind-ro", required_argument, NULL, ARG_BIND_RO },
167 { "machine", required_argument, NULL, 'M' },
176 while ((c = getopt_long(argc, argv, "+hD:u:C:bM:j", options, NULL)) >= 0) {
185 puts(PACKAGE_STRING);
186 puts(SYSTEMD_FEATURES);
/* Root directory: stored in canonicalized form. */
191 arg_directory = canonicalize_file_name(optarg);
192 if (!arg_directory) {
193 log_error("Failed to canonicalize root directory.");
201 arg_user = strdup(optarg);
/* Replace any earlier controller list with the comma-separated one in optarg. */
208 strv_free(arg_controllers);
209 arg_controllers = strv_split(optarg, ",");
210 if (!arg_controllers)
213 cg_shorten_controllers(arg_controllers);
216 case ARG_PRIVATE_NETWORK:
217 arg_private_network = true;
225 if (!id128_is_valid(optarg)) {
226 log_error("Invalid UUID: %s", optarg);
234 if (!hostname_is_valid(optarg)) {
235 log_error("Invalid machine name: %s", optarg);
240 arg_machine = strdup(optarg);
247 arg_read_only = true;
/* Each comma-separated word of optarg is looked up with cap_from_name()
 * and its bit is added to arg_retain. */
250 case ARG_CAPABILITY: {
254 FOREACH_WORD_SEPARATOR(word, length, optarg, ",", state) {
258 t = strndup(word, length);
262 if (cap_from_name(t, &cap) < 0) {
263 log_error("Failed to parse capability %s.", t);
269 arg_retain |= 1ULL << (uint64_t) cap;
/* NOTE(review): presumably the -j case (its label is not visible here);
 * it selects LINK_GUEST, so the -j line in help() has to stay in sync. */
276 arg_link_journal = LINK_GUEST;
279 case ARG_LINK_JOURNAL:
280 if (streq(optarg, "auto"))
281 arg_link_journal = LINK_AUTO;
282 else if (streq(optarg, "no"))
283 arg_link_journal = LINK_NO;
284 else if (streq(optarg, "guest"))
285 arg_link_journal = LINK_GUEST;
286 else if (streq(optarg, "host"))
287 arg_link_journal = LINK_HOST;
289 log_error("Failed to parse link journal mode %s", optarg);
/* --bind / --bind-ro: x selects the target list; optarg is searched for ':'
 * and both resulting paths (a, b) must be absolute before they are appended
 * to the list as a pair. How b is filled in is not visible in this excerpt. */
297 _cleanup_free_ char *a = NULL, *b = NULL;
302 x = c == ARG_BIND ? &arg_bind : &arg_bind_ro;
304 e = strchr(optarg, ':');
306 a = strndup(optarg, e - optarg);
316 if (!path_is_absolute(a) || !path_is_absolute(b)) {
317 log_error("Invalid bind mount specification: %s", optarg);
321 r = strv_extend(x, a);
325 r = strv_extend(x, b);
336 log_error("Unknown option code %c", c);
/* Walk mount_table and mount each entry at dest + "/" + entry.where.
 * Entries whose "what" is NULL are the remount steps of a preceding bind
 * mount. A failing mount() is reported via log_error() only when the entry's
 * "fatal" flag is set. */
344 static int mount_all(const char *dest) {
346 typedef struct MountPoint {
/* Columns in order: what (source), where (mount point), a third string that
 * looks like the file system type, options, flags, fatal. */
355 static const MountPoint mount_table[] = {
356 { "proc", "/proc", "proc", NULL, MS_NOSUID|MS_NOEXEC|MS_NODEV, true },
357 { "/proc/sys", "/proc/sys", NULL, NULL, MS_BIND, true }, /* Bind mount first */
358 { NULL, "/proc/sys", NULL, NULL, MS_BIND|MS_RDONLY|MS_REMOUNT, true }, /* Then, make it r/o */
359 { "sysfs", "/sys", "sysfs", NULL, MS_RDONLY|MS_NOSUID|MS_NOEXEC|MS_NODEV, true },
360 { "tmpfs", "/dev", "tmpfs", "mode=755", MS_NOSUID|MS_STRICTATIME, true },
361 { "devpts", "/dev/pts", "devpts","newinstance,ptmxmode=0666,mode=620,gid=" STRINGIFY(TTY_GID), MS_NOSUID|MS_NOEXEC, true },
362 { "tmpfs", "/dev/shm", "tmpfs", "mode=1777", MS_NOSUID|MS_NODEV|MS_STRICTATIME, true },
363 { "tmpfs", "/run", "tmpfs", "mode=755", MS_NOSUID|MS_NODEV|MS_STRICTATIME, true },
365 { "/sys/fs/selinux", "/sys/fs/selinux", NULL, NULL, MS_BIND, false }, /* Bind mount first */
366 { NULL, "/sys/fs/selinux", NULL, NULL, MS_BIND|MS_RDONLY|MS_REMOUNT, false }, /* Then, make it r/o */
373 for (k = 0; k < ELEMENTSOF(mount_table); k++) {
374 _cleanup_free_ char *where = NULL;
377 where = strjoin(dest, "/", mount_table[k].where, NULL);
/* t > 0 means the target already is a mount point; t < 0 is an error code. */
381 t = path_is_mount_point(where, true);
383 log_error("Failed to detect whether %s is a mount point: %s", where, strerror(-t));
391 /* Skip this entry if it is not a remount. */
392 if (mount_table[k].what && t > 0)
/* The result of mkdir_p() is not checked on the visible line. */
395 mkdir_p(where, 0755);
397 if (mount(mount_table[k].what,
400 mount_table[k].flags,
401 mount_table[k].options) < 0 &&
402 mount_table[k].fatal) {
404 log_error("mount(%s) failed: %m", where);
/* Bind mount every pair in l below dest.
 * l is walked two entries at a time: *x is the mount source and *y the path
 * that is appended to dest to form the mount point. When flags is non-zero
 * the new mount is additionally remounted with MS_REMOUNT|MS_BIND|flags. */
414 static int mount_binds(const char *dest, char **l, unsigned long flags) {
417 STRV_FOREACH_PAIR(x, y, l) {
418 _cleanup_free_ char *where = NULL;
420 where = strjoin(dest, "/", *y, NULL);
/* The result of mkdir_p_label() is not checked on the visible line. */
424 mkdir_p_label(where, 0755);
426 if (mount(*x, where, "bind", MS_BIND, NULL) < 0) {
427 log_error("mount(%s) failed: %m", where);
431 if (flags && mount(NULL, where, NULL, MS_REMOUNT|MS_BIND|flags, NULL) < 0) {
432 log_error("mount(%s) failed: %m", where);
/* Make dest + "/etc/localtime" a symlink to the same zone that /etc/localtime
 * points to outside of dest.
 * z is the zone name taken from the /etc/localtime link target, y the zone name
 * taken from the link that already exists below dest (if any). */
440 static int setup_timezone(const char *dest) {
441 _cleanup_free_ char *where = NULL, *p = NULL, *q = NULL, *check = NULL, *what = NULL;
447 /* Fix the timezone, if possible */
448 r = readlink_malloc("/etc/localtime", &p);
450 log_warning("/etc/localtime is not a symlink, not updating container timezone.");
/* Accept both the relative and the absolute form of the link target. */
454 z = path_startswith(p, "../usr/share/zoneinfo/");
456 z = path_startswith(p, "/usr/share/zoneinfo/");
458 log_warning("/etc/localtime does not point into /usr/share/zoneinfo/, not updating container timezone.");
462 where = strappend(dest, "/etc/localtime");
466 r = readlink_malloc(where, &q);
468 y = path_startswith(q, "../usr/share/zoneinfo/");
470 y = path_startswith(q, "/usr/share/zoneinfo/");
473 /* Already pointing to the right place? Then do nothing .. */
474 if (y && streq(y, z))
/* Only link to zones whose zoneinfo file exists below dest. */
478 check = strjoin(dest, "/usr/share/zoneinfo/", z, NULL);
482 if (access(check, F_OK) < 0) {
483 log_warning("Timezone %s does not exist in container, not updating container timezone.", z);
487 what = strappend("../usr/share/zoneinfo/", z);
492 if (symlink(what, where) < 0) {
493 log_error("Failed to correct timezone of container: %m");
/* Bind mount /etc/resolv.conf onto dest + "/etc/resolv.conf" and then try to
 * remount it read-only. arg_private_network is tested first; the body of that
 * branch is not visible in this excerpt. */
500 static int setup_resolv_conf(const char *dest) {
501 char _cleanup_free_ *where = NULL;
502 _cleanup_close_ int fd = -1;
506 if (arg_private_network)
509 /* Fix resolv.conf, if possible */
510 where = strappend(dest, "/etc/resolv.conf");
/* O_CREAT|O_EXCL: only creates the file if it does not exist yet; the fd is
 * closed automatically via _cleanup_close_. */
514 fd = open(where, O_WRONLY|O_CREAT|O_EXCL|O_CLOEXEC|O_NOCTTY|O_NOFOLLOW, 0644);
516 /* We don't really care for the results of this really. If it
517 * fails, it fails, but meh... */
518 if (mount("/etc/resolv.conf", where, "bind", MS_BIND, NULL) < 0)
519 log_warning("Failed to bind mount /etc/resolv.conf: %m");
521 if (mount("/etc/resolv.conf", where, "bind",
522 MS_BIND|MS_REMOUNT|MS_RDONLY, NULL) < 0) {
523 log_error("Failed to remount /etc/resolv.conf readonly: %m");
/* Give the tree below dest its own random boot ID.
 * A random 128-bit ID is formatted as a UUID string, written to the file
 * "from" and that file is bind mounted over "to"
 * (dest + "/proc/sys/kernel/random/boot_id"). */
530 static int setup_boot_id(const char *dest) {
531 _cleanup_free_ char *from = NULL, *to = NULL;
538 /* Generate a new randomized boot ID, so that each boot-up of
539 * the container gets a new one */
541 from = strappend(dest, "/dev/proc-sys-kernel-random-boot-id");
542 to = strappend(dest, "/proc/sys/kernel/random/boot_id");
546 r = sd_id128_randomize(&rnd);
548 log_error("Failed to generate random boot id: %s", strerror(-r));
/* Format the 16 bytes in the 8-4-4-4-12 hex-digit UUID layout. */
552 snprintf(as_uuid, sizeof(as_uuid),
553 "%02x%02x%02x%02x-%02x%02x-%02x%02x-%02x%02x-%02x%02x%02x%02x%02x%02x",
554 SD_ID128_FORMAT_VAL(rnd));
555 char_array_0(as_uuid);
557 r = write_string_file(from, as_uuid);
559 log_error("Failed to write boot id: %s", strerror(-r));
/* A failing bind mount is logged as an error; a failing read-only remount
 * only as a warning. */
563 if (mount(from, to, "bind", MS_BIND, NULL) < 0) {
564 log_error("Failed to bind mount boot id: %m");
566 } else if (mount(from, to, "bind", MS_BIND|MS_REMOUNT|MS_RDONLY, NULL))
567 log_warning("Failed to make boot id read-only: %m");
/* Re-create a fixed list of device nodes below dest + "/dev".
 * For every name d in the devnodes list, "/dev/<d>" is stat()ed and, if it is
 * a character or block device, a node with the same mode and device number is
 * created at "<dest>/dev/<d>" with mknod(). A source that fails stat() with
 * ENOENT is not reported as an error. */
573 static int copy_devnodes(const char *dest) {
575 static const char devnodes[] =
585 _cleanup_umask_ mode_t u;
591 NULSTR_FOREACH(d, devnodes) {
593 _cleanup_free_ char *from = NULL, *to = NULL;
595 asprintf(&from, "/dev/%s", d);
596 asprintf(&to, "%s/dev/%s", dest, d);
607 if (stat(from, &st) < 0) {
609 if (errno != ENOENT) {
610 log_error("Failed to stat %s: %m", from);
615 } else if (!S_ISCHR(st.st_mode) && !S_ISBLK(st.st_mode)) {
617 log_error("%s is not a char or block device, cannot copy", from);
621 } else if (mknod(to, st.st_mode, st.st_rdev) < 0) {
/* Report the node path that was passed to mknod(), not the container root. */
623 log_error("mknod(%s) failed: %m", to);
/* Create dest + "/dev/ptmx" as a symlink pointing to "pts/ptmx". */
632 static int setup_ptmx(const char *dest) {
633 _cleanup_free_ char *p = NULL;
635 p = strappend(dest, "/dev/ptmx");
639 if (symlink("pts/ptmx", p) < 0) {
640 log_error("Failed to create /dev/ptmx symlink: %m");
/* Make the tty named by console available as dest + "/dev/console".
 * console must be a character device; its access mode and ownership are set
 * to 0600 / 0:0 via chmod_and_chown(), a device node with the same device
 * number is created at dest + "/dev/console", and console is bind mounted
 * on top of that node. */
647 static int setup_dev_console(const char *dest, const char *console) {
649 _cleanup_free_ char *to = NULL;
651 _cleanup_umask_ mode_t u;
658 if (stat(console, &st) < 0) {
659 log_error("Failed to stat %s: %m", console);
662 } else if (!S_ISCHR(st.st_mode)) {
663 log_error("/dev/console is not a char device");
667 r = chmod_and_chown(console, 0600, 0, 0);
669 log_error("Failed to correct access mode for TTY: %s", strerror(-r));
673 if (asprintf(&to, "%s/dev/console", dest) < 0)
676 /* We need to bind mount the right tty to /dev/console since
677 * ptys can only exist on pts file systems. To have something
678 * to bind mount things on we create a device node first, that
679 * has the right major/minor (note that the major minor
680 * doesn't actually matter here, since we mount it over
683 if (mknod(to, (st.st_mode & ~07777) | 0600, st.st_rdev) < 0) {
684 log_error("mknod() for /dev/console failed: %m");
688 if (mount(console, to, "bind", MS_BIND, NULL) < 0) {
689 log_error("Bind mount for /dev/console failed: %m");
/* Provide a FIFO as dest + "/proc/kmsg" and hand one open fd of it to the
 * other end of kmsg_socket.
 * The FIFO is created at dest + "/dev/kmsg", bind mounted onto
 * dest + "/proc/kmsg", opened, and the resulting fd is sent over kmsg_socket
 * as an SCM_RIGHTS control message. kmsg_socket must be a valid fd (>= 0). */
696 static int setup_kmsg(const char *dest, int kmsg_socket) {
697 _cleanup_free_ char *from = NULL, *to = NULL;
699 _cleanup_umask_ mode_t u;
/* Control-message buffer with room for exactly one int (the fd to pass). */
701 struct cmsghdr cmsghdr;
702 uint8_t buf[CMSG_SPACE(sizeof(int))];
705 .msg_control = &control,
706 .msg_controllen = sizeof(control),
708 struct cmsghdr *cmsg;
711 assert(kmsg_socket >= 0);
715 /* We create the kmsg FIFO as /dev/kmsg, but immediately
716 * delete it after bind mounting it to /proc/kmsg. While FIFOs
717 * on the reading side behave very similar to /proc/kmsg,
718 * their writing side behaves differently from /dev/kmsg in
719 * that writing blocks when nothing is reading. In order to
720 * avoid any problems with containers deadlocking due to this
721 * we simply make /dev/kmsg unavailable to the container. */
722 if (asprintf(&from, "%s/dev/kmsg", dest) < 0 ||
723 asprintf(&to, "%s/proc/kmsg", dest) < 0)
726 if (mkfifo(from, 0600) < 0) {
727 log_error("mkfifo() for /dev/kmsg failed: %m");
731 r = chmod_and_chown(from, 0600, 0, 0);
733 log_error("Failed to correct access mode for /dev/kmsg: %s", strerror(-r));
737 if (mount(from, to, "bind", MS_BIND, NULL) < 0) {
738 log_error("Bind mount for /proc/kmsg failed: %m");
742 fd = open(from, O_RDWR|O_NDELAY|O_CLOEXEC);
744 log_error("Failed to open fifo: %m");
/* Build a single SCM_RIGHTS control message carrying fd. */
748 cmsg = CMSG_FIRSTHDR(&mh);
749 cmsg->cmsg_level = SOL_SOCKET;
750 cmsg->cmsg_type = SCM_RIGHTS;
751 cmsg->cmsg_len = CMSG_LEN(sizeof(int));
752 memcpy(CMSG_DATA(cmsg), &fd, sizeof(int));
754 mh.msg_controllen = cmsg->cmsg_len;
756 /* Store away the fd in the socket, so that it stays open as
757 * long as we run the child */
758 k = sendmsg(kmsg_socket, &mh, MSG_DONTWAIT|MSG_NOSIGNAL);
/* The local copy of fd is closed right after the send attempt. */
759 close_nointr_nofail(fd);
762 log_error("Failed to send FIFO fd: %m");
766 /* And now make the FIFO unavailable as /dev/kmsg... */
/* Set the host name of the calling process' UTS context to arg_machine
 * via sethostname(). */
771 static int setup_hostname(void) {
773 if (sethostname(arg_machine, strlen(arg_machine)) < 0)
/* Connect the journal directory of the tree at "directory" with the host's
 * /var/log/journal, according to arg_link_journal.
 * The machine ID is read from directory + "/etc/machine-id". After that,
 * p is "/var/log/journal/<id>" (host side) and q is the same path below
 * "directory" (guest side). Depending on the mode, p becomes a symlink to q
 * or p is bind mounted onto q. Nothing is done for LINK_NO (first check
 * below); several branch bodies are not visible in this excerpt. */
779 static int setup_journal(const char *directory) {
780 sd_id128_t machine_id;
781 _cleanup_free_ char *p = NULL, *b = NULL, *q = NULL, *d = NULL;
785 if (arg_link_journal == LINK_NO)
788 p = strappend(directory, "/etc/machine-id");
792 r = read_one_line_file(p, &b);
793 if (r == -ENOENT && arg_link_journal == LINK_AUTO)
796 log_error("Failed to read machine ID from %s: %s", p, strerror(-r));
801 if (isempty(id) && arg_link_journal == LINK_AUTO)
804 /* Verify validity */
805 r = sd_id128_from_string(id, &machine_id);
807 log_error("Failed to parse machine ID from %s: %s", p, strerror(-r));
/* From here on p is the host-side path and q the guest-side path. */
812 p = strappend("/var/log/journal/", id);
813 q = strjoin(directory, "/var/log/journal/", id, NULL);
/* Existing mount points on either side are only an error when a mode other
 * than LINK_AUTO was requested. */
817 if (path_is_mount_point(p, false) > 0) {
818 if (arg_link_journal != LINK_AUTO) {
819 log_error("%s: already a mount point, refusing to use for journal", p);
826 if (path_is_mount_point(q, false) > 0) {
827 if (arg_link_journal != LINK_AUTO) {
828 log_error("%s: already a mount point, refusing to use for journal", q);
/* Inspect what currently exists at p: r distinguishes a symlink (success),
 * a non-symlink (-EINVAL) and a missing path (-ENOENT). */
835 r = readlink_and_make_absolute(p, &d);
837 if ((arg_link_journal == LINK_GUEST ||
838 arg_link_journal == LINK_AUTO) &&
841 r = mkdir_p(q, 0755);
843 log_warning("failed to create directory %s: %m", q);
848 log_error("Failed to remove symlink %s: %m", p);
851 } else if (r == -EINVAL) {
853 if (arg_link_journal == LINK_GUEST &&
856 if (errno == ENOTDIR) {
857 log_error("%s already exists and is neither a symlink nor a directory", p);
860 log_error("Failed to remove %s: %m", p);
864 } else if (r != -ENOENT) {
/* NOTE(review): the error is in r here, while %m formats errno - confirm
 * that errno still matches at this point. */
865 log_error("readlink(%s) failed: %m", p);
/* LINK_GUEST: the host path p becomes a symlink to the guest path q. */
869 if (arg_link_journal == LINK_GUEST) {
871 if (symlink(q, p) < 0) {
872 log_error("Failed to symlink %s to %s: %m", q, p);
876 r = mkdir_p(q, 0755);
878 log_warning("failed to create directory %s: %m", q);
/* LINK_HOST: make sure the host directory p exists. */
882 if (arg_link_journal == LINK_HOST) {
883 r = mkdir_p(p, 0755);
885 log_error("Failed to create %s: %m", p);
889 } else if (access(p, F_OK) < 0)
/* dir_is_empty() returning 0 is treated as "q has content" and reported. */
892 if (dir_is_empty(q) == 0) {
893 log_error("%s not empty.", q);
897 r = mkdir_p(q, 0755);
899 log_error("Failed to create %s: %m", q);
903 if (mount(p, q, "bind", MS_BIND, NULL) < 0) {
904 log_error("Failed to bind mount journal from host into guest: %m");
/* Create the cgroup "path" and attach to it, first in
 * SYSTEMD_CGROUP_CONTROLLER and then in every controller listed in
 * arg_controllers. A failure in the first step is logged as an error,
 * failures for the additional controllers only as warnings. */
911 static int setup_cgroup(const char *path) {
915 r = cg_create_and_attach(SYSTEMD_CGROUP_CONTROLLER, path, 1);
917 log_error("Failed to create cgroup: %s", strerror(-r));
921 STRV_FOREACH(c, arg_controllers) {
922 r = cg_create_and_attach(*c, path, 1);
924 log_warning("Failed to create cgroup in controller %s: %s", *c, strerror(-r));
/* Record information about the container as extended attributes on the
 * cgroup's file system path (looked up with cg_get_path()):
 *   trusted.init_pid        - pid, formatted as a decimal string
 *   trusted.machine_id      - uuid
 *   trusted.root_directory  - directory
 * Failures to set an attribute are logged as warnings. */
930 static int save_attributes(const char *cgroup, pid_t pid, const char *uuid, const char *directory) {
931 _cleanup_free_ char *path = NULL;
932 char buf[DECIMAL_STR_MAX(pid_t)];
/* NOTE(review): this asserts the global arg_directory, whereas the value
 * written below is the "directory" parameter. */
937 assert(arg_directory);
/* assert_se() verifies that the formatted pid was not truncated. */
940 assert_se(snprintf(buf, sizeof(buf), "%lu", (unsigned long) pid) < (int) sizeof(buf));
942 r = cg_get_path(SYSTEMD_CGROUP_CONTROLLER, cgroup, NULL, &path);
944 log_error("Failed to get path: %s", strerror(-r));
948 r = setxattr(path, "trusted.init_pid", buf, strlen(buf), XATTR_CREATE);
950 log_warning("Failed to set %s attribute on %s: %m", "trusted.init_pid", path);
953 k = setxattr(path, "trusted.machine_id", uuid, strlen(uuid), XATTR_CREATE);
955 log_warning("Failed to set %s attribute on %s: %m", "trusted.machine_id", path);
961 k = setxattr(path, "trusted.root_directory", directory, strlen(directory), XATTR_CREATE);
963 log_warning("Failed to set %s attribute on %s: %m", "trusted.root_directory", path);
/* Drop every capability that is not set in arg_retain from the bounding set;
 * returns the result of capability_bounding_set_drop(). */
971 static int drop_capabilities(void) {
972 return capability_bounding_set_drop(~arg_retain, false);
/* Shovel data between the caller's stdin/stdout and the pty "master" until
 * the loop is left, and react to signals delivered through a signalfd for
 * the set "mask".
 *   in_buffer  : bytes read from stdin, written to master
 *   out_buffer : bytes read from master, written to stdout
 * SIGWINCH copies the window size of stdin to master. The first SIGTERM while
 * arg_boot is set sends SIGRTMIN+3 to pid. The epoll fd and the signalfd are
 * closed at the end. */
975 static int process_pty(int master, pid_t pid, sigset_t *mask) {
977 char in_buffer[LINE_MAX], out_buffer[LINE_MAX];
978 size_t in_buffer_full = 0, out_buffer_full = 0;
979 struct epoll_event stdin_ev, stdout_ev, master_ev, signal_ev;
980 bool stdin_readable = false, stdout_writable = false, master_readable = false, master_writable = false;
981 int ep = -1, signal_fd = -1, r;
982 bool tried_orderly_shutdown = false;
/* All three data fds are switched to non-blocking mode. */
988 fd_nonblock(STDIN_FILENO, 1);
989 fd_nonblock(STDOUT_FILENO, 1);
990 fd_nonblock(master, 1);
992 signal_fd = signalfd(-1, mask, SFD_NONBLOCK|SFD_CLOEXEC);
994 log_error("signalfd(): %m");
999 ep = epoll_create1(EPOLL_CLOEXEC);
1001 log_error("Failed to create epoll: %m");
1006 /* We read from STDIN only if this is actually a TTY,
1007 * otherwise we assume non-interactivity. */
1008 if (isatty(STDIN_FILENO)) {
1010 stdin_ev.events = EPOLLIN|EPOLLET;
1011 stdin_ev.data.fd = STDIN_FILENO;
1013 if (epoll_ctl(ep, EPOLL_CTL_ADD, STDIN_FILENO, &stdin_ev) < 0) {
1014 log_error("Failed to register STDIN in epoll: %m");
/* The data fds are registered edge-triggered (EPOLLET); the signalfd is not. */
1021 stdout_ev.events = EPOLLOUT|EPOLLET;
1022 stdout_ev.data.fd = STDOUT_FILENO;
1025 master_ev.events = EPOLLIN|EPOLLOUT|EPOLLET;
1026 master_ev.data.fd = master;
1029 signal_ev.events = EPOLLIN;
1030 signal_ev.data.fd = signal_fd;
1032 if (epoll_ctl(ep, EPOLL_CTL_ADD, STDOUT_FILENO, &stdout_ev) < 0) {
1033 if (errno != EPERM) {
1034 log_error("Failed to register stdout in epoll: %m");
1038 /* stdout without epoll support. Likely redirected to regular file. */
1039 stdout_writable = true;
1042 if (epoll_ctl(ep, EPOLL_CTL_ADD, master, &master_ev) < 0 ||
1043 epoll_ctl(ep, EPOLL_CTL_ADD, signal_fd, &signal_ev) < 0) {
1044 log_error("Failed to register fds in epoll: %m");
1050 struct epoll_event ev[16];
1054 nfds = epoll_wait(ep, ev, ELEMENTSOF(ev), -1);
1057 if (errno == EINTR || errno == EAGAIN)
1060 log_error("epoll_wait(): %m");
/* Translate the reported events into the *_readable / *_writable flags that
 * drive the copy loop further down. */
1067 for (i = 0; i < nfds; i++) {
1068 if (ev[i].data.fd == STDIN_FILENO) {
1070 if (ev[i].events & (EPOLLIN|EPOLLHUP))
1071 stdin_readable = true;
1073 } else if (ev[i].data.fd == STDOUT_FILENO) {
1075 if (ev[i].events & (EPOLLOUT|EPOLLHUP))
1076 stdout_writable = true;
1078 } else if (ev[i].data.fd == master) {
1080 if (ev[i].events & (EPOLLIN|EPOLLHUP))
1081 master_readable = true;
1083 if (ev[i].events & (EPOLLOUT|EPOLLHUP))
1084 master_writable = true;
1086 } else if (ev[i].data.fd == signal_fd) {
1087 struct signalfd_siginfo sfsi;
1090 n = read(signal_fd, &sfsi, sizeof(sfsi));
1091 if (n != sizeof(sfsi)) {
1094 log_error("Failed to read from signalfd: invalid block size");
1099 if (errno != EINTR && errno != EAGAIN) {
1100 log_error("Failed to read from signalfd: %m");
1106 if (sfsi.ssi_signo == SIGWINCH) {
1109 /* The window size changed, let's forward that. */
1110 if (ioctl(STDIN_FILENO, TIOCGWINSZ, &ws) >= 0)
1111 ioctl(master, TIOCSWINSZ, &ws);
1112 } else if (sfsi.ssi_signo == SIGTERM && arg_boot && !tried_orderly_shutdown) {
1114 log_info("Trying to halt container. Send SIGTERM again to trigger immediate termination.");
1116 /* This only works for systemd... */
1117 tried_orderly_shutdown = true;
1118 kill(pid, SIGRTMIN+3);
/* Keep copying while at least one direction can make progress: something to
 * read into an empty buffer, or buffered data that can be written out. */
1128 while ((stdin_readable && in_buffer_full <= 0) ||
1129 (master_writable && in_buffer_full > 0) ||
1130 (master_readable && out_buffer_full <= 0) ||
1131 (stdout_writable && out_buffer_full > 0)) {
/* stdin -> in_buffer */
1133 if (stdin_readable && in_buffer_full < LINE_MAX) {
1135 k = read(STDIN_FILENO, in_buffer + in_buffer_full, LINE_MAX - in_buffer_full);
1138 if (errno == EAGAIN || errno == EPIPE || errno == ECONNRESET || errno == EIO)
1139 stdin_readable = false;
1141 log_error("read(): %m");
1146 in_buffer_full += (size_t) k;
/* in_buffer -> master; unwritten bytes are moved to the buffer start. */
1149 if (master_writable && in_buffer_full > 0) {
1151 k = write(master, in_buffer, in_buffer_full);
1154 if (errno == EAGAIN || errno == EPIPE || errno == ECONNRESET || errno == EIO)
1155 master_writable = false;
1157 log_error("write(): %m");
1163 assert(in_buffer_full >= (size_t) k);
1164 memmove(in_buffer, in_buffer + k, in_buffer_full - k);
1165 in_buffer_full -= k;
/* master -> out_buffer */
1169 if (master_readable && out_buffer_full < LINE_MAX) {
1171 k = read(master, out_buffer + out_buffer_full, LINE_MAX - out_buffer_full);
1174 if (errno == EAGAIN || errno == EPIPE || errno == ECONNRESET || errno == EIO)
1175 master_readable = false;
1177 log_error("read(): %m");
1182 out_buffer_full += (size_t) k;
/* out_buffer -> stdout; unwritten bytes are moved to the buffer start. */
1185 if (stdout_writable && out_buffer_full > 0) {
1187 k = write(STDOUT_FILENO, out_buffer, out_buffer_full);
1190 if (errno == EAGAIN || errno == EPIPE || errno == ECONNRESET || errno == EIO)
1191 stdout_writable = false;
1193 log_error("write(): %m");
1199 assert(out_buffer_full >= (size_t) k);
1200 memmove(out_buffer, out_buffer + k, out_buffer_full - k);
1201 out_buffer_full -= k;
1209 close_nointr_nofail(ep);
1212 close_nointr_nofail(signal_fd);
1217 int main(int argc, char *argv[]) {
1219 int r = EXIT_FAILURE, k;
1220 _cleanup_free_ char *machine_root = NULL, *name = NULL, *escaped = NULL, *newcg = NULL;
1221 _cleanup_close_ int master = -1;
1223 const char *console = NULL;
1224 struct termios saved_attr, raw_attr;
1226 bool saved_attr_valid = false;
1228 int kmsg_socket_pair[2] = { -1, -1 };
1231 log_parse_environment();
1234 r = parse_argv(argc, argv);
1238 if (arg_directory) {
1241 p = path_make_absolute_cwd(arg_directory);
1242 free(arg_directory);
1245 arg_directory = get_current_dir_name();
1247 if (!arg_directory) {
1248 log_error("Failed to determine path, please use -D.");
1252 path_kill_slashes(arg_directory);
1255 arg_machine = strdup(path_get_file_name(arg_directory));
1261 hostname_cleanup(arg_machine);
1262 if (isempty(arg_machine)) {
1263 log_error("Failed to determine machine name automatically, please use -M.");
1268 if (geteuid() != 0) {
1269 log_error("Need to be root.");
1273 if (sd_booted() <= 0) {
1274 log_error("Not running on a systemd system.");
1278 if (path_equal(arg_directory, "/")) {
1279 log_error("Spawning container on root directory not supported.");
1283 if (path_is_os_tree(arg_directory) <= 0) {
1284 log_error("Directory %s doesn't look like an OS root directory. Refusing.", arg_directory);
1289 n_fd_passed = sd_listen_fds(false);
1290 if (n_fd_passed > 0) {
1291 k = fdset_new_listen_fds(&fds, false);
1293 log_error("Failed to collect file descriptors: %s", strerror(-k));
1297 fdset_close_others(fds);
1300 k = cg_get_machine_path(&machine_root);
1302 log_error("Failed to determine machine cgroup path: %s", strerror(-k));
1306 name = strappend(arg_machine, ".nspawn");
1312 escaped = cg_escape(name);
1318 newcg = strjoin(machine_root, "/", escaped, NULL);
1324 r = cg_is_empty_recursive(SYSTEMD_CGROUP_CONTROLLER, newcg, false);
1325 if (r <= 0 && r != -ENOENT) {
1326 log_error("Container already running.");
1334 master = posix_openpt(O_RDWR|O_NOCTTY|O_CLOEXEC|O_NDELAY);
1336 log_error("Failed to acquire pseudo tty: %m");
1340 console = ptsname(master);
1342 log_error("Failed to determine tty name: %m");
1346 log_info("Spawning namespace container on %s (console is %s).", arg_directory, console);
1348 if (ioctl(STDIN_FILENO, TIOCGWINSZ, &ws) >= 0)
1349 ioctl(master, TIOCSWINSZ, &ws);
1351 if (unlockpt(master) < 0) {
1352 log_error("Failed to unlock tty: %m");
1356 if (tcgetattr(STDIN_FILENO, &saved_attr) >= 0) {
1357 saved_attr_valid = true;
1359 raw_attr = saved_attr;
1360 cfmakeraw(&raw_attr);
1361 raw_attr.c_lflag &= ~ECHO;
1364 if (socketpair(AF_UNIX, SOCK_DGRAM|SOCK_NONBLOCK|SOCK_CLOEXEC, 0, kmsg_socket_pair) < 0) {
1365 log_error("Failed to create kmsg socket pair.");
1369 assert_se(sigemptyset(&mask) == 0);
1370 sigset_add_many(&mask, SIGCHLD, SIGWINCH, SIGTERM, SIGINT, -1);
1371 assert_se(sigprocmask(SIG_BLOCK, &mask, NULL) == 0);
1375 int pipefd[2], pipefd2[2];
1377 if (pipe2(pipefd, O_NONBLOCK|O_CLOEXEC) < 0) {
1378 log_error("pipe2(): %m");
1382 if (pipe2(pipefd2, O_NONBLOCK|O_CLOEXEC) < 0) {
1383 log_error("pipe2(): %m");
1388 pid = syscall(__NR_clone, SIGCHLD|CLONE_NEWIPC|CLONE_NEWNS|CLONE_NEWPID|CLONE_NEWUTS|(arg_private_network ? CLONE_NEWNET : 0), NULL);
1390 if (errno == EINVAL)
1391 log_error("clone() failed, do you have namespace support enabled in your kernel? (You need UTS, IPC, PID and NET namespacing built in): %m");
1393 log_error("clone() failed: %m");
1400 const char *home = NULL;
1401 uid_t uid = (uid_t) -1;
1402 gid_t gid = (gid_t) -1;
1404 const char *envp[] = {
1405 "PATH=/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin",
1406 "container=systemd-nspawn", /* LXC sets container=lxc, so follow the scheme here */
1411 NULL, /* container_uuid */
1412 NULL, /* LISTEN_FDS */
1413 NULL, /* LISTEN_PID */
1417 envp[n_env] = strv_find_prefix(environ, "TERM=");
1421 /* Wait for the parent process to log our PID */
1422 close_nointr_nofail(pipefd[1]);
1423 fd_wait_for_event(pipefd[0], POLLHUP, -1);
1424 close_nointr_nofail(pipefd[0]);
1426 close_nointr_nofail(master);
1429 if (saved_attr_valid) {
1430 if (tcsetattr(STDIN_FILENO, TCSANOW, &raw_attr) < 0) {
1431 log_error("Failed to set terminal attributes: %m");
1436 close_nointr(STDIN_FILENO);
1437 close_nointr(STDOUT_FILENO);
1438 close_nointr(STDERR_FILENO);
1440 close_nointr_nofail(kmsg_socket_pair[0]);
1441 kmsg_socket_pair[0] = -1;
1443 reset_all_signal_handlers();
1445 assert_se(sigemptyset(&mask) == 0);
1446 assert_se(sigprocmask(SIG_SETMASK, &mask, NULL) == 0);
1448 k = open_terminal(console, O_RDWR);
1449 if (k != STDIN_FILENO) {
1451 close_nointr_nofail(k);
1455 log_error("Failed to open console: %s", strerror(-k));
1459 if (dup2(STDIN_FILENO, STDOUT_FILENO) != STDOUT_FILENO ||
1460 dup2(STDIN_FILENO, STDERR_FILENO) != STDERR_FILENO) {
1461 log_error("Failed to duplicate console: %m");
1466 log_error("setsid() failed: %m");
1470 if (prctl(PR_SET_PDEATHSIG, SIGKILL) < 0) {
1471 log_error("PR_SET_PDEATHSIG failed: %m");
1475 if (setup_cgroup(newcg) < 0)
1478 close_pipe(pipefd2);
1480 /* Mark everything as slave, so that we still
1481 * receive mounts from the real root, but don't
1482 * propagate mounts to the real root. */
1483 if (mount(NULL, "/", NULL, MS_SLAVE|MS_REC, NULL) < 0) {
1484 log_error("MS_SLAVE|MS_REC failed: %m");
1488 /* Turn directory into bind mount */
1489 if (mount(arg_directory, arg_directory, "bind", MS_BIND|MS_REC, NULL) < 0) {
1490 log_error("Failed to make bind mount.");
1495 if (mount(arg_directory, arg_directory, "bind", MS_BIND|MS_REMOUNT|MS_RDONLY|MS_REC, NULL) < 0) {
1496 log_error("Failed to make read-only.");
1500 if (mount_all(arg_directory) < 0)
1503 if (copy_devnodes(arg_directory) < 0)
1506 if (setup_ptmx(arg_directory) < 0)
1509 dev_setup(arg_directory);
1511 if (setup_dev_console(arg_directory, console) < 0)
1514 if (setup_kmsg(arg_directory, kmsg_socket_pair[1]) < 0)
1517 close_nointr_nofail(kmsg_socket_pair[1]);
1518 kmsg_socket_pair[1] = -1;
1520 if (setup_boot_id(arg_directory) < 0)
1523 if (setup_timezone(arg_directory) < 0)
1526 if (setup_resolv_conf(arg_directory) < 0)
1529 if (setup_journal(arg_directory) < 0)
1532 if (mount_binds(arg_directory, arg_bind, 0) < 0)
1535 if (mount_binds(arg_directory, arg_bind_ro, MS_RDONLY) < 0)
1538 if (chdir(arg_directory) < 0) {
1539 log_error("chdir(%s) failed: %m", arg_directory);
1543 if (mount(arg_directory, "/", NULL, MS_MOVE, NULL) < 0) {
1544 log_error("mount(MS_MOVE) failed: %m");
1548 if (chroot(".") < 0) {
1549 log_error("chroot() failed: %m");
1553 if (chdir("/") < 0) {
1554 log_error("chdir() failed: %m");
1562 if (drop_capabilities() < 0) {
1563 log_error("drop_capabilities() failed: %m");
1569 /* Note that this resolves user names
1570 * inside the container, and hence
1571 * accesses the NSS modules from the
1572 * container and not the host. This is
1575 if (get_user_creds((const char**)&arg_user, &uid, &gid, &home, NULL) < 0) {
1576 log_error("get_user_creds() failed: %m");
1580 if (mkdir_parents_label(home, 0775) < 0) {
1581 log_error("mkdir_parents_label() failed: %m");
1585 if (mkdir_safe_label(home, 0775, uid, gid) < 0) {
1586 log_error("mkdir_safe_label() failed: %m");
1590 if (initgroups((const char*)arg_user, gid) < 0) {
1591 log_error("initgroups() failed: %m");
1595 if (setresgid(gid, gid, gid) < 0) {
1596 log_error("setregid() failed: %m");
1600 if (setresuid(uid, uid, uid) < 0) {
1601 log_error("setreuid() failed: %m");
1605 /* Reset everything fully to 0, just in case */
1607 if (setgroups(0, NULL) < 0) {
1608 log_error("setgroups() failed: %m");
1612 if (setresgid(0, 0, 0) < 0) {
1613 log_error("setregid() failed: %m");
1617 if (setresuid(0, 0, 0) < 0) {
1618 log_error("setreuid() failed: %m");
1623 if ((asprintf((char**)(envp + n_env++), "HOME=%s", home ? home: "/root") < 0) ||
1624 (asprintf((char**)(envp + n_env++), "USER=%s", arg_user ? arg_user : "root") < 0) ||
1625 (asprintf((char**)(envp + n_env++), "LOGNAME=%s", arg_user ? arg_user : "root") < 0)) {
1631 if (asprintf((char**)(envp + n_env++), "container_uuid=%s", arg_uuid) < 0) {
1637 if (fdset_size(fds) > 0) {
1638 k = fdset_cloexec(fds, false);
1640 log_error("Failed to unset O_CLOEXEC for file descriptors.");
1644 if ((asprintf((char **)(envp + n_env++), "LISTEN_FDS=%u", n_fd_passed) < 0) ||
1645 (asprintf((char **)(envp + n_env++), "LISTEN_PID=%lu", (unsigned long) 1) < 0)) {
1657 /* Automatically search for the init system */
1659 l = 1 + argc - optind;
1660 a = newa(char*, l + 1);
1661 memcpy(a + 1, argv + optind, l * sizeof(char*));
1663 a[0] = (char*) "/usr/lib/systemd/systemd";
1664 execve(a[0], a, (char**) envp);
1666 a[0] = (char*) "/lib/systemd/systemd";
1667 execve(a[0], a, (char**) envp);
1669 a[0] = (char*) "/sbin/init";
1670 execve(a[0], a, (char**) envp);
1671 } else if (argc > optind)
1672 execvpe(argv[optind], argv + optind, (char**) envp);
1674 chdir(home ? home : "/root");
1675 execle("/bin/bash", "-bash", NULL, (char**) envp);
1678 log_error("execv() failed: %m");
1681 _exit(EXIT_FAILURE);
1684 log_info("Init process in the container running as PID %lu.", (unsigned long) pid);
1685 close_nointr_nofail(pipefd[0]);
1686 close_nointr_nofail(pipefd[1]);
1688 /* Wait for the child process to establish cgroup hierarchy */
1689 close_nointr_nofail(pipefd2[1]);
1690 fd_wait_for_event(pipefd2[0], POLLHUP, -1);
1691 close_nointr_nofail(pipefd2[0]);
1693 save_attributes(newcg, pid, arg_uuid, arg_directory);
1698 if (process_pty(master, pid, &mask) < 0)
1701 if (saved_attr_valid)
1702 tcsetattr(STDIN_FILENO, TCSANOW, &saved_attr);
1704 r = wait_for_terminate(pid, &status);
1710 if (status.si_code == CLD_EXITED) {
1711 if (status.si_status != 0) {
1712 log_error("Container failed with error code %i.", status.si_status);
1713 r = status.si_status;
1717 log_debug("Container exited successfully.");
1719 } else if (status.si_code == CLD_KILLED &&
1720 status.si_status == SIGINT) {
1721 log_info("Container has been shut down.");
1724 } else if (status.si_code == CLD_KILLED &&
1725 status.si_status == SIGHUP) {
1726 log_info("Container is being rebooted.");
1728 } else if (status.si_code == CLD_KILLED ||
1729 status.si_code == CLD_DUMPED) {
1731 log_error("Container terminated by signal %s.", signal_to_string(status.si_status));
1735 log_error("Container failed due to unknown reason.");
1742 if (saved_attr_valid)
1743 tcsetattr(STDIN_FILENO, TCSANOW, &saved_attr);
1745 close_pipe(kmsg_socket_pair);
1748 cg_kill_recursive_and_wait(SYSTEMD_CGROUP_CONTROLLER, newcg, true);
1750 free(arg_directory);
1752 strv_free(arg_controllers);