1 /*-*- Mode: C; c-basic-offset: 8; indent-tabs-mode: nil -*-*/
4 This file is part of systemd.
6 Copyright 2010 Lennart Poettering
8 systemd is free software; you can redistribute it and/or modify it
9 under the terms of the GNU Lesser General Public License as published by
10 the Free Software Foundation; either version 2.1 of the License, or
11 (at your option) any later version.
13 systemd is distributed in the hope that it will be useful, but
14 WITHOUT ANY WARRANTY; without even the implied warranty of
15 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
16 Lesser General Public License for more details.
18 You should have received a copy of the GNU Lesser General Public License
19 along with systemd; If not, see <http://www.gnu.org/licenses/>.
25 #include <attr/xattr.h>
26 #include <sys/types.h>
27 #include <sys/syscall.h>
28 #include <sys/mount.h>
34 #include <sys/prctl.h>
35 #include <sys/capability.h>
38 #include <sys/epoll.h>
40 #include <sys/signalfd.h>
44 #include <sys/socket.h>
46 #include <systemd/sd-daemon.h>
54 #include "cgroup-util.h"
56 #include "path-util.h"
57 #include "loopback-setup.h"
59 #include "dev-setup.h"
/* Journal linking mode. The code in this file uses LINK_NO, LINK_AUTO,
 * LINK_GUEST and LINK_HOST as its values. */
68 typedef enum LinkJournal {
/* Command-line settings: assigned in parse_argv() and read by the setup
 * helpers and main(). */
75 static char *arg_directory = NULL;
76 static char *arg_user = NULL;
77 static char **arg_controllers = NULL;
78 static char *arg_uuid = NULL;
79 static char *arg_machine = NULL;
80 static bool arg_private_network = false;
81 static bool arg_read_only = false;
82 static bool arg_boot = false;
83 static LinkJournal arg_link_journal = LINK_AUTO;
/* Bitmask of capabilities the container keeps: drop_capabilities() hands the
 * complement of this mask to capability_bounding_set_drop(), and the
 * --capability= handling in parse_argv() ORs further bits into it. */
84 static uint64_t arg_retain =
86 (1ULL << CAP_DAC_OVERRIDE) |
87 (1ULL << CAP_DAC_READ_SEARCH) |
88 (1ULL << CAP_FOWNER) |
89 (1ULL << CAP_FSETID) |
90 (1ULL << CAP_IPC_OWNER) |
93 (1ULL << CAP_LINUX_IMMUTABLE) |
94 (1ULL << CAP_NET_BIND_SERVICE) |
95 (1ULL << CAP_NET_BROADCAST) |
96 (1ULL << CAP_NET_RAW) |
97 (1ULL << CAP_SETGID) |
98 (1ULL << CAP_SETFCAP) |
99 (1ULL << CAP_SETPCAP) |
100 (1ULL << CAP_SETUID) |
101 (1ULL << CAP_SYS_ADMIN) |
102 (1ULL << CAP_SYS_CHROOT) |
103 (1ULL << CAP_SYS_NICE) |
104 (1ULL << CAP_SYS_PTRACE) |
105 (1ULL << CAP_SYS_TTY_CONFIG) |
106 (1ULL << CAP_SYS_RESOURCE) |
107 (1ULL << CAP_SYS_BOOT) |
108 (1ULL << CAP_AUDIT_WRITE) |
109 (1ULL << CAP_AUDIT_CONTROL);
/* Bind mount lists for --bind= and --bind-ro=. parse_argv() appends to them
 * two entries at a time and mount_binds() walks them as pairs. */
110 static char **arg_bind = NULL;
111 static char **arg_bind_ro = NULL;
/* Prints the usage summary with printf(); the leading %s is filled with
 * program_invocation_short_name. */
113 static int help(void) {
115 printf("%s [OPTIONS...] [PATH] [ARGUMENTS...]\n\n"
116 "Spawn a minimal namespace container for debugging, testing and building.\n\n"
117 " -h --help Show this help\n"
118 " --version Print version string\n"
119 " -D --directory=NAME Root directory for the container\n"
120 " -b --boot Boot up full system (i.e. invoke init)\n"
121 " -u --user=USER Run the command under specified user or uid\n"
122 " -C --controllers=LIST Put the container in specified comma-separated\n"
123 " cgroup hierarchies\n"
124 " --uuid=UUID Set a specific machine UUID for the container\n"
125 " -M --machine=NAME Set the machine name for the container\n"
126 " --private-network Disable network in container\n"
127 " --read-only Mount the root directory read-only\n"
128 " --capability=CAP In addition to the default, retain specified\n"
130 " --link-journal=MODE Link up guest journal, one of no, auto, guest, host\n"
/* NOTE(review): the assignment in parse_argv() that appears to belong to the
 * -j option stores LINK_GUEST, while the text below says "host" - confirm
 * which of the two is intended. */
131 " -j Equivalent to --link-journal=host\n"
132 " --bind=PATH[:PATH] Bind mount a file or directory from the host into\n"
134 " --bind-ro=PATH[:PATH] Similar, but creates a read-only bind mount\n",
135 program_invocation_short_name);
/* Parses the command line with getopt_long() and stores the results in the
 * arg_* globals. Invalid values are reported through log_error(). */
140 static int parse_argv(int argc, char *argv[]) {
/* Long option table. Options that have no short form are identified by
 * ARG_* codes instead of a character. */
153 static const struct option options[] = {
154 { "help", no_argument, NULL, 'h' },
155 { "version", no_argument, NULL, ARG_VERSION },
156 { "directory", required_argument, NULL, 'D' },
157 { "user", required_argument, NULL, 'u' },
158 { "controllers", required_argument, NULL, 'C' },
159 { "private-network", no_argument, NULL, ARG_PRIVATE_NETWORK },
160 { "boot", no_argument, NULL, 'b' },
161 { "uuid", required_argument, NULL, ARG_UUID },
162 { "read-only", no_argument, NULL, ARG_READ_ONLY },
163 { "capability", required_argument, NULL, ARG_CAPABILITY },
164 { "link-journal", required_argument, NULL, ARG_LINK_JOURNAL },
165 { "bind", required_argument, NULL, ARG_BIND },
166 { "bind-ro", required_argument, NULL, ARG_BIND_RO },
167 { "machine", required_argument, NULL, 'M' },
/* The leading '+' in the short option string asks getopt_long() to stop at
 * the first non-option argument instead of permuting argv. */
176 while ((c = getopt_long(argc, argv, "+hD:u:C:bM:j", options, NULL)) >= 0) {
/* Version output: package string followed by the feature list. */
185 puts(PACKAGE_STRING);
186 puts(SYSTEMD_FEATURES);
/* The root directory argument is stored in canonicalized form. */
191 arg_directory = canonicalize_file_name(optarg);
192 if (!arg_directory) {
193 log_error("Failed to canonicalize root directory.");
201 arg_user = strdup(optarg);
/* A new controller list replaces any earlier one; the argument is split on
 * commas and then passed through cg_shorten_controllers(). */
208 strv_free(arg_controllers);
209 arg_controllers = strv_split(optarg, ",");
210 if (!arg_controllers)
213 cg_shorten_controllers(arg_controllers);
216 case ARG_PRIVATE_NETWORK:
217 arg_private_network = true;
/* The UUID argument must pass id128_is_valid(). */
225 if (!id128_is_valid(optarg)) {
226 log_error("Invalid UUID: %s", optarg);
/* The machine name must pass hostname_is_valid() before it is copied. */
234 if (!hostname_is_valid(optarg)) {
235 log_error("Invalid machine name: %s", optarg);
240 arg_machine = strdup(optarg);
247 arg_read_only = true;
250 case ARG_CAPABILITY: {
/* Each comma-separated word is copied, resolved with cap_from_name() and its
 * bit is added to the retained capability mask. */
254 FOREACH_WORD_SEPARATOR(word, length, optarg, ",", state) {
258 t = strndup(word, length);
262 if (cap_from_name(t, &cap) < 0) {
263 log_error("Failed to parse capability %s.", t);
269 arg_retain |= 1ULL << (uint64_t) cap;
/* NOTE(review): this looks like the handling of the -j short option; help()
 * documents -j as "--link-journal=host" but LINK_GUEST is stored here -
 * confirm which is intended. */
276 arg_link_journal = LINK_GUEST;
/* Map the textual mode onto the LinkJournal value; anything else is an error. */
279 case ARG_LINK_JOURNAL:
280 if (streq(optarg, "auto"))
281 arg_link_journal = LINK_AUTO;
282 else if (streq(optarg, "no"))
283 arg_link_journal = LINK_NO;
284 else if (streq(optarg, "guest"))
285 arg_link_journal = LINK_GUEST;
286 else if (streq(optarg, "host"))
287 arg_link_journal = LINK_HOST;
289 log_error("Failed to parse link journal mode %s", optarg);
/* --bind= and --bind-ro= share this code; only the destination list differs. */
297 _cleanup_free_ char *a = NULL, *b = NULL;
302 x = c == ARG_BIND ? &arg_bind : &arg_bind_ro;
/* The argument is split at the first ':'; a receives the part before it. */
304 e = strchr(optarg, ':');
306 a = strndup(optarg, e - optarg);
/* Both paths of a bind specification have to be absolute. */
316 if (!path_is_absolute(a) || !path_is_absolute(b)) {
317 log_error("Invalid bind mount specification: %s", optarg);
/* The two paths are appended as consecutive entries of the list. */
321 r = strv_extend(x, a);
325 r = strv_extend(x, b);
336 log_error("Unknown option code %c", c);
/* Walks mount_table and mounts each entry at its "where" path below dest.
 * A failing mount() is only reported when the entry is marked fatal. */
344 static int mount_all(const char *dest) {
346 typedef struct MountPoint {
/* The code below reads the fields what, where, options, flags and fatal of
 * each row. The third column is presumably the file system type - confirm
 * against the struct definition. */
355 static const MountPoint mount_table[] = {
356 { "proc", "/proc", "proc", NULL, MS_NOSUID|MS_NOEXEC|MS_NODEV, true },
357 { "/proc/sys", "/proc/sys", NULL, NULL, MS_BIND, true }, /* Bind mount first */
358 { NULL, "/proc/sys", NULL, NULL, MS_BIND|MS_RDONLY|MS_REMOUNT, true }, /* Then, make it r/o */
359 { "sysfs", "/sys", "sysfs", NULL, MS_RDONLY|MS_NOSUID|MS_NOEXEC|MS_NODEV, true },
360 { "tmpfs", "/dev", "tmpfs", "mode=755", MS_NOSUID|MS_STRICTATIME, true },
361 { "devpts", "/dev/pts", "devpts","newinstance,ptmxmode=0666,mode=620,gid=" STRINGIFY(TTY_GID), MS_NOSUID|MS_NOEXEC, true },
362 { "tmpfs", "/dev/shm", "tmpfs", "mode=1777", MS_NOSUID|MS_NODEV|MS_STRICTATIME, true },
363 { "tmpfs", "/run", "tmpfs", "mode=755", MS_NOSUID|MS_NODEV|MS_STRICTATIME, true },
365 { "/sys/fs/selinux", "/sys/fs/selinux", NULL, NULL, MS_BIND, false }, /* Bind mount first */
366 { NULL, "/sys/fs/selinux", NULL, NULL, MS_BIND|MS_RDONLY|MS_REMOUNT, false }, /* Then, make it r/o */
373 for (k = 0; k < ELEMENTSOF(mount_table); k++) {
374 _cleanup_free_ char *where = NULL;
/* Target path inside the container tree: dest + "/" + entry path. */
377 where = strjoin(dest, "/", mount_table[k].where, NULL);
381 t = path_is_mount_point(where, true);
383 log_error("Failed to detect whether %s is a mount point: %s", where, strerror(-t));
391 /* Skip this entry if it is not a remount. */
/* Rows with a NULL "what" are the remount rows of the table, so only rows
 * that have a source are tested against an existing mount point here. */
392 if (mount_table[k].what && t > 0)
/* The result of mkdir_p() is not checked on this line. */
395 mkdir_p(where, 0755);
397 if (mount(mount_table[k].what,
400 mount_table[k].flags,
401 mount_table[k].options) < 0 &&
402 mount_table[k].fatal) {
404 log_error("mount(%s) failed: %m", where);
/* Processes l as pairs (*x, *y): *x is bind mounted onto dest + "/" + *y.
 * When flags is non-zero the new mount is additionally remounted with
 * MS_REMOUNT|MS_BIND|flags (main() passes MS_RDONLY for the --bind-ro list). */
414 static int mount_binds(const char *dest, char **l, unsigned long flags) {
417 STRV_FOREACH_PAIR(x, y, l) {
418 _cleanup_free_ char *where = NULL;
420 where = strjoin(dest, "/", *y, NULL);
/* Create the mount target; the return value is not checked on this line. */
424 mkdir_p_label(where, 0755);
426 if (mount(*x, where, "bind", MS_BIND, NULL) < 0) {
427 log_error("mount(%s) failed: %m", where);
/* Second pass applies the extra flags to the bind mount just created. */
431 if (flags && mount(NULL, where, NULL, MS_REMOUNT|MS_BIND|flags, NULL) < 0) {
432 log_error("mount(%s) failed: %m", where);
/* Points dest/etc/localtime at the same zone the host's /etc/localtime
 * symlink refers to, provided that zone file exists below
 * dest/usr/share/zoneinfo/. */
440 static int setup_timezone(const char *dest) {
441 _cleanup_free_ char *where = NULL, *p = NULL, *q = NULL, *check = NULL, *what = NULL;
447 /* Fix the timezone, if possible */
448 r = readlink_malloc("/etc/localtime", &p);
450 log_warning("/etc/localtime is not a symlink, not updating container timezone.");
/* z is the zone name: the host link target with the zoneinfo prefix removed.
 * Both the relative and the absolute prefix are tried. */
454 z = path_startswith(p, "../usr/share/zoneinfo/");
456 z = path_startswith(p, "/usr/share/zoneinfo/");
458 log_warning("/etc/localtime does not point into /usr/share/zoneinfo/, not updating container timezone.");
462 where = strappend(dest, "/etc/localtime");
/* y is the zone name the container's link currently points to, if any. */
466 r = readlink_malloc(where, &q);
468 y = path_startswith(q, "../usr/share/zoneinfo/");
470 y = path_startswith(q, "/usr/share/zoneinfo/");
473 /* Already pointing to the right place? Then do nothing .. */
474 if (y && streq(y, z))
/* Only link to the zone if its file is present in the container tree. */
478 check = strjoin(dest, "/usr/share/zoneinfo/", z, NULL);
482 if (access(check, F_OK) < 0) {
483 log_warning("Timezone %s does not exist in container, not updating container timezone.", z);
/* The new link target uses the relative "../usr/share/zoneinfo/" form. */
487 what = strappend("../usr/share/zoneinfo/", z);
492 if (symlink(what, where) < 0) {
493 log_error("Failed to correct timezone of container: %m");
/* Makes the host's /etc/resolv.conf visible at dest/etc/resolv.conf through
 * a bind mount that is then remounted read-only. The arg_private_network
 * check at the top guards this work. */
500 static int setup_resolv_conf(const char *dest) {
501 char _cleanup_free_ *where = NULL;
502 _cleanup_close_ int fd = -1;
506 if (arg_private_network)
509 /* Fix resolv.conf, if possible */
510 where = strappend(dest, "/etc/resolv.conf");
/* O_CREAT|O_EXCL: create the file if it is missing so there is something to
 * mount on; fd is closed automatically via _cleanup_close_. */
514 fd = open(where, O_WRONLY|O_CREAT|O_EXCL|O_CLOEXEC|O_NOCTTY|O_NOFOLLOW, 0644);
516 /* We don't really care for the results of this really. If it
517 * fails, it fails, but meh... */
518 if (mount("/etc/resolv.conf", where, "bind", MS_BIND, NULL) < 0)
519 log_warning("Failed to bind mount /etc/resolv.conf: %m");
/* Unlike the bind mount above, a failing read-only remount is logged at
 * error level. */
521 if (mount("/etc/resolv.conf", where, "bind",
522 MS_BIND|MS_REMOUNT|MS_RDONLY, NULL) < 0) {
523 log_error("Failed to remount /etc/resolv.conf readonly: %m");
/* Writes a freshly randomized ID, formatted as a UUID string, to a file below
 * dest/dev and bind mounts that file over
 * dest/proc/sys/kernel/random/boot_id. */
530 static int setup_boot_id(const char *dest) {
531 _cleanup_free_ char *from = NULL, *to = NULL;
538 /* Generate a new randomized boot ID, so that each boot-up of
539 * the container gets a new one */
/* from: file holding the generated ID; to: path it is mounted over. */
541 from = strappend(dest, "/dev/proc-sys-kernel-random-boot-id");
542 to = strappend(dest, "/proc/sys/kernel/random/boot_id");
546 r = sd_id128_randomize(&rnd);
548 log_error("Failed to generate random boot id: %s", strerror(-r));
/* Format the 128 bit value in 8-4-4-4-12 hex digit groups. */
552 snprintf(as_uuid, sizeof(as_uuid),
553 "%02x%02x%02x%02x-%02x%02x-%02x%02x-%02x%02x-%02x%02x%02x%02x%02x%02x",
554 SD_ID128_FORMAT_VAL(rnd));
555 char_array_0(as_uuid);
557 r = write_string_file(from, as_uuid);
559 log_error("Failed to write boot id: %s", strerror(-r));
563 if (mount(from, to, "bind", MS_BIND, NULL) < 0) {
564 log_error("Failed to bind mount boot id: %m");
/* A failing read-only remount only produces a warning. This call is tested
 * for non-zero rather than "< 0" like the other mount() calls in this file. */
566 } else if (mount(from, to, "bind", MS_BIND|MS_REMOUNT|MS_RDONLY, NULL))
567 log_warning("Failed to make boot id read-only: %m");
/* For every name in the devnodes list, recreates the host's /dev/<name>
 * inside dest/dev with mknod(), reusing the host node's mode and device
 * number. */
573 static int copy_devnodes(const char *dest) {
575 static const char devnodes[] =
585 _cleanup_umask_ mode_t u;
591 NULSTR_FOREACH(d, devnodes) {
593 _cleanup_free_ char *from = NULL, *to = NULL;
/* NOTE(review): the asprintf() return values are not inspected on these two
 * lines - confirm that from/to are validated before use. */
595 asprintf(&from, "/dev/%s", d);
596 asprintf(&to, "%s/dev/%s", dest, d);
607 if (stat(from, &st) < 0) {
/* ENOENT is treated differently from other stat() failures: only the other
 * errors are logged. */
609 if (errno != ENOENT) {
610 log_error("Failed to stat %s: %m", from);
/* Only character and block devices are copied. */
615 } else if (!S_ISCHR(st.st_mode) && !S_ISBLK(st.st_mode)) {
617 log_error("%s is not a char or block device, cannot copy", from);
621 } else if (mknod(to, st.st_mode, st.st_rdev) < 0) {
/* NOTE(review): the node being created is "to", but the message prints
 * "dest" - probably meant to be "to". */
623 log_error("mknod(%s) failed: %m", dest);
/* Creates dest/dev/ptmx as a symlink with the relative target "pts/ptmx". */
632 static int setup_ptmx(const char *dest) {
633 _cleanup_free_ char *p = NULL;
635 p = strappend(dest, "/dev/ptmx");
639 if (symlink("pts/ptmx", p) < 0) {
640 log_error("Failed to create /dev/ptmx symlink: %m");
/* Makes the terminal named by console available as dest/dev/console: the
 * terminal must be a character device, gets mode 0600 and owner 0:0, a device
 * node is created at dest/dev/console and the terminal is bind mounted on
 * top of it. */
647 static int setup_dev_console(const char *dest, const char *console) {
649 _cleanup_free_ char *to = NULL;
651 _cleanup_umask_ mode_t u;
658 if (stat(console, &st) < 0) {
659 log_error("Failed to stat %s: %m", console);
/* NOTE(review): the path checked is "console", while the message names
 * /dev/console. */
662 } else if (!S_ISCHR(st.st_mode)) {
663 log_error("/dev/console is not a char device");
667 r = chmod_and_chown(console, 0600, 0, 0);
669 log_error("Failed to correct access mode for TTY: %s", strerror(-r));
/* The mknod() below keeps the file type bits of the terminal, replaces the
 * permission bits with 0600 and reuses its device number. */
673 if (asprintf(&to, "%s/dev/console", dest) < 0)
676 /* We need to bind mount the right tty to /dev/console since
677 * ptys can only exist on pts file systems. To have something
678 * to bind mount things on we create a device node first, that
679 * has the right major/minor (note that the major minor
680 * doesn't actually matter here, since we mount it over
683 if (mknod(to, (st.st_mode & ~07777) | 0600, st.st_rdev) < 0) {
684 log_error("mknod() for /dev/console failed: %m");
688 if (mount(console, to, "bind", MS_BIND, NULL) < 0) {
689 log_error("Bind mount for /dev/console failed: %m");
// Creates a FIFO at dest/dev/kmsg, bind mounts it onto dest/proc/kmsg and
// sends an open descriptor of the FIFO over kmsg_socket as SCM_RIGHTS
// ancillary data.
696 static int setup_kmsg(const char *dest, int kmsg_socket) {
697 _cleanup_free_ char *from = NULL, *to = NULL;
699 _cleanup_umask_ mode_t u;
// Control buffer sized with CMSG_SPACE() for exactly one int (the fd).
701 struct cmsghdr cmsghdr;
702 uint8_t buf[CMSG_SPACE(sizeof(int))];
705 .msg_control = &control,
706 .msg_controllen = sizeof(control),
708 struct cmsghdr *cmsg;
711 assert(kmsg_socket >= 0);
715 /* We create the kmsg FIFO as /dev/kmsg, but immediately
716 * delete it after bind mounting it to /proc/kmsg. While FIFOs
717 * on the reading side behave very similar to /proc/kmsg,
718 * their writing side behaves differently from /dev/kmsg in
719 * that writing blocks when nothing is reading. In order to
720 * avoid any problems with containers deadlocking due to this
721 * we simply make /dev/kmsg unavailable to the container. */
722 if (asprintf(&from, "%s/dev/kmsg", dest) < 0 ||
723 asprintf(&to, "%s/proc/kmsg", dest) < 0)
726 if (mkfifo(from, 0600) < 0) {
727 log_error("mkfifo() for /dev/kmsg failed: %m");
731 r = chmod_and_chown(from, 0600, 0, 0);
733 log_error("Failed to correct access mode for /dev/kmsg: %s", strerror(-r));
737 if (mount(from, to, "bind", MS_BIND, NULL) < 0) {
738 log_error("Bind mount for /proc/kmsg failed: %m");
/* The FIFO is opened for reading and writing, non-blocking. */
742 fd = open(from, O_RDWR|O_NDELAY|O_CLOEXEC);
744 log_error("Failed to open fifo: %m");
/* Build a single SCM_RIGHTS control message carrying fd. */
748 cmsg = CMSG_FIRSTHDR(&mh);
749 cmsg->cmsg_level = SOL_SOCKET;
750 cmsg->cmsg_type = SCM_RIGHTS;
751 cmsg->cmsg_len = CMSG_LEN(sizeof(int));
752 memcpy(CMSG_DATA(cmsg), &fd, sizeof(int));
/* Shrink the control length to the length of the message actually used. */
754 mh.msg_controllen = cmsg->cmsg_len;
756 /* Store away the fd in the socket, so that it stays open as
757 * long as we run the child */
758 k = sendmsg(kmsg_socket, &mh, MSG_DONTWAIT|MSG_NOSIGNAL);
/* The local copy of the descriptor is closed right after the send attempt. */
759 close_nointr_nofail(fd);
762 log_error("Failed to send FIFO fd: %m");
766 /* And now make the FIFO unavailable as /dev/kmsg... */
/* Sets the host name to arg_machine via sethostname(). main() calls this in
 * the cloned child, which is created with CLONE_NEWUTS. */
771 static int setup_hostname(void) {
773 if (sethostname(arg_machine, strlen(arg_machine)) < 0)
/* Connects the journal directory of the container with the one of the host
 * according to arg_link_journal. Both directories are named after the machine
 * ID read from directory/etc/machine-id:
 *   p = /var/log/journal/<id>              (host side)
 *   q = directory/var/log/journal/<id>     (container side)
 * In LINK_GUEST mode p is made a symlink to q; the code at the end bind
 * mounts p onto q. */
779 static int setup_journal(const char *directory) {
780 sd_id128_t machine_id;
781 _cleanup_free_ char *p = NULL, *b = NULL, *q = NULL, *d = NULL;
785 if (arg_link_journal == LINK_NO)
788 p = strappend(directory, "/etc/machine-id");
/* In LINK_AUTO mode a missing machine-id file is handled separately from
 * other read errors, which are logged. */
792 r = read_one_line_file(p, &b);
793 if (r == -ENOENT && arg_link_journal == LINK_AUTO)
796 log_error("Failed to read machine ID from %s: %s", p, strerror(-r));
/* NOTE(review): "id" is presumably derived from the line read into b -
 * confirm where it is assigned. */
801 if (isempty(id) && arg_link_journal == LINK_AUTO)
804 /* Verify validity */
805 r = sd_id128_from_string(id, &machine_id);
807 log_error("Failed to parse machine ID from %s: %s", p, strerror(-r));
/* From here on p no longer names the machine-id file but the host journal
 * directory. */
812 p = strappend("/var/log/journal/", id);
813 q = strjoin(directory, "/var/log/journal/", id, NULL);
/* Existing mount points on either side are only reported as an error when
 * the mode is not LINK_AUTO. */
817 if (path_is_mount_point(p, false) > 0) {
818 if (arg_link_journal != LINK_AUTO) {
819 log_error("%s: already a mount point, refusing to use for journal", p);
826 if (path_is_mount_point(q, false) > 0) {
827 if (arg_link_journal != LINK_AUTO) {
828 log_error("%s: already a mount point, refusing to use for journal", q);
/* Inspect what currently exists at p: the branches below distinguish a
 * symlink, r == -EINVAL and results other than -ENOENT. */
835 r = readlink_and_make_absolute(p, &d);
837 if ((arg_link_journal == LINK_GUEST ||
838 arg_link_journal == LINK_AUTO) &&
/* NOTE(review): mkdir_p() hands its result back in r, yet the warning prints
 * errno through %m - confirm errno is meaningful here. */
841 r = mkdir_p(q, 0755);
843 log_warning("failed to create directory %s: %m", q);
848 log_error("Failed to remove symlink %s: %m", p);
851 } else if (r == -EINVAL) {
853 if (arg_link_journal == LINK_GUEST &&
856 if (errno == ENOTDIR) {
857 log_error("%s already exists and is neither a symlink nor a directory", p);
860 log_error("Failed to remove %s: %m", p);
864 } else if (r != -ENOENT) {
/* NOTE(review): same pattern as above - the failure is carried in r while
 * the message uses %m. */
865 log_error("readlink(%s) failed: %m", p);
/* LINK_GUEST: the host path becomes a symlink to the container's directory. */
869 if (arg_link_journal == LINK_GUEST) {
871 if (symlink(q, p) < 0) {
872 log_error("Failed to symlink %s to %s: %m", q, p);
876 r = mkdir_p(q, 0755);
878 log_warning("failed to create directory %s: %m", q);
/* LINK_HOST: the host directory is created on demand; for the remaining
 * modes its existence is merely probed with access(). */
882 if (arg_link_journal == LINK_HOST) {
883 r = mkdir_p(p, 0755);
885 log_error("Failed to create %s: %m", p);
889 } else if (access(p, F_OK) < 0)
/* The container side directory must not already contain entries. */
892 if (dir_is_empty(q) == 0) {
893 log_error("%s not empty.", q);
897 r = mkdir_p(q, 0755);
899 log_error("Failed to create %s: %m", q);
/* Finally expose the host journal directory inside the container tree. */
903 if (mount(p, q, "bind", MS_BIND, NULL) < 0) {
904 log_error("Failed to bind mount journal from host into guest: %m");
/* Calls cg_create_and_attach() for path in the systemd controller hierarchy
 * and in every controller of arg_controllers. A failure in the systemd
 * hierarchy is logged as an error, one in an additional controller only as a
 * warning. */
911 static int setup_cgroup(const char *path) {
915 r = cg_create_and_attach(SYSTEMD_CGROUP_CONTROLLER, path, 1);
917 log_error("Failed to create cgroup: %s", strerror(-r));
921 STRV_FOREACH(c, arg_controllers) {
922 r = cg_create_and_attach(*c, path, 1);
924 log_warning("Failed to create cgroup in controller %s: %s", *c, strerror(-r));
/* Records container metadata as extended attributes on the path that
 * cg_get_path() returns for cgroup: trusted.init_pid (pid as decimal text),
 * trusted.machine_id (uuid) and trusted.root_directory (directory).
 * XATTR_CREATE is used throughout, so an attribute that already exists is
 * not overwritten; failures are logged as warnings. */
930 static int save_attributes(const char *cgroup, pid_t pid, const char *uuid, const char *directory) {
931 _cleanup_free_ char *path = NULL;
932 char buf[DECIMAL_STR_MAX(pid_t)];
/* NOTE(review): this asserts the global arg_directory although the value
 * written below is the "directory" parameter - confirm which one was meant. */
937 assert(arg_directory);
/* The pid is printed as unsigned long; the assert_se() catches truncation. */
940 assert_se(snprintf(buf, sizeof(buf), "%lu", (unsigned long) pid) < (int) sizeof(buf));
942 r = cg_get_path(SYSTEMD_CGROUP_CONTROLLER, cgroup, NULL, &path);
944 log_error("Failed to get path: %s", strerror(-r));
948 r = setxattr(path, "trusted.init_pid", buf, strlen(buf), XATTR_CREATE);
950 log_warning("Failed to set %s attribute on %s: %m", "trusted.init_pid", path);
953 k = setxattr(path, "trusted.machine_id", uuid, strlen(uuid), XATTR_CREATE);
955 log_warning("Failed to set %s attribute on %s: %m", "trusted.machine_id", path);
961 k = setxattr(path, "trusted.root_directory", directory, strlen(directory), XATTR_CREATE);
963 log_warning("Failed to set %s attribute on %s: %m", "trusted.root_directory", path);
/* Passes the complement of arg_retain to capability_bounding_set_drop(),
 * i.e. every capability that is not in the retained mask, and returns that
 * function's result. */
971 static int drop_capabilities(void) {
972 return capability_bounding_set_drop(~arg_retain, false);
/* Relays data between our own stdin/stdout and the pty master and reacts to
 * the signals in mask, which are read through a signalfd.
 *
 *   master  pty master file descriptor
 *   pid     process that receives SIGRTMIN+3 when a shutdown is requested
 *   mask    signal set handed to signalfd()
 *
 * Data read from stdin is staged in in_buffer and written to master; data
 * read from master is staged in out_buffer and written to stdout. */
975 static int process_pty(int master, pid_t pid, sigset_t *mask) {
977 char in_buffer[LINE_MAX], out_buffer[LINE_MAX];
978 size_t in_buffer_full = 0, out_buffer_full = 0;
979 struct epoll_event stdin_ev, stdout_ev, master_ev, signal_ev;
/* The fds are registered edge-triggered (EPOLLET), so readiness is remembered
 * in these flags: set when an event arrives, cleared when a read()/write()
 * fails with one of the errno values tested in the transfer loop. */
980 bool stdin_readable = false, stdout_writable = false, master_readable = false, master_writable = false;
981 int ep = -1, signal_fd = -1, r;
982 bool tried_orderly_shutdown = false;
/* All three data fds are switched to non-blocking mode. */
988 fd_nonblock(STDIN_FILENO, 1);
989 fd_nonblock(STDOUT_FILENO, 1);
990 fd_nonblock(master, 1);
992 signal_fd = signalfd(-1, mask, SFD_NONBLOCK|SFD_CLOEXEC);
994 log_error("signalfd(): %m");
999 ep = epoll_create1(EPOLL_CLOEXEC);
1001 log_error("Failed to create epoll: %m");
1006 /* We read from STDIN only if this is actually a TTY,
1007 * otherwise we assume non-interactivity. */
1008 if (isatty(STDIN_FILENO)) {
1010 stdin_ev.events = EPOLLIN|EPOLLET;
1011 stdin_ev.data.fd = STDIN_FILENO;
1013 if (epoll_ctl(ep, EPOLL_CTL_ADD, STDIN_FILENO, &stdin_ev) < 0) {
1014 log_error("Failed to register STDIN in epoll: %m");
1021 stdout_ev.events = EPOLLOUT|EPOLLET;
1022 stdout_ev.data.fd = STDOUT_FILENO;
1025 master_ev.events = EPOLLIN|EPOLLOUT|EPOLLET;
1026 master_ev.data.fd = master;
/* The signalfd is the only descriptor registered without EPOLLET. */
1029 signal_ev.events = EPOLLIN;
1030 signal_ev.data.fd = signal_fd;
/* EPERM from EPOLL_CTL_ADD is tolerated for stdout; every other error is
 * logged. */
1032 if (epoll_ctl(ep, EPOLL_CTL_ADD, STDOUT_FILENO, &stdout_ev) < 0) {
1033 if (errno != EPERM) {
1034 log_error("Failed to register stdout in epoll: %m");
1038 /* stdout without epoll support. Likely redirected to regular file. */
1039 stdout_writable = true;
1042 if (epoll_ctl(ep, EPOLL_CTL_ADD, master, &master_ev) < 0 ||
1043 epoll_ctl(ep, EPOLL_CTL_ADD, signal_fd, &signal_ev) < 0) {
1044 log_error("Failed to register fds in epoll: %m");
/* Main loop: wait for events, update the readiness flags, then move as much
 * data as possible. */
1050 struct epoll_event ev[16];
1054 nfds = epoll_wait(ep, ev, ELEMENTSOF(ev), -1);
/* EINTR and EAGAIN from epoll_wait() are not reported as errors. */
1057 if (errno == EINTR || errno == EAGAIN)
1060 log_error("epoll_wait(): %m");
/* Translate each event into the matching flag; EPOLLHUP also sets the flag
 * so that the following read()/write() observes the hang-up. */
1067 for (i = 0; i < nfds; i++) {
1068 if (ev[i].data.fd == STDIN_FILENO) {
1070 if (ev[i].events & (EPOLLIN|EPOLLHUP))
1071 stdin_readable = true;
1073 } else if (ev[i].data.fd == STDOUT_FILENO) {
1075 if (ev[i].events & (EPOLLOUT|EPOLLHUP))
1076 stdout_writable = true;
1078 } else if (ev[i].data.fd == master) {
1080 if (ev[i].events & (EPOLLIN|EPOLLHUP))
1081 master_readable = true;
1083 if (ev[i].events & (EPOLLOUT|EPOLLHUP))
1084 master_writable = true;
1086 } else if (ev[i].data.fd == signal_fd) {
1087 struct signalfd_siginfo sfsi;
/* A signalfd read must return exactly one signalfd_siginfo record. */
1090 n = read(signal_fd, &sfsi, sizeof(sfsi));
1091 if (n != sizeof(sfsi)) {
1094 log_error("Failed to read from signalfd: invalid block size");
1099 if (errno != EINTR && errno != EAGAIN) {
1100 log_error("Failed to read from signalfd: %m");
1106 if (sfsi.ssi_signo == SIGWINCH) {
1109 /* The window size changed, let's forward that. */
1110 if (ioctl(STDIN_FILENO, TIOCGWINSZ, &ws) >= 0)
1111 ioctl(master, TIOCSWINSZ, &ws);
/* Only the first SIGTERM in --boot mode is turned into a shutdown request
 * (SIGRTMIN+3 sent to pid); tried_orderly_shutdown makes this branch
 * one-shot. */
1112 } else if (sfsi.ssi_signo == SIGTERM && arg_boot && !tried_orderly_shutdown) {
1114 log_info("Trying to halt container. Send SIGTERM again to trigger immediate termination.");
1116 /* This only works for systemd... */
1117 tried_orderly_shutdown = true;
1118 kill(pid, SIGRTMIN+3);
/* Transfer loop: keep going while any of the four steps can make progress -
 * fill an empty buffer from a readable source or drain a non-empty buffer
 * into a writable sink. */
1128 while ((stdin_readable && in_buffer_full <= 0) ||
1129 (master_writable && in_buffer_full > 0) ||
1130 (master_readable && out_buffer_full <= 0) ||
1131 (stdout_writable && out_buffer_full > 0)) {
/* Step 1: stdin -> in_buffer (append behind the bytes already staged). */
1133 if (stdin_readable && in_buffer_full < LINE_MAX) {
1135 k = read(STDIN_FILENO, in_buffer + in_buffer_full, LINE_MAX - in_buffer_full);
1138 if (errno == EAGAIN || errno == EPIPE || errno == ECONNRESET || errno == EIO)
1139 stdin_readable = false;
1141 log_error("read(): %m");
1146 in_buffer_full += (size_t) k;
/* Step 2: in_buffer -> master. */
1149 if (master_writable && in_buffer_full > 0) {
1151 k = write(master, in_buffer, in_buffer_full);
1154 if (errno == EAGAIN || errno == EPIPE || errno == ECONNRESET || errno == EIO)
1155 master_writable = false;
1157 log_error("write(): %m");
/* After a partial write the unwritten tail is moved to the buffer start. */
1163 assert(in_buffer_full >= (size_t) k);
1164 memmove(in_buffer, in_buffer + k, in_buffer_full - k);
1165 in_buffer_full -= k;
/* Step 3: master -> out_buffer. */
1169 if (master_readable && out_buffer_full < LINE_MAX) {
1171 k = read(master, out_buffer + out_buffer_full, LINE_MAX - out_buffer_full);
1174 if (errno == EAGAIN || errno == EPIPE || errno == ECONNRESET || errno == EIO)
1175 master_readable = false;
1177 log_error("read(): %m");
1182 out_buffer_full += (size_t) k;
/* Step 4: out_buffer -> stdout. */
1185 if (stdout_writable && out_buffer_full > 0) {
1187 k = write(STDOUT_FILENO, out_buffer, out_buffer_full);
1190 if (errno == EAGAIN || errno == EPIPE || errno == ECONNRESET || errno == EIO)
1191 stdout_writable = false;
1193 log_error("write(): %m");
1199 assert(out_buffer_full >= (size_t) k);
1200 memmove(out_buffer, out_buffer + k, out_buffer_full - k);
1201 out_buffer_full -= k;
/* Cleanup: release the epoll instance and the signalfd. */
1209 close_nointr_nofail(ep);
1212 close_nointr_nofail(signal_fd);
/* Entry point. Validates the environment, allocates a pty, clones a child
 * into new namespaces, lets the child build the container file system and
 * exec the payload, and relays the pty in the parent until the child is
 * reaped. r starts out as EXIT_FAILURE. */
1217 int main(int argc, char *argv[]) {
1219 int r = EXIT_FAILURE, k;
1220 _cleanup_free_ char *newcg = NULL;
1221 _cleanup_close_ int master = -1;
1223 const char *console = NULL;
1224 struct termios saved_attr, raw_attr;
1226 bool saved_attr_valid = false;
1228 int kmsg_socket_pair[2] = { -1, -1 };
1231 log_parse_environment();
1234 k = parse_argv(argc, argv);
/* Normalize the container directory: a path given on the command line is
 * made absolute, and get_current_dir_name() supplies the directory
 * otherwise. */
1242 if (arg_directory) {
1245 p = path_make_absolute_cwd(arg_directory);
1246 free(arg_directory);
1249 arg_directory = get_current_dir_name();
1251 if (!arg_directory) {
1252 log_error("Failed to determine path, please use -D.");
1256 path_kill_slashes(arg_directory);
/* A machine name can be derived from the last component of the directory;
 * it is cleaned with hostname_cleanup() and must not end up empty. */
1259 arg_machine = strdup(path_get_file_name(arg_directory));
1265 hostname_cleanup(arg_machine);
1266 if (isempty(arg_machine)) {
1267 log_error("Failed to determine machine name automatically, please use -M.");
/* Preconditions: effective uid 0, a systemd-booted host, a directory that is
 * not "/" and that passes path_is_os_tree(). */
1272 if (geteuid() != 0) {
1273 log_error("Need to be root.");
1277 if (sd_booted() <= 0) {
1278 log_error("Not running on a systemd system.");
1282 if (path_equal(arg_directory, "/")) {
1283 log_error("Spawning container on root directory not supported.");
1287 if (path_is_os_tree(arg_directory) <= 0) {
1288 log_error("Directory %s doesn't look like an OS root directory. Refusing.", arg_directory);
/* File descriptors received via sd_listen_fds() are collected in fds;
 * fdset_close_others() is then applied to that set. */
1293 n_fd_passed = sd_listen_fds(false);
1294 if (n_fd_passed > 0) {
1295 k = fdset_new_listen_fds(&fds, false);
1297 log_error("Failed to collect file descriptors: %s", strerror(-k));
1301 fdset_close_others(fds);
/* Determine the cgroup for this machine. A result <= 0 other than -ENOENT
 * from the emptiness check is reported as "already running". */
1304 k = cg_get_machine_path(arg_machine, &newcg);
1306 log_error("Failed to determine machine cgroup path: %s", strerror(-k));
1310 k = cg_is_empty_recursive(SYSTEMD_CGROUP_CONTROLLER, newcg, true);
1311 if (k <= 0 && k != -ENOENT) {
1312 log_error("Container already running.");
/* Allocate the pty whose slave side becomes the container console. */
1320 master = posix_openpt(O_RDWR|O_NOCTTY|O_CLOEXEC|O_NDELAY);
1322 log_error("Failed to acquire pseudo tty: %m");
1326 console = ptsname(master);
1328 log_error("Failed to determine tty name: %m");
1332 log_info("Spawning namespace container on %s (console is %s).", arg_directory, console);
/* Copy our terminal's window size to the pty when it can be queried. */
1334 if (ioctl(STDIN_FILENO, TIOCGWINSZ, &ws) >= 0)
1335 ioctl(master, TIOCSWINSZ, &ws);
1337 if (unlockpt(master) < 0) {
1338 log_error("Failed to unlock tty: %m");
/* Remember the current terminal settings and prepare a raw variant with
 * ECHO cleared; saved_attr is restored later if it could be read. */
1342 if (tcgetattr(STDIN_FILENO, &saved_attr) >= 0) {
1343 saved_attr_valid = true;
1345 raw_attr = saved_attr;
1346 cfmakeraw(&raw_attr);
1347 raw_attr.c_lflag &= ~ECHO;
/* Socket pair used by setup_kmsg() to park the kmsg FIFO descriptor.
 * This message does not include %m. */
1350 if (socketpair(AF_UNIX, SOCK_DGRAM|SOCK_NONBLOCK|SOCK_CLOEXEC, 0, kmsg_socket_pair) < 0) {
1351 log_error("Failed to create kmsg socket pair.");
1355 sd_notify(0, "READY=1");
/* Block the signals that process_pty() consumes through its signalfd. */
1357 assert_se(sigemptyset(&mask) == 0);
1358 sigset_add_many(&mask, SIGCHLD, SIGWINCH, SIGTERM, SIGINT, -1);
1359 assert_se(sigprocmask(SIG_BLOCK, &mask, NULL) == 0);
/* Two pipes synchronize parent and child: on pipefd the child waits for the
 * parent, on pipefd2 the parent waits for the child. */
1363 int pipefd[2], pipefd2[2];
1365 if (pipe2(pipefd, O_NONBLOCK|O_CLOEXEC) < 0) {
1366 log_error("pipe2(): %m");
1370 if (pipe2(pipefd2, O_NONBLOCK|O_CLOEXEC) < 0) {
1371 log_error("pipe2(): %m");
/* Raw clone() with new IPC, mount, PID and UTS namespaces, plus a network
 * namespace when --private-network was given. */
1376 pid = syscall(__NR_clone, SIGCHLD|CLONE_NEWIPC|CLONE_NEWNS|CLONE_NEWPID|CLONE_NEWUTS|(arg_private_network ? CLONE_NEWNET : 0), NULL);
1378 if (errno == EINVAL)
1379 log_error("clone() failed, do you have namespace support enabled in your kernel? (You need UTS, IPC, PID and NET namespacing built in): %m");
1381 log_error("clone() failed: %m");
/* ---- child side ---- */
1388 const char *home = NULL;
1389 uid_t uid = (uid_t) -1;
1390 gid_t gid = (gid_t) -1;
/* Environment for the payload; the NULL slots are filled in further down
 * through envp + n_env++. */
1392 const char *envp[] = {
1393 "PATH=/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin",
1394 "container=systemd-nspawn", /* LXC sets container=lxc, so follow the scheme here */
1399 NULL, /* container_uuid */
1400 NULL, /* LISTEN_FDS */
1401 NULL, /* LISTEN_PID */
/* TERM is taken over from our own environment. */
1405 envp[n_env] = strv_find_prefix(environ, "TERM=");
1409 /* Wait for the parent process to log our PID */
1410 close_nointr_nofail(pipefd[1]);
1411 fd_wait_for_event(pipefd[0], POLLHUP, -1);
1412 close_nointr_nofail(pipefd[0]);
1414 close_nointr_nofail(master);
/* Put the outer terminal into the raw mode prepared above. */
1417 if (saved_attr_valid) {
1418 if (tcsetattr(STDIN_FILENO, TCSANOW, &raw_attr) < 0) {
1419 log_error("Failed to set terminal attributes: %m");
/* Drop the inherited stdio; it is replaced by the pty slave below. */
1424 close_nointr(STDIN_FILENO);
1425 close_nointr(STDOUT_FILENO);
1426 close_nointr(STDERR_FILENO);
1428 close_nointr_nofail(kmsg_socket_pair[0]);
1429 kmsg_socket_pair[0] = -1;
/* Signal dispositions and mask are reset for the payload. */
1431 reset_all_signal_handlers();
1433 assert_se(sigemptyset(&mask) == 0);
1434 assert_se(sigprocmask(SIG_SETMASK, &mask, NULL) == 0);
/* With fds 0-2 closed, opening the console is expected to yield fd 0; it is
 * then duplicated onto stdout and stderr. */
1436 k = open_terminal(console, O_RDWR);
1437 if (k != STDIN_FILENO) {
1439 close_nointr_nofail(k);
1443 log_error("Failed to open console: %s", strerror(-k));
1447 if (dup2(STDIN_FILENO, STDOUT_FILENO) != STDOUT_FILENO ||
1448 dup2(STDIN_FILENO, STDERR_FILENO) != STDERR_FILENO) {
1449 log_error("Failed to duplicate console: %m");
1454 log_error("setsid() failed: %m");
/* Have the kernel send SIGKILL to the child when the parent dies. */
1458 if (prctl(PR_SET_PDEATHSIG, SIGKILL) < 0) {
1459 log_error("PR_SET_PDEATHSIG failed: %m");
/* Join the machine cgroup, then close pipefd2, which the parent is waiting
 * on. */
1463 if (setup_cgroup(newcg) < 0)
1466 close_pipe(pipefd2);
1468 /* Mark everything as slave, so that we still
1469 * receive mounts from the real root, but don't
1470 * propagate mounts to the real root. */
1471 if (mount(NULL, "/", NULL, MS_SLAVE|MS_REC, NULL) < 0) {
1472 log_error("MS_SLAVE|MS_REC failed: %m");
1476 /* Turn directory into bind mount */
/* The next two error messages do not include %m. */
1477 if (mount(arg_directory, arg_directory, "bind", MS_BIND|MS_REC, NULL) < 0) {
1478 log_error("Failed to make bind mount.");
1483 if (mount(arg_directory, arg_directory, "bind", MS_BIND|MS_REMOUNT|MS_RDONLY|MS_REC, NULL) < 0) {
1484 log_error("Failed to make read-only.");
/* Populate the container tree. dev_setup() is the only step in this
 * sequence whose result is not checked. */
1488 if (mount_all(arg_directory) < 0)
1491 if (copy_devnodes(arg_directory) < 0)
1494 if (setup_ptmx(arg_directory) < 0)
1497 dev_setup(arg_directory);
1499 if (setup_dev_console(arg_directory, console) < 0)
1502 if (setup_kmsg(arg_directory, kmsg_socket_pair[1]) < 0)
1505 close_nointr_nofail(kmsg_socket_pair[1]);
1506 kmsg_socket_pair[1] = -1;
1508 if (setup_boot_id(arg_directory) < 0)
1511 if (setup_timezone(arg_directory) < 0)
1514 if (setup_resolv_conf(arg_directory) < 0)
1517 if (setup_journal(arg_directory) < 0)
/* User supplied bind mounts: read-write first, then read-only. */
1520 if (mount_binds(arg_directory, arg_bind, 0) < 0)
1523 if (mount_binds(arg_directory, arg_bind_ro, MS_RDONLY) < 0)
/* Switch the root: enter the directory, move its mount onto "/", chroot
 * into it and reset the working directory. */
1526 if (chdir(arg_directory) < 0) {
1527 log_error("chdir(%s) failed: %m", arg_directory);
1531 if (mount(arg_directory, "/", NULL, MS_MOVE, NULL) < 0) {
1532 log_error("mount(MS_MOVE) failed: %m");
1536 if (chroot(".") < 0) {
1537 log_error("chroot() failed: %m");
1541 if (chdir("/") < 0) {
1542 log_error("chdir() failed: %m");
/* NOTE(review): drop_capabilities() forwards the return value of
 * capability_bounding_set_drop(); whether errno is set for %m is not visible
 * here - confirm. */
1550 if (drop_capabilities() < 0) {
1551 log_error("drop_capabilities() failed: %m");
1557 /* Note that this resolves user names
1558 * inside the container, and hence
1559 * accesses the NSS modules from the
1560 * container and not the host. This is
1563 if (get_user_creds((const char**)&arg_user, &uid, &gid, &home, NULL) < 0) {
1564 log_error("get_user_creds() failed: %m");
1568 if (mkdir_parents_label(home, 0775) < 0) {
1569 log_error("mkdir_parents_label() failed: %m");
1573 if (mkdir_safe_label(home, 0775, uid, gid) < 0) {
1574 log_error("mkdir_safe_label() failed: %m");
1578 if (initgroups((const char*)arg_user, gid) < 0) {
1579 log_error("initgroups() failed: %m");
1583 if (setresgid(gid, gid, gid) < 0) {
1584 log_error("setregid() failed: %m");
1588 if (setresuid(uid, uid, uid) < 0) {
1589 log_error("setreuid() failed: %m");
1593 /* Reset everything fully to 0, just in case */
1595 if (setgroups(0, NULL) < 0) {
1596 log_error("setgroups() failed: %m");
/* NOTE(review): the calls here and in the user-switching code above are
 * setresgid()/setresuid(), but the messages name setregid()/setreuid(). */
1600 if (setresgid(0, 0, 0) < 0) {
1601 log_error("setregid() failed: %m");
1605 if (setresuid(0, 0, 0) < 0) {
1606 log_error("setreuid() failed: %m");
/* Fill the free envp slots; defaults are "/root" and "root" when no home
 * directory or user was determined. */
1611 if ((asprintf((char**)(envp + n_env++), "HOME=%s", home ? home: "/root") < 0) ||
1612 (asprintf((char**)(envp + n_env++), "USER=%s", arg_user ? arg_user : "root") < 0) ||
1613 (asprintf((char**)(envp + n_env++), "LOGNAME=%s", arg_user ? arg_user : "root") < 0)) {
1619 if (asprintf((char**)(envp + n_env++), "container_uuid=%s", arg_uuid) < 0) {
/* Passed-in descriptors lose O_CLOEXEC so they survive the exec, and are
 * announced with LISTEN_FDS / LISTEN_PID=1. */
1625 if (fdset_size(fds) > 0) {
1626 k = fdset_cloexec(fds, false);
1628 log_error("Failed to unset O_CLOEXEC for file descriptors.");
/* NOTE(review): n_fd_passed receives the int result of sd_listen_fds() but
 * is formatted with %u - confirm its declared type. */
1632 if ((asprintf((char **)(envp + n_env++), "LISTEN_FDS=%u", n_fd_passed) < 0) ||
1633 (asprintf((char **)(envp + n_env++), "LISTEN_PID=%lu", (unsigned long) 1) < 0)) {
1645 /* Automatically search for the init system */
/* l counts the remaining arguments plus argv's terminating NULL, so the
 * memcpy() below also copies the terminator; a[0] is left free for the
 * init path. */
1647 l = 1 + argc - optind;
1648 a = newa(char*, l + 1);
1649 memcpy(a + 1, argv + optind, l * sizeof(char*));
/* Candidates are tried in order; execve() only returns on failure. */
1651 a[0] = (char*) "/usr/lib/systemd/systemd";
1652 execve(a[0], a, (char**) envp);
1654 a[0] = (char*) "/lib/systemd/systemd";
1655 execve(a[0], a, (char**) envp);
1657 a[0] = (char*) "/sbin/init";
1658 execve(a[0], a, (char**) envp);
/* Without --boot: run the given command, or fall back to a login shell
 * ("-bash") started from the home directory. */
1659 } else if (argc > optind)
1660 execvpe(argv[optind], argv + optind, (char**) envp);
1662 chdir(home ? home : "/root");
1663 execle("/bin/bash", "-bash", NULL, (char**) envp);
/* Only reached when every exec attempt above failed. */
1666 log_error("execv() failed: %m");
1669 _exit(EXIT_FAILURE);
/* ---- parent side ---- */
/* Closing both ends of pipefd produces the hang-up the child waits for. */
1672 log_info("Init process in the container running as PID %lu.", (unsigned long) pid);
1673 close_nointr_nofail(pipefd[0]);
1674 close_nointr_nofail(pipefd[1]);
1676 /* Wait for the child process to establish cgroup hierarchy */
1677 close_nointr_nofail(pipefd2[1]);
1678 fd_wait_for_event(pipefd2[0], POLLHUP, -1);
1679 close_nointr_nofail(pipefd2[0]);
/* The return value of save_attributes() is not checked. */
1681 save_attributes(newcg, pid, arg_uuid, arg_directory);
1686 if (process_pty(master, pid, &mask) < 0)
1689 if (saved_attr_valid)
1690 tcsetattr(STDIN_FILENO, TCSANOW, &saved_attr);
/* Reap the child and translate how it ended into log output and, for a
 * regular exit, into the exit status r. */
1692 k = wait_for_terminate(pid, &status);
1698 if (status.si_code == CLD_EXITED) {
1699 r = status.si_status;
1700 if (status.si_status != 0) {
1701 log_error("Container failed with error code %i.", status.si_status);
1705 log_debug("Container exited successfully.");
/* Termination by SIGINT is logged as a shutdown and termination by SIGHUP as
 * a reboot; other signals are logged as errors. */
1707 } else if (status.si_code == CLD_KILLED &&
1708 status.si_status == SIGINT) {
1709 log_info("Container has been shut down.");
1712 } else if (status.si_code == CLD_KILLED &&
1713 status.si_status == SIGHUP) {
1714 log_info("Container is being rebooted.");
1716 } else if (status.si_code == CLD_KILLED ||
1717 status.si_code == CLD_DUMPED) {
1719 log_error("Container terminated by signal %s.", signal_to_string(status.si_status));
1723 log_error("Container failed due to unknown reason.");
/* Common cleanup: restore the terminal, close the kmsg sockets, kill
 * whatever is left in the machine cgroup and free the option storage. */
1730 if (saved_attr_valid)
1731 tcsetattr(STDIN_FILENO, TCSANOW, &saved_attr);
1733 close_pipe(kmsg_socket_pair);
1736 cg_kill_recursive_and_wait(SYSTEMD_CGROUP_CONTROLLER, newcg, true);
1738 free(arg_directory);
1740 strv_free(arg_controllers);