1 /*-*- Mode: C; c-basic-offset: 8; indent-tabs-mode: nil -*-*/
4 This file is part of systemd.
6 Copyright 2010 Lennart Poettering
8 systemd is free software; you can redistribute it and/or modify it
9 under the terms of the GNU Lesser General Public License as published by
10 the Free Software Foundation; either version 2.1 of the License, or
11 (at your option) any later version.
13 systemd is distributed in the hope that it will be useful, but
14 WITHOUT ANY WARRANTY; without even the implied warranty of
15 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
16 Lesser General Public License for more details.
18 You should have received a copy of the GNU Lesser General Public License
19 along with systemd; If not, see <http://www.gnu.org/licenses/>.
25 #include <sys/types.h>
26 #include <sys/syscall.h>
27 #include <sys/mount.h>
33 #include <sys/prctl.h>
34 #include <sys/capability.h>
37 #include <sys/epoll.h>
39 #include <sys/signalfd.h>
43 #include <sys/socket.h>
44 #include <linux/netlink.h>
46 #include <systemd/sd-daemon.h>
47 #include <systemd/sd-bus.h>
55 #include "cgroup-util.h"
57 #include "path-util.h"
58 #include "loopback-setup.h"
60 #include "dev-setup.h"
64 #include "bus-internal.h"
65 #include "bus-message.h"
/* Journal linking mode. The values used elsewhere in this file are LINK_NO,
 * LINK_AUTO, LINK_GUEST and LINK_HOST (see parse_argv() and setup_journal()). */
71 typedef enum LinkJournal {
/* Settings filled in by parse_argv() (main() additionally derives
 * arg_directory and arg_machine when they were not given). */
78 static char *arg_directory = NULL;
79 static char *arg_user = NULL;
80 static sd_id128_t arg_uuid = {};
81 static char *arg_machine = NULL;
82 static const char *arg_slice = NULL;
83 static bool arg_private_network = false;
84 static bool arg_read_only = false;
85 static bool arg_boot = false;
86 static LinkJournal arg_link_journal = LINK_AUTO;
/* Bitmask of capabilities, one bit per CAP_* number. --capability= ORs
 * additional bits into it, and drop_capabilities() passes the complement
 * (~arg_retain) to capability_bounding_set_drop(). */
87 static uint64_t arg_retain =
89 (1ULL << CAP_DAC_OVERRIDE) |
90 (1ULL << CAP_DAC_READ_SEARCH) |
91 (1ULL << CAP_FOWNER) |
92 (1ULL << CAP_FSETID) |
93 (1ULL << CAP_IPC_OWNER) |
96 (1ULL << CAP_LINUX_IMMUTABLE) |
97 (1ULL << CAP_NET_BIND_SERVICE) |
98 (1ULL << CAP_NET_BROADCAST) |
99 (1ULL << CAP_NET_RAW) |
100 (1ULL << CAP_SETGID) |
101 (1ULL << CAP_SETFCAP) |
102 (1ULL << CAP_SETPCAP) |
103 (1ULL << CAP_SETUID) |
104 (1ULL << CAP_SYS_ADMIN) |
105 (1ULL << CAP_SYS_CHROOT) |
106 (1ULL << CAP_SYS_NICE) |
107 (1ULL << CAP_SYS_PTRACE) |
108 (1ULL << CAP_SYS_TTY_CONFIG) |
109 (1ULL << CAP_SYS_RESOURCE) |
110 (1ULL << CAP_SYS_BOOT) |
111 (1ULL << CAP_AUDIT_WRITE) |
112 (1ULL << CAP_AUDIT_CONTROL);
/* Bind mounts requested with --bind= and --bind-ro=. parse_argv() appends two
 * strings per option, and mount_binds() walks the list with
 * STRV_FOREACH_PAIR as (source, destination) pairs. */
113 static char **arg_bind = NULL;
114 static char **arg_bind_ro = NULL;
/* Prints the command-line usage summary with printf(), substituting
 * program_invocation_short_name for the program name.
 *
 * The -j line names the "guest" mode: the -j handling in parse_argv()
 * assigns LINK_GUEST, so the previous text ("host") described a different
 * mode than the one actually selected. */
116 static int help(void) {
118 printf("%s [OPTIONS...] [PATH] [ARGUMENTS...]\n\n"
119 "Spawn a minimal namespace container for debugging, testing and building.\n\n"
120 " -h --help Show this help\n"
121 " --version Print version string\n"
122 " -D --directory=NAME Root directory for the container\n"
123 " -b --boot Boot up full system (i.e. invoke init)\n"
124 " -u --user=USER Run the command under specified user or uid\n"
125 " --uuid=UUID Set a specific machine UUID for the container\n"
126 " -M --machine=NAME Set the machine name for the container\n"
127 " -S --slice=SLICE Place the container in the specified slice\n"
128 " --private-network Disable network in container\n"
129 " --read-only Mount the root directory read-only\n"
130 " --capability=CAP In addition to the default, retain specified\n"
132 " --link-journal=MODE Link up guest journal, one of no, auto, guest, host\n"
133 " -j Equivalent to --link-journal=guest\n"
134 " --bind=PATH[:PATH] Bind mount a file or directory from the host into\n"
136 " --bind-ro=PATH[:PATH] Similar, but creates a read-only bind mount\n",
137 program_invocation_short_name);
/* Parses the command line with getopt_long() and stores the results in the
 * arg_* globals. Invalid values are reported with log_error(). The return
 * statements are not part of this view; main() stores the result in k. */
142 static int parse_argv(int argc, char *argv[]) {
155 static const struct option options[] = {
156 { "help", no_argument, NULL, 'h' },
157 { "version", no_argument, NULL, ARG_VERSION },
158 { "directory", required_argument, NULL, 'D' },
159 { "user", required_argument, NULL, 'u' },
160 { "private-network", no_argument, NULL, ARG_PRIVATE_NETWORK },
161 { "boot", no_argument, NULL, 'b' },
162 { "uuid", required_argument, NULL, ARG_UUID },
163 { "read-only", no_argument, NULL, ARG_READ_ONLY },
164 { "capability", required_argument, NULL, ARG_CAPABILITY },
165 { "link-journal", required_argument, NULL, ARG_LINK_JOURNAL },
166 { "bind", required_argument, NULL, ARG_BIND },
167 { "bind-ro", required_argument, NULL, ARG_BIND_RO },
168 { "machine", required_argument, NULL, 'M' },
169 { "slice", required_argument, NULL, 'S' },
/* NOTE(review): per the glibc getopt documentation a leading '+' in the
 * option string stops parsing at the first non-option argument, which would
 * leave the container command's own switches untouched -- confirm. */
178 while ((c = getopt_long(argc, argv, "+hD:u:bM:jS:", options, NULL)) >= 0) {
187 puts(PACKAGE_STRING);
188 puts(SYSTEMD_FEATURES);
/* The root directory is stored in canonicalized form. */
193 arg_directory = canonicalize_file_name(optarg);
194 if (!arg_directory) {
195 log_error("Failed to canonicalize root directory.");
203 arg_user = strdup(optarg);
209 case ARG_PRIVATE_NETWORK:
210 arg_private_network = true;
218 r = sd_id128_from_string(optarg, &arg_uuid);
220 log_error("Invalid UUID: %s", optarg);
226 arg_slice = strdup(optarg);
/* The machine name is validated with hostname_is_valid() before it is copied. */
230 if (!hostname_is_valid(optarg)) {
231 log_error("Invalid machine name: %s", optarg);
236 arg_machine = strdup(optarg);
243 arg_read_only = true;
/* --capability= takes a comma-separated list; each word is resolved with
 * cap_from_name() and its bit is ORed into arg_retain. */
246 case ARG_CAPABILITY: {
250 FOREACH_WORD_SEPARATOR(word, length, optarg, ",", state) {
254 t = strndup(word, length);
258 if (cap_from_name(t, &cap) < 0) {
259 log_error("Failed to parse capability %s.", t);
265 arg_retain |= 1ULL << (uint64_t) cap;
272 arg_link_journal = LINK_GUEST;
275 case ARG_LINK_JOURNAL:
276 if (streq(optarg, "auto"))
277 arg_link_journal = LINK_AUTO;
278 else if (streq(optarg, "no"))
279 arg_link_journal = LINK_NO;
280 else if (streq(optarg, "guest"))
281 arg_link_journal = LINK_GUEST;
282 else if (streq(optarg, "host"))
283 arg_link_journal = LINK_HOST;
285 log_error("Failed to parse link journal mode %s", optarg);
/* --bind= and --bind-ro= share this code; x selects the list to extend. The
 * argument is searched for ':' to split it into a and b, both of which must
 * be absolute paths, and the two are appended to the list one after the
 * other. */
293 _cleanup_free_ char *a = NULL, *b = NULL;
297 x = c == ARG_BIND ? &arg_bind : &arg_bind_ro;
299 e = strchr(optarg, ':');
301 a = strndup(optarg, e - optarg);
311 if (!path_is_absolute(a) || !path_is_absolute(b)) {
312 log_error("Invalid bind mount specification: %s", optarg);
316 r = strv_extend(x, a);
320 r = strv_extend(x, b);
331 log_error("Unknown option code %c", c);
/* Mounts the file systems listed in mount_table below the container root
 * `dest`. For every entry the target path is built as dest + "/" + where. */
339 static int mount_all(const char *dest) {
341 typedef struct MountPoint {
/* NOTE(review): the struct fields are not visible here; judging by the uses
 * below the columns are what, where, type, options, flags, fatal -- confirm.
 * Rows with a NULL first column are MS_REMOUNT entries that make the
 * preceding bind mount read-only. */
350 static const MountPoint mount_table[] = {
351 { "proc", "/proc", "proc", NULL, MS_NOSUID|MS_NOEXEC|MS_NODEV, true },
352 { "/proc/sys", "/proc/sys", NULL, NULL, MS_BIND, true }, /* Bind mount first */
353 { NULL, "/proc/sys", NULL, NULL, MS_BIND|MS_RDONLY|MS_REMOUNT, true }, /* Then, make it r/o */
354 { "sysfs", "/sys", "sysfs", NULL, MS_RDONLY|MS_NOSUID|MS_NOEXEC|MS_NODEV, true },
355 { "tmpfs", "/dev", "tmpfs", "mode=755", MS_NOSUID|MS_STRICTATIME, true },
356 { "devpts", "/dev/pts", "devpts","newinstance,ptmxmode=0666,mode=620,gid=" STRINGIFY(TTY_GID), MS_NOSUID|MS_NOEXEC, true },
357 { "tmpfs", "/dev/shm", "tmpfs", "mode=1777", MS_NOSUID|MS_NODEV|MS_STRICTATIME, true },
358 { "tmpfs", "/run", "tmpfs", "mode=755", MS_NOSUID|MS_NODEV|MS_STRICTATIME, true },
360 { "/sys/fs/selinux", "/sys/fs/selinux", NULL, NULL, MS_BIND, false }, /* Bind mount first */
361 { NULL, "/sys/fs/selinux", NULL, NULL, MS_BIND|MS_RDONLY|MS_REMOUNT, false }, /* Then, make it r/o */
368 for (k = 0; k < ELEMENTSOF(mount_table); k++) {
369 _cleanup_free_ char *where = NULL;
372 where = strjoin(dest, "/", mount_table[k].where, NULL);
376 t = path_is_mount_point(where, true);
378 log_error("Failed to detect whether %s is a mount point: %s", where, strerror(-t));
386 /* Skip this entry if it is not a remount. */
387 if (mount_table[k].what && t > 0)
/* The result of mkdir_p() is not checked here; a missing target shows up as
 * a mount() failure below. */
390 mkdir_p(where, 0755);
/* A failed mount() is only logged when the entry is marked fatal. */
392 if (mount(mount_table[k].what,
395 mount_table[k].flags,
396 mount_table[k].options) < 0 &&
397 mount_table[k].fatal) {
399 log_error("mount(%s) failed: %m", where);
/* Bind mounts every (source, destination) pair of the string list `l` into
 * the container tree at `dest`.
 *
 * dest:  container root; each destination is resolved as dest + "/" + *y
 * l:     list of consecutive source/destination strings
 * flags: extra mount flags; when non-zero they are applied in a second
 *        MS_REMOUNT|MS_BIND step (main() passes MS_RDONLY for --bind-ro=) */
409 static int mount_binds(const char *dest, char **l, unsigned long flags) {
412 STRV_FOREACH_PAIR(x, y, l) {
413 _cleanup_free_ char *where = NULL;
414 struct stat source_st, dest_st;
416 if (stat(*x, &source_st) < 0) {
417 log_error("failed to stat %s: %m", *x);
421 where = strjoin(dest, "/", *y, NULL);
/* If the destination already exists, its file type must match the source. */
425 if (stat(where, &dest_st) == 0) {
426 if ((source_st.st_mode & S_IFMT) != (dest_st.st_mode & S_IFMT)) {
427 log_error("The file types of %s and %s do not match. Refusing bind mount",
432 /* Create the mount point, but be conservative -- refuse to create block
433 * and char devices. */
434 if (S_ISDIR(source_st.st_mode))
435 mkdir_p_label(where, 0755);
436 else if (S_ISFIFO(source_st.st_mode))
438 else if (S_ISSOCK(source_st.st_mode))
439 mknod(where, 0644 | S_IFSOCK, 0);
440 else if (S_ISREG(source_st.st_mode))
443 log_error("Refusing to create mountpoint for file: %s", *x);
448 if (mount(*x, where, "bind", MS_BIND, NULL) < 0) {
449 log_error("mount(%s) failed: %m", where);
/* Second step: remount the bind mount with the requested flags. */
453 if (flags && mount(NULL, where, NULL, MS_REMOUNT|MS_BIND|flags, NULL) < 0) {
454 log_error("mount(%s) failed: %m", where);
/* Tries to make dest/etc/localtime a symlink to the same zone that the
 * host's /etc/localtime points to. Situations in which this cannot be done
 * (the host file is not a symlink, it points outside /usr/share/zoneinfo/,
 * or the zone file is missing in the container) are logged as warnings. */
462 static int setup_timezone(const char *dest) {
463 _cleanup_free_ char *where = NULL, *p = NULL, *q = NULL, *check = NULL, *what = NULL;
469 /* Fix the timezone, if possible */
470 r = readlink_malloc("/etc/localtime", &p);
472 log_warning("/etc/localtime is not a symlink, not updating container timezone.");
/* z: zone name of the host, taken from the part of the link target after the
 * zoneinfo prefix (both the relative and the absolute form are tried). */
476 z = path_startswith(p, "../usr/share/zoneinfo/");
478 z = path_startswith(p, "/usr/share/zoneinfo/");
480 log_warning("/etc/localtime does not point into /usr/share/zoneinfo/, not updating container timezone.");
484 where = strappend(dest, "/etc/localtime");
/* y: zone name the container's symlink currently points to, if any. */
488 r = readlink_malloc(where, &q);
490 y = path_startswith(q, "../usr/share/zoneinfo/");
492 y = path_startswith(q, "/usr/share/zoneinfo/");
495 /* Already pointing to the right place? Then do nothing .. */
496 if (y && streq(y, z))
/* Only switch zones when the zone file exists inside the container tree. */
500 check = strjoin(dest, "/usr/share/zoneinfo/", z, NULL);
504 if (access(check, F_OK) < 0) {
505 log_warning("Timezone %s does not exist in container, not updating container timezone.", z);
509 what = strappend("../usr/share/zoneinfo/", z);
514 if (symlink(what, where) < 0) {
515 log_error("Failed to correct timezone of container: %m");
/* Bind mounts the host's /etc/resolv.conf onto dest/etc/resolv.conf and then
 * remounts it read-only. The arg_private_network check comes first; its
 * consequence is not visible here (presumably an early return -- confirm). */
522 static int setup_resolv_conf(const char *dest) {
523 char _cleanup_free_ *where = NULL;
524 _cleanup_close_ int fd = -1;
528 if (arg_private_network)
531 /* Fix resolv.conf, if possible */
532 where = strappend(dest, "/etc/resolv.conf");
/* O_CREAT|O_EXCL: creates the file when it does not exist yet -- presumably
 * so that there is something to mount over; the result is not checked in
 * the lines visible here. */
536 fd = open(where, O_WRONLY|O_CREAT|O_EXCL|O_CLOEXEC|O_NOCTTY|O_NOFOLLOW, 0644);
538 /* We don't really care for the results of this really. If it
539 * fails, it fails, but meh... */
540 if (mount("/etc/resolv.conf", where, "bind", MS_BIND, NULL) < 0)
541 log_warning("Failed to bind mount /etc/resolv.conf: %m");
543 if (mount("/etc/resolv.conf", where, "bind",
544 MS_BIND|MS_REMOUNT|MS_RDONLY, NULL) < 0) {
545 log_error("Failed to remount /etc/resolv.conf readonly: %m");
/* Gives the container its own boot ID: a random 128-bit ID is formatted as a
 * UUID string, written to a file below dest/dev and that file is bind
 * mounted over dest/proc/sys/kernel/random/boot_id. */
552 static int setup_boot_id(const char *dest) {
553 _cleanup_free_ char *from = NULL, *to = NULL;
560 /* Generate a new randomized boot ID, so that each boot-up of
561 * the container gets a new one */
563 from = strappend(dest, "/dev/proc-sys-kernel-random-boot-id");
564 to = strappend(dest, "/proc/sys/kernel/random/boot_id");
568 r = sd_id128_randomize(&rnd);
570 log_error("Failed to generate random boot id: %s", strerror(-r));
/* Format the ID in the dashed 8-4-4-4-12 UUID layout. */
574 snprintf(as_uuid, sizeof(as_uuid),
575 "%02x%02x%02x%02x-%02x%02x-%02x%02x-%02x%02x-%02x%02x%02x%02x%02x%02x",
576 SD_ID128_FORMAT_VAL(rnd));
577 char_array_0(as_uuid);
579 r = write_string_file(from, as_uuid);
581 log_error("Failed to write boot id: %s", strerror(-r));
/* A failing bind mount is logged as an error; failing to make it read-only
 * afterwards only produces a warning. */
585 if (mount(from, to, "bind", MS_BIND, NULL) < 0) {
586 log_error("Failed to bind mount boot id: %m");
588 } else if (mount(from, to, "bind", MS_BIND|MS_REMOUNT|MS_RDONLY, NULL))
589 log_warning("Failed to make boot id read-only: %m");
/* Recreates selected host device nodes inside the container: for every name
 * in the devnodes list, /dev/<name> is stat()ed on the host and a node with
 * the same mode and device number is created at dest/dev/<name> with
 * mknod(). A host node that does not exist (ENOENT) is not reported;
 * anything that is neither a character nor a block device is refused.
 *
 * The mknod() failure message reports `to`, the node that could not be
 * created, rather than the container root `dest`. */
595 static int copy_devnodes(const char *dest) {
597 static const char devnodes[] =
607 _cleanup_umask_ mode_t u;
613 NULSTR_FOREACH(d, devnodes) {
615 _cleanup_free_ char *from = NULL, *to = NULL;
617 asprintf(&from, "/dev/%s", d);
618 asprintf(&to, "%s/dev/%s", dest, d);
629 if (stat(from, &st) < 0) {
631 if (errno != ENOENT) {
632 log_error("Failed to stat %s: %m", from);
637 } else if (!S_ISCHR(st.st_mode) && !S_ISBLK(st.st_mode)) {
639 log_error("%s is not a char or block device, cannot copy", from);
643 } else if (mknod(to, st.st_mode, st.st_rdev) < 0) {
645 log_error("mknod(%s) failed: %m", to);
/* Creates dest/dev/ptmx as a symlink to "pts/ptmx". */
654 static int setup_ptmx(const char *dest) {
655 _cleanup_free_ char *p = NULL;
657 p = strappend(dest, "/dev/ptmx");
661 if (symlink("pts/ptmx", p) < 0) {
662 log_error("Failed to create /dev/ptmx symlink: %m");
/* Makes the tty named by `console` available as dest/dev/console: a device
 * node is created at that path and the tty is bind mounted on top of it.
 *
 * dest:    container root
 * console: path of the tty to expose (main() passes the ptsname() of its
 *          pty master); it must be a character device */
669 static int setup_dev_console(const char *dest, const char *console) {
671 _cleanup_free_ char *to = NULL;
673 _cleanup_umask_ mode_t u;
680 if (stat(console, &st) < 0) {
681 log_error("Failed to stat %s: %m", console);
684 } else if (!S_ISCHR(st.st_mode)) {
685 log_error("/dev/console is not a char device");
/* Set mode 0600 and owner/group 0:0 on the tty. */
689 r = chmod_and_chown(console, 0600, 0, 0);
691 log_error("Failed to correct access mode for TTY: %s", strerror(-r));
695 if (asprintf(&to, "%s/dev/console", dest) < 0)
698 /* We need to bind mount the right tty to /dev/console since
699 * ptys can only exist on pts file systems. To have something
700 * to bind mount things on we create a device node first, that
701 * has the right major/minor (note that the major minor
702 * doesn't actually matter here, since we mount it over
705 if (mknod(to, (st.st_mode & ~07777) | 0600, st.st_rdev) < 0) {
706 log_error("mknod() for /dev/console failed: %m");
710 if (mount(console, to, "bind", MS_BIND, NULL) < 0) {
711 log_error("Bind mount for /dev/console failed: %m");
/* Provides a /proc/kmsg replacement for the container: a FIFO is created at
 * dest/dev/kmsg, bind mounted over dest/proc/kmsg, and a file descriptor
 * referring to it is sent over `kmsg_socket` as SCM_RIGHTS ancillary data.
 *
 * dest:        container root
 * kmsg_socket: socket the FIFO fd is sent over; must be >= 0 */
718 static int setup_kmsg(const char *dest, int kmsg_socket) {
719 _cleanup_free_ char *from = NULL, *to = NULL;
721 _cleanup_umask_ mode_t u;
723 struct cmsghdr cmsghdr;
724 uint8_t buf[CMSG_SPACE(sizeof(int))];
727 .msg_control = &control,
728 .msg_controllen = sizeof(control),
730 struct cmsghdr *cmsg;
733 assert(kmsg_socket >= 0);
737 /* We create the kmsg FIFO as /dev/kmsg, but immediately
738 * delete it after bind mounting it to /proc/kmsg. While FIFOs
739 * on the reading side behave very similar to /proc/kmsg,
740 * their writing side behaves differently from /dev/kmsg in
741 * that writing blocks when nothing is reading. In order to
742 * avoid any problems with containers deadlocking due to this
743 * we simply make /dev/kmsg unavailable to the container. */
744 if (asprintf(&from, "%s/dev/kmsg", dest) < 0 ||
745 asprintf(&to, "%s/proc/kmsg", dest) < 0)
748 if (mkfifo(from, 0600) < 0) {
749 log_error("mkfifo() for /dev/kmsg failed: %m");
753 r = chmod_and_chown(from, 0600, 0, 0);
755 log_error("Failed to correct access mode for /dev/kmsg: %s", strerror(-r));
759 if (mount(from, to, "bind", MS_BIND, NULL) < 0) {
760 log_error("Bind mount for /proc/kmsg failed: %m");
/* Open the FIFO read/write and non-blocking (O_NDELAY). */
764 fd = open(from, O_RDWR|O_NDELAY|O_CLOEXEC);
766 log_error("Failed to open fifo: %m");
/* Build a single SCM_RIGHTS control message that carries fd. */
770 cmsg = CMSG_FIRSTHDR(&mh);
771 cmsg->cmsg_level = SOL_SOCKET;
772 cmsg->cmsg_type = SCM_RIGHTS;
773 cmsg->cmsg_len = CMSG_LEN(sizeof(int));
774 memcpy(CMSG_DATA(cmsg), &fd, sizeof(int));
776 mh.msg_controllen = cmsg->cmsg_len;
778 /* Store away the fd in the socket, so that it stays open as
779 * long as we run the child */
780 k = sendmsg(kmsg_socket, &mh, MSG_DONTWAIT|MSG_NOSIGNAL);
781 close_nointr_nofail(fd);
784 log_error("Failed to send FIFO fd: %m");
788 /* And now make the FIFO unavailable as /dev/kmsg... */
/* Calls sethostname() with the machine name stored in arg_machine. */
793 static int setup_hostname(void) {
795 if (sethostname(arg_machine, strlen(arg_machine)) < 0)
/* Connects the journal directory of the container with the one of the host,
 * according to arg_link_journal. The machine ID is read from
 * directory/etc/machine-id and names the per-machine directory below
 * /var/log/journal/ on both sides. */
801 static int setup_journal(const char *directory) {
802 sd_id128_t machine_id;
803 _cleanup_free_ char *p = NULL, *b = NULL, *q = NULL, *d = NULL;
807 if (arg_link_journal == LINK_NO)
810 p = strappend(directory, "/etc/machine-id");
814 r = read_one_line_file(p, &b);
815 if (r == -ENOENT && arg_link_journal == LINK_AUTO)
818 log_error("Failed to read machine ID from %s: %s", p, strerror(-r));
823 if (isempty(id) && arg_link_journal == LINK_AUTO)
826 /* Verify validity */
827 r = sd_id128_from_string(id, &machine_id);
829 log_error("Failed to parse machine ID from %s: %s", p, strerror(-r));
/* From here on p is the journal directory on the host and q is the same
 * path inside the container tree. */
834 p = strappend("/var/log/journal/", id);
835 q = strjoin(directory, "/var/log/journal/", id, NULL);
/* An existing mount point on either side is only reported as an error when
 * a mode other than LINK_AUTO was selected. */
839 if (path_is_mount_point(p, false) > 0) {
840 if (arg_link_journal != LINK_AUTO) {
841 log_error("%s: already a mount point, refusing to use for journal", p);
848 if (path_is_mount_point(q, false) > 0) {
849 if (arg_link_journal != LINK_AUTO) {
850 log_error("%s: already a mount point, refusing to use for journal", q);
/* Look at what currently exists at the host path. */
857 r = readlink_and_make_absolute(p, &d);
859 if ((arg_link_journal == LINK_GUEST ||
860 arg_link_journal == LINK_AUTO) &&
863 r = mkdir_p(q, 0755);
865 log_warning("failed to create directory %s: %m", q);
870 log_error("Failed to remove symlink %s: %m", p);
873 } else if (r == -EINVAL) {
875 if (arg_link_journal == LINK_GUEST &&
878 if (errno == ENOTDIR) {
879 log_error("%s already exists and is neither a symlink nor a directory", p);
882 log_error("Failed to remove %s: %m", p);
886 } else if (r != -ENOENT) {
887 log_error("readlink(%s) failed: %m", p);
/* LINK_GUEST: the host path becomes a symlink to the container's directory. */
891 if (arg_link_journal == LINK_GUEST) {
893 if (symlink(q, p) < 0) {
894 log_error("Failed to symlink %s to %s: %m", q, p);
898 r = mkdir_p(q, 0755);
900 log_warning("failed to create directory %s: %m", q);
/* LINK_HOST: make sure the host directory exists; it is bind mounted onto
 * the container path further down. */
904 if (arg_link_journal == LINK_HOST) {
905 r = mkdir_p(p, 0755);
907 log_error("Failed to create %s: %m", p);
911 } else if (access(p, F_OK) < 0)
914 if (dir_is_empty(q) == 0) {
915 log_error("%s not empty.", q);
919 r = mkdir_p(q, 0755);
921 log_error("Failed to create %s: %m", q);
925 if (mount(p, q, "bind", MS_BIND, NULL) < 0) {
926 log_error("Failed to bind mount journal from host into guest: %m");
/* Passes the complement of arg_retain to capability_bounding_set_drop() and
 * returns its result. */
933 static int drop_capabilities(void) {
934 return capability_bounding_set_drop(~arg_retain, false);
/* Forwards data between the caller's terminal and the container's pty and
 * reacts to signals while doing so.
 *
 * master: pty master fd; stdin data is written to it and data read from it
 *         is written to stdout
 * pid:    process that receives SIGRTMIN+3 on the first SIGTERM when
 *         arg_boot is set
 * mask:   signal set handed to signalfd()
 *
 * Two buffers of LINE_MAX bytes hold data in flight (in_buffer: stdin to
 * master, out_buffer: master to stdout). The stdin, stdout and master fds
 * are registered edge-triggered (EPOLLET), so the *_readable/*_writable
 * flags remember readiness until a read()/write() fails with one of
 * EAGAIN, EPIPE, ECONNRESET or EIO. */
937 static int process_pty(int master, pid_t pid, sigset_t *mask) {
939 char in_buffer[LINE_MAX], out_buffer[LINE_MAX];
940 size_t in_buffer_full = 0, out_buffer_full = 0;
941 struct epoll_event stdin_ev, stdout_ev, master_ev, signal_ev;
942 bool stdin_readable = false, stdout_writable = false, master_readable = false, master_writable = false;
943 int ep = -1, signal_fd = -1, r;
944 bool tried_orderly_shutdown = false;
950 fd_nonblock(STDIN_FILENO, 1);
951 fd_nonblock(STDOUT_FILENO, 1);
952 fd_nonblock(master, 1);
954 signal_fd = signalfd(-1, mask, SFD_NONBLOCK|SFD_CLOEXEC);
956 log_error("signalfd(): %m");
961 ep = epoll_create1(EPOLL_CLOEXEC);
963 log_error("Failed to create epoll: %m");
968 /* We read from STDIN only if this is actually a TTY,
969 * otherwise we assume non-interactivity. */
970 if (isatty(STDIN_FILENO)) {
972 stdin_ev.events = EPOLLIN|EPOLLET;
973 stdin_ev.data.fd = STDIN_FILENO;
975 if (epoll_ctl(ep, EPOLL_CTL_ADD, STDIN_FILENO, &stdin_ev) < 0) {
976 log_error("Failed to register STDIN in epoll: %m");
983 stdout_ev.events = EPOLLOUT|EPOLLET;
984 stdout_ev.data.fd = STDOUT_FILENO;
987 master_ev.events = EPOLLIN|EPOLLOUT|EPOLLET;
988 master_ev.data.fd = master;
/* The signal fd is the only one that is not registered edge-triggered. */
991 signal_ev.events = EPOLLIN;
992 signal_ev.data.fd = signal_fd;
/* EPERM from epoll_ctl() is tolerated for stdout: it is then simply treated
 * as always writable. */
994 if (epoll_ctl(ep, EPOLL_CTL_ADD, STDOUT_FILENO, &stdout_ev) < 0) {
995 if (errno != EPERM) {
996 log_error("Failed to register stdout in epoll: %m");
1000 /* stdout without epoll support. Likely redirected to regular file. */
1001 stdout_writable = true;
1004 if (epoll_ctl(ep, EPOLL_CTL_ADD, master, &master_ev) < 0 ||
1005 epoll_ctl(ep, EPOLL_CTL_ADD, signal_fd, &signal_ev) < 0) {
1006 log_error("Failed to register fds in epoll: %m");
1012 struct epoll_event ev[16];
1016 nfds = epoll_wait(ep, ev, ELEMENTSOF(ev), -1);
1019 if (errno == EINTR || errno == EAGAIN)
1022 log_error("epoll_wait(): %m");
/* Translate the reported events into the readiness flags. */
1029 for (i = 0; i < nfds; i++) {
1030 if (ev[i].data.fd == STDIN_FILENO) {
1032 if (ev[i].events & (EPOLLIN|EPOLLHUP))
1033 stdin_readable = true;
1035 } else if (ev[i].data.fd == STDOUT_FILENO) {
1037 if (ev[i].events & (EPOLLOUT|EPOLLHUP))
1038 stdout_writable = true;
1040 } else if (ev[i].data.fd == master) {
1042 if (ev[i].events & (EPOLLIN|EPOLLHUP))
1043 master_readable = true;
1045 if (ev[i].events & (EPOLLOUT|EPOLLHUP))
1046 master_writable = true;
1048 } else if (ev[i].data.fd == signal_fd) {
1049 struct signalfd_siginfo sfsi;
1052 n = read(signal_fd, &sfsi, sizeof(sfsi));
1053 if (n != sizeof(sfsi)) {
1056 log_error("Failed to read from signalfd: invalid block size");
1061 if (errno != EINTR && errno != EAGAIN) {
1062 log_error("Failed to read from signalfd: %m");
1068 if (sfsi.ssi_signo == SIGWINCH) {
1071 /* The window size changed, let's forward that. */
1072 if (ioctl(STDIN_FILENO, TIOCGWINSZ, &ws) >= 0)
1073 ioctl(master, TIOCSWINSZ, &ws);
1074 } else if (sfsi.ssi_signo == SIGTERM && arg_boot && !tried_orderly_shutdown) {
1076 log_info("Trying to halt container. Send SIGTERM again to trigger immediate termination.");
1078 /* This only works for systemd... */
1079 tried_orderly_shutdown = true;
1080 kill(pid, SIGRTMIN+3);
/* Keep transferring for as long as one of the four directions can make
 * progress. */
1090 while ((stdin_readable && in_buffer_full <= 0) ||
1091 (master_writable && in_buffer_full > 0) ||
1092 (master_readable && out_buffer_full <= 0) ||
1093 (stdout_writable && out_buffer_full > 0)) {
1095 if (stdin_readable && in_buffer_full < LINE_MAX) {
1097 k = read(STDIN_FILENO, in_buffer + in_buffer_full, LINE_MAX - in_buffer_full);
1100 if (errno == EAGAIN || errno == EPIPE || errno == ECONNRESET || errno == EIO)
1101 stdin_readable = false;
1103 log_error("read(): %m");
1108 in_buffer_full += (size_t) k;
1111 if (master_writable && in_buffer_full > 0) {
1113 k = write(master, in_buffer, in_buffer_full);
1116 if (errno == EAGAIN || errno == EPIPE || errno == ECONNRESET || errno == EIO)
1117 master_writable = false;
1119 log_error("write(): %m");
/* After a partial write the unsent remainder is moved to the buffer start. */
1125 assert(in_buffer_full >= (size_t) k);
1126 memmove(in_buffer, in_buffer + k, in_buffer_full - k);
1127 in_buffer_full -= k;
1131 if (master_readable && out_buffer_full < LINE_MAX) {
1133 k = read(master, out_buffer + out_buffer_full, LINE_MAX - out_buffer_full);
1136 if (errno == EAGAIN || errno == EPIPE || errno == ECONNRESET || errno == EIO)
1137 master_readable = false;
1139 log_error("read(): %m");
1144 out_buffer_full += (size_t) k;
1147 if (stdout_writable && out_buffer_full > 0) {
1149 k = write(STDOUT_FILENO, out_buffer, out_buffer_full);
1152 if (errno == EAGAIN || errno == EPIPE || errno == ECONNRESET || errno == EIO)
1153 stdout_writable = false;
1155 log_error("write(): %m");
1161 assert(out_buffer_full >= (size_t) k);
1162 memmove(out_buffer, out_buffer + k, out_buffer_full - k);
1163 out_buffer_full -= k;
1171 close_nointr_nofail(ep);
1174 close_nointr_nofail(signal_fd);
/* Opens the system bus and calls a method of the
 * org.freedesktop.machine1.Manager interface on the object
 * /org/freedesktop/machine1 (the method name and part of the argument list
 * are not visible here). The visible arguments include arg_uuid,
 * arg_directory and a "Slice" property of type "s" set to arg_slice. */
1179 static int register_machine(void) {
1180 _cleanup_bus_error_free_ sd_bus_error error = SD_BUS_ERROR_NULL;
1181 _cleanup_bus_unref_ sd_bus *bus = NULL;
1184 r = sd_bus_open_system(&bus);
1186 log_error("Failed to open system bus: %s", strerror(-r));
1190 r = sd_bus_call_method(
1192 "org.freedesktop.machine1",
1193 "/org/freedesktop/machine1",
1194 "org.freedesktop.machine1.Manager",
1200 SD_BUS_APPEND_ID128(arg_uuid),
1204 strempty(arg_directory),
1205 1, "Slice", "s", strempty(arg_slice));
/* Prefer the message of the bus error; fall back to strerror() of -r. */
1207 log_error("Failed to register machine: %s", error.message ? error.message : strerror(-r));
/* Probes for the kernel audit subsystem by opening a NETLINK_AUDIT socket,
 * which is closed again right away. NOTE(review): the return statements are
 * not visible here; presumably true is returned when the socket could be
 * opened -- confirm. */
1214 static bool audit_enabled(void) {
1217 fd = socket(AF_NETLINK, SOCK_RAW, NETLINK_AUDIT);
1219 close_nointr_nofail(fd);
/* Entry point: parses the command line, validates the environment, allocates
 * a pty, clones a child into new namespaces that sets up and enters the
 * container, and then relays the terminal and waits for the child.
 *
 * The error messages after the setresgid()/setresuid() calls name those
 * functions; they previously said setregid()/setreuid(), which are not the
 * calls that failed. */
1225 int main(int argc, char *argv[]) {
1227 int r = EXIT_FAILURE, k;
1228 _cleanup_close_ int master = -1;
1230 const char *console = NULL;
1231 struct termios saved_attr, raw_attr;
1233 bool saved_attr_valid = false;
1235 int kmsg_socket_pair[2] = { -1, -1 };
1238 log_parse_environment();
1241 k = parse_argv(argc, argv);
/* Determine the container root: make a given directory absolute relative to
 * the current directory, otherwise use the current directory itself. */
1249 if (arg_directory) {
1252 p = path_make_absolute_cwd(arg_directory);
1253 free(arg_directory);
1256 arg_directory = get_current_dir_name();
1258 if (!arg_directory) {
1259 log_error("Failed to determine path, please use -D.");
1263 path_kill_slashes(arg_directory);
/* Derive the machine name from the last component of the directory. */
1266 arg_machine = strdup(path_get_file_name(arg_directory));
1272 hostname_cleanup(arg_machine, false);
1273 if (isempty(arg_machine)) {
1274 log_error("Failed to determine machine name automatically, please use -M.");
/* Sanity checks of the environment. */
1279 if (geteuid() != 0) {
1280 log_error("Need to be root.");
1284 if (sd_booted() <= 0) {
1285 log_error("Not running on a systemd system.");
1289 if (arg_boot && audit_enabled()) {
1290 log_warning("The kernel auditing subsystem is known to be incompatible with containers.\n"
1291 "Please make sure to turn off auditing with 'audit=0' on the kernel command\n"
1292 "line before using systemd-nspawn. Sleeping for 5s...\n");
1296 if (path_equal(arg_directory, "/")) {
1297 log_error("Spawning container on root directory not supported.");
1301 if (path_is_os_tree(arg_directory) <= 0) {
1302 log_error("Directory %s doesn't look like an OS root directory (/etc/os-release is missing). Refusing.", arg_directory);
/* Collect file descriptors passed in by the service manager (sd_listen_fds)
 * so that they can be handed on to the container. */
1307 n_fd_passed = sd_listen_fds(false);
1308 if (n_fd_passed > 0) {
1309 k = fdset_new_listen_fds(&fds, false);
1311 log_error("Failed to collect file descriptors: %s", strerror(-k));
1315 fdset_close_others(fds);
/* Allocate the pty that becomes the container's console. */
1318 master = posix_openpt(O_RDWR|O_NOCTTY|O_CLOEXEC|O_NDELAY);
1320 log_error("Failed to acquire pseudo tty: %m");
1324 console = ptsname(master);
1326 log_error("Failed to determine tty name: %m");
1330 log_info("Spawning namespace container on %s (console is %s).", arg_directory, console);
/* Copy the window size of our terminal to the pty. */
1332 if (ioctl(STDIN_FILENO, TIOCGWINSZ, &ws) >= 0)
1333 ioctl(master, TIOCSWINSZ, &ws);
1335 if (unlockpt(master) < 0) {
1336 log_error("Failed to unlock tty: %m");
/* Remember the terminal attributes for restoring them later and prepare a
 * raw variant with echo disabled. */
1340 if (tcgetattr(STDIN_FILENO, &saved_attr) >= 0) {
1341 saved_attr_valid = true;
1343 raw_attr = saved_attr;
1344 cfmakeraw(&raw_attr);
1345 raw_attr.c_lflag &= ~ECHO;
1348 if (socketpair(AF_UNIX, SOCK_DGRAM|SOCK_NONBLOCK|SOCK_CLOEXEC, 0, kmsg_socket_pair) < 0) {
1349 log_error("Failed to create kmsg socket pair.");
1353 sd_notify(0, "READY=1");
/* Block the signals that process_pty() reads through its signalfd. */
1355 assert_se(sigemptyset(&mask) == 0);
1356 sigset_add_many(&mask, SIGCHLD, SIGWINCH, SIGTERM, SIGINT, -1);
1357 assert_se(sigprocmask(SIG_BLOCK, &mask, NULL) == 0);
/* Two pipes are used to synchronize parent and child; each side waits for
 * POLLHUP on the read end. */
1361 int pipefd[2], pipefd2[2];
1363 if (pipe2(pipefd, O_NONBLOCK|O_CLOEXEC) < 0) {
1364 log_error("pipe2(): %m");
1368 if (pipe2(pipefd2, O_NONBLOCK|O_CLOEXEC) < 0) {
1369 log_error("pipe2(): %m");
/* Create the child in new IPC, mount, PID and UTS namespaces, plus a new
 * network namespace when arg_private_network is set. */
1374 pid = syscall(__NR_clone, SIGCHLD|CLONE_NEWIPC|CLONE_NEWNS|CLONE_NEWPID|CLONE_NEWUTS|(arg_private_network ? CLONE_NEWNET : 0), NULL);
1376 if (errno == EINVAL)
1377 log_error("clone() failed, do you have namespace support enabled in your kernel? (You need UTS, IPC, PID and NET namespacing built in): %m");
1379 log_error("clone() failed: %m");
1386 const char *home = NULL;
1387 uid_t uid = (uid_t) -1;
1388 gid_t gid = (gid_t) -1;
1390 const char *envp[] = {
1391 "PATH=/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin",
1392 "container=systemd-nspawn", /* LXC sets container=lxc, so follow the scheme here */
1397 NULL, /* container_uuid */
1398 NULL, /* LISTEN_FDS */
1399 NULL, /* LISTEN_PID */
1403 envp[n_env] = strv_find_prefix(environ, "TERM=");
1407 /* Wait for the parent process to log our PID */
1408 close_nointr_nofail(pipefd[1]);
1409 fd_wait_for_event(pipefd[0], POLLHUP, -1);
1410 close_nointr_nofail(pipefd[0]);
1412 close_nointr_nofail(master);
1415 if (saved_attr_valid) {
1416 if (tcsetattr(STDIN_FILENO, TCSANOW, &raw_attr) < 0) {
1417 log_error("Failed to set terminal attributes: %m");
/* Replace stdin, stdout and stderr with the container console. */
1422 close_nointr(STDIN_FILENO);
1423 close_nointr(STDOUT_FILENO);
1424 close_nointr(STDERR_FILENO);
1426 close_nointr_nofail(kmsg_socket_pair[0]);
1427 kmsg_socket_pair[0] = -1;
1429 reset_all_signal_handlers();
1431 assert_se(sigemptyset(&mask) == 0);
1432 assert_se(sigprocmask(SIG_SETMASK, &mask, NULL) == 0);
1434 k = open_terminal(console, O_RDWR);
1435 if (k != STDIN_FILENO) {
1437 close_nointr_nofail(k);
1441 log_error("Failed to open console: %s", strerror(-k));
1445 if (dup2(STDIN_FILENO, STDOUT_FILENO) != STDOUT_FILENO ||
1446 dup2(STDIN_FILENO, STDERR_FILENO) != STDERR_FILENO) {
1447 log_error("Failed to duplicate console: %m");
1452 log_error("setsid() failed: %m");
/* Have the kernel send SIGKILL to this process when the parent dies. */
1456 if (prctl(PR_SET_PDEATHSIG, SIGKILL) < 0) {
1457 log_error("PR_SET_PDEATHSIG failed: %m");
1461 close_pipe(pipefd2);
1463 r = register_machine();
1467 /* Mark everything as slave, so that we still
1468 * receive mounts from the real root, but don't
1469 * propagate mounts to the real root. */
1470 if (mount(NULL, "/", NULL, MS_SLAVE|MS_REC, NULL) < 0) {
1471 log_error("MS_SLAVE|MS_REC failed: %m");
1475 /* Turn directory into bind mount */
1476 if (mount(arg_directory, arg_directory, "bind", MS_BIND|MS_REC, NULL) < 0) {
1477 log_error("Failed to make bind mount.");
1482 if (mount(arg_directory, arg_directory, "bind", MS_BIND|MS_REMOUNT|MS_RDONLY|MS_REC, NULL) < 0) {
1483 log_error("Failed to make read-only.");
/* Populate the container tree; each step aborts the child on failure. */
1487 if (mount_all(arg_directory) < 0)
1490 if (copy_devnodes(arg_directory) < 0)
1493 if (setup_ptmx(arg_directory) < 0)
1496 dev_setup(arg_directory);
1498 if (setup_dev_console(arg_directory, console) < 0)
1501 if (setup_kmsg(arg_directory, kmsg_socket_pair[1]) < 0)
1504 close_nointr_nofail(kmsg_socket_pair[1]);
1505 kmsg_socket_pair[1] = -1;
1507 if (setup_boot_id(arg_directory) < 0)
1510 if (setup_timezone(arg_directory) < 0)
1513 if (setup_resolv_conf(arg_directory) < 0)
1516 if (setup_journal(arg_directory) < 0)
1519 if (mount_binds(arg_directory, arg_bind, 0) < 0)
1522 if (mount_binds(arg_directory, arg_bind_ro, MS_RDONLY) < 0)
/* Make the container tree the new root: move the mount to "/", chroot into
 * it and change to the new "/". */
1525 if (chdir(arg_directory) < 0) {
1526 log_error("chdir(%s) failed: %m", arg_directory);
1530 if (mount(arg_directory, "/", NULL, MS_MOVE, NULL) < 0) {
1531 log_error("mount(MS_MOVE) failed: %m");
1535 if (chroot(".") < 0) {
1536 log_error("chroot() failed: %m");
1540 if (chdir("/") < 0) {
1541 log_error("chdir() failed: %m");
1549 if (drop_capabilities() < 0) {
1550 log_error("drop_capabilities() failed: %m");
1556 /* Note that this resolves user names
1557 * inside the container, and hence
1558 * accesses the NSS modules from the
1559 * container and not the host. This is
1562 if (get_user_creds((const char**)&arg_user, &uid, &gid, &home, NULL) < 0) {
1563 log_error("get_user_creds() failed: %m");
1567 if (mkdir_parents_label(home, 0775) < 0) {
1568 log_error("mkdir_parents_label() failed: %m");
1572 if (mkdir_safe_label(home, 0775, uid, gid) < 0) {
1573 log_error("mkdir_safe_label() failed: %m");
1577 if (initgroups((const char*)arg_user, gid) < 0) {
1578 log_error("initgroups() failed: %m");
1582 if (setresgid(gid, gid, gid) < 0) {
1583 log_error("setresgid() failed: %m");
1587 if (setresuid(uid, uid, uid) < 0) {
1588 log_error("setresuid() failed: %m");
1592 /* Reset everything fully to 0, just in case */
1594 if (setgroups(0, NULL) < 0) {
1595 log_error("setgroups() failed: %m");
1599 if (setresgid(0, 0, 0) < 0) {
1600 log_error("setresgid() failed: %m");
1604 if (setresuid(0, 0, 0) < 0) {
1605 log_error("setresuid() failed: %m");
/* Fill the remaining slots of the environment block. */
1610 if ((asprintf((char**)(envp + n_env++), "HOME=%s", home ? home: "/root") < 0) ||
1611 (asprintf((char**)(envp + n_env++), "USER=%s", arg_user ? arg_user : "root") < 0) ||
1612 (asprintf((char**)(envp + n_env++), "LOGNAME=%s", arg_user ? arg_user : "root") < 0)) {
1617 if (!sd_id128_equal(arg_uuid, SD_ID128_NULL)) {
1618 if (asprintf((char**)(envp + n_env++), "container_uuid=" SD_ID128_FORMAT_STR, SD_ID128_FORMAT_VAL(arg_uuid)) < 0) {
/* Passed-in fds must survive exec, so O_CLOEXEC is cleared; LISTEN_PID is
 * set to 1 for the process that is executed below. */
1624 if (fdset_size(fds) > 0) {
1625 k = fdset_cloexec(fds, false);
1627 log_error("Failed to unset O_CLOEXEC for file descriptors.");
1631 if ((asprintf((char **)(envp + n_env++), "LISTEN_FDS=%u", n_fd_passed) < 0) ||
1632 (asprintf((char **)(envp + n_env++), "LISTEN_PID=1") < 0)) {
1644 /* Automatically search for the init system */
1646 l = 1 + argc - optind;
1647 a = newa(char*, l + 1);
1648 memcpy(a + 1, argv + optind, l * sizeof(char*));
/* execve() only returns on failure, so the candidates are tried in turn. */
1650 a[0] = (char*) "/usr/lib/systemd/systemd";
1651 execve(a[0], a, (char**) envp);
1653 a[0] = (char*) "/lib/systemd/systemd";
1654 execve(a[0], a, (char**) envp);
1656 a[0] = (char*) "/sbin/init";
1657 execve(a[0], a, (char**) envp);
1658 } else if (argc > optind)
1659 execvpe(argv[optind], argv + optind, (char**) envp);
1661 chdir(home ? home : "/root");
1662 execle("/bin/bash", "-bash", NULL, (char**) envp);
1665 log_error("execv() failed: %m");
1668 _exit(EXIT_FAILURE);
/* Parent side from here on. Closing both ends of pipefd releases the child,
 * which waits for POLLHUP on it. */
1671 log_info("Init process in the container running as PID %lu.", (unsigned long) pid);
1672 close_nointr_nofail(pipefd[0]);
1673 close_nointr_nofail(pipefd[1]);
1675 /* Wait for the child process to establish cgroup hierarchy */
1676 close_nointr_nofail(pipefd2[1]);
1677 fd_wait_for_event(pipefd2[0], POLLHUP, -1);
1678 close_nointr_nofail(pipefd2[0]);
1683 if (process_pty(master, pid, &mask) < 0)
1686 if (saved_attr_valid)
1687 tcsetattr(STDIN_FILENO, TCSANOW, &saved_attr);
/* Evaluate how the container's init process ended. */
1689 k = wait_for_terminate(pid, &status);
1695 if (status.si_code == CLD_EXITED) {
1696 r = status.si_status;
1697 if (status.si_status != 0) {
1698 log_error("Container failed with error code %i.", status.si_status);
1702 log_debug("Container exited successfully.");
1704 } else if (status.si_code == CLD_KILLED &&
1705 status.si_status == SIGINT) {
1706 log_info("Container has been shut down.");
1709 } else if (status.si_code == CLD_KILLED &&
1710 status.si_status == SIGHUP) {
1711 log_info("Container is being rebooted.");
1713 } else if (status.si_code == CLD_KILLED ||
1714 status.si_code == CLD_DUMPED) {
1716 log_error("Container terminated by signal %s.", signal_to_string(status.si_status));
1720 log_error("Container failed due to unknown reason.");
/* Cleanup: restore the terminal and release resources. */
1727 if (saved_attr_valid)
1728 tcsetattr(STDIN_FILENO, TCSANOW, &saved_attr);
1730 close_pipe(kmsg_socket_pair);
1735 free(arg_directory);