1 /*-*- Mode: C; c-basic-offset: 8; indent-tabs-mode: nil -*-*/
4 This file is part of systemd.
6 Copyright 2010 Lennart Poettering
8 systemd is free software; you can redistribute it and/or modify it
9 under the terms of the GNU Lesser General Public License as published by
10 the Free Software Foundation; either version 2.1 of the License, or
11 (at your option) any later version.
13 systemd is distributed in the hope that it will be useful, but
14 WITHOUT ANY WARRANTY; without even the implied warranty of
15 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
16 Lesser General Public License for more details.
18 You should have received a copy of the GNU Lesser General Public License
19 along with systemd; If not, see <http://www.gnu.org/licenses/>.
25 #include <sys/types.h>
26 #include <sys/syscall.h>
27 #include <sys/mount.h>
33 #include <sys/prctl.h>
34 #include <sys/capability.h>
37 #include <sys/epoll.h>
39 #include <sys/signalfd.h>
43 #include <sys/socket.h>
44 #include <linux/netlink.h>
46 #include <systemd/sd-daemon.h>
47 #include <systemd/sd-bus.h>
55 #include "cgroup-util.h"
57 #include "path-util.h"
58 #include "loopback-setup.h"
60 #include "dev-setup.h"
/* Journal linking mode; the values used in this file are LINK_NO, LINK_AUTO,
 * LINK_GUEST and LINK_HOST (see the --link-journal= handling in parse_argv()). */
70 typedef enum LinkJournal {
/* Settings collected from the command line by parse_argv() and consumed by
 * the setup helpers and main(). */
77 static char *arg_directory = NULL;
78 static char *arg_user = NULL;
79 static sd_id128_t arg_uuid = {};
80 static char *arg_machine = NULL;
81 static const char *arg_slice = NULL;
82 static bool arg_private_network = false;
83 static bool arg_read_only = false;
84 static bool arg_boot = false;
85 static LinkJournal arg_link_journal = LINK_AUTO;
/* Bit mask with one bit per CAP_* number. drop_capabilities() hands the
 * complement of this mask to capability_bounding_set_drop(), and
 * --capability= ORs additional bits into it. */
86 static uint64_t arg_retain =
88 (1ULL << CAP_DAC_OVERRIDE) |
89 (1ULL << CAP_DAC_READ_SEARCH) |
90 (1ULL << CAP_FOWNER) |
91 (1ULL << CAP_FSETID) |
92 (1ULL << CAP_IPC_OWNER) |
95 (1ULL << CAP_LINUX_IMMUTABLE) |
96 (1ULL << CAP_NET_BIND_SERVICE) |
97 (1ULL << CAP_NET_BROADCAST) |
98 (1ULL << CAP_NET_RAW) |
99 (1ULL << CAP_SETGID) |
100 (1ULL << CAP_SETFCAP) |
101 (1ULL << CAP_SETPCAP) |
102 (1ULL << CAP_SETUID) |
103 (1ULL << CAP_SYS_ADMIN) |
104 (1ULL << CAP_SYS_CHROOT) |
105 (1ULL << CAP_SYS_NICE) |
106 (1ULL << CAP_SYS_PTRACE) |
107 (1ULL << CAP_SYS_TTY_CONFIG) |
108 (1ULL << CAP_SYS_RESOURCE) |
109 (1ULL << CAP_SYS_BOOT) |
110 (1ULL << CAP_AUDIT_WRITE) |
111 (1ULL << CAP_AUDIT_CONTROL);
/* Flat string lists of bind mount paths. parse_argv() appends two entries
 * (source, destination) per --bind= or --bind-ro= option and mount_binds()
 * walks the lists in pairs. */
112 static char **arg_bind = NULL;
113 static char **arg_bind_ro = NULL;
/* Print the usage summary to stdout, using program_invocation_short_name as
 * the program name.
 *
 * NOTE(review): the text for -j says "--link-journal=host", while the
 * assignment in parse_argv() that appears to belong to -j sets LINK_GUEST.
 * Confirm which of the two is intended and align them. */
115 static int help(void) {
117 printf("%s [OPTIONS...] [PATH] [ARGUMENTS...]\n\n"
118 "Spawn a minimal namespace container for debugging, testing and building.\n\n"
119 " -h --help Show this help\n"
120 " --version Print version string\n"
121 " -D --directory=NAME Root directory for the container\n"
122 " -b --boot Boot up full system (i.e. invoke init)\n"
123 " -u --user=USER Run the command under specified user or uid\n"
124 " --uuid=UUID Set a specific machine UUID for the container\n"
125 " -M --machine=NAME Set the machine name for the container\n"
126 " -S --slice=SLICE Place the container in the specified slice\n"
127 " --private-network Disable network in container\n"
128 " --read-only Mount the root directory read-only\n"
129 " --capability=CAP In addition to the default, retain specified\n"
131 " --link-journal=MODE Link up guest journal, one of no, auto, guest, host\n"
132 " -j Equivalent to --link-journal=host\n"
133 " --bind=PATH[:PATH] Bind mount a file or directory from the host into\n"
135 " --bind-ro=PATH[:PATH] Similar, but creates a read-only bind mount\n",
136 program_invocation_short_name);
/* Parse the command line with getopt_long() and store the results in the
 * arg_* globals. Invalid values are reported with log_error(). */
141 static int parse_argv(int argc, char *argv[]) {
/* Long option table; options without a short letter map to ARG_* codes. */
154 static const struct option options[] = {
155 { "help", no_argument, NULL, 'h' },
156 { "version", no_argument, NULL, ARG_VERSION },
157 { "directory", required_argument, NULL, 'D' },
158 { "user", required_argument, NULL, 'u' },
159 { "private-network", no_argument, NULL, ARG_PRIVATE_NETWORK },
160 { "boot", no_argument, NULL, 'b' },
161 { "uuid", required_argument, NULL, ARG_UUID },
162 { "read-only", no_argument, NULL, ARG_READ_ONLY },
163 { "capability", required_argument, NULL, ARG_CAPABILITY },
164 { "link-journal", required_argument, NULL, ARG_LINK_JOURNAL },
165 { "bind", required_argument, NULL, ARG_BIND },
166 { "bind-ro", required_argument, NULL, ARG_BIND_RO },
167 { "machine", required_argument, NULL, 'M' },
168 { "slice", required_argument, NULL, 'S' },
/* The leading '+' in the option string makes getopt stop at the first
 * non-option argument, so arguments after PATH are left untouched. */
177 while ((c = getopt_long(argc, argv, "+hD:u:bM:jS:", options, NULL)) >= 0) {
186 puts(PACKAGE_STRING);
187 puts(SYSTEMD_FEATURES);
/* The root directory is stored in canonicalized form. */
192 arg_directory = canonicalize_file_name(optarg);
193 if (!arg_directory) {
194 log_error("Failed to canonicalize root directory.");
202 arg_user = strdup(optarg);
208 case ARG_PRIVATE_NETWORK:
209 arg_private_network = true;
217 r = sd_id128_from_string(optarg, &arg_uuid);
219 log_error("Invalid UUID: %s", optarg);
/* NOTE(review): arg_slice is declared const char * but receives a
 * heap-allocated copy here -- confirm ownership is handled as intended. */
225 arg_slice = strdup(optarg);
/* Machine names must pass hostname_is_valid() before they are accepted. */
229 if (!hostname_is_valid(optarg)) {
230 log_error("Invalid machine name: %s", optarg);
235 arg_machine = strdup(optarg);
242 arg_read_only = true;
/* --capability= takes a comma-separated list; each name that
 * cap_from_name() resolves is added to the arg_retain mask. */
245 case ARG_CAPABILITY: {
249 FOREACH_WORD_SEPARATOR(word, length, optarg, ",", state) {
253 t = strndup(word, length);
257 if (cap_from_name(t, &cap) < 0) {
258 log_error("Failed to parse capability %s.", t);
264 arg_retain |= 1ULL << (uint64_t) cap;
/* NOTE(review): presumably the -j case; it selects LINK_GUEST although
 * help() describes -j as "--link-journal=host" -- confirm. */
271 arg_link_journal = LINK_GUEST;
274 case ARG_LINK_JOURNAL:
275 if (streq(optarg, "auto"))
276 arg_link_journal = LINK_AUTO;
277 else if (streq(optarg, "no"))
278 arg_link_journal = LINK_NO;
279 else if (streq(optarg, "guest"))
280 arg_link_journal = LINK_GUEST;
281 else if (streq(optarg, "host"))
282 arg_link_journal = LINK_HOST;
284 log_error("Failed to parse link journal mode %s", optarg);
/* --bind= and --bind-ro= share this code: x selects the list to extend,
 * a is the part of the argument before ':' and both a and b must be
 * absolute paths before they are appended as a pair. */
292 _cleanup_free_ char *a = NULL, *b = NULL;
296 x = c == ARG_BIND ? &arg_bind : &arg_bind_ro;
298 e = strchr(optarg, ':');
300 a = strndup(optarg, e - optarg);
310 if (!path_is_absolute(a) || !path_is_absolute(b)) {
311 log_error("Invalid bind mount specification: %s", optarg);
315 r = strv_extend(x, a);
319 r = strv_extend(x, b);
330 log_error("Unknown option code %c", c);
/* Set up the mounts listed in mount_table below the container root dest.
 * Each entry is mounted at dest + "/" + its where path; a failing mount is
 * only reported when the entry is marked fatal. */
338 static int mount_all(const char *dest) {
340 typedef struct MountPoint {
/* Some targets appear twice: first a bind mount, then an entry with a NULL
 * source that remounts the same target read-only. */
349 static const MountPoint mount_table[] = {
350 { "proc", "/proc", "proc", NULL, MS_NOSUID|MS_NOEXEC|MS_NODEV, true },
351 { "/proc/sys", "/proc/sys", NULL, NULL, MS_BIND, true }, /* Bind mount first */
352 { NULL, "/proc/sys", NULL, NULL, MS_BIND|MS_RDONLY|MS_REMOUNT, true }, /* Then, make it r/o */
353 { "sysfs", "/sys", "sysfs", NULL, MS_RDONLY|MS_NOSUID|MS_NOEXEC|MS_NODEV, true },
354 { "tmpfs", "/dev", "tmpfs", "mode=755", MS_NOSUID|MS_STRICTATIME, true },
355 { "devpts", "/dev/pts", "devpts","newinstance,ptmxmode=0666,mode=620,gid=" STRINGIFY(TTY_GID), MS_NOSUID|MS_NOEXEC, true },
356 { "tmpfs", "/dev/shm", "tmpfs", "mode=1777", MS_NOSUID|MS_NODEV|MS_STRICTATIME, true },
357 { "tmpfs", "/run", "tmpfs", "mode=755", MS_NOSUID|MS_NODEV|MS_STRICTATIME, true },
359 { "/sys/fs/selinux", "/sys/fs/selinux", NULL, NULL, MS_BIND, false }, /* Bind mount first */
360 { NULL, "/sys/fs/selinux", NULL, NULL, MS_BIND|MS_RDONLY|MS_REMOUNT, false }, /* Then, make it r/o */
367 for (k = 0; k < ELEMENTSOF(mount_table); k++) {
368 _cleanup_free_ char *where = NULL;
371 where = strjoin(dest, "/", mount_table[k].where, NULL);
/* t > 0 means the target already is a mount point, t < 0 is an error. */
375 t = path_is_mount_point(where, true);
377 log_error("Failed to detect whether %s is a mount point: %s", where, strerror(-t));
385 /* Skip this entry if it is not a remount. */
386 if (mount_table[k].what && t > 0)
/* Best effort: the result of mkdir_p() is not checked here. */
389 mkdir_p(where, 0755);
391 if (mount(mount_table[k].what,
394 mount_table[k].flags,
395 mount_table[k].options) < 0 &&
396 mount_table[k].fatal) {
398 log_error("mount(%s) failed: %m", where);
/* Bind mount host paths into the container.
 *
 * dest  - container root directory
 * l     - string list walked in pairs: host source path, then the
 *         destination path that is appended to dest
 * flags - extra mount flags; when non-zero each new bind mount is remounted
 *         with MS_REMOUNT|MS_BIND|flags */
408 static int mount_binds(const char *dest, char **l, unsigned long flags) {
411 STRV_FOREACH_PAIR(x, y, l) {
412 _cleanup_free_ char *where = NULL;
413 struct stat source_st, dest_st;
415 if (stat(*x, &source_st) < 0) {
416 log_error("failed to stat %s: %m", *x);
420 where = strjoin(dest, "/", *y, NULL);
/* An existing destination must have the same file type as the source. */
424 if (stat(where, &dest_st) == 0) {
425 if ((source_st.st_mode & S_IFMT) != (dest_st.st_mode & S_IFMT)) {
426 log_error("The file types of %s and %s do not match. Refusing bind mount",
431 /* Create the mount point, but be conservative -- refuse to create block
432 * and char devices. */
433 if (S_ISDIR(source_st.st_mode))
434 mkdir_p_label(where, 0755);
435 else if (S_ISFIFO(source_st.st_mode))
437 else if (S_ISSOCK(source_st.st_mode))
438 mknod(where, 0644 | S_IFSOCK, 0);
439 else if (S_ISREG(source_st.st_mode))
442 log_error("Refusing to create mountpoint for file: %s", *x);
/* Plain bind mount first; the extra flags are applied by a separate
 * remount afterwards. */
447 if (mount(*x, where, "bind", MS_BIND, NULL) < 0) {
448 log_error("mount(%s) failed: %m", where);
452 if (flags && mount(NULL, where, NULL, MS_REMOUNT|MS_BIND|flags, NULL) < 0) {
453 log_error("mount(%s) failed: %m", where);
/* Make the /etc/localtime symlink below dest point to the same zone as the
 * host's /etc/localtime. The zone name is whatever follows the zoneinfo
 * directory prefix in the host's symlink target. Nothing is changed when the
 * host file is not a symlink into /usr/share/zoneinfo/ or when the zone file
 * does not exist inside the container. */
461 static int setup_timezone(const char *dest) {
462 _cleanup_free_ char *where = NULL, *p = NULL, *q = NULL, *check = NULL, *what = NULL;
468 /* Fix the timezone, if possible */
469 r = readlink_malloc("/etc/localtime", &p);
471 log_warning("/etc/localtime is not a symlink, not updating container timezone.");
/* z is the host zone name; both the relative and the absolute form of the
 * zoneinfo prefix are tried. */
475 z = path_startswith(p, "../usr/share/zoneinfo/");
477 z = path_startswith(p, "/usr/share/zoneinfo/");
479 log_warning("/etc/localtime does not point into /usr/share/zoneinfo/, not updating container timezone.");
483 where = strappend(dest, "/etc/localtime");
/* y is the zone name the container's own symlink currently points to. */
487 r = readlink_malloc(where, &q);
489 y = path_startswith(q, "../usr/share/zoneinfo/");
491 y = path_startswith(q, "/usr/share/zoneinfo/");
494 /* Already pointing to the right place? Then do nothing .. */
495 if (y && streq(y, z))
/* Only link to zones that actually exist in the container tree. */
499 check = strjoin(dest, "/usr/share/zoneinfo/", z, NULL);
503 if (access(check, F_OK) < 0) {
504 log_warning("Timezone %s does not exist in container, not updating container timezone.", z);
/* The new link is written in relative form. */
508 what = strappend("../usr/share/zoneinfo/", z);
513 if (symlink(what, where) < 0) {
514 log_error("Failed to correct timezone of container: %m");
/* Copy the host's /etc/resolv.conf to the same path below dest. The copy is
 * best effort: the result of copy_file() is ignored. arg_private_network is
 * checked first (presumably to skip the copy when the container has no
 * network -- the statement following the check is not shown here). */
521 static int setup_resolv_conf(const char *dest) {
/* NOTE(review): the rest of this file writes "_cleanup_free_ char *";
 * the qualifier order differs here. */
522 char _cleanup_free_ *where = NULL;
526 if (arg_private_network)
529 /* Fix resolv.conf, if possible */
530 where = strappend(dest, "/etc/resolv.conf");
534 /* We don't really care for the results of this really. If it
535 * fails, it fails, but meh... */
536 copy_file("/etc/resolv.conf", where, O_TRUNC|O_NOFOLLOW);
/* Give the container its own boot ID: a random 128-bit ID is formatted as a
 * UUID string, written to a file below dest/dev and bind mounted over
 * dest/proc/sys/kernel/random/boot_id. */
541 static int setup_boot_id(const char *dest) {
542 _cleanup_free_ char *from = NULL, *to = NULL;
549 /* Generate a new randomized boot ID, so that each boot-up of
550 * the container gets a new one */
/* from is the file holding the generated ID, to is the mount target. */
552 from = strappend(dest, "/dev/proc-sys-kernel-random-boot-id");
553 to = strappend(dest, "/proc/sys/kernel/random/boot_id");
557 r = sd_id128_randomize(&rnd);
559 log_error("Failed to generate random boot id: %s", strerror(-r));
/* Format the ID in the dashed 8-4-4-4-12 hex digit layout. */
563 snprintf(as_uuid, sizeof(as_uuid),
564 "%02x%02x%02x%02x-%02x%02x-%02x%02x-%02x%02x-%02x%02x%02x%02x%02x%02x",
565 SD_ID128_FORMAT_VAL(rnd));
566 char_array_0(as_uuid);
568 r = write_string_file(from, as_uuid);
570 log_error("Failed to write boot id: %s", strerror(-r));
/* Bind mount, then try to remount read-only; a failing remount only
 * produces a warning.
 * NOTE(review): the remount result is tested for non-zero instead of
 * "< 0" as done for the other mount() calls in this file. */
574 if (mount(from, to, "bind", MS_BIND, NULL) < 0) {
575 log_error("Failed to bind mount boot id: %m");
577 } else if (mount(from, to, "bind", MS_BIND|MS_REMOUNT|MS_RDONLY, NULL))
578 log_warning("Failed to make boot id read-only: %m");
/* Recreate a fixed set of host device nodes below dest/dev. For every name
 * in the devnodes list the host node /dev/NAME is examined with stat() and
 * a node with the same mode and device number is created at dest/dev/NAME. */
584 static int copy_devnodes(const char *dest) {
586 static const char devnodes[] =
596 _cleanup_umask_ mode_t u;
602 NULSTR_FOREACH(d, devnodes) {
604 _cleanup_free_ char *from = NULL, *to = NULL;
/* from is the host node, to the node to create in the container. */
606 asprintf(&from, "/dev/%s", d);
607 asprintf(&to, "%s/dev/%s", dest, d);
618 if (stat(from, &st) < 0) {
/* A node missing on the host (ENOENT) is not reported as an error. */
620 if (errno != ENOENT) {
621 log_error("Failed to stat %s: %m", from);
626 } else if (!S_ISCHR(st.st_mode) && !S_ISBLK(st.st_mode)) {
628 log_error("%s is not a char or block device, cannot copy", from);
632 } else if (mknod(to, st.st_mode, st.st_rdev) < 0) {
/* NOTE(review): the message prints dest although the node being
 * created is to -- logging to would name the failing path. */
634 log_error("mknod(%s) failed: %m", dest);
/* Create dest/dev/ptmx as a symlink with the relative target "pts/ptmx". */
643 static int setup_ptmx(const char *dest) {
644 _cleanup_free_ char *p = NULL;
646 p = strappend(dest, "/dev/ptmx");
650 if (symlink("pts/ptmx", p) < 0) {
651 log_error("Failed to create /dev/ptmx symlink: %m");
/* Provide dest/dev/console for the container.
 *
 * dest    - container root directory
 * console - path of the tty to expose; it must be a character device
 *
 * The tty is set to mode 0600 and owner 0:0, a device node is created at
 * dest/dev/console and the tty is then bind mounted on top of that node. */
658 static int setup_dev_console(const char *dest, const char *console) {
660 _cleanup_free_ char *to = NULL;
662 _cleanup_umask_ mode_t u;
669 if (stat(console, &st) < 0) {
670 log_error("Failed to stat %s: %m", console);
673 } else if (!S_ISCHR(st.st_mode)) {
674 log_error("/dev/console is not a char device");
678 r = chmod_and_chown(console, 0600, 0, 0);
680 log_error("Failed to correct access mode for TTY: %s", strerror(-r));
684 if (asprintf(&to, "%s/dev/console", dest) < 0)
687 /* We need to bind mount the right tty to /dev/console since
688 * ptys can only exist on pts file systems. To have something
689 * to bind mount things on we create a device node first, that
690 * has the right major/minor (note that the major minor
691 * doesn't actually matter here, since we mount it over
694 if (mknod(to, (st.st_mode & ~07777) | 0600, st.st_rdev) < 0) {
695 log_error("mknod() for /dev/console failed: %m");
699 if (mount(console, to, "bind", MS_BIND, NULL) < 0) {
700 log_error("Bind mount for /dev/console failed: %m");
707 static int setup_kmsg(const char *dest, int kmsg_socket) {
708 _cleanup_free_ char *from = NULL, *to = NULL;
710 _cleanup_umask_ mode_t u;
712 struct cmsghdr cmsghdr;
713 uint8_t buf[CMSG_SPACE(sizeof(int))];
716 .msg_control = &control,
717 .msg_controllen = sizeof(control),
719 struct cmsghdr *cmsg;
722 assert(kmsg_socket >= 0);
726 /* We create the kmsg FIFO as /dev/kmsg, but immediately
727 * delete it after bind mounting it to /proc/kmsg. While FIFOs
728 * on the reading side behave very similar to /proc/kmsg,
729 * their writing side behaves differently from /dev/kmsg in
730 * that writing blocks when nothing is reading. In order to
731 * avoid any problems with containers deadlocking due to this
732 * we simply make /dev/kmsg unavailable to the container. */
/* Summary of this function: a FIFO is created at dest/dev/kmsg, bind
 * mounted onto dest/proc/kmsg, and an open file descriptor for it is sent
 * over kmsg_socket as SCM_RIGHTS ancillary data. kmsg_socket must be a
 * valid descriptor (asserted above). */
733 if (asprintf(&from, "%s/dev/kmsg", dest) < 0 ||
734 asprintf(&to, "%s/proc/kmsg", dest) < 0)
737 if (mkfifo(from, 0600) < 0) {
738 log_error("mkfifo() for /dev/kmsg failed: %m");
742 r = chmod_and_chown(from, 0600, 0, 0);
744 log_error("Failed to correct access mode for /dev/kmsg: %s", strerror(-r));
748 if (mount(from, to, "bind", MS_BIND, NULL) < 0) {
749 log_error("Bind mount for /proc/kmsg failed: %m");
/* O_NDELAY keeps this open() from blocking on the FIFO. */
753 fd = open(from, O_RDWR|O_NDELAY|O_CLOEXEC);
755 log_error("Failed to open fifo: %m");
/* Build a single control message carrying the descriptor. */
759 cmsg = CMSG_FIRSTHDR(&mh);
760 cmsg->cmsg_level = SOL_SOCKET;
761 cmsg->cmsg_type = SCM_RIGHTS;
762 cmsg->cmsg_len = CMSG_LEN(sizeof(int));
763 memcpy(CMSG_DATA(cmsg), &fd, sizeof(int));
765 mh.msg_controllen = cmsg->cmsg_len;
767 /* Store away the fd in the socket, so that it stays open as
768 * long as we run the child */
769 k = sendmsg(kmsg_socket, &mh, MSG_DONTWAIT|MSG_NOSIGNAL);
/* The local copy of the descriptor is closed right after sending. */
770 close_nointr_nofail(fd);
773 log_error("Failed to send FIFO fd: %m");
777 /* And now make the FIFO unavailable as /dev/kmsg... */
/* Set the host name of the calling process's UTS namespace to arg_machine. */
782 static int setup_hostname(void) {
784 if (sethostname(arg_machine, strlen(arg_machine)) < 0)
/* Connect the journal directory of the container with the one on the host,
 * as selected by arg_link_journal. The machine ID is read from
 * directory/etc/machine-id; p then names /var/log/journal/ID on the host
 * and q the same path below directory. Depending on the mode either a
 * symlink from p to q is created (LINK_GUEST) or p is bind mounted onto q.
 * LINK_NO disables the whole step; in LINK_AUTO mode several conditions that
 * are errors in the explicit modes are tolerated. */
790 static int setup_journal(const char *directory) {
791 sd_id128_t machine_id;
792 _cleanup_free_ char *p = NULL, *b = NULL, *q = NULL, *d = NULL;
796 if (arg_link_journal == LINK_NO)
799 p = strappend(directory, "/etc/machine-id");
803 r = read_one_line_file(p, &b);
804 if (r == -ENOENT && arg_link_journal == LINK_AUTO)
807 log_error("Failed to read machine ID from %s: %s", p, strerror(-r));
812 if (isempty(id) && arg_link_journal == LINK_AUTO)
815 /* Verify validity */
816 r = sd_id128_from_string(id, &machine_id);
818 log_error("Failed to parse machine ID from %s: %s", p, strerror(-r));
/* NOTE(review): p is reused for a new string here; make sure the
 * previous allocation is released first (not visible in this excerpt). */
823 p = strappend("/var/log/journal/", id);
824 q = strjoin(directory, "/var/log/journal/", id, NULL);
/* Neither side may already be a mount point; outside of LINK_AUTO mode
 * that is reported as an error. */
828 if (path_is_mount_point(p, false) > 0) {
829 if (arg_link_journal != LINK_AUTO) {
830 log_error("%s: already a mount point, refusing to use for journal", p);
837 if (path_is_mount_point(q, false) > 0) {
838 if (arg_link_journal != LINK_AUTO) {
839 log_error("%s: already a mount point, refusing to use for journal", q);
/* Inspect what currently exists at p: the branches below distinguish a
 * symlink (r >= 0, target in d), a non-symlink (-EINVAL), a missing
 * path (-ENOENT) and other errors. */
846 r = readlink_and_make_absolute(p, &d);
848 if ((arg_link_journal == LINK_GUEST ||
849 arg_link_journal == LINK_AUTO) &&
/* NOTE(review): mkdir_p() hands its result back in r while the
 * messages after it print %m -- confirm errno is meaningful there. */
852 r = mkdir_p(q, 0755);
854 log_warning("failed to create directory %s: %m", q);
859 log_error("Failed to remove symlink %s: %m", p);
862 } else if (r == -EINVAL) {
864 if (arg_link_journal == LINK_GUEST &&
867 if (errno == ENOTDIR) {
868 log_error("%s already exists and is neither a symlink nor a directory", p);
871 log_error("Failed to remove %s: %m", p);
875 } else if (r != -ENOENT) {
876 log_error("readlink(%s) failed: %m", p);
/* LINK_GUEST: the host path becomes a symlink to the guest directory. */
880 if (arg_link_journal == LINK_GUEST) {
882 if (symlink(q, p) < 0) {
883 log_error("Failed to symlink %s to %s: %m", q, p);
887 r = mkdir_p(q, 0755);
889 log_warning("failed to create directory %s: %m", q);
/* LINK_HOST creates the host directory if needed; otherwise the host
 * directory has to exist already for the bind mount to be attempted. */
893 if (arg_link_journal == LINK_HOST) {
894 r = mkdir_p(p, 0755);
896 log_error("Failed to create %s: %m", p);
900 } else if (access(p, F_OK) < 0)
/* dir_is_empty() returning 0 means q holds entries; that is reported
 * before the host directory would be mounted over it. */
903 if (dir_is_empty(q) == 0) {
904 log_error("%s not empty.", q);
908 r = mkdir_p(q, 0755);
910 log_error("Failed to create %s: %m", q);
914 if (mount(p, q, "bind", MS_BIND, NULL) < 0) {
915 log_error("Failed to bind mount journal from host into guest: %m");
/* Pass the complement of arg_retain (every capability bit that is not to be
 * retained) to capability_bounding_set_drop() and return its result. */
922 static int drop_capabilities(void) {
923 return capability_bounding_set_drop(~arg_retain, false);
/* Shovel data between the caller's terminal and the pty master.
 *
 * master - pty master file descriptor
 * pid    - process that is sent SIGRTMIN+3 on the first SIGTERM when
 *          arg_boot is set
 * mask   - signal set to receive through a signalfd
 *
 * Bytes read from stdin are queued in in_buffer and written to master;
 * bytes read from master are queued in out_buffer and written to stdout.
 * Readiness of the descriptors is tracked with an epoll instance. */
926 static int process_pty(int master, pid_t pid, sigset_t *mask) {
928 char in_buffer[LINE_MAX], out_buffer[LINE_MAX];
929 size_t in_buffer_full = 0, out_buffer_full = 0;
930 struct epoll_event stdin_ev, stdout_ev, master_ev, signal_ev;
931 bool stdin_readable = false, stdout_writable = false, master_readable = false, master_writable = false;
932 int ep = -1, signal_fd = -1, r;
933 bool tried_orderly_shutdown = false;
/* All three data descriptors are switched to non-blocking mode. */
939 fd_nonblock(STDIN_FILENO, 1);
940 fd_nonblock(STDOUT_FILENO, 1);
941 fd_nonblock(master, 1);
943 signal_fd = signalfd(-1, mask, SFD_NONBLOCK|SFD_CLOEXEC);
945 log_error("signalfd(): %m");
950 ep = epoll_create1(EPOLL_CLOEXEC);
952 log_error("Failed to create epoll: %m");
957 /* We read from STDIN only if this is actually a TTY,
958 * otherwise we assume non-interactivity. */
959 if (isatty(STDIN_FILENO)) {
961 stdin_ev.events = EPOLLIN|EPOLLET;
962 stdin_ev.data.fd = STDIN_FILENO;
964 if (epoll_ctl(ep, EPOLL_CTL_ADD, STDIN_FILENO, &stdin_ev) < 0) {
965 log_error("Failed to register STDIN in epoll: %m");
/* The data descriptors are registered edge-triggered (EPOLLET); the
 * signalfd registration does not set EPOLLET. */
972 stdout_ev.events = EPOLLOUT|EPOLLET;
973 stdout_ev.data.fd = STDOUT_FILENO;
976 master_ev.events = EPOLLIN|EPOLLOUT|EPOLLET;
977 master_ev.data.fd = master;
980 signal_ev.events = EPOLLIN;
981 signal_ev.data.fd = signal_fd;
983 if (epoll_ctl(ep, EPOLL_CTL_ADD, STDOUT_FILENO, &stdout_ev) < 0) {
984 if (errno != EPERM) {
985 log_error("Failed to register stdout in epoll: %m");
989 /* stdout without epoll support. Likely redirected to regular file. */
990 stdout_writable = true;
993 if (epoll_ctl(ep, EPOLL_CTL_ADD, master, &master_ev) < 0 ||
994 epoll_ctl(ep, EPOLL_CTL_ADD, signal_fd, &signal_ev) < 0) {
995 log_error("Failed to register fds in epoll: %m");
/* Main loop body: wait for events without timeout, update the readiness
 * flags, then move as much data as possible. */
1001 struct epoll_event ev[16];
1005 nfds = epoll_wait(ep, ev, ELEMENTSOF(ev), -1);
1008 if (errno == EINTR || errno == EAGAIN)
1011 log_error("epoll_wait(): %m");
/* EPOLLHUP also sets the flag, so that the following read or write
 * gets to see the condition. */
1018 for (i = 0; i < nfds; i++) {
1019 if (ev[i].data.fd == STDIN_FILENO) {
1021 if (ev[i].events & (EPOLLIN|EPOLLHUP))
1022 stdin_readable = true;
1024 } else if (ev[i].data.fd == STDOUT_FILENO) {
1026 if (ev[i].events & (EPOLLOUT|EPOLLHUP))
1027 stdout_writable = true;
1029 } else if (ev[i].data.fd == master) {
1031 if (ev[i].events & (EPOLLIN|EPOLLHUP))
1032 master_readable = true;
1034 if (ev[i].events & (EPOLLOUT|EPOLLHUP))
1035 master_writable = true;
1037 } else if (ev[i].data.fd == signal_fd) {
1038 struct signalfd_siginfo sfsi;
1041 n = read(signal_fd, &sfsi, sizeof(sfsi));
1042 if (n != sizeof(sfsi)) {
1045 log_error("Failed to read from signalfd: invalid block size");
1050 if (errno != EINTR && errno != EAGAIN) {
1051 log_error("Failed to read from signalfd: %m");
1057 if (sfsi.ssi_signo == SIGWINCH) {
1060 /* The window size changed, let's forward that. */
1061 if (ioctl(STDIN_FILENO, TIOCGWINSZ, &ws) >= 0)
1062 ioctl(master, TIOCSWINSZ, &ws);
/* Only the first SIGTERM in boot mode takes this branch; it asks
 * the container's init to shut down via SIGRTMIN+3. */
1063 } else if (sfsi.ssi_signo == SIGTERM && arg_boot && !tried_orderly_shutdown) {
1065 log_info("Trying to halt container. Send SIGTERM again to trigger immediate termination.");
1067 /* This only works for systemd... */
1068 tried_orderly_shutdown = true;
1069 kill(pid, SIGRTMIN+3);
/* Keep transferring while a buffer can be filled from a readable source
 * or drained into a writable sink. */
1079 while ((stdin_readable && in_buffer_full <= 0) ||
1080 (master_writable && in_buffer_full > 0) ||
1081 (master_readable && out_buffer_full <= 0) ||
1082 (stdout_writable && out_buffer_full > 0)) {
/* stdin -> in_buffer */
1084 if (stdin_readable && in_buffer_full < LINE_MAX) {
1086 k = read(STDIN_FILENO, in_buffer + in_buffer_full, LINE_MAX - in_buffer_full);
/* These errno values only clear the readiness flag; anything else is
 * logged as an error. The same pattern repeats for the other three
 * transfers below. */
1089 if (errno == EAGAIN || errno == EPIPE || errno == ECONNRESET || errno == EIO)
1090 stdin_readable = false;
1092 log_error("read(): %m");
1097 in_buffer_full += (size_t) k;
/* in_buffer -> master; after a partial write the unwritten tail is
 * moved to the front of the buffer. */
1100 if (master_writable && in_buffer_full > 0) {
1102 k = write(master, in_buffer, in_buffer_full);
1105 if (errno == EAGAIN || errno == EPIPE || errno == ECONNRESET || errno == EIO)
1106 master_writable = false;
1108 log_error("write(): %m");
1114 assert(in_buffer_full >= (size_t) k);
1115 memmove(in_buffer, in_buffer + k, in_buffer_full - k);
1116 in_buffer_full -= k;
/* master -> out_buffer */
1120 if (master_readable && out_buffer_full < LINE_MAX) {
1122 k = read(master, out_buffer + out_buffer_full, LINE_MAX - out_buffer_full);
1125 if (errno == EAGAIN || errno == EPIPE || errno == ECONNRESET || errno == EIO)
1126 master_readable = false;
1128 log_error("read(): %m");
1133 out_buffer_full += (size_t) k;
/* out_buffer -> stdout */
1136 if (stdout_writable && out_buffer_full > 0) {
1138 k = write(STDOUT_FILENO, out_buffer, out_buffer_full);
1141 if (errno == EAGAIN || errno == EPIPE || errno == ECONNRESET || errno == EIO)
1142 stdout_writable = false;
1144 log_error("write(): %m");
1150 assert(out_buffer_full >= (size_t) k);
1151 memmove(out_buffer, out_buffer + k, out_buffer_full - k);
1152 out_buffer_full -= k;
/* Cleanup: the epoll instance and the signalfd are closed. */
1160 close_nointr_nofail(ep);
1163 close_nointr_nofail(signal_fd);
/* Register the container with the org.freedesktop.machine1 service over the
 * system bus. The call is made on the org.freedesktop.machine1.Manager
 * interface of the /org/freedesktop/machine1 object; the arguments visible
 * here include arg_uuid, arg_directory and a "Slice" property set from
 * arg_slice. Failures to connect or to call are logged. */
1168 static int register_machine(void) {
1169 _cleanup_bus_error_free_ sd_bus_error error = SD_BUS_ERROR_NULL;
1170 _cleanup_bus_unref_ sd_bus *bus = NULL;
1173 r = sd_bus_open_system(&bus);
1175 log_error("Failed to open system bus: %s", strerror(-r));
1179 r = sd_bus_call_method(
1181 "org.freedesktop.machine1",
1182 "/org/freedesktop/machine1",
1183 "org.freedesktop.machine1.Manager",
1189 SD_BUS_MESSAGE_APPEND_ID128(arg_uuid),
/* strempty() is applied so that unset (NULL) settings are not passed
 * as NULL strings -- presumably it maps NULL to "". */
1193 strempty(arg_directory),
1194 1, "Slice", "s", strempty(arg_slice));
/* Prefer the message from the bus error, fall back to strerror(). */
1196 log_error("Failed to register machine: %s", error.message ? error.message : strerror(-r));
/* Probe for the kernel audit subsystem by creating a raw NETLINK_AUDIT
 * socket; the socket is closed again right away. Presumably the result is
 * true when the socket could be created -- the return statements are not
 * shown here. */
1203 static bool audit_enabled(void) {
1206 fd = socket(AF_NETLINK, SOCK_RAW, NETLINK_AUDIT);
1208 close_nointr_nofail(fd);
/* Entry point: parse the command line, validate the container directory,
 * allocate a pty, clone() a child into new namespaces that sets up the
 * container file system and executes the payload, and meanwhile forward
 * terminal I/O in the parent until the child terminates. */
1214 int main(int argc, char *argv[]) {
1216 int r = EXIT_FAILURE, k;
1217 _cleanup_close_ int master = -1;
1219 const char *console = NULL;
1220 struct termios saved_attr, raw_attr;
1222 bool saved_attr_valid = false;
1224 int kmsg_socket_pair[2] = { -1, -1 };
1225 _cleanup_fdset_free_ FDSet *fds = NULL;
1227 log_parse_environment();
1230 k = parse_argv(argc, argv);
/* Determine the container directory: an explicitly given one is made
 * absolute, otherwise the current working directory is used. */
1238 if (arg_directory) {
1241 p = path_make_absolute_cwd(arg_directory);
1242 free(arg_directory);
1245 arg_directory = get_current_dir_name();
1247 if (!arg_directory) {
1248 log_error("Failed to determine path, please use -D.");
1252 path_kill_slashes(arg_directory);
/* Derive the machine name from the last component of the directory. */
1255 arg_machine = strdup(path_get_file_name(arg_directory));
1261 hostname_cleanup(arg_machine, false);
1262 if (isempty(arg_machine)) {
1263 log_error("Failed to determine machine name automatically, please use -M.");
/* Preconditions: effective UID 0, a host booted with systemd, and a
 * directory other than "/" that looks like an OS tree. */
1268 if (geteuid() != 0) {
1269 log_error("Need to be root.");
1273 if (sd_booted() <= 0) {
1274 log_error("Not running on a systemd system.");
1278 if (arg_boot && audit_enabled()) {
1279 log_warning("The kernel auditing subsystem is known to be incompatible with containers.\n"
1280 "Please make sure to turn off auditing with 'audit=0' on the kernel command\n"
1281 "line before using systemd-nspawn. Sleeping for 5s...\n");
1285 if (path_equal(arg_directory, "/")) {
1286 log_error("Spawning container on root directory not supported.");
1290 if (path_is_os_tree(arg_directory) <= 0) {
1291 log_error("Directory %s doesn't look like an OS root directory (/etc/os-release is missing). Refusing.", arg_directory);
/* Collect file descriptors passed in via the sd_listen_fds() protocol so
 * they can be handed on to the container; fdset_close_others() is then
 * called with that set. */
1296 n_fd_passed = sd_listen_fds(false);
1297 if (n_fd_passed > 0) {
1298 k = fdset_new_listen_fds(&fds, false);
1300 log_error("Failed to collect file descriptors: %s", strerror(-k));
1304 fdset_close_others(fds);
/* Allocate the pty whose slave side becomes the container's console. */
1307 master = posix_openpt(O_RDWR|O_NOCTTY|O_CLOEXEC|O_NDELAY);
1309 log_error("Failed to acquire pseudo tty: %m");
1313 console = ptsname(master);
1315 log_error("Failed to determine tty name: %m");
1319 log_info("Spawning namespace container on %s (console is %s).", arg_directory, console);
/* Copy the current window size of stdin to the pty, if available. */
1321 if (ioctl(STDIN_FILENO, TIOCGWINSZ, &ws) >= 0)
1322 ioctl(master, TIOCSWINSZ, &ws);
1324 if (unlockpt(master) < 0) {
1325 log_error("Failed to unlock tty: %m");
/* Remember the terminal settings so they can be restored later and
 * prepare a raw-mode variant with echo disabled. */
1329 if (tcgetattr(STDIN_FILENO, &saved_attr) >= 0) {
1330 saved_attr_valid = true;
1332 raw_attr = saved_attr;
1333 cfmakeraw(&raw_attr);
1334 raw_attr.c_lflag &= ~ECHO;
/* Socket pair used by setup_kmsg() in the child to park the kmsg FIFO
 * descriptor. */
1337 if (socketpair(AF_UNIX, SOCK_DGRAM|SOCK_NONBLOCK|SOCK_CLOEXEC, 0, kmsg_socket_pair) < 0) {
1338 log_error("Failed to create kmsg socket pair.");
1342 sd_notify(0, "READY=1");
/* Block the signals that process_pty() consumes through its signalfd. */
1344 assert_se(sigemptyset(&mask) == 0);
1345 sigset_add_many(&mask, SIGCHLD, SIGWINCH, SIGTERM, SIGINT, -1);
1346 assert_se(sigprocmask(SIG_BLOCK, &mask, NULL) == 0);
/* Two pipes synchronize parent and child: the child waits on pipefd for
 * the parent to log the PID, the parent waits on pipefd2 for the child. */
1350 int pipefd[2], pipefd2[2];
1352 if (pipe2(pipefd, O_NONBLOCK|O_CLOEXEC) < 0) {
1353 log_error("pipe2(): %m");
1357 if (pipe2(pipefd2, O_NONBLOCK|O_CLOEXEC) < 0) {
1358 log_error("pipe2(): %m");
/* Create the child in new IPC, mount, PID and UTS namespaces, plus a new
 * network namespace when --private-network was given. */
1363 pid = syscall(__NR_clone, SIGCHLD|CLONE_NEWIPC|CLONE_NEWNS|CLONE_NEWPID|CLONE_NEWUTS|(arg_private_network ? CLONE_NEWNET : 0), NULL);
1365 if (errno == EINVAL)
1366 log_error("clone() failed, do you have namespace support enabled in your kernel? (You need UTS, IPC, PID and NET namespacing built in): %m");
1368 log_error("clone() failed: %m");
/* Child side starts here. */
1375 const char *home = NULL;
1376 uid_t uid = (uid_t) -1;
1377 gid_t gid = (gid_t) -1;
/* Environment for the payload; the NULL slots are filled in below. */
1379 const char *envp[] = {
1380 "PATH=/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin",
1381 "container=systemd-nspawn", /* LXC sets container=lxc, so follow the scheme here */
1386 NULL, /* container_uuid */
1387 NULL, /* LISTEN_FDS */
1388 NULL, /* LISTEN_PID */
/* TERM is inherited from the caller's environment. */
1392 envp[n_env] = strv_find_prefix(environ, "TERM=");
1396 /* Wait for the parent process to log our PID */
1397 close_nointr_nofail(pipefd[1]);
1398 fd_wait_for_event(pipefd[0], POLLHUP, -1);
1399 close_nointr_nofail(pipefd[0]);
1401 close_nointr_nofail(master);
1404 if (saved_attr_valid) {
1405 if (tcsetattr(STDIN_FILENO, TCSANOW, &raw_attr) < 0) {
1406 log_error("Failed to set terminal attributes: %m");
/* Drop the inherited stdio; it is replaced by the pty slave below. */
1411 close_nointr(STDIN_FILENO);
1412 close_nointr(STDOUT_FILENO);
1413 close_nointr(STDERR_FILENO);
1415 close_nointr_nofail(kmsg_socket_pair[0]);
1416 kmsg_socket_pair[0] = -1;
1418 reset_all_signal_handlers();
1420 assert_se(sigemptyset(&mask) == 0);
1421 assert_se(sigprocmask(SIG_SETMASK, &mask, NULL) == 0);
/* With fds 0-2 closed, opening the console is expected to yield fd 0. */
1423 k = open_terminal(console, O_RDWR);
1424 if (k != STDIN_FILENO) {
1426 close_nointr_nofail(k);
1430 log_error("Failed to open console: %s", strerror(-k));
1434 if (dup2(STDIN_FILENO, STDOUT_FILENO) != STDOUT_FILENO ||
1435 dup2(STDIN_FILENO, STDERR_FILENO) != STDERR_FILENO) {
1436 log_error("Failed to duplicate console: %m");
1441 log_error("setsid() failed: %m");
/* Have the kernel send SIGKILL to this process when the parent dies. */
1445 if (prctl(PR_SET_PDEATHSIG, SIGKILL) < 0) {
1446 log_error("PR_SET_PDEATHSIG failed: %m");
/* Closing pipefd2 is what the parent's POLLHUP wait on it observes. */
1450 close_pipe(pipefd2);
1452 r = register_machine();
1456 /* Mark everything as slave, so that we still
1457 * receive mounts from the real root, but don't
1458 * propagate mounts to the real root. */
1459 if (mount(NULL, "/", NULL, MS_SLAVE|MS_REC, NULL) < 0) {
1460 log_error("MS_SLAVE|MS_REC failed: %m");
1464 /* Turn directory into bind mount */
1465 if (mount(arg_directory, arg_directory, "bind", MS_BIND|MS_REC, NULL) < 0) {
1466 log_error("Failed to make bind mount.");
1471 if (mount(arg_directory, arg_directory, "bind", MS_BIND|MS_REMOUNT|MS_RDONLY|MS_REC, NULL) < 0) {
1472 log_error("Failed to make read-only.");
/* Populate the container tree; each helper reports its own errors. */
1476 if (mount_all(arg_directory) < 0)
1479 if (copy_devnodes(arg_directory) < 0)
1482 if (setup_ptmx(arg_directory) < 0)
1485 dev_setup(arg_directory);
1487 if (setup_dev_console(arg_directory, console) < 0)
1490 if (setup_kmsg(arg_directory, kmsg_socket_pair[1]) < 0)
1493 close_nointr_nofail(kmsg_socket_pair[1]);
1494 kmsg_socket_pair[1] = -1;
1496 if (setup_boot_id(arg_directory) < 0)
1499 if (setup_timezone(arg_directory) < 0)
1502 if (setup_resolv_conf(arg_directory) < 0)
1505 if (setup_journal(arg_directory) < 0)
1508 if (mount_binds(arg_directory, arg_bind, 0) < 0)
1511 if (mount_binds(arg_directory, arg_bind_ro, MS_RDONLY) < 0)
/* Make the container directory the new root: move the mount onto "/",
 * chroot into it and reset the working directory. */
1514 if (chdir(arg_directory) < 0) {
1515 log_error("chdir(%s) failed: %m", arg_directory);
1519 if (mount(arg_directory, "/", NULL, MS_MOVE, NULL) < 0) {
1520 log_error("mount(MS_MOVE) failed: %m");
1524 if (chroot(".") < 0) {
1525 log_error("chroot() failed: %m");
1529 if (chdir("/") < 0) {
1530 log_error("chdir() failed: %m");
1538 if (drop_capabilities() < 0) {
1539 log_error("drop_capabilities() failed: %m");
/* Switch credentials: to the requested user, or fully to 0 otherwise.
 * NOTE(review): the log messages in this part name setregid() and
 * setreuid() although setresgid() and setresuid() are the calls made. */
1545 /* Note that this resolves user names
1546 * inside the container, and hence
1547 * accesses the NSS modules from the
1548 * container and not the host. This is
1551 if (get_user_creds((const char**)&arg_user, &uid, &gid, &home, NULL) < 0) {
1552 log_error("get_user_creds() failed: %m");
1556 if (mkdir_parents_label(home, 0775) < 0) {
1557 log_error("mkdir_parents_label() failed: %m");
1561 if (mkdir_safe_label(home, 0775, uid, gid) < 0) {
1562 log_error("mkdir_safe_label() failed: %m");
1566 if (initgroups((const char*)arg_user, gid) < 0) {
1567 log_error("initgroups() failed: %m");
1571 if (setresgid(gid, gid, gid) < 0) {
1572 log_error("setregid() failed: %m");
1576 if (setresuid(uid, uid, uid) < 0) {
1577 log_error("setreuid() failed: %m");
1581 /* Reset everything fully to 0, just in case */
1583 if (setgroups(0, NULL) < 0) {
1584 log_error("setgroups() failed: %m");
1588 if (setresgid(0, 0, 0) < 0) {
1589 log_error("setregid() failed: %m");
1593 if (setresuid(0, 0, 0) < 0) {
1594 log_error("setreuid() failed: %m");
/* Fill the remaining environment slots: HOME, USER and LOGNAME always,
 * container_uuid and the LISTEN_* variables only when applicable. */
1599 if ((asprintf((char**)(envp + n_env++), "HOME=%s", home ? home: "/root") < 0) ||
1600 (asprintf((char**)(envp + n_env++), "USER=%s", arg_user ? arg_user : "root") < 0) ||
1601 (asprintf((char**)(envp + n_env++), "LOGNAME=%s", arg_user ? arg_user : "root") < 0)) {
1606 if (!sd_id128_equal(arg_uuid, SD_ID128_NULL)) {
1607 if (asprintf((char**)(envp + n_env++), "container_uuid=" SD_ID128_FORMAT_STR, SD_ID128_FORMAT_VAL(arg_uuid)) < 0) {
/* Passed-in descriptors must survive the exec, so O_CLOEXEC is cleared;
 * LISTEN_PID is 1 because the payload is PID 1 of the new namespace. */
1613 if (fdset_size(fds) > 0) {
1614 k = fdset_cloexec(fds, false);
1616 log_error("Failed to unset O_CLOEXEC for file descriptors.");
1620 if ((asprintf((char **)(envp + n_env++), "LISTEN_FDS=%u", n_fd_passed) < 0) ||
1621 (asprintf((char **)(envp + n_env++), "LISTEN_PID=1") < 0)) {
1633 /* Automatically search for the init system */
/* The remaining command line arguments are passed on to init; the
 * candidates are tried in order, each execve() only returns on failure. */
1635 l = 1 + argc - optind;
1636 a = newa(char*, l + 1);
1637 memcpy(a + 1, argv + optind, l * sizeof(char*));
1639 a[0] = (char*) "/usr/lib/systemd/systemd";
1640 execve(a[0], a, (char**) envp);
1642 a[0] = (char*) "/lib/systemd/systemd";
1643 execve(a[0], a, (char**) envp);
1645 a[0] = (char*) "/sbin/init";
1646 execve(a[0], a, (char**) envp);
/* Without boot mode: run the given command, or a login shell when no
 * command was given. */
1647 } else if (argc > optind)
1648 execvpe(argv[optind], argv + optind, (char**) envp);
1650 chdir(home ? home : "/root");
1651 execle("/bin/bash", "-bash", NULL, (char**) envp);
1654 log_error("execv() failed: %m");
1657 _exit(EXIT_FAILURE);
/* Parent side: announce the PID, then release the child by closing both
 * ends of pipefd. */
1660 log_info("Init process in the container running as PID %lu.", (unsigned long) pid);
1661 close_nointr_nofail(pipefd[0]);
1662 close_nointr_nofail(pipefd[1]);
1664 /* Wait for the child process to establish cgroup hierarchy */
1665 close_nointr_nofail(pipefd2[1]);
1666 fd_wait_for_event(pipefd2[0], POLLHUP, -1);
1667 close_nointr_nofail(pipefd2[0]);
/* Forward terminal I/O until process_pty() returns, then restore the
 * terminal settings and collect the child's exit status. */
1672 if (process_pty(master, pid, &mask) < 0)
1675 if (saved_attr_valid)
1676 tcsetattr(STDIN_FILENO, TCSANOW, &saved_attr);
1678 k = wait_for_terminate(pid, &status);
/* Interpret how the container ended: a normal exit passes its status on
 * in r; SIGINT and SIGHUP deaths are reported as shutdown and reboot,
 * other signals as an error. */
1684 if (status.si_code == CLD_EXITED) {
1685 r = status.si_status;
1686 if (status.si_status != 0) {
1687 log_error("Container failed with error code %i.", status.si_status);
1691 log_debug("Container exited successfully.");
1693 } else if (status.si_code == CLD_KILLED &&
1694 status.si_status == SIGINT) {
1695 log_info("Container has been shut down.");
1698 } else if (status.si_code == CLD_KILLED &&
1699 status.si_status == SIGHUP) {
1700 log_info("Container is being rebooted.");
1702 } else if (status.si_code == CLD_KILLED ||
1703 status.si_code == CLD_DUMPED) {
1705 log_error("Container terminated by signal %s.", signal_to_string(status.si_status));
1709 log_error("Container failed due to unknown reason.");
/* Final cleanup: restore the terminal and release remaining resources. */
1716 if (saved_attr_valid)
1717 tcsetattr(STDIN_FILENO, TCSANOW, &saved_attr);
1719 close_pipe(kmsg_socket_pair);
1724 free(arg_directory);