1 /*-*- Mode: C; c-basic-offset: 8; indent-tabs-mode: nil -*-*/
4 This file is part of systemd.
6 Copyright 2010 Lennart Poettering
8 systemd is free software; you can redistribute it and/or modify it
9 under the terms of the GNU Lesser General Public License as published by
10 the Free Software Foundation; either version 2.1 of the License, or
11 (at your option) any later version.
13 systemd is distributed in the hope that it will be useful, but
14 WITHOUT ANY WARRANTY; without even the implied warranty of
15 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
16 Lesser General Public License for more details.
18 You should have received a copy of the GNU Lesser General Public License
19 along with systemd; If not, see <http://www.gnu.org/licenses/>.
25 #include <sys/types.h>
26 #include <sys/syscall.h>
27 #include <sys/mount.h>
33 #include <sys/prctl.h>
34 #include <sys/capability.h>
37 #include <sys/epoll.h>
39 #include <sys/signalfd.h>
43 #include <sys/socket.h>
44 #include <linux/netlink.h>
46 #include <systemd/sd-daemon.h>
47 #include <systemd/sd-bus.h>
55 #include "cgroup-util.h"
57 #include "path-util.h"
58 #include "loopback-setup.h"
60 #include "dev-setup.h"
64 #include "bus-internal.h"
65 #include "bus-message.h"
/* Journal linking mode, selected with --link-journal= (see parse_argv()).
 * The enumerators are not visible in this listing; LINK_NO, LINK_AUTO,
 * LINK_GUEST and LINK_HOST are the values referenced elsewhere in the file. */
71 typedef enum LinkJournal {
/* Command line configuration. These are set by parse_argv(); main()
 * additionally fills in a default directory and machine name. */
78 static char *arg_directory = NULL;
79 static char *arg_user = NULL;
80 static sd_id128_t arg_uuid = {};
81 static char *arg_machine = NULL;
82 static const char *arg_slice = NULL;
83 static bool arg_private_network = false;
84 static bool arg_read_only = false;
85 static bool arg_boot = false;
86 static LinkJournal arg_link_journal = LINK_AUTO;
/* Capabilities to retain, one bit per CAP_* number. --capability= ORs
 * further bits into this mask, and drop_capabilities() hands the complement
 * to capability_bounding_set_drop(). */
87 static uint64_t arg_retain =
89 (1ULL << CAP_DAC_OVERRIDE) |
90 (1ULL << CAP_DAC_READ_SEARCH) |
91 (1ULL << CAP_FOWNER) |
92 (1ULL << CAP_FSETID) |
93 (1ULL << CAP_IPC_OWNER) |
96 (1ULL << CAP_LINUX_IMMUTABLE) |
97 (1ULL << CAP_NET_BIND_SERVICE) |
98 (1ULL << CAP_NET_BROADCAST) |
99 (1ULL << CAP_NET_RAW) |
100 (1ULL << CAP_SETGID) |
101 (1ULL << CAP_SETFCAP) |
102 (1ULL << CAP_SETPCAP) |
103 (1ULL << CAP_SETUID) |
104 (1ULL << CAP_SYS_ADMIN) |
105 (1ULL << CAP_SYS_CHROOT) |
106 (1ULL << CAP_SYS_NICE) |
107 (1ULL << CAP_SYS_PTRACE) |
108 (1ULL << CAP_SYS_TTY_CONFIG) |
109 (1ULL << CAP_SYS_RESOURCE) |
110 (1ULL << CAP_SYS_BOOT) |
111 (1ULL << CAP_AUDIT_WRITE) |
112 (1ULL << CAP_AUDIT_CONTROL);
/* Bind mount lists. parse_argv() appends two entries per --bind= or
 * --bind-ro= option, and mount_binds() walks them as pairs. */
113 static char **arg_bind = NULL;
114 static char **arg_bind_ro = NULL;
/* Prints the usage text for the program to stdout, prefixed with
 * program_invocation_short_name.
 *
 * The -j line is documented as --link-journal=guest: the short option
 * handler in parse_argv() assigns LINK_GUEST, so advertising "host" here
 * described behaviour the program does not have. */
116 static int help(void) {
118 printf("%s [OPTIONS...] [PATH] [ARGUMENTS...]\n\n"
119 "Spawn a minimal namespace container for debugging, testing and building.\n\n"
120 " -h --help Show this help\n"
121 " --version Print version string\n"
122 " -D --directory=NAME Root directory for the container\n"
123 " -b --boot Boot up full system (i.e. invoke init)\n"
124 " -u --user=USER Run the command under specified user or uid\n"
125 " --uuid=UUID Set a specific machine UUID for the container\n"
126 " -M --machine=NAME Set the machine name for the container\n"
127 " -S --slice=SLICE Place the container in the specified slice\n"
128 " --private-network Disable network in container\n"
129 " --read-only Mount the root directory read-only\n"
130 " --capability=CAP In addition to the default, retain specified\n"
132 " --link-journal=MODE Link up guest journal, one of no, auto, guest, host\n"
133 " -j Equivalent to --link-journal=guest\n"
134 " --bind=PATH[:PATH] Bind mount a file or directory from the host into\n"
136 " --bind-ro=PATH[:PATH] Similar, but creates a read-only bind mount\n",
137 program_invocation_short_name);
/* Parses the command line into the arg_* globals.
 *
 * Only part of the function is visible in this listing, so the return
 * convention is not shown here; main() stores the result in k. Several case
 * labels are likewise not visible, so the per-option notes below are inferred
 * from the variables being assigned. */
142 static int parse_argv(int argc, char *argv[]) {
/* Long options; entries using ARG_* codes have no short form. */
155 static const struct option options[] = {
156 { "help", no_argument, NULL, 'h' },
157 { "version", no_argument, NULL, ARG_VERSION },
158 { "directory", required_argument, NULL, 'D' },
159 { "user", required_argument, NULL, 'u' },
160 { "private-network", no_argument, NULL, ARG_PRIVATE_NETWORK },
161 { "boot", no_argument, NULL, 'b' },
162 { "uuid", required_argument, NULL, ARG_UUID },
163 { "read-only", no_argument, NULL, ARG_READ_ONLY },
164 { "capability", required_argument, NULL, ARG_CAPABILITY },
165 { "link-journal", required_argument, NULL, ARG_LINK_JOURNAL },
166 { "bind", required_argument, NULL, ARG_BIND },
167 { "bind-ro", required_argument, NULL, ARG_BIND_RO },
168 { "machine", required_argument, NULL, 'M' },
169 { "slice", required_argument, NULL, 'S' },
/* The leading '+' makes getopt_long() stop at the first non-option argument,
 * so options meant for the command run in the container are left alone. */
178 while ((c = getopt_long(argc, argv, "+hD:u:bM:jS:", options, NULL)) >= 0) {
187 puts(PACKAGE_STRING);
188 puts(SYSTEMD_FEATURES);
/* Root directory: stored in canonicalized form. */
193 arg_directory = canonicalize_file_name(optarg);
194 if (!arg_directory) {
195 log_error("Failed to canonicalize root directory.");
203 arg_user = strdup(optarg);
209 case ARG_PRIVATE_NETWORK:
210 arg_private_network = true;
218 r = sd_id128_from_string(optarg, &arg_uuid);
220 log_error("Invalid UUID: %s", optarg);
226 arg_slice = strdup(optarg);
/* Machine name: must pass hostname_is_valid() before it is stored. */
230 if (!hostname_is_valid(optarg)) {
231 log_error("Invalid machine name: %s", optarg);
236 arg_machine = strdup(optarg);
243 arg_read_only = true;
/* --capability= takes a comma separated list of capability names; each one
 * is resolved with cap_from_name() and added to arg_retain. */
246 case ARG_CAPABILITY: {
250 FOREACH_WORD_SEPARATOR(word, length, optarg, ",", state) {
254 t = strndup(word, length);
258 if (cap_from_name(t, &cap) < 0) {
259 log_error("Failed to parse capability %s.", t);
265 arg_retain |= 1ULL << (uint64_t) cap;
/* NOTE(review): presumably the -j handler (its case label is not visible
 * here); it selects guest mode. */
272 arg_link_journal = LINK_GUEST;
275 case ARG_LINK_JOURNAL:
276 if (streq(optarg, "auto"))
277 arg_link_journal = LINK_AUTO;
278 else if (streq(optarg, "no"))
279 arg_link_journal = LINK_NO;
280 else if (streq(optarg, "guest"))
281 arg_link_journal = LINK_GUEST;
282 else if (streq(optarg, "host"))
283 arg_link_journal = LINK_HOST;
285 log_error("Failed to parse link journal mode %s", optarg);
/* --bind= and --bind-ro= share this code: x selects the target list, the
 * argument is split at ':' when one is present, both paths must be absolute,
 * and the two paths are appended to the list one after the other. */
293 _cleanup_free_ char *a = NULL, *b = NULL;
297 x = c == ARG_BIND ? &arg_bind : &arg_bind_ro;
299 e = strchr(optarg, ':');
301 a = strndup(optarg, e - optarg);
311 if (!path_is_absolute(a) || !path_is_absolute(b)) {
312 log_error("Invalid bind mount specification: %s", optarg);
316 r = strv_extend(x, a);
320 r = strv_extend(x, b);
331 log_error("Unknown option code %c", c);
/* Mounts the file systems listed in mount_table below the container root.
 *
 * dest: path of the container root; every table target is joined onto it.
 *
 * For each entry the target path is built, checked with
 * path_is_mount_point(), created with mkdir_p() and then mounted. A mount
 * failure is only logged when the entry has its fatal member set. The return
 * statements are not visible in this listing; main() treats a negative
 * result as failure. */
339 static int mount_all(const char *dest) {
341 typedef struct MountPoint {
/* Entries are applied in order. The members used below are what, where,
 * flags, options and fatal; the initializer layout appears to be
 * { what, where, type, options, flags, fatal } - the struct body is not
 * visible here, confirm against the declaration. */
350 static const MountPoint mount_table[] = {
351 { "proc", "/proc", "proc", NULL, MS_NOSUID|MS_NOEXEC|MS_NODEV, true },
352 { "/proc/sys", "/proc/sys", NULL, NULL, MS_BIND, true }, /* Bind mount first */
353 { NULL, "/proc/sys", NULL, NULL, MS_BIND|MS_RDONLY|MS_REMOUNT, true }, /* Then, make it r/o */
354 { "sysfs", "/sys", "sysfs", NULL, MS_RDONLY|MS_NOSUID|MS_NOEXEC|MS_NODEV, true },
355 { "tmpfs", "/dev", "tmpfs", "mode=755", MS_NOSUID|MS_STRICTATIME, true },
356 { "devpts", "/dev/pts", "devpts","newinstance,ptmxmode=0666,mode=620,gid=" STRINGIFY(TTY_GID), MS_NOSUID|MS_NOEXEC, true },
357 { "tmpfs", "/dev/shm", "tmpfs", "mode=1777", MS_NOSUID|MS_NODEV|MS_STRICTATIME, true },
358 { "tmpfs", "/run", "tmpfs", "mode=755", MS_NOSUID|MS_NODEV|MS_STRICTATIME, true },
360 { "/sys/fs/selinux", "/sys/fs/selinux", NULL, NULL, MS_BIND, false }, /* Bind mount first */
361 { NULL, "/sys/fs/selinux", NULL, NULL, MS_BIND|MS_RDONLY|MS_REMOUNT, false }, /* Then, make it r/o */
368 for (k = 0; k < ELEMENTSOF(mount_table); k++) {
369 _cleanup_free_ char *where = NULL;
372 where = strjoin(dest, "/", mount_table[k].where, NULL);
376 t = path_is_mount_point(where, true);
378 log_error("Failed to detect whether %s is a mount point: %s", where, strerror(-t));
/* Entries without a source (what == NULL) are the remount steps of the
 * table and are applied even when the target already is a mount point. */
386 /* Skip this entry if it is not a remount. */
387 if (mount_table[k].what && t > 0)
390 mkdir_p(where, 0755);
392 if (mount(mount_table[k].what,
395 mount_table[k].flags,
396 mount_table[k].options) < 0 &&
397 mount_table[k].fatal) {
399 log_error("mount(%s) failed: %m", where);
/* Bind mounts host paths into the container.
 *
 * dest:  path of the container root.
 * l:     string list read in pairs (x, y): *x is the host source and *y the
 *        path below dest to mount it on.
 * flags: extra mount flags; when non-zero the bind mount is remounted with
 *        MS_REMOUNT|MS_BIND|flags (main() passes MS_RDONLY for --bind-ro=).
 *
 * The return statements are not visible in this listing; main() treats a
 * negative result as failure. */
409 static int mount_binds(const char *dest, char **l, unsigned long flags) {
412 STRV_FOREACH_PAIR(x, y, l) {
413 _cleanup_free_ char *where = NULL;
414 struct stat source_st, dest_st;
416 if (stat(*x, &source_st) < 0) {
417 log_error("failed to stat %s: %m", *x);
421 where = strjoin(dest, "/", *y, NULL);
/* An existing target must have the same file type as the source. */
425 if (stat(where, &dest_st) == 0) {
426 if ((source_st.st_mode & S_IFMT) != (dest_st.st_mode & S_IFMT)) {
427 log_error("The file types of %s and %s do not match. Refusing bind mount",
432 /* Create the mount point, but be conservative -- refuse to create block
433 * and char devices. */
434 if (S_ISDIR(source_st.st_mode))
435 mkdir_p_label(where, 0755);
436 else if (S_ISFIFO(source_st.st_mode))
438 else if (S_ISSOCK(source_st.st_mode))
439 mknod(where, 0644 | S_IFSOCK, 0);
440 else if (S_ISREG(source_st.st_mode))
443 log_error("Refusing to create mountpoint for file: %s", *x);
448 if (mount(*x, where, "bind", MS_BIND, NULL) < 0) {
449 log_error("mount(%s) failed: %m", where);
/* Extra flags are applied with a separate remount of the bind mount. */
453 if (flags && mount(NULL, where, NULL, MS_REMOUNT|MS_BIND|flags, NULL) < 0) {
454 log_error("mount(%s) failed: %m", where);
/* Points /etc/localtime inside the container at the same zone the host uses.
 *
 * dest: path of the container root.
 *
 * The host zone is taken from the target of the /etc/localtime symlink and
 * must lie below /usr/share/zoneinfo/. Nothing is changed when the container
 * link already names the same zone or when the zone file does not exist in
 * the container. The return statements are not visible in this listing. */
462 static int setup_timezone(const char *dest) {
463 _cleanup_free_ char *where = NULL, *p = NULL, *q = NULL, *check = NULL, *what = NULL;
469 /* Fix the timezone, if possible */
470 r = readlink_malloc("/etc/localtime", &p);
472 log_warning("/etc/localtime is not a symlink, not updating container timezone.");
/* z: zone name of the host; both the relative and the absolute link form
 * are tried. */
476 z = path_startswith(p, "../usr/share/zoneinfo/");
478 z = path_startswith(p, "/usr/share/zoneinfo/");
480 log_warning("/etc/localtime does not point into /usr/share/zoneinfo/, not updating container timezone.");
484 where = strappend(dest, "/etc/localtime");
/* y: zone name the container link currently points to, if any. */
488 r = readlink_malloc(where, &q);
490 y = path_startswith(q, "../usr/share/zoneinfo/");
492 y = path_startswith(q, "/usr/share/zoneinfo/");
495 /* Already pointing to the right place? Then do nothing .. */
496 if (y && streq(y, z))
500 check = strjoin(dest, "/usr/share/zoneinfo/", z, NULL);
504 if (access(check, F_OK) < 0) {
505 log_warning("Timezone %s does not exist in container, not updating container timezone.", z);
/* The new link is written in the relative form. */
509 what = strappend("../usr/share/zoneinfo/", z);
514 if (symlink(what, where) < 0) {
515 log_error("Failed to correct timezone of container: %m");
/* Copies the host /etc/resolv.conf to the same path below dest.
 *
 * dest: path of the container root.
 *
 * The arg_private_network check comes first; the statement it guards is not
 * visible in this listing (presumably an early return, since a container
 * without network has no use for the host resolver configuration). The
 * result of copy_file() is deliberately ignored. */
522 static int setup_resolv_conf(const char *dest) {
523 char _cleanup_free_ *where = NULL;
527 if (arg_private_network)
530 /* Fix resolv.conf, if possible */
531 where = strappend(dest, "/etc/resolv.conf");
535 /* We don't really care for the results of this really. If it
536 * fails, it fails, but meh... */
537 copy_file("/etc/resolv.conf", where, O_TRUNC|O_NOFOLLOW);
/* Gives the container its own random boot ID.
 *
 * dest: path of the container root.
 *
 * A random 128 bit ID is formatted as a UUID string, written to a file below
 * dest/dev, and that file is bind mounted over
 * dest/proc/sys/kernel/random/boot_id. Failing to make the mount read-only
 * only produces a warning. The return statements are not visible in this
 * listing. */
542 static int setup_boot_id(const char *dest) {
543 _cleanup_free_ char *from = NULL, *to = NULL;
550 /* Generate a new randomized boot ID, so that each boot-up of
551 * the container gets a new one */
553 from = strappend(dest, "/dev/proc-sys-kernel-random-boot-id");
554 to = strappend(dest, "/proc/sys/kernel/random/boot_id");
558 r = sd_id128_randomize(&rnd);
560 log_error("Failed to generate random boot id: %s", strerror(-r));
/* Format the 16 bytes in the 8-4-4-4-12 UUID layout. */
564 snprintf(as_uuid, sizeof(as_uuid),
565 "%02x%02x%02x%02x-%02x%02x-%02x%02x-%02x%02x-%02x%02x%02x%02x%02x%02x",
566 SD_ID128_FORMAT_VAL(rnd));
567 char_array_0(as_uuid);
569 r = write_string_file(from, as_uuid);
571 log_error("Failed to write boot id: %s", strerror(-r));
575 if (mount(from, to, "bind", MS_BIND, NULL) < 0) {
576 log_error("Failed to bind mount boot id: %m");
578 } else if (mount(from, to, "bind", MS_BIND|MS_REMOUNT|MS_RDONLY, NULL))
579 log_warning("Failed to make boot id read-only: %m");
/* Recreates a fixed set of host device nodes below dest/dev.
 *
 * dest: path of the container root.
 *
 * For every name in devnodes the host node /dev/NAME is inspected with
 * stat() and, when it is a character or block device, a node with the same
 * mode and device number is created at dest/dev/NAME. A host node that does
 * not exist (ENOENT) is not reported as an error. The list contents and the
 * return statements are not visible in this listing. */
585 static int copy_devnodes(const char *dest) {
587 static const char devnodes[] =
597 _cleanup_umask_ mode_t u;
603 NULSTR_FOREACH(d, devnodes) {
605 _cleanup_free_ char *from = NULL, *to = NULL;
/* from: node on the host, to: node to create inside the container. */
607 asprintf(&from, "/dev/%s", d);
608 asprintf(&to, "%s/dev/%s", dest, d);
619 if (stat(from, &st) < 0) {
621 if (errno != ENOENT) {
622 log_error("Failed to stat %s: %m", from);
627 } else if (!S_ISCHR(st.st_mode) && !S_ISBLK(st.st_mode)) {
629 log_error("%s is not a char or block device, cannot copy", from);
633 } else if (mknod(to, st.st_mode, st.st_rdev) < 0) {
/* Report the path that was passed to mknod(), not the container root. */
635 log_error("mknod(%s) failed: %m", to);
/* Creates dest/dev/ptmx as a symlink to "pts/ptmx".
 *
 * dest: path of the container root.
 *
 * The return statements are not visible in this listing; main() treats a
 * negative result as failure. */
644 static int setup_ptmx(const char *dest) {
645 _cleanup_free_ char *p = NULL;
647 p = strappend(dest, "/dev/ptmx");
651 if (symlink("pts/ptmx", p) < 0) {
652 log_error("Failed to create /dev/ptmx symlink: %m");
/* Makes the given terminal available as dest/dev/console.
 *
 * dest:    path of the container root.
 * console: path of the terminal device to expose (main() passes the name
 *          returned by ptsname()).
 *
 * The terminal must be a character device. Its mode and ownership are set
 * to 0600 and 0:0, a device node is created at dest/dev/console and the
 * terminal is bind mounted over that node. The return statements are not
 * visible in this listing; main() treats a negative result as failure. */
659 static int setup_dev_console(const char *dest, const char *console) {
661 _cleanup_free_ char *to = NULL;
663 _cleanup_umask_ mode_t u;
670 if (stat(console, &st) < 0) {
671 log_error("Failed to stat %s: %m", console);
674 } else if (!S_ISCHR(st.st_mode)) {
675 log_error("/dev/console is not a char device");
679 r = chmod_and_chown(console, 0600, 0, 0);
681 log_error("Failed to correct access mode for TTY: %s", strerror(-r));
685 if (asprintf(&to, "%s/dev/console", dest) < 0)
688 /* We need to bind mount the right tty to /dev/console since
689 * ptys can only exist on pts file systems. To have something
690 * to bind mount things on we create a device node first, that
691 * has the right major/minor (note that the major minor
692 * doesn't actually matter here, since we mount it over
695 if (mknod(to, (st.st_mode & ~07777) | 0600, st.st_rdev) < 0) {
696 log_error("mknod() for /dev/console failed: %m");
700 if (mount(console, to, "bind", MS_BIND, NULL) < 0) {
701 log_error("Bind mount for /dev/console failed: %m");
/* Provides /proc/kmsg inside the container, backed by a FIFO.
 *
 * dest:        path of the container root.
 * kmsg_socket: socket on which the opened FIFO descriptor is sent as an
 *              SCM_RIGHTS control message; must be >= 0.
 *
 * The FIFO is created at dest/dev/kmsg, bind mounted to dest/proc/kmsg,
 * opened, and the descriptor is sent over kmsg_socket before the local copy
 * is closed. The return statements are not visible in this listing; main()
 * treats a negative result as failure. */
708 static int setup_kmsg(const char *dest, int kmsg_socket) {
709 _cleanup_free_ char *from = NULL, *to = NULL;
711 _cleanup_umask_ mode_t u;
/* Control buffer with room for exactly one int, the descriptor to pass. */
713 struct cmsghdr cmsghdr;
714 uint8_t buf[CMSG_SPACE(sizeof(int))];
717 .msg_control = &control,
718 .msg_controllen = sizeof(control),
720 struct cmsghdr *cmsg;
723 assert(kmsg_socket >= 0);
727 /* We create the kmsg FIFO as /dev/kmsg, but immediately
728 * delete it after bind mounting it to /proc/kmsg. While FIFOs
729 * on the reading side behave very similar to /proc/kmsg,
730 * their writing side behaves differently from /dev/kmsg in
731 * that writing blocks when nothing is reading. In order to
732 * avoid any problems with containers deadlocking due to this
733 * we simply make /dev/kmsg unavailable to the container. */
734 if (asprintf(&from, "%s/dev/kmsg", dest) < 0 ||
735 asprintf(&to, "%s/proc/kmsg", dest) < 0)
738 if (mkfifo(from, 0600) < 0) {
739 log_error("mkfifo() for /dev/kmsg failed: %m");
743 r = chmod_and_chown(from, 0600, 0, 0);
745 log_error("Failed to correct access mode for /dev/kmsg: %s", strerror(-r));
749 if (mount(from, to, "bind", MS_BIND, NULL) < 0) {
750 log_error("Bind mount for /proc/kmsg failed: %m");
754 fd = open(from, O_RDWR|O_NDELAY|O_CLOEXEC);
756 log_error("Failed to open fifo: %m");
/* Build the SCM_RIGHTS control message that carries fd. */
760 cmsg = CMSG_FIRSTHDR(&mh);
761 cmsg->cmsg_level = SOL_SOCKET;
762 cmsg->cmsg_type = SCM_RIGHTS;
763 cmsg->cmsg_len = CMSG_LEN(sizeof(int));
764 memcpy(CMSG_DATA(cmsg), &fd, sizeof(int));
766 mh.msg_controllen = cmsg->cmsg_len;
768 /* Store away the fd in the socket, so that it stays open as
769 * long as we run the child */
770 k = sendmsg(kmsg_socket, &mh, MSG_DONTWAIT|MSG_NOSIGNAL);
771 close_nointr_nofail(fd);
774 log_error("Failed to send FIFO fd: %m");
778 /* And now make the FIFO unavailable as /dev/kmsg... */
/* Sets the host name of the calling process to arg_machine via
 * sethostname(). How a failure is reported is not visible in this listing. */
783 static int setup_hostname(void) {
785 if (sethostname(arg_machine, strlen(arg_machine)) < 0)
/* Links the journal directories of host and container according to
 * arg_link_journal.
 *
 * directory: path of the container root.
 *
 * The machine ID is read from directory/etc/machine-id and validated. Two
 * paths are derived from it: p, /var/log/journal/ID on the host, and q, the
 * same path below the container root. Depending on the mode either p becomes
 * a symlink to q (LINK_GUEST) or p is bind mounted onto q. Nothing is done
 * for LINK_NO. Several branches and all return statements are not visible in
 * this listing; main() treats a negative result as failure. */
791 static int setup_journal(const char *directory) {
792 sd_id128_t machine_id;
793 _cleanup_free_ char *p = NULL, *b = NULL, *q = NULL, *d = NULL;
797 if (arg_link_journal == LINK_NO)
800 p = strappend(directory, "/etc/machine-id");
804 r = read_one_line_file(p, &b);
805 if (r == -ENOENT && arg_link_journal == LINK_AUTO)
808 log_error("Failed to read machine ID from %s: %s", p, strerror(-r));
813 if (isempty(id) && arg_link_journal == LINK_AUTO)
816 /* Verify validity */
817 r = sd_id128_from_string(id, &machine_id);
819 log_error("Failed to parse machine ID from %s: %s", p, strerror(-r));
/* From here on p is the journal directory on the host and q the one inside
 * the container tree. */
824 p = strappend("/var/log/journal/", id);
825 q = strjoin(directory, "/var/log/journal/", id, NULL);
/* An existing mount point on either side is only an error when a mode was
 * requested explicitly, i.e. the mode is not LINK_AUTO. */
829 if (path_is_mount_point(p, false) > 0) {
830 if (arg_link_journal != LINK_AUTO) {
831 log_error("%s: already a mount point, refusing to use for journal", p);
838 if (path_is_mount_point(q, false) > 0) {
839 if (arg_link_journal != LINK_AUTO) {
840 log_error("%s: already a mount point, refusing to use for journal", q);
/* Inspect what currently exists at p: a symlink, something that is not a
 * symlink (-EINVAL), or nothing at all (-ENOENT). */
847 r = readlink_and_make_absolute(p, &d);
849 if ((arg_link_journal == LINK_GUEST ||
850 arg_link_journal == LINK_AUTO) &&
853 r = mkdir_p(q, 0755);
855 log_warning("failed to create directory %s: %m", q);
860 log_error("Failed to remove symlink %s: %m", p);
863 } else if (r == -EINVAL) {
865 if (arg_link_journal == LINK_GUEST &&
868 if (errno == ENOTDIR) {
869 log_error("%s already exists and is neither a symlink nor a directory", p);
872 log_error("Failed to remove %s: %m", p);
876 } else if (r != -ENOENT) {
877 log_error("readlink(%s) failed: %m", p);
/* Guest mode: the host path becomes a symlink into the container tree. */
881 if (arg_link_journal == LINK_GUEST) {
883 if (symlink(q, p) < 0) {
884 log_error("Failed to symlink %s to %s: %m", q, p);
888 r = mkdir_p(q, 0755);
890 log_warning("failed to create directory %s: %m", q);
/* Host mode creates the host directory when needed. */
894 if (arg_link_journal == LINK_HOST) {
895 r = mkdir_p(p, 0755);
897 log_error("Failed to create %s: %m", p);
901 } else if (access(p, F_OK) < 0)
/* The container directory must be empty before it is mounted over. */
904 if (dir_is_empty(q) == 0) {
905 log_error("%s not empty.", q);
909 r = mkdir_p(q, 0755);
911 log_error("Failed to create %s: %m", q);
915 if (mount(p, q, "bind", MS_BIND, NULL) < 0) {
916 log_error("Failed to bind mount journal from host into guest: %m");
/* Drops every capability that is not set in arg_retain from the bounding
 * set and returns the result of capability_bounding_set_drop(). */
923 static int drop_capabilities(void) {
924 return capability_bounding_set_drop(~arg_retain, false);
/* Relays data between the terminal of the caller and the pty master of the
 * container, and reacts to signals while doing so.
 *
 * master: pty master file descriptor.
 * pid:    process that receives SIGRTMIN+3 when an orderly shutdown is
 *         requested.
 * mask:   signal set used to create the signalfd.
 *
 * stdin, stdout and the master are switched to non-blocking mode and
 * watched with edge-triggered epoll. The *_readable and *_writable flags
 * record readiness between epoll wake-ups, and two LINE_MAX sized buffers
 * hold data in flight in each direction. The loop exit conditions and the
 * return statements are not visible in this listing; main() treats a
 * negative result as failure. */
927 static int process_pty(int master, pid_t pid, sigset_t *mask) {
929 char in_buffer[LINE_MAX], out_buffer[LINE_MAX];
930 size_t in_buffer_full = 0, out_buffer_full = 0;
931 struct epoll_event stdin_ev, stdout_ev, master_ev, signal_ev;
932 bool stdin_readable = false, stdout_writable = false, master_readable = false, master_writable = false;
933 int ep = -1, signal_fd = -1, r;
934 bool tried_orderly_shutdown = false;
940 fd_nonblock(STDIN_FILENO, 1);
941 fd_nonblock(STDOUT_FILENO, 1);
942 fd_nonblock(master, 1);
944 signal_fd = signalfd(-1, mask, SFD_NONBLOCK|SFD_CLOEXEC);
946 log_error("signalfd(): %m");
951 ep = epoll_create1(EPOLL_CLOEXEC);
953 log_error("Failed to create epoll: %m");
958 /* We read from STDIN only if this is actually a TTY,
959 * otherwise we assume non-interactivity. */
960 if (isatty(STDIN_FILENO)) {
962 stdin_ev.events = EPOLLIN|EPOLLET;
963 stdin_ev.data.fd = STDIN_FILENO;
965 if (epoll_ctl(ep, EPOLL_CTL_ADD, STDIN_FILENO, &stdin_ev) < 0) {
966 log_error("Failed to register STDIN in epoll: %m");
973 stdout_ev.events = EPOLLOUT|EPOLLET;
974 stdout_ev.data.fd = STDOUT_FILENO;
977 master_ev.events = EPOLLIN|EPOLLOUT|EPOLLET;
978 master_ev.data.fd = master;
/* The signalfd is the only descriptor registered without EPOLLET. */
981 signal_ev.events = EPOLLIN;
982 signal_ev.data.fd = signal_fd;
984 if (epoll_ctl(ep, EPOLL_CTL_ADD, STDOUT_FILENO, &stdout_ev) < 0) {
985 if (errno != EPERM) {
986 log_error("Failed to register stdout in epoll: %m");
990 /* stdout without epoll support. Likely redirected to regular file. */
991 stdout_writable = true;
994 if (epoll_ctl(ep, EPOLL_CTL_ADD, master, &master_ev) < 0 ||
995 epoll_ctl(ep, EPOLL_CTL_ADD, signal_fd, &signal_ev) < 0) {
996 log_error("Failed to register fds in epoll: %m");
1002 struct epoll_event ev[16];
1006 nfds = epoll_wait(ep, ev, ELEMENTSOF(ev), -1);
1009 if (errno == EINTR || errno == EAGAIN)
1012 log_error("epoll_wait(): %m");
/* First pass: translate the reported events into readiness flags and
 * handle signals. */
1019 for (i = 0; i < nfds; i++) {
1020 if (ev[i].data.fd == STDIN_FILENO) {
1022 if (ev[i].events & (EPOLLIN|EPOLLHUP))
1023 stdin_readable = true;
1025 } else if (ev[i].data.fd == STDOUT_FILENO) {
1027 if (ev[i].events & (EPOLLOUT|EPOLLHUP))
1028 stdout_writable = true;
1030 } else if (ev[i].data.fd == master) {
1032 if (ev[i].events & (EPOLLIN|EPOLLHUP))
1033 master_readable = true;
1035 if (ev[i].events & (EPOLLOUT|EPOLLHUP))
1036 master_writable = true;
1038 } else if (ev[i].data.fd == signal_fd) {
1039 struct signalfd_siginfo sfsi;
1042 n = read(signal_fd, &sfsi, sizeof(sfsi));
1043 if (n != sizeof(sfsi)) {
1046 log_error("Failed to read from signalfd: invalid block size");
1051 if (errno != EINTR && errno != EAGAIN) {
1052 log_error("Failed to read from signalfd: %m");
1058 if (sfsi.ssi_signo == SIGWINCH) {
1061 /* The window size changed, let's forward that. */
1062 if (ioctl(STDIN_FILENO, TIOCGWINSZ, &ws) >= 0)
1063 ioctl(master, TIOCSWINSZ, &ws);
/* Only the first SIGTERM of a booted container takes this branch;
 * tried_orderly_shutdown keeps later ones out of it. */
1064 } else if (sfsi.ssi_signo == SIGTERM && arg_boot && !tried_orderly_shutdown) {
1066 log_info("Trying to halt container. Send SIGTERM again to trigger immediate termination.");
1068 /* This only works for systemd... */
1069 tried_orderly_shutdown = true;
1070 kill(pid, SIGRTMIN+3);
/* Second pass: move data for as long as some direction can make progress,
 * i.e. a source is readable while its buffer is empty, or a sink is
 * writable while its buffer holds data. */
1080 while ((stdin_readable && in_buffer_full <= 0) ||
1081 (master_writable && in_buffer_full > 0) ||
1082 (master_readable && out_buffer_full <= 0) ||
1083 (stdout_writable && out_buffer_full > 0)) {
/* stdin -> in_buffer */
1085 if (stdin_readable && in_buffer_full < LINE_MAX) {
1087 k = read(STDIN_FILENO, in_buffer + in_buffer_full, LINE_MAX - in_buffer_full);
1090 if (errno == EAGAIN || errno == EPIPE || errno == ECONNRESET || errno == EIO)
1091 stdin_readable = false;
1093 log_error("read(): %m");
1098 in_buffer_full += (size_t) k;
/* in_buffer -> master; unwritten bytes are moved to the buffer start. */
1101 if (master_writable && in_buffer_full > 0) {
1103 k = write(master, in_buffer, in_buffer_full);
1106 if (errno == EAGAIN || errno == EPIPE || errno == ECONNRESET || errno == EIO)
1107 master_writable = false;
1109 log_error("write(): %m");
1115 assert(in_buffer_full >= (size_t) k);
1116 memmove(in_buffer, in_buffer + k, in_buffer_full - k);
1117 in_buffer_full -= k;
/* master -> out_buffer */
1121 if (master_readable && out_buffer_full < LINE_MAX) {
1123 k = read(master, out_buffer + out_buffer_full, LINE_MAX - out_buffer_full);
1126 if (errno == EAGAIN || errno == EPIPE || errno == ECONNRESET || errno == EIO)
1127 master_readable = false;
1129 log_error("read(): %m");
1134 out_buffer_full += (size_t) k;
/* out_buffer -> stdout; unwritten bytes are moved to the buffer start. */
1137 if (stdout_writable && out_buffer_full > 0) {
1139 k = write(STDOUT_FILENO, out_buffer, out_buffer_full);
1142 if (errno == EAGAIN || errno == EPIPE || errno == ECONNRESET || errno == EIO)
1143 stdout_writable = false;
1145 log_error("write(): %m");
1151 assert(out_buffer_full >= (size_t) k);
1152 memmove(out_buffer, out_buffer + k, out_buffer_full - k);
1153 out_buffer_full -= k;
/* Cleanup of the epoll and signal descriptors. */
1161 close_nointr_nofail(ep);
1164 close_nointr_nofail(signal_fd);
/* Registers the container with the org.freedesktop.machine1 service on the
 * system bus.
 *
 * The call is addressed to the Manager interface at
 * /org/freedesktop/machine1. The method name and most arguments are not
 * visible in this listing; the visible ones are the machine UUID, the root
 * directory and a "Slice" property carrying arg_slice. An unset directory or
 * slice is sent as an empty string. The return statements are not visible
 * here either. */
1169 static int register_machine(void) {
1170 _cleanup_bus_error_free_ sd_bus_error error = SD_BUS_ERROR_NULL;
1171 _cleanup_bus_unref_ sd_bus *bus = NULL;
1174 r = sd_bus_open_system(&bus);
1176 log_error("Failed to open system bus: %s", strerror(-r));
1180 r = sd_bus_call_method(
1182 "org.freedesktop.machine1",
1183 "/org/freedesktop/machine1",
1184 "org.freedesktop.machine1.Manager",
1190 SD_BUS_APPEND_ID128(arg_uuid),
1194 strempty(arg_directory),
1195 1, "Slice", "s", strempty(arg_slice));
/* Prefer the message carried by the bus error over the plain errno text. */
1197 log_error("Failed to register machine: %s", error.message ? error.message : strerror(-r));
/* Probes for the kernel audit subsystem by opening a NETLINK_AUDIT socket,
 * which is closed again right away. How the result of socket() maps to the
 * returned bool is not visible in this listing; presumably a successful
 * open means auditing is available. */
1204 static bool audit_enabled(void) {
1207 fd = socket(AF_NETLINK, SOCK_RAW, NETLINK_AUDIT);
1209 close_nointr_nofail(fd);
/* Entry point: sets up and runs one container.
 *
 * Outline of the visible steps:
 *   1. Parse options; default the directory to the current directory and
 *      the machine name to the last path component of the directory.
 *   2. Sanity checks: running as root, booted with systemd, the directory
 *      is not "/" and looks like an OS tree.
 *   3. Collect passed-in file descriptors, allocate a pty, prepare raw
 *      terminal attributes and the kmsg socket pair, block signals.
 *   4. Clone a child into new namespaces. The child builds the container
 *      file system, switches root, drops capabilities, switches user and
 *      executes init, the given command or a shell.
 *   5. The parent relays the pty via process_pty(), waits for the child
 *      and logs how it ended.
 *
 * r starts out as EXIT_FAILURE and takes the exit status of the container
 * when it exited normally; the final return is not visible in this listing.
 *
 * The messages logged when setresgid() or setresuid() fail name those
 * functions; they used to name setregid() and setreuid(), which are never
 * called here. */
1215 int main(int argc, char *argv[]) {
1217 int r = EXIT_FAILURE, k;
1218 _cleanup_close_ int master = -1;
1220 const char *console = NULL;
1221 struct termios saved_attr, raw_attr;
1223 bool saved_attr_valid = false;
1225 int kmsg_socket_pair[2] = { -1, -1 };
1228 log_parse_environment();
1231 k = parse_argv(argc, argv);
1239 if (arg_directory) {
1242 p = path_make_absolute_cwd(arg_directory);
1243 free(arg_directory);
1246 arg_directory = get_current_dir_name();
1248 if (!arg_directory) {
1249 log_error("Failed to determine path, please use -D.");
1253 path_kill_slashes(arg_directory);
/* Default machine name: last component of the container directory. */
1256 arg_machine = strdup(path_get_file_name(arg_directory));
1262 hostname_cleanup(arg_machine, false);
1263 if (isempty(arg_machine)) {
1264 log_error("Failed to determine machine name automatically, please use -M.");
1269 if (geteuid() != 0) {
1270 log_error("Need to be root.");
1274 if (sd_booted() <= 0) {
1275 log_error("Not running on a systemd system.");
1279 if (arg_boot && audit_enabled()) {
1280 log_warning("The kernel auditing subsystem is known to be incompatible with containers.\n"
1281 "Please make sure to turn off auditing with 'audit=0' on the kernel command\n"
1282 "line before using systemd-nspawn. Sleeping for 5s...\n");
1286 if (path_equal(arg_directory, "/")) {
1287 log_error("Spawning container on root directory not supported.");
1291 if (path_is_os_tree(arg_directory) <= 0) {
1292 log_error("Directory %s doesn't look like an OS root directory (/etc/os-release is missing). Refusing.", arg_directory);
/* File descriptors passed in via the sd_listen_fds() protocol are collected
 * so they can be handed on to the container. */
1297 n_fd_passed = sd_listen_fds(false);
1298 if (n_fd_passed > 0) {
1299 k = fdset_new_listen_fds(&fds, false);
1301 log_error("Failed to collect file descriptors: %s", strerror(-k));
1305 fdset_close_others(fds);
/* Allocate the pty whose slave side becomes the console of the container. */
1308 master = posix_openpt(O_RDWR|O_NOCTTY|O_CLOEXEC|O_NDELAY);
1310 log_error("Failed to acquire pseudo tty: %m");
1314 console = ptsname(master);
1316 log_error("Failed to determine tty name: %m");
1320 log_info("Spawning namespace container on %s (console is %s).", arg_directory, console);
1322 if (ioctl(STDIN_FILENO, TIOCGWINSZ, &ws) >= 0)
1323 ioctl(master, TIOCSWINSZ, &ws);
1325 if (unlockpt(master) < 0) {
1326 log_error("Failed to unlock tty: %m");
/* Remember the current terminal settings so they can be restored later. */
1330 if (tcgetattr(STDIN_FILENO, &saved_attr) >= 0) {
1331 saved_attr_valid = true;
1333 raw_attr = saved_attr;
1334 cfmakeraw(&raw_attr);
1335 raw_attr.c_lflag &= ~ECHO;
1338 if (socketpair(AF_UNIX, SOCK_DGRAM|SOCK_NONBLOCK|SOCK_CLOEXEC, 0, kmsg_socket_pair) < 0) {
1339 log_error("Failed to create kmsg socket pair.");
1343 sd_notify(0, "READY=1");
/* These signals are blocked here and consumed through the signalfd that
 * process_pty() creates from the same mask. */
1345 assert_se(sigemptyset(&mask) == 0);
1346 sigset_add_many(&mask, SIGCHLD, SIGWINCH, SIGTERM, SIGINT, -1);
1347 assert_se(sigprocmask(SIG_BLOCK, &mask, NULL) == 0);
/* Two pipes synchronise parent and child: each side waits for POLLHUP on
 * the read end, which arrives once the peer has closed its write end. */
1351 int pipefd[2], pipefd2[2];
1353 if (pipe2(pipefd, O_NONBLOCK|O_CLOEXEC) < 0) {
1354 log_error("pipe2(): %m");
1358 if (pipe2(pipefd2, O_NONBLOCK|O_CLOEXEC) < 0) {
1359 log_error("pipe2(): %m");
1364 pid = syscall(__NR_clone, SIGCHLD|CLONE_NEWIPC|CLONE_NEWNS|CLONE_NEWPID|CLONE_NEWUTS|(arg_private_network ? CLONE_NEWNET : 0), NULL);
1366 if (errno == EINVAL)
1367 log_error("clone() failed, do you have namespace support enabled in your kernel? (You need UTS, IPC, PID and NET namespacing built in): %m");
1369 log_error("clone() failed: %m");
1376 const char *home = NULL;
1377 uid_t uid = (uid_t) -1;
1378 gid_t gid = (gid_t) -1;
/* Environment of the payload; the NULL slots are filled in further down. */
1380 const char *envp[] = {
1381 "PATH=/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin",
1382 "container=systemd-nspawn", /* LXC sets container=lxc, so follow the scheme here */
1387 NULL, /* container_uuid */
1388 NULL, /* LISTEN_FDS */
1389 NULL, /* LISTEN_PID */
1393 envp[n_env] = strv_find_prefix(environ, "TERM=");
1397 /* Wait for the parent process to log our PID */
1398 close_nointr_nofail(pipefd[1]);
1399 fd_wait_for_event(pipefd[0], POLLHUP, -1);
1400 close_nointr_nofail(pipefd[0]);
1402 close_nointr_nofail(master);
1405 if (saved_attr_valid) {
1406 if (tcsetattr(STDIN_FILENO, TCSANOW, &raw_attr) < 0) {
1407 log_error("Failed to set terminal attributes: %m");
1412 close_nointr(STDIN_FILENO);
1413 close_nointr(STDOUT_FILENO);
1414 close_nointr(STDERR_FILENO);
1416 close_nointr_nofail(kmsg_socket_pair[0]);
1417 kmsg_socket_pair[0] = -1;
1419 reset_all_signal_handlers();
1421 assert_se(sigemptyset(&mask) == 0);
1422 assert_se(sigprocmask(SIG_SETMASK, &mask, NULL) == 0);
/* The standard descriptors were closed above, so the console is expected to
 * open as descriptor 0 and is then duplicated onto stdout and stderr. */
1424 k = open_terminal(console, O_RDWR);
1425 if (k != STDIN_FILENO) {
1427 close_nointr_nofail(k);
1431 log_error("Failed to open console: %s", strerror(-k));
1435 if (dup2(STDIN_FILENO, STDOUT_FILENO) != STDOUT_FILENO ||
1436 dup2(STDIN_FILENO, STDERR_FILENO) != STDERR_FILENO) {
1437 log_error("Failed to duplicate console: %m");
1442 log_error("setsid() failed: %m");
1446 if (prctl(PR_SET_PDEATHSIG, SIGKILL) < 0) {
1447 log_error("PR_SET_PDEATHSIG failed: %m");
1451 close_pipe(pipefd2);
1453 r = register_machine();
1457 /* Mark everything as slave, so that we still
1458 * receive mounts from the real root, but don't
1459 * propagate mounts to the real root. */
1460 if (mount(NULL, "/", NULL, MS_SLAVE|MS_REC, NULL) < 0) {
1461 log_error("MS_SLAVE|MS_REC failed: %m");
1465 /* Turn directory into bind mount */
1466 if (mount(arg_directory, arg_directory, "bind", MS_BIND|MS_REC, NULL) < 0) {
1467 log_error("Failed to make bind mount.");
1472 if (mount(arg_directory, arg_directory, "bind", MS_BIND|MS_REMOUNT|MS_RDONLY|MS_REC, NULL) < 0) {
1473 log_error("Failed to make read-only.");
/* Populate the container file system; each helper returns a negative value
 * on failure. */
1477 if (mount_all(arg_directory) < 0)
1480 if (copy_devnodes(arg_directory) < 0)
1483 if (setup_ptmx(arg_directory) < 0)
1486 dev_setup(arg_directory);
1488 if (setup_dev_console(arg_directory, console) < 0)
1491 if (setup_kmsg(arg_directory, kmsg_socket_pair[1]) < 0)
1494 close_nointr_nofail(kmsg_socket_pair[1]);
1495 kmsg_socket_pair[1] = -1;
1497 if (setup_boot_id(arg_directory) < 0)
1500 if (setup_timezone(arg_directory) < 0)
1503 if (setup_resolv_conf(arg_directory) < 0)
1506 if (setup_journal(arg_directory) < 0)
1509 if (mount_binds(arg_directory, arg_bind, 0) < 0)
1512 if (mount_binds(arg_directory, arg_bind_ro, MS_RDONLY) < 0)
/* Switch the root: move the container tree onto "/", then chroot into it. */
1515 if (chdir(arg_directory) < 0) {
1516 log_error("chdir(%s) failed: %m", arg_directory);
1520 if (mount(arg_directory, "/", NULL, MS_MOVE, NULL) < 0) {
1521 log_error("mount(MS_MOVE) failed: %m");
1525 if (chroot(".") < 0) {
1526 log_error("chroot() failed: %m");
1530 if (chdir("/") < 0) {
1531 log_error("chdir() failed: %m");
1539 if (drop_capabilities() < 0) {
1540 log_error("drop_capabilities() failed: %m");
1546 /* Note that this resolves user names
1547 * inside the container, and hence
1548 * accesses the NSS modules from the
1549 * container and not the host. This is
1552 if (get_user_creds((const char**)&arg_user, &uid, &gid, &home, NULL) < 0) {
1553 log_error("get_user_creds() failed: %m");
1557 if (mkdir_parents_label(home, 0775) < 0) {
1558 log_error("mkdir_parents_label() failed: %m");
1562 if (mkdir_safe_label(home, 0775, uid, gid) < 0) {
1563 log_error("mkdir_safe_label() failed: %m");
1567 if (initgroups((const char*)arg_user, gid) < 0) {
1568 log_error("initgroups() failed: %m");
1572 if (setresgid(gid, gid, gid) < 0) {
1573 log_error("setresgid() failed: %m");
1577 if (setresuid(uid, uid, uid) < 0) {
1578 log_error("setresuid() failed: %m");
1582 /* Reset everything fully to 0, just in case */
1584 if (setgroups(0, NULL) < 0) {
1585 log_error("setgroups() failed: %m");
1589 if (setresgid(0, 0, 0) < 0) {
1590 log_error("setresgid() failed: %m");
1594 if (setresuid(0, 0, 0) < 0) {
1595 log_error("setresuid() failed: %m");
/* Fill the remaining environment slots, advancing n_env per entry. */
1600 if ((asprintf((char**)(envp + n_env++), "HOME=%s", home ? home: "/root") < 0) ||
1601 (asprintf((char**)(envp + n_env++), "USER=%s", arg_user ? arg_user : "root") < 0) ||
1602 (asprintf((char**)(envp + n_env++), "LOGNAME=%s", arg_user ? arg_user : "root") < 0)) {
1607 if (!sd_id128_equal(arg_uuid, SD_ID128_NULL)) {
1608 if (asprintf((char**)(envp + n_env++), "container_uuid=" SD_ID128_FORMAT_STR, SD_ID128_FORMAT_VAL(arg_uuid)) < 0) {
/* Passed-in descriptors must survive the exec, so O_CLOEXEC is cleared on
 * them and they are announced via LISTEN_FDS and LISTEN_PID. */
1614 if (fdset_size(fds) > 0) {
1615 k = fdset_cloexec(fds, false);
1617 log_error("Failed to unset O_CLOEXEC for file descriptors.");
1621 if ((asprintf((char **)(envp + n_env++), "LISTEN_FDS=%u", n_fd_passed) < 0) ||
1622 (asprintf((char **)(envp + n_env++), "LISTEN_PID=1") < 0)) {
1634 /* Automatically search for the init system */
1636 l = 1 + argc - optind;
1637 a = newa(char*, l + 1);
1638 memcpy(a + 1, argv + optind, l * sizeof(char*));
/* Candidates are tried in order; a later execve() is only reached when the
 * previous one failed. */
1640 a[0] = (char*) "/usr/lib/systemd/systemd";
1641 execve(a[0], a, (char**) envp);
1643 a[0] = (char*) "/lib/systemd/systemd";
1644 execve(a[0], a, (char**) envp);
1646 a[0] = (char*) "/sbin/init";
1647 execve(a[0], a, (char**) envp);
1648 } else if (argc > optind)
1649 execvpe(argv[optind], argv + optind, (char**) envp);
1651 chdir(home ? home : "/root");
1652 execle("/bin/bash", "-bash", NULL, (char**) envp);
1655 log_error("execv() failed: %m");
1658 _exit(EXIT_FAILURE);
/* Parent side from here on. */
1661 log_info("Init process in the container running as PID %lu.", (unsigned long) pid);
1662 close_nointr_nofail(pipefd[0]);
1663 close_nointr_nofail(pipefd[1]);
1665 /* Wait for the child process to establish cgroup hierarchy */
1666 close_nointr_nofail(pipefd2[1]);
1667 fd_wait_for_event(pipefd2[0], POLLHUP, -1);
1668 close_nointr_nofail(pipefd2[0]);
1673 if (process_pty(master, pid, &mask) < 0)
1676 if (saved_attr_valid)
1677 tcsetattr(STDIN_FILENO, TCSANOW, &saved_attr);
1679 k = wait_for_terminate(pid, &status);
/* Report how the container ended. */
1685 if (status.si_code == CLD_EXITED) {
1686 r = status.si_status;
1687 if (status.si_status != 0) {
1688 log_error("Container failed with error code %i.", status.si_status);
1692 log_debug("Container exited successfully.");
1694 } else if (status.si_code == CLD_KILLED &&
1695 status.si_status == SIGINT) {
1696 log_info("Container has been shut down.");
1699 } else if (status.si_code == CLD_KILLED &&
1700 status.si_status == SIGHUP) {
1701 log_info("Container is being rebooted.");
1703 } else if (status.si_code == CLD_KILLED ||
1704 status.si_code == CLD_DUMPED) {
1706 log_error("Container terminated by signal %s.", signal_to_string(status.si_status));
1710 log_error("Container failed due to unknown reason.");
/* Common cleanup: restore the terminal and release resources. */
1717 if (saved_attr_valid)
1718 tcsetattr(STDIN_FILENO, TCSANOW, &saved_attr);
1720 close_pipe(kmsg_socket_pair);
1725 free(arg_directory);