1 /*-*- Mode: C; c-basic-offset: 8; indent-tabs-mode: nil -*-*/
4 This file is part of systemd.
6 Copyright 2010 Lennart Poettering
8 systemd is free software; you can redistribute it and/or modify it
9 under the terms of the GNU Lesser General Public License as published by
10 the Free Software Foundation; either version 2.1 of the License, or
11 (at your option) any later version.
13 systemd is distributed in the hope that it will be useful, but
14 WITHOUT ANY WARRANTY; without even the implied warranty of
15 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
16 Lesser General Public License for more details.
18 You should have received a copy of the GNU Lesser General Public License
19 along with systemd; If not, see <http://www.gnu.org/licenses/>.
25 #include <sys/types.h>
26 #include <sys/syscall.h>
27 #include <sys/mount.h>
33 #include <sys/prctl.h>
34 #include <sys/capability.h>
37 #include <sys/epoll.h>
39 #include <sys/signalfd.h>
43 #include <sys/socket.h>
45 #include <systemd/sd-daemon.h>
53 #include "cgroup-util.h"
55 #include "path-util.h"
56 #include "loopback-setup.h"
58 #include "dev-setup.h"
67 typedef enum LinkJournal {
/* Settings collected from the command line. The assignments that are visible
 * in this file are made in parse_argv(). */
74 static char *arg_directory = NULL;
75 static char *arg_user = NULL;
76 static char **arg_controllers = NULL;
77 static char *arg_uuid = NULL;
78 static bool arg_private_network = false;
79 static bool arg_read_only = false;
80 static bool arg_boot = false;
81 static LinkJournal arg_link_journal = LINK_AUTO;
/* Bit mask with one bit per CAP_* number, built as 1ULL << CAP_x.
 * --capability= ORs further bits into it, and drop_capabilities() hands the
 * complement (~arg_retain) to capability_bounding_set_drop(). */
82 static uint64_t arg_retain =
84 (1ULL << CAP_DAC_OVERRIDE) |
85 (1ULL << CAP_DAC_READ_SEARCH) |
86 (1ULL << CAP_FOWNER) |
87 (1ULL << CAP_FSETID) |
88 (1ULL << CAP_IPC_OWNER) |
91 (1ULL << CAP_LINUX_IMMUTABLE) |
92 (1ULL << CAP_NET_BIND_SERVICE) |
93 (1ULL << CAP_NET_BROADCAST) |
94 (1ULL << CAP_NET_RAW) |
95 (1ULL << CAP_SETGID) |
96 (1ULL << CAP_SETFCAP) |
97 (1ULL << CAP_SETPCAP) |
98 (1ULL << CAP_SETUID) |
99 (1ULL << CAP_SYS_ADMIN) |
100 (1ULL << CAP_SYS_CHROOT) |
101 (1ULL << CAP_SYS_NICE) |
102 (1ULL << CAP_SYS_PTRACE) |
103 (1ULL << CAP_SYS_TTY_CONFIG) |
104 (1ULL << CAP_SYS_RESOURCE) |
105 (1ULL << CAP_SYS_BOOT) |
106 (1ULL << CAP_AUDIT_WRITE) |
107 (1ULL << CAP_AUDIT_CONTROL);
/* Bind mount lists: parse_argv() appends two strings (source, destination)
 * per --bind=/--bind-ro= option and mount_binds() consumes them pairwise. */
108 static char **arg_bind = NULL;
109 static char **arg_bind_ro = NULL;
/* Prints the usage summary with printf(). The only format argument is
 * program_invocation_short_name, which fills the leading %s.
 *
 * NOTE(review): the text says -j is equivalent to --link-journal=host, but
 * the assignment at original line 257 of parse_argv() stores LINK_GUEST.
 * That line appears to be the -j handler (its case label is not visible
 * here). One of the two looks wrong; confirm before changing either. */
111 static int help(void) {
113 printf("%s [OPTIONS...] [PATH] [ARGUMENTS...]\n\n"
114 "Spawn a minimal namespace container for debugging, testing and building.\n\n"
115 " -h --help Show this help\n"
116 " --version Print version string\n"
117 " -D --directory=NAME Root directory for the container\n"
118 " -b --boot Boot up full system (i.e. invoke init)\n"
119 " -u --user=USER Run the command under specified user or uid\n"
120 " -C --controllers=LIST Put the container in specified comma-separated\n"
121 " cgroup hierarchies\n"
122 " --uuid=UUID Set a specific machine UUID for the container\n"
123 " --private-network Disable network in container\n"
124 " --read-only Mount the root directory read-only\n"
125 " --capability=CAP In addition to the default, retain specified\n"
127 " --link-journal=MODE Link up guest journal, one of no, auto, guest, host\n"
128 " -j Equivalent to --link-journal=host\n"
129 " --bind=PATH[:PATH] Bind mount a file or directory from the host into\n"
131 " --bind-ro=PATH[:PATH] Similar, but creates a read-only bind mount\n",
132 program_invocation_short_name);
/* Parses the command line with getopt_long() and stores the results in the
 * arg_* file-scope variables. Options that have no short form are mapped to
 * ARG_* codes in the table below. */
137 static int parse_argv(int argc, char *argv[]) {
150 static const struct option options[] = {
151 { "help", no_argument, NULL, 'h' },
152 { "version", no_argument, NULL, ARG_VERSION },
153 { "directory", required_argument, NULL, 'D' },
154 { "user", required_argument, NULL, 'u' },
155 { "controllers", required_argument, NULL, 'C' },
156 { "private-network", no_argument, NULL, ARG_PRIVATE_NETWORK },
157 { "boot", no_argument, NULL, 'b' },
158 { "uuid", required_argument, NULL, ARG_UUID },
159 { "read-only", no_argument, NULL, ARG_READ_ONLY },
160 { "capability", required_argument, NULL, ARG_CAPABILITY },
161 { "link-journal", required_argument, NULL, ARG_LINK_JOURNAL },
162 { "bind", required_argument, NULL, ARG_BIND },
163 { "bind-ro", required_argument, NULL, ARG_BIND_RO },
/* The leading '+' in the short option string asks glibc's getopt_long() to
 * stop at the first non-option argument, so options that belong to the
 * command run in the container are not consumed here. */
172 while ((c = getopt_long(argc, argv, "+hD:u:C:bj", options, NULL)) >= 0) {
181 puts(PACKAGE_STRING);
182 puts(SYSTEMD_FEATURES);
/* Root directory: stored in canonicalized form. */
187 arg_directory = canonicalize_file_name(optarg);
188 if (!arg_directory) {
189 log_error("Failed to canonicalize root directory.");
197 if (!(arg_user = strdup(optarg))) {
198 log_error("Failed to duplicate user name.");
/* Controllers: a later option replaces the earlier list. The argument is
 * split at commas and then passed through strv_uniq(). */
205 strv_free(arg_controllers);
206 arg_controllers = strv_split(optarg, ",");
207 if (!arg_controllers) {
208 log_error("Failed to split controllers list.");
211 strv_uniq(arg_controllers);
215 case ARG_PRIVATE_NETWORK:
216 arg_private_network = true;
228 arg_read_only = true;
/* Each comma-separated word is looked up with cap_from_name() and its bit
 * is ORed into arg_retain. */
231 case ARG_CAPABILITY: {
235 FOREACH_WORD_SEPARATOR(word, length, optarg, ",", state) {
239 t = strndup(word, length);
243 if (cap_from_name(t, &cap) < 0) {
244 log_error("Failed to parse capability %s.", t);
250 arg_retain |= 1ULL << (uint64_t) cap;
/* NOTE(review): presumably the 'j' case (its label is not visible). It
 * stores LINK_GUEST, while help() describes -j as --link-journal=host. */
257 arg_link_journal = LINK_GUEST;
260 case ARG_LINK_JOURNAL:
261 if (streq(optarg, "auto"))
262 arg_link_journal = LINK_AUTO;
263 else if (streq(optarg, "no"))
264 arg_link_journal = LINK_NO;
265 else if (streq(optarg, "guest"))
266 arg_link_journal = LINK_GUEST;
267 else if (streq(optarg, "host"))
268 arg_link_journal = LINK_HOST;
270 log_error("Failed to parse link journal mode %s", optarg);
/* Bind mounts: x selects the read-write or the read-only list. The argument
 * is searched for ':' to split it into a and b; both must be absolute paths
 * and are appended to the list as two consecutive entries. */
278 _cleanup_free_ char *a = NULL, *b = NULL;
283 x = c == ARG_BIND ? &arg_bind : &arg_bind_ro;
285 e = strchr(optarg, ':');
287 a = strndup(optarg, e - optarg);
297 if (!path_is_absolute(a) || !path_is_absolute(b)) {
298 log_error("Invalid bind mount specification: %s", optarg);
302 r = strv_extend(x, a);
306 r = strv_extend(x, b);
317 log_error("Unknown option code %c", c);
/* Mounts the file systems listed in mount_table underneath dest.
 *
 * The loop reads the fields what, where, flags, options and fatal of each
 * entry. The initializers presumably list them as what, where, type,
 * options, flags, fatal; the struct definition is not visible here, so
 * confirm. Entries with a NULL first field are remounts of the entry before
 * them. */
325 static int mount_all(const char *dest) {
327 typedef struct MountPoint {
336 static const MountPoint mount_table[] = {
337 { "proc", "/proc", "proc", NULL, MS_NOSUID|MS_NOEXEC|MS_NODEV, true },
338 { "/proc/sys", "/proc/sys", NULL, NULL, MS_BIND, true }, /* Bind mount first */
339 { NULL, "/proc/sys", NULL, NULL, MS_BIND|MS_RDONLY|MS_REMOUNT, true }, /* Then, make it r/o */
340 { "sysfs", "/sys", "sysfs", NULL, MS_RDONLY|MS_NOSUID|MS_NOEXEC|MS_NODEV, true },
341 { "tmpfs", "/dev", "tmpfs", "mode=755", MS_NOSUID|MS_STRICTATIME, true },
342 { "devpts", "/dev/pts", "devpts","newinstance,ptmxmode=0666,mode=620,gid=" STRINGIFY(TTY_GID), MS_NOSUID|MS_NOEXEC, true },
343 { "tmpfs", "/dev/shm", "tmpfs", "mode=1777", MS_NOSUID|MS_NODEV|MS_STRICTATIME, true },
344 { "tmpfs", "/run", "tmpfs", "mode=755", MS_NOSUID|MS_NODEV|MS_STRICTATIME, true },
346 { "/sys/fs/selinux", "/sys/fs/selinux", NULL, NULL, MS_BIND, false }, /* Bind mount first */
347 { NULL, "/sys/fs/selinux", NULL, NULL, MS_BIND|MS_RDONLY|MS_REMOUNT, false }, /* Then, make it r/o */
354 for (k = 0; k < ELEMENTSOF(mount_table); k++) {
355 char _cleanup_free_ *where = NULL;
/* Target path inside the container tree. */
358 where = strjoin(dest, "/", mount_table[k].where, NULL);
362 t = path_is_mount_point(where, true);
364 log_error("Failed to detect whether %s is a mount point: %s", where, strerror(-t));
372 /* Skip this entry if it is not a remount. */
373 if (mount_table[k].what && t > 0)
376 mkdir_p(where, 0755);
/* A failed mount only reaches the log_error() below when the entry is marked
 * fatal; the other entries are tried on a best-effort basis. */
378 if (mount(mount_table[k].what,
381 mount_table[k].flags,
382 mount_table[k].options) < 0 &&
383 mount_table[k].fatal) {
385 log_error("mount(%s) failed: %m", where);
/* Applies the bind mounts in l, which is walked in pairs (*x, *y).
 *
 * For each pair, *x is bind mounted onto dest + "/" + *y after that target
 * has been created with mkdir_p_label(). When flags is non-zero the new
 * mount is remounted with MS_REMOUNT|MS_BIND|flags; main() passes MS_RDONLY
 * for the --bind-ro list and 0 for the --bind list. */
395 static int mount_binds(const char *dest, char **l, unsigned long flags) {
398 STRV_FOREACH_PAIR(x, y, l) {
399 _cleanup_free_ char *where = NULL;
401 where = strjoin(dest, "/", *y, NULL);
405 mkdir_p_label(where, 0755);
407 if (mount(*x, where, "bind", MS_BIND, NULL) < 0) {
408 log_error("mount(%s) failed: %m", where);
412 if (flags && mount(NULL, where, NULL, MS_REMOUNT|MS_BIND|flags, NULL) < 0) {
413 log_error("mount(%s) failed: %m", where);
/* Tries to point dest/etc/localtime at the zone that the host's
 * /etc/localtime symlink points to.
 *
 * The host link target must start with "../usr/share/zoneinfo/" or
 * "/usr/share/zoneinfo/"; the remainder (z) names the zone. Unusable host
 * state and a zone file missing in the container are reported with
 * log_warning(); a failing symlink() is reported with log_error(). */
421 static int setup_timezone(const char *dest) {
422 _cleanup_free_ char *where = NULL, *p = NULL, *q = NULL, *check = NULL, *what = NULL;
428 /* Fix the timezone, if possible */
429 r = readlink_malloc("/etc/localtime", &p);
431 log_warning("/etc/localtime is not a symlink, not updating container timezone.");
435 z = path_startswith(p, "../usr/share/zoneinfo/");
437 z = path_startswith(p, "/usr/share/zoneinfo/");
439 log_warning("/etc/localtime does not point into /usr/share/zoneinfo/, not updating container timezone.");
443 where = strappend(dest, "/etc/localtime");
/* Read the container's current link and extract its zone name (y) the same
 * way as was done for the host. */
447 r = readlink_malloc(where, &q);
449 y = path_startswith(q, "../usr/share/zoneinfo/");
451 y = path_startswith(q, "/usr/share/zoneinfo/");
454 /* Already pointing to the right place? Then do nothing .. */
455 if (y && streq(y, z))
/* The zone file has to exist inside the container tree. */
459 check = strjoin(dest, "/usr/share/zoneinfo/", z, NULL);
463 if (access(check, F_OK) < 0) {
464 log_warning("Timezone %s does not exist in container, not updating container timezone.", z);
/* The new link is always written in the relative form. */
468 what = strappend("../usr/share/zoneinfo/", z);
473 if (symlink(what, where) < 0) {
474 log_error("Failed to correct timezone of container: %m");
/* Makes the host's /etc/resolv.conf visible at dest/etc/resolv.conf by
 * bind mounting it; if that works, a read-only remount is attempted. The
 * results of both mount() calls are otherwise ignored.
 *
 * The arg_private_network check comes first; the statement it guards is not
 * visible here (presumably an early return, confirm). */
481 static int setup_resolv_conf(const char *dest) {
486 if (arg_private_network)
489 /* Fix resolv.conf, if possible */
490 where = strappend(dest, "/etc/resolv.conf");
494 /* We don't really care for the results of this really. If it
495 * fails, it fails, but meh... */
496 if (mount("/etc/resolv.conf", where, "bind", MS_BIND, NULL) >= 0)
497 mount("/etc/resolv.conf", where, "bind", MS_BIND|MS_REMOUNT|MS_RDONLY, NULL);
/* Gives the container its own boot ID.
 *
 * A random ID from sd_id128_randomize() is formatted as a UUID string,
 * written to dest/dev/proc-sys-kernel-random-boot-id, and that file is bind
 * mounted over dest/proc/sys/kernel/random/boot_id. A failure of the bind
 * mount is logged as an error; a failure of the read-only remount that
 * follows is only logged as a warning. */
504 static int setup_boot_id(const char *dest) {
505 char _cleanup_free_ *from = NULL, *to = NULL;
512 /* Generate a new randomized boot ID, so that each boot-up of
513 * the container gets a new one */
515 from = strappend(dest, "/dev/proc-sys-kernel-random-boot-id");
516 to = strappend(dest, "/proc/sys/kernel/random/boot_id");
520 r = sd_id128_randomize(&rnd);
522 log_error("Failed to generate random boot id: %s", strerror(-r));
/* 8-4-4-4-12 hex digit layout. */
526 snprintf(as_uuid, sizeof(as_uuid),
527 "%02x%02x%02x%02x-%02x%02x-%02x%02x-%02x%02x-%02x%02x%02x%02x%02x%02x",
528 SD_ID128_FORMAT_VAL(rnd));
529 char_array_0(as_uuid);
531 r = write_string_file(from, as_uuid);
533 log_error("Failed to write boot id: %s", strerror(-r));
537 if (mount(from, to, "bind", MS_BIND, NULL) < 0) {
538 log_error("Failed to bind mount boot id: %m");
540 } else if (mount(from, to, "bind", MS_BIND|MS_REMOUNT|MS_RDONLY, NULL))
541 log_warning("Failed to make boot id read-only: %m");
/* Recreates selected host device nodes below dest/dev.
 *
 * For every name d in devnodes (the list contents are not visible here) the
 * host node /dev/d is stat()ed and a node with the same mode and device
 * number is created at dest/dev/d with mknod(). A stat() failure is logged
 * only when errno is not ENOENT. A host path that is neither a character
 * nor a block device is rejected with an error. */
547 static int copy_devnodes(const char *dest) {
549 static const char devnodes[] =
559 mode_t _cleanup_umask_ u;
565 NULSTR_FOREACH(d, devnodes) {
567 char _cleanup_free_ *from = NULL, *to = NULL;
/* The asprintf() results are not checked on these two lines. Any check of
 * from/to would be on lines not visible here. */
569 asprintf(&from, "/dev/%s", d);
570 asprintf(&to, "%s/dev/%s", dest, d);
581 if (stat(from, &st) < 0) {
583 if (errno != ENOENT) {
584 log_error("Failed to stat %s: %m", from);
589 } else if (!S_ISCHR(st.st_mode) && !S_ISBLK(st.st_mode)) {
591 log_error("%s is not a char or block device, cannot copy", from);
595 } else if (mknod(to, st.st_mode, st.st_rdev) < 0) {
/* NOTE(review): the message prints dest (the container root) although the
 * path handed to mknod() is to; the log line probably should name to. */
597 log_error("mknod(%s) failed: %m", dest);
/* Creates dest/dev/ptmx as a symlink with the relative target "pts/ptmx"
 * and logs an error if symlink() fails. */
606 static int setup_ptmx(const char *dest) {
607 _cleanup_free_ char *p = NULL;
609 p = strappend(dest, "/dev/ptmx");
613 if (symlink("pts/ptmx", p) < 0) {
614 log_error("Failed to create /dev/ptmx symlink: %m");
/* Makes the tty named by console available as dest/dev/console.
 *
 * console must be a character device. Its access mode is set to 0600 with
 * owner and group 0 via chmod_and_chown(). A device node with mode 0600 and
 * the same device number is then created at dest/dev/console, and console
 * is bind mounted on top of that node. */
621 static int setup_dev_console(const char *dest, const char *console) {
623 char _cleanup_free_ *to = NULL;
625 mode_t _cleanup_umask_ u;
632 if (stat(console, &st) < 0) {
633 log_error("Failed to stat %s: %m", console);
636 } else if (!S_ISCHR(st.st_mode)) {
637 log_error("/dev/console is not a char device");
641 r = chmod_and_chown(console, 0600, 0, 0);
643 log_error("Failed to correct access mode for TTY: %s", strerror(-r));
647 if (asprintf(&to, "%s/dev/console", dest) < 0)
650 /* We need to bind mount the right tty to /dev/console since
651 * ptys can only exist on pts file systems. To have something
652 * to bind mount things on we create a device node first, that
653 * has the right major/minor (note that the major minor
654 * doesn't actually matter here, since we mount it over
657 if (mknod(to, (st.st_mode & ~07777) | 0600, st.st_rdev) < 0) {
658 log_error("mknod() for /dev/console failed: %m");
662 if (mount(console, to, "bind", MS_BIND, NULL) < 0) {
663 log_error("Bind mount for /dev/console failed: %m");
// Sets up a FIFO that stands in for the kernel log inside the container.
//
// A FIFO is created at dest/dev/kmsg, given mode 0600 and owner/group 0,
// bind mounted onto dest/proc/kmsg and opened. The open fd is then sent over
// kmsg_socket as SCM_RIGHTS ancillary data.
670 static int setup_kmsg(const char *dest, int kmsg_socket) {
671 char _cleanup_free_ *from = NULL, *to = NULL;
673 mode_t _cleanup_umask_ u;
675 struct cmsghdr cmsghdr;
676 uint8_t buf[CMSG_SPACE(sizeof(int))];
679 .msg_control = &control,
680 .msg_controllen = sizeof(control),
682 struct cmsghdr *cmsg;
685 assert(kmsg_socket >= 0);
689 /* We create the kmsg FIFO as /dev/kmsg, but immediately
690 * delete it after bind mounting it to /proc/kmsg. While FIFOs
691 * on the reading side behave very similar to /proc/kmsg,
692 * their writing side behaves differently from /dev/kmsg in
693 * that writing blocks when nothing is reading. In order to
694 * avoid any problems with containers deadlocking due to this
695 * we simply make /dev/kmsg unavailable to the container. */
696 if (asprintf(&from, "%s/dev/kmsg", dest) < 0 ||
697 asprintf(&to, "%s/proc/kmsg", dest) < 0)
700 if (mkfifo(from, 0600) < 0) {
701 log_error("mkfifo() for /dev/kmsg failed: %m");
705 r = chmod_and_chown(from, 0600, 0, 0);
707 log_error("Failed to correct access mode for /dev/kmsg: %s", strerror(-r));
711 if (mount(from, to, "bind", MS_BIND, NULL) < 0) {
712 log_error("Bind mount for /proc/kmsg failed: %m");
716 fd = open(from, O_RDWR|O_NDELAY|O_CLOEXEC);
718 log_error("Failed to open fifo: %m");
// Build a single control message carrying the fd and shrink msg_controllen
// to the length actually used.
722 cmsg = CMSG_FIRSTHDR(&mh);
723 cmsg->cmsg_level = SOL_SOCKET;
724 cmsg->cmsg_type = SCM_RIGHTS;
725 cmsg->cmsg_len = CMSG_LEN(sizeof(int));
726 memcpy(CMSG_DATA(cmsg), &fd, sizeof(int));
728 mh.msg_controllen = cmsg->cmsg_len;
730 /* Store away the fd in the socket, so that it stays open as
731 * long as we run the child */
732 k = sendmsg(kmsg_socket, &mh, MSG_DONTWAIT|MSG_NOSIGNAL);
// The local fd is closed straight after sendmsg(), before k is examined.
733 close_nointr_nofail(fd);
736 log_error("Failed to send FIFO fd: %m");
740 /* And now make the FIFO unavailable as /dev/kmsg... */
/* Sets the host name from the last path component of arg_directory: that
 * component is taken with path_get_file_name(), passed through
 * hostname_cleanup() and handed to sethostname().
 * NOTE(review): whether hn is a private copy at the point hostname_cleanup()
 * is called cannot be seen here (lines between are missing); confirm. */
745 static int setup_hostname(void) {
749 hn = path_get_file_name(arg_directory);
755 hostname_cleanup(hn);
758 if (sethostname(hn, strlen(hn)) < 0)
/* Connects the journal directory of the container with the one of the host,
 * depending on arg_link_journal.
 *
 * The machine ID is read from directory/etc/machine-id and validated with
 * sd_id128_from_string(). From then on p is the host side path
 * /var/log/journal/<id> and q is the container side path
 * directory/var/log/journal/<id>. The assignment of id is not visible here;
 * presumably it is the line read into b.
 *
 * LINK_GUEST: p becomes a symlink to q.
 * LINK_HOST:  p is created and bind mounted onto q.
 * LINK_AUTO:  several failure cases are tolerated instead of reported; see
 *             the checks against LINK_AUTO below. */
767 static int setup_journal(const char *directory) {
768 sd_id128_t machine_id;
769 char _cleanup_free_ *p = NULL, *b = NULL, *q = NULL, *d = NULL;
773 if (arg_link_journal == LINK_NO)
776 p = strappend(directory, "/etc/machine-id");
780 r = read_one_line_file(p, &b);
781 if (r == -ENOENT && arg_link_journal == LINK_AUTO)
784 log_error("Failed to read machine ID from %s: %s", p, strerror(-r));
789 if (isempty(id) && arg_link_journal == LINK_AUTO)
792 /* Verify validity */
793 r = sd_id128_from_string(id, &machine_id);
795 log_error("Failed to parse machine ID from %s: %s", p, strerror(-r));
/* p is reused here: from now on it names the host side journal directory. */
800 p = strappend("/var/log/journal/", id);
801 q = strjoin(directory, "/var/log/journal/", id, NULL);
/* Existing mount points on either side are an error unless the mode is
 * LINK_AUTO. */
805 if (path_is_mount_point(p, false) > 0) {
806 if (arg_link_journal != LINK_AUTO) {
807 log_error("%s: already a mount point, refusing to use for journal", p);
814 if (path_is_mount_point(q, false) > 0) {
815 if (arg_link_journal != LINK_AUTO) {
816 log_error("%s: already a mount point, refusing to use for journal", q);
/* Inspect what currently exists at p. The branches below distinguish a
 * readable symlink, -EINVAL (presumably: p exists but is not a symlink),
 * and errors other than -ENOENT. */
823 r = readlink_and_make_absolute(p, &d);
825 if ((arg_link_journal == LINK_GUEST ||
826 arg_link_journal == LINK_AUTO) &&
829 r = mkdir_p(q, 0755);
831 log_warning("failed to create directory %s: %m", q);
836 log_error("Failed to remove symlink %s: %m", p);
839 } else if (r == -EINVAL) {
841 if (arg_link_journal == LINK_GUEST &&
844 if (errno == ENOTDIR) {
845 log_error("%s already exists and is neither a symlink nor a directory", p);
848 log_error("Failed to remove %s: %m", p);
852 } else if (r != -ENOENT) {
853 log_error("readlink(%s) failed: %m", p);
/* Guest mode: the host path becomes a symlink into the container tree. */
857 if (arg_link_journal == LINK_GUEST) {
859 if (symlink(q, p) < 0) {
860 log_error("Failed to symlink %s to %s: %m", q, p);
864 r = mkdir_p(q, 0755);
866 log_warning("failed to create directory %s: %m", q);
/* Host mode creates p; in the remaining mode p is only probed with
 * access(). */
870 if (arg_link_journal == LINK_HOST) {
871 r = mkdir_p(p, 0755);
873 log_error("Failed to create %s: %m", p);
877 } else if (access(p, F_OK) < 0)
/* dir_is_empty() returning 0 is treated as "q has content" and refused. */
880 if (dir_is_empty(q) == 0) {
881 log_error("%s not empty.", q);
885 r = mkdir_p(q, 0755);
887 log_error("Failed to create %s: %m", q);
891 if (mount(p, q, "bind", MS_BIND, NULL) < 0) {
892 log_error("Failed to bind mount journal from host into guest: %m");
/* Passes the complement of arg_retain, i.e. every capability bit that is not
 * in the retain mask, to capability_bounding_set_drop() and returns that
 * function's result unchanged. */
899 static int drop_capabilities(void) {
900 return capability_bounding_set_drop(~arg_retain, false);
/* Heuristic check whether path is the root of an OS tree, using path/bin/sh
 * as the marker. The visible return statement yields 0 when r is negative
 * and 1 otherwise. The call that sets r is not visible here (presumably an
 * existence check on p); main() treats any result <= 0 as "not an OS
 * tree". */
903 static int is_os_tree(const char *path) {
906 /* We use /bin/sh as flag file if something is an OS */
908 if (asprintf(&p, "%s/bin/sh", path) < 0)
914 return r < 0 ? 0 : 1;
/* Relays data between the caller's stdin/stdout and the pty master, and
 * reacts to signals delivered through a signalfd built from mask.
 *
 * in_buffer carries bytes read from stdin that still have to be written to
 * master; out_buffer carries bytes read from master that still have to be
 * written to stdout. The data fds are registered with EPOLLET, so readiness
 * is remembered in the *_readable / *_writable flags. A flag is cleared
 * again when the matching read()/write() fails with EAGAIN, EPIPE,
 * ECONNRESET or EIO.
 *
 * master: pty master fd
 * pid:    process that receives SIGRTMIN+3 on the first SIGTERM when
 *         arg_boot is set
 * mask:   signal set handed to signalfd() */
917 static int process_pty(int master, pid_t pid, sigset_t *mask) {
919 char in_buffer[LINE_MAX], out_buffer[LINE_MAX];
920 size_t in_buffer_full = 0, out_buffer_full = 0;
921 struct epoll_event stdin_ev, stdout_ev, master_ev, signal_ev;
922 bool stdin_readable = false, stdout_writable = false, master_readable = false, master_writable = false;
923 int ep = -1, signal_fd = -1, r;
924 bool tried_orderly_shutdown = false;
930 fd_nonblock(STDIN_FILENO, 1);
931 fd_nonblock(STDOUT_FILENO, 1);
932 fd_nonblock(master, 1);
934 signal_fd = signalfd(-1, mask, SFD_NONBLOCK|SFD_CLOEXEC);
936 log_error("signalfd(): %m");
941 ep = epoll_create1(EPOLL_CLOEXEC);
943 log_error("Failed to create epoll: %m");
948 /* We read from STDIN only if this is actually a TTY,
949 * otherwise we assume non-interactivity. */
950 if (isatty(STDIN_FILENO)) {
952 stdin_ev.events = EPOLLIN|EPOLLET;
953 stdin_ev.data.fd = STDIN_FILENO;
955 if (epoll_ctl(ep, EPOLL_CTL_ADD, STDIN_FILENO, &stdin_ev) < 0) {
956 log_error("Failed to register STDIN in epoll: %m");
963 stdout_ev.events = EPOLLOUT|EPOLLET;
964 stdout_ev.data.fd = STDOUT_FILENO;
967 master_ev.events = EPOLLIN|EPOLLOUT|EPOLLET;
968 master_ev.data.fd = master;
/* The signal fd is the only one registered without EPOLLET. */
971 signal_ev.events = EPOLLIN;
972 signal_ev.data.fd = signal_fd;
974 if (epoll_ctl(ep, EPOLL_CTL_ADD, STDOUT_FILENO, &stdout_ev) < 0) {
975 if (errno != EPERM) {
976 log_error("Failed to register stdout in epoll: %m");
980 /* stdout without epoll support. Likely redirected to regular file. */
981 stdout_writable = true;
984 if (epoll_ctl(ep, EPOLL_CTL_ADD, master, &master_ev) < 0 ||
985 epoll_ctl(ep, EPOLL_CTL_ADD, signal_fd, &signal_ev) < 0) {
986 log_error("Failed to register fds in epoll: %m");
992 struct epoll_event ev[16];
/* Block without timeout; EINTR and EAGAIN are not treated as errors. */
996 nfds = epoll_wait(ep, ev, ELEMENTSOF(ev), -1);
999 if (errno == EINTR || errno == EAGAIN)
1002 log_error("epoll_wait(): %m");
/* Translate the reported events into the readiness flags. EPOLLHUP also
 * sets the flag, so the next read()/write() observes the hangup. */
1009 for (i = 0; i < nfds; i++) {
1010 if (ev[i].data.fd == STDIN_FILENO) {
1012 if (ev[i].events & (EPOLLIN|EPOLLHUP))
1013 stdin_readable = true;
1015 } else if (ev[i].data.fd == STDOUT_FILENO) {
1017 if (ev[i].events & (EPOLLOUT|EPOLLHUP))
1018 stdout_writable = true;
1020 } else if (ev[i].data.fd == master) {
1022 if (ev[i].events & (EPOLLIN|EPOLLHUP))
1023 master_readable = true;
1025 if (ev[i].events & (EPOLLOUT|EPOLLHUP))
1026 master_writable = true;
1028 } else if (ev[i].data.fd == signal_fd) {
1029 struct signalfd_siginfo sfsi;
1032 n = read(signal_fd, &sfsi, sizeof(sfsi));
1033 if (n != sizeof(sfsi)) {
1036 log_error("Failed to read from signalfd: invalid block size");
1041 if (errno != EINTR && errno != EAGAIN) {
1042 log_error("Failed to read from signalfd: %m");
1048 if (sfsi.ssi_signo == SIGWINCH) {
1051 /* The window size changed, let's forward that. */
1052 if (ioctl(STDIN_FILENO, TIOCGWINSZ, &ws) >= 0)
1053 ioctl(master, TIOCSWINSZ, &ws);
/* Only the first SIGTERM in boot mode takes this branch;
 * tried_orderly_shutdown keeps later ones out of it. */
1054 } else if (sfsi.ssi_signo == SIGTERM && arg_boot && !tried_orderly_shutdown) {
1056 log_info("Trying to halt container. Send SIGTERM again to trigger immediate termination.");
1058 /* This only works for systemd... */
1059 tried_orderly_shutdown = true;
1060 kill(pid, SIGRTMIN+3);
/* Keep shovelling as long as any direction can make progress.
 * NOTE(review): the buffer fill counters are size_t, so the "<= 0" tests
 * are equivalent to "== 0". */
1070 while ((stdin_readable && in_buffer_full <= 0) ||
1071 (master_writable && in_buffer_full > 0) ||
1072 (master_readable && out_buffer_full <= 0) ||
1073 (stdout_writable && out_buffer_full > 0)) {
/* stdin -> in_buffer */
1075 if (stdin_readable && in_buffer_full < LINE_MAX) {
1077 k = read(STDIN_FILENO, in_buffer + in_buffer_full, LINE_MAX - in_buffer_full);
1080 if (errno == EAGAIN || errno == EPIPE || errno == ECONNRESET || errno == EIO)
1081 stdin_readable = false;
1083 log_error("read(): %m");
1088 in_buffer_full += (size_t) k;
/* in_buffer -> master; after a short write the unwritten tail is moved to
 * the front of the buffer. */
1091 if (master_writable && in_buffer_full > 0) {
1093 k = write(master, in_buffer, in_buffer_full);
1096 if (errno == EAGAIN || errno == EPIPE || errno == ECONNRESET || errno == EIO)
1097 master_writable = false;
1099 log_error("write(): %m");
1105 assert(in_buffer_full >= (size_t) k);
1106 memmove(in_buffer, in_buffer + k, in_buffer_full - k);
1107 in_buffer_full -= k;
/* master -> out_buffer */
1111 if (master_readable && out_buffer_full < LINE_MAX) {
1113 k = read(master, out_buffer + out_buffer_full, LINE_MAX - out_buffer_full);
1116 if (errno == EAGAIN || errno == EPIPE || errno == ECONNRESET || errno == EIO)
1117 master_readable = false;
1119 log_error("read(): %m");
1124 out_buffer_full += (size_t) k;
/* out_buffer -> stdout, with the same tail handling as above. */
1127 if (stdout_writable && out_buffer_full > 0) {
1129 k = write(STDOUT_FILENO, out_buffer, out_buffer_full);
1132 if (errno == EAGAIN || errno == EPIPE || errno == ECONNRESET || errno == EIO)
1133 stdout_writable = false;
1135 log_error("write(): %m");
1141 assert(out_buffer_full >= (size_t) k);
1142 memmove(out_buffer, out_buffer + k, out_buffer_full - k);
1143 out_buffer_full -= k;
1151 close_nointr_nofail(ep);
1154 close_nointr_nofail(signal_fd);
/* Entry point: validates the environment, moves into a new cgroup, allocates
 * a pty, clones a child into new namespaces, and relays the child's console
 * until it terminates.
 *
 * The child sets up the container file system below arg_directory, switches
 * its root there, adjusts credentials and the environment, and executes
 * either an init binary, the command given on the command line, or
 * /bin/bash. The parent runs process_pty(), waits for the child and maps its
 * exit status to log messages. */
1159 int main(int argc, char *argv[]) {
1161 int r = EXIT_FAILURE, k;
1162 char *oldcg = NULL, *newcg = NULL;
1163 char **controller = NULL;
1164 int master = -1, n_fd_passed;
1165 const char *console = NULL;
1166 struct termios saved_attr, raw_attr;
1168 bool saved_attr_valid = false;
1170 int kmsg_socket_pair[2] = { -1, -1 };
1173 log_parse_environment();
1176 r = parse_argv(argc, argv);
/* A directory given on the command line is made absolute; the call at
 * original line 1187 uses the current working directory instead (presumably
 * the fallback when no directory was given; the else line is not
 * visible). */
1180 if (arg_directory) {
1183 p = path_make_absolute_cwd(arg_directory);
1184 free(arg_directory);
1187 arg_directory = get_current_dir_name();
1189 if (!arg_directory) {
1190 log_error("Failed to determine path");
1194 path_kill_slashes(arg_directory);
/* Preconditions: effective uid 0, sd_booted() > 0, the directory is not "/"
 * and passes is_os_tree(). */
1196 if (geteuid() != 0) {
1197 log_error("Need to be root.");
1201 if (sd_booted() <= 0) {
1202 log_error("Not running on a systemd system.");
1206 if (path_equal(arg_directory, "/")) {
1207 log_error("Spawning container on root directory not supported.");
1211 if (is_os_tree(arg_directory) <= 0) {
1212 log_error("Directory %s doesn't look like an OS root directory. Refusing.", arg_directory);
/* File descriptors received via socket activation are collected in fds so
 * that they can be handed on to the container payload further down. */
1217 n_fd_passed = sd_listen_fds(false);
1218 if (n_fd_passed > 0) {
1219 k = fdset_new_listen_fds(&fds, false);
1221 log_error("Failed to collect file descriptors: %s", strerror(-k));
1225 fdset_close_others(fds);
/* Create a child cgroup named <current>/nspawn-<pid> and attach to it.
 * Failures for the additional controllers are only warnings. */
1228 k = cg_get_by_pid(SYSTEMD_CGROUP_CONTROLLER, 0, &oldcg);
1230 log_error("Failed to determine current cgroup: %s", strerror(-k));
1234 if (asprintf(&newcg, "%s/nspawn-%lu", oldcg, (unsigned long) getpid()) < 0) {
1235 log_error("Failed to allocate cgroup path.");
1239 k = cg_create_and_attach(SYSTEMD_CGROUP_CONTROLLER, newcg, 0);
1241 log_error("Failed to create cgroup: %s", strerror(-k));
1245 STRV_FOREACH(controller, arg_controllers) {
1246 k = cg_create_and_attach(*controller, newcg, 0);
1248 log_warning("Failed to create cgroup in controller %s: %s", *controller, strerror(-k));
/* Allocate the pty whose slave side becomes the container console. */
1251 master = posix_openpt(O_RDWR|O_NOCTTY|O_CLOEXEC|O_NDELAY);
1253 log_error("Failed to acquire pseudo tty: %m");
1257 console = ptsname(master);
1259 log_error("Failed to determine tty name: %m");
1263 log_info("Spawning namespace container on %s (console is %s).", arg_directory, console);
/* Copy the current window size of stdin to the pty, if it can be read. */
1265 if (ioctl(STDIN_FILENO, TIOCGWINSZ, &ws) >= 0)
1266 ioctl(master, TIOCSWINSZ, &ws);
1268 if (unlockpt(master) < 0) {
1269 log_error("Failed to unlock tty: %m");
/* Remember the terminal attributes of stdin so that they can be restored
 * later, and prepare a raw variant with echo disabled. */
1273 if (tcgetattr(STDIN_FILENO, &saved_attr) >= 0) {
1274 saved_attr_valid = true;
1276 raw_attr = saved_attr;
1277 cfmakeraw(&raw_attr);
1278 raw_attr.c_lflag &= ~ECHO;
1281 if (socketpair(AF_UNIX, SOCK_DGRAM|SOCK_NONBLOCK|SOCK_CLOEXEC, 0, kmsg_socket_pair) < 0) {
1282 log_error("Failed to create kmsg socket pair");
/* These signals are blocked here; the same mask is later handed to
 * process_pty(), which reads them through a signalfd. */
1286 assert_se(sigemptyset(&mask) == 0);
1287 sigset_add_many(&mask, SIGCHLD, SIGWINCH, SIGTERM, SIGINT, -1);
1288 assert_se(sigprocmask(SIG_BLOCK, &mask, NULL) == 0);
1294 if (pipe2(pipefd, O_NONBLOCK|CLOEXEC) < 0) {
1295 log_error("pipe2(): %m");
/* Raw clone() with new IPC, mount, PID and UTS namespaces, plus a new
 * network namespace when --private-network was given. */
1299 pid = syscall(__NR_clone, SIGCHLD|CLONE_NEWIPC|CLONE_NEWNS|CLONE_NEWPID|CLONE_NEWUTS|(arg_private_network ? CLONE_NEWNET : 0), NULL);
1301 if (errno == EINVAL)
1302 log_error("clone() failed, do you have namespace support enabled in your kernel? (You need UTS, IPC, PID and NET namespacing built in): %m");
1304 log_error("clone() failed: %m");
/* From here on: presumably the child branch (the pid test is not visible). */
1311 const char *home = NULL;
1312 uid_t uid = (uid_t) -1;
1313 gid_t gid = (gid_t) -1;
1315 const char *envp[] = {
1316 "PATH=/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin",
1317 "container=systemd-nspawn", /* LXC sets container=lxc, so follow the scheme here */
1322 NULL, /* container_uuid */
1323 NULL, /* LISTEN_FDS */
1324 NULL, /* LISTEN_PID */
1328 envp[n_env] = strv_find_prefix(environ, "TERM=");
/* Wait for POLLHUP on the read end of the pipe. The parent closes both of
 * its pipe ends right after logging the child PID. */
1332 close_nointr_nofail(pipefd[1]);
1333 fd_wait_for_event(pipefd[0], POLLHUP, -1);
1334 close_nointr_nofail(pipefd[0]);
1336 close_nointr_nofail(master);
1339 if (saved_attr_valid) {
1340 if (tcsetattr(STDIN_FILENO, TCSANOW, &raw_attr) < 0) {
1341 log_error("Failed to set terminal attributes: %m");
/* Replace stdin/stdout/stderr by the container console. */
1346 close_nointr(STDIN_FILENO);
1347 close_nointr(STDOUT_FILENO);
1348 close_nointr(STDERR_FILENO);
1350 close_nointr_nofail(kmsg_socket_pair[0]);
1351 kmsg_socket_pair[0] = -1;
1353 reset_all_signal_handlers();
1355 assert_se(sigemptyset(&mask) == 0);
1356 assert_se(sigprocmask(SIG_SETMASK, &mask, NULL) == 0);
/* With fd 0 closed just above, the terminal is expected to be opened as
 * STDIN_FILENO; any other result is handled as a failure. */
1358 k = open_terminal(console, O_RDWR);
1359 if (k != STDIN_FILENO) {
1361 close_nointr_nofail(k);
1365 log_error("Failed to open console: %s", strerror(-k));
1369 if (dup2(STDIN_FILENO, STDOUT_FILENO) != STDOUT_FILENO ||
1370 dup2(STDIN_FILENO, STDERR_FILENO) != STDERR_FILENO) {
1371 log_error("Failed to duplicate console: %m");
1376 log_error("setsid() failed: %m");
/* Ask the kernel to send SIGKILL to this process when its parent dies. */
1380 if (prctl(PR_SET_PDEATHSIG, SIGKILL) < 0) {
1381 log_error("PR_SET_PDEATHSIG failed: %m");
1385 /* Mark everything as slave, so that we still
1386 * receive mounts from the real root, but don't
1387 * propagate mounts to the real root. */
1388 if (mount(NULL, "/", NULL, MS_SLAVE|MS_REC, NULL) < 0) {
1389 log_error("MS_SLAVE|MS_REC failed: %m");
1393 /* Turn directory into bind mount */
1394 if (mount(arg_directory, arg_directory, "bind", MS_BIND|MS_REC, NULL) < 0) {
1395 log_error("Failed to make bind mount.");
1400 if (mount(arg_directory, arg_directory, "bind", MS_BIND|MS_REMOUNT|MS_RDONLY|MS_REC, NULL) < 0) {
1401 log_error("Failed to make read-only.");
/* Populate the container tree. Every step except dev_setup() is checked for
 * a negative result. */
1405 if (mount_all(arg_directory) < 0)
1408 if (copy_devnodes(arg_directory) < 0)
1411 if (setup_ptmx(arg_directory) < 0)
1414 dev_setup(arg_directory);
1416 if (setup_dev_console(arg_directory, console) < 0)
1419 if (setup_kmsg(arg_directory, kmsg_socket_pair[1]) < 0)
1422 close_nointr_nofail(kmsg_socket_pair[1]);
1423 kmsg_socket_pair[1] = -1;
1425 if (setup_boot_id(arg_directory) < 0)
1428 if (setup_timezone(arg_directory) < 0)
1431 if (setup_resolv_conf(arg_directory) < 0)
1434 if (setup_journal(arg_directory) < 0)
1437 if (mount_binds(arg_directory, arg_bind, 0) < 0)
1440 if (mount_binds(arg_directory, arg_bind_ro, MS_RDONLY) < 0)
/* Switch the root: enter the directory, move its mount onto "/", chroot
 * into it and reset the working directory. */
1443 if (chdir(arg_directory) < 0) {
1444 log_error("chdir(%s) failed: %m", arg_directory);
1448 if (mount(arg_directory, "/", NULL, MS_MOVE, NULL) < 0) {
1449 log_error("mount(MS_MOVE) failed: %m");
1453 if (chroot(".") < 0) {
1454 log_error("chroot() failed: %m");
1458 if (chdir("/") < 0) {
1459 log_error("chdir() failed: %m");
1467 if (drop_capabilities() < 0) {
1468 log_error("drop_capabilities() failed: %m");
1474 /* Note that this resolves user names
1475 * inside the container, and hence
1476 * accesses the NSS modules from the
1477 * container and not the host. This is
1480 if (get_user_creds((const char**)&arg_user, &uid, &gid, &home, NULL) < 0) {
1481 log_error("get_user_creds() failed: %m");
1485 if (mkdir_parents_label(home, 0775) < 0) {
1486 log_error("mkdir_parents_label() failed: %m");
1490 if (mkdir_safe_label(home, 0775, uid, gid) < 0) {
1491 log_error("mkdir_safe_label() failed: %m");
1495 if (initgroups((const char*)arg_user, gid) < 0) {
1496 log_error("initgroups() failed: %m");
1500 if (setresgid(gid, gid, gid) < 0) {
1501 log_error("setregid() failed: %m");
1505 if (setresuid(uid, uid, uid) < 0) {
1506 log_error("setreuid() failed: %m");
1510 /* Reset everything fully to 0, just in case */
1512 if (setgroups(0, NULL) < 0) {
1513 log_error("setgroups() failed: %m");
/* NOTE(review): here and in the per-user branch the calls are setresgid() /
 * setresuid(), but the messages name setregid() / setreuid(). */
1517 if (setresgid(0, 0, 0) < 0) {
1518 log_error("setregid() failed: %m");
1522 if (setresuid(0, 0, 0) < 0) {
1523 log_error("setreuid() failed: %m");
/* Fill the free slots of envp; n_env is advanced for every entry. */
1528 if ((asprintf((char**)(envp + n_env++), "HOME=%s", home ? home: "/root") < 0) ||
1529 (asprintf((char**)(envp + n_env++), "USER=%s", arg_user ? arg_user : "root") < 0) ||
1530 (asprintf((char**)(envp + n_env++), "LOGNAME=%s", arg_user ? arg_user : "root") < 0)) {
1536 if (asprintf((char**)(envp + n_env++), "container_uuid=%s", arg_uuid) < 0) {
/* Passed-in fds have their close-on-exec flag cleared so they survive the
 * exec, and are announced through LISTEN_FDS / LISTEN_PID. */
1542 if (fdset_size(fds) > 0) {
1543 k = fdset_cloexec(fds, false);
1545 log_error("Failed to unset O_CLOEXEC for file descriptors.");
1549 if ((asprintf((char **)(envp + n_env++), "LISTEN_FDS=%u", n_fd_passed) < 0) ||
1550 (asprintf((char **)(envp + n_env++), "LISTEN_PID=%lu", (unsigned long) getpid()) < 0)) {
1562 /* Automatically search for the init system */
/* a[0] is reserved for the init path. The memcpy() copies l pointers
 * starting at argv[optind], i.e. the remaining arguments plus the
 * terminating NULL of argv. The candidates are tried in order; each
 * execve() only returns on failure. */
1564 l = 1 + argc - optind;
1565 a = newa(char*, l + 1);
1566 memcpy(a + 1, argv + optind, l * sizeof(char*));
1568 a[0] = (char*) "/usr/lib/systemd/systemd";
1569 execve(a[0], a, (char**) envp);
1571 a[0] = (char*) "/lib/systemd/systemd";
1572 execve(a[0], a, (char**) envp);
1574 a[0] = (char*) "/sbin/init";
1575 execve(a[0], a, (char**) envp);
1576 } else if (argc > optind)
1577 execvpe(argv[optind], argv + optind, (char**) envp);
/* No command given: run bash as a login shell (argv[0] is "-bash") from the
 * home directory. */
1579 chdir(home ? home : "/root");
1580 execle("/bin/bash", "-bash", NULL, (char**) envp);
1583 log_error("execv() failed: %m");
1586 _exit(EXIT_FAILURE);
/* Parent side: closing both pipe ends produces the hangup the child is
 * waiting for. */
1589 log_info("Init process in the container running as PID %lu.", (unsigned long) pid);
1590 close_nointr_nofail(pipefd[0]);
1591 close_nointr_nofail(pipefd[1]);
1596 if (process_pty(master, pid, &mask) < 0)
1599 if (saved_attr_valid)
1600 tcsetattr(STDIN_FILENO, TCSANOW, &saved_attr);
1602 r = wait_for_terminate(pid, &status);
/* Map the child's termination status to log messages. A non-zero exit code
 * is copied into r. Termination by SIGINT is logged as a shutdown and by
 * SIGHUP as a reboot. */
1608 if (status.si_code == CLD_EXITED) {
1609 if (status.si_status != 0) {
1610 log_error("Container failed with error code %i.", status.si_status);
1611 r = status.si_status;
1615 log_debug("Container exited successfully.");
1617 } else if (status.si_code == CLD_KILLED &&
1618 status.si_status == SIGINT) {
1619 log_info("Container has been shut down.");
1622 } else if (status.si_code == CLD_KILLED &&
1623 status.si_status == SIGHUP) {
1624 log_info("Container is being rebooted.");
1626 } else if (status.si_code == CLD_KILLED ||
1627 status.si_code == CLD_DUMPED) {
1629 log_error("Container terminated by signal %s.", signal_to_string(status.si_status));
1633 log_error("Container failed due to unknown reason.");
/* Cleanup: restore the terminal, close the pty and the socket pair, move
 * back to the original cgroup and kill whatever is left in the container's
 * cgroup. */
1640 if (saved_attr_valid)
1641 tcsetattr(STDIN_FILENO, TCSANOW, &saved_attr);
1644 close_nointr_nofail(master);
1646 close_pipe(kmsg_socket_pair);
1649 cg_attach(SYSTEMD_CGROUP_CONTROLLER, oldcg, 0);
1652 cg_kill_recursive_and_wait(SYSTEMD_CGROUP_CONTROLLER, newcg, true);
1654 free(arg_directory);
1655 strv_free(arg_controllers);