1 /*-*- Mode: C; c-basic-offset: 8; indent-tabs-mode: nil -*-*/
4 This file is part of systemd.
6 Copyright 2010 Lennart Poettering
8 systemd is free software; you can redistribute it and/or modify it
9 under the terms of the GNU Lesser General Public License as published by
10 the Free Software Foundation; either version 2.1 of the License, or
11 (at your option) any later version.
13 systemd is distributed in the hope that it will be useful, but
14 WITHOUT ANY WARRANTY; without even the implied warranty of
15 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
16 Lesser General Public License for more details.
18 You should have received a copy of the GNU Lesser General Public License
19 along with systemd; If not, see <http://www.gnu.org/licenses/>.
25 #include <sys/types.h>
26 #include <sys/syscall.h>
27 #include <sys/mount.h>
33 #include <sys/prctl.h>
34 #include <sys/capability.h>
37 #include <sys/epoll.h>
39 #include <sys/signalfd.h>
43 #include <sys/socket.h>
45 #include <systemd/sd-daemon.h>
53 #include "cgroup-util.h"
55 #include "path-util.h"
56 #include "loopback-setup.h"
58 #include "dev-setup.h"
/* Journal link mode chosen with --link-journal=. The enumerators are not
 * visible in this excerpt; LINK_AUTO, LINK_NO, LINK_GUEST and LINK_HOST are
 * the values used further down. */
63 typedef enum LinkJournal {
/* Global settings. Those whose option handling is visible in this excerpt are
 * assigned in parse_argv(). */
70 static char *arg_directory = NULL;
71 static char *arg_user = NULL;
72 static char **arg_controllers = NULL;
73 static char *arg_uuid = NULL;
74 static bool arg_private_network = false;
75 static bool arg_read_only = false;
76 static bool arg_boot = false;
77 static LinkJournal arg_link_journal = LINK_AUTO;
/* Bitmask with one bit per CAP_* number: drop_capabilities() hands the
 * complement of this mask to capability_bounding_set_drop(), and the
 * --capability= option ORs further bits in. */
78 static uint64_t arg_retain =
80 (1ULL << CAP_DAC_OVERRIDE) |
81 (1ULL << CAP_DAC_READ_SEARCH) |
82 (1ULL << CAP_FOWNER) |
83 (1ULL << CAP_FSETID) |
84 (1ULL << CAP_IPC_OWNER) |
87 (1ULL << CAP_LINUX_IMMUTABLE) |
88 (1ULL << CAP_NET_BIND_SERVICE) |
89 (1ULL << CAP_NET_BROADCAST) |
90 (1ULL << CAP_NET_RAW) |
91 (1ULL << CAP_SETGID) |
92 (1ULL << CAP_SETFCAP) |
93 (1ULL << CAP_SETPCAP) |
94 (1ULL << CAP_SETUID) |
95 (1ULL << CAP_SYS_ADMIN) |
96 (1ULL << CAP_SYS_CHROOT) |
97 (1ULL << CAP_SYS_NICE) |
98 (1ULL << CAP_SYS_PTRACE) |
99 (1ULL << CAP_SYS_TTY_CONFIG) |
100 (1ULL << CAP_SYS_RESOURCE) |
101 (1ULL << CAP_SYS_BOOT) |
102 (1ULL << CAP_AUDIT_WRITE) |
103 (1ULL << CAP_AUDIT_CONTROL);
/* Prints the usage summary with printf(), using
 * program_invocation_short_name as the program name.
 *
 * NOTE(review): the text for -j says it equals --link-journal=host, while the
 * assignment in parse_argv() that appears to belong to -j stores LINK_GUEST.
 * Confirm which of the two is intended. */
105 static int help(void) {
107 printf("%s [OPTIONS...] [PATH] [ARGUMENTS...]\n\n"
108 "Spawn a minimal namespace container for debugging, testing and building.\n\n"
109 " -h --help Show this help\n"
110 " --version Print version string\n"
111 " -D --directory=NAME Root directory for the container\n"
112 " -b --boot Boot up full system (i.e. invoke init)\n"
113 " -u --user=USER Run the command under specified user or uid\n"
114 " -C --controllers=LIST Put the container in specified comma-separated cgroup hierarchies\n"
115 " --uuid=UUID Set a specific machine UUID for the container\n"
116 " --private-network Disable network in container\n"
117 " --read-only Mount the root directory read-only\n"
118 " --capability=CAP In addition to the default, retain specified capability\n"
119 " --link-journal=MODE Link up guest journal, one of no, auto, guest, host\n"
120 " -j Equivalent to --link-journal=host\n",
121 program_invocation_short_name);
/* Parses the command line with getopt_long() and records the result in the
 * arg_* globals; problems are reported through log_error(). The case labels
 * and return statements are not part of this excerpt, so the notes below that
 * name an option are inferred from the option table and the help text. */
126 static int parse_argv(int argc, char *argv[]) {
137 static const struct option options[] = {
138 { "help", no_argument, NULL, 'h' },
139 { "version", no_argument, NULL, ARG_VERSION },
140 { "directory", required_argument, NULL, 'D' },
141 { "user", required_argument, NULL, 'u' },
142 { "controllers", required_argument, NULL, 'C' },
143 { "private-network", no_argument, NULL, ARG_PRIVATE_NETWORK },
144 { "boot", no_argument, NULL, 'b' },
145 { "uuid", required_argument, NULL, ARG_UUID },
146 { "read-only", no_argument, NULL, ARG_READ_ONLY },
147 { "capability", required_argument, NULL, ARG_CAPABILITY },
148 { "link-journal", required_argument, NULL, ARG_LINK_JOURNAL },
/* A leading plus sign in the optstring makes GNU getopt stop at the first
 * non-option argument, so options meant for the container command are not
 * consumed here. */
157 while ((c = getopt_long(argc, argv, "+hD:u:C:bj", options, NULL)) >= 0) {
/* Presumably the --version case: print package string and feature list. */
166 puts(PACKAGE_STRING);
167 puts(SYSTEMD_FEATURES);
/* Presumably -D: store the canonicalized form of the given directory. */
172 arg_directory = canonicalize_file_name(optarg);
173 if (!arg_directory) {
174 log_error("Failed to canonicalize root directory.");
/* Presumably -u: keep a private copy of the user name. */
182 if (!(arg_user = strdup(optarg))) {
183 log_error("Failed to duplicate user name.");
/* Controller list: drop any earlier list, split the argument at commas and
 * pass the result through strv_uniq(). */
190 strv_free(arg_controllers);
191 arg_controllers = strv_split(optarg, ",");
192 if (!arg_controllers) {
193 log_error("Failed to split controllers list.");
196 strv_uniq(arg_controllers);
200 case ARG_PRIVATE_NETWORK:
201 arg_private_network = true;
213 arg_read_only = true;
/* Each comma-separated word is copied, looked up with cap_from_name() and,
 * on success, its bit is added to arg_retain. */
216 case ARG_CAPABILITY: {
220 FOREACH_WORD_SEPARATOR(word, length, optarg, ",", state) {
224 t = strndup(word, length);
228 if (cap_from_name(t, &cap) < 0) {
229 log_error("Failed to parse capability %s.", t);
235 arg_retain |= 1ULL << (uint64_t) cap;
/* NOTE(review): this looks like the -j handler (its case label is not
 * visible). It stores LINK_GUEST although help() describes -j as
 * --link-journal=host. Confirm. */
242 arg_link_journal = LINK_GUEST;
/* Map the mode string to the LinkJournal value; anything else is an error. */
245 case ARG_LINK_JOURNAL:
246 if (streq(optarg, "auto"))
247 arg_link_journal = LINK_AUTO;
248 else if (streq(optarg, "no"))
249 arg_link_journal = LINK_NO;
250 else if (streq(optarg, "guest"))
251 arg_link_journal = LINK_GUEST;
252 else if (streq(optarg, "host"))
253 arg_link_journal = LINK_HOST;
255 log_error("Failed to parse link journal mode %s", optarg);
265 log_error("Unknown option code %c", c);
/* Arguments left after the options form the command to run; that is rejected
 * in combination with boot mode. */
270 if (optind < argc && arg_boot) {
271 log_error("Cannot specify a command together with '-b'");
/* Mounts the entries of mount_table below the container root dest. Per entry
 * the target path is built from dest and the entry's where field,
 * path_is_mount_point() is consulted, the target is created with
 * mkdir_p_label() and mount() is called. A failing mount() is only logged
 * when the entry is marked fatal. */
278 static int mount_all(const char *dest) {
280 typedef struct MountPoint {
/* Columns, judging by the initializers and the field accesses below: what,
 * where, a third column (presumably the filesystem type), options, flags,
 * fatal. */
289 static const MountPoint mount_table[] = {
290 { "proc", "/proc", "proc", NULL, MS_NOSUID|MS_NOEXEC|MS_NODEV, true },
291 { "/proc/sys", "/proc/sys", NULL, NULL, MS_BIND, true }, /* Bind mount first */
292 { NULL, "/proc/sys", NULL, NULL, MS_BIND|MS_RDONLY|MS_REMOUNT, true }, /* Then, make it r/o */
293 { "sysfs", "/sys", "sysfs", NULL, MS_RDONLY|MS_NOSUID|MS_NOEXEC|MS_NODEV, true },
294 { "tmpfs", "/dev", "tmpfs", "mode=755", MS_NOSUID|MS_STRICTATIME, true },
295 { "/dev/pts", "/dev/pts", NULL, NULL, MS_BIND, true },
296 { "tmpfs", "/dev/shm", "tmpfs", "mode=1777", MS_NOSUID|MS_NODEV|MS_STRICTATIME, true },
297 { "tmpfs", "/run", "tmpfs", "mode=755", MS_NOSUID|MS_NODEV|MS_STRICTATIME, true },
299 { "/sys/fs/selinux", "/sys/fs/selinux", NULL, NULL, MS_BIND, false }, /* Bind mount first */
300 { NULL, "/sys/fs/selinux", NULL, NULL, MS_BIND|MS_RDONLY|MS_REMOUNT, false }, /* Then, make it r/o */
307 for (k = 0; k < ELEMENTSOF(mount_table); k++) {
308 char _cleanup_free_ *where = NULL;
/* Absolute target path of this entry inside the container tree. */
311 if (asprintf(&where, "%s/%s", dest, mount_table[k].where) < 0) {
320 t = path_is_mount_point(where, true);
322 log_error("Failed to detect whether %s is a mount point: %s", where, strerror(-t));
/* Entries without a source (what == NULL) are the remount steps of the
 * table; they are not skipped even if the target is already mounted. */
330 /* Skip this entry if it is not a remount. */
331 if (mount_table[k].what && t > 0)
334 mkdir_p_label(where, 0755);
336 if (mount(mount_table[k].what,
339 mount_table[k].flags,
340 mount_table[k].options) < 0 &&
341 mount_table[k].fatal) {
343 log_error("mount(%s) failed: %m", where);
/* Makes the etc/localtime symlink below dest name the same zone as the
 * /etc/localtime symlink of the host, provided the zone exists below
 * usr/share/zoneinfo/ in the container. Several conditions are only logged
 * as warnings; the return statements are not visible in this excerpt. */
353 static int setup_timezone(const char *dest) {
354 _cleanup_free_ char *where = NULL, *p = NULL, *q = NULL, *check = NULL, *what = NULL;
360 /* Fix the timezone, if possible */
361 r = readlink_malloc("/etc/localtime", &p);
363 log_warning("/etc/localtime is not a symlink, not updating container timezone.");
/* z: presumably the part of the host link target after the zoneinfo prefix,
 * accepted in relative or absolute form. */
367 z = path_startswith(p, "../usr/share/zoneinfo/");
369 z = path_startswith(p, "/usr/share/zoneinfo/");
371 log_warning("/etc/localtime does not point into /usr/share/zoneinfo/, not updating container timezone.");
375 where = strappend(dest, "/etc/localtime");
/* y: the same extraction applied to the link that already exists in the
 * container. */
379 r = readlink_malloc(where, &q);
381 y = path_startswith(q, "../usr/share/zoneinfo/");
383 y = path_startswith(q, "/usr/share/zoneinfo/");
386 /* Already pointing to the right place? Then do nothing .. */
387 if (y && streq(y, z))
/* Only link to a zone file that actually exists inside the container. */
391 check = strjoin(dest, "/usr/share/zoneinfo/", z, NULL);
395 if (access(check, F_OK) < 0) {
396 log_warning("Timezone %s does not exist in container, not updating container timezone.", z);
/* The new link target is written in the relative form. */
400 what = strappend("../usr/share/zoneinfo/", z);
405 if (symlink(what, where) < 0) {
406 log_error("Failed to correct timezone of container: %m");
/* Bind mounts the host's /etc/resolv.conf over etc/resolv.conf below dest
 * and, if that worked, attempts a read-only remount. The results of both
 * mount() calls are deliberately not treated as errors. */
413 static int setup_resolv_conf(const char *dest) {
/* The private-network case is handled first; its body is not visible here
 * (presumably an early return). */
418 if (arg_private_network)
421 /* Fix resolv.conf, if possible */
422 where = strappend(dest, "/etc/resolv.conf");
426 /* We don't really care for the results of this really. If it
427 * fails, it fails, but meh... */
428 if (mount("/etc/resolv.conf", where, "bind", MS_BIND, NULL) >= 0)
429 mount("/etc/resolv.conf", where, "bind", MS_BIND|MS_REMOUNT|MS_RDONLY, NULL);
/* Gives the container its own boot ID: a random ID is generated, formatted
 * as a UUID string, written to a file below dev/ in the container tree and
 * that file is bind mounted over proc/sys/kernel/random/boot_id. A read-only
 * remount is attempted afterwards; its result is ignored. */
436 static int setup_boot_id(const char *dest) {
437 char _cleanup_free_ *from = NULL, *to = NULL;
444 /* Generate a new randomized boot ID, so that each boot-up of
445 * the container gets a new one */
/* from: file that holds the generated ID, to: path it is mounted over */
447 from = strappend(dest, "/dev/proc-sys-kernel-random-boot-id");
448 to = strappend(dest, "/proc/sys/kernel/random/boot_id");
452 r = sd_id128_randomize(&rnd);
454 log_error("Failed to generate random boot id: %s", strerror(-r));
/* Format the 16 bytes in the 8-4-4-4-12 hex digit UUID layout. */
458 snprintf(as_uuid, sizeof(as_uuid),
459 "%02x%02x%02x%02x-%02x%02x-%02x%02x-%02x%02x-%02x%02x%02x%02x%02x%02x",
460 SD_ID128_FORMAT_VAL(rnd));
461 char_array_0(as_uuid);
463 r = write_one_line_file(from, as_uuid);
465 log_error("Failed to write boot id: %s", strerror(-r));
469 if (mount(from, to, "bind", MS_BIND, NULL) < 0) {
470 log_error("Failed to bind mount boot id: %m");
473 mount(from, to, "bind", MS_BIND|MS_REMOUNT|MS_RDONLY, NULL);
/* Recreates host device nodes inside the container. For every name in the
 * devnodes list the host node /dev/NAME is stat()ed and, if it is a character
 * or block device, a node with the same mode and device number is created at
 * dev/NAME below dest with mknod(). A host node that does not exist (ENOENT)
 * is not logged. The list contents and the return statements are not visible
 * in this excerpt. */
479 static int copy_devnodes(const char *dest) {
481 static const char devnodes[] =
/* NOTE(review): _cleanup_umask_ presumably restores the saved umask when u
 * goes out of scope. Confirm against the macro definition. */
492 mode_t _cleanup_umask_ u;
498 NULSTR_FOREACH(d, devnodes) {
500 char _cleanup_free_ *from = NULL, *to = NULL;
/* from: node on the host, to: node to create below dest */
502 asprintf(&from, "/dev/%s", d);
503 asprintf(&to, "%s/dev/%s", dest, d);
514 if (stat(from, &st) < 0) {
516 if (errno != ENOENT) {
517 log_error("Failed to stat %s: %m", from);
522 } else if (!S_ISCHR(st.st_mode) && !S_ISBLK(st.st_mode)) {
524 log_error("%s is not a char or block device, cannot copy", from);
528 } else if (mknod(to, st.st_mode, st.st_rdev) < 0) {
/* Name the node mknod() was asked to create, not the container root. */
530 log_error("mknod(%s) failed: %m", to);
/* Makes the tty named by console available as dev/console below dest: the
 * path is checked to be a character device, its mode and owner are set
 * through chmod_and_chown(console, 0600, 0, 0), a device node is created at
 * the target path and console is bind mounted over that node. */
539 static int setup_dev_console(const char *dest, const char *console) {
541 char _cleanup_free_ *to = NULL;
543 mode_t _cleanup_umask_ u;
550 if (stat(console, &st) < 0) {
551 log_error("Failed to stat %s: %m", console);
554 } else if (!S_ISCHR(st.st_mode)) {
555 log_error("/dev/console is not a char device");
559 r = chmod_and_chown(console, 0600, 0, 0);
561 log_error("Failed to correct access mode for TTY: %s", strerror(-r));
/* to: path of the console node inside the container tree */
565 if (asprintf(&to, "%s/dev/console", dest) < 0)
568 /* We need to bind mount the right tty to /dev/console since
569 * ptys can only exist on pts file systems. To have something
570 * to bind mount things on we create a device node first, that
571 * has the right major/minor (note that the major minor
572 * doesn't actually matter here, since we mount it over
575 if (mknod(to, (st.st_mode & ~07777) | 0600, st.st_rdev) < 0) {
576 log_error("mknod() for /dev/console failed: %m");
580 if (mount(console, to, "bind", MS_BIND, NULL) < 0) {
581 log_error("Bind mount for /dev/console failed: %m");
// Creates a FIFO at dev/kmsg below dest, bind mounts it over proc/kmsg below
// dest, opens it and sends the resulting file descriptor through kmsg_socket
// as an SCM_RIGHTS control message. The local descriptor is closed right
// after the sendmsg() call.
588 static int setup_kmsg(const char *dest, int kmsg_socket) {
589 char _cleanup_free_ *from = NULL, *to = NULL;
591 mode_t _cleanup_umask_ u;
593 struct cmsghdr cmsghdr;
594 uint8_t buf[CMSG_SPACE(sizeof(int))];
597 struct cmsghdr *cmsg;
600 assert(kmsg_socket >= 0);
604 /* We create the kmsg FIFO as /dev/kmsg, but immediately
605 * delete it after bind mounting it to /proc/kmsg. While FIFOs
606 * on the reading side behave very similar to /proc/kmsg,
607 * their writing side behaves differently from /dev/kmsg in
608 * that writing blocks when nothing is reading. In order to
609 * avoid any problems with containers deadlocking due to this
610 * we simply make /dev/kmsg unavailable to the container. */
611 if (asprintf(&from, "%s/dev/kmsg", dest) < 0 ||
612 asprintf(&to, "%s/proc/kmsg", dest) < 0)
615 if (mkfifo(from, 0600) < 0) {
616 log_error("mkfifo() for /dev/kmsg failed: %m");
620 r = chmod_and_chown(from, 0600, 0, 0);
622 log_error("Failed to correct access mode for /dev/kmsg: %s", strerror(-r));
626 if (mount(from, to, "bind", MS_BIND, NULL) < 0) {
627 log_error("Bind mount for /proc/kmsg failed: %m");
/* Open the FIFO without blocking so the descriptor can be handed over. */
631 fd = open(from, O_RDWR|O_NDELAY|O_CLOEXEC);
633 log_error("Failed to open fifo: %m");
/* Build one SCM_RIGHTS control message whose payload is fd. */
640 mh.msg_control = &control;
641 mh.msg_controllen = sizeof(control);
643 cmsg = CMSG_FIRSTHDR(&mh);
644 cmsg->cmsg_level = SOL_SOCKET;
645 cmsg->cmsg_type = SCM_RIGHTS;
646 cmsg->cmsg_len = CMSG_LEN(sizeof(int));
647 memcpy(CMSG_DATA(cmsg), &fd, sizeof(int));
649 mh.msg_controllen = cmsg->cmsg_len;
651 /* Store away the fd in the socket, so that it stays open as
652 * long as we run the child */
653 k = sendmsg(kmsg_socket, &mh, MSG_DONTWAIT|MSG_NOSIGNAL);
654 close_nointr_nofail(fd);
657 log_error("Failed to send FIFO fd: %m");
661 /* And now make the FIFO unavailable as /dev/kmsg... */
/* Derives a hostname from arg_directory via path_get_file_name() (presumably
 * the last path component), passes it through hostname_cleanup() and applies
 * it with sethostname(). Intermediate steps are not visible in this excerpt. */
666 static int setup_hostname(void) {
670 hn = path_get_file_name(arg_directory);
676 hostname_cleanup(hn);
679 if (sethostname(hn, strlen(hn)) < 0)
/* Connects the journal directory of the container with the one of the host,
 * depending on arg_link_journal. The machine ID is read from etc/machine-id
 * below directory; p is then /var/log/journal/ID on the host and q the same
 * path below directory. In LINK_GUEST mode p is made a symlink to q,
 * otherwise p is bind mounted onto q. Many conditions and all return
 * statements are not visible in this excerpt. */
688 static int setup_journal(const char *directory) {
689 sd_id128_t machine_id;
690 char _cleanup_free_ *p = NULL, *b = NULL, *q = NULL, *d = NULL;
694 if (arg_link_journal == LINK_NO)
697 p = strappend(directory, "/etc/machine-id");
701 r = read_one_line_file(p, &b);
/* In LINK_AUTO mode a missing or empty machine ID is special-cased (the
 * bodies are not visible here; presumably the function quietly gives up). */
702 if (r == -ENOENT && arg_link_journal == LINK_AUTO)
705 log_error("Failed to read machine ID from %s: %s", p, strerror(-r));
710 if (isempty(id) && arg_link_journal == LINK_AUTO)
713 /* Verify validity */
714 r = sd_id128_from_string(id, &machine_id);
716 log_error("Failed to parse machine ID from %s: %s", p, strerror(-r));
/* From here on p is the host side path and q the container side path. */
721 p = strappend("/var/log/journal/", id);
722 q = strjoin(directory, "/var/log/journal/", id, NULL);
/* Existing mount points on either side are only an error outside LINK_AUTO. */
726 if (path_is_mount_point(p, false) > 0) {
727 if (arg_link_journal != LINK_AUTO) {
728 log_error("%s: already a mount point, refusing to use for journal", p);
735 if (path_is_mount_point(q, false) > 0) {
736 if (arg_link_journal != LINK_AUTO) {
737 log_error("%s: already a mount point, refusing to use for journal", q);
/* Inspect what currently exists at p; d receives the link target. */
744 r = readlink_and_make_absolute(p, &d);
746 if ((arg_link_journal == LINK_GUEST ||
747 arg_link_journal == LINK_AUTO) &&
750 r = mkdir_p(q, 0755);
752 log_warning("failed to create directory %s: %m", q);
757 log_error("Failed to remove symlink %s: %m", p);
/* -EINVAL presumably means that p exists but is not a symlink. */
760 } else if (r == -EINVAL) {
762 if (arg_link_journal == LINK_GUEST &&
765 if (errno == ENOTDIR) {
766 log_error("%s already exists and is neither a symlink nor a directory", p);
769 log_error("Failed to remove %s: %m", p);
/* NOTE(review): r carries the result here while the message uses %m, which
 * prints errno. Confirm errno is still meaningful at this point. */
773 } else if (r != -ENOENT) {
774 log_error("readlink(%s) failed: %m", p);
778 if (arg_link_journal == LINK_GUEST) {
780 if (symlink(q, p) < 0) {
781 log_error("Failed to symlink %s to %s: %m", q, p);
785 r = mkdir_p(q, 0755);
787 log_warning("failed to create directory %s: %m", q);
791 if (arg_link_journal == LINK_HOST) {
792 r = mkdir_p(p, 0755);
794 log_error("Failed to create %s: %m", p);
798 } else if (access(p, F_OK) < 0)
/* A result of 0 from dir_is_empty() is reported as the directory not being
 * empty. */
801 if (dir_is_empty(q) == 0) {
802 log_error("%s not empty.", q);
806 r = mkdir_p(q, 0755);
808 log_error("Failed to create %s: %m", q);
812 if (mount(p, q, "bind", MS_BIND, NULL) < 0) {
813 log_error("Failed to bind mount journal from host into guest: %m");
/* Calls capability_bounding_set_drop() with the complement of arg_retain and
 * returns its result. */
820 static int drop_capabilities(void) {
821 return capability_bounding_set_drop(~arg_retain, false);
/* Heuristic test whether path is the root of an OS tree, based on the file
 * bin/sh below it. Returns 1 when r is not negative and 0 otherwise; r is
 * set on a line that is not visible in this excerpt (presumably by an
 * existence check of the constructed path). */
824 static int is_os_tree(const char *path) {
827 /* We use /bin/sh as flag file if something is an OS */
829 if (asprintf(&p, "%s/bin/sh", path) < 0)
835 return r < 0 ? 0 : 1;
/* Forwards data between our stdin/stdout and the pty master and reacts to
 * signals delivered through a signalfd created from mask.
 *
 * master: pty master file descriptor
 * pid:    process that receives SIGRTMIN+3 on the first SIGTERM in boot mode
 * mask:   signal set handed to signalfd()
 *
 * Data read from stdin is buffered in in_buffer and written to master; data
 * read from master is buffered in out_buffer and written to stdout. The
 * conditions that end the loop and the return values are not visible in this
 * excerpt. */
838 static int process_pty(int master, pid_t pid, sigset_t *mask) {
840 char in_buffer[LINE_MAX], out_buffer[LINE_MAX];
841 size_t in_buffer_full = 0, out_buffer_full = 0;
842 struct epoll_event stdin_ev, stdout_ev, master_ev, signal_ev;
843 bool stdin_readable = false, stdout_writable = false, master_readable = false, master_writable = false;
844 int ep = -1, signal_fd = -1, r;
845 bool tried_orderly_shutdown = false;
851 fd_nonblock(STDIN_FILENO, 1);
852 fd_nonblock(STDOUT_FILENO, 1);
853 fd_nonblock(master, 1);
855 signal_fd = signalfd(-1, mask, SFD_NONBLOCK|SFD_CLOEXEC);
857 log_error("signalfd(): %m");
862 ep = epoll_create1(EPOLL_CLOEXEC);
864 log_error("Failed to create epoll: %m");
869 /* We read from STDIN only if this is actually a TTY,
870 * otherwise we assume non-interactivity. */
871 if (isatty(STDIN_FILENO)) {
873 stdin_ev.events = EPOLLIN|EPOLLET;
874 stdin_ev.data.fd = STDIN_FILENO;
876 if (epoll_ctl(ep, EPOLL_CTL_ADD, STDIN_FILENO, &stdin_ev) < 0) {
877 log_error("Failed to register STDIN in epoll: %m");
/* stdin, stdout and master are registered edge-triggered (EPOLLET), so the
 * readable and writable flags above remember readiness until a read() or
 * write() reports otherwise. The signalfd is registered without EPOLLET. */
884 stdout_ev.events = EPOLLOUT|EPOLLET;
885 stdout_ev.data.fd = STDOUT_FILENO;
888 master_ev.events = EPOLLIN|EPOLLOUT|EPOLLET;
889 master_ev.data.fd = master;
892 signal_ev.events = EPOLLIN;
893 signal_ev.data.fd = signal_fd;
895 if (epoll_ctl(ep, EPOLL_CTL_ADD, STDOUT_FILENO, &stdout_ev) < 0) {
896 if (errno != EPERM) {
897 log_error("Failed to register stdout in epoll: %m");
901 /* stdout without epoll support. Likely redirected to regular file. */
902 stdout_writable = true;
905 if (epoll_ctl(ep, EPOLL_CTL_ADD, master, &master_ev) < 0 ||
906 epoll_ctl(ep, EPOLL_CTL_ADD, signal_fd, &signal_ev) < 0) {
907 log_error("Failed to register fds in epoll: %m");
913 struct epoll_event ev[16];
/* Wait without timeout for the next batch of events. */
917 nfds = epoll_wait(ep, ev, ELEMENTSOF(ev), -1);
920 if (errno == EINTR || errno == EAGAIN)
923 log_error("epoll_wait(): %m");
/* Translate the events into the readiness flags; EPOLLHUP sets the flag
 * too. */
930 for (i = 0; i < nfds; i++) {
931 if (ev[i].data.fd == STDIN_FILENO) {
933 if (ev[i].events & (EPOLLIN|EPOLLHUP))
934 stdin_readable = true;
936 } else if (ev[i].data.fd == STDOUT_FILENO) {
938 if (ev[i].events & (EPOLLOUT|EPOLLHUP))
939 stdout_writable = true;
941 } else if (ev[i].data.fd == master) {
943 if (ev[i].events & (EPOLLIN|EPOLLHUP))
944 master_readable = true;
946 if (ev[i].events & (EPOLLOUT|EPOLLHUP))
947 master_writable = true;
949 } else if (ev[i].data.fd == signal_fd) {
950 struct signalfd_siginfo sfsi;
953 n = read(signal_fd, &sfsi, sizeof(sfsi));
954 if (n != sizeof(sfsi)) {
957 log_error("Failed to read from signalfd: invalid block size");
962 if (errno != EINTR && errno != EAGAIN) {
963 log_error("Failed to read from signalfd: %m");
969 if (sfsi.ssi_signo == SIGWINCH) {
972 /* The window size changed, let's forward that. */
973 if (ioctl(STDIN_FILENO, TIOCGWINSZ, &ws) >= 0)
974 ioctl(master, TIOCSWINSZ, &ws);
/* Only the first SIGTERM in boot mode takes this branch; the flag set below
 * keeps later ones out of it. */
975 } else if (sfsi.ssi_signo == SIGTERM && arg_boot && !tried_orderly_shutdown) {
977 log_info("Trying to halt container. Send SIGTERM again to trigger immediate termination.");
979 /* This only works for systemd... */
980 tried_orderly_shutdown = true;
981 kill(pid, SIGRTMIN+3);
/* Keep moving data for as long as one of the four transfers can progress. */
991 while ((stdin_readable && in_buffer_full <= 0) ||
992 (master_writable && in_buffer_full > 0) ||
993 (master_readable && out_buffer_full <= 0) ||
994 (stdout_writable && out_buffer_full > 0)) {
/* stdin to in_buffer */
996 if (stdin_readable && in_buffer_full < LINE_MAX) {
998 k = read(STDIN_FILENO, in_buffer + in_buffer_full, LINE_MAX - in_buffer_full);
1001 if (errno == EAGAIN || errno == EPIPE || errno == ECONNRESET || errno == EIO)
1002 stdin_readable = false;
1004 log_error("read(): %m");
1009 in_buffer_full += (size_t) k;
/* in_buffer to master; bytes that were not written are moved to the front */
1012 if (master_writable && in_buffer_full > 0) {
1014 k = write(master, in_buffer, in_buffer_full);
1017 if (errno == EAGAIN || errno == EPIPE || errno == ECONNRESET || errno == EIO)
1018 master_writable = false;
1020 log_error("write(): %m");
1026 assert(in_buffer_full >= (size_t) k);
1027 memmove(in_buffer, in_buffer + k, in_buffer_full - k);
1028 in_buffer_full -= k;
/* master to out_buffer */
1032 if (master_readable && out_buffer_full < LINE_MAX) {
1034 k = read(master, out_buffer + out_buffer_full, LINE_MAX - out_buffer_full);
1037 if (errno == EAGAIN || errno == EPIPE || errno == ECONNRESET || errno == EIO)
1038 master_readable = false;
1040 log_error("read(): %m");
1045 out_buffer_full += (size_t) k;
/* out_buffer to stdout; bytes that were not written are moved to the front */
1048 if (stdout_writable && out_buffer_full > 0) {
1050 k = write(STDOUT_FILENO, out_buffer, out_buffer_full);
1053 if (errno == EAGAIN || errno == EPIPE || errno == ECONNRESET || errno == EIO)
1054 stdout_writable = false;
1056 log_error("write(): %m");
1062 assert(out_buffer_full >= (size_t) k);
1063 memmove(out_buffer, out_buffer + k, out_buffer_full - k);
1064 out_buffer_full -= k;
/* Release the epoll instance and the signalfd. */
1072 close_nointr_nofail(ep);
1075 close_nointr_nofail(signal_fd);
/* Entry point. Parses the arguments, validates the target directory, creates
 * a cgroup, a pty and a socket pair, and then clones a child into new
 * namespaces. The child prepares the container root (mounts, device nodes,
 * console, kmsg, boot id, timezone, resolv.conf, journal), changes root,
 * drops capabilities, optionally switches user and executes the payload. The
 * parent forwards the terminal through process_pty(), waits for the child
 * and reports how it ended. Many branch heads, labels and return statements
 * are not visible in this excerpt. */
1080 int main(int argc, char *argv[]) {
1082 int r = EXIT_FAILURE, k;
1083 char *oldcg = NULL, *newcg = NULL;
1084 char **controller = NULL;
1085 int master = -1, n_fd_passed;
1086 const char *console = NULL;
1087 struct termios saved_attr, raw_attr;
1089 bool saved_attr_valid = false;
1091 int kmsg_socket_pair[2] = { -1, -1 };
1094 log_parse_environment();
1097 r = parse_argv(argc, argv);
/* Use the given directory in absolute form, or the current working directory
 * when none was given. */
1101 if (arg_directory) {
1104 p = path_make_absolute_cwd(arg_directory);
1105 free(arg_directory);
1108 arg_directory = get_current_dir_name();
1110 if (!arg_directory) {
1111 log_error("Failed to determine path");
1115 path_kill_slashes(arg_directory);
/* Preconditions: effective uid 0, sd_booted() positive, target is not the
 * host root and passes is_os_tree(). */
1117 if (geteuid() != 0) {
1118 log_error("Need to be root.");
1122 if (sd_booted() <= 0) {
1123 log_error("Not running on a systemd system.");
1127 if (path_equal(arg_directory, "/")) {
1128 log_error("Spawning container on root directory not supported.");
1132 if (is_os_tree(arg_directory) <= 0) {
1133 log_error("Directory %s doesn't look like an OS root directory. Refusing.", arg_directory);
/* File descriptors received through socket activation are collected in fds. */
1138 n_fd_passed = sd_listen_fds(false);
1139 if (n_fd_passed > 0) {
1140 k = fdset_new_listen_fds(&fds, false);
1142 log_error("Failed to collect file descriptors: %s", strerror(-k));
1146 fdset_close_others(fds);
/* Create a cgroup named nspawn-PID below the current one and attach to it;
 * failures for the additional controllers only produce a warning. */
1149 k = cg_get_by_pid(SYSTEMD_CGROUP_CONTROLLER, 0, &oldcg);
1151 log_error("Failed to determine current cgroup: %s", strerror(-k));
1155 if (asprintf(&newcg, "%s/nspawn-%lu", oldcg, (unsigned long) getpid()) < 0) {
1156 log_error("Failed to allocate cgroup path.");
1160 k = cg_create_and_attach(SYSTEMD_CGROUP_CONTROLLER, newcg, 0);
1162 log_error("Failed to create cgroup: %s", strerror(-k));
1166 STRV_FOREACH(controller, arg_controllers) {
1167 k = cg_create_and_attach(*controller, newcg, 0);
1169 log_warning("Failed to create cgroup in controller %s: %s", *controller, strerror(-k));
/* Allocate the pty whose slave side becomes the console of the container. */
1172 master = posix_openpt(O_RDWR|O_NOCTTY|O_CLOEXEC|O_NDELAY);
1174 log_error("Failed to acquire pseudo tty: %m");
1178 console = ptsname(master);
1180 log_error("Failed to determine tty name: %m");
1184 log_info("Spawning namespace container on %s (console is %s).", arg_directory, console);
/* Copy the window size of our terminal to the pty, if it can be queried. */
1186 if (ioctl(STDIN_FILENO, TIOCGWINSZ, &ws) >= 0)
1187 ioctl(master, TIOCSWINSZ, &ws);
1189 if (unlockpt(master) < 0) {
1190 log_error("Failed to unlock tty: %m");
/* Remember the current terminal attributes and derive a raw variant with
 * echo disabled from them. */
1194 if (tcgetattr(STDIN_FILENO, &saved_attr) >= 0) {
1195 saved_attr_valid = true;
1197 raw_attr = saved_attr;
1198 cfmakeraw(&raw_attr);
1199 raw_attr.c_lflag &= ~ECHO;
/* NOTE(review): this failure message does not include %m, unlike most of
 * its neighbours. */
1202 if (socketpair(AF_UNIX, SOCK_DGRAM|SOCK_NONBLOCK|SOCK_CLOEXEC, 0, kmsg_socket_pair) < 0) {
1203 log_error("Failed to create kmsg socket pair");
/* Block the signals that process_pty() later receives through its signalfd. */
1207 assert_se(sigemptyset(&mask) == 0);
1208 sigset_add_many(&mask, SIGCHLD, SIGWINCH, SIGTERM, SIGINT, -1);
1209 assert_se(sigprocmask(SIG_BLOCK, &mask, NULL) == 0);
/* The child waits for POLLHUP on the read end of this pipe; the parent
 * closes both ends after logging the child PID. */
1215 if(pipe2(pipefd, O_NONBLOCK|O_CLOEXEC) < 0) {
1216 log_error("pipe2(): %m");
/* Raw clone() with new IPC, mount, PID and UTS namespaces, plus a network
 * namespace when arg_private_network is set. */
1220 pid = syscall(__NR_clone, SIGCHLD|CLONE_NEWIPC|CLONE_NEWNS|CLONE_NEWPID|CLONE_NEWUTS|(arg_private_network ? CLONE_NEWNET : 0), NULL);
1222 if (errno == EINVAL)
1223 log_error("clone() failed, do you have namespace support enabled in your kernel? (You need UTS, IPC, PID and NET namespacing built in): %m");
1225 log_error("clone() failed: %m");
1232 const char *home = NULL;
1233 uid_t uid = (uid_t) -1;
1234 gid_t gid = (gid_t) -1;
/* Environment of the payload; the NULL slots are filled in further down
 * through n_env. */
1236 const char *envp[] = {
1237 "PATH=/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin",
1238 "container=systemd-nspawn", /* LXC sets container=lxc, so follow the scheme here */
1243 NULL, /* container_uuid */
1244 NULL, /* LISTEN_FDS */
1245 NULL, /* LISTEN_PID */
1249 envp[2] = strv_find_prefix(environ, "TERM=");
1252 close_nointr_nofail(pipefd[1]);
1253 fd_wait_for_event(pipefd[0], POLLHUP, -1);
1254 close_nointr_nofail(pipefd[0]);
1256 close_nointr_nofail(master);
1259 if (saved_attr_valid) {
1260 if (tcsetattr(STDIN_FILENO, TCSANOW, &raw_attr) < 0) {
1261 log_error("Failed to set terminal attributes: %m");
/* Replace stdin, stdout and stderr with the container console. */
1266 close_nointr(STDIN_FILENO);
1267 close_nointr(STDOUT_FILENO);
1268 close_nointr(STDERR_FILENO);
1270 close_nointr_nofail(kmsg_socket_pair[0]);
1271 kmsg_socket_pair[0] = -1;
1273 reset_all_signal_handlers();
1275 assert_se(sigemptyset(&mask) == 0);
1276 assert_se(sigprocmask(SIG_SETMASK, &mask, NULL) == 0);
1278 k = open_terminal(console, O_RDWR);
1279 if (k != STDIN_FILENO) {
1281 close_nointr_nofail(k);
1285 log_error("Failed to open console: %s", strerror(-k));
1289 if (dup2(STDIN_FILENO, STDOUT_FILENO) != STDOUT_FILENO ||
1290 dup2(STDIN_FILENO, STDERR_FILENO) != STDERR_FILENO) {
1291 log_error("Failed to duplicate console: %m");
1296 log_error("setsid() failed: %m");
/* Ask the kernel to send SIGKILL to this process when its parent dies. */
1300 if (prctl(PR_SET_PDEATHSIG, SIGKILL) < 0) {
1301 log_error("PR_SET_PDEATHSIG failed: %m");
1305 /* Mark everything as slave, so that we still
1306 * receive mounts from the real root, but don't
1307 * propagate mounts to the real root. */
1308 if (mount(NULL, "/", NULL, MS_SLAVE|MS_REC, NULL) < 0) {
1309 log_error("MS_SLAVE|MS_REC failed: %m");
1313 /* Turn directory into bind mount */
1314 if (mount(arg_directory, arg_directory, "bind", MS_BIND|MS_REC, NULL) < 0) {
1315 log_error("Failed to make bind mount.");
/* Read-only remount; presumably guarded by arg_read_only on a line that is
 * not visible here. */
1320 if (mount(arg_directory, arg_directory, "bind", MS_BIND|MS_REMOUNT|MS_RDONLY|MS_REC, NULL) < 0) {
1321 log_error("Failed to make read-only.");
/* Populate the container tree through the setup helpers of this file. */
1325 if (mount_all(arg_directory) < 0)
1328 if (copy_devnodes(arg_directory) < 0)
1331 dev_setup(arg_directory);
1333 if (setup_dev_console(arg_directory, console) < 0)
1336 if (setup_kmsg(arg_directory, kmsg_socket_pair[1]) < 0)
1339 close_nointr_nofail(kmsg_socket_pair[1]);
1340 kmsg_socket_pair[1] = -1;
1342 if (setup_boot_id(arg_directory) < 0)
1345 if (setup_timezone(arg_directory) < 0)
1348 if (setup_resolv_conf(arg_directory) < 0)
1351 if (setup_journal(arg_directory) < 0)
/* Make the prepared directory the new root: move the mount to /, chroot into
 * it and change to the new /. */
1354 if (chdir(arg_directory) < 0) {
1355 log_error("chdir(%s) failed: %m", arg_directory);
1359 if (mount(arg_directory, "/", NULL, MS_MOVE, NULL) < 0) {
1360 log_error("mount(MS_MOVE) failed: %m");
1364 if (chroot(".") < 0) {
1365 log_error("chroot() failed: %m");
1369 if (chdir("/") < 0) {
1370 log_error("chdir() failed: %m");
1378 if (drop_capabilities() < 0) {
1379 log_error("drop_capabilities() failed: %m");
1385 /* Note that this resolves user names
1386 * inside the container, and hence
1387 * accesses the NSS modules from the
1388 * container and not the host. This is
1391 if (get_user_creds((const char**)&arg_user, &uid, &gid, &home, NULL) < 0) {
1392 log_error("get_user_creds() failed: %m");
1396 if (mkdir_parents_label(home, 0775) < 0) {
1397 log_error("mkdir_parents_label() failed: %m");
1401 if (mkdir_safe_label(home, 0775, uid, gid) < 0) {
1402 log_error("mkdir_safe_label() failed: %m");
1406 if (initgroups((const char*)arg_user, gid) < 0) {
1407 log_error("initgroups() failed: %m");
1411 if (setresgid(gid, gid, gid) < 0) {
1412 log_error("setregid() failed: %m");
1416 if (setresuid(uid, uid, uid) < 0) {
1417 log_error("setreuid() failed: %m");
1421 /* Reset everything fully to 0, just in case */
1423 if (setgroups(0, NULL) < 0) {
1424 log_error("setgroups() failed: %m");
/* NOTE(review): the calls in this function are setresgid() and setresuid(),
 * while their failure messages name setregid() and setreuid(). */
1428 if (setresgid(0, 0, 0) < 0) {
1429 log_error("setregid() failed: %m");
1433 if (setresuid(0, 0, 0) < 0) {
1434 log_error("setreuid() failed: %m");
/* Fill the free envp slots; each asprintf() stores its string directly into
 * the next slot. */
1439 if ((asprintf((char**)(envp + n_env++), "HOME=%s", home ? home: "/root") < 0) ||
1440 (asprintf((char**)(envp + n_env++), "USER=%s", arg_user ? arg_user : "root") < 0) ||
1441 (asprintf((char**)(envp + n_env++), "LOGNAME=%s", arg_user ? arg_user : "root") < 0)) {
1447 if (asprintf((char**)(envp + n_env++), "container_uuid=%s", arg_uuid) < 0) {
/* Passed file descriptors must survive the exec, so close-on-exec is cleared
 * and LISTEN_FDS and LISTEN_PID are exported. */
1453 if (fdset_size(fds) > 0) {
1454 k = fdset_cloexec(fds, false);
1456 log_error("Failed to unset O_CLOEXEC for file descriptors.");
1460 if ((asprintf((char **)(envp + n_env++), "LISTEN_FDS=%u", n_fd_passed) < 0) ||
1461 (asprintf((char **)(envp + n_env++), "LISTEN_PID=%lu", (unsigned long) getpid()) < 0)) {
1473 /* Automatically search for the init system */
/* l is the number of remaining arguments plus one, so the memcpy() below
 * also copies the terminating NULL of argv; a[0] is then set to each
 * candidate init path in turn. */
1475 l = 1 + argc - optind;
1476 a = newa(char*, l + 1);
1477 memcpy(a + 1, argv + optind, l * sizeof(char*));
1479 a[0] = (char*) "/usr/lib/systemd/systemd";
1480 execve(a[0], a, (char**) envp);
1482 a[0] = (char*) "/lib/systemd/systemd";
1483 execve(a[0], a, (char**) envp);
1485 a[0] = (char*) "/sbin/init";
1486 execve(a[0], a, (char**) envp);
1487 } else if (argc > optind)
1488 execvpe(argv[optind], argv + optind, (char**) envp);
/* Without a command: change to the home directory and start bash with an
 * argv[0] of -bash. */
1490 chdir(home ? home : "/root");
1491 execle("/bin/bash", "-bash", NULL, (char**) envp);
1494 log_error("execv() failed: %m");
1497 _exit(EXIT_FAILURE);
/* Parent side: closing both pipe ends releases the waiting child. */
1500 log_info("Init process in the container running as PID %d", pid);
1501 close_nointr_nofail(pipefd[0]);
1502 close_nointr_nofail(pipefd[1]);
1507 if (process_pty(master, pid, &mask) < 0)
1510 if (saved_attr_valid)
1511 tcsetattr(STDIN_FILENO, TCSANOW, &saved_attr);
1513 r = wait_for_terminate(pid, &status);
/* Report how the child ended, based on si_code and si_status. */
1519 if (status.si_code == CLD_EXITED) {
1520 if (status.si_status != 0) {
1521 log_error("Container failed with error code %i.", status.si_status);
1522 r = status.si_status;
1526 log_debug("Container exited successfully.");
1528 } else if (status.si_code == CLD_KILLED &&
1529 status.si_status == SIGINT) {
1530 log_info("Container has been shut down.");
1533 } else if (status.si_code == CLD_KILLED &&
1534 status.si_status == SIGHUP) {
1535 log_info("Container is being rebooted.");
1537 } else if (status.si_code == CLD_KILLED ||
1538 status.si_code == CLD_DUMPED) {
1540 log_error("Container terminated by signal %s.", signal_to_string(status.si_status));
1544 log_error("Container failed due to unknown reason.");
/* Cleanup: restore the terminal, close descriptors, move back to the old
 * cgroup, kill what is left in the new one and free the globals. */
1551 if (saved_attr_valid)
1552 tcsetattr(STDIN_FILENO, TCSANOW, &saved_attr);
1555 close_nointr_nofail(master);
1557 close_pipe(kmsg_socket_pair);
1560 cg_attach(SYSTEMD_CGROUP_CONTROLLER, oldcg, 0);
1563 cg_kill_recursive_and_wait(SYSTEMD_CGROUP_CONTROLLER, newcg, true);
1565 free(arg_directory);
1566 strv_free(arg_controllers);