1 /*-*- Mode: C; c-basic-offset: 8; indent-tabs-mode: nil -*-*/
4 This file is part of systemd.
6 Copyright 2010 Lennart Poettering
8 systemd is free software; you can redistribute it and/or modify it
9 under the terms of the GNU Lesser General Public License as published by
10 the Free Software Foundation; either version 2.1 of the License, or
11 (at your option) any later version.
13 systemd is distributed in the hope that it will be useful, but
14 WITHOUT ANY WARRANTY; without even the implied warranty of
15 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
16 Lesser General Public License for more details.
18 You should have received a copy of the GNU Lesser General Public License
19 along with systemd; If not, see <http://www.gnu.org/licenses/>.
25 #include <sys/types.h>
26 #include <sys/syscall.h>
27 #include <sys/mount.h>
33 #include <sys/prctl.h>
34 #include <sys/capability.h>
37 #include <sys/epoll.h>
39 #include <sys/signalfd.h>
43 #include <sys/socket.h>
45 #include <systemd/sd-daemon.h>
53 #include "cgroup-util.h"
55 #include "path-util.h"
56 #include "loopback-setup.h"
58 #include "dev-setup.h"
/* Journal link mode type used by arg_link_journal below (only the opening
 * line of this enum is visible in this excerpt). */
63 typedef enum LinkJournal {
/* Defaults for the command-line settings; parse_argv() assigns to several of
 * them when the matching option is given. */
70 static char *arg_directory = NULL;
71 static char *arg_user = NULL;
72 static char **arg_controllers = NULL;
73 static char *arg_uuid = NULL;
74 static bool arg_private_network = false;
75 static bool arg_read_only = false;
76 static bool arg_boot = false;
77 static LinkJournal arg_link_journal = LINK_AUTO;
/* Bitmask with one bit per capability number. The --capability= handling ORs
 * further bits in, and drop_capabilities() passes the complement of this mask
 * to capability_bounding_set_drop(). */
78 static uint64_t arg_retain =
80 (1ULL << CAP_DAC_OVERRIDE) |
81 (1ULL << CAP_DAC_READ_SEARCH) |
82 (1ULL << CAP_FOWNER) |
83 (1ULL << CAP_FSETID) |
84 (1ULL << CAP_IPC_OWNER) |
87 (1ULL << CAP_LINUX_IMMUTABLE) |
88 (1ULL << CAP_NET_BIND_SERVICE) |
89 (1ULL << CAP_NET_BROADCAST) |
90 (1ULL << CAP_NET_RAW) |
91 (1ULL << CAP_SETGID) |
92 (1ULL << CAP_SETFCAP) |
93 (1ULL << CAP_SETPCAP) |
94 (1ULL << CAP_SETUID) |
95 (1ULL << CAP_SYS_ADMIN) |
96 (1ULL << CAP_SYS_CHROOT) |
97 (1ULL << CAP_SYS_NICE) |
98 (1ULL << CAP_SYS_PTRACE) |
99 (1ULL << CAP_SYS_TTY_CONFIG) |
100 (1ULL << CAP_SYS_RESOURCE) |
101 (1ULL << CAP_SYS_BOOT) |
102 (1ULL << CAP_AUDIT_WRITE) |
103 (1ULL << CAP_AUDIT_CONTROL);
/* Print the usage text to stdout, with program_invocation_short_name
 * substituted for the leading %s.
 * The -j line names "guest" because the short-option handling in parse_argv()
 * assigns LINK_GUEST, not LINK_HOST. */
105 static int help(void) {
107 printf("%s [OPTIONS...] [PATH] [ARGUMENTS...]\n\n"
108 "Spawn a minimal namespace container for debugging, testing and building.\n\n"
109 " -h --help Show this help\n"
110 " --version Print version string\n"
111 " -D --directory=NAME Root directory for the container\n"
112 " -b --boot Boot up full system (i.e. invoke init)\n"
113 " -u --user=USER Run the command under specified user or uid\n"
114 " -C --controllers=LIST Put the container in specified comma-separated cgroup hierarchies\n"
115 " --uuid=UUID Set a specific machine UUID for the container\n"
116 " --private-network Disable network in container\n"
117 " --read-only Mount the root directory read-only\n"
118 " --capability=CAP In addition to the default, retain specified capability\n"
119 " --link-journal=MODE Link up guest journal, one of no, auto, guest, host\n"
120 " -j Equivalent to --link-journal=guest\n",
121 program_invocation_short_name);
/* Parse the command line with getopt_long() and record the results in the
 * arg_* globals. Problems are reported through log_error(). */
126 static int parse_argv(int argc, char *argv[]) {
/* Long options; the ones without a short letter are identified by ARG_* codes. */
137 static const struct option options[] = {
138 { "help", no_argument, NULL, 'h' },
139 { "version", no_argument, NULL, ARG_VERSION },
140 { "directory", required_argument, NULL, 'D' },
141 { "user", required_argument, NULL, 'u' },
142 { "controllers", required_argument, NULL, 'C' },
143 { "private-network", no_argument, NULL, ARG_PRIVATE_NETWORK },
144 { "boot", no_argument, NULL, 'b' },
145 { "uuid", required_argument, NULL, ARG_UUID },
146 { "read-only", no_argument, NULL, ARG_READ_ONLY },
147 { "capability", required_argument, NULL, ARG_CAPABILITY },
148 { "link-journal", required_argument, NULL, ARG_LINK_JOURNAL },
/* Short options are -h, -D ARG, -u ARG, -C ARG, -b and -j. With glibc, a
 * leading '+' in the option string stops parsing at the first non-option
 * argument. */
157 while ((c = getopt_long(argc, argv, "+hD:u:C:bj", options, NULL)) >= 0) {
/* Prints the package string followed by the feature string (presumably the
 * --version case; its label is not visible in this excerpt). */
166 puts(PACKAGE_STRING);
167 puts(SYSTEMD_FEATURES);
/* The root directory is stored in canonicalized form. */
172 arg_directory = canonicalize_file_name(optarg);
173 if (!arg_directory) {
174 log_error("Failed to canonicalize root directory.");
/* The user name is kept as a private copy of the argument. */
182 if (!(arg_user = strdup(optarg))) {
183 log_error("Failed to duplicate user name.");
/* Discard any earlier controller list, split the new one on commas and remove
 * duplicate entries. */
190 strv_free(arg_controllers);
191 arg_controllers = strv_split(optarg, ",");
192 if (!arg_controllers) {
193 log_error("Failed to split controllers list.");
196 strv_uniq(arg_controllers);
200 case ARG_PRIVATE_NETWORK:
201 arg_private_network = true;
213 arg_read_only = true;
216 case ARG_CAPABILITY: {
/* Each comma-separated word is copied with strndup(), resolved with
 * cap_from_name(), and its bit is added to arg_retain. */
220 FOREACH_WORD_SEPARATOR(word, length, optarg, ",", state) {
224 t = strndup(word, length);
228 if (cap_from_name(t, &cap) < 0) {
229 log_error("Failed to parse capability %s.", t);
235 arg_retain |= 1ULL << (uint64_t) cap;
/* NOTE(review): the case label for this assignment is not visible in this
 * excerpt; presumably it is -j. It selects LINK_GUEST -- confirm that the
 * usage text for -j names the same mode. */
242 arg_link_journal = LINK_GUEST;
/* --link-journal= accepts exactly "auto", "no", "guest" and "host". */
245 case ARG_LINK_JOURNAL:
246 if (streq(optarg, "auto"))
247 arg_link_journal = LINK_AUTO;
248 else if (streq(optarg, "no"))
249 arg_link_journal = LINK_NO;
250 else if (streq(optarg, "guest"))
251 arg_link_journal = LINK_GUEST;
252 else if (streq(optarg, "host"))
253 arg_link_journal = LINK_HOST;
255 log_error("Failed to parse link journal mode %s", optarg);
265 log_error("Unknown option code %c", c);
/* Mount the entries of mount_table below the directory dest. */
273 static int mount_all(const char *dest) {
275 typedef struct MountPoint {
/* The loop below reads the fields what, where, flags, options and fatal.
 * NOTE(review): the struct definition is not visible here; the columns appear
 * to be what, where, type, options, flags, fatal -- confirm. */
284 static const MountPoint mount_table[] = {
285 { "proc", "/proc", "proc", NULL, MS_NOSUID|MS_NOEXEC|MS_NODEV, true },
286 { "/proc/sys", "/proc/sys", NULL, NULL, MS_BIND, true }, /* Bind mount first */
287 { NULL, "/proc/sys", NULL, NULL, MS_BIND|MS_RDONLY|MS_REMOUNT, true }, /* Then, make it r/o */
288 { "sysfs", "/sys", "sysfs", NULL, MS_RDONLY|MS_NOSUID|MS_NOEXEC|MS_NODEV, true },
289 { "tmpfs", "/dev", "tmpfs", "mode=755", MS_NOSUID|MS_STRICTATIME, true },
290 { "/dev/pts", "/dev/pts", NULL, NULL, MS_BIND, true },
291 { "tmpfs", "/dev/shm", "tmpfs", "mode=1777", MS_NOSUID|MS_NODEV|MS_STRICTATIME, true },
292 { "tmpfs", "/run", "tmpfs", "mode=755", MS_NOSUID|MS_NODEV|MS_STRICTATIME, true },
294 { "/sys/fs/selinux", "/sys/fs/selinux", NULL, NULL, MS_BIND, false }, /* Bind mount first */
295 { NULL, "/sys/fs/selinux", NULL, NULL, MS_BIND|MS_RDONLY|MS_REMOUNT, false }, /* Then, make it r/o */
302 for (k = 0; k < ELEMENTSOF(mount_table); k++) {
303 char _cleanup_free_ *where = NULL;
/* The target path is dest joined with the table entry's where field. */
306 if (asprintf(&where, "%s/%s", dest, mount_table[k].where) < 0) {
315 t = path_is_mount_point(where, true);
317 log_error("Failed to detect whether %s is a mount point: %s", where, strerror(-t));
325 /* Skip this entry if it is not a remount. */
326 if (mount_table[k].what && t > 0)
/* Make sure the target directory exists; the result of mkdir_p_label() is not
 * checked. */
329 mkdir_p_label(where, 0755);
/* A failing mount() is only reported when the entry is marked fatal. */
331 if (mount(mount_table[k].what,
334 mount_table[k].flags,
335 mount_table[k].options) < 0 &&
336 mount_table[k].fatal) {
338 log_error("mount(%s) failed: %m", where);
/* Point <dest>/etc/localtime at the same zone below usr/share/zoneinfo that
 * the /etc/localtime symlink of the caller's system points to. */
348 static int setup_timezone(const char *dest) {
349 _cleanup_free_ char *where = NULL, *p = NULL, *q = NULL, *check = NULL, *what = NULL;
355 /* Fix the timezone, if possible */
356 r = readlink_malloc("/etc/localtime", &p);
358 log_warning("/etc/localtime is not a symlink, not updating container timezone.");
/* z is the part of the link target after the zoneinfo prefix; both the
 * relative and the absolute spelling of the prefix are tried. */
362 z = path_startswith(p, "../usr/share/zoneinfo/");
364 z = path_startswith(p, "/usr/share/zoneinfo/");
366 log_warning("/etc/localtime does not point into /usr/share/zoneinfo/, not updating container timezone.");
370 where = strappend(dest, "/etc/localtime");
/* y is the same suffix taken from the current symlink below dest. */
374 r = readlink_malloc(where, &q);
376 y = path_startswith(q, "../usr/share/zoneinfo/");
378 y = path_startswith(q, "/usr/share/zoneinfo/");
381 /* Already pointing to the right place? Then do nothing .. */
382 if (y && streq(y, z))
/* The zone file has to exist below dest before the link is changed. */
386 check = strjoin(dest, "/usr/share/zoneinfo/", z, NULL);
390 if (access(check, F_OK) < 0) {
391 log_warning("Timezone %s does not exist in container, not updating container timezone.", z);
/* The new link target is written relative ("../usr/share/zoneinfo/<zone>"). */
395 what = strappend("../usr/share/zoneinfo/", z);
400 if (symlink(what, where) < 0) {
401 log_error("Failed to correct timezone of container: %m");
/* Bind mount /etc/resolv.conf onto <dest>/etc/resolv.conf and, if that
 * worked, try to remount it read-only. */
408 static int setup_resolv_conf(const char *dest) {
/* arg_private_network selects a separate branch whose body is not visible in
 * this excerpt. */
413 if (arg_private_network)
416 /* Fix resolv.conf, if possible */
417 where = strappend(dest, "/etc/resolv.conf");
421 /* We don't really care for the results of this really. If it
422 * fails, it fails, but meh... */
423 if (mount("/etc/resolv.conf", where, "bind", MS_BIND, NULL) >= 0)
424 mount("/etc/resolv.conf", where, "bind", MS_BIND|MS_REMOUNT|MS_RDONLY, NULL);
/* Write a freshly generated random ID, formatted like a UUID, to a file below
 * <dest>/dev and bind mount that file over
 * <dest>/proc/sys/kernel/random/boot_id. */
431 static int setup_boot_id(const char *dest) {
432 char _cleanup_free_ *from = NULL, *to = NULL;
439 /* Generate a new randomized boot ID, so that each boot-up of
440 * the container gets a new one */
442 from = strappend(dest, "/dev/proc-sys-kernel-random-boot-id");
443 to = strappend(dest, "/proc/sys/kernel/random/boot_id");
447 r = sd_id128_randomize(&rnd);
449 log_error("Failed to generate random boot id: %s", strerror(-r));
/* Render the 16 bytes as hex in 8-4-4-4-12 grouping. */
453 snprintf(as_uuid, sizeof(as_uuid),
454 "%02x%02x%02x%02x-%02x%02x-%02x%02x-%02x%02x-%02x%02x%02x%02x%02x%02x",
455 SD_ID128_FORMAT_VAL(rnd));
456 char_array_0(as_uuid);
458 r = write_one_line_file(from, as_uuid);
460 log_error("Failed to write boot id: %s", strerror(-r));
464 if (mount(from, to, "bind", MS_BIND, NULL) < 0) {
465 log_error("Failed to bind mount boot id: %m");
/* The read-only remount is attempted without checking its result. */
468 mount(from, to, "bind", MS_BIND|MS_REMOUNT|MS_RDONLY, NULL);
/* For every name in the devnodes list, create <dest>/dev/<name> with the same
 * mode and device number as /dev/<name>. */
474 static int copy_devnodes(const char *dest) {
476 static const char devnodes[] =
487 mode_t _cleanup_umask_ u;
493 NULSTR_FOREACH(d, devnodes) {
495 char _cleanup_free_ *from = NULL, *to = NULL;
/* from is the existing node, to is the node to create below dest. */
497 asprintf(&from, "/dev/%s", d);
498 asprintf(&to, "%s/dev/%s", dest, d);
509 if (stat(from, &st) < 0) {
/* A source node that does not exist (ENOENT) is not reported as an error. */
511 if (errno != ENOENT) {
512 log_error("Failed to stat %s: %m", from);
517 } else if (!S_ISCHR(st.st_mode) && !S_ISBLK(st.st_mode)) {
519 log_error("%s is not a char or block device, cannot copy", from);
523 } else if (mknod(to, st.st_mode, st.st_rdev) < 0) {
/* Report the path mknod() was called on, not the container root. */
525 log_error("mknod(%s) failed: %m", to);
/* Create <dest>/dev/console as a device node and bind mount the tty named by
 * console on top of it. */
534 static int setup_dev_console(const char *dest, const char *console) {
536 char _cleanup_free_ *to = NULL;
538 mode_t _cleanup_umask_ u;
/* The console path must exist and be a character device. */
545 if (stat(console, &st) < 0) {
546 log_error("Failed to stat %s: %m", console);
549 } else if (!S_ISCHR(st.st_mode)) {
550 log_error("/dev/console is not a char device");
/* Set the tty to mode 0600 with owner and group 0. */
554 r = chmod_and_chown(console, 0600, 0, 0);
556 log_error("Failed to correct access mode for TTY: %s", strerror(-r));
560 if (asprintf(&to, "%s/dev/console", dest) < 0)
563 /* We need to bind mount the right tty to /dev/console since
564 * ptys can only exist on pts file systems. To have something
565 * to bind mount things on we create a device node first, that
566 * has the right major/minor (note that the major minor
567 * doesn't actually matter here, since we mount it over
570 if (mknod(to, (st.st_mode & ~07777) | 0600, st.st_rdev) < 0) {
571 log_error("mknod() for /dev/console failed: %m");
575 if (mount(console, to, "bind", MS_BIND, NULL) < 0) {
576 log_error("Bind mount for /dev/console failed: %m");
/* Create a FIFO at <dest>/dev/kmsg, bind mount it to <dest>/proc/kmsg and
 * send an open file descriptor for it over kmsg_socket as an SCM_RIGHTS
 * message. */
583 static int setup_kmsg(const char *dest, int kmsg_socket) {
584 char _cleanup_free_ *from = NULL, *to = NULL;
586 mode_t _cleanup_umask_ u;
588 struct cmsghdr cmsghdr;
589 uint8_t buf[CMSG_SPACE(sizeof(int))];
592 struct cmsghdr *cmsg;
595 assert(kmsg_socket >= 0);
599 /* We create the kmsg FIFO as /dev/kmsg, but immediately
600 * delete it after bind mounting it to /proc/kmsg. While FIFOs
601 * on the reading side behave very similar to /proc/kmsg,
602 * their writing side behaves differently from /dev/kmsg in
603 * that writing blocks when nothing is reading. In order to
604 * avoid any problems with containers deadlocking due to this
605 * we simply make /dev/kmsg unavailable to the container. */
606 if (asprintf(&from, "%s/dev/kmsg", dest) < 0 ||
607 asprintf(&to, "%s/proc/kmsg", dest) < 0)
610 if (mkfifo(from, 0600) < 0) {
611 log_error("mkfifo() for /dev/kmsg failed: %m");
/* Set the FIFO to mode 0600 with owner and group 0. */
615 r = chmod_and_chown(from, 0600, 0, 0);
617 log_error("Failed to correct access mode for /dev/kmsg: %s", strerror(-r));
621 if (mount(from, to, "bind", MS_BIND, NULL) < 0) {
622 log_error("Bind mount for /proc/kmsg failed: %m");
626 fd = open(from, O_RDWR|O_NDELAY|O_CLOEXEC);
628 log_error("Failed to open fifo: %m");
/* Build a control message of type SCM_RIGHTS whose payload is fd. */
635 mh.msg_control = &control;
636 mh.msg_controllen = sizeof(control);
638 cmsg = CMSG_FIRSTHDR(&mh);
639 cmsg->cmsg_level = SOL_SOCKET;
640 cmsg->cmsg_type = SCM_RIGHTS;
641 cmsg->cmsg_len = CMSG_LEN(sizeof(int));
642 memcpy(CMSG_DATA(cmsg), &fd, sizeof(int));
644 mh.msg_controllen = cmsg->cmsg_len;
646 /* Store away the fd in the socket, so that it stays open as
647 * long as we run the child */
648 k = sendmsg(kmsg_socket, &mh, MSG_DONTWAIT|MSG_NOSIGNAL);
/* The local descriptor is closed right after the send attempt. */
649 close_nointr_nofail(fd);
652 log_error("Failed to send FIFO fd: %m");
656 /* And now make the FIFO unavailable as /dev/kmsg... */
/* Set the hostname from the last path component of arg_directory, after it
 * has been passed through hostname_cleanup(). */
661 static int setup_hostname(void) {
665 hn = path_get_file_name(arg_directory);
671 hostname_cleanup(hn);
674 if (sethostname(hn, strlen(hn)) < 0)
/* Connect the journal directory of the tree at directory with
 * /var/log/journal/<machine-id> on the caller's side, as selected by
 * arg_link_journal. With LINK_GUEST a symlink is created, with LINK_HOST a
 * bind mount; LINK_NO is tested first. */
683 static int setup_journal(const char *directory) {
684 sd_id128_t machine_id;
685 char _cleanup_free_ *p = NULL, *b = NULL, *q = NULL, *d = NULL;
689 if (arg_link_journal == LINK_NO)
/* The machine ID is read from <directory>/etc/machine-id. */
692 p = strappend(directory, "/etc/machine-id");
696 r = read_one_line_file(p, &b);
697 if (r == -ENOENT && arg_link_journal == LINK_AUTO)
700 log_error("Failed to read machine ID from %s: %s", p, strerror(-r));
705 if (isempty(id) && arg_link_journal == LINK_AUTO)
708 /* Verify validity */
709 r = sd_id128_from_string(id, &machine_id);
711 log_error("Failed to parse machine ID from %s: %s", p, strerror(-r));
/* From here on p is the journal path outside the tree and q is the same path
 * below directory. */
716 p = strappend("/var/log/journal/", id);
717 q = strjoin(directory, "/var/log/journal/", id, NULL);
/* An existing mount point at either path is only reported as an error when
 * the mode is not LINK_AUTO. */
721 if (path_is_mount_point(p, false) > 0) {
722 if (arg_link_journal != LINK_AUTO) {
723 log_error("%s: already a mount point, refusing to use for journal", p);
730 if (path_is_mount_point(q, false) > 0) {
731 if (arg_link_journal != LINK_AUTO) {
732 log_error("%s: already a mount point, refusing to use for journal", q);
/* Inspect what currently exists at p: the branches below distinguish a
 * readable symlink, -EINVAL, and errors other than -ENOENT. */
739 r = readlink_and_make_absolute(p, &d);
741 if ((arg_link_journal == LINK_GUEST ||
742 arg_link_journal == LINK_AUTO) &&
745 r = mkdir_p(q, 0755);
747 log_warning("failed to create directory %s: %m", q);
752 log_error("Failed to remove symlink %s: %m", p);
755 } else if (r == -EINVAL) {
757 if (arg_link_journal == LINK_GUEST &&
760 if (errno == ENOTDIR) {
761 log_error("%s already exists and is neither a symlink nor a directory", p);
764 log_error("Failed to remove %s: %m", p);
768 } else if (r != -ENOENT) {
769 log_error("readlink(%s) failed: %m", p);
/* LINK_GUEST: p becomes a symlink pointing at q. */
773 if (arg_link_journal == LINK_GUEST) {
775 if (symlink(q, p) < 0) {
776 log_error("Failed to symlink %s to %s: %m", q, p);
780 r = mkdir_p(q, 0755);
782 log_warning("failed to create directory %s: %m", q);
/* LINK_HOST creates p if needed; other modes test whether p exists. */
786 if (arg_link_journal == LINK_HOST) {
787 r = mkdir_p(p, 0755);
789 log_error("Failed to create %s: %m", p);
793 } else if (access(p, F_OK) < 0)
796 if (dir_is_empty(q) == 0) {
797 log_error("%s not empty.", q);
801 r = mkdir_p(q, 0755);
803 log_error("Failed to create %s: %m", q);
/* Bind mount p onto q. */
807 if (mount(p, q, "bind", MS_BIND, NULL) < 0) {
808 log_error("Failed to bind mount journal from host into guest: %m");
/* Pass the complement of arg_retain, i.e. every capability bit that is not
 * marked for retention, to capability_bounding_set_drop() and return its
 * result. */
815 static int drop_capabilities(void) {
816 return capability_bounding_set_drop(~arg_retain, false);
/* Tell whether path looks like an OS tree, using <path>/bin/sh as the marker.
 * Returns 0 when r is negative and 1 otherwise; the line that computes r from
 * the marker path is not visible in this excerpt. */
819 static int is_os_tree(const char *path) {
822 /* We use /bin/sh as flag file if something is an OS */
824 if (asprintf(&p, "%s/bin/sh", path) < 0)
830 return r < 0 ? 0 : 1;
/* Relay bytes between this process's stdin/stdout and the pty master, and
 * handle signals delivered through a signalfd created from mask. pid is the
 * process that is sent SIGRTMIN+3 on the first SIGTERM when arg_boot is set. */
833 static int process_pty(int master, pid_t pid, sigset_t *mask) {
/* in_buffer holds data on its way from stdin to the master, out_buffer data
 * on its way from the master to stdout; the *_full counters give the number
 * of pending bytes. */
835 char in_buffer[LINE_MAX], out_buffer[LINE_MAX];
836 size_t in_buffer_full = 0, out_buffer_full = 0;
837 struct epoll_event stdin_ev, stdout_ev, master_ev, signal_ev;
838 bool stdin_readable = false, stdout_writable = false, master_readable = false, master_writable = false;
839 int ep = -1, signal_fd = -1, r;
840 bool tried_orderly_shutdown = false;
/* fd_nonblock() is applied to all three descriptors (presumably switching
 * them to non-blocking mode). */
846 fd_nonblock(STDIN_FILENO, 1);
847 fd_nonblock(STDOUT_FILENO, 1);
848 fd_nonblock(master, 1);
850 signal_fd = signalfd(-1, mask, SFD_NONBLOCK|SFD_CLOEXEC);
852 log_error("signalfd(): %m");
857 ep = epoll_create1(EPOLL_CLOEXEC);
859 log_error("Failed to create epoll: %m");
864 /* We read from STDIN only if this is actually a TTY,
865 * otherwise we assume non-interactivity. */
866 if (isatty(STDIN_FILENO)) {
868 stdin_ev.events = EPOLLIN|EPOLLET;
869 stdin_ev.data.fd = STDIN_FILENO;
871 if (epoll_ctl(ep, EPOLL_CTL_ADD, STDIN_FILENO, &stdin_ev) < 0) {
872 log_error("Failed to register STDIN in epoll: %m");
/* stdin, stdout and the master are registered edge-triggered (EPOLLET); the
 * signal descriptor is registered without EPOLLET. */
879 stdout_ev.events = EPOLLOUT|EPOLLET;
880 stdout_ev.data.fd = STDOUT_FILENO;
883 master_ev.events = EPOLLIN|EPOLLOUT|EPOLLET;
884 master_ev.data.fd = master;
887 signal_ev.events = EPOLLIN;
888 signal_ev.data.fd = signal_fd;
890 if (epoll_ctl(ep, EPOLL_CTL_ADD, STDOUT_FILENO, &stdout_ev) < 0) {
891 if (errno != EPERM) {
892 log_error("Failed to register stdout in epoll: %m");
896 /* stdout without epoll support. Likely redirected to regular file. */
897 stdout_writable = true;
900 if (epoll_ctl(ep, EPOLL_CTL_ADD, master, &master_ev) < 0 ||
901 epoll_ctl(ep, EPOLL_CTL_ADD, signal_fd, &signal_ev) < 0) {
902 log_error("Failed to register fds in epoll: %m");
908 struct epoll_event ev[16];
/* Wait without a timeout; EINTR and EAGAIN are not treated as errors. */
912 nfds = epoll_wait(ep, ev, ELEMENTSOF(ev), -1);
915 if (errno == EINTR || errno == EAGAIN)
918 log_error("epoll_wait(): %m");
/* Translate the reported events into the readable/writable flags. EPOLLHUP
 * sets the flag as well, so the following read or write is still attempted. */
925 for (i = 0; i < nfds; i++) {
926 if (ev[i].data.fd == STDIN_FILENO) {
928 if (ev[i].events & (EPOLLIN|EPOLLHUP))
929 stdin_readable = true;
931 } else if (ev[i].data.fd == STDOUT_FILENO) {
933 if (ev[i].events & (EPOLLOUT|EPOLLHUP))
934 stdout_writable = true;
936 } else if (ev[i].data.fd == master) {
938 if (ev[i].events & (EPOLLIN|EPOLLHUP))
939 master_readable = true;
941 if (ev[i].events & (EPOLLOUT|EPOLLHUP))
942 master_writable = true;
944 } else if (ev[i].data.fd == signal_fd) {
945 struct signalfd_siginfo sfsi;
/* A read that does not return exactly one siginfo record is examined
 * separately below. */
948 n = read(signal_fd, &sfsi, sizeof(sfsi));
949 if (n != sizeof(sfsi)) {
952 log_error("Failed to read from signalfd: invalid block size");
957 if (errno != EINTR && errno != EAGAIN) {
958 log_error("Failed to read from signalfd: %m");
964 if (sfsi.ssi_signo == SIGWINCH) {
967 /* The window size changed, let's forward that. */
968 if (ioctl(STDIN_FILENO, TIOCGWINSZ, &ws) >= 0)
969 ioctl(master, TIOCSWINSZ, &ws);
970 } else if (sfsi.ssi_signo == SIGTERM && arg_boot && !tried_orderly_shutdown) {
972 log_info("Trying to halt container. Send SIGTERM again to trigger immediate termination.");
974 /* This only works for systemd... */
975 tried_orderly_shutdown = true;
976 kill(pid, SIGRTMIN+3);
/* Keep going while some direction can make progress: either data can be read
 * into an empty buffer, or a non-empty buffer can be written out. */
986 while ((stdin_readable && in_buffer_full <= 0) ||
987 (master_writable && in_buffer_full > 0) ||
988 (master_readable && out_buffer_full <= 0) ||
989 (stdout_writable && out_buffer_full > 0)) {
/* stdin -> in_buffer. For each of the four transfers, EAGAIN, EPIPE,
 * ECONNRESET and EIO only clear the corresponding flag. */
991 if (stdin_readable && in_buffer_full < LINE_MAX) {
993 k = read(STDIN_FILENO, in_buffer + in_buffer_full, LINE_MAX - in_buffer_full);
996 if (errno == EAGAIN || errno == EPIPE || errno == ECONNRESET || errno == EIO)
997 stdin_readable = false;
999 log_error("read(): %m");
1004 in_buffer_full += (size_t) k;
/* in_buffer -> master; the k bytes written are removed from the front of the
 * buffer. */
1007 if (master_writable && in_buffer_full > 0) {
1009 k = write(master, in_buffer, in_buffer_full);
1012 if (errno == EAGAIN || errno == EPIPE || errno == ECONNRESET || errno == EIO)
1013 master_writable = false;
1015 log_error("write(): %m");
1021 assert(in_buffer_full >= (size_t) k);
1022 memmove(in_buffer, in_buffer + k, in_buffer_full - k);
1023 in_buffer_full -= k;
/* master -> out_buffer. */
1027 if (master_readable && out_buffer_full < LINE_MAX) {
1029 k = read(master, out_buffer + out_buffer_full, LINE_MAX - out_buffer_full);
1032 if (errno == EAGAIN || errno == EPIPE || errno == ECONNRESET || errno == EIO)
1033 master_readable = false;
1035 log_error("read(): %m");
1040 out_buffer_full += (size_t) k;
/* out_buffer -> stdout; the k bytes written are removed from the front of
 * the buffer. */
1043 if (stdout_writable && out_buffer_full > 0) {
1045 k = write(STDOUT_FILENO, out_buffer, out_buffer_full);
1048 if (errno == EAGAIN || errno == EPIPE || errno == ECONNRESET || errno == EIO)
1049 stdout_writable = false;
1051 log_error("write(): %m");
1057 assert(out_buffer_full >= (size_t) k);
1058 memmove(out_buffer, out_buffer + k, out_buffer_full - k);
1059 out_buffer_full -= k;
/* Release the epoll instance and the signal descriptor. */
1067 close_nointr_nofail(ep);
1070 close_nointr_nofail(signal_fd);
/* Program entry point: parse the options, validate the target directory, move
 * into a new cgroup, allocate a pty, clone a process into new namespaces that
 * sets up the container tree and executes the payload, then relay the pty and
 * report how the container ended. */
1075 int main(int argc, char *argv[]) {
1077 int r = EXIT_FAILURE, k;
1078 char *oldcg = NULL, *newcg = NULL;
1079 char **controller = NULL;
1080 int master = -1, n_fd_passed;
1081 const char *console = NULL;
1082 struct termios saved_attr, raw_attr;
1084 bool saved_attr_valid = false;
1086 int kmsg_socket_pair[2] = { -1, -1 };
1089 log_parse_environment();
1092 r = parse_argv(argc, argv);
/* A directory given on the command line is made absolute; the current working
 * directory is the other source for arg_directory. */
1096 if (arg_directory) {
1099 p = path_make_absolute_cwd(arg_directory);
1100 free(arg_directory);
1103 arg_directory = get_current_dir_name();
1105 if (!arg_directory) {
1106 log_error("Failed to determine path");
1110 path_kill_slashes(arg_directory);
/* Preconditions: effective uid 0, sd_booted() positive, a directory other than
 * "/", and a directory that passes is_os_tree(). */
1112 if (geteuid() != 0) {
1113 log_error("Need to be root.");
1117 if (sd_booted() <= 0) {
1118 log_error("Not running on a systemd system.");
1122 if (path_equal(arg_directory, "/")) {
1123 log_error("Spawning container on root directory not supported.");
1127 if (is_os_tree(arg_directory) <= 0) {
1128 log_error("Directory %s doesn't look like an OS root directory. Refusing.", arg_directory);
/* Collect file descriptors reported by sd_listen_fds() into fds. */
1133 n_fd_passed = sd_listen_fds(false);
1134 if (n_fd_passed > 0) {
1135 k = fdset_new_listen_fds(&fds, false);
1137 log_error("Failed to collect file descriptors: %s", strerror(-k));
1141 fdset_close_others(fds);
/* Create and join the cgroup "<current cgroup>/nspawn-<pid>", first in the
 * systemd hierarchy and then in every controller from arg_controllers. A
 * failure in one of the extra controllers is only a warning. */
1144 k = cg_get_by_pid(SYSTEMD_CGROUP_CONTROLLER, 0, &oldcg);
1146 log_error("Failed to determine current cgroup: %s", strerror(-k));
1150 if (asprintf(&newcg, "%s/nspawn-%lu", oldcg, (unsigned long) getpid()) < 0) {
1151 log_error("Failed to allocate cgroup path.");
1155 k = cg_create_and_attach(SYSTEMD_CGROUP_CONTROLLER, newcg, 0);
1157 log_error("Failed to create cgroup: %s", strerror(-k));
1161 STRV_FOREACH(controller, arg_controllers) {
1162 k = cg_create_and_attach(*controller, newcg, 0);
1164 log_warning("Failed to create cgroup in controller %s: %s", *controller, strerror(-k));
/* Allocate the pty master; the name of its slave side becomes the container
 * console. */
1167 master = posix_openpt(O_RDWR|O_NOCTTY|O_CLOEXEC|O_NDELAY);
1169 log_error("Failed to acquire pseudo tty: %m");
1173 console = ptsname(master);
1175 log_error("Failed to determine tty name: %m");
1179 log_info("Spawning namespace container on %s (console is %s).", arg_directory, console);
/* Copy the window size of stdin to the pty, if it can be queried. */
1181 if (ioctl(STDIN_FILENO, TIOCGWINSZ, &ws) >= 0)
1182 ioctl(master, TIOCSWINSZ, &ws);
1184 if (unlockpt(master) < 0) {
1185 log_error("Failed to unlock tty: %m");
/* Remember the terminal attributes of stdin and derive a raw variant with
 * ECHO cleared. */
1189 if (tcgetattr(STDIN_FILENO, &saved_attr) >= 0) {
1190 saved_attr_valid = true;
1192 raw_attr = saved_attr;
1193 cfmakeraw(&raw_attr);
1194 raw_attr.c_lflag &= ~ECHO;
1197 if (socketpair(AF_UNIX, SOCK_DGRAM|SOCK_NONBLOCK|SOCK_CLOEXEC, 0, kmsg_socket_pair) < 0) {
1198 log_error("Failed to create kmsg socket pair: %m");
/* Block SIGCHLD, SIGWINCH, SIGTERM and SIGINT; the same mask is handed to
 * process_pty() further down. */
1202 assert_se(sigemptyset(&mask) == 0);
1203 sigset_add_many(&mask, SIGCHLD, SIGWINCH, SIGTERM, SIGINT, -1);
1204 assert_se(sigprocmask(SIG_BLOCK, &mask, NULL) == 0);
1210 if(pipe2(pipefd, O_NONBLOCK|O_CLOEXEC) < 0) {
1211 log_error("pipe2(): %m");
/* New IPC, mount, PID and UTS namespaces, plus a network namespace when
 * arg_private_network is set. */
1215 pid = syscall(__NR_clone, SIGCHLD|CLONE_NEWIPC|CLONE_NEWNS|CLONE_NEWPID|CLONE_NEWUTS|(arg_private_network ? CLONE_NEWNET : 0), NULL);
1217 if (errno == EINVAL)
1218 log_error("clone() failed, do you have namespace support enabled in your kernel? (You need UTS, IPC, PID and NET namespacing built in): %m");
1220 log_error("clone() failed: %m");
/* Presumably the start of the child branch (the condition itself is not
 * visible in this excerpt). */
1227 const char *home = NULL;
1228 uid_t uid = (uid_t) -1;
1229 gid_t gid = (gid_t) -1;
/* Environment for the payload; the NULL slots are filled in with asprintf()
 * further down. */
1231 const char *envp[] = {
1232 "PATH=/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin",
1233 "container=systemd-nspawn", /* LXC sets container=lxc, so follow the scheme here */
1238 NULL, /* container_uuid */
1239 NULL, /* LISTEN_FDS */
1240 NULL, /* LISTEN_PID */
/* Slot 2 takes the TERM= entry of our own environment. */
1244 envp[2] = strv_find_prefix(environ, "TERM=");
/* Wait for POLLHUP on the read end of the pipe; the other side closes both
 * ends after logging the PID. */
1247 close_nointr_nofail(pipefd[1]);
1248 fd_wait_for_event(pipefd[0], POLLHUP, -1);
1249 close_nointr_nofail(pipefd[0]);
1251 close_nointr_nofail(master);
1254 if (saved_attr_valid) {
1255 if (tcsetattr(STDIN_FILENO, TCSANOW, &raw_attr) < 0) {
1256 log_error("Failed to set terminal attributes: %m");
/* Drop the inherited stdio descriptors; they are replaced by the console
 * below. */
1261 close_nointr(STDIN_FILENO);
1262 close_nointr(STDOUT_FILENO);
1263 close_nointr(STDERR_FILENO);
1265 close_nointr_nofail(kmsg_socket_pair[0]);
1266 kmsg_socket_pair[0] = -1;
1268 reset_all_signal_handlers();
1270 assert_se(sigemptyset(&mask) == 0);
1271 assert_se(sigprocmask(SIG_SETMASK, &mask, NULL) == 0);
/* The console has to end up as descriptor 0; it is then duplicated onto
 * descriptors 1 and 2. */
1273 k = open_terminal(console, O_RDWR);
1274 if (k != STDIN_FILENO) {
1276 close_nointr_nofail(k);
1280 log_error("Failed to open console: %s", strerror(-k));
1284 if (dup2(STDIN_FILENO, STDOUT_FILENO) != STDOUT_FILENO ||
1285 dup2(STDIN_FILENO, STDERR_FILENO) != STDERR_FILENO) {
1286 log_error("Failed to duplicate console: %m");
1291 log_error("setsid() failed: %m");
/* Ask the kernel to send SIGKILL to this process when its parent dies. */
1295 if (prctl(PR_SET_PDEATHSIG, SIGKILL) < 0) {
1296 log_error("PR_SET_PDEATHSIG failed: %m");
1300 /* Mark everything as slave, so that we still
1301 * receive mounts from the real root, but don't
1302 * propagate mounts to the real root. */
1303 if (mount(NULL, "/", NULL, MS_SLAVE|MS_REC, NULL) < 0) {
1304 log_error("MS_SLAVE|MS_REC failed: %m");
1308 /* Turn directory into bind mount */
1309 if (mount(arg_directory, arg_directory, "bind", MS_BIND|MS_REC, NULL) < 0) {
1310 log_error("Failed to make bind mount: %m");
1315 if (mount(arg_directory, arg_directory, "bind", MS_BIND|MS_REMOUNT|MS_RDONLY|MS_REC, NULL) < 0) {
1316 log_error("Failed to make read-only: %m");
/* Populate the tree: API file systems, device nodes, console, kmsg, boot ID,
 * timezone, resolv.conf and journal. */
1320 if (mount_all(arg_directory) < 0)
1323 if (copy_devnodes(arg_directory) < 0)
1326 dev_setup(arg_directory);
1328 if (setup_dev_console(arg_directory, console) < 0)
1331 if (setup_kmsg(arg_directory, kmsg_socket_pair[1]) < 0)
1334 close_nointr_nofail(kmsg_socket_pair[1]);
1335 kmsg_socket_pair[1] = -1;
1337 if (setup_boot_id(arg_directory) < 0)
1340 if (setup_timezone(arg_directory) < 0)
1343 if (setup_resolv_conf(arg_directory) < 0)
1346 if (setup_journal(arg_directory) < 0)
/* Switch the root: enter the directory, move its mount onto "/", chroot into
 * it and change to the new "/". */
1349 if (chdir(arg_directory) < 0) {
1350 log_error("chdir(%s) failed: %m", arg_directory);
1354 if (mount(arg_directory, "/", NULL, MS_MOVE, NULL) < 0) {
1355 log_error("mount(MS_MOVE) failed: %m");
1359 if (chroot(".") < 0) {
1360 log_error("chroot() failed: %m");
1364 if (chdir("/") < 0) {
1365 log_error("chdir() failed: %m");
1373 if (drop_capabilities() < 0) {
1374 log_error("drop_capabilities() failed: %m");
1380 /* Note that this resolves user names
1381 * inside the container, and hence
1382 * accesses the NSS modules from the
1383 * container and not the host. This is
1386 if (get_user_creds((const char**)&arg_user, &uid, &gid, &home, NULL) < 0) {
1387 log_error("get_user_creds() failed: %m");
1391 if (mkdir_parents_label(home, 0775) < 0) {
1392 log_error("mkdir_parents_label() failed: %m");
1396 if (mkdir_safe_label(home, 0775, uid, gid) < 0) {
1397 log_error("mkdir_safe_label() failed: %m");
1401 if (initgroups((const char*)arg_user, gid) < 0) {
1402 log_error("initgroups() failed: %m");
1406 if (setresgid(gid, gid, gid) < 0) {
1407 log_error("setresgid() failed: %m");
1411 if (setresuid(uid, uid, uid) < 0) {
1412 log_error("setresuid() failed: %m");
1416 /* Reset everything fully to 0, just in case */
1418 if (setgroups(0, NULL) < 0) {
1419 log_error("setgroups() failed: %m");
1423 if (setresgid(0, 0, 0) < 0) {
1424 log_error("setresgid() failed: %m");
1428 if (setresuid(0, 0, 0) < 0) {
1429 log_error("setresuid() failed: %m");
/* Fill the free envp slots: HOME, USER and LOGNAME (defaulting to /root and
 * root), then container_uuid and the LISTEN_* variables. */
1434 if ((asprintf((char**)(envp + n_env++), "HOME=%s", home ? home: "/root") < 0) ||
1435 (asprintf((char**)(envp + n_env++), "USER=%s", arg_user ? arg_user : "root") < 0) ||
1436 (asprintf((char**)(envp + n_env++), "LOGNAME=%s", arg_user ? arg_user : "root") < 0)) {
1442 if (asprintf((char**)(envp + n_env++), "container_uuid=%s", arg_uuid) < 0) {
/* Passed descriptors must survive the exec, so their close-on-exec flag is
 * cleared. */
1448 if (fdset_size(fds) > 0) {
1449 k = fdset_cloexec(fds, false);
1451 log_error("Failed to unset O_CLOEXEC for file descriptors.");
1455 if ((asprintf((char **)(envp + n_env++), "LISTEN_FDS=%u", n_fd_passed) < 0) ||
1456 (asprintf((char **)(envp + n_env++), "LISTEN_PID=%lu", (unsigned long) getpid()) < 0)) {
1468 /* Automatically search for the init system */
/* Build the argument vector: slot 0 is reserved for the binary path, the
 * remaining command-line arguments follow. The three candidate paths are
 * tried in order; execve() only returns on failure. */
1470 l = 1 + argc - optind;
1471 a = newa(char*, l + 1);
1472 memcpy(a + 1, argv + optind, l * sizeof(char*));
1474 a[0] = (char*) "/usr/lib/systemd/systemd";
1475 execve(a[0], a, (char**) envp);
1477 a[0] = (char*) "/lib/systemd/systemd";
1478 execve(a[0], a, (char**) envp);
1480 a[0] = (char*) "/sbin/init";
1481 execve(a[0], a, (char**) envp);
/* An explicit command is looked up via PATH; the bash invocation below is
 * presumably the fallback when no command was given. */
1482 } else if (argc > optind)
1483 execvpe(argv[optind], argv + optind, (char**) envp);
1485 chdir(home ? home : "/root");
1486 execle("/bin/bash", "-bash", NULL, (char**) envp);
1489 log_error("execv() failed: %m");
1492 _exit(EXIT_FAILURE);
/* Closing both pipe ends here produces the POLLHUP that the other process
 * waits for. */
1495 log_info("Init process in the container running as PID %d", pid);
1496 close_nointr_nofail(pipefd[0]);
1497 close_nointr_nofail(pipefd[1]);
1502 if (process_pty(master, pid, &mask) < 0)
/* Restore the terminal before reporting the result. */
1505 if (saved_attr_valid)
1506 tcsetattr(STDIN_FILENO, TCSANOW, &saved_attr);
1508 r = wait_for_terminate(pid, &status);
/* Classify the termination: normal exit (a non-zero status becomes our
 * result), death by SIGINT (logged as shut down), death by SIGHUP (logged as
 * reboot), any other signal or core dump, or something else. */
1514 if (status.si_code == CLD_EXITED) {
1515 if (status.si_status != 0) {
1516 log_error("Container failed with error code %i.", status.si_status);
1517 r = status.si_status;
1521 log_debug("Container exited successfully.");
1523 } else if (status.si_code == CLD_KILLED &&
1524 status.si_status == SIGINT) {
1525 log_info("Container has been shut down.");
1528 } else if (status.si_code == CLD_KILLED &&
1529 status.si_status == SIGHUP) {
1530 log_info("Container is being rebooted.");
1532 } else if (status.si_code == CLD_KILLED ||
1533 status.si_code == CLD_DUMPED) {
1535 log_error("Container terminated by signal %s.", signal_to_string(status.si_status));
1539 log_error("Container failed due to unknown reason.");
/* Cleanup: restore the terminal, close the pty master and the kmsg socket
 * pair, rejoin the original cgroup, kill what is left in the container
 * cgroup, and free the option storage. */
1546 if (saved_attr_valid)
1547 tcsetattr(STDIN_FILENO, TCSANOW, &saved_attr);
1550 close_nointr_nofail(master);
1552 close_pipe(kmsg_socket_pair);
1555 cg_attach(SYSTEMD_CGROUP_CONTROLLER, oldcg, 0);
1558 cg_kill_recursive_and_wait(SYSTEMD_CGROUP_CONTROLLER, newcg, true);
1560 free(arg_directory);
1561 strv_free(arg_controllers);