1 /*-*- Mode: C; c-basic-offset: 8; indent-tabs-mode: nil -*-*/
4 This file is part of systemd.
6 Copyright 2010 Lennart Poettering
8 systemd is free software; you can redistribute it and/or modify it
9 under the terms of the GNU Lesser General Public License as published by
10 the Free Software Foundation; either version 2.1 of the License, or
11 (at your option) any later version.
13 systemd is distributed in the hope that it will be useful, but
14 WITHOUT ANY WARRANTY; without even the implied warranty of
15 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
16 Lesser General Public License for more details.
18 You should have received a copy of the GNU Lesser General Public License
19 along with systemd; If not, see <http://www.gnu.org/licenses/>.
25 #include <sys/types.h>
26 #include <sys/syscall.h>
27 #include <sys/mount.h>
33 #include <sys/prctl.h>
34 #include <sys/capability.h>
36 #include <sys/epoll.h>
38 #include <sys/signalfd.h>
42 #include <sys/socket.h>
44 #include <systemd/sd-daemon.h>
52 #include "cgroup-util.h"
54 #include "path-util.h"
55 #include "loopback-setup.h"
57 #include "dev-setup.h"
/* Policy for linking the container's journal with the host's, chosen with
 * --link-journal= or -j. The enumerator list is not visible in this listing;
 * the values referenced elsewhere in the file are LINK_NO, LINK_AUTO,
 * LINK_GUEST and LINK_HOST. */
62 typedef enum LinkJournal {
/* Command-line settings. The visible assignments are in parse_argv(); the
 * places where arg_uuid and arg_boot are set are not part of this listing. */

/* Root directory of the container (-D); main() falls back to the current
 * working directory when it is NULL. */
69 static char *arg_directory = NULL;
/* User to switch to inside the container (-u); NULL means stay root. */
70 static char *arg_user = NULL;
/* Additional cgroup controllers (-C) in which main() creates the container
 * cgroup. */
71 static char **arg_controllers = NULL;
/* Machine UUID exported to the payload as container_uuid=. */
72 static char *arg_uuid = NULL;
/* Adds CLONE_NEWNET to the clone() flags and is checked at the top of
 * setup_resolv_conf(). */
73 static bool arg_private_network = false;
74 static bool arg_read_only = false;
/* Consulted by process_pty() when a SIGTERM arrives, and reflected in the
 * --boot usage text. */
75 static bool arg_boot = false;
76 static LinkJournal arg_link_journal = LINK_AUTO;
/* Capabilities kept for the container: drop_capabilities() hands the
 * complement of this mask to capability_bounding_set_drop(), and
 * --capability= ORs further bits in. The embedded line numbers skip in a few
 * places, so this listing may not show every default bit. */
77 static uint64_t arg_retain =
79 (1ULL << CAP_DAC_OVERRIDE) |
80 (1ULL << CAP_DAC_READ_SEARCH) |
81 (1ULL << CAP_FOWNER) |
82 (1ULL << CAP_FSETID) |
83 (1ULL << CAP_IPC_OWNER) |
86 (1ULL << CAP_LINUX_IMMUTABLE) |
87 (1ULL << CAP_NET_BIND_SERVICE) |
88 (1ULL << CAP_NET_BROADCAST) |
89 (1ULL << CAP_NET_RAW) |
90 (1ULL << CAP_SETGID) |
91 (1ULL << CAP_SETFCAP) |
92 (1ULL << CAP_SETPCAP) |
93 (1ULL << CAP_SETUID) |
94 (1ULL << CAP_SYS_ADMIN) |
95 (1ULL << CAP_SYS_CHROOT) |
96 (1ULL << CAP_SYS_NICE) |
97 (1ULL << CAP_SYS_PTRACE) |
98 (1ULL << CAP_SYS_TTY_CONFIG) |
99 (1ULL << CAP_SYS_RESOURCE) |
100 (1ULL << CAP_SYS_BOOT) |
101 (1ULL << CAP_AUDIT_WRITE) |
102 (1ULL << CAP_AUDIT_CONTROL);
/* Prints the usage summary, with program_invocation_short_name substituted
 * for the leading %s. The end of the function (return statement and closing
 * brace) is not part of this listing.
 *
 * The -j line must agree with parse_argv(): the only stand-alone assignment
 * to arg_link_journal there stores LINK_GUEST, so -j is described as the
 * short form of --link-journal=guest. */
104 static int help(void) {
106 printf("%s [OPTIONS...] [PATH] [ARGUMENTS...]\n\n"
107 "Spawn a minimal namespace container for debugging, testing and building.\n\n"
108 " -h --help Show this help\n"
109 " --version Print version string\n"
110 " -D --directory=NAME Root directory for the container\n"
111 " -b --boot Boot up full system (i.e. invoke init)\n"
112 " -u --user=USER Run the command under specified user or uid\n"
113 " -C --controllers=LIST Put the container in specified comma-separated cgroup hierarchies\n"
114 " --uuid=UUID Set a specific machine UUID for the container\n"
115 " --private-network Disable network in container\n"
116 " --read-only Mount the root directory read-only\n"
117 " --capability=CAP In addition to the default, retain specified capability\n"
118 " --link-journal=MODE Link up guest journal, one of no, auto, guest, host\n"
119 " -j Equivalent to --link-journal=guest\n",
120 program_invocation_short_name);
/* Parses the command line with getopt_long() and stores the results in the
 * arg_* globals. Short options are -h, -D <arg>, -u <arg>, -C <arg>, -b and
 * -j; the leading '+' in the option string makes getopt stop at the first
 * non-option argument. Long-only options are identified by ARG_* codes.
 *
 * Most case labels, break statements and return statements are missing from
 * this listing, so the notes below describe only the visible statements. */
125 static int parse_argv(int argc, char *argv[]) {
136 static const struct option options[] = {
137 { "help", no_argument, NULL, 'h' },
138 { "version", no_argument, NULL, ARG_VERSION },
139 { "directory", required_argument, NULL, 'D' },
140 { "user", required_argument, NULL, 'u' },
141 { "controllers", required_argument, NULL, 'C' },
142 { "private-network", no_argument, NULL, ARG_PRIVATE_NETWORK },
143 { "boot", no_argument, NULL, 'b' },
144 { "uuid", required_argument, NULL, ARG_UUID },
145 { "read-only", no_argument, NULL, ARG_READ_ONLY },
146 { "capability", required_argument, NULL, ARG_CAPABILITY },
147 { "link-journal", required_argument, NULL, ARG_LINK_JOURNAL },
156 while ((c = getopt_long(argc, argv, "+hD:u:C:bj", options, NULL)) >= 0) {
/* Version output: PACKAGE_STRING followed by SYSTEMD_FEATURES. */
165 puts(PACKAGE_STRING);
166 puts(SYSTEMD_FEATURES);
/* Root directory: keep a canonicalized copy of the argument. */
171 arg_directory = canonicalize_file_name(optarg);
172 if (!arg_directory) {
173 log_error("Failed to canonicalize root directory.");
/* User: keep a private copy of the argument. */
181 if (!(arg_user = strdup(optarg))) {
182 log_error("Failed to duplicate user name.");
/* Controllers: replace any earlier list with the comma-separated argument
 * and drop duplicate entries. */
189 strv_free(arg_controllers);
190 arg_controllers = strv_split(optarg, ",");
191 if (!arg_controllers) {
192 log_error("Failed to split controllers list.");
195 strv_uniq(arg_controllers);
199 case ARG_PRIVATE_NETWORK:
200 arg_private_network = true;
212 arg_read_only = true;
215 case ARG_CAPABILITY: {
/* Each comma-separated word is copied into t, translated with
 * cap_from_name(), and the matching bit is added to arg_retain. */
219 FOREACH_WORD_SEPARATOR(word, length, optarg, ",", state) {
/* NOTE(review): no check of the strndup() result is visible here; confirm
 * that t cannot be NULL when it reaches cap_from_name(). */
223 t = strndup(word, length);
227 if (cap_from_name(t, &cap) < 0) {
228 log_error("Failed to parse capability %s.", t);
234 arg_retain |= 1ULL << (uint64_t) cap;
/* NOTE(review): presumably the handler for -j (its case label is not
 * visible). It selects LINK_GUEST; the usage text in help() has to describe
 * -j the same way. */
241 arg_link_journal = LINK_GUEST;
/* --link-journal= accepts exactly "auto", "no", "guest" and "host"; anything
 * else is reported as a parse failure. */
244 case ARG_LINK_JOURNAL:
245 if (streq(optarg, "auto"))
246 arg_link_journal = LINK_AUTO;
247 else if (streq(optarg, "no"))
248 arg_link_journal = LINK_NO;
249 else if (streq(optarg, "guest"))
250 arg_link_journal = LINK_GUEST;
251 else if (streq(optarg, "host"))
252 arg_link_journal = LINK_HOST;
254 log_error("Failed to parse link journal mode %s", optarg);
264 log_error("Unknown option code %c", c);
/* Mounts the file systems listed in mount_table beneath dest. Each entry's
 * target path is built as "<dest>/<where>". A failing mount() is only
 * reported when the entry is marked fatal.
 *
 * Initializer columns appear to be: what, where, type, options, flags, fatal.
 * The MountPoint field list is not visible in this listing, so confirm the
 * order against the struct definition. */
272 static int mount_all(const char *dest) {
274 typedef struct MountPoint {
283 static const MountPoint mount_table[] = {
284 { "proc", "/proc", "proc", NULL, MS_NOSUID|MS_NOEXEC|MS_NODEV, true },
285 { "/proc/sys", "/proc/sys", NULL, NULL, MS_BIND, true }, /* Bind mount first */
286 { NULL, "/proc/sys", NULL, NULL, MS_BIND|MS_RDONLY|MS_REMOUNT, true }, /* Then, make it r/o */
287 { "sysfs", "/sys", "sysfs", NULL, MS_RDONLY|MS_NOSUID|MS_NOEXEC|MS_NODEV, true },
288 { "tmpfs", "/dev", "tmpfs", "mode=755", MS_NOSUID|MS_STRICTATIME, true },
289 { "/dev/pts", "/dev/pts", NULL, NULL, MS_BIND, true },
290 { "tmpfs", "/dev/shm", "tmpfs", "mode=1777", MS_NOSUID|MS_NODEV|MS_STRICTATIME, true },
291 { "tmpfs", "/run", "tmpfs", "mode=755", MS_NOSUID|MS_NODEV|MS_STRICTATIME, true },
293 { "/sys/fs/selinux", "/sys/fs/selinux", NULL, NULL, MS_BIND, false }, /* Bind mount first */
294 { NULL, "/sys/fs/selinux", NULL, NULL, MS_BIND|MS_RDONLY|MS_REMOUNT, false }, /* Then, make it r/o */
301 for (k = 0; k < ELEMENTSOF(mount_table); k++) {
/* where is released automatically at the end of each iteration. */
302 char _cleanup_free_ *where = NULL;
305 if (asprintf(&where, "%s/%s", dest, mount_table[k].where) < 0) {
/* t < 0 is an error code from the check, t > 0 means "already a mount
 * point". */
314 t = path_is_mount_point(where, true);
316 log_error("Failed to detect whether %s is a mount point: %s", where, strerror(-t));
/* Entries with a NULL source are the remount steps of the table; every other
 * entry is skipped when its target is already mounted. */
324 /* Skip this entry if it is not a remount. */
325 if (mount_table[k].what && t > 0)
/* Make sure the mount point exists; the result is not checked here. */
328 mkdir_p_label(where, 0755);
330 if (mount(mount_table[k].what,
333 mount_table[k].flags,
334 mount_table[k].options) < 0 &&
335 mount_table[k].fatal) {
337 log_error("mount(%s) failed: %m", where);
/* Points the container's /etc/localtime symlink at the same zone the host
 * uses, when that is possible. Warnings are logged when the host link cannot
 * be interpreted or the zone file is absent in the container; only a failing
 * symlink() is logged as an error. Return statements are not part of this
 * listing. */
347 static int setup_timezone(const char *dest) {
/* All temporaries are freed automatically on return. */
348 _cleanup_free_ char *where = NULL, *p = NULL, *q = NULL, *check = NULL, *what = NULL;
354 /* Fix the timezone, if possible */
355 r = readlink_malloc("/etc/localtime", &p);
357 log_warning("/etc/localtime is not a symlink, not updating container timezone.");
/* z is the zone name: the part of the host link target after the zoneinfo
 * prefix, tried first in relative and then in absolute form. */
361 z = path_startswith(p, "../usr/share/zoneinfo/");
363 z = path_startswith(p, "/usr/share/zoneinfo/");
365 log_warning("/etc/localtime does not point into /usr/share/zoneinfo/, not updating container timezone.");
369 where = strappend(dest, "/etc/localtime");
/* y is the zone name the container's link currently resolves to, if any. */
373 r = readlink_malloc(where, &q);
375 y = path_startswith(q, "../usr/share/zoneinfo/");
377 y = path_startswith(q, "/usr/share/zoneinfo/");
380 /* Already pointing to the right place? Then do nothing .. */
381 if (y && streq(y, z))
/* The zone file has to exist inside the container tree before linking. */
385 check = strjoin(dest, "/usr/share/zoneinfo/", z, NULL);
389 if (access(check, F_OK) < 0) {
390 log_warning("Timezone %s does not exist in container, not updating container timezone.", z);
/* The new link target is relative, so it resolves inside the container. */
394 what = strappend("../usr/share/zoneinfo/", z);
399 if (symlink(what, where) < 0) {
400 log_error("Failed to correct timezone of container: %m");
/* Makes the host's /etc/resolv.conf visible inside the container by bind
 * mounting it over "<dest>/etc/resolv.conf" and then remounting that bind
 * mount read-only. Both mount() results are deliberately not treated as
 * errors. */
407 static int setup_resolv_conf(const char *dest) {
/* Checked first; the statement guarded by this test is not visible in this
 * listing (presumably an early return, since a private network has no use
 * for the host's resolver configuration). */
412 if (arg_private_network)
415 /* Fix resolv.conf, if possible */
416 where = strappend(dest, "/etc/resolv.conf");
420 /* We don't really care for the results of this really. If it
421 * fails, it fails, but meh... */
422 if (mount("/etc/resolv.conf", where, "bind", MS_BIND, NULL) >= 0)
423 mount("/etc/resolv.conf", where, "bind", MS_BIND|MS_REMOUNT|MS_RDONLY, NULL);
/* Gives the container its own boot ID: a random 128-bit ID is formatted as a
 * UUID string, written to a scratch file under "<dest>/dev" and bind mounted
 * over "<dest>/proc/sys/kernel/random/boot_id". */
430 static int setup_boot_id(const char *dest) {
/* Both paths are freed automatically on return. */
431 char _cleanup_free_ *from = NULL, *to = NULL;
438 /* Generate a new randomized boot ID, so that each boot-up of
439 * the container gets a new one */
441 from = strappend(dest, "/dev/proc-sys-kernel-random-boot-id");
442 to = strappend(dest, "/proc/sys/kernel/random/boot_id");
446 r = sd_id128_randomize(&rnd);
448 log_error("Failed to generate random boot id: %s", strerror(-r));
/* Render the 16 bytes in 8-4-4-4-12 UUID notation and force termination of
 * the buffer. */
452 snprintf(as_uuid, sizeof(as_uuid),
453 "%02x%02x%02x%02x-%02x%02x-%02x%02x-%02x%02x-%02x%02x%02x%02x%02x%02x",
454 SD_ID128_FORMAT_VAL(rnd));
455 char_array_0(as_uuid);
457 r = write_one_line_file(from, as_uuid);
459 log_error("Failed to write boot id: %s", strerror(-r));
463 if (mount(from, to, "bind", MS_BIND, NULL) < 0) {
464 log_error("Failed to bind mount boot id: %m");
/* Best effort: the result of the read-only remount is not checked. */
467 mount(from, to, "bind", MS_BIND|MS_REMOUNT|MS_RDONLY, NULL);
/* Recreates a fixed set of host device nodes inside the container: for every
 * name in the NUL-separated devnodes list (its contents are not part of this
 * listing) the node "/dev/<name>" is examined and a node with the same mode
 * and device number is created at "<dest>/dev/<name>".
 *
 * A host node that does not exist (ENOENT) is not reported; any other stat()
 * failure, a source that is neither a character nor a block device, and a
 * failing mknod() are logged as errors. */
473 static int copy_devnodes(const char *dest) {
475 static const char devnodes[] =
/* u carries the _cleanup_umask_ attribute; the statement that sets the umask
 * is not visible in this listing. */
486 mode_t _cleanup_umask_ u;
492 NULSTR_FOREACH(d, devnodes) {
/* Both paths are freed automatically at the end of each iteration. */
494 char _cleanup_free_ *from = NULL, *to = NULL;
/* NOTE(review): the asprintf() results are not checked on these lines;
 * confirm that a later test covers allocation failure. */
496 asprintf(&from, "/dev/%s", d);
497 asprintf(&to, "%s/dev/%s", dest, d);
508 if (stat(from, &st) < 0) {
510 if (errno != ENOENT) {
511 log_error("Failed to stat %s: %m", from);
516 } else if (!S_ISCHR(st.st_mode) && !S_ISBLK(st.st_mode)) {
518 log_error("%s is not a char or block device, cannot copy", from);
522 } else if (mknod(to, st.st_mode, st.st_rdev) < 0) {
/* Name the node that could not be created, not the container root. */
524 log_error("mknod(%s) failed: %m", to);
/* Installs the pty named by console as the container's /dev/console: the
 * console must be a character device; its access mode and ownership are
 * corrected, a device node is created at "<dest>/dev/console", and the
 * console is bind mounted over that node.
 *
 * NOTE(review): in this listing the explanatory comment that starts with
 * "We need to bind mount" has lost its closing line, so the mknod() and
 * mount() statements after it are lexically part of the comment. The
 * terminator has to be restored before this code can compile. */
533 static int setup_dev_console(const char *dest, const char *console) {
/* Freed automatically on return. */
535 char _cleanup_free_ *to = NULL;
537 mode_t _cleanup_umask_ u;
544 if (stat(console, &st) < 0) {
545 log_error("Failed to stat %s: %m", console);
548 } else if (!S_ISCHR(st.st_mode)) {
549 log_error("/dev/console is not a char device");
/* Restrict the tty to mode 0600, owned by uid 0 and gid 0. */
553 r = chmod_and_chown(console, 0600, 0, 0);
555 log_error("Failed to correct access mode for TTY: %s", strerror(-r));
559 if (asprintf(&to, "%s/dev/console", dest) < 0)
562 /* We need to bind mount the right tty to /dev/console since
563 * ptys can only exist on pts file systems. To have something
564 * to bind mount things on we create a device node first, that
565 * has the right major/minor (note that the major minor
566 * doesn't actually matter here, since we mount it over
569 if (mknod(to, (st.st_mode & ~07777) | 0600, st.st_rdev) < 0) {
570 log_error("mknod() for /dev/console failed: %m");
574 if (mount(console, to, "bind", MS_BIND, NULL) < 0) {
575 log_error("Bind mount for /dev/console failed: %m");
/* Provides a stand-in for the kernel log inside the container: a FIFO is
 * created at "<dest>/dev/kmsg", bind mounted onto "<dest>/proc/kmsg", opened,
 * and the resulting descriptor is sent over kmsg_socket as an SCM_RIGHTS
 * control message. kmsg_socket must be a valid descriptor (asserted). */
582 static int setup_kmsg(const char *dest, int kmsg_socket) {
/* Both paths are freed automatically on return. */
583 char _cleanup_free_ *from = NULL, *to = NULL;
585 mode_t _cleanup_umask_ u;
/* Control-message storage, sized for exactly one int (the descriptor). The
 * enclosing declaration of `control` is not visible in this listing. */
587 struct cmsghdr cmsghdr;
588 uint8_t buf[CMSG_SPACE(sizeof(int))];
591 struct cmsghdr *cmsg;
594 assert(kmsg_socket >= 0);
598 /* We create the kmsg FIFO as /dev/kmsg, but immediately
599 * delete it after bind mounting it to /proc/kmsg. While FIFOs
600 * on the reading side behave very similar to /proc/kmsg,
601 * their writing side behaves differently from /dev/kmsg in
602 * that writing blocks when nothing is reading. In order to
603 * avoid any problems with containers deadlocking due to this
604 * we simply make /dev/kmsg unavailable to the container. */
605 if (asprintf(&from, "%s/dev/kmsg", dest) < 0 ||
606 asprintf(&to, "%s/proc/kmsg", dest) < 0)
609 if (mkfifo(from, 0600) < 0) {
610 log_error("mkfifo() for /dev/kmsg failed: %m");
/* Force mode 0600 and ownership uid 0 / gid 0 on the FIFO. */
614 r = chmod_and_chown(from, 0600, 0, 0);
616 log_error("Failed to correct access mode for /dev/kmsg: %s", strerror(-r));
620 if (mount(from, to, "bind", MS_BIND, NULL) < 0) {
621 log_error("Bind mount for /proc/kmsg failed: %m");
/* Opened for reading and writing, non-blocking and close-on-exec. */
625 fd = open(from, O_RDWR|O_NDELAY|O_CLOEXEC);
627 log_error("Failed to open fifo: %m");
/* Build a message whose only content is one SCM_RIGHTS control record
 * carrying fd. */
634 mh.msg_control = &control;
635 mh.msg_controllen = sizeof(control);
637 cmsg = CMSG_FIRSTHDR(&mh);
638 cmsg->cmsg_level = SOL_SOCKET;
639 cmsg->cmsg_type = SCM_RIGHTS;
640 cmsg->cmsg_len = CMSG_LEN(sizeof(int));
641 memcpy(CMSG_DATA(cmsg), &fd, sizeof(int));
/* Shrink the advertised control length to the single record just filled in. */
643 mh.msg_controllen = cmsg->cmsg_len;
645 /* Store away the fd in the socket, so that it stays open as
646 * long as we run the child */
647 k = sendmsg(kmsg_socket, &mh, MSG_DONTWAIT|MSG_NOSIGNAL);
/* The local descriptor is closed right after the send attempt, before the
 * result in k is examined. */
648 close_nointr_nofail(fd);
651 log_error("Failed to send FIFO fd: %m");
/* The statement that carries out the step announced below is not part of
 * this listing. */
655 /* And now make the FIFO unavailable as /dev/kmsg... */
/* Sets the host name seen inside the container. The name starts out as the
 * last path component of arg_directory and is passed through
 * hostname_cleanup() before sethostname() is called. The statements between
 * the visible lines (for example how hn becomes writable) are not part of
 * this listing. */
660 static int setup_hostname(void) {
664 hn = path_get_file_name(arg_directory);
670 hostname_cleanup(hn);
673 if (sethostname(hn, strlen(hn)) < 0)
/* Connects the journal directory of the host and of the container according
 * to arg_link_journal. Nothing is done for LINK_NO. Otherwise the machine ID
 * is read from "<directory>/etc/machine-id" and two paths are derived:
 *   p = "/var/log/journal/<id>"             (host side)
 *   q = "<directory>/var/log/journal/<id>"  (guest side)
 * LINK_GUEST makes p a symlink to q. LINK_HOST bind mounts p onto q. What
 * LINK_AUTO resolves to depends on branches whose bodies are only partly
 * visible in this listing.
 *
 * NOTE(review): several messages below print errno through %m although the
 * failing helper reports its result in r (mkdir_p(),
 * readlink_and_make_absolute()); confirm that errno is meaningful at those
 * points. */
682 static int setup_journal(const char *directory) {
683 sd_id128_t machine_id;
/* All four strings are freed automatically on return. */
684 char _cleanup_free_ *p = NULL, *b = NULL, *q = NULL, *d = NULL;
688 if (arg_link_journal == LINK_NO)
691 p = strappend(directory, "/etc/machine-id");
/* A missing machine-id file is tolerated only in LINK_AUTO mode. */
695 r = read_one_line_file(p, &b);
696 if (r == -ENOENT && arg_link_journal == LINK_AUTO)
699 log_error("Failed to read machine ID from %s: %s", p, strerror(-r));
/* The same holds for an empty ID. */
704 if (isempty(id) && arg_link_journal == LINK_AUTO)
707 /* Verify validity */
708 r = sd_id128_from_string(id, &machine_id);
710 log_error("Failed to parse machine ID from %s: %s", p, strerror(-r));
/* From here on p no longer names the machine-id file but the host journal
 * directory. */
715 p = strappend("/var/log/journal/", id);
716 q = strjoin(directory, "/var/log/journal/", id, NULL);
/* Neither side may already be a mount point; outside LINK_AUTO that is an
 * error. */
720 if (path_is_mount_point(p, false) > 0) {
721 if (arg_link_journal != LINK_AUTO) {
722 log_error("%s: already a mount point, refusing to use for journal", p);
729 if (path_is_mount_point(q, false) > 0) {
730 if (arg_link_journal != LINK_AUTO) {
731 log_error("%s: already a mount point, refusing to use for journal", q);
/* Inspect what currently sits at the host path: the branches below
 * distinguish an existing symlink (target in d), -EINVAL and -ENOENT. */
738 r = readlink_and_make_absolute(p, &d);
740 if ((arg_link_journal == LINK_GUEST ||
741 arg_link_journal == LINK_AUTO) &&
744 r = mkdir_p(q, 0755);
746 log_warning("failed to create directory %s: %m", q);
751 log_error("Failed to remove symlink %s: %m", p);
754 } else if (r == -EINVAL) {
756 if (arg_link_journal == LINK_GUEST &&
759 if (errno == ENOTDIR) {
760 log_error("%s already exists and is neither a symlink nor a directory", p);
763 log_error("Failed to remove %s: %m", p);
767 } else if (r != -ENOENT) {
768 log_error("readlink(%s) failed: %m", p);
/* Guest mode: the host path becomes a symlink into the container tree, and
 * the guest directory is created; failure to create it is only a warning. */
772 if (arg_link_journal == LINK_GUEST) {
774 if (symlink(q, p) < 0) {
775 log_error("Failed to symlink %s to %s: %m", q, p);
779 r = mkdir_p(q, 0755);
781 log_warning("failed to create directory %s: %m", q);
/* Host mode creates the host directory when needed. */
785 if (arg_link_journal == LINK_HOST) {
786 r = mkdir_p(p, 0755);
788 log_error("Failed to create %s: %m", p);
792 } else if (access(p, F_OK) < 0)
/* The error branch is taken when dir_is_empty(q) returns 0, that is, when
 * the guest directory exists and has entries. */
795 if (dir_is_empty(q) == 0) {
796 log_error("%s not empty.", q);
800 r = mkdir_p(q, 0755);
802 log_error("Failed to create %s: %m", q);
806 if (mount(p, q, "bind", MS_BIND, NULL) < 0) {
807 log_error("Failed to bind mount journal from host into guest: %m");
/* Drops every capability that is not set in arg_retain by passing the
 * complement of that mask, together with false as second argument, to
 * capability_bounding_set_drop(), and returns that helper's result. */
814 static int drop_capabilities(void) {
815 return capability_bounding_set_drop(~arg_retain, false);
/* Heuristic test whether path holds an operating system tree, based on
 * "<path>/bin/sh". Returns 1 when the check result r is non-negative and 0
 * when it is negative. The statement that computes r and the handling of an
 * asprintf() failure are not part of this listing. */
818 static int is_os_tree(const char *path) {
821 /* We use /bin/sh as flag file if something is an OS */
823 if (asprintf(&p, "%s/bin/sh", path) < 0)
829 return r < 0 ? 0 : 1;
/* Relays data between the caller's terminal and the container's pty:
 * bytes read from stdin are written to master, bytes read from master are
 * written to stdout. Signals from mask are received through a signalfd:
 * SIGWINCH propagates the window size to the pty, and the first SIGTERM in
 * --boot mode asks the process pid to shut down.
 *
 * master is the pty master descriptor, pid the container's main process and
 * mask the set of signals to receive. The conditions that end the main loop
 * and the return statements are not part of this listing. */
832 static int process_pty(int master, pid_t pid, sigset_t *mask) {
/* One staging buffer per direction; the *_full counters hold the number of
 * bytes currently waiting in each buffer. */
834 char in_buffer[LINE_MAX], out_buffer[LINE_MAX];
835 size_t in_buffer_full = 0, out_buffer_full = 0;
836 struct epoll_event stdin_ev, stdout_ev, master_ev, signal_ev;
/* Readiness is latched in these flags because the descriptors are registered
 * with EPOLLET. */
837 bool stdin_readable = false, stdout_writable = false, master_readable = false, master_writable = false;
838 int ep = -1, signal_fd = -1, r;
839 bool tried_orderly_shutdown = false;
/* All three data descriptors are switched to non-blocking mode. */
845 fd_nonblock(STDIN_FILENO, 1);
846 fd_nonblock(STDOUT_FILENO, 1);
847 fd_nonblock(master, 1);
849 signal_fd = signalfd(-1, mask, SFD_NONBLOCK|SFD_CLOEXEC);
851 log_error("signalfd(): %m");
856 ep = epoll_create1(EPOLL_CLOEXEC);
858 log_error("Failed to create epoll: %m");
863 /* We read from STDIN only if this is actually a TTY,
864 * otherwise we assume non-interactivity. */
865 if (isatty(STDIN_FILENO)) {
867 stdin_ev.events = EPOLLIN|EPOLLET;
868 stdin_ev.data.fd = STDIN_FILENO;
870 if (epoll_ctl(ep, EPOLL_CTL_ADD, STDIN_FILENO, &stdin_ev) < 0) {
871 log_error("Failed to register STDIN in epoll: %m");
878 stdout_ev.events = EPOLLOUT|EPOLLET;
879 stdout_ev.data.fd = STDOUT_FILENO;
882 master_ev.events = EPOLLIN|EPOLLOUT|EPOLLET;
883 master_ev.data.fd = master;
/* The signalfd is registered without EPOLLET. */
886 signal_ev.events = EPOLLIN;
887 signal_ev.data.fd = signal_fd;
889 if (epoll_ctl(ep, EPOLL_CTL_ADD, STDOUT_FILENO, &stdout_ev) < 0) {
890 if (errno != EPERM) {
891 log_error("Failed to register stdout in epoll: %m");
895 /* stdout without epoll support. Likely redirected to regular file. */
896 stdout_writable = true;
899 if (epoll_ctl(ep, EPOLL_CTL_ADD, master, &master_ev) < 0 ||
900 epoll_ctl(ep, EPOLL_CTL_ADD, signal_fd, &signal_ev) < 0) {
901 log_error("Failed to register fds in epoll: %m");
907 struct epoll_event ev[16];
/* Wait without timeout; EINTR and EAGAIN are not treated as failures. */
911 nfds = epoll_wait(ep, ev, ELEMENTSOF(ev), -1);
914 if (errno == EINTR || errno == EAGAIN)
917 log_error("epoll_wait(): %m");
/* First pass: translate the reported events into readiness flags. A hang-up
 * counts as readiness so that the following read or write reports it. */
924 for (i = 0; i < nfds; i++) {
925 if (ev[i].data.fd == STDIN_FILENO) {
927 if (ev[i].events & (EPOLLIN|EPOLLHUP))
928 stdin_readable = true;
930 } else if (ev[i].data.fd == STDOUT_FILENO) {
932 if (ev[i].events & (EPOLLOUT|EPOLLHUP))
933 stdout_writable = true;
935 } else if (ev[i].data.fd == master) {
937 if (ev[i].events & (EPOLLIN|EPOLLHUP))
938 master_readable = true;
940 if (ev[i].events & (EPOLLOUT|EPOLLHUP))
941 master_writable = true;
943 } else if (ev[i].data.fd == signal_fd) {
944 struct signalfd_siginfo sfsi;
947 n = read(signal_fd, &sfsi, sizeof(sfsi));
948 if (n != sizeof(sfsi)) {
951 log_error("Failed to read from signalfd: invalid block size");
956 if (errno != EINTR && errno != EAGAIN) {
957 log_error("Failed to read from signalfd: %m");
963 if (sfsi.ssi_signo == SIGWINCH) {
966 /* The window size changed, let's forward that. */
967 if (ioctl(STDIN_FILENO, TIOCGWINSZ, &ws) >= 0)
968 ioctl(master, TIOCSWINSZ, &ws);
/* Only the first SIGTERM in --boot mode takes this branch; how later ones
 * are handled is not visible in this listing. */
969 } else if (sfsi.ssi_signo == SIGTERM && arg_boot && !tried_orderly_shutdown) {
971 log_info("Trying to halt container. Send SIGTERM again to trigger immediate termination.");
973 /* This only works for systemd... */
974 tried_orderly_shutdown = true;
975 kill(pid, SIGRTMIN+3);
/* Second pass: keep moving data for as long as one of the four transfers can
 * make progress (fill an empty buffer from a readable source, or drain a
 * non-empty buffer into a writable sink). */
985 while ((stdin_readable && in_buffer_full <= 0) ||
986 (master_writable && in_buffer_full > 0) ||
987 (master_readable && out_buffer_full <= 0) ||
988 (stdout_writable && out_buffer_full > 0)) {
/* stdin -> in_buffer */
990 if (stdin_readable && in_buffer_full < LINE_MAX) {
992 k = read(STDIN_FILENO, in_buffer + in_buffer_full, LINE_MAX - in_buffer_full);
/* These errno values only clear the readiness flag; anything else is
 * logged. The same pattern repeats for the three transfers below. */
995 if (errno == EAGAIN || errno == EPIPE || errno == ECONNRESET || errno == EIO)
996 stdin_readable = false;
998 log_error("read(): %m");
1003 in_buffer_full += (size_t) k;
/* in_buffer -> master */
1006 if (master_writable && in_buffer_full > 0) {
1008 k = write(master, in_buffer, in_buffer_full);
1011 if (errno == EAGAIN || errno == EPIPE || errno == ECONNRESET || errno == EIO)
1012 master_writable = false;
1014 log_error("write(): %m");
/* Shift the unwritten remainder to the front of the buffer. */
1020 assert(in_buffer_full >= (size_t) k);
1021 memmove(in_buffer, in_buffer + k, in_buffer_full - k);
1022 in_buffer_full -= k;
/* master -> out_buffer */
1026 if (master_readable && out_buffer_full < LINE_MAX) {
1028 k = read(master, out_buffer + out_buffer_full, LINE_MAX - out_buffer_full);
1031 if (errno == EAGAIN || errno == EPIPE || errno == ECONNRESET || errno == EIO)
1032 master_readable = false;
1034 log_error("read(): %m");
1039 out_buffer_full += (size_t) k;
/* out_buffer -> stdout */
1042 if (stdout_writable && out_buffer_full > 0) {
1044 k = write(STDOUT_FILENO, out_buffer, out_buffer_full);
1047 if (errno == EAGAIN || errno == EPIPE || errno == ECONNRESET || errno == EIO)
1048 stdout_writable = false;
1050 log_error("write(): %m");
1056 assert(out_buffer_full >= (size_t) k);
1057 memmove(out_buffer, out_buffer + k, out_buffer_full - k);
1058 out_buffer_full -= k;
/* Cleanup of the two descriptors created by this function. */
1066 close_nointr_nofail(ep);
1069 close_nointr_nofail(signal_fd);
/* Program entry point. In order, the visible statements:
 *   - parse the command line and settle on an absolute container directory;
 *   - refuse to run unless root, on a systemd system, on a directory other
 *     than "/" that passes is_os_tree();
 *   - collect descriptors passed in by the service manager;
 *   - create a cgroup "<current>/nspawn-<pid>" and attach to it;
 *   - allocate a pty, switch the terminal to raw mode and block the signals
 *     that process_pty() handles;
 *   - clone() a child in new IPC, mount, PID and UTS namespaces (plus a
 *     network namespace with --private-network);
 *   - child: attach to the pty, prepare mounts and /dev, switch root, drop
 *     capabilities and privileges, build the environment, exec the payload;
 *   - parent: relay the pty, wait for the child, report how it ended, and
 *     clean up.
 * Many lines (braces, labels, declarations, gotos) are missing from this
 * listing, so the notes describe only what is visible. */
1074 int main(int argc, char *argv[]) {
1076 int r = EXIT_FAILURE, k;
1077 char *oldcg = NULL, *newcg = NULL;
1078 char **controller = NULL;
1079 int master = -1, n_fd_passed;
1080 const char *console = NULL;
1081 struct termios saved_attr, raw_attr;
1083 bool saved_attr_valid = false;
/* Socket pair used to hand the kmsg FIFO descriptor from setup_kmsg() in the
 * child to the parent side. */
1085 int kmsg_socket_pair[2] = { -1, -1 };
1088 log_parse_environment();
1091 r = parse_argv(argc, argv);
/* Make a given directory absolute; without one, use the current working
 * directory. */
1095 if (arg_directory) {
1098 p = path_make_absolute_cwd(arg_directory);
1099 free(arg_directory);
1102 arg_directory = get_current_dir_name();
1104 if (!arg_directory) {
1105 log_error("Failed to determine path");
1109 path_kill_slashes(arg_directory);
/* Preconditions. */
1111 if (geteuid() != 0) {
1112 log_error("Need to be root.");
1116 if (sd_booted() <= 0) {
1117 log_error("Not running on a systemd system.");
1121 if (path_equal(arg_directory, "/")) {
1122 log_error("Spawning container on root directory not supported.");
1126 if (is_os_tree(arg_directory) <= 0) {
1127 log_error("Directory %s doesn't look like an OS root directory. Refusing.", arg_directory);
/* Descriptors received through the sd_listen_fds() protocol are gathered in
 * fds so that they can be passed on to the payload. */
1132 n_fd_passed = sd_listen_fds(false);
1133 if (n_fd_passed > 0) {
1134 k = fdset_new_listen_fds(&fds, false);
1136 log_error("Failed to collect file descriptors: %s", strerror(-k));
1140 fdset_close_others(fds);
/* Create a child cgroup of the current one, named after our PID. */
1143 k = cg_get_by_pid(SYSTEMD_CGROUP_CONTROLLER, 0, &oldcg);
1145 log_error("Failed to determine current cgroup: %s", strerror(-k));
1149 if (asprintf(&newcg, "%s/nspawn-%lu", oldcg, (unsigned long) getpid()) < 0) {
1150 log_error("Failed to allocate cgroup path.");
1154 k = cg_create_and_attach(SYSTEMD_CGROUP_CONTROLLER, newcg, 0);
1156 log_error("Failed to create cgroup: %s", strerror(-k));
/* The same group is created in every controller requested with -C; failures
 * here are only warnings. */
1160 STRV_FOREACH(controller, arg_controllers) {
1161 k = cg_create_and_attach(*controller, newcg, 0);
1163 log_warning("Failed to create cgroup in controller %s: %s", *controller, strerror(-k));
/* Allocate the pty that becomes the container's console. */
1166 master = posix_openpt(O_RDWR|O_NOCTTY|O_CLOEXEC|O_NDELAY);
1168 log_error("Failed to acquire pseudo tty: %m");
1172 console = ptsname(master);
1174 log_error("Failed to determine tty name: %m");
1178 log_info("Spawning namespace container on %s (console is %s).", arg_directory, console);
/* Give the pty the window size of our own terminal, if it has one. */
1180 if (ioctl(STDIN_FILENO, TIOCGWINSZ, &ws) >= 0)
1181 ioctl(master, TIOCSWINSZ, &ws);
1183 if (unlockpt(master) < 0) {
1184 log_error("Failed to unlock tty: %m");
/* Remember the current terminal settings and derive raw, non-echoing ones. */
1188 if (tcgetattr(STDIN_FILENO, &saved_attr) >= 0) {
1189 saved_attr_valid = true;
1191 raw_attr = saved_attr;
1192 cfmakeraw(&raw_attr);
1193 raw_attr.c_lflag &= ~ECHO;
1196 if (socketpair(AF_UNIX, SOCK_DGRAM|SOCK_NONBLOCK|SOCK_CLOEXEC, 0, kmsg_socket_pair) < 0) {
1197 log_error("Failed to create kmsg socket pair: %m");
/* Block the signals that process_pty() later receives through its signalfd. */
1201 assert_se(sigemptyset(&mask) == 0);
1202 sigset_add_many(&mask, SIGCHLD, SIGWINCH, SIGTERM, SIGINT, -1);
1203 assert_se(sigprocmask(SIG_BLOCK, &mask, NULL) == 0);
1208 if (saved_attr_valid) {
1209 if (tcsetattr(STDIN_FILENO, TCSANOW, &raw_attr) < 0) {
1210 log_error("Failed to set terminal attributes: %m");
/* Raw clone() system call; SIGCHLD is the child's termination signal. */
1215 pid = syscall(__NR_clone, SIGCHLD|CLONE_NEWIPC|CLONE_NEWNS|CLONE_NEWPID|CLONE_NEWUTS|(arg_private_network ? CLONE_NEWNET : 0), NULL);
1217 if (errno == EINVAL)
1218 log_error("clone() failed, do you have namespace support enabled in your kernel? (You need UTS, IPC, PID and NET namespacing built in): %m");
1220 log_error("clone() failed: %m");
/* ---- Child side ---- */
1228 const char *home = NULL;
1229 uid_t uid = (uid_t) -1;
1230 gid_t gid = (gid_t) -1;
/* Environment for the payload. The NULL slots are filled in below through
 * n_env; several slots of the initializer are not visible in this listing. */
1232 const char *envp[] = {
1233 "PATH=/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin",
1234 "container=systemd-nspawn", /* LXC sets container=lxc, so follow the scheme here */
1239 NULL, /* container_uuid */
1240 NULL, /* LISTEN_FDS */
1241 NULL, /* LISTEN_PID */
/* Slot 2 takes our own TERM= entry, if there is one. */
1245 envp[2] = strv_find_prefix(environ, "TERM=");
/* Detach from the parent's pty master and standard streams. */
1248 close_nointr_nofail(master);
1251 close_nointr(STDIN_FILENO);
1252 close_nointr(STDOUT_FILENO);
1253 close_nointr(STDERR_FILENO);
1255 close_nointr_nofail(kmsg_socket_pair[0]);
1256 kmsg_socket_pair[0] = -1;
1258 reset_all_signal_handlers();
1260 assert_se(sigemptyset(&mask) == 0);
1261 assert_se(sigprocmask(SIG_SETMASK, &mask, NULL) == 0);
/* The console has to come back as descriptor 0 because stdin was closed just
 * above; it is then duplicated onto stdout and stderr. */
1263 k = open_terminal(console, O_RDWR);
1264 if (k != STDIN_FILENO) {
1266 close_nointr_nofail(k);
1270 log_error("Failed to open console: %s", strerror(-k));
1274 if (dup2(STDIN_FILENO, STDOUT_FILENO) != STDOUT_FILENO ||
1275 dup2(STDIN_FILENO, STDERR_FILENO) != STDERR_FILENO) {
1276 log_error("Failed to duplicate console: %m");
1281 log_error("setsid() failed: %m");
/* Have the kernel kill the child when the parent dies. */
1285 if (prctl(PR_SET_PDEATHSIG, SIGKILL) < 0) {
1286 log_error("PR_SET_PDEATHSIG failed: %m");
1290 /* Mark everything as slave, so that we still
1291 * receive mounts from the real root, but don't
1292 * propagate mounts to the real root. */
1293 if (mount(NULL, "/", NULL, MS_SLAVE|MS_REC, NULL) < 0) {
1294 log_error("MS_SLAVE|MS_REC failed: %m");
1298 /* Turn directory into bind mount */
1299 if (mount(arg_directory, arg_directory, "bind", MS_BIND|MS_REC, NULL) < 0) {
1300 log_error("Failed to make bind mount: %m");
1305 if (mount(arg_directory, arg_directory, "bind", MS_BIND|MS_REMOUNT|MS_RDONLY|MS_REC, NULL) < 0) {
1306 log_error("Failed to make read-only: %m");
/* Populate the container tree. Each helper logs its own errors. */
1310 if (mount_all(arg_directory) < 0)
1313 if (copy_devnodes(arg_directory) < 0)
1316 dev_setup(arg_directory);
1318 if (setup_dev_console(arg_directory, console) < 0)
1321 if (setup_kmsg(arg_directory, kmsg_socket_pair[1]) < 0)
1324 close_nointr_nofail(kmsg_socket_pair[1]);
1325 kmsg_socket_pair[1] = -1;
1327 if (setup_boot_id(arg_directory) < 0)
1330 if (setup_timezone(arg_directory) < 0)
1333 if (setup_resolv_conf(arg_directory) < 0)
1336 if (setup_journal(arg_directory) < 0)
/* Switch root: enter the directory, move its mount onto "/", chroot into it
 * and reset the working directory. */
1339 if (chdir(arg_directory) < 0) {
1340 log_error("chdir(%s) failed: %m", arg_directory);
1344 if (mount(arg_directory, "/", NULL, MS_MOVE, NULL) < 0) {
1345 log_error("mount(MS_MOVE) failed: %m");
1349 if (chroot(".") < 0) {
1350 log_error("chroot() failed: %m");
1354 if (chdir("/") < 0) {
1355 log_error("chdir() failed: %m");
1363 if (drop_capabilities() < 0) {
1364 log_error("drop_capabilities() failed: %m");
/* NOTE(review): in this listing the comment that starts on the next line has
 * lost its closing line, so everything up to the end of the "Reset
 * everything fully to 0" comment is lexically inside it. The terminator has
 * to be restored before the user-switching statements take effect. Those
 * statements resolve arg_user, create the home directory, and switch
 * groups, gid and uid with initgroups(), setresgid() and setresuid(). */
1370 /* Note that this resolves user names
1371 * inside the container, and hence
1372 * accesses the NSS modules from the
1373 * container and not the host. This is
1376 if (get_user_creds((const char**)&arg_user, &uid, &gid, &home, NULL) < 0) {
1377 log_error("get_user_creds() failed: %m");
1381 if (mkdir_parents_label(home, 0775) < 0) {
1382 log_error("mkdir_parents_label() failed: %m");
1386 if (mkdir_safe_label(home, 0775, uid, gid) < 0) {
1387 log_error("mkdir_safe_label() failed: %m");
1391 if (initgroups((const char*)arg_user, gid) < 0) {
1392 log_error("initgroups() failed: %m");
1396 if (setresgid(gid, gid, gid) < 0) {
1397 log_error("setresgid() failed: %m");
1401 if (setresuid(uid, uid, uid) < 0) {
1402 log_error("setresuid() failed: %m");
1406 /* Reset everything fully to 0, just in case */
1408 if (setgroups(0, NULL) < 0) {
1409 log_error("setgroups() failed: %m");
1413 if (setresgid(0, 0, 0) < 0) {
1414 log_error("setresgid() failed: %m");
1418 if (setresuid(0, 0, 0) < 0) {
1419 log_error("setresuid() failed: %m");
/* Fill the free envp slots: HOME, USER, LOGNAME, then the optional ones. */
1424 if ((asprintf((char**)(envp + n_env++), "HOME=%s", home ? home: "/root") < 0) ||
1425 (asprintf((char**)(envp + n_env++), "USER=%s", arg_user ? arg_user : "root") < 0) ||
1426 (asprintf((char**)(envp + n_env++), "LOGNAME=%s", arg_user ? arg_user : "root") < 0)) {
1432 if (asprintf((char**)(envp + n_env++), "container_uuid=%s", arg_uuid) < 0) {
/* Passed-in descriptors must survive exec, and the payload is told about
 * them through LISTEN_FDS and LISTEN_PID. */
1438 if (fdset_size(fds) > 0) {
1439 k = fdset_cloexec(fds, false);
1441 log_error("Failed to unset O_CLOEXEC for file descriptors.");
1445 if ((asprintf((char **)(envp + n_env++), "LISTEN_FDS=%u", n_fd_passed) < 0) ||
1446 (asprintf((char **)(envp + n_env++), "LISTEN_PID=%lu", (unsigned long) getpid()) < 0)) {
1458 /* Automatically search for the init system */
/* a[0] is reserved for the init path; the remaining arguments, including
 * argv's terminating NULL, are copied behind it. */
1460 l = 1 + argc - optind;
1461 a = newa(char*, l + 1);
1462 memcpy(a + 1, argv + optind, l * sizeof(char*));
/* Each execve() only returns on failure, so the candidates are tried in
 * order. */
1464 a[0] = (char*) "/usr/lib/systemd/systemd";
1465 execve(a[0], a, (char**) envp);
1467 a[0] = (char*) "/lib/systemd/systemd";
1468 execve(a[0], a, (char**) envp);
1470 a[0] = (char*) "/sbin/init";
1471 execve(a[0], a, (char**) envp);
1472 } else if (argc > optind)
1473 execvpe(argv[optind], argv + optind, (char**) envp);
/* No command given: start bash (as "-bash") from the home directory. */
1475 chdir(home ? home : "/root");
1476 execle("/bin/bash", "-bash", NULL, (char**) envp);
1479 log_error("execv() failed: %m");
1482 _exit(EXIT_FAILURE);
/* ---- Parent side ---- */
1488 if (process_pty(master, pid, &mask) < 0)
1491 if (saved_attr_valid)
1492 tcsetattr(STDIN_FILENO, TCSANOW, &saved_attr);
1494 r = wait_for_terminate(pid, &status);
/* Report how the container ended. A non-zero exit status becomes our own
 * result. */
1500 if (status.si_code == CLD_EXITED) {
1501 if (status.si_status != 0) {
1502 log_error("Container failed with error code %i.", status.si_status);
1503 r = status.si_status;
1507 log_debug("Container exited successfully.");
/* Death by SIGINT is reported as a shutdown, death by SIGHUP as a reboot. */
1509 } else if (status.si_code == CLD_KILLED &&
1510 status.si_status == SIGINT) {
1511 log_info("Container has been shut down.");
1514 } else if (status.si_code == CLD_KILLED &&
1515 status.si_status == SIGHUP) {
1516 log_info("Container is being rebooted.");
1518 } else if (status.si_code == CLD_KILLED ||
1519 status.si_code == CLD_DUMPED) {
1521 log_error("Container terminated by signal %s.", signal_to_string(status.si_status));
1525 log_error("Container failed due to unknown reason.");
/* Cleanup: restore the terminal, close descriptors, move back to the
 * original cgroup, kill whatever is left in the container's cgroup, and
 * release the option strings. */
1532 if (saved_attr_valid)
1533 tcsetattr(STDIN_FILENO, TCSANOW, &saved_attr);
1536 close_nointr_nofail(master);
1538 close_pipe(kmsg_socket_pair);
1541 cg_attach(SYSTEMD_CGROUP_CONTROLLER, oldcg, 0);
1544 cg_kill_recursive_and_wait(SYSTEMD_CGROUP_CONTROLLER, newcg, true);
1546 free(arg_directory);
1547 strv_free(arg_controllers);