1 /*-*- Mode: C; c-basic-offset: 8; indent-tabs-mode: nil -*-*/
4 This file is part of systemd.
6 Copyright 2010 Lennart Poettering
8 systemd is free software; you can redistribute it and/or modify it
9 under the terms of the GNU Lesser General Public License as published by
10 the Free Software Foundation; either version 2.1 of the License, or
11 (at your option) any later version.
13 systemd is distributed in the hope that it will be useful, but
14 WITHOUT ANY WARRANTY; without even the implied warranty of
15 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
16 Lesser General Public License for more details.
18 You should have received a copy of the GNU Lesser General Public License
19 along with systemd; If not, see <http://www.gnu.org/licenses/>.
25 #include <sys/types.h>
26 #include <sys/syscall.h>
27 #include <sys/mount.h>
33 #include <sys/prctl.h>
34 #include <sys/capability.h>
36 #include <sys/epoll.h>
38 #include <sys/signalfd.h>
42 #include <sys/socket.h>
44 #include <systemd/sd-daemon.h>
51 #include "cgroup-util.h"
53 #include "loopback-setup.h"
/* Settings collected from the command line by parse_argv() and read by main(). */

/* Container root directory. parse_argv() stores a strdup() copy of the option
 * argument here; main() may also assign it from get_current_dir_name() and
 * frees it before exiting. */
55 static char *arg_directory = NULL;
/* User name or uid string (see --user in help()); main() passes it to
 * get_user_creds() and initgroups(). */
56 static char *arg_user = NULL;
/* String vector produced by splitting the --controllers argument on ","
 * and running it through strv_uniq(). */
57 static char **arg_controllers = NULL;
/* Set to true for --private-network; main() then adds CLONE_NEWNET to the
 * clone flags. */
58 static bool arg_private_network = false;
/* Prints the usage summary to stdout. The leading %s of the format string is
 * filled with program_invocation_short_name.
 * NOTE(review): the return statement is not visible in this listing. */
60 static int help(void) {
62 printf("%s [OPTIONS...] [PATH] [ARGUMENTS...]\n\n"
63 "Spawn a minimal namespace container for debugging, testing and building.\n\n"
64 " -h --help Show this help\n"
65 " -D --directory=NAME Root directory for the container\n"
66 " -u --user=USER Run the command under specified user or uid\n"
67 " -C --controllers=LIST Put the container in specified comma-separated cgroup hierarchies\n"
68 " --private-network Disable network in container\n",
69 program_invocation_short_name);
/* Parses the command line with getopt_long() and stores the results in the
 * file-scope arg_* variables. main() takes its early-exit branch when the
 * value returned here is <= 0.
 * NOTE(review): the case labels for the short options and the return
 * statements are not visible in this listing. */
74 static int parse_argv(int argc, char *argv[]) {
/* 0x100 lies above every single-character option code, so the long-only
 * option cannot collide with 'h', 'D', 'u' or 'C'. */
77 ARG_PRIVATE_NETWORK = 0x100
80 static const struct option options[] = {
81 { "help", no_argument, NULL, 'h' },
82 { "directory", required_argument, NULL, 'D' },
83 { "user", required_argument, NULL, 'u' },
84 { "controllers", required_argument, NULL, 'C' },
85 { "private-network", no_argument, NULL, ARG_PRIVATE_NETWORK },
/* The leading '+' in the option string makes getopt_long() stop at the first
 * non-option argument, leaving the remaining arguments untouched. */
94 while ((c = getopt_long(argc, argv, "+hD:u:C:", options, NULL)) >= 0) {
104 if (!(arg_directory = strdup(optarg))) {
105 log_error("Failed to duplicate root directory.");
113 if (!(arg_user = strdup(optarg))) {
114 log_error("Failed to duplicate user name.");
/* Drop any list from an earlier occurrence of the option before replacing it. */
121 strv_free(arg_controllers);
122 arg_controllers = strv_split(optarg, ",");
123 if (!arg_controllers) {
124 log_error("Failed to split controllers list.");
127 strv_uniq(arg_controllers);
131 case ARG_PRIVATE_NETWORK:
132 arg_private_network = true;
139 log_error("Unknown option code %c", c);
/* Walks mount_table in order and mounts each entry below the container root
 * given in dest.
 * NOTE(review): the MountPoint field declarations are not visible here; the
 * loop below reads the fields what, where, flags, options and fatal. */
147 static int mount_all(const char *dest) {
149 typedef struct MountPoint {
/* Entries are applied in table order. Each read-only remount directly follows
 * the bind mount it modifies, so the order must be kept. The last column is
 * the fatal flag: only the two selinux entries set it to false. */
158 static const MountPoint mount_table[] = {
159 { "proc", "/proc", "proc", NULL, MS_NOSUID|MS_NOEXEC|MS_NODEV, true },
160 { "/proc/sys", "/proc/sys", "bind", NULL, MS_BIND, true }, /* Bind mount first */
161 { "/proc/sys", "/proc/sys", "bind", NULL, MS_BIND|MS_RDONLY|MS_REMOUNT, true }, /* Then, make it r/o */
162 { "/sys", "/sys", "bind", NULL, MS_BIND, true }, /* Bind mount first */
163 { "/sys", "/sys", "bind", NULL, MS_BIND|MS_RDONLY|MS_REMOUNT, true }, /* Then, make it r/o */
164 { "tmpfs", "/dev", "tmpfs", "mode=755", MS_NOSUID|MS_STRICTATIME, true },
165 { "/dev/pts", "/dev/pts", "bind", NULL, MS_BIND, true },
166 { "tmpfs", "/run", "tmpfs", "mode=755", MS_NOSUID|MS_NODEV|MS_STRICTATIME, true },
168 { "/sys/fs/selinux", "/sys/fs/selinux", "bind", NULL, MS_BIND, false }, /* Bind mount first */
169 { "/sys/fs/selinux", "/sys/fs/selinux", "bind", NULL, MS_BIND|MS_RDONLY|MS_REMOUNT, false }, /* Then, make it r/o */
177 for (k = 0; k < ELEMENTSOF(mount_table); k++) {
/* Target path is dest + "/" + the table's where; since every where already
 * starts with "/", the result contains a doubled slash. */
180 if (asprintf(&where, "%s/%s", dest, mount_table[k].where) < 0) {
181 log_error("Out of memory");
/* A negative result is an error code and is logged via strerror(-t).
 * NOTE(review): how a positive result is handled is not visible here. */
189 t = path_is_mount_point(where, false);
191 log_error("Failed to detect whether %s is a mount point: %s", where, strerror(-t));
/* Create the mount point; the result of mkdir_p() is not checked. */
200 mkdir_p(where, 0755);
/* A failing mount() is only reported when the entry's fatal flag is set. */
202 if (mount(mount_table[k].what,
205 mount_table[k].flags,
206 mount_table[k].options) < 0 &&
207 mount_table[k].fatal) {
209 log_error("mount(%s) failed: %m", where);
/* Makes the host's /etc/localtime and /etc/timezone visible at the same paths
 * below dest. Each file is bind-mounted and, if that succeeded, remounted
 * read-only. Mount failures are not logged here; only allocation failure is.
 * NOTE(review): whether where is freed between its two uses is not visible
 * in this listing. */
221 static int setup_timezone(const char *dest) {
226 /* Fix the timezone, if possible */
227 if (asprintf(&where, "%s/etc/localtime", dest) < 0) {
228 log_error("Out of memory");
/* The remount's own return value is ignored. */
232 if (mount("/etc/localtime", where, "bind", MS_BIND, NULL) >= 0)
233 mount("/etc/localtime", where, "bind", MS_BIND|MS_REMOUNT|MS_RDONLY, NULL);
237 if (asprintf(&where, "%s/etc/timezone", dest) < 0) {
238 log_error("Out of memory");
242 if (mount("/etc/timezone", where, "bind", MS_BIND, NULL) >= 0)
243 mount("/etc/timezone", where, "bind", MS_BIND|MS_REMOUNT|MS_RDONLY, NULL);
/* Recreates device nodes from the host's /dev below <dest>/dev. For every
 * name in the devnodes list the host node is stat()ed and, if it is a
 * character or block device, a node with the same mode and device number is
 * created with mknod().
 * NOTE(review): the contents of the devnodes list and the return statements
 * are not visible in this listing. */
250 static int copy_devnodes(const char *dest) {
252 static const char devnodes[] =
270 NULSTR_FOREACH(d, devnodes) {
272 char *from = NULL, *to = NULL;
/* from is the host node, to is the node to create inside the container. */
274 asprintf(&from, "/dev/%s", d);
275 asprintf(&to, "%s/dev/%s", dest, d);
278 log_error("Failed to allocate devnode path");
291 if (stat(from, &st) < 0) {
/* A node that does not exist on the host (ENOENT) is not reported. */
293 if (errno != ENOENT) {
294 log_error("Failed to stat %s: %m", from);
299 } else if (!S_ISCHR(st.st_mode) && !S_ISBLK(st.st_mode)) {
301 log_error("%s is not a char or block device, cannot copy.", from);
305 } else if (mknod(to, st.st_mode, st.st_rdev) < 0) {
/* Report the path mknod() was called on, not the container root. */
307 log_error("mknod(%s) failed: %m", to);
/* Provides <dest>/dev/console for the container. The tty named by console
 * must be a character device; its access mode is adjusted through
 * chmod_and_chown(console, 0600, 0, 0) (presumably mode, uid, gid; confirm
 * against the helper). A device node with the tty's type and device number
 * and mode 0600 is then created at <dest>/dev/console, and console is
 * bind-mounted on top of it.
 * NOTE(review): return statements are not visible in this listing. */
321 static int setup_dev_console(const char *dest, const char *console) {
332 if (stat(console, &st) < 0) {
333 log_error("Failed to stat %s: %m", console);
337 } else if (!S_ISCHR(st.st_mode)) {
338 log_error("/dev/console is not a char device.");
/* chmod_and_chown() reports failure as a negative error code (see strerror(-r)). */
343 r = chmod_and_chown(console, 0600, 0, 0);
345 log_error("Failed to correct access mode for TTY: %s", strerror(-r));
349 if (asprintf(&to, "%s/dev/console", dest) < 0) {
350 log_error("Out of memory");
355 /* We need to bind mount the right tty to /dev/console since
356 * ptys can only exist on pts file systems. To have something
357 * to bind mount things on we create a device node first, that
358 * has the right major/minor (note that the major minor
359 * doesn't actually matter here, since we mount it over
362 if (mknod(to, (st.st_mode & ~07777) | 0600, st.st_rdev) < 0) {
363 log_error("mknod() for /dev/console failed: %m");
368 if (mount(console, to, "bind", MS_BIND, NULL) < 0) {
369 log_error("Bind mount for /dev/console failed: %m");
/* Gives the container a /proc/kmsg backed by a FIFO. The FIFO is created at
 * <dest>/dev/kmsg, bind-mounted onto <dest>/proc/kmsg, opened, and the open
 * descriptor is sent over kmsg_socket as an SCM_RIGHTS control message.
 * kmsg_socket must be a valid descriptor (asserted).
 * NOTE(review): return statements and the code after the final comment are
 * not visible in this listing. */
381 static int setup_kmsg(const char *dest, int kmsg_socket) {
382 char *from = NULL, *to = NULL;
386 struct cmsghdr cmsghdr;
387 uint8_t buf[CMSG_SPACE(sizeof(int))];
390 struct cmsghdr *cmsg;
393 assert(kmsg_socket >= 0);
397 /* We create the kmsg FIFO as /dev/kmsg, but immediately
398 * delete it after bind mounting it to /proc/kmsg. While FIFOs
399 * on the reading side behave very similar to /proc/kmsg,
400 * their writing side behaves differently from /dev/kmsg in
401 * that writing blocks when nothing is reading. In order to
402 * avoid any problems with containers deadlocking due to this
403 * we simply make /dev/kmsg unavailable to the container. */
404 if (asprintf(&from, "%s/dev/kmsg", dest) < 0) {
405 log_error("Out of memory");
410 if (asprintf(&to, "%s/proc/kmsg", dest) < 0) {
411 log_error("Out of memory");
416 if (mkfifo(from, 0600) < 0) {
417 log_error("mkfifo() for /dev/kmsg failed: %m");
/* chmod_and_chown() reports failure as a negative error code (see strerror(-r)). */
422 r = chmod_and_chown(from, 0600, 0, 0);
424 log_error("Failed to correct access mode for /dev/kmsg: %s", strerror(-r));
428 if (mount(from, to, "bind", MS_BIND, NULL) < 0) {
429 log_error("Bind mount for /proc/kmsg failed: %m");
/* Opened read-write and non-blocking (O_NDELAY). */
434 fd = open(from, O_RDWR|O_NDELAY|O_CLOEXEC);
436 log_error("Failed to open fifo: %m");
444 mh.msg_control = &control;
445 mh.msg_controllen = sizeof(control);
/* Fill in one SCM_RIGHTS control message whose payload is the FIFO descriptor. */
447 cmsg = CMSG_FIRSTHDR(&mh);
448 cmsg->cmsg_level = SOL_SOCKET;
449 cmsg->cmsg_type = SCM_RIGHTS;
450 cmsg->cmsg_len = CMSG_LEN(sizeof(int));
451 memcpy(CMSG_DATA(cmsg), &fd, sizeof(int));
/* Reduce the control length to the single message that was filled in. */
453 mh.msg_controllen = cmsg->cmsg_len;
455 /* Store away the fd in the socket, so that it stays open as
456 * long as we run the child */
457 k = sendmsg(kmsg_socket, &mh, MSG_DONTWAIT|MSG_NOSIGNAL);
/* The local copy of the descriptor is closed right after the sendmsg() call. */
458 close_nointr_nofail(fd);
461 log_error("Failed to send FIFO fd: %m");
466 /* And now make the FIFO unavailable as /dev/kmsg... */
/* Shrinks the capability bounding set: iterates over every capability number
 * from 0 to cap_last_cap() inclusive and removes it with
 * prctl(PR_CAPBSET_DROP).
 * NOTE(review): most of the retain list, the body of the search loop and the
 * statement guarded by the membership test are not visible here; presumably
 * capabilities found in retain are skipped. Confirm against the full source. */
477 static int drop_capabilities(void) {
478 static const unsigned long retain[] = {
488 CAP_NET_BIND_SERVICE,
504 for (l = 0; l <= cap_last_cap(); l++) {
/* Search retain for l; i ends up below ELEMENTSOF(retain) when the search
 * stopped early. */
507 for (i = 0; i < ELEMENTSOF(retain); i++)
511 if (i < ELEMENTSOF(retain))
514 if (prctl(PR_CAPBSET_DROP, l) < 0) {
515 log_error("PR_CAPBSET_DROP failed: %m");
/* Heuristic check whether path is the root of an OS tree, using
 * <path>/bin/sh as the marker. Returns 1 when the probe result r is
 * non-negative and 0 when it is negative.
 * NOTE(review): the call that produces r and the value returned when
 * asprintf() fails are not visible in this listing. */
523 static int is_os_tree(const char *path) {
526 /* We use /bin/sh as flag file if something is an OS */
528 if (asprintf(&p, "%s/bin/sh", path) < 0)
534 return r < 0 ? 0 : 1;
/* Relays data between the caller's terminal and the pty master: bytes read
 * from stdin are written to master, bytes read from master are written to
 * stdout. Signals in mask are received through a signalfd, and SIGWINCH is
 * forwarded as a window-size update on master.
 *
 * stdin, stdout and master are registered edge-triggered (EPOLLET), so
 * readiness is latched in the *_readable / *_writable flags. A flag is
 * cleared again only when the matching read()/write() fails with EAGAIN,
 * EPIPE, ECONNRESET or EIO.
 *
 * NOTE(review): loop exits, error-path jumps and the return statement are
 * not visible in this listing. */
537 static int process_pty(int master, sigset_t *mask) {
539 char in_buffer[LINE_MAX], out_buffer[LINE_MAX];
540 size_t in_buffer_full = 0, out_buffer_full = 0;
541 struct epoll_event stdin_ev, stdout_ev, master_ev, signal_ev;
542 bool stdin_readable = false, stdout_writable = false, master_readable = false, master_writable = false;
543 int ep = -1, signal_fd = -1, r;
/* All three descriptors are switched to non-blocking mode. */
545 fd_nonblock(STDIN_FILENO, 1);
546 fd_nonblock(STDOUT_FILENO, 1);
547 fd_nonblock(master, 1);
549 if ((signal_fd = signalfd(-1, mask, SFD_NONBLOCK|SFD_CLOEXEC)) < 0) {
550 log_error("signalfd(): %m");
555 if ((ep = epoll_create1(EPOLL_CLOEXEC)) < 0) {
556 log_error("Failed to create epoll: %m");
562 stdin_ev.events = EPOLLIN|EPOLLET;
563 stdin_ev.data.fd = STDIN_FILENO;
566 stdout_ev.events = EPOLLOUT|EPOLLET;
567 stdout_ev.data.fd = STDOUT_FILENO;
570 master_ev.events = EPOLLIN|EPOLLOUT|EPOLLET;
571 master_ev.data.fd = master;
/* The signalfd is the only descriptor registered level-triggered. */
574 signal_ev.events = EPOLLIN;
575 signal_ev.data.fd = signal_fd;
577 if (epoll_ctl(ep, EPOLL_CTL_ADD, STDIN_FILENO, &stdin_ev) < 0 ||
578 epoll_ctl(ep, EPOLL_CTL_ADD, STDOUT_FILENO, &stdout_ev) < 0 ||
579 epoll_ctl(ep, EPOLL_CTL_ADD, master, &master_ev) < 0 ||
580 epoll_ctl(ep, EPOLL_CTL_ADD, signal_fd, &signal_ev) < 0) {
581 log_error("Failed to register fds in epoll: %m");
587 struct epoll_event ev[16];
591 if ((nfds = epoll_wait(ep, ev, ELEMENTSOF(ev), -1)) < 0) {
593 if (errno == EINTR || errno == EAGAIN)
596 log_error("epoll_wait(): %m");
/* Translate the reported events into the latched readiness flags. EPOLLHUP
 * counts as both readable and writable, so the following I/O call runs and
 * reports the condition through its own error. */
603 for (i = 0; i < nfds; i++) {
604 if (ev[i].data.fd == STDIN_FILENO) {
606 if (ev[i].events & (EPOLLIN|EPOLLHUP))
607 stdin_readable = true;
609 } else if (ev[i].data.fd == STDOUT_FILENO) {
611 if (ev[i].events & (EPOLLOUT|EPOLLHUP))
612 stdout_writable = true;
614 } else if (ev[i].data.fd == master) {
616 if (ev[i].events & (EPOLLIN|EPOLLHUP))
617 master_readable = true;
619 if (ev[i].events & (EPOLLOUT|EPOLLHUP))
620 master_writable = true;
622 } else if (ev[i].data.fd == signal_fd) {
623 struct signalfd_siginfo sfsi;
626 if ((n = read(signal_fd, &sfsi, sizeof(sfsi))) != sizeof(sfsi)) {
629 log_error("Failed to read from signalfd: invalid block size");
634 if (errno != EINTR && errno != EAGAIN) {
635 log_error("Failed to read from signalfd: %m");
641 if (sfsi.ssi_signo == SIGWINCH) {
644 /* The window size changed, let's forward that. */
645 if (ioctl(STDIN_FILENO, TIOCGWINSZ, &ws) >= 0)
646 ioctl(master, TIOCSWINSZ, &ws);
/* Keep shuffling data while any direction can make progress: an empty buffer
 * with a readable source, or a non-empty buffer with a writable sink. */
655 while ((stdin_readable && in_buffer_full <= 0) ||
656 (master_writable && in_buffer_full > 0) ||
657 (master_readable && out_buffer_full <= 0) ||
658 (stdout_writable && out_buffer_full > 0)) {
/* stdin -> in_buffer.
 * NOTE(review): a read() result of 0 (end of file) adds nothing and appears
 * to leave stdin_readable set; confirm against the elided lines that this
 * cannot spin. */
660 if (stdin_readable && in_buffer_full < LINE_MAX) {
662 if ((k = read(STDIN_FILENO, in_buffer + in_buffer_full, LINE_MAX - in_buffer_full)) < 0) {
664 if (errno == EAGAIN || errno == EPIPE || errno == ECONNRESET || errno == EIO)
665 stdin_readable = false;
667 log_error("read(): %m");
672 in_buffer_full += (size_t) k;
/* in_buffer -> master; after a partial write the unsent tail is moved to the
 * front of the buffer. */
675 if (master_writable && in_buffer_full > 0) {
677 if ((k = write(master, in_buffer, in_buffer_full)) < 0) {
679 if (errno == EAGAIN || errno == EPIPE || errno == ECONNRESET || errno == EIO)
680 master_writable = false;
682 log_error("write(): %m");
688 assert(in_buffer_full >= (size_t) k);
689 memmove(in_buffer, in_buffer + k, in_buffer_full - k);
/* master -> out_buffer. */
694 if (master_readable && out_buffer_full < LINE_MAX) {
696 if ((k = read(master, out_buffer + out_buffer_full, LINE_MAX - out_buffer_full)) < 0) {
698 if (errno == EAGAIN || errno == EPIPE || errno == ECONNRESET || errno == EIO)
699 master_readable = false;
701 log_error("read(): %m");
706 out_buffer_full += (size_t) k;
/* out_buffer -> stdout, with the same tail handling as above. */
709 if (stdout_writable && out_buffer_full > 0) {
711 if ((k = write(STDOUT_FILENO, out_buffer, out_buffer_full)) < 0) {
713 if (errno == EAGAIN || errno == EPIPE || errno == ECONNRESET || errno == EIO)
714 stdout_writable = false;
716 log_error("write(): %m");
722 assert(out_buffer_full >= (size_t) k);
723 memmove(out_buffer, out_buffer + k, out_buffer_full - k);
724 out_buffer_full -= k;
/* Cleanup: release the epoll instance and the signalfd. */
732 close_nointr_nofail(ep);
735 close_nointr_nofail(signal_fd);
/* Entry point. The visible steps, in order:
 *
 *   - parse the command line and determine arg_directory;
 *   - refuse to run unless the effective uid is 0, sd_booted() is positive,
 *     the directory is not "/" and is_os_tree() accepts it;
 *   - create and join the cgroup "<current cgroup>/nspawn-<pid>", also in
 *     every controller listed in arg_controllers (failures there only warn);
 *   - open a pty master, copy the window size to it and switch the
 *     terminal on stdin to raw mode without echo;
 *   - create the kmsg socket pair and block SIGCHLD, SIGWINCH, SIGTERM
 *     and SIGINT;
 *   - clone a child into new IPC, mount, PID and UTS namespaces, plus a new
 *     network namespace when arg_private_network is set.
 *
 * The child prepares mounts, device nodes, console, kmsg and timezone below
 * arg_directory, chroots into it, drops capabilities, runs the user-switching
 * calls and executes either argv[optind] or /bin/bash. The parent relays the
 * pty via process_pty(), restores the terminal attributes, waits for the
 * child and kills the container cgroup.
 *
 * NOTE(review): error-path jumps, labels and several branch conditions
 * (for example which exec call is used, and whether the user-switching calls
 * are conditional) are not visible in this listing. */
740 int main(int argc, char *argv[]) {
742 int r = EXIT_FAILURE, k;
743 char *oldcg = NULL, *newcg = NULL;
744 char **controller = NULL;
746 const char *console = NULL;
747 struct termios saved_attr, raw_attr;
749 bool saved_attr_valid = false;
751 int kmsg_socket_pair[2] = { -1, -1 };
753 log_parse_environment();
756 if ((r = parse_argv(argc, argv)) <= 0)
762 p = path_make_absolute_cwd(arg_directory);
766 arg_directory = get_current_dir_name();
768 if (!arg_directory) {
769 log_error("Failed to determine path");
773 path_kill_slashes(arg_directory);
775 if (geteuid() != 0) {
776 log_error("Need to be root.");
780 if (sd_booted() <= 0) {
781 log_error("Not running on a systemd system.");
785 if (path_equal(arg_directory, "/")) {
786 log_error("Spawning container on root directory not supported.");
790 if (is_os_tree(arg_directory) <= 0) {
791 log_error("Directory %s doesn't look like an OS root directory. Refusing.", arg_directory);
/* Remember the current cgroup so it can be re-joined during cleanup. */
795 if ((k = cg_get_by_pid(SYSTEMD_CGROUP_CONTROLLER, 0, &oldcg)) < 0) {
796 log_error("Failed to determine current cgroup: %s", strerror(-k));
800 if (asprintf(&newcg, "%s/nspawn-%lu", oldcg, (unsigned long) getpid()) < 0) {
801 log_error("Failed to allocate cgroup path.");
805 k = cg_create_and_attach(SYSTEMD_CGROUP_CONTROLLER, newcg, 0);
807 log_error("Failed to create cgroup: %s", strerror(-k));
811 STRV_FOREACH(controller,arg_controllers) {
812 k = cg_create_and_attach(*controller, newcg, 0);
814 log_warning("Failed to create cgroup in controller %s: %s", *controller, strerror(-k));
817 if ((master = posix_openpt(O_RDWR|O_NOCTTY|O_CLOEXEC|O_NDELAY)) < 0) {
818 log_error("Failed to acquire pseudo tty: %m");
822 if (!(console = ptsname(master))) {
823 log_error("Failed to determine tty name: %m");
827 log_info("Spawning namespace container on %s (console is %s).", arg_directory, console);
/* Give the pty the same window size as the terminal on stdin, if it has one. */
829 if (ioctl(STDIN_FILENO, TIOCGWINSZ, &ws) >= 0)
830 ioctl(master, TIOCSWINSZ, &ws);
832 if (unlockpt(master) < 0) {
833 log_error("Failed to unlock tty: %m");
837 if (tcgetattr(STDIN_FILENO, &saved_attr) < 0) {
838 log_error("Failed to get terminal attributes: %m");
/* From here on the original attributes must be restored before exiting. */
842 saved_attr_valid = true;
844 raw_attr = saved_attr;
845 cfmakeraw(&raw_attr);
846 raw_attr.c_lflag &= ~ECHO;
848 if (tcsetattr(STDIN_FILENO, TCSANOW, &raw_attr) < 0) {
849 log_error("Failed to set terminal attributes: %m");
853 if (socketpair(AF_UNIX, SOCK_DGRAM|SOCK_NONBLOCK|SOCK_CLOEXEC, 0, kmsg_socket_pair) < 0) {
/* Include the errno text, as the neighbouring failure messages do. */
854 log_error("Failed to create kmsg socket pair: %m");
/* Block the signals that process_pty() later receives through its signalfd. */
858 assert_se(sigemptyset(&mask) == 0);
859 sigset_add_many(&mask, SIGCHLD, SIGWINCH, SIGTERM, SIGINT, -1);
860 assert_se(sigprocmask(SIG_BLOCK, &mask, NULL) == 0);
862 pid = syscall(__NR_clone, SIGCHLD|CLONE_NEWIPC|CLONE_NEWNS|CLONE_NEWPID|CLONE_NEWUTS|(arg_private_network ? CLONE_NEWNET : 0), NULL);
865 log_error("clone() failed, do you have namespace support enabled in your kernel? (You need UTS, IPC, PID and NET namespacing built in): %m");
867 log_error("clone() failed: %m");
/* Child side. */
876 const char *home = NULL;
877 uid_t uid = (uid_t) -1;
878 gid_t gid = (gid_t) -1;
879 const char *envp[] = {
880 "PATH=/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin",
881 "container=systemd-nspawn", /* LXC sets container=lxc, so follow the scheme here */
/* Pass the caller's TERM= entry through to the container environment. */
889 envp[2] = strv_find_prefix(environ, "TERM=");
891 close_nointr_nofail(master);
893 close_nointr(STDIN_FILENO);
894 close_nointr(STDOUT_FILENO);
895 close_nointr(STDERR_FILENO);
/* Close every descriptor except the child's end of the kmsg socket pair. */
897 close_all_fds(&kmsg_socket_pair[1], 1);
899 reset_all_signal_handlers();
/* Undo the signal blocking inherited from the parent. */
901 assert_se(sigemptyset(&mask) == 0);
902 assert_se(sigprocmask(SIG_SETMASK, &mask, NULL) == 0);
/* Ask the kernel to send SIGKILL to the child when its parent dies. */
907 if (prctl(PR_SET_PDEATHSIG, SIGKILL) < 0)
910 /* Mark / as private, in case somebody marked it shared */
911 if (mount(NULL, "/", NULL, MS_PRIVATE|MS_REC, NULL) < 0)
914 if (mount_all(arg_directory) < 0)
917 if (copy_devnodes(arg_directory) < 0)
920 if (setup_dev_console(arg_directory, console) < 0)
923 if (setup_kmsg(arg_directory, kmsg_socket_pair[1]) < 0)
926 close_nointr_nofail(kmsg_socket_pair[1]);
928 if (setup_timezone(arg_directory) < 0)
931 if (chdir(arg_directory) < 0) {
932 log_error("chdir(%s) failed: %m", arg_directory);
/* stdin, stdout and stderr were closed above, so the console must come back
 * as descriptor 0 and is then duplicated onto 1 and 2. The path is relative
 * to arg_directory, the current directory. */
936 if (open_terminal("dev/console", O_RDWR) != STDIN_FILENO ||
937 dup2(STDIN_FILENO, STDOUT_FILENO) != STDOUT_FILENO ||
938 dup2(STDIN_FILENO, STDERR_FILENO) != STDERR_FILENO)
941 if (mount(arg_directory, "/", "bind", MS_BIND, NULL) < 0) {
/* The call above passes MS_BIND, so the message names that flag. */
942 log_error("mount(MS_BIND) failed: %m");
946 if (chroot(".") < 0) {
947 log_error("chroot() failed: %m");
951 if (chdir("/") < 0) {
952 log_error("chdir() failed: %m");
960 if (drop_capabilities() < 0)
965 if (get_user_creds((const char**)&arg_user, &uid, &gid, &home) < 0) {
966 log_error("get_user_creds() failed: %m");
970 if (mkdir_parents(home, 0775) < 0) {
971 log_error("mkdir_parents() failed: %m");
975 if (safe_mkdir(home, 0775, uid, gid) < 0) {
976 log_error("safe_mkdir() failed: %m");
980 if (initgroups((const char*)arg_user, gid) < 0) {
981 log_error("initgroups() failed: %m");
/* Group ids are switched before user ids; the messages name the calls
 * actually made. */
985 if (setresgid(gid, gid, gid) < 0) {
986 log_error("setresgid() failed: %m");
990 if (setresuid(uid, uid, uid) < 0) {
991 log_error("setresuid() failed: %m");
/* Fill environment slots 3..5; "/root" and "root" are the fallbacks when no
 * home directory or user is known. */
996 if ((asprintf((char**)(envp + 3), "HOME=%s", home? home: "/root") < 0) ||
997 (asprintf((char**)(envp + 4), "USER=%s", arg_user? arg_user : "root") < 0) ||
998 (asprintf((char**)(envp + 5), "LOGNAME=%s", arg_user? arg_user : "root") < 0)) {
999 log_error("Out of memory");
/* The container's host name is derived from the last component of its root
 * directory; the result of sethostname() is ignored. */
1003 if ((hn = file_name_from_path(arg_directory)))
1004 sethostname(hn, strlen(hn));
1007 execvpe(argv[optind], argv + optind, (char**) envp);
1009 chdir(home ? home : "/root");
1010 execle("/bin/bash", "-bash", NULL, (char**) envp);
/* Only reached when the exec call returned, i.e. failed. */
1013 log_error("execv() failed: %m");
1016 _exit(EXIT_FAILURE);
/* Parent side: relay terminal traffic until process_pty() returns. */
1019 if (process_pty(master, &mask) < 0)
1022 if (saved_attr_valid) {
1023 tcsetattr(STDIN_FILENO, TCSANOW, &saved_attr);
1024 saved_attr_valid = false;
1027 r = wait_for_terminate_and_warn(argc > optind ? argv[optind] : "bash", pid);
/* Cleanup: restore the terminal if that has not happened yet, close
 * descriptors, move back to the original cgroup and kill whatever is left in
 * the container's cgroup. */
1033 if (saved_attr_valid)
1034 tcsetattr(STDIN_FILENO, TCSANOW, &saved_attr);
1037 close_nointr_nofail(master);
1039 close_pipe(kmsg_socket_pair);
1042 cg_attach(SYSTEMD_CGROUP_CONTROLLER, oldcg, 0);
1045 cg_kill_recursive_and_wait(SYSTEMD_CGROUP_CONTROLLER, newcg, true);
1047 free(arg_directory);
1048 strv_free(arg_controllers);