1 /*-*- Mode: C; c-basic-offset: 8; indent-tabs-mode: nil -*-*/
4 This file is part of systemd.
6 Copyright 2010 Lennart Poettering
8 systemd is free software; you can redistribute it and/or modify it
9 under the terms of the GNU Lesser General Public License as published by
10 the Free Software Foundation; either version 2.1 of the License, or
11 (at your option) any later version.
13 systemd is distributed in the hope that it will be useful, but
14 WITHOUT ANY WARRANTY; without even the implied warranty of
15 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
16 Lesser General Public License for more details.
18 You should have received a copy of the GNU Lesser General Public License
19 along with systemd; If not, see <http://www.gnu.org/licenses/>.
25 #include <sys/types.h>
26 #include <sys/syscall.h>
27 #include <sys/mount.h>
33 #include <sys/prctl.h>
34 #include <sys/capability.h>
36 #include <sys/epoll.h>
38 #include <sys/signalfd.h>
42 #include <sys/socket.h>
44 #include <systemd/sd-daemon.h>
51 #include "cgroup-util.h"
53 #include "loopback-setup.h"
/* Settings taken from the command line; parse_argv() assigns them.
 *   arg_directory       - container root directory (heap allocated)
 *   arg_user            - user to run the command as (heap allocated)
 *   arg_controllers     - string vector of additional cgroup controllers
 *   arg_private_network - set by --private-network */
55 static char *arg_directory = NULL;
56 static char *arg_user = NULL;
57 static char **arg_controllers = NULL;
58 static bool arg_private_network = false;
/* Prints the command line synopsis and the list of supported options
 * to standard output. The program name shown in the synopsis is taken
 * from program_invocation_short_name. */
60 static int help(void) {
62 printf("%s [OPTIONS...] [PATH] [ARGUMENTS...]\n\n"
63 "Spawn a minimal namespace container for debugging, testing and building.\n\n"
64 " -h --help Show this help\n"
65 " -D --directory=NAME Root directory for the container\n"
66 " -u --user=USER Run the command under specified user or uid\n"
67 " -C --controllers=LIST Put the container in specified comma-separated cgroup hierarchies\n"
68 " --private-network Disable network in container\n",
69 program_invocation_short_name);
/* Parses the command line with getopt_long() and stores the results in
 * the arg_* variables:
 *   - the root directory is canonicalized with canonicalize_file_name()
 *   - the user name is duplicated with strdup()
 *   - the controller list is split at commas into a string vector
 *     (replacing any earlier list) and passed to strv_uniq()
 *   - --private-network sets arg_private_network
 * The leading + in the option string is the getopt convention for
 * stopping at the first non-option argument.
 * Allocation failures and unknown option codes are logged. main() tests
 * the return value for <= 0; the exact return codes are not visible in
 * this excerpt. */
74 static int parse_argv(int argc, char *argv[]) {
77 ARG_PRIVATE_NETWORK = 0x100
80 static const struct option options[] = {
81 { "help", no_argument, NULL, 'h' },
82 { "directory", required_argument, NULL, 'D' },
83 { "user", required_argument, NULL, 'u' },
84 { "controllers", required_argument, NULL, 'C' },
85 { "private-network", no_argument, NULL, ARG_PRIVATE_NETWORK },
94 while ((c = getopt_long(argc, argv, "+hD:u:C:", options, NULL)) >= 0) {
104 arg_directory = canonicalize_file_name(optarg);
105 if (!arg_directory) {
106 log_error("Failed to canonicalize root directory.");
114 if (!(arg_user = strdup(optarg))) {
115 log_error("Failed to duplicate user name.");
122 strv_free(arg_controllers);
123 arg_controllers = strv_split(optarg, ",");
124 if (!arg_controllers) {
125 log_error("Failed to split controllers list.");
128 strv_uniq(arg_controllers);
132 case ARG_PRIVATE_NETWORK:
133 arg_private_network = true;
140 log_error("Unknown option code %c", c);
/* Sets up the basic file systems of the container below dest.
 *
 * Walks the static mount_table in order. For every entry the target
 * path is built as dest + "/" + where, checked with
 * path_is_mount_point(), created with mkdir_p() and then mounted.
 * Read-only bind mounts take two table entries: a plain MS_BIND entry
 * followed by a MS_BIND|MS_RDONLY|MS_REMOUNT entry for the same path.
 *
 * A failing mount() is only reported when the entry has its fatal
 * member set (presumably the last column of the table; the struct
 * definition is not visible in this excerpt). */
148 static int mount_all(const char *dest) {
150 typedef struct MountPoint {
159 static const MountPoint mount_table[] = {
160 { "proc", "/proc", "proc", NULL, MS_NOSUID|MS_NOEXEC|MS_NODEV, true },
161 { "/proc/sys", "/proc/sys", "bind", NULL, MS_BIND, true }, /* Bind mount first */
162 { "/proc/sys", "/proc/sys", "bind", NULL, MS_BIND|MS_RDONLY|MS_REMOUNT, true }, /* Then, make it r/o */
163 { "/sys", "/sys", "bind", NULL, MS_BIND, true }, /* Bind mount first */
164 { "/sys", "/sys", "bind", NULL, MS_BIND|MS_RDONLY|MS_REMOUNT, true }, /* Then, make it r/o */
165 { "tmpfs", "/dev", "tmpfs", "mode=755", MS_NOSUID|MS_STRICTATIME, true },
166 { "/dev/pts", "/dev/pts", "bind", NULL, MS_BIND, true },
167 { "tmpfs", "/run", "tmpfs", "mode=755", MS_NOSUID|MS_NODEV|MS_STRICTATIME, true },
169 { "/sys/fs/selinux", "/sys/fs/selinux", "bind", NULL, MS_BIND, false }, /* Bind mount first */
170 { "/sys/fs/selinux", "/sys/fs/selinux", "bind", NULL, MS_BIND|MS_RDONLY|MS_REMOUNT, false }, /* Then, make it r/o */
178 for (k = 0; k < ELEMENTSOF(mount_table); k++) {
181 if (asprintf(&where, "%s/%s", dest, mount_table[k].where) < 0) {
182 log_error("Out of memory");
190 t = path_is_mount_point(where, false);
192 log_error("Failed to detect whether %s is a mount point: %s", where, strerror(-t));
201 mkdir_p(where, 0755);
203 if (mount(mount_table[k].what,
206 mount_table[k].flags,
207 mount_table[k].options) < 0 &&
208 mount_table[k].fatal) {
210 log_error("mount(%s) failed: %m", where);
/* Makes the host time zone configuration visible inside the container.
 *
 * /etc/localtime and /etc/timezone of the host are bind mounted onto
 * the same paths below dest. When a bind mount succeeds it is
 * remounted read-only; the result of that remount is not checked.
 * A failing bind mount is not reported either, so the only error
 * logged here is running out of memory while building the paths. */
222 static int setup_timezone(const char *dest) {
227 /* Fix the timezone, if possible */
228 if (asprintf(&where, "%s/etc/localtime", dest) < 0) {
229 log_error("Out of memory");
233 if (mount("/etc/localtime", where, "bind", MS_BIND, NULL) >= 0)
234 mount("/etc/localtime", where, "bind", MS_BIND|MS_REMOUNT|MS_RDONLY, NULL);
238 if (asprintf(&where, "%s/etc/timezone", dest) < 0) {
239 log_error("Out of memory");
243 if (mount("/etc/timezone", where, "bind", MS_BIND, NULL) >= 0)
244 mount("/etc/timezone", where, "bind", MS_BIND|MS_REMOUNT|MS_RDONLY, NULL);
/* Recreates a fixed list of host device nodes inside the container.
 *
 * For every name in the devnodes list the host node /dev/NAME is
 * examined with stat(). If it is a character or block device, a node
 * with the same mode and device number is created as dest/dev/NAME
 * with mknod(). A stat() failure is logged unless errno is ENOENT,
 * and a node of any other file type is logged as an error.
 *
 * The mknod() failure message names the path that was passed to
 * mknod(), not the container root. */
251 static int copy_devnodes(const char *dest) {
253 static const char devnodes[] =
271 NULSTR_FOREACH(d, devnodes) {
273 char *from = NULL, *to = NULL;
275 asprintf(&from, "/dev/%s", d);
276 asprintf(&to, "%s/dev/%s", dest, d);
279 log_error("Failed to allocate devnode path");
292 if (stat(from, &st) < 0) {
294 if (errno != ENOENT) {
295 log_error("Failed to stat %s: %m", from);
300 } else if (!S_ISCHR(st.st_mode) && !S_ISBLK(st.st_mode)) {
302 log_error("%s is not a char or block device, cannot copy.", from);
306 } else if (mknod(to, st.st_mode, st.st_rdev) < 0) {
308 log_error("mknod(%s) failed: %m", to);
/* Provides /dev/console inside the container.
 *
 * console is the path of the TTY to use. It must be a character
 * device; its access mode and ownership are set to 0600, uid 0, gid 0
 * with chmod_and_chown(). A device node with the same file type and
 * device number (mode 0600) is then created as dest/dev/console and
 * the TTY is bind mounted on top of it. Each failing step is logged. */
322 static int setup_dev_console(const char *dest, const char *console) {
333 if (stat(console, &st) < 0) {
334 log_error("Failed to stat %s: %m", console);
338 } else if (!S_ISCHR(st.st_mode)) {
339 log_error("/dev/console is not a char device.");
344 r = chmod_and_chown(console, 0600, 0, 0);
346 log_error("Failed to correct access mode for TTY: %s", strerror(-r));
350 if (asprintf(&to, "%s/dev/console", dest) < 0) {
351 log_error("Out of memory");
356 /* We need to bind mount the right tty to /dev/console since
357 * ptys can only exist on pts file systems. To have something
358 * to bind mount things on we create a device node first, that
359 * has the right major/minor (note that the major minor
360 * doesn't actually matter here, since we mount it over
363 if (mknod(to, (st.st_mode & ~07777) | 0600, st.st_rdev) < 0) {
364 log_error("mknod() for /dev/console failed: %m");
369 if (mount(console, to, "bind", MS_BIND, NULL) < 0) {
370 log_error("Bind mount for /dev/console failed: %m");
/* Gives the container a private /proc/kmsg that is backed by a FIFO.
 *
 * Steps, in order:
 *   1. create a FIFO at dest/dev/kmsg (mode 0600, owner fixed up with
 *      chmod_and_chown())
 *   2. bind mount that FIFO onto dest/proc/kmsg
 *   3. open the FIFO read/write and non-blocking
 *   4. send the resulting fd as an SCM_RIGHTS control message over
 *      kmsg_socket, then close the local copy
 *
 * kmsg_socket must be a valid socket fd (asserted). Each failing step
 * is logged. */
382 static int setup_kmsg(const char *dest, int kmsg_socket) {
383 char *from = NULL, *to = NULL;
387 struct cmsghdr cmsghdr;
388 uint8_t buf[CMSG_SPACE(sizeof(int))];
391 struct cmsghdr *cmsg;
394 assert(kmsg_socket >= 0);
398 /* We create the kmsg FIFO as /dev/kmsg, but immediately
399 * delete it after bind mounting it to /proc/kmsg. While FIFOs
400 * on the reading side behave very similar to /proc/kmsg,
401 * their writing side behaves differently from /dev/kmsg in
402 * that writing blocks when nothing is reading. In order to
403 * avoid any problems with containers deadlocking due to this
404 * we simply make /dev/kmsg unavailable to the container. */
405 if (asprintf(&from, "%s/dev/kmsg", dest) < 0) {
406 log_error("Out of memory");
411 if (asprintf(&to, "%s/proc/kmsg", dest) < 0) {
412 log_error("Out of memory");
417 if (mkfifo(from, 0600) < 0) {
418 log_error("mkfifo() for /dev/kmsg failed: %m");
423 r = chmod_and_chown(from, 0600, 0, 0);
425 log_error("Failed to correct access mode for /dev/kmsg: %s", strerror(-r));
429 if (mount(from, to, "bind", MS_BIND, NULL) < 0) {
430 log_error("Bind mount for /proc/kmsg failed: %m");
435 fd = open(from, O_RDWR|O_NDELAY|O_CLOEXEC);
437 log_error("Failed to open fifo: %m");
445 mh.msg_control = &control;
446 mh.msg_controllen = sizeof(control);
448 cmsg = CMSG_FIRSTHDR(&mh);
449 cmsg->cmsg_level = SOL_SOCKET;
450 cmsg->cmsg_type = SCM_RIGHTS;
451 cmsg->cmsg_len = CMSG_LEN(sizeof(int));
452 memcpy(CMSG_DATA(cmsg), &fd, sizeof(int));
454 mh.msg_controllen = cmsg->cmsg_len;
456 /* Store away the fd in the socket, so that it stays open as
457 * long as we run the child */
458 k = sendmsg(kmsg_socket, &mh, MSG_DONTWAIT|MSG_NOSIGNAL);
459 close_nointr_nofail(fd);
462 log_error("Failed to send FIFO fd: %m");
467 /* And now make the FIFO unavailable as /dev/kmsg... */
/* Sets the host name from arg_directory.
 *
 * The name comes from file_name_from_path(arg_directory) (presumably
 * the last path component; the helper is defined elsewhere), is passed
 * through hostname_cleanup() and then handed to sethostname(). */
478 static int setup_hostname(void) {
482 hn = file_name_from_path(arg_directory);
488 hostname_cleanup(hn);
491 if (sethostname(hn, strlen(hn)) < 0)
/* Shrinks the capability bounding set of the calling process.
 *
 * Every capability number from 0 up to and including cap_last_cap()
 * is looked up in the static retain list. Capabilities that are not
 * found there are removed with prctl(PR_CAPBSET_DROP); a failing
 * prctl() is logged. Entries found in the list are presumably skipped
 * (the statement after the lookup is not visible in this excerpt). */
500 static int drop_capabilities(void) {
501 static const unsigned long retain[] = {
511 CAP_NET_BIND_SERVICE,
527 for (l = 0; l <= cap_last_cap(); l++) {
530 for (i = 0; i < ELEMENTSOF(retain); i++)
534 if (i < ELEMENTSOF(retain))
537 if (prctl(PR_CAPBSET_DROP, l) < 0) {
538 log_error("PR_CAPBSET_DROP failed: %m");
/* Heuristic check whether path is the root of an OS tree.
 *
 * The file path/bin/sh serves as the marker. Returns 1 when the check
 * on that file yields r >= 0 and 0 when it yields r < 0. The check
 * itself and the value returned on allocation failure are not visible
 * in this excerpt; main() refuses to continue on any result <= 0. */
546 static int is_os_tree(const char *path) {
549 /* We use /bin/sh as flag file if something is an OS */
551 if (asprintf(&p, "%s/bin/sh", path) < 0)
557 return r < 0 ? 0 : 1;
/* Relays terminal traffic between this process and the pty master.
 *
 * master is the pty master fd and mask the set of signals to receive
 * through a signalfd. Standard input, standard output and master are
 * switched to non-blocking mode and registered edge-triggered in an
 * epoll instance; the signalfd is registered level-triggered.
 *
 * Since edge-triggered events are reported only once, readiness is
 * remembered in the four boolean flags and cleared again when a
 * read() or write() fails with EAGAIN, EPIPE, ECONNRESET or EIO.
 * Data read from stdin is staged in in_buffer and written to master;
 * data read from master is staged in out_buffer and written to
 * stdout. Both buffers hold up to LINE_MAX bytes.
 *
 * On SIGWINCH the window size of stdin is copied to master.
 *
 * The epoll fd and the signalfd are closed before returning. main()
 * treats a negative return value as failure. */
560 static int process_pty(int master, sigset_t *mask) {
562 char in_buffer[LINE_MAX], out_buffer[LINE_MAX];
563 size_t in_buffer_full = 0, out_buffer_full = 0;
564 struct epoll_event stdin_ev, stdout_ev, master_ev, signal_ev;
565 bool stdin_readable = false, stdout_writable = false, master_readable = false, master_writable = false;
566 int ep = -1, signal_fd = -1, r;
568 fd_nonblock(STDIN_FILENO, 1);
569 fd_nonblock(STDOUT_FILENO, 1);
570 fd_nonblock(master, 1);
572 if ((signal_fd = signalfd(-1, mask, SFD_NONBLOCK|SFD_CLOEXEC)) < 0) {
573 log_error("signalfd(): %m");
578 if ((ep = epoll_create1(EPOLL_CLOEXEC)) < 0) {
579 log_error("Failed to create epoll: %m");
585 stdin_ev.events = EPOLLIN|EPOLLET;
586 stdin_ev.data.fd = STDIN_FILENO;
589 stdout_ev.events = EPOLLOUT|EPOLLET;
590 stdout_ev.data.fd = STDOUT_FILENO;
593 master_ev.events = EPOLLIN|EPOLLOUT|EPOLLET;
594 master_ev.data.fd = master;
597 signal_ev.events = EPOLLIN;
598 signal_ev.data.fd = signal_fd;
600 if (epoll_ctl(ep, EPOLL_CTL_ADD, STDIN_FILENO, &stdin_ev) < 0 ||
601 epoll_ctl(ep, EPOLL_CTL_ADD, STDOUT_FILENO, &stdout_ev) < 0 ||
602 epoll_ctl(ep, EPOLL_CTL_ADD, master, &master_ev) < 0 ||
603 epoll_ctl(ep, EPOLL_CTL_ADD, signal_fd, &signal_ev) < 0) {
604 log_error("Failed to register fds in epoll: %m");
610 struct epoll_event ev[16];
614 if ((nfds = epoll_wait(ep, ev, ELEMENTSOF(ev), -1)) < 0) {
616 if (errno == EINTR || errno == EAGAIN)
619 log_error("epoll_wait(): %m");
626 for (i = 0; i < nfds; i++) {
627 if (ev[i].data.fd == STDIN_FILENO) {
629 if (ev[i].events & (EPOLLIN|EPOLLHUP))
630 stdin_readable = true;
632 } else if (ev[i].data.fd == STDOUT_FILENO) {
634 if (ev[i].events & (EPOLLOUT|EPOLLHUP))
635 stdout_writable = true;
637 } else if (ev[i].data.fd == master) {
639 if (ev[i].events & (EPOLLIN|EPOLLHUP))
640 master_readable = true;
642 if (ev[i].events & (EPOLLOUT|EPOLLHUP))
643 master_writable = true;
645 } else if (ev[i].data.fd == signal_fd) {
646 struct signalfd_siginfo sfsi;
649 if ((n = read(signal_fd, &sfsi, sizeof(sfsi))) != sizeof(sfsi)) {
652 log_error("Failed to read from signalfd: invalid block size");
657 if (errno != EINTR && errno != EAGAIN) {
658 log_error("Failed to read from signalfd: %m");
664 if (sfsi.ssi_signo == SIGWINCH) {
667 /* The window size changed, let's forward that. */
668 if (ioctl(STDIN_FILENO, TIOCGWINSZ, &ws) >= 0)
669 ioctl(master, TIOCSWINSZ, &ws);
678 while ((stdin_readable && in_buffer_full <= 0) ||
679 (master_writable && in_buffer_full > 0) ||
680 (master_readable && out_buffer_full <= 0) ||
681 (stdout_writable && out_buffer_full > 0)) {
683 if (stdin_readable && in_buffer_full < LINE_MAX) {
685 if ((k = read(STDIN_FILENO, in_buffer + in_buffer_full, LINE_MAX - in_buffer_full)) < 0) {
687 if (errno == EAGAIN || errno == EPIPE || errno == ECONNRESET || errno == EIO)
688 stdin_readable = false;
690 log_error("read(): %m");
695 in_buffer_full += (size_t) k;
698 if (master_writable && in_buffer_full > 0) {
700 if ((k = write(master, in_buffer, in_buffer_full)) < 0) {
702 if (errno == EAGAIN || errno == EPIPE || errno == ECONNRESET || errno == EIO)
703 master_writable = false;
705 log_error("write(): %m");
711 assert(in_buffer_full >= (size_t) k);
712 memmove(in_buffer, in_buffer + k, in_buffer_full - k);
717 if (master_readable && out_buffer_full < LINE_MAX) {
719 if ((k = read(master, out_buffer + out_buffer_full, LINE_MAX - out_buffer_full)) < 0) {
721 if (errno == EAGAIN || errno == EPIPE || errno == ECONNRESET || errno == EIO)
722 master_readable = false;
724 log_error("read(): %m");
729 out_buffer_full += (size_t) k;
732 if (stdout_writable && out_buffer_full > 0) {
734 if ((k = write(STDOUT_FILENO, out_buffer, out_buffer_full)) < 0) {
736 if (errno == EAGAIN || errno == EPIPE || errno == ECONNRESET || errno == EIO)
737 stdout_writable = false;
739 log_error("write(): %m");
745 assert(out_buffer_full >= (size_t) k);
746 memmove(out_buffer, out_buffer + k, out_buffer_full - k);
747 out_buffer_full -= k;
755 close_nointr_nofail(ep);
758 close_nointr_nofail(signal_fd);
/* Entry point: spawns a command inside a namespace container.
 *
 * Preparation:
 *   - parse the command line and determine the container directory
 *   - require root, a systemd-booted host, a directory other than /
 *     and a directory that passes is_os_tree()
 *   - create a cgroup named after our PID below the current one, and
 *     one in every extra controller (failures there are only warnings)
 *   - allocate a pty, copy the window size of stdin to it and put our
 *     own terminal into raw mode
 *   - create the socket pair that keeps the kmsg FIFO fd alive
 *   - block SIGCHLD, SIGWINCH, SIGTERM and SIGINT
 *
 * The container process is then set up: mounts, device nodes, console,
 * kmsg, time zone, chroot, capability bounding set and optionally a
 * switch to another user, followed by exec of the requested command or
 * /bin/bash.
 *
 * Meanwhile process_pty() relays the terminal traffic. Afterwards the
 * terminal attributes are restored, the child is reaped, and the
 * cgroup and allocated memory are cleaned up.
 *
 * The diagnostics for mount(), setresgid() and setresuid() name the
 * call that was actually made, and the socketpair() diagnostic carries
 * the errno text. */
763 int main(int argc, char *argv[]) {
765 int r = EXIT_FAILURE, k;
766 char *oldcg = NULL, *newcg = NULL;
767 char **controller = NULL;
769 const char *console = NULL;
770 struct termios saved_attr, raw_attr;
772 bool saved_attr_valid = false;
774 int kmsg_socket_pair[2] = { -1, -1 };
776 log_parse_environment();
779 if ((r = parse_argv(argc, argv)) <= 0)
785 p = path_make_absolute_cwd(arg_directory);
789 arg_directory = get_current_dir_name();
791 if (!arg_directory) {
792 log_error("Failed to determine path");
796 path_kill_slashes(arg_directory);
798 if (geteuid() != 0) {
799 log_error("Need to be root.");
803 if (sd_booted() <= 0) {
804 log_error("Not running on a systemd system.");
808 if (path_equal(arg_directory, "/")) {
809 log_error("Spawning container on root directory not supported.");
813 if (is_os_tree(arg_directory) <= 0) {
814 log_error("Directory %s doesn't look like an OS root directory. Refusing.", arg_directory);
818 if ((k = cg_get_by_pid(SYSTEMD_CGROUP_CONTROLLER, 0, &oldcg)) < 0) {
819 log_error("Failed to determine current cgroup: %s", strerror(-k));
823 if (asprintf(&newcg, "%s/nspawn-%lu", oldcg, (unsigned long) getpid()) < 0) {
824 log_error("Failed to allocate cgroup path.");
828 k = cg_create_and_attach(SYSTEMD_CGROUP_CONTROLLER, newcg, 0);
830 log_error("Failed to create cgroup: %s", strerror(-k));
834 STRV_FOREACH(controller,arg_controllers) {
835 k = cg_create_and_attach(*controller, newcg, 0);
837 log_warning("Failed to create cgroup in controller %s: %s", *controller, strerror(-k));
840 if ((master = posix_openpt(O_RDWR|O_NOCTTY|O_CLOEXEC|O_NDELAY)) < 0) {
841 log_error("Failed to acquire pseudo tty: %m");
845 if (!(console = ptsname(master))) {
846 log_error("Failed to determine tty name: %m");
850 log_info("Spawning namespace container on %s (console is %s).", arg_directory, console);
852 if (ioctl(STDIN_FILENO, TIOCGWINSZ, &ws) >= 0)
853 ioctl(master, TIOCSWINSZ, &ws);
855 if (unlockpt(master) < 0) {
856 log_error("Failed to unlock tty: %m");
860 if (tcgetattr(STDIN_FILENO, &saved_attr) < 0) {
861 log_error("Failed to get terminal attributes: %m");
865 saved_attr_valid = true;
867 raw_attr = saved_attr;
868 cfmakeraw(&raw_attr);
869 raw_attr.c_lflag &= ~ECHO;
871 if (tcsetattr(STDIN_FILENO, TCSANOW, &raw_attr) < 0) {
872 log_error("Failed to set terminal attributes: %m");
876 if (socketpair(AF_UNIX, SOCK_DGRAM|SOCK_NONBLOCK|SOCK_CLOEXEC, 0, kmsg_socket_pair) < 0) {
877 log_error("Failed to create kmsg socket pair: %m");
881 assert_se(sigemptyset(&mask) == 0);
882 sigset_add_many(&mask, SIGCHLD, SIGWINCH, SIGTERM, SIGINT, -1);
883 assert_se(sigprocmask(SIG_BLOCK, &mask, NULL) == 0);
/* New IPC, mount, PID and UTS namespaces for the child, plus a new
 * network namespace when --private-network was given */
885 pid = syscall(__NR_clone, SIGCHLD|CLONE_NEWIPC|CLONE_NEWNS|CLONE_NEWPID|CLONE_NEWUTS|(arg_private_network ? CLONE_NEWNET : 0), NULL);
888 log_error("clone() failed, do you have namespace support enabled in your kernel? (You need UTS, IPC, PID and NET namespacing built in): %m");
890 log_error("clone() failed: %m");
898 const char *home = NULL;
899 uid_t uid = (uid_t) -1;
900 gid_t gid = (gid_t) -1;
901 const char *envp[] = {
902 "PATH=/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin",
903 "container=systemd-nspawn", /* LXC sets container=lxc, so follow the scheme here */
911 envp[2] = strv_find_prefix(environ, "TERM=");
913 close_nointr_nofail(master);
915 close_nointr(STDIN_FILENO);
916 close_nointr(STDOUT_FILENO);
917 close_nointr(STDERR_FILENO);
919 close_all_fds(&kmsg_socket_pair[1], 1);
921 reset_all_signal_handlers();
923 assert_se(sigemptyset(&mask) == 0);
924 assert_se(sigprocmask(SIG_SETMASK, &mask, NULL) == 0);
929 if (prctl(PR_SET_PDEATHSIG, SIGKILL) < 0)
932 /* Mark / as private, in case somebody marked it shared */
933 if (mount(NULL, "/", NULL, MS_PRIVATE|MS_REC, NULL) < 0)
936 if (mount_all(arg_directory) < 0)
939 if (copy_devnodes(arg_directory) < 0)
942 if (setup_dev_console(arg_directory, console) < 0)
945 if (setup_kmsg(arg_directory, kmsg_socket_pair[1]) < 0)
948 close_nointr_nofail(kmsg_socket_pair[1]);
950 if (setup_timezone(arg_directory) < 0)
953 if (chdir(arg_directory) < 0) {
954 log_error("chdir(%s) failed: %m", arg_directory);
/* All standard fds were closed above, so the console is expected to
 * be opened as fd 0 and is then duplicated to fds 1 and 2 */
958 if (open_terminal("dev/console", O_RDWR) != STDIN_FILENO ||
959 dup2(STDIN_FILENO, STDOUT_FILENO) != STDOUT_FILENO ||
960 dup2(STDIN_FILENO, STDERR_FILENO) != STDERR_FILENO)
963 if (mount(arg_directory, "/", "bind", MS_BIND, NULL) < 0) {
964 log_error("mount(MS_BIND) failed: %m");
968 if (chroot(".") < 0) {
969 log_error("chroot() failed: %m");
973 if (chdir("/") < 0) {
974 log_error("chdir() failed: %m");
982 if (drop_capabilities() < 0)
987 if (get_user_creds((const char**)&arg_user, &uid, &gid, &home) < 0) {
988 log_error("get_user_creds() failed: %m");
992 if (mkdir_parents(home, 0775) < 0) {
993 log_error("mkdir_parents() failed: %m");
997 if (safe_mkdir(home, 0775, uid, gid) < 0) {
998 log_error("safe_mkdir() failed: %m");
1002 if (initgroups((const char*)arg_user, gid) < 0) {
1003 log_error("initgroups() failed: %m");
1007 if (setresgid(gid, gid, gid) < 0) {
1008 log_error("setresgid() failed: %m");
1012 if (setresuid(uid, uid, uid) < 0) {
1013 log_error("setresuid() failed: %m");
1018 if ((asprintf((char**)(envp + 3), "HOME=%s", home? home: "/root") < 0) ||
1019 (asprintf((char**)(envp + 4), "USER=%s", arg_user? arg_user : "root") < 0) ||
1020 (asprintf((char**)(envp + 5), "LOGNAME=%s", arg_user? arg_user : "root") < 0)) {
1021 log_error("Out of memory");
1028 execvpe(argv[optind], argv + optind, (char**) envp);
1030 chdir(home ? home : "/root");
1031 execle("/bin/bash", "-bash", NULL, (char**) envp);
1034 log_error("execv() failed: %m");
1037 _exit(EXIT_FAILURE);
/* Relay data between our terminal and the pty master */
1040 if (process_pty(master, &mask) < 0)
1043 if (saved_attr_valid) {
1044 tcsetattr(STDIN_FILENO, TCSANOW, &saved_attr);
1045 saved_attr_valid = false;
1048 r = wait_for_terminate_and_warn(argc > optind ? argv[optind] : "bash", pid);
1054 if (saved_attr_valid)
1055 tcsetattr(STDIN_FILENO, TCSANOW, &saved_attr);
1058 close_nointr_nofail(master);
1060 close_pipe(kmsg_socket_pair);
1063 cg_attach(SYSTEMD_CGROUP_CONTROLLER, oldcg, 0);
1066 cg_kill_recursive_and_wait(SYSTEMD_CGROUP_CONTROLLER, newcg, true);
1068 free(arg_directory);
1069 strv_free(arg_controllers);