1 /*-*- Mode: C; c-basic-offset: 8; indent-tabs-mode: nil -*-*/
4 This file is part of systemd.
6 Copyright 2010 Lennart Poettering
8 systemd is free software; you can redistribute it and/or modify it
9 under the terms of the GNU Lesser General Public License as published by
10 the Free Software Foundation; either version 2.1 of the License, or
11 (at your option) any later version.
13 systemd is distributed in the hope that it will be useful, but
14 WITHOUT ANY WARRANTY; without even the implied warranty of
15 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
16 Lesser General Public License for more details.
18 You should have received a copy of the GNU Lesser General Public License
19 along with systemd; If not, see <http://www.gnu.org/licenses/>.
25 #include <sys/types.h>
26 #include <sys/syscall.h>
27 #include <sys/mount.h>
33 #include <sys/prctl.h>
34 #include <sys/capability.h>
36 #include <sys/epoll.h>
38 #include <sys/signalfd.h>
42 #include <systemd/sd-daemon.h>
49 #include "cgroup-util.h"
51 #include "loopback-setup.h"
/* Settings filled in from the command line by parse_argv(). */

/* Root directory of the container; a strdup()'d copy of an option argument.
 * main() also assigns the current working directory to it in one branch. */
53 static char *arg_directory = NULL;
/* User to run the command as; a strdup()'d copy of an option argument. */
54 static char *arg_user = NULL;
/* Controller names obtained by splitting an option argument on "," and
 * passing the result through strv_uniq(). */
55 static char **arg_controllers = NULL;
/* Set for --private-network; main() adds CLONE_NEWNET to the clone flags
 * when this is true. */
56 static bool arg_private_network = false;
/* Prints the usage text with printf(). The only format argument is
 * program_invocation_short_name, which fills the leading "%s". */
58 static int help(void) {
60 printf("%s [OPTIONS...] [PATH] [ARGUMENTS...]\n\n"
61 "Spawn a minimal namespace container for debugging, testing and building.\n\n"
62 " -h --help Show this help\n"
63 " -D --directory=NAME Root directory for the container\n"
64 " -u --user=USER Run the command under specified user or uid\n"
65 " -C --controllers=LIST Put the container in specified comma-separated cgroup hierarchies\n"
66 " --private-network Disable network in container\n",
67 program_invocation_short_name);
/* Parses the command line with getopt_long() and stores the results in the
 * arg_* globals.
 *
 * The case labels and return statements are not part of this excerpt;
 * main() branches on a result <= 0. */
72 static int parse_argv(int argc, char *argv[]) {
/* Code for the long-only option; 0x100 lies above every single-character
 * option code used in the option string. */
75 ARG_PRIVATE_NETWORK = 0x100
78 static const struct option options[] = {
79 { "help", no_argument, NULL, 'h' },
80 { "directory", required_argument, NULL, 'D' },
81 { "user", required_argument, NULL, 'u' },
82 { "controllers", required_argument, NULL, 'C' },
83 { "private-network", no_argument, NULL, ARG_PRIVATE_NETWORK },
/* With glibc, the leading '+' stops option parsing at the first non-option
 * argument, so options after it are left in argv untouched. */
92 while ((c = getopt_long(argc, argv, "+hD:u:C:", options, NULL)) >= 0) {
/* Keep a private copy of the option argument. */
102 if (!(arg_directory = strdup(optarg))) {
103 log_error("Failed to duplicate root directory.");
/* Keep a private copy of the option argument. */
111 if (!(arg_user = strdup(optarg))) {
112 log_error("Failed to duplicate user name.");
/* Release any list stored earlier before replacing it with the
 * comma-separated items of the option argument. */
119 strv_free(arg_controllers);
120 arg_controllers = strv_split(optarg, ",");
121 if (!arg_controllers) {
122 log_error("Failed to split controllers list.");
/* presumably removes duplicate entries - confirm against strv_uniq() */
125 strv_uniq(arg_controllers);
129 case ARG_PRIVATE_NETWORK:
130 arg_private_network = true;
/* Reports an option code that none of the preceding cases handled
 * (the label of this branch is not part of this excerpt). */
137 log_error("Unknown option code %c", c);
/* Mounts every entry of mount_table below dest, then tries to bind-mount the
 * host's /etc/localtime and /etc/timezone read-only into the same tree.
 *
 * dest: path that is prepended to each entry's mount target.
 *
 * The return statements are not part of this excerpt; main() treats a
 * negative result as failure. */
145 static int mount_all(const char *dest) {
/* The member list of MountPoint is not part of this excerpt; the code below
 * uses the members what, where, flags, options and fatal. */
147 typedef struct MountPoint {
/* Entries are processed in array order. Read-only bind mounts take two
 * entries: the plain bind mount, then a remount of it with MS_RDONLY. */
156 static const MountPoint mount_table[] = {
157 { "proc", "/proc", "proc", NULL, MS_NOSUID|MS_NOEXEC|MS_NODEV, true },
158 { "/proc/sys", "/proc/sys", "bind", NULL, MS_BIND, true }, /* Bind mount first */
159 { "/proc/sys", "/proc/sys", "bind", NULL, MS_BIND|MS_RDONLY|MS_REMOUNT, true }, /* Then, make it r/o */
160 { "/sys", "/sys", "bind", NULL, MS_BIND, true }, /* Bind mount first */
161 { "/sys", "/sys", "bind", NULL, MS_BIND|MS_RDONLY|MS_REMOUNT, true }, /* Then, make it r/o */
162 { "tmpfs", "/dev", "tmpfs", "mode=755", MS_NOSUID|MS_STRICTATIME, true },
163 { "/dev/pts", "/dev/pts", "bind", NULL, MS_BIND, true },
164 { "tmpfs", "/run", "tmpfs", "mode=755", MS_NOSUID|MS_NODEV|MS_STRICTATIME, true },
166 { "/sys/fs/selinux", "/sys/fs/selinux", "bind", NULL, MS_BIND, false }, /* Bind mount first */
167 { "/sys/fs/selinux", "/sys/fs/selinux", "bind", NULL, MS_BIND|MS_RDONLY|MS_REMOUNT, false }, /* Then, make it r/o */
175 for (k = 0; k < ELEMENTSOF(mount_table); k++) {
/* Target path inside the container tree. */
178 if (asprintf(&where, "%s/%s", dest, mount_table[k].where) < 0) {
179 log_error("Out of memory");
/* A negative result is an error code; how a non-negative result is used is
 * not part of this excerpt. */
187 if ((t = path_is_mount_point(where, false)) < 0) {
188 log_error("Failed to detect whether %s is a mount point: %s", where, strerror(-t));
/* The result of mkdir_p() is not checked here. */
197 mkdir_p(where, 0755);
/* A failing mount() is only reported when the entry has fatal set. */
199 if (mount(mount_table[k].what,
202 mount_table[k].flags,
203 mount_table[k].options) < 0 &&
204 mount_table[k].fatal) {
206 log_error("mount(%s) failed: %m", where);
215 /* Fix the timezone, if possible */
/* Best effort: the read-only remount is only attempted when the bind mount
 * succeeded, and its result is ignored. */
216 if (asprintf(&where, "%s/etc/localtime", dest) >= 0) {
218 if (mount("/etc/localtime", where, "bind", MS_BIND, NULL) >= 0)
219 mount("/etc/localtime", where, "bind", MS_BIND|MS_REMOUNT|MS_RDONLY, NULL);
/* Same best-effort treatment for /etc/timezone. */
224 if (asprintf(&where, "%s/etc/timezone", dest) >= 0) {
226 if (mount("/etc/timezone", where, "bind", MS_BIND, NULL) >= 0)
227 mount("/etc/timezone", where, "bind", MS_BIND|MS_REMOUNT|MS_RDONLY, NULL);
/* Creates device nodes below <dest>/dev that mirror host nodes from /dev,
 * and sets up <dest>/dev/console as a bind mount of the given tty.
 *
 * dest:    root of the container tree.
 * console: path of the tty that becomes the container's /dev/console.
 *
 * The return statements are not part of this excerpt; main() treats a
 * negative result as failure. */
235 static int copy_devnodes(const char *dest, const char *console) {
/* List of device names, iterated with NULSTR_FOREACH() below (the names
 * themselves are not part of this excerpt). */
237 static const char devnodes[] =
252 char *from = NULL, *to = NULL;
259 NULSTR_FOREACH(d, devnodes) {
/* NOTE(review): the asprintf() results are not checked on these two lines;
 * the condition guarding the message below is not part of this excerpt. */
262 asprintf(&from, "/dev/%s", d);
263 asprintf(&to, "%s/dev/%s", dest, d);
266 log_error("Failed to allocate devnode path");
/* A host node that does not exist (ENOENT) is not reported; any other
 * stat() failure is. */
279 if (stat(from, &st) < 0) {
281 if (errno != ENOENT) {
282 log_error("Failed to stat %s: %m", from);
287 } else if (!S_ISCHR(st.st_mode) && !S_ISBLK(st.st_mode)) {
289 log_error("%s is not a char or block device, cannot copy.", from);
/* Re-create the node with the host node's mode and device number. */
293 } else if (mknod(to, st.st_mode, st.st_rdev) < 0) {
/* Report the node path that failed, not the container root. */
295 log_error("mknod(%s) failed: %m", to);
304 if (stat(console, &st) < 0) {
306 log_error("Failed to stat %s: %m", console);
312 } else if (!S_ISCHR(st.st_mode)) {
314 log_error("/dev/console is not a char device.");
321 if (asprintf(&to, "%s/dev/console", dest) < 0) {
323 log_error("Out of memory");
330 /* We need to bind mount the right tty to /dev/console since
331 * ptys can only exist on pts file systems. To have something
332 * to bind mount things on we create a device node first, that
333 * has the right major/minor (note that the major minor
334 * doesn't actually matter here, since we mount it over
 * anyway). */
/* A failing mknod() is only logged; the bind mount is attempted anyway. */
337 if (mknod(to, (st.st_mode & ~07777) | 0600, st.st_rdev) < 0)
338 log_error("mknod for /dev/console failed: %m");
340 if (mount(console, to, "bind", MS_BIND, NULL) < 0) {
341 log_error("bind mount for /dev/console failed: %m");
/* presumably sets mode 0600 and owner uid 0 / gid 0 on the tty - confirm
 * against chmod_and_chown() */
349 if ((k = chmod_and_chown(console, 0600, 0, 0)) < 0) {
350 log_error("Failed to correct access mode for TTY: %s", strerror(-k));
// Issues prctl(PR_CAPBSET_DROP) for capability numbers in the range
// 0..cap_last_cap(), consulting the retain[] table for each one.
//
// The return statements are not part of this excerpt; main() treats a
// negative result as failure.
362 static int drop_capabilities(void) {
// Only one entry of the table is part of this excerpt.
363 static const unsigned long retain[] = {
373 CAP_NET_BIND_SERVICE,
// The upper bound is inclusive.
389 for (l = 0; l <= cap_last_cap(); l++) {
// Scan of retain[]; the loop body is not part of this excerpt.
392 for (i = 0; i < ELEMENTSOF(retain); i++)
// True when the scan above stopped before the end of the table; presumably
// this means l is listed and is left alone (statement not shown).
396 if (i < ELEMENTSOF(retain))
399 if (prctl(PR_CAPBSET_DROP, l) < 0) {
400 log_error("PR_CAPBSET_DROP failed: %m");
// Heuristic used by main() to decide whether path looks like an OS root
// directory; main() refuses to continue when the result is <= 0.
408 static int is_os_tree(const char *path) {
411 /* We use /bin/sh as flag file if something is an OS */
// Build "<path>/bin/sh". The check that produces r, and the value returned
// when asprintf() fails, are not part of this excerpt.
413 if (asprintf(&p, "%s/bin/sh", path) < 0)
// A negative r yields 0, anything else yields 1.
419 return r < 0 ? 0 : 1;
/* Relays data between the caller's terminal and the pty master.
 *
 * Bytes read from STDIN_FILENO are staged in in_buffer and written to master;
 * bytes read from master are staged in out_buffer and written to
 * STDOUT_FILENO. Readiness is tracked through epoll, and the signals in *mask
 * are received through a signalfd.
 *
 * master: file descriptor of the pty master.
 * mask:   signal set handed to signalfd().
 *
 * The loop's exit conditions and the return statements are not part of this
 * excerpt; main() checks for a negative result. */
422 static int process_pty(int master, sigset_t *mask) {
424 char in_buffer[LINE_MAX], out_buffer[LINE_MAX];
/* Number of valid bytes currently staged in each buffer. */
425 size_t in_buffer_full = 0, out_buffer_full = 0;
426 struct epoll_event stdin_ev, stdout_ev, master_ev, signal_ev;
/* Readiness remembered from epoll events. A flag stays set until an I/O call
 * on that descriptor fails with one of the errno values checked below. */
427 bool stdin_readable = false, stdout_writable = false, master_readable = false, master_writable = false;
428 int ep = -1, signal_fd = -1, r;
/* presumably switches the three descriptors to non-blocking mode - confirm
 * against fd_nonblock() */
430 fd_nonblock(STDIN_FILENO, 1);
431 fd_nonblock(STDOUT_FILENO, 1);
432 fd_nonblock(master, 1);
434 if ((signal_fd = signalfd(-1, mask, SFD_NONBLOCK|SFD_CLOEXEC)) < 0) {
435 log_error("signalfd(): %m");
440 if ((ep = epoll_create1(EPOLL_CLOEXEC)) < 0) {
441 log_error("Failed to create epoll: %m");
/* The three data descriptors are registered edge-triggered (EPOLLET); the
 * signalfd is registered without EPOLLET. */
447 stdin_ev.events = EPOLLIN|EPOLLET;
448 stdin_ev.data.fd = STDIN_FILENO;
451 stdout_ev.events = EPOLLOUT|EPOLLET;
452 stdout_ev.data.fd = STDOUT_FILENO;
455 master_ev.events = EPOLLIN|EPOLLOUT|EPOLLET;
456 master_ev.data.fd = master;
459 signal_ev.events = EPOLLIN;
460 signal_ev.data.fd = signal_fd;
462 if (epoll_ctl(ep, EPOLL_CTL_ADD, STDIN_FILENO, &stdin_ev) < 0 ||
463 epoll_ctl(ep, EPOLL_CTL_ADD, STDOUT_FILENO, &stdout_ev) < 0 ||
464 epoll_ctl(ep, EPOLL_CTL_ADD, master, &master_ev) < 0 ||
465 epoll_ctl(ep, EPOLL_CTL_ADD, signal_fd, &signal_ev) < 0) {
/* NOTE(review): the message below misspells "register". */
466 log_error("Failed to regiser fds in epoll: %m");
472 struct epoll_event ev[16];
/* Block without timeout until at least one descriptor has an event. */
476 if ((nfds = epoll_wait(ep, ev, ELEMENTSOF(ev), -1)) < 0) {
/* EINTR and EAGAIN are separated from the error that is logged below. */
478 if (errno == EINTR || errno == EAGAIN)
481 log_error("epoll_wait(): %m");
/* Translate the reported events into the readiness flags. EPOLLHUP sets the
 * flag as well, so the I/O loop further down still attempts the call. */
488 for (i = 0; i < nfds; i++) {
489 if (ev[i].data.fd == STDIN_FILENO) {
491 if (ev[i].events & (EPOLLIN|EPOLLHUP))
492 stdin_readable = true;
494 } else if (ev[i].data.fd == STDOUT_FILENO) {
496 if (ev[i].events & (EPOLLOUT|EPOLLHUP))
497 stdout_writable = true;
499 } else if (ev[i].data.fd == master) {
501 if (ev[i].events & (EPOLLIN|EPOLLHUP))
502 master_readable = true;
504 if (ev[i].events & (EPOLLOUT|EPOLLHUP))
505 master_writable = true;
507 } else if (ev[i].data.fd == signal_fd) {
508 struct signalfd_siginfo sfsi;
/* Anything other than exactly one full siginfo record takes the branch
 * below. */
511 if ((n = read(signal_fd, &sfsi, sizeof(sfsi))) != sizeof(sfsi)) {
514 log_error("Failed to read from signalfd: invalid block size");
519 if (errno != EINTR && errno != EAGAIN) {
520 log_error("Failed to read from signalfd: %m");
526 if (sfsi.ssi_signo == SIGWINCH) {
529 /* The window size changed, let's forward that. */
530 if (ioctl(STDIN_FILENO, TIOCGWINSZ, &ws) >= 0)
531 ioctl(master, TIOCSWINSZ, &ws);
/* Keep transferring for as long as one of the four directions can make
 * progress: a readable source with an empty buffer, or a writable sink with
 * a non-empty buffer. */
540 while ((stdin_readable && in_buffer_full <= 0) ||
541 (master_writable && in_buffer_full > 0) ||
542 (master_readable && out_buffer_full <= 0) ||
543 (stdout_writable && out_buffer_full > 0)) {
/* stdin -> in_buffer: append behind the bytes already staged. */
545 if (stdin_readable && in_buffer_full < LINE_MAX) {
547 if ((k = read(STDIN_FILENO, in_buffer + in_buffer_full, LINE_MAX - in_buffer_full)) < 0) {
/* These errno values only clear the readiness flag. */
549 if (errno == EAGAIN || errno == EPIPE || errno == ECONNRESET || errno == EIO)
550 stdin_readable = false;
552 log_error("read(): %m");
557 in_buffer_full += (size_t) k;
/* in_buffer -> master. */
560 if (master_writable && in_buffer_full > 0) {
562 if ((k = write(master, in_buffer, in_buffer_full)) < 0) {
564 if (errno == EAGAIN || errno == EPIPE || errno == ECONNRESET || errno == EIO)
565 master_writable = false;
567 log_error("write(): %m");
/* After a partial write, move the unwritten remainder to the front of the
 * buffer. The matching update of in_buffer_full is not part of this
 * excerpt. */
573 assert(in_buffer_full >= (size_t) k);
574 memmove(in_buffer, in_buffer + k, in_buffer_full - k);
/* master -> out_buffer: append behind the bytes already staged. */
579 if (master_readable && out_buffer_full < LINE_MAX) {
581 if ((k = read(master, out_buffer + out_buffer_full, LINE_MAX - out_buffer_full)) < 0) {
583 if (errno == EAGAIN || errno == EPIPE || errno == ECONNRESET || errno == EIO)
584 master_readable = false;
586 log_error("read(): %m");
591 out_buffer_full += (size_t) k;
/* out_buffer -> stdout. */
594 if (stdout_writable && out_buffer_full > 0) {
596 if ((k = write(STDOUT_FILENO, out_buffer, out_buffer_full)) < 0) {
598 if (errno == EAGAIN || errno == EPIPE || errno == ECONNRESET || errno == EIO)
599 stdout_writable = false;
601 log_error("write(): %m");
/* Move the unwritten remainder to the front and shrink the fill level by
 * the number of bytes written. */
607 assert(out_buffer_full >= (size_t) k);
608 memmove(out_buffer, out_buffer + k, out_buffer_full - k);
609 out_buffer_full -= k;
/* Release the epoll instance and the signalfd. */
617 close_nointr_nofail(ep);
620 close_nointr_nofail(signal_fd);
/* Entry point: validates the environment, moves itself into a new cgroup,
 * allocates a pty, clones a child into new namespaces and relays the child's
 * console to the caller's terminal until process_pty() returns.
 *
 * Several branch conditions, labels and goto/return statements are not part
 * of this excerpt. */
625 int main(int argc, char *argv[]) {
627 int r = EXIT_FAILURE, k;
628 char *oldcg = NULL, *newcg = NULL;
629 char **controller = NULL;
631 const char *console = NULL;
632 struct termios saved_attr, raw_attr;
/* True once saved_attr holds attributes that still have to be restored. */
634 bool saved_attr_valid = false;
637 log_parse_environment();
640 if ((r = parse_argv(argc, argv)) <= 0)
/* Two alternatives for the container root: make the configured directory
 * absolute, or use the current working directory (the condition selecting
 * between them is not part of this excerpt). */
646 p = path_make_absolute_cwd(arg_directory);
650 arg_directory = get_current_dir_name();
652 if (!arg_directory) {
653 log_error("Failed to determine path");
657 path_kill_slashes(arg_directory);
/* Preconditions: effective uid 0, booted with systemd, root directory is
 * not "/" and passes the is_os_tree() check. */
659 if (geteuid() != 0) {
660 log_error("Need to be root.");
664 if (sd_booted() <= 0) {
665 log_error("Not running on a systemd system.");
669 if (path_equal(arg_directory, "/")) {
670 log_error("Spawning container on root directory not supported.");
674 if (is_os_tree(arg_directory) <= 0) {
675 log_error("Directory %s doesn't look like an OS root directory. Refusing.", arg_directory);
/* Create a child cgroup named "nspawn-<pid>" below the current one and move
 * into it. */
679 if ((k = cg_get_by_pid(SYSTEMD_CGROUP_CONTROLLER, 0, &oldcg)) < 0) {
680 log_error("Failed to determine current cgroup: %s", strerror(-k));
684 if (asprintf(&newcg, "%s/nspawn-%lu", oldcg, (unsigned long) getpid()) < 0) {
685 log_error("Failed to allocate cgroup path.");
689 k = cg_create_and_attach(SYSTEMD_CGROUP_CONTROLLER, newcg, 0);
691 log_error("Failed to create cgroup: %s", strerror(-k));
/* Repeat for each controller from the command line; a failure here is
 * reported with log_warning() rather than log_error(). */
695 STRV_FOREACH(controller,arg_controllers) {
696 k = cg_create_and_attach(*controller, newcg, 0);
698 log_warning("Failed to create cgroup in controller %s: %s", *controller, strerror(-k));
/* Allocate the pty whose slave side becomes the container's console. */
701 if ((master = posix_openpt(O_RDWR|O_NOCTTY|O_CLOEXEC|O_NDELAY)) < 0) {
702 log_error("Failed to acquire pseudo tty: %m");
706 if (!(console = ptsname(master))) {
707 log_error("Failed to determine tty name: %m");
711 log_info("Spawning namespace container on %s (console is %s).", arg_directory, console);
/* Copy the current window size of our terminal to the pty (best effort). */
713 if (ioctl(STDIN_FILENO, TIOCGWINSZ, &ws) >= 0)
714 ioctl(master, TIOCSWINSZ, &ws);
716 if (unlockpt(master) < 0) {
717 log_error("Failed to unlock tty: %m");
/* Save the terminal attributes, then switch our terminal to raw mode with
 * echo disabled. */
721 if (tcgetattr(STDIN_FILENO, &saved_attr) < 0) {
722 log_error("Failed to get terminal attributes: %m");
726 saved_attr_valid = true;
728 raw_attr = saved_attr;
729 cfmakeraw(&raw_attr);
730 raw_attr.c_lflag &= ~ECHO;
732 if (tcsetattr(STDIN_FILENO, TCSANOW, &raw_attr) < 0) {
733 log_error("Failed to set terminal attributes: %m");
/* Block these signals; the same mask is later handed to process_pty(),
 * which reads them through a signalfd. */
737 assert_se(sigemptyset(&mask) == 0);
738 sigset_add_many(&mask, SIGCHLD, SIGWINCH, SIGTERM, SIGINT, -1);
739 assert_se(sigprocmask(SIG_BLOCK, &mask, NULL) == 0);
/* Raw clone() into new IPC, mount, PID and UTS namespaces, plus a new
 * network namespace when --private-network was given. */
741 pid = syscall(__NR_clone, SIGCHLD|CLONE_NEWIPC|CLONE_NEWNS|CLONE_NEWPID|CLONE_NEWUTS|(arg_private_network ? CLONE_NEWNET : 0), NULL);
/* Two alternative messages for a failed clone(); the condition selecting
 * between them is not part of this excerpt. */
744 log_error("clone() failed, do you have namespace support enabled in your kernel? (You need UTS, IPC, PID and NET namespacing built in): %m");
746 log_error("clone() failed: %m");
/* The following section up to the "execv() failed" message prepares and
 * executes the payload (presumably the pid == 0 branch; the condition is
 * not part of this excerpt). */
755 const char *home = NULL;
756 uid_t uid = (uid_t) -1;
757 gid_t gid = (gid_t) -1;
/* Environment for the payload; slots 2..5 are filled in further down. */
758 const char *envp[] = {
759 "PATH=/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin",
760 "container=systemd-nspawn", /* LXC sets container=lxc, so follow the scheme here */
/* Pass our own TERM= entry through, if there is one. */
768 envp[2] = strv_find_prefix(environ, "TERM=");
/* Close the pty master, the inherited stdio descriptors and every other
 * open descriptor. */
770 close_nointr_nofail(master);
772 close_nointr(STDIN_FILENO);
773 close_nointr(STDOUT_FILENO);
774 close_nointr(STDERR_FILENO);
776 close_all_fds(NULL, 0);
/* Undo the signal setup made above: default handlers, empty signal mask. */
778 reset_all_signal_handlers();
780 assert_se(sigemptyset(&mask) == 0);
781 assert_se(sigprocmask(SIG_SETMASK, &mask, NULL) == 0);
/* Ask the kernel to send SIGKILL to this process when its parent dies. */
786 if (prctl(PR_SET_PDEATHSIG, SIGKILL) < 0)
789 /* Mark / as private, in case somebody marked it shared */
790 if (mount(NULL, "/", NULL, MS_PRIVATE|MS_REC, NULL) < 0)
/* Populate the container tree with API file systems and device nodes. */
793 if (mount_all(arg_directory) < 0)
796 if (copy_devnodes(arg_directory, console) < 0)
799 if (chdir(arg_directory) < 0) {
800 log_error("chdir(%s) failed: %m", arg_directory);
/* Open the container's console, addressed relative to the directory entered
 * above. It must end up as descriptor 0 and is then duplicated onto
 * descriptors 1 and 2. */
804 if (open_terminal("dev/console", O_RDWR) != STDIN_FILENO ||
805 dup2(STDIN_FILENO, STDOUT_FILENO) != STDOUT_FILENO ||
806 dup2(STDIN_FILENO, STDERR_FILENO) != STDERR_FILENO)
/* NOTE(review): both MS_BIND and MS_MOVE are passed although the message
 * names only MS_MOVE - confirm which operation is intended. */
809 if (mount(arg_directory, "/", "bind", MS_BIND|MS_MOVE, NULL) < 0) {
810 log_error("mount(MS_MOVE) failed: %m");
/* Switch the root to the current directory and move to its top. */
814 if (chroot(".") < 0) {
815 log_error("chroot() failed: %m");
819 if (chdir("/") < 0) {
820 log_error("chdir() failed: %m");
828 if (drop_capabilities() < 0)
/* User switching: resolve credentials, create the home directory, then set
 * supplementary groups, gid and uid in that order (the condition guarding
 * this section is not part of this excerpt). */
833 if (get_user_creds((const char**)&arg_user, &uid, &gid, &home) < 0) {
834 log_error("get_user_creds() failed: %m");
838 if (mkdir_parents(home, 0775) < 0) {
839 log_error("mkdir_parents() failed: %m");
843 if (safe_mkdir(home, 0775, uid, gid) < 0) {
844 log_error("safe_mkdir() failed: %m");
848 if (initgroups((const char*)arg_user, gid) < 0) {
849 log_error("initgroups() failed: %m");
/* The messages name the calls that actually failed. */
853 if (setresgid(gid, gid, gid) < 0) {
854 log_error("setresgid() failed: %m");
858 if (setresuid(uid, uid, uid) < 0) {
859 log_error("setresuid() failed: %m");
/* HOME, USER and LOGNAME default to /root and root when unset. */
864 if ((asprintf((char**)(envp + 3), "HOME=%s", home? home: "/root") < 0) ||
865 (asprintf((char**)(envp + 4), "USER=%s", arg_user? arg_user : "root") < 0) ||
866 (asprintf((char**)(envp + 5), "LOGNAME=%s", arg_user? arg_user : "root") < 0)) {
867 log_error("Out of memory");
/* Derive the host name from the container directory (presumably its last
 * path component - confirm against file_name_from_path()); the result of
 * sethostname() is ignored. */
871 if ((hn = file_name_from_path(arg_directory)))
872 sethostname(hn, strlen(hn));
/* Two alternatives: run the command given on the command line, or change to
 * the home directory and start /bin/bash with argv[0] "-bash" (the condition
 * selecting between them is not part of this excerpt). */
875 execvpe(argv[optind], argv + optind, (char**) envp);
877 chdir(home ? home : "/root");
878 execle("/bin/bash", "-bash", NULL, (char**) envp);
/* Reaching this point means the exec call returned. */
881 log_error("execv() failed: %m");
/* Relay between our terminal and the pty. */
887 if (process_pty(master, &mask) < 0)
/* Restore the terminal before waiting, and clear the flag so the cleanup
 * further down does not restore it a second time. */
890 if (saved_attr_valid) {
891 tcsetattr(STDIN_FILENO, TCSANOW, &saved_attr);
892 saved_attr_valid = false;
895 r = wait_for_terminate_and_warn(argc > optind ? argv[optind] : "bash", pid);
/* Cleanup: restore the terminal if still pending, close the pty master, move
 * back to the original cgroup, kill what is left in the new cgroup and free
 * the controller list. */
901 if (saved_attr_valid)
902 tcsetattr(STDIN_FILENO, TCSANOW, &saved_attr);
905 close_nointr_nofail(master);
908 cg_attach(SYSTEMD_CGROUP_CONTROLLER, oldcg, 0);
911 cg_kill_recursive_and_wait(SYSTEMD_CGROUP_CONTROLLER, newcg, true);
914 strv_free(arg_controllers);