1 /*-*- Mode: C; c-basic-offset: 8; indent-tabs-mode: nil -*-*/
4 This file is part of systemd.
6 Copyright 2010 Lennart Poettering
8 systemd is free software; you can redistribute it and/or modify it
9 under the terms of the GNU Lesser General Public License as published by
10 the Free Software Foundation; either version 2.1 of the License, or
11 (at your option) any later version.
13 systemd is distributed in the hope that it will be useful, but
14 WITHOUT ANY WARRANTY; without even the implied warranty of
15 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
16 Lesser General Public License for more details.
18 You should have received a copy of the GNU Lesser General Public License
19 along with systemd; If not, see <http://www.gnu.org/licenses/>.
25 #include <sys/types.h>
26 #include <sys/syscall.h>
27 #include <sys/mount.h>
33 #include <sys/prctl.h>
34 #include <sys/capability.h>
36 #include <sys/epoll.h>
38 #include <sys/signalfd.h>
42 #include <systemd/sd-daemon.h>
49 #include "cgroup-util.h"
51 #include "loopback-setup.h"
/* Settings collected from the command line. All four are assigned in
 * parse_argv() and consumed by main(). */
/* Container root directory; duplicated from an option argument in
 * parse_argv(), and main() can also set it from get_current_dir_name(). */
53 static char *arg_directory = NULL;
/* User to run as inside the container; main() passes it to
 * get_user_creds()/initgroups() and uses it for USER= and LOGNAME=. */
54 static char *arg_user = NULL;
/* String list produced by strv_split() on the option argument; main()
 * calls cg_create_and_attach() once per entry. */
55 static char **arg_controllers = NULL;
/* When true, main() adds CLONE_NEWNET to the clone flags. */
56 static bool arg_private_network = false;
/* Prints the usage summary with printf(); the leading %s of the format is
 * filled with program_invocation_short_name. The five options listed here
 * correspond to the five entries of the options[] table in parse_argv().
 * NOTE(review): the embedded original line numbers jump from 67 to 72, so
 * the end of this function (including whatever it returns) is not part of
 * this listing. */
58 static int help(void) {
60 printf("%s [OPTIONS...] [PATH] [ARGUMENTS...]\n\n"
61 "Spawn a minimal namespace container for debugging, testing and building.\n\n"
62 " -h --help Show this help\n"
63 " -D --directory=NAME Root directory for the container\n"
64 " -u --user=USER Run the command under specified user or uid\n"
65 " -C --controllers=LIST Put the container in specified comma-separated cgroup hierarchies\n"
66 " --private-network Disable network in container\n",
67 program_invocation_short_name);
/* Parses the command line into the arg_* globals.
 * NOTE(review): the embedded original line numbers skip, so several lines
 * (the enum opener, the switch and its case labels for the short options,
 * the return statements) are absent from this listing. The comments below
 * describe only what is visible. */
72 static int parse_argv(int argc, char *argv[]) {
/* Code for the long-only --private-network option; 0x100 is larger than
 * any single-character option code used in the option string below. */
75 ARG_PRIVATE_NETWORK = 0x100
/* Long option table handed to getopt_long(). */
78 static const struct option options[] = {
79 { "help", no_argument, NULL, 'h' },
80 { "directory", required_argument, NULL, 'D' },
81 { "user", required_argument, NULL, 'u' },
82 { "controllers", required_argument, NULL, 'C' },
83 { "private-network", no_argument, NULL, ARG_PRIVATE_NETWORK },
/* Per the glibc getopt documentation, a leading '+' in the short-option
 * string stops option parsing at the first non-option argument, which
 * fits the "[PATH] [ARGUMENTS...]" usage printed by help(). */
92 while ((c = getopt_long(argc, argv, "+hD:u:C:", options, NULL)) >= 0) {
/* Keep a private copy of the option argument as the container root. */
102 if (!(arg_directory = strdup(optarg))) {
103 log_error("Failed to duplicate root directory.");
/* Keep a private copy of the option argument as the user name. */
111 if (!(arg_user = strdup(optarg))) {
112 log_error("Failed to duplicate user name.");
/* Replace any previously parsed controller list: free the old one, split
 * the argument on ",", then pass the result through strv_uniq()
 * (presumably to drop duplicate entries; helper not visible here). */
119 strv_free(arg_controllers);
120 arg_controllers = strv_split(optarg, ",");
121 if (!arg_controllers) {
122 log_error("Failed to split controllers list.");
125 strv_uniq(arg_controllers);
129 case ARG_PRIVATE_NETWORK:
130 arg_private_network = true;
/* Reports an option code that none of the visible cases handled; the
 * label this statement belongs to is not part of this listing. */
137 log_error("Unknown option code %c", c);
/* Mounts the filesystems listed in mount_table[] below the container root
 * `dest`, then tries to bind-mount the host's time zone files into the
 * container and to cover <dest>/proc/kmsg with /dev/null.
 * NOTE(review): the embedded original line numbers skip, so the struct
 * field list, local declarations, parts of the loop body and the return
 * statement are absent from this listing. */
145 static int mount_all(const char *dest) {
147 typedef struct MountPoint {
/* Columns as used by the loop below: .what, .where, a filesystem type
 * string, .options, .flags, and a boolean that is presumably the .fatal
 * flag (the struct definition is not visible). Read-only bind mounts are
 * written as two rows: a plain MS_BIND, then MS_BIND|MS_RDONLY|MS_REMOUNT. */
156 static const MountPoint mount_table[] = {
157 { "proc", "/proc", "proc", NULL, MS_NOSUID|MS_NOEXEC|MS_NODEV, true },
158 { "/proc/sys", "/proc/sys", "bind", NULL, MS_BIND, true }, /* Bind mount first */
159 { "/proc/sys", "/proc/sys", "bind", NULL, MS_BIND|MS_RDONLY|MS_REMOUNT, true }, /* Then, make it r/o */
160 { "/sys", "/sys", "bind", NULL, MS_BIND, true }, /* Bind mount first */
161 { "/sys", "/sys", "bind", NULL, MS_BIND|MS_RDONLY|MS_REMOUNT, true }, /* Then, make it r/o */
162 { "tmpfs", "/dev", "tmpfs", "mode=755", MS_NOSUID|MS_STRICTATIME, true },
163 { "/dev/pts", "/dev/pts", "bind", NULL, MS_BIND, true },
164 { "tmpfs", "/run", "tmpfs", "mode=755", MS_NOSUID|MS_NODEV|MS_STRICTATIME, true },
166 { "/sys/fs/selinux", "/sys/fs/selinux", "bind", NULL, MS_BIND, false }, /* Bind mount first */
167 { "/sys/fs/selinux", "/sys/fs/selinux", "bind", NULL, MS_BIND|MS_RDONLY|MS_REMOUNT, false }, /* Then, make it r/o */
175 for (k = 0; k < ELEMENTSOF(mount_table); k++) {
/* Target path inside the container. Every .where above starts with "/",
 * so the formatted path contains a doubled slash after dest. */
178 if (asprintf(&where, "%s/%s", dest, mount_table[k].where) < 0) {
179 log_error("Out of memory");
/* t is a negative errno-style value on failure (it is negated for
 * strerror() below); how a positive result is used is not visible. */
187 t = path_is_mount_point(where, false);
189 log_error("Failed to detect whether %s is a mount point: %s", where, strerror(-t));
/* The return value of mkdir_p() is not checked on this line. */
198 mkdir_p(where, 0755);
/* A mount() failure enters the error branch only for entries whose
 * .fatal is true; for the other entries the failure is not reported by
 * this condition. */
200 if (mount(mount_table[k].what,
203 mount_table[k].flags,
204 mount_table[k].options) < 0 &&
205 mount_table[k].fatal) {
207 log_error("mount(%s) failed: %m", where);
216 /* Fix the timezone, if possible */
/* Best effort: nothing happens if asprintf() fails, and the read-only
 * remount is attempted only after the plain bind mount succeeded. The
 * result of the remount itself is ignored. */
217 if (asprintf(&where, "%s/etc/localtime", dest) >= 0) {
219 if (mount("/etc/localtime", where, "bind", MS_BIND, NULL) >= 0)
220 mount("/etc/localtime", where, "bind", MS_BIND|MS_REMOUNT|MS_RDONLY, NULL);
225 if (asprintf(&where, "%s/etc/timezone", dest) >= 0) {
227 if (mount("/etc/timezone", where, "bind", MS_BIND, NULL) >= 0)
228 mount("/etc/timezone", where, "bind", MS_BIND|MS_REMOUNT|MS_RDONLY, NULL);
/* Bind /dev/null over the container's /proc/kmsg; the mount() result is
 * ignored here. */
233 if (asprintf(&where, "%s/proc/kmsg", dest) >= 0) {
234 mount("/dev/null", where, "bind", MS_BIND, NULL);
/* Creates device nodes under <dest>/dev and sets up <dest>/dev/console.
 *
 * First part: for every name in the NUL-separated devnodes list, stat()
 * the host node /dev/<name> and mknod() a node with the same mode and
 * device number at <dest>/dev/<name>. A host node that does not exist
 * (ENOENT) is not reported; other stat() errors and non-device files are.
 *
 * Second part: stat() `console`, require it to be a character device,
 * create <dest>/dev/console with mode 0600 and the same file type and
 * device number, bind-mount `console` over it, and finally call
 * chmod_and_chown(console, 0600, 0, 0).
 *
 * NOTE(review): the embedded original line numbers skip, so the contents
 * of the devnodes list, most local declarations, the allocation check,
 * the cleanup code and the return statement are absent from this listing.
 * NOTE(review): the mknod() failure message for /dev/console is visible,
 * but whether that failure is treated as fatal is not. */
241 static int copy_devnodes(const char *dest, const char *console) {
243 static const char devnodes[] =
258 char *from = NULL, *to = NULL;
265 NULSTR_FOREACH(d, devnodes) {
/* NOTE(review): neither asprintf() return value is checked on these two
 * lines; the "Failed to allocate devnode path" message below implies a
 * check that is not visible here. glibc documents the output pointer as
 * undefined after a failed asprintf(), so confirm that the check does not
 * rely on the pointers still being NULL. */
268 asprintf(&from, "/dev/%s", d);
269 asprintf(&to, "%s/dev/%s", dest, d);
272 log_error("Failed to allocate devnode path");
285 if (stat(from, &st) < 0) {
287 if (errno != ENOENT) {
288 log_error("Failed to stat %s: %m", from);
293 } else if (!S_ISCHR(st.st_mode) && !S_ISBLK(st.st_mode)) {
295 log_error("%s is not a char or block device, cannot copy.", from);
/* NOTE(review): the node is created at `to`, but the message below prints
 * `dest` (the container root), so the failing path is not shown. */
299 } else if (mknod(to, st.st_mode, st.st_rdev) < 0) {
301 log_error("mknod(%s) failed: %m", dest);
/* Console handling starts here; see the function header for the steps. */
310 if (stat(console, &st) < 0) {
312 log_error("Failed to stat %s: %m", console);
318 } else if (!S_ISCHR(st.st_mode)) {
320 log_error("/dev/console is not a char device.");
327 if (asprintf(&to, "%s/dev/console", dest) < 0) {
329 log_error("Out of memory");
336 /* We need to bind mount the right tty to /dev/console since
337 * ptys can only exist on pts file systems. To have something
338 * to bind mount things on we create a device node first, that
339 * has the right major/minor (note that the major minor
340 * doesn't actually matter here, since we mount it over
343 if (mknod(to, (st.st_mode & ~07777) | 0600, st.st_rdev) < 0)
344 log_error("mknod for /dev/console failed: %m");
346 if (mount(console, to, "bind", MS_BIND, NULL) < 0) {
347 log_error("bind mount for /dev/console failed: %m");
355 if ((k = chmod_and_chown(console, 0600, 0, 0)) < 0) {
356 log_error("Failed to correct access mode for TTY: %s", strerror(-k));
/* Walks every capability number from 0 up to and including
 * cap_last_cap() and drops capabilities from the bounding set with
 * prctl(PR_CAPBSET_DROP). The retain[] table is searched for each number.
 * NOTE(review): the embedded original line numbers skip; only one entry
 * of retain[] (CAP_NET_BIND_SERVICE), no loop bodies beyond the lines
 * below and no return statement are part of this listing. */
368 static int drop_capabilities(void) {
369 static const unsigned long retain[] = {
379 CAP_NET_BIND_SERVICE,
395 for (l = 0; l <= cap_last_cap(); l++) {
/* Linear search of retain[]; the comparison inside the loop is not
 * visible. */
398 for (i = 0; i < ELEMENTSOF(retain); i++)
/* i stopping short of the table size means the search ended early,
 * presumably on a match, in which case the capability would be kept
 * (the statement guarded by this condition is not visible). */
402 if (i < ELEMENTSOF(retain))
405 if (prctl(PR_CAPBSET_DROP, l) < 0) {
406 log_error("PR_CAPBSET_DROP failed: %m");
/* Heuristic check whether `path` is the root of an OS tree, based on
 * <path>/bin/sh. Returns 1 when r is non-negative and 0 when r is
 * negative.
 * NOTE(review): the embedded original line numbers skip; the statement
 * that computes r from the formatted path, the handling of an asprintf()
 * failure and the release of p are not part of this listing. */
414 static int is_os_tree(const char *path) {
417 /* We use /bin/sh as flag file if something is an OS */
419 if (asprintf(&p, "%s/bin/sh", path) < 0)
425 return r < 0 ? 0 : 1;
/* Relays bytes between the caller's terminal and the pty `master`:
 *   stdin  -> in_buffer  -> master
 *   master -> out_buffer -> stdout
 * The three data fds are switched to non-blocking mode and registered
 * with epoll in edge-triggered mode (EPOLLET); signals from *mask arrive
 * through a signalfd registered without EPOLLET.
 *
 * Because of edge triggering, readiness is remembered in the four
 * *_readable / *_writable flags and cleared again only when a read() or
 * write() fails with EAGAIN, EPIPE, ECONNRESET or EIO.
 *
 * NOTE(review): the embedded original line numbers skip, so the outer
 * loop header, several else branches, the exit conditions, the cleanup
 * label and the return statements are absent from this listing. */
428 static int process_pty(int master, sigset_t *mask) {
430 char in_buffer[LINE_MAX], out_buffer[LINE_MAX];
/* Number of valid bytes currently held in each buffer. */
431 size_t in_buffer_full = 0, out_buffer_full = 0;
432 struct epoll_event stdin_ev, stdout_ev, master_ev, signal_ev;
433 bool stdin_readable = false, stdout_writable = false, master_readable = false, master_writable = false;
434 int ep = -1, signal_fd = -1, r;
/* The results of the three fd_nonblock() calls are not checked. */
436 fd_nonblock(STDIN_FILENO, 1);
437 fd_nonblock(STDOUT_FILENO, 1);
438 fd_nonblock(master, 1);
440 if ((signal_fd = signalfd(-1, mask, SFD_NONBLOCK|SFD_CLOEXEC)) < 0) {
441 log_error("signalfd(): %m");
446 if ((ep = epoll_create1(EPOLL_CLOEXEC)) < 0) {
447 log_error("Failed to create epoll: %m");
/* Only the .events and .data.fd members are assigned on the visible
 * lines; any zeroing of the structures is not part of this listing. */
453 stdin_ev.events = EPOLLIN|EPOLLET;
454 stdin_ev.data.fd = STDIN_FILENO;
457 stdout_ev.events = EPOLLOUT|EPOLLET;
458 stdout_ev.data.fd = STDOUT_FILENO;
461 master_ev.events = EPOLLIN|EPOLLOUT|EPOLLET;
462 master_ev.data.fd = master;
465 signal_ev.events = EPOLLIN;
466 signal_ev.data.fd = signal_fd;
/* NOTE(review): the log message below misspells "register" as "regiser". */
468 if (epoll_ctl(ep, EPOLL_CTL_ADD, STDIN_FILENO, &stdin_ev) < 0 ||
469 epoll_ctl(ep, EPOLL_CTL_ADD, STDOUT_FILENO, &stdout_ev) < 0 ||
470 epoll_ctl(ep, EPOLL_CTL_ADD, master, &master_ev) < 0 ||
471 epoll_ctl(ep, EPOLL_CTL_ADD, signal_fd, &signal_ev) < 0) {
472 log_error("Failed to regiser fds in epoll: %m");
478 struct epoll_event ev[16];
/* Wait without timeout; EINTR and EAGAIN are singled out from the other
 * epoll_wait() errors, which are logged. */
482 if ((nfds = epoll_wait(ep, ev, ELEMENTSOF(ev), -1)) < 0) {
484 if (errno == EINTR || errno == EAGAIN)
487 log_error("epoll_wait(): %m");
/* Translate the reported events into the readiness flags. EPOLLHUP is
 * treated like readiness in each direction. */
494 for (i = 0; i < nfds; i++) {
495 if (ev[i].data.fd == STDIN_FILENO) {
497 if (ev[i].events & (EPOLLIN|EPOLLHUP))
498 stdin_readable = true;
500 } else if (ev[i].data.fd == STDOUT_FILENO) {
502 if (ev[i].events & (EPOLLOUT|EPOLLHUP))
503 stdout_writable = true;
505 } else if (ev[i].data.fd == master) {
507 if (ev[i].events & (EPOLLIN|EPOLLHUP))
508 master_readable = true;
510 if (ev[i].events & (EPOLLOUT|EPOLLHUP))
511 master_writable = true;
513 } else if (ev[i].data.fd == signal_fd) {
514 struct signalfd_siginfo sfsi;
/* A read that does not return exactly one signalfd_siginfo is an error
 * path; the two messages below distinguish a wrong size from an errno
 * other than EINTR/EAGAIN (the test selecting between them is not
 * visible). */
517 if ((n = read(signal_fd, &sfsi, sizeof(sfsi))) != sizeof(sfsi)) {
520 log_error("Failed to read from signalfd: invalid block size");
525 if (errno != EINTR && errno != EAGAIN) {
526 log_error("Failed to read from signalfd: %m");
/* Handling of signals other than SIGWINCH is not part of this listing. */
532 if (sfsi.ssi_signo == SIGWINCH) {
535 /* The window size changed, let's forward that. */
536 if (ioctl(STDIN_FILENO, TIOCGWINSZ, &ws) >= 0)
537 ioctl(master, TIOCSWINSZ, &ws);
/* Keep transferring while some step can make progress: a readable source
 * with an empty buffer, or a writable sink with a non-empty buffer. */
546 while ((stdin_readable && in_buffer_full <= 0) ||
547 (master_writable && in_buffer_full > 0) ||
548 (master_readable && out_buffer_full <= 0) ||
549 (stdout_writable && out_buffer_full > 0)) {
/* stdin -> in_buffer.
 * NOTE(review): no separate branch for read() returning 0 (end of file)
 * is visible; with only the visible lines, a 0 result leaves
 * stdin_readable set and in_buffer_full at 0, which keeps the enclosing
 * while condition true. Confirm against the complete source. */
551 if (stdin_readable && in_buffer_full < LINE_MAX) {
553 if ((k = read(STDIN_FILENO, in_buffer + in_buffer_full, LINE_MAX - in_buffer_full)) < 0) {
555 if (errno == EAGAIN || errno == EPIPE || errno == ECONNRESET || errno == EIO)
556 stdin_readable = false;
558 log_error("read(): %m");
563 in_buffer_full += (size_t) k;
/* in_buffer -> master; after a partial write the unwritten tail is moved
 * to the front of the buffer.
 * NOTE(review): unlike the stdout path further down, no
 * "in_buffer_full -= k" is visible after the memmove(). The embedded line
 * numbers jump from 580 to 585, so it is probably just missing from this
 * listing; confirm. */
566 if (master_writable && in_buffer_full > 0) {
568 if ((k = write(master, in_buffer, in_buffer_full)) < 0) {
570 if (errno == EAGAIN || errno == EPIPE || errno == ECONNRESET || errno == EIO)
571 master_writable = false;
573 log_error("write(): %m");
579 assert(in_buffer_full >= (size_t) k);
580 memmove(in_buffer, in_buffer + k, in_buffer_full - k);
/* master -> out_buffer. */
585 if (master_readable && out_buffer_full < LINE_MAX) {
587 if ((k = read(master, out_buffer + out_buffer_full, LINE_MAX - out_buffer_full)) < 0) {
589 if (errno == EAGAIN || errno == EPIPE || errno == ECONNRESET || errno == EIO)
590 master_readable = false;
592 log_error("read(): %m");
597 out_buffer_full += (size_t) k;
/* out_buffer -> stdout, compacting the buffer after a partial write. */
600 if (stdout_writable && out_buffer_full > 0) {
602 if ((k = write(STDOUT_FILENO, out_buffer, out_buffer_full)) < 0) {
604 if (errno == EAGAIN || errno == EPIPE || errno == ECONNRESET || errno == EIO)
605 stdout_writable = false;
607 log_error("write(): %m");
613 assert(out_buffer_full >= (size_t) k);
614 memmove(out_buffer, out_buffer + k, out_buffer_full - k);
615 out_buffer_full -= k;
/* Release the epoll instance and the signalfd; any guards around these
 * two calls are not part of this listing. */
623 close_nointr_nofail(ep);
626 close_nointr_nofail(signal_fd);
/* Program entry point. Visible sequence: parse options, validate the
 * container directory, create a dedicated cgroup, allocate a pty, put the
 * caller's terminal into raw mode, clone() into new namespaces, set up
 * and exec the payload in the cloned process, relay terminal traffic with
 * process_pty(), then wait for the payload and clean up.
 * NOTE(review): the embedded original line numbers skip, so many
 * conditions, goto targets, closing braces and the final return are
 * absent from this listing; comments describe only what is visible. */
631 int main(int argc, char *argv[]) {
633 int r = EXIT_FAILURE, k;
634 char *oldcg = NULL, *newcg = NULL;
635 char **controller = NULL;
637 const char *console = NULL;
638 struct termios saved_attr, raw_attr;
/* Set once saved_attr holds settings that must be restored on exit. */
640 bool saved_attr_valid = false;
643 log_parse_environment();
/* A result <= 0 from parse_argv() takes the branch below; its body is not
 * part of this listing. */
646 if ((r = parse_argv(argc, argv)) <= 0)
/* Resolve the container directory: either an absolute form of the given
 * one, or the current working directory. The condition choosing between
 * the two assignments is not visible. */
652 p = path_make_absolute_cwd(arg_directory);
656 arg_directory = get_current_dir_name();
658 if (!arg_directory) {
659 log_error("Failed to determine path");
663 path_kill_slashes(arg_directory);
/* Preconditions: effective uid 0, sd_booted() positive, the target is not
 * "/" itself, and is_os_tree() accepts the directory. */
665 if (geteuid() != 0) {
666 log_error("Need to be root.");
670 if (sd_booted() <= 0) {
671 log_error("Not running on a systemd system.");
675 if (path_equal(arg_directory, "/")) {
676 log_error("Spawning container on root directory not supported.");
680 if (is_os_tree(arg_directory) <= 0) {
681 log_error("Directory %s doesn't look like an OS root directory. Refusing.", arg_directory);
/* Build the cgroup path "<current cgroup>/nspawn-<pid>" and create it in
 * the SYSTEMD_CGROUP_CONTROLLER hierarchy, then in every controller from
 * arg_controllers. A failure in an extra controller produces only a
 * warning. cg_create_and_attach() presumably also moves the calling
 * process into the new group (pid argument 0); helper not visible here. */
685 if ((k = cg_get_by_pid(SYSTEMD_CGROUP_CONTROLLER, 0, &oldcg)) < 0) {
686 log_error("Failed to determine current cgroup: %s", strerror(-k));
690 if (asprintf(&newcg, "%s/nspawn-%lu", oldcg, (unsigned long) getpid()) < 0) {
691 log_error("Failed to allocate cgroup path.");
695 k = cg_create_and_attach(SYSTEMD_CGROUP_CONTROLLER, newcg, 0);
697 log_error("Failed to create cgroup: %s", strerror(-k));
701 STRV_FOREACH(controller,arg_controllers) {
702 k = cg_create_and_attach(*controller, newcg, 0);
704 log_warning("Failed to create cgroup in controller %s: %s", *controller, strerror(-k));
/* Allocate the pty master; the slave path from ptsname() is later handed
 * to copy_devnodes() as the container console. */
707 if ((master = posix_openpt(O_RDWR|O_NOCTTY|O_CLOEXEC|O_NDELAY)) < 0) {
708 log_error("Failed to acquire pseudo tty: %m");
712 if (!(console = ptsname(master))) {
713 log_error("Failed to determine tty name: %m");
717 log_info("Spawning namespace container on %s (console is %s).", arg_directory, console);
/* Copy the current window size of stdin's terminal to the pty; both
 * ioctl() calls are best effort. */
719 if (ioctl(STDIN_FILENO, TIOCGWINSZ, &ws) >= 0)
720 ioctl(master, TIOCSWINSZ, &ws);
722 if (unlockpt(master) < 0) {
723 log_error("Failed to unlock tty: %m");
/* Save the terminal settings of stdin and switch it to raw mode with echo
 * off. */
727 if (tcgetattr(STDIN_FILENO, &saved_attr) < 0) {
728 log_error("Failed to get terminal attributes: %m");
732 saved_attr_valid = true;
734 raw_attr = saved_attr;
735 cfmakeraw(&raw_attr);
736 raw_attr.c_lflag &= ~ECHO;
738 if (tcsetattr(STDIN_FILENO, TCSANOW, &raw_attr) < 0) {
739 log_error("Failed to set terminal attributes: %m");
/* Block SIGCHLD, SIGWINCH, SIGTERM and SIGINT; the same mask is passed to
 * process_pty(), which reads these signals through a signalfd. */
743 assert_se(sigemptyset(&mask) == 0);
744 sigset_add_many(&mask, SIGCHLD, SIGWINCH, SIGTERM, SIGINT, -1);
745 assert_se(sigprocmask(SIG_BLOCK, &mask, NULL) == 0);
/* Raw clone() with a NULL stack argument: new IPC, mount, PID and UTS
 * namespaces, plus a network namespace when --private-network was given.
 * SIGCHLD is the termination signal delivered to the parent. */
747 pid = syscall(__NR_clone, SIGCHLD|CLONE_NEWIPC|CLONE_NEWNS|CLONE_NEWPID|CLONE_NEWUTS|(arg_private_network ? CLONE_NEWNET : 0), NULL);
/* Two alternative failure messages; the test selecting between them is
 * not part of this listing. */
750 log_error("clone() failed, do you have namespace support enabled in your kernel? (You need UTS, IPC, PID and NET namespacing built in): %m");
752 log_error("clone() failed: %m");
/* From here on the code appears to run in the cloned process (the
 * enclosing condition is not visible). */
761 const char *home = NULL;
762 uid_t uid = (uid_t) -1;
763 gid_t gid = (gid_t) -1;
/* Environment for the payload. Slots 0 and 1 are the literals below,
 * slot 2 receives TERM=, and slots 3 to 5 receive HOME=, USER= and
 * LOGNAME= further down. */
764 const char *envp[] = {
765 "PATH=/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin",
766 "container=systemd-nspawn", /* LXC sets container=lxc, so follow the scheme here */
774 envp[2] = strv_find_prefix(environ, "TERM=");
/* Drop the pty master, the inherited stdio descriptors and every other
 * open descriptor. */
776 close_nointr_nofail(master);
778 close_nointr(STDIN_FILENO);
779 close_nointr(STDOUT_FILENO);
780 close_nointr(STDERR_FILENO);
782 close_all_fds(NULL, 0);
/* Back to default signal handling with an empty signal mask. */
784 reset_all_signal_handlers();
786 assert_se(sigemptyset(&mask) == 0);
787 assert_se(sigprocmask(SIG_SETMASK, &mask, NULL) == 0);
/* Request SIGKILL for this process when its parent dies. */
792 if (prctl(PR_SET_PDEATHSIG, SIGKILL) < 0)
795 /* Mark / as private, in case somebody marked it shared */
796 if (mount(NULL, "/", NULL, MS_PRIVATE|MS_REC, NULL) < 0)
/* Populate the container tree: filesystems first, then device nodes and
 * the console. */
799 if (mount_all(arg_directory) < 0)
802 if (copy_devnodes(arg_directory, console) < 0)
805 if (chdir(arg_directory) < 0) {
806 log_error("chdir(%s) failed: %m", arg_directory);
/* Open the container's console via a path relative to the directory just
 * entered. Descriptors 0-2 were closed above, so the open is required to
 * yield STDIN_FILENO, which is then duplicated onto stdout and stderr. */
810 if (open_terminal("dev/console", O_RDWR) != STDIN_FILENO ||
811 dup2(STDIN_FILENO, STDOUT_FILENO) != STDOUT_FILENO ||
812 dup2(STDIN_FILENO, STDERR_FILENO) != STDERR_FILENO)
/* Make the container directory the new root: move-mount it onto "/",
 * chroot() into the current directory, then chdir() to the new "/". */
815 if (mount(arg_directory, "/", "bind", MS_BIND|MS_MOVE, NULL) < 0) {
816 log_error("mount(MS_MOVE) failed: %m");
820 if (chroot(".") < 0) {
821 log_error("chroot() failed: %m");
825 if (chdir("/") < 0) {
826 log_error("chdir() failed: %m");
834 if (drop_capabilities() < 0)
/* Switch to the requested user: resolve credentials, create the home
 * directory, then set supplementary groups, gid and uid in that order.
 * The guard around this section (presumably a test of arg_user) is not
 * visible.
 * NOTE(review): get_user_creds(), mkdir_parents() and safe_mkdir() are
 * project helpers tested with "< 0" but reported with %m; that is only
 * accurate if they leave errno set. Other helpers in this file return a
 * negative errno that is passed to strerror(-k). Confirm.
 * NOTE(review): the messages below say "setregid()" and "setreuid()"
 * while the calls are setresgid() and setresuid(). */
839 if (get_user_creds((const char**)&arg_user, &uid, &gid, &home) < 0) {
840 log_error("get_user_creds() failed: %m");
844 if (mkdir_parents(home, 0775) < 0) {
845 log_error("mkdir_parents() failed: %m");
849 if (safe_mkdir(home, 0775, uid, gid) < 0) {
850 log_error("safe_mkdir() failed: %m");
854 if (initgroups((const char*)arg_user, gid) < 0) {
855 log_error("initgroups() failed: %m");
859 if (setresgid(gid, gid, gid) < 0) {
860 log_error("setregid() failed: %m");
864 if (setresuid(uid, uid, uid) < 0) {
865 log_error("setreuid() failed: %m");
/* Fill envp slots 3 to 5, defaulting to /root and root when no home or
 * user is set. */
870 if ((asprintf((char**)(envp + 3), "HOME=%s", home? home: "/root") < 0) ||
871 (asprintf((char**)(envp + 4), "USER=%s", arg_user? arg_user : "root") < 0) ||
872 (asprintf((char**)(envp + 5), "LOGNAME=%s", arg_user? arg_user : "root") < 0)) {
873 log_error("Out of memory");
/* Hostname is derived from arg_directory through file_name_from_path()
 * (presumably its last path component; helper not visible). The
 * sethostname() result is ignored. */
877 if ((hn = file_name_from_path(arg_directory)))
878 sethostname(hn, strlen(hn));
/* Run either the command from the remaining arguments or /bin/bash with
 * argv[0] "-bash" from the home directory; the condition choosing between
 * them is not visible (compare the "argc > optind" test further down).
 * NOTE(review): the failure message says "execv()" although the calls are
 * execvpe() and execle(). */
881 execvpe(argv[optind], argv + optind, (char**) envp);
883 chdir(home ? home : "/root");
884 execle("/bin/bash", "-bash", NULL, (char**) envp);
887 log_error("execv() failed: %m");
/* Parent side: relay terminal traffic until process_pty() returns. */
893 if (process_pty(master, &mask) < 0)
/* Restore the terminal before waiting, and clear the flag so the cleanup
 * code below does not restore it a second time. */
896 if (saved_attr_valid) {
897 tcsetattr(STDIN_FILENO, TCSANOW, &saved_attr);
898 saved_attr_valid = false;
901 r = wait_for_terminate_and_warn(argc > optind ? argv[optind] : "bash", pid);
/* Cleanup (the label and guards are not part of this listing): restore
 * the terminal if still pending, close the pty master, attach back to the
 * original cgroup, kill what is left in the container cgroup, and free
 * the controller list. */
907 if (saved_attr_valid)
908 tcsetattr(STDIN_FILENO, TCSANOW, &saved_attr);
911 close_nointr_nofail(master);
914 cg_attach(SYSTEMD_CGROUP_CONTROLLER, oldcg, 0);
917 cg_kill_recursive_and_wait(SYSTEMD_CGROUP_CONTROLLER, newcg, true);
920 strv_free(arg_controllers);