1 /*-*- Mode: C; c-basic-offset: 8; indent-tabs-mode: nil -*-*/
4 This file is part of systemd.
6 Copyright 2010 Lennart Poettering
8 systemd is free software; you can redistribute it and/or modify it
9 under the terms of the GNU Lesser General Public License as published by
10 the Free Software Foundation; either version 2.1 of the License, or
11 (at your option) any later version.
13 systemd is distributed in the hope that it will be useful, but
14 WITHOUT ANY WARRANTY; without even the implied warranty of
15 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
16 Lesser General Public License for more details.
18 You should have received a copy of the GNU Lesser General Public License
19 along with systemd; If not, see <http://www.gnu.org/licenses/>.
25 #include <sys/types.h>
26 #include <sys/syscall.h>
27 #include <sys/mount.h>
33 #include <sys/prctl.h>
34 #include <sys/capability.h>
36 #include <sys/epoll.h>
38 #include <sys/signalfd.h>
42 #include <systemd/sd-daemon.h>
49 #include "cgroup-util.h"
51 #include "loopback-setup.h"
/* Command line settings; parse_argv() stores the duplicated option
 * arguments in the two strings below. */
53 static char *arg_directory = NULL;
54 static char *arg_user = NULL;
/* Set by --private-network; main() then adds CLONE_NEWNET to the clone() flags. */
55 static bool arg_private_network = false;
/* Print the usage text to standard output. The single %s in the format
 * is filled with program_invocation_short_name. */
57 static int help(void) {
59 printf("%s [OPTIONS...] [PATH] [ARGUMENTS...]\n\n"
60 "Spawn a minimal namespace container for debugging, testing and building.\n\n"
61 " -h --help Show this help\n"
62 " -D --directory=NAME Root directory for the container\n"
63 " -u --user=USER Run the command under specified user or uid\n"
64 " --private-network Disable network in container\n",
65 program_invocation_short_name);
/* Parse the command line with getopt_long().
 *
 * Recognized options: -h/--help, -D/--directory=NAME, -u/--user=USER and
 * the long-only --private-network. Option arguments are duplicated with
 * strdup() into arg_directory and arg_user; --private-network sets
 * arg_private_network.
 *
 * main() tests the result with "<= 0" before going on. */
70 static int parse_argv(int argc, char *argv[]) {
/* Code for the long-only option; 0x100 lies above every single-character
 * option value, so it cannot collide with a short option. */
73 ARG_PRIVATE_NETWORK = 0x100
76 static const struct option options[] = {
77 { "help", no_argument, NULL, 'h' },
78 { "directory", required_argument, NULL, 'D' },
79 { "user", required_argument, NULL, 'u' },
80 { "private-network", no_argument, NULL, ARG_PRIVATE_NETWORK },
/* The leading '+' in the short option string asks glibc's getopt to stop
 * at the first non-option argument, so the options of the command to run
 * inside the container are left alone. */
89 while ((c = getopt_long(argc, argv, "+hD:u:", options, NULL)) >= 0) {
99 if (!(arg_directory = strdup(optarg))) {
100 log_error("Failed to duplicate root directory.");
108 if (!(arg_user = strdup(optarg))) {
109 log_error("Failed to duplicate user name.");
115 case ARG_PRIVATE_NETWORK:
116 arg_private_network = true;
123 log_error("Unknown option code %c", c);
/* Mount the entries of mount_table below the container root "dest", then
 * try to bind mount the host's /etc/localtime and /etc/timezone read-only
 * into the container.
 *
 * dest: root directory of the container; every target path is built as
 *       dest + "/" + entry path. */
131 static int mount_all(const char *dest) {
133 typedef struct MountPoint {
/* Columns as they are used further down: mount source, target path below
 * dest, a third string (looks like the file system type), mount options,
 * mount flags, and whether a failed mount is reported as an error.
 * Read-only bind mounts take two entries: a plain bind mount followed by
 * a MS_REMOUNT|MS_RDONLY pass over the same target. */
142 static const MountPoint mount_table[] = {
143 { "proc", "/proc", "proc", NULL, MS_NOSUID|MS_NOEXEC|MS_NODEV, true },
144 { "/proc/sys", "/proc/sys", "bind", NULL, MS_BIND, true }, /* Bind mount first */
145 { "/proc/sys", "/proc/sys", "bind", NULL, MS_BIND|MS_RDONLY|MS_REMOUNT, true }, /* Then, make it r/o */
146 { "/sys", "/sys", "bind", NULL, MS_BIND, true }, /* Bind mount first */
147 { "/sys", "/sys", "bind", NULL, MS_BIND|MS_RDONLY|MS_REMOUNT, true }, /* Then, make it r/o */
148 { "tmpfs", "/dev", "tmpfs", "mode=755", MS_NOSUID|MS_STRICTATIME, true },
149 { "/dev/pts", "/dev/pts", "bind", NULL, MS_BIND, true },
150 { "tmpfs", "/run", "tmpfs", "mode=755", MS_NOSUID|MS_NODEV|MS_STRICTATIME, true },
152 { "/sys/fs/selinux", "/sys/fs/selinux", "bind", NULL, MS_BIND, false }, /* Bind mount first */
153 { "/sys/fs/selinux", "/sys/fs/selinux", "bind", NULL, MS_BIND|MS_RDONLY|MS_REMOUNT, false }, /* Then, make it r/o */
161 for (k = 0; k < ELEMENTSOF(mount_table); k++) {
/* The table paths already start with '/', so the result contains a
 * doubled slash after dest. */
164 if (asprintf(&where, "%s/%s", dest, mount_table[k].where) < 0) {
165 log_error("Out of memory");
173 if ((t = path_is_mount_point(where, false)) < 0) {
174 log_error("Failed to detect whether %s is a mount point: %s", where, strerror(-t));
/* The return value of mkdir_p() is not checked. */
183 mkdir_p(where, 0755);
/* A failing mount() is only reported for entries marked fatal. */
185 if (mount(mount_table[k].what,
188 mount_table[k].flags,
189 mount_table[k].options) < 0 &&
190 mount_table[k].fatal) {
192 log_error("mount(%s) failed: %m", where);
/* Both timezone mounts are best effort: nothing is logged on failure, and
 * the read-only remount is attempted only if the bind mount succeeded. */
201 /* Fix the timezone, if possible */
202 if (asprintf(&where, "%s/etc/localtime", dest) >= 0) {
204 if (mount("/etc/localtime", where, "bind", MS_BIND, NULL) >= 0)
205 mount("/etc/localtime", where, "bind", MS_BIND|MS_REMOUNT|MS_RDONLY, NULL);
210 if (asprintf(&where, "%s/etc/timezone", dest) >= 0) {
212 if (mount("/etc/timezone", where, "bind", MS_BIND, NULL) >= 0)
213 mount("/etc/timezone", where, "bind", MS_BIND|MS_REMOUNT|MS_RDONLY, NULL);
/* Populate <dest>/dev and set up the container console.
 *
 * For each name in devnodes the host node /dev/<name> is stat()ed and a
 * node with the same mode and device number is created at
 * <dest>/dev/<name>. A host node that does not exist (ENOENT) is not
 * reported; any other stat() failure, a node that is neither a char nor a
 * block device, and a failing mknod() are logged.
 *
 * Afterwards <dest>/dev/console is created and the tty named by "console"
 * is bind mounted on top of it; finally chmod_and_chown() is called on
 * that tty with mode 0600 and ids 0/0.
 *
 * dest:    root directory of the container.
 * console: path of the tty that becomes the container's /dev/console. */
221 static int copy_devnodes(const char *dest, const char *console) {
223 static const char devnodes[] =
238 char *from = NULL, *to = NULL;
245 NULSTR_FOREACH(d, devnodes) {
/* NOTE(review): the asprintf() return values are not inspected on these
 * two lines. */
248 asprintf(&from, "/dev/%s", d);
249 asprintf(&to, "%s/dev/%s", dest, d);
252 log_error("Failed to allocate devnode path");
265 if (stat(from, &st) < 0) {
267 if (errno != ENOENT) {
268 log_error("Failed to stat %s: %m", from);
273 } else if (!S_ISCHR(st.st_mode) && !S_ISBLK(st.st_mode)) {
275 log_error("%s is not a char or block device, cannot copy.", from);
279 } else if (mknod(to, st.st_mode, st.st_rdev) < 0) {
/* Report the node that could not be created, not the container root. */
281 log_error("mknod(%s) failed: %m", to);
290 if (stat(console, &st) < 0) {
292 log_error("Failed to stat %s: %m", console);
298 } else if (!S_ISCHR(st.st_mode)) {
300 log_error("/dev/console is not a char device.");
307 if (asprintf(&to, "%s/dev/console", dest) < 0) {
309 log_error("Out of memory");
316 /* We need to bind mount the right tty to /dev/console since
317 * ptys can only exist on pts file systems. To have something
318 * to bind mount things on we create a device node first, that
319 * has the right major/minor (note that the major minor
320 * doesn't actually matter here, since we mount it over
323 if (mknod(to, (st.st_mode & ~07777) | 0600, st.st_rdev) < 0)
324 log_error("mknod for /dev/console failed: %m");
326 if (mount(console, to, "bind", MS_BIND, NULL) < 0) {
327 log_error("bind mount for /dev/console failed: %m");
335 if ((k = chmod_and_chown(console, 0600, 0, 0)) < 0) {
336 log_error("Failed to correct access mode for TTY: %s", strerror(-k));
/* Shrink the capability bounding set of the calling process.
 *
 * Walks every capability number from 0 up to and including
 * cap_last_cap(). Each number is looked up in retain[]; the ones not
 * found there are removed from the bounding set with
 * prctl(PR_CAPBSET_DROP) (presumably the listed ones are skipped, TODO
 * confirm). A failing prctl() is logged. */
348 static int drop_capabilities(void) {
349 static const unsigned long retain[] = {
359 CAP_NET_BIND_SERVICE,
375 for (l = 0; l <= cap_last_cap(); l++) {
378 for (i = 0; i < ELEMENTSOF(retain); i++)
/* i stops short of the array size exactly when l was found in retain[]. */
382 if (i < ELEMENTSOF(retain))
385 if (prctl(PR_CAPBSET_DROP, l) < 0) {
386 log_error("PR_CAPBSET_DROP failed: %m");
/* Heuristic test whether "path" is the root of an OS installation, based
 * on <path>/bin/sh. The final return maps a negative r to 0 and any
 * other value to 1; main() refuses to continue when the result is <= 0. */
394 static int is_os_tree(const char *path) {
397 /* We use /bin/sh as flag file if something is an OS */
399 if (asprintf(&p, "%s/bin/sh", path) < 0)
405 return r < 0 ? 0 : 1;
/* Relay data between our stdin/stdout and the pty master, and forward
 * terminal window size changes to the master.
 *
 * stdin, stdout and the master are switched to non-blocking mode and
 * watched with edge-triggered epoll; the signals in "mask" are received
 * through a signalfd registered in the same epoll instance.
 *
 * master: file descriptor of the pty master.
 * mask:   signal set handed to signalfd().
 *
 * main() treats a negative return value as failure. */
408 static int process_pty(int master, sigset_t *mask) {
/* in_buffer carries bytes from stdin to the master, out_buffer carries
 * bytes from the master to stdout; *_full is the number of bytes
 * currently queued in each. */
410 char in_buffer[LINE_MAX], out_buffer[LINE_MAX];
411 size_t in_buffer_full = 0, out_buffer_full = 0;
412 struct epoll_event stdin_ev, stdout_ev, master_ev, signal_ev;
/* With EPOLLET an fd is reported only when its readiness changes, so the
 * last known state is remembered in these flags. They are cleared again
 * further down when read()/write() fails with EAGAIN, EPIPE, ECONNRESET
 * or EIO. */
413 bool stdin_readable = false, stdout_writable = false, master_readable = false, master_writable = false;
414 int ep = -1, signal_fd = -1, r;
416 fd_nonblock(STDIN_FILENO, 1);
417 fd_nonblock(STDOUT_FILENO, 1);
418 fd_nonblock(master, 1);
420 if ((signal_fd = signalfd(-1, mask, SFD_NONBLOCK|SFD_CLOEXEC)) < 0) {
421 log_error("signalfd(): %m");
426 if ((ep = epoll_create1(EPOLL_CLOEXEC)) < 0) {
427 log_error("Failed to create epoll: %m");
433 stdin_ev.events = EPOLLIN|EPOLLET;
434 stdin_ev.data.fd = STDIN_FILENO;
437 stdout_ev.events = EPOLLOUT|EPOLLET;
438 stdout_ev.data.fd = STDOUT_FILENO;
441 master_ev.events = EPOLLIN|EPOLLOUT|EPOLLET;
442 master_ev.data.fd = master;
/* The signalfd is the only fd registered without EPOLLET. */
445 signal_ev.events = EPOLLIN;
446 signal_ev.data.fd = signal_fd;
448 if (epoll_ctl(ep, EPOLL_CTL_ADD, STDIN_FILENO, &stdin_ev) < 0 ||
449 epoll_ctl(ep, EPOLL_CTL_ADD, STDOUT_FILENO, &stdout_ev) < 0 ||
450 epoll_ctl(ep, EPOLL_CTL_ADD, master, &master_ev) < 0 ||
451 epoll_ctl(ep, EPOLL_CTL_ADD, signal_fd, &signal_ev) < 0) {
452 log_error("Failed to register fds in epoll: %m");
458 struct epoll_event ev[16];
462 if ((nfds = epoll_wait(ep, ev, ELEMENTSOF(ev), -1)) < 0) {
464 if (errno == EINTR || errno == EAGAIN)
467 log_error("epoll_wait(): %m");
/* Translate the reported events into the readiness flags; EPOLLHUP also
 * counts as ready. */
474 for (i = 0; i < nfds; i++) {
475 if (ev[i].data.fd == STDIN_FILENO) {
477 if (ev[i].events & (EPOLLIN|EPOLLHUP))
478 stdin_readable = true;
480 } else if (ev[i].data.fd == STDOUT_FILENO) {
482 if (ev[i].events & (EPOLLOUT|EPOLLHUP))
483 stdout_writable = true;
485 } else if (ev[i].data.fd == master) {
487 if (ev[i].events & (EPOLLIN|EPOLLHUP))
488 master_readable = true;
490 if (ev[i].events & (EPOLLOUT|EPOLLHUP))
491 master_writable = true;
493 } else if (ev[i].data.fd == signal_fd) {
494 struct signalfd_siginfo sfsi;
497 if ((n = read(signal_fd, &sfsi, sizeof(sfsi))) != sizeof(sfsi)) {
500 log_error("Failed to read from signalfd: invalid block size");
505 if (errno != EINTR && errno != EAGAIN) {
506 log_error("Failed to read from signalfd: %m");
512 if (sfsi.ssi_signo == SIGWINCH) {
515 /* The window size changed, let's forward that. */
516 if (ioctl(STDIN_FILENO, TIOCGWINSZ, &ws) >= 0)
517 ioctl(master, TIOCSWINSZ, &ws);
/* Keep shovelling for as long as one direction can make progress: an
 * empty buffer whose source is readable, or a non-empty buffer whose
 * destination is writable. */
526 while ((stdin_readable && in_buffer_full <= 0) ||
527 (master_writable && in_buffer_full > 0) ||
528 (master_readable && out_buffer_full <= 0) ||
529 (stdout_writable && out_buffer_full > 0)) {
531 if (stdin_readable && in_buffer_full < LINE_MAX) {
533 if ((k = read(STDIN_FILENO, in_buffer + in_buffer_full, LINE_MAX - in_buffer_full)) < 0) {
535 if (errno == EAGAIN || errno == EPIPE || errno == ECONNRESET || errno == EIO)
536 stdin_readable = false;
538 log_error("read(): %m");
543 in_buffer_full += (size_t) k;
546 if (master_writable && in_buffer_full > 0) {
548 if ((k = write(master, in_buffer, in_buffer_full)) < 0) {
550 if (errno == EAGAIN || errno == EPIPE || errno == ECONNRESET || errno == EIO)
551 master_writable = false;
553 log_error("write(): %m");
/* Partial write: move the unwritten tail to the front of the buffer. */
559 assert(in_buffer_full >= (size_t) k);
560 memmove(in_buffer, in_buffer + k, in_buffer_full - k);
565 if (master_readable && out_buffer_full < LINE_MAX) {
567 if ((k = read(master, out_buffer + out_buffer_full, LINE_MAX - out_buffer_full)) < 0) {
569 if (errno == EAGAIN || errno == EPIPE || errno == ECONNRESET || errno == EIO)
570 master_readable = false;
572 log_error("read(): %m");
577 out_buffer_full += (size_t) k;
580 if (stdout_writable && out_buffer_full > 0) {
582 if ((k = write(STDOUT_FILENO, out_buffer, out_buffer_full)) < 0) {
584 if (errno == EAGAIN || errno == EPIPE || errno == ECONNRESET || errno == EIO)
585 stdout_writable = false;
587 log_error("write(): %m");
593 assert(out_buffer_full >= (size_t) k);
594 memmove(out_buffer, out_buffer + k, out_buffer_full - k);
595 out_buffer_full -= k;
603 close_nointr_nofail(ep);
606 close_nointr_nofail(signal_fd);
/* Entry point: spawn a command (or a login bash) inside a container rooted
 * at arg_directory.
 *
 * The parent checks its preconditions, moves itself into a fresh
 * "nspawn-<pid>" cgroup, allocates a pty, puts its own terminal into raw
 * mode and clone()s a child into new IPC, mount, PID and UTS namespaces
 * (plus a network namespace with --private-network). The child builds the
 * container file system view and exec()s the payload; the parent relays
 * terminal I/O through process_pty() and then waits for the child. */
611 int main(int argc, char *argv[]) {
613 int r = EXIT_FAILURE, k;
614 char *oldcg = NULL, *newcg = NULL;
616 const char *console = NULL;
617 struct termios saved_attr, raw_attr;
/* True while saved_attr holds terminal settings that still have to be
 * restored on stdin. */
619 bool saved_attr_valid = false;
622 log_parse_environment();
625 if ((r = parse_argv(argc, argv)) <= 0)
/* Work out the container root directory. */
631 p = path_make_absolute_cwd(arg_directory);
635 arg_directory = get_current_dir_name();
637 if (!arg_directory) {
638 log_error("Failed to determine path");
642 path_kill_slashes(arg_directory);
/* Preconditions: effective uid 0, a systemd-booted host, a root directory
 * other than "/", and a directory that passes is_os_tree(). */
644 if (geteuid() != 0) {
645 log_error("Need to be root.");
649 if (sd_booted() <= 0) {
650 log_error("Not running on a systemd system.");
654 if (path_equal(arg_directory, "/")) {
655 log_error("Spawning container on root directory not supported.");
659 if (is_os_tree(arg_directory) <= 0) {
660 log_error("Directory %s doesn't look like an OS root directory. Refusing.", arg_directory);
/* Create a child cgroup named after our pid below the current one and
 * attach to it; oldcg is kept so that the cleanup code can attach back. */
664 if ((k = cg_get_by_pid(SYSTEMD_CGROUP_CONTROLLER, 0, &oldcg)) < 0) {
665 log_error("Failed to determine current cgroup: %s", strerror(-k));
669 if (asprintf(&newcg, "%s/nspawn-%lu", oldcg, (unsigned long) getpid()) < 0) {
670 log_error("Failed to allocate cgroup path.");
674 if ((k = cg_create_and_attach(SYSTEMD_CGROUP_CONTROLLER, newcg, 0)) < 0) {
675 log_error("Failed to create cgroup: %s", strerror(-k));
/* Allocate the pty whose slave side becomes the container's console. */
679 if ((master = posix_openpt(O_RDWR|O_NOCTTY|O_CLOEXEC|O_NDELAY)) < 0) {
680 log_error("Failed to acquire pseudo tty: %m");
684 if (!(console = ptsname(master))) {
685 log_error("Failed to determine tty name: %m");
689 log_info("Spawning namespace container on %s (console is %s).", arg_directory, console);
/* Copy our terminal's window size to the pty, if it can be queried. */
691 if (ioctl(STDIN_FILENO, TIOCGWINSZ, &ws) >= 0)
692 ioctl(master, TIOCSWINSZ, &ws);
694 if (unlockpt(master) < 0) {
695 log_error("Failed to unlock tty: %m");
/* Save the current terminal settings and switch stdin to raw mode with
 * echo off. */
699 if (tcgetattr(STDIN_FILENO, &saved_attr) < 0) {
700 log_error("Failed to get terminal attributes: %m");
704 saved_attr_valid = true;
706 raw_attr = saved_attr;
707 cfmakeraw(&raw_attr);
708 raw_attr.c_lflag &= ~ECHO;
710 if (tcsetattr(STDIN_FILENO, TCSANOW, &raw_attr) < 0) {
711 log_error("Failed to set terminal attributes: %m");
/* Block these signals; the same mask is later handed to process_pty(),
 * which passes it to signalfd(). */
715 assert_se(sigemptyset(&mask) == 0);
716 sigset_add_many(&mask, SIGCHLD, SIGWINCH, SIGTERM, SIGINT, -1);
717 assert_se(sigprocmask(SIG_BLOCK, &mask, NULL) == 0);
/* Raw clone() system call with a NULL stack argument. */
719 pid = syscall(__NR_clone, SIGCHLD|CLONE_NEWIPC|CLONE_NEWNS|CLONE_NEWPID|CLONE_NEWUTS|(arg_private_network ? CLONE_NEWNET : 0), NULL);
722 log_error("clone() failed, do you have namespace support enabled in your kernel? (You need UTS, IPC, PID and NET namespacing built in): %m");
724 log_error("clone() failed: %m");
/* The statements from here up to the "execv() failed" message set up the
 * container from inside the new namespaces and exec the payload. */
733 const char *home = NULL;
734 uid_t uid = (uid_t) -1;
735 gid_t gid = (gid_t) -1;
/* Environment of the payload: slot 2 receives TERM= from our own
 * environment, slots 3 to 5 receive HOME=, USER= and LOGNAME= below. */
736 const char *envp[] = {
737 "PATH=/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin",
738 "container=systemd-nspawn", /* LXC sets container=lxc, so follow the scheme here */
746 envp[2] = strv_find_prefix(environ, "TERM=");
/* Drop every inherited file descriptor: the pty master, stdio, and
 * whatever else is open. */
748 close_nointr_nofail(master);
750 close_nointr(STDIN_FILENO);
751 close_nointr(STDOUT_FILENO);
752 close_nointr(STDERR_FILENO);
754 close_all_fds(NULL, 0);
/* Undo the signal setup done before clone(): default handlers, empty
 * signal mask. */
756 reset_all_signal_handlers();
758 assert_se(sigemptyset(&mask) == 0);
759 assert_se(sigprocmask(SIG_SETMASK, &mask, NULL) == 0);
/* Ask the kernel to send us SIGKILL when the parent dies. */
764 if (prctl(PR_SET_PDEATHSIG, SIGKILL) < 0)
767 /* Mark / as private, in case somebody marked it shared */
768 if (mount(NULL, "/", NULL, MS_PRIVATE|MS_REC, NULL) < 0)
771 if (mount_all(arg_directory) < 0)
774 if (copy_devnodes(arg_directory, console) < 0)
777 if (chdir(arg_directory) < 0) {
778 log_error("chdir(%s) failed: %m", arg_directory);
/* All descriptors were closed above, so the console is expected to open
 * as fd 0; it is then duplicated onto stdout and stderr. The path is
 * relative to the container root we just changed into. */
782 if (open_terminal("dev/console", O_RDWR) != STDIN_FILENO ||
783 dup2(STDIN_FILENO, STDOUT_FILENO) != STDOUT_FILENO ||
784 dup2(STDIN_FILENO, STDERR_FILENO) != STDERR_FILENO)
/* NOTE(review): mount(2) tests MS_BIND before MS_MOVE, so with both flags
 * set this looks like it performs a bind mount rather than a move;
 * confirm which of the two is intended. */
787 if (mount(arg_directory, "/", "bind", MS_BIND|MS_MOVE, NULL) < 0) {
788 log_error("mount(MS_MOVE) failed: %m");
792 if (chroot(".") < 0) {
793 log_error("chroot() failed: %m");
797 if (chdir("/") < 0) {
798 log_error("chdir() failed: %m");
806 if (drop_capabilities() < 0)
/* Switch to the requested user: resolve the credentials, create the home
 * directory, then set supplementary groups, gid and uid in that order. */
811 if (get_user_creds((const char**)&arg_user, &uid, &gid, &home) < 0) {
812 log_error("get_user_creds() failed: %m");
816 if (mkdir_parents(home, 0775) < 0) {
817 log_error("mkdir_parents() failed: %m");
821 if (safe_mkdir(home, 0775, uid, gid) < 0) {
822 log_error("safe_mkdir() failed: %m");
826 if (initgroups((const char*)arg_user, gid) < 0) {
827 log_error("initgroups() failed: %m");
/* The two messages below name the calls that are actually made. */
831 if (setresgid(gid, gid, gid) < 0) {
832 log_error("setresgid() failed: %m");
836 if (setresuid(uid, uid, uid) < 0) {
837 log_error("setresuid() failed: %m");
/* Fill the remaining environment slots; root's values are the defaults. */
842 if ((asprintf((char**)(envp + 3), "HOME=%s", home? home: "/root") < 0) ||
843 (asprintf((char**)(envp + 4), "USER=%s", arg_user? arg_user : "root") < 0) ||
844 (asprintf((char**)(envp + 5), "LOGNAME=%s", arg_user? arg_user : "root") < 0)) {
845 log_error("Out of memory");
/* The host name is derived from the container directory (presumably its
 * last path component, TODO confirm); a failing sethostname() is ignored. */
849 if ((hn = file_name_from_path(arg_directory)))
850 sethostname(hn, strlen(hn));
/* Run the command given on the command line, or start bash with "-bash"
 * as argv[0] (the leading dash requests a login shell) from the home
 * directory. */
853 execvpe(argv[optind], argv + optind, (char**) envp);
855 chdir(home ? home : "/root");
856 execle("/bin/bash", "-bash", NULL, (char**) envp);
859 log_error("execv() failed: %m");
/* Relay terminal I/O, then restore our terminal settings before waiting
 * for the child. */
865 if (process_pty(master, &mask) < 0)
868 if (saved_attr_valid) {
869 tcsetattr(STDIN_FILENO, TCSANOW, &saved_attr);
870 saved_attr_valid = false;
873 r = wait_for_terminate_and_warn(argc > optind ? argv[optind] : "bash", pid);
/* Cleanup: restore the terminal if that has not happened yet, close the
 * pty master, attach back to the original cgroup and call
 * cg_kill_recursive_and_wait() on the container's cgroup. */
879 if (saved_attr_valid)
880 tcsetattr(STDIN_FILENO, TCSANOW, &saved_attr);
883 close_nointr_nofail(master);
886 cg_attach(SYSTEMD_CGROUP_CONTROLLER, oldcg, 0);
889 cg_kill_recursive_and_wait(SYSTEMD_CGROUP_CONTROLLER, newcg, true);