1 /*-*- Mode: C; c-basic-offset: 8; indent-tabs-mode: nil -*-*/
4 This file is part of systemd.
6 Copyright 2010 Lennart Poettering
8 systemd is free software; you can redistribute it and/or modify it
9 under the terms of the GNU Lesser General Public License as published by
10 the Free Software Foundation; either version 2.1 of the License, or
11 (at your option) any later version.
13 systemd is distributed in the hope that it will be useful, but
14 WITHOUT ANY WARRANTY; without even the implied warranty of
15 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
16 Lesser General Public License for more details.
18 You should have received a copy of the GNU Lesser General Public License
19 along with systemd; If not, see <http://www.gnu.org/licenses/>.
25 #include <sys/types.h>
26 #include <sys/syscall.h>
27 #include <sys/mount.h>
33 #include <sys/prctl.h>
34 #include <sys/capability.h>
36 #include <sys/epoll.h>
38 #include <sys/signalfd.h>
42 #include <systemd/sd-daemon.h>
49 #include "cgroup-util.h"
51 #include "loopback-setup.h"
/* Command-line settings. The visible parse_argv() code fills them in. */

/* Container root directory: duplicated from the -D/--directory argument.
 * main() also assigns get_current_dir_name() to it on a path whose condition
 * is not visible in this excerpt. */
53 static char *arg_directory = NULL;
/* User name or uid given with -u/--user; stays NULL unless the option is used. */
54 static char *arg_user = NULL;
/* String vector obtained by splitting the -C/--controllers argument on ','. */
55 static char **arg_controllers = NULL;
/* Set by --private-network; main() then adds CLONE_NEWNET to the clone flags. */
56 static bool arg_private_network = false;
/* Prints the usage text to stdout. The leading %s of the format string is
 * filled with program_invocation_short_name, followed by one line per
 * supported option (-h, -D, -u, -C, --private-network).
 * NOTE(review): the tail of this function (return statement, closing brace)
 * is not part of this excerpt. */
58 static int help(void) {
60 printf("%s [OPTIONS...] [PATH] [ARGUMENTS...]\n\n"
61 "Spawn a minimal namespace container for debugging, testing and building.\n\n"
62 " -h --help Show this help\n"
63 " -D --directory=NAME Root directory for the container\n"
64 " -u --user=USER Run the command under specified user or uid\n"
65 " -C --controllers=LIST Put the container in specified comma-separated cgroup hierarchies\n"
66 " --private-network Disable network in container\n",
67 program_invocation_short_name);
/* Parses the command line with getopt_long() and stores the results in the
 * file-scope arg_* variables.
 *
 * argc/argv: the arguments as received by main().
 * Return value: main() treats a result <= 0 as "do not continue"; the return
 * statements themselves are not visible in this excerpt.
 * NOTE(review): the switch/case skeleton around the option handlers below is
 * missing from this excerpt; only the handler bodies are visible. */
72 static int parse_argv(int argc, char *argv[]) {
/* Code for the long-only option; 0x100 lies above every single-byte short
 * option character, so it cannot collide with one. */
75 ARG_PRIVATE_NETWORK = 0x100
78 static const struct option options[] = {
79 { "help", no_argument, NULL, 'h' },
80 { "directory", required_argument, NULL, 'D' },
81 { "user", required_argument, NULL, 'u' },
82 { "controllers", required_argument, NULL, 'C' },
83 { "private-network", no_argument, NULL, ARG_PRIVATE_NETWORK },
/* The leading '+' in the option string asks GNU getopt to stop at the first
 * non-option argument, so options that belong to the command run inside the
 * container are left alone. */
92 while ((c = getopt_long(argc, argv, "+hD:u:C:", options, NULL)) >= 0) {
/* -D: keep a private copy of the directory argument. */
102 if (!(arg_directory = strdup(optarg))) {
103 log_error("Failed to duplicate root directory.");
/* -u: keep a private copy of the user argument. */
111 if (!(arg_user = strdup(optarg))) {
112 log_error("Failed to duplicate user name.");
/* -C: release any list from an earlier -C before storing the new one. */
119 strv_free(arg_controllers);
120 arg_controllers = strv_split(optarg, ",");
121 if (!arg_controllers) {
122 log_error("Failed to split controllers list.");
/* Presumably removes duplicate controller names - confirm against strv_uniq(). */
125 strv_uniq(arg_controllers);
129 case ARG_PRIVATE_NETWORK:
130 arg_private_network = true;
137 log_error("Unknown option code %c", c);
/* Mounts the entries of a static table beneath the container root.
 *
 * dest: container root directory; each entry's "where" path is appended to it.
 * NOTE(review): the MountPoint field declarations, the skip/continue logic and
 * the return statements are not visible in this excerpt. The code below uses
 * the fields what, where, options, flags and fatal. */
145 static int mount_all(const char *dest) {
147 typedef struct MountPoint {
/* Table columns, presumably in declaration order: what, where, type, options,
 * flags, fatal - TODO confirm against the struct definition.
 * Read-only bind mounts are listed twice: first the plain bind mount, then a
 * remount of the same target with MS_RDONLY added. */
156 static const MountPoint mount_table[] = {
157 { "proc", "/proc", "proc", NULL, MS_NOSUID|MS_NOEXEC|MS_NODEV, true },
158 { "/proc/sys", "/proc/sys", "bind", NULL, MS_BIND, true }, /* Bind mount first */
159 { "/proc/sys", "/proc/sys", "bind", NULL, MS_BIND|MS_RDONLY|MS_REMOUNT, true }, /* Then, make it r/o */
160 { "/sys", "/sys", "bind", NULL, MS_BIND, true }, /* Bind mount first */
161 { "/sys", "/sys", "bind", NULL, MS_BIND|MS_RDONLY|MS_REMOUNT, true }, /* Then, make it r/o */
162 { "tmpfs", "/dev", "tmpfs", "mode=755", MS_NOSUID|MS_STRICTATIME, true },
163 { "/dev/pts", "/dev/pts", "bind", NULL, MS_BIND, true },
164 { "tmpfs", "/run", "tmpfs", "mode=755", MS_NOSUID|MS_NODEV|MS_STRICTATIME, true },
166 { "/sys/fs/selinux", "/sys/fs/selinux", "bind", NULL, MS_BIND, false }, /* Bind mount first */
167 { "/sys/fs/selinux", "/sys/fs/selinux", "bind", NULL, MS_BIND|MS_RDONLY|MS_REMOUNT, false }, /* Then, make it r/o */
175 for (k = 0; k < ELEMENTSOF(mount_table); k++) {
/* Target path inside the container: <dest>/<where>. */
178 if (asprintf(&where, "%s/%s", dest, mount_table[k].where) < 0) {
179 log_error("Out of memory");
/* A negative result is an error code and is reported via strerror(-t).
 * What happens when the path already is a mount point is not visible here. */
187 t = path_is_mount_point(where, false);
189 log_error("Failed to detect whether %s is a mount point: %s", where, strerror(-t));
/* The return value of mkdir_p() is ignored. */
198 mkdir_p(where, 0755);
/* A failing mount() is only reported for entries whose fatal flag is set. */
200 if (mount(mount_table[k].what,
203 mount_table[k].flags,
204 mount_table[k].options) < 0 &&
205 mount_table[k].fatal) {
207 log_error("mount(%s) failed: %m", where);
/* Makes the host's time zone files visible inside the container.
 *
 * dest: container root directory.
 * Both /etc/localtime and /etc/timezone are bind mounted onto the same path
 * below dest, and the mount is remounted read-only only if the bind mount
 * succeeded. A failing mount() is not reported; the only error visible here is
 * a failed path allocation.
 * NOTE(review): "where" is reused for the second path; whether the first
 * string is freed in between is not visible in this excerpt. */
219 static int setup_timezone(const char *dest) {
224 /* Fix the timezone, if possible */
225 if (asprintf(&where, "%s/etc/localtime", dest) < 0) {
226 log_error("Out of memory");
/* The result of the read-only remount is ignored. */
230 if (mount("/etc/localtime", where, "bind", MS_BIND, NULL) >= 0)
231 mount("/etc/localtime", where, "bind", MS_BIND|MS_REMOUNT|MS_RDONLY, NULL);
235 if (asprintf(&where, "%s/etc/timezone", dest) < 0) {
236 log_error("Out of memory");
240 if (mount("/etc/timezone", where, "bind", MS_BIND, NULL) >= 0)
241 mount("/etc/timezone", where, "bind", MS_BIND|MS_REMOUNT|MS_RDONLY, NULL);
/* Recreates a fixed set of host device nodes below <dest>/dev.
 *
 * dest: container root directory.
 * For every name in the NUL-separated list "devnodes" the host node
 * /dev/<name> is examined with stat() and, if it is a character or block
 * device, created at <dest>/dev/<name> with the same mode and device number.
 * NOTE(review): the list contents, the cleanup code and the return statements
 * are not visible in this excerpt. */
248 static int copy_devnodes(const char *dest) {
250 static const char devnodes[] =
268 NULSTR_FOREACH(d, devnodes) {
270 char *from = NULL, *to = NULL;
/* NOTE(review): the asprintf() return values are ignored. The message below
 * suggests a NULL check on from/to follows, but glibc documents the pointer
 * as undefined after a failed asprintf() - consider checking the return
 * values instead. */
272 asprintf(&from, "/dev/%s", d);
273 asprintf(&to, "%s/dev/%s", dest, d);
276 log_error("Failed to allocate devnode path");
/* A stat() failure is only logged when errno is not ENOENT. */
289 if (stat(from, &st) < 0) {
291 if (errno != ENOENT) {
292 log_error("Failed to stat %s: %m", from);
297 } else if (!S_ISCHR(st.st_mode) && !S_ISBLK(st.st_mode)) {
299 log_error("%s is not a char or block device, cannot copy.", from);
303 } else if (mknod(to, st.st_mode, st.st_rdev) < 0) {
/* NOTE(review): the message prints dest, but the path passed to mknod() is
 * "to" - the log line probably should name "to". */
305 log_error("mknod(%s) failed: %m", dest);
/* Provides <dest>/dev/console inside the container, backed by the given tty.
 *
 * dest:    container root directory.
 * console: path of the tty to expose (main() passes the ptsname() of the pty
 *          master).
 * Visible steps: stat() the tty and require a character device, set its mode
 * and ownership with chmod_and_chown(console, 0600, 0, 0), create a node at
 * <dest>/dev/console with the tty's file type and device number and mode 0600,
 * then bind mount the tty over that node.
 * NOTE(review): the block comment further down has lost its closing line in
 * this excerpt, and the cleanup and return statements are not visible. */
319 static int setup_dev_console(const char *dest, const char *console) {
330 if (stat(console, &st) < 0) {
331 log_error("Failed to stat %s: %m", console);
335 } else if (!S_ISCHR(st.st_mode)) {
336 log_error("/dev/console is not a char device.");
/* chmod_and_chown() reports failure as a negative errno-style value, which is
 * why the message uses strerror(-r). */
341 r = chmod_and_chown(console, 0600, 0, 0);
343 log_error("Failed to correct access mode for TTY: %s", strerror(-r));
347 if (asprintf(&to, "%s/dev/console", dest) < 0) {
348 log_error("Out of memory");
353 /* We need to bind mount the right tty to /dev/console since
354 * ptys can only exist on pts file systems. To have something
355 * to bind mount things on we create a device node first, that
356 * has the right major/minor (note that the major minor
357 * doesn't actually matter here, since we mount it over
360 if (mknod(to, (st.st_mode & ~07777) | 0600, st.st_rdev) < 0) {
361 log_error("mknod() for /dev/console failed: %m");
366 if (mount(console, to, "bind", MS_BIND, NULL) < 0) {
367 log_error("Bind mount for /dev/console failed: %m");
/* Puts a FIFO in place of the container's /proc/kmsg.
 *
 * dest:        container root directory.
 * kmsg_socket: socket that receives a descriptor for the FIFO; must be >= 0.
 * Visible steps: create the FIFO <dest>/dev/kmsg, fix its mode and ownership,
 * bind mount it onto <dest>/proc/kmsg, open it, and pass the open descriptor
 * through kmsg_socket as an SCM_RIGHTS control message.
 * NOTE(review): some declarations (the "control" object, "mh", "fd", "k",
 * "r"), the cleanup code and the return statements are not visible in this
 * excerpt. */
379 static int setup_kmsg(const char *dest, int kmsg_socket) {
380 char *from = NULL, *to = NULL;
/* Presumably the two members of the "control" union used below: a cmsghdr for
 * alignment and a buffer sized for one int of ancillary data - TODO confirm. */
384 struct cmsghdr cmsghdr;
385 uint8_t buf[CMSG_SPACE(sizeof(int))];
388 struct cmsghdr *cmsg;
391 assert(kmsg_socket >= 0);
/* "from" is the FIFO itself, "to" is the place it gets mounted on. */
395 if (asprintf(&from, "%s/dev/kmsg", dest) < 0) {
396 log_error("Out of memory");
401 if (asprintf(&to, "%s/proc/kmsg", dest) < 0) {
402 log_error("Out of memory");
407 if (mkfifo(from, 0600) < 0) {
408 log_error("mkfifo() for /dev/kmsg failed: %m");
413 r = chmod_and_chown(from, 0600, 0, 0);
415 log_error("Failed to correct access mode for /dev/kmsg: %s", strerror(-r));
419 if (mount(from, to, "bind", MS_BIND, NULL) < 0) {
420 log_error("Bind mount for /proc/kmsg failed: %m");
/* O_RDWR together with O_NDELAY: the open does not wait for a peer on the
 * other end of the FIFO. */
425 fd = open(from, O_RDWR|O_NDELAY|O_CLOEXEC);
427 log_error("Failed to open fifo: %m");
/* Build a single SCM_RIGHTS control message that carries fd. */
435 mh.msg_control = &control;
436 mh.msg_controllen = sizeof(control);
438 cmsg = CMSG_FIRSTHDR(&mh);
439 cmsg->cmsg_level = SOL_SOCKET;
440 cmsg->cmsg_type = SCM_RIGHTS;
441 cmsg->cmsg_len = CMSG_LEN(sizeof(int));
442 memcpy(CMSG_DATA(cmsg), &fd, sizeof(int));
/* Shrink the control length to the space actually used by the message. */
444 mh.msg_controllen = cmsg->cmsg_len;
446 /* Store away the fd in the socket, so that it stays open as
447 * long as we run the child */
448 k = sendmsg(kmsg_socket, &mh, MSG_DONTWAIT|MSG_NOSIGNAL);
/* The local descriptor is closed regardless of the sendmsg() result. */
449 close_nointr_nofail(fd);
452 log_error("Failed to send FIFO fd: %m");
/* Drops capabilities from the bounding set of the calling process.
 *
 * Walks every capability number from 0 up to and including cap_last_cap().
 * Each one is looked up in the static "retain" list and, if the lookup does
 * not apply, removed with prctl(PR_CAPBSET_DROP).
 * NOTE(review): most of the retain list, the comparison inside the inner loop,
 * the statement guarded by "i < ELEMENTSOF(retain)" and the return statements
 * are not visible in this excerpt. */
465 static int drop_capabilities(void) {
466 static const unsigned long retain[] = {
476 CAP_NET_BIND_SERVICE,
492 for (l = 0; l <= cap_last_cap(); l++) {
/* Linear search of the retain list for capability l. */
495 for (i = 0; i < ELEMENTSOF(retain); i++)
/* i stopped before the end of the list: presumably l was found there and is
 * kept - TODO confirm against the missing lines. */
499 if (i < ELEMENTSOF(retain))
502 if (prctl(PR_CAPBSET_DROP, l) < 0) {
503 log_error("PR_CAPBSET_DROP failed: %m");
/* Heuristic check whether "path" is the root of an operating system tree.
 *
 * path: directory to test.
 * Returns 1 when the (not visible) check on <path>/bin/sh leaves r >= 0 and 0
 * when r is negative. The value returned when the path cannot be allocated is
 * not visible in this excerpt; main() treats any result <= 0 as "not an OS
 * tree". */
511 static int is_os_tree(const char *path) {
514 /* We use /bin/sh as flag file if something is an OS */
516 if (asprintf(&p, "%s/bin/sh", path) < 0)
522 return r < 0 ? 0 : 1;
/* Relays data between the caller's terminal and the pty master.
 *
 * master: pty master descriptor.
 * mask:   signal set handed to signalfd(); main() blocks these signals before
 *         calling.
 * Bytes read from stdin are buffered in in_buffer and written to master; bytes
 * read from master are buffered in out_buffer and written to stdout. A
 * SIGWINCH received through the signalfd copies the window size of stdin to
 * master.
 * NOTE(review): the loop header, the handling of end-of-file and of the other
 * signals, the cleanup label and the return statements are not visible in this
 * excerpt. */
525 static int process_pty(int master, sigset_t *mask) {
527 char in_buffer[LINE_MAX], out_buffer[LINE_MAX];
528 size_t in_buffer_full = 0, out_buffer_full = 0;
529 struct epoll_event stdin_ev, stdout_ev, master_ev, signal_ev;
530 bool stdin_readable = false, stdout_writable = false, master_readable = false, master_writable = false;
531 int ep = -1, signal_fd = -1, r;
/* Presumably switches the descriptors to non-blocking mode; the return values
 * are ignored. */
533 fd_nonblock(STDIN_FILENO, 1);
534 fd_nonblock(STDOUT_FILENO, 1);
535 fd_nonblock(master, 1);
537 if ((signal_fd = signalfd(-1, mask, SFD_NONBLOCK|SFD_CLOEXEC)) < 0) {
538 log_error("signalfd(): %m");
543 if ((ep = epoll_create1(EPOLL_CLOEXEC)) < 0) {
544 log_error("Failed to create epoll: %m");
/* stdin, stdout and master are registered edge-triggered (EPOLLET), so each
 * readiness event is latched in the *_readable / *_writable flags and consumed
 * by the transfer loop below. The signalfd is level-triggered. */
550 stdin_ev.events = EPOLLIN|EPOLLET;
551 stdin_ev.data.fd = STDIN_FILENO;
554 stdout_ev.events = EPOLLOUT|EPOLLET;
555 stdout_ev.data.fd = STDOUT_FILENO;
558 master_ev.events = EPOLLIN|EPOLLOUT|EPOLLET;
559 master_ev.data.fd = master;
562 signal_ev.events = EPOLLIN;
563 signal_ev.data.fd = signal_fd;
565 if (epoll_ctl(ep, EPOLL_CTL_ADD, STDIN_FILENO, &stdin_ev) < 0 ||
566 epoll_ctl(ep, EPOLL_CTL_ADD, STDOUT_FILENO, &stdout_ev) < 0 ||
567 epoll_ctl(ep, EPOLL_CTL_ADD, master, &master_ev) < 0 ||
568 epoll_ctl(ep, EPOLL_CTL_ADD, signal_fd, &signal_ev) < 0) {
/* NOTE(review): "regiser" in the message below is a typo for "register". */
569 log_error("Failed to regiser fds in epoll: %m");
575 struct epoll_event ev[16];
579 if ((nfds = epoll_wait(ep, ev, ELEMENTSOF(ev), -1)) < 0) {
/* EINTR and EAGAIN are singled out here, presumably to retry the wait. */
581 if (errno == EINTR || errno == EAGAIN)
584 log_error("epoll_wait(): %m");
/* Record which descriptors became ready; EPOLLHUP sets the flag as well. */
591 for (i = 0; i < nfds; i++) {
592 if (ev[i].data.fd == STDIN_FILENO) {
594 if (ev[i].events & (EPOLLIN|EPOLLHUP))
595 stdin_readable = true;
597 } else if (ev[i].data.fd == STDOUT_FILENO) {
599 if (ev[i].events & (EPOLLOUT|EPOLLHUP))
600 stdout_writable = true;
602 } else if (ev[i].data.fd == master) {
604 if (ev[i].events & (EPOLLIN|EPOLLHUP))
605 master_readable = true;
607 if (ev[i].events & (EPOLLOUT|EPOLLHUP))
608 master_writable = true;
610 } else if (ev[i].data.fd == signal_fd) {
611 struct signalfd_siginfo sfsi;
/* Anything other than one complete signalfd_siginfo record is a failure,
 * except that EINTR and EAGAIN are tolerated. */
614 if ((n = read(signal_fd, &sfsi, sizeof(sfsi))) != sizeof(sfsi)) {
617 log_error("Failed to read from signalfd: invalid block size");
622 if (errno != EINTR && errno != EAGAIN) {
623 log_error("Failed to read from signalfd: %m");
629 if (sfsi.ssi_signo == SIGWINCH) {
632 /* The window size changed, let's forward that. */
633 if (ioctl(STDIN_FILENO, TIOCGWINSZ, &ws) >= 0)
634 ioctl(master, TIOCSWINSZ, &ws);
/* Keep transferring while either direction can make progress: a readable
 * source with an empty buffer, or buffered data with a writable sink. */
643 while ((stdin_readable && in_buffer_full <= 0) ||
644 (master_writable && in_buffer_full > 0) ||
645 (master_readable && out_buffer_full <= 0) ||
646 (stdout_writable && out_buffer_full > 0)) {
/* stdin -> in_buffer */
648 if (stdin_readable && in_buffer_full < LINE_MAX) {
650 if ((k = read(STDIN_FILENO, in_buffer + in_buffer_full, LINE_MAX - in_buffer_full)) < 0) {
/* These errno values only clear the readiness flag; presumably anything else
 * reaches the log_error() below. */
652 if (errno == EAGAIN || errno == EPIPE || errno == ECONNRESET || errno == EIO)
653 stdin_readable = false;
655 log_error("read(): %m");
660 in_buffer_full += (size_t) k;
/* in_buffer -> master */
663 if (master_writable && in_buffer_full > 0) {
665 if ((k = write(master, in_buffer, in_buffer_full)) < 0) {
667 if (errno == EAGAIN || errno == EPIPE || errno == ECONNRESET || errno == EIO)
668 master_writable = false;
670 log_error("write(): %m");
/* Discard the k bytes just written by moving the remainder to the front.
 * NOTE(review): the matching "in_buffer_full -= k" is not visible in this
 * excerpt, while the out_buffer path below has it - confirm it exists. */
676 assert(in_buffer_full >= (size_t) k);
677 memmove(in_buffer, in_buffer + k, in_buffer_full - k);
/* master -> out_buffer */
682 if (master_readable && out_buffer_full < LINE_MAX) {
684 if ((k = read(master, out_buffer + out_buffer_full, LINE_MAX - out_buffer_full)) < 0) {
686 if (errno == EAGAIN || errno == EPIPE || errno == ECONNRESET || errno == EIO)
687 master_readable = false;
689 log_error("read(): %m");
694 out_buffer_full += (size_t) k;
/* out_buffer -> stdout */
697 if (stdout_writable && out_buffer_full > 0) {
699 if ((k = write(STDOUT_FILENO, out_buffer, out_buffer_full)) < 0) {
701 if (errno == EAGAIN || errno == EPIPE || errno == ECONNRESET || errno == EIO)
702 stdout_writable = false;
704 log_error("write(): %m");
710 assert(out_buffer_full >= (size_t) k);
711 memmove(out_buffer, out_buffer + k, out_buffer_full - k);
712 out_buffer_full -= k;
/* Release the epoll instance and the signalfd. */
720 close_nointr_nofail(ep);
723 close_nointr_nofail(signal_fd);
/* Entry point: validates the environment, prepares a cgroup, a pty and the
 * terminal, clones a child into new namespaces that sets up and enters the
 * container, and relays the terminal in the parent until cleanup.
 * NOTE(review): many lines (goto/return statements, some declarations, labels,
 * the pid checks after clone) are not visible in this excerpt; the comments
 * describe only the visible statements. */
728 int main(int argc, char *argv[]) {
730 int r = EXIT_FAILURE, k;
731 char *oldcg = NULL, *newcg = NULL;
732 char **controller = NULL;
734 const char *console = NULL;
735 struct termios saved_attr, raw_attr;
/* True once saved_attr holds attributes that must be restored on exit. */
737 bool saved_attr_valid = false;
739 int kmsg_socket_pair[2] = { -1, -1 };
741 log_parse_environment();
/* A parse_argv() result <= 0 takes the (not visible) early-exit branch. */
744 if ((r = parse_argv(argc, argv)) <= 0)
/* Determine the container directory: either the given one made absolute or
 * the current directory. The selecting condition is not visible here. */
750 p = path_make_absolute_cwd(arg_directory);
754 arg_directory = get_current_dir_name();
756 if (!arg_directory) {
757 log_error("Failed to determine path");
761 path_kill_slashes(arg_directory);
/* Preconditions: effective uid 0, sd_booted() > 0, directory is not "/" and
 * passes is_os_tree(). */
763 if (geteuid() != 0) {
764 log_error("Need to be root.");
768 if (sd_booted() <= 0) {
769 log_error("Not running on a systemd system.");
773 if (path_equal(arg_directory, "/")) {
774 log_error("Spawning container on root directory not supported.");
778 if (is_os_tree(arg_directory) <= 0) {
779 log_error("Directory %s doesn't look like an OS root directory. Refusing.", arg_directory);
/* Create the child cgroup "<current cgroup>/nspawn-<pid>" and move this
 * process into it. */
783 if ((k = cg_get_by_pid(SYSTEMD_CGROUP_CONTROLLER, 0, &oldcg)) < 0) {
784 log_error("Failed to determine current cgroup: %s", strerror(-k));
788 if (asprintf(&newcg, "%s/nspawn-%lu", oldcg, (unsigned long) getpid()) < 0) {
789 log_error("Failed to allocate cgroup path.");
793 k = cg_create_and_attach(SYSTEMD_CGROUP_CONTROLLER, newcg, 0);
795 log_error("Failed to create cgroup: %s", strerror(-k));
/* Same cgroup path in every controller requested with -C; a failure here is
 * only logged as a warning. */
799 STRV_FOREACH(controller,arg_controllers) {
800 k = cg_create_and_attach(*controller, newcg, 0);
802 log_warning("Failed to create cgroup in controller %s: %s", *controller, strerror(-k));
/* Allocate the pty; "console" is the name of its slave side. */
805 if ((master = posix_openpt(O_RDWR|O_NOCTTY|O_CLOEXEC|O_NDELAY)) < 0) {
806 log_error("Failed to acquire pseudo tty: %m");
810 if (!(console = ptsname(master))) {
811 log_error("Failed to determine tty name: %m");
815 log_info("Spawning namespace container on %s (console is %s).", arg_directory, console);
/* Copy the current window size of stdin to the pty, if it can be read. */
817 if (ioctl(STDIN_FILENO, TIOCGWINSZ, &ws) >= 0)
818 ioctl(master, TIOCSWINSZ, &ws);
820 if (unlockpt(master) < 0) {
821 log_error("Failed to unlock tty: %m");
/* Save the terminal attributes, then switch stdin to raw mode without echo. */
825 if (tcgetattr(STDIN_FILENO, &saved_attr) < 0) {
826 log_error("Failed to get terminal attributes: %m");
830 saved_attr_valid = true;
832 raw_attr = saved_attr;
833 cfmakeraw(&raw_attr);
834 raw_attr.c_lflag &= ~ECHO;
836 if (tcsetattr(STDIN_FILENO, TCSANOW, &raw_attr) < 0) {
837 log_error("Failed to set terminal attributes: %m");
/* Socket pair through which the child hands over the kmsg FIFO descriptor
 * (kmsg_socket_pair[1] is passed to setup_kmsg() below). */
841 if (socketpair(AF_UNIX, SOCK_DGRAM|SOCK_NONBLOCK|SOCK_CLOEXEC, 0, kmsg_socket_pair) < 0) {
842 log_error("Failed to create kmsg socket pair");
/* Block these signals; the same mask is later given to process_pty(), which
 * reads them through a signalfd. */
846 assert_se(sigemptyset(&mask) == 0);
847 sigset_add_many(&mask, SIGCHLD, SIGWINCH, SIGTERM, SIGINT, -1);
848 assert_se(sigprocmask(SIG_BLOCK, &mask, NULL) == 0);
/* Raw clone() with new IPC, mount, PID and UTS namespaces; a new network
 * namespace is added only when --private-network was given. */
850 pid = syscall(__NR_clone, SIGCHLD|CLONE_NEWIPC|CLONE_NEWNS|CLONE_NEWPID|CLONE_NEWUTS|(arg_private_network ? CLONE_NEWNET : 0), NULL);
853 log_error("clone() failed, do you have namespace support enabled in your kernel? (You need UTS, IPC, PID and NET namespacing built in): %m");
855 log_error("clone() failed: %m");
/* --- Child side (the branch condition is not visible in this excerpt). --- */
864 const char *home = NULL;
865 uid_t uid = (uid_t) -1;
866 gid_t gid = (gid_t) -1;
/* Environment for the container's command; slots 2 to 5 are filled in below. */
867 const char *envp[] = {
868 "PATH=/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin",
869 "container=systemd-nspawn", /* LXC sets container=lxc, so follow the scheme here */
/* Take over the caller's TERM= entry, if there is one. */
877 envp[2] = strv_find_prefix(environ, "TERM=");
/* Close the pty master and the standard descriptors; close_all_fds() is given
 * kmsg_socket_pair[1] as the one descriptor to spare. */
879 close_nointr_nofail(master);
881 close_nointr(STDIN_FILENO);
882 close_nointr(STDOUT_FILENO);
883 close_nointr(STDERR_FILENO);
885 close_all_fds(&kmsg_socket_pair[1], 1);
887 reset_all_signal_handlers();
/* Unblock all signals again for the child. */
889 assert_se(sigemptyset(&mask) == 0);
890 assert_se(sigprocmask(SIG_SETMASK, &mask, NULL) == 0);
/* Ask the kernel to send SIGKILL to the child when its parent dies. */
895 if (prctl(PR_SET_PDEATHSIG, SIGKILL) < 0)
898 /* Mark / as private, in case somebody marked it shared */
899 if (mount(NULL, "/", NULL, MS_PRIVATE|MS_REC, NULL) < 0)
/* Populate the container tree: API mounts, device nodes, console, kmsg FIFO
 * and time zone files. */
902 if (mount_all(arg_directory) < 0)
905 if (copy_devnodes(arg_directory) < 0)
908 if (setup_dev_console(arg_directory, console) < 0)
911 if (setup_kmsg(arg_directory, kmsg_socket_pair[1]) < 0)
914 close_nointr_nofail(kmsg_socket_pair[1]);
916 if (setup_timezone(arg_directory) < 0)
919 if (chdir(arg_directory) < 0) {
920 log_error("chdir(%s) failed: %m", arg_directory);
/* Descriptors 0 to 2 were closed above, so the console opened here (path
 * relative to the container directory) is expected to become stdin; it is
 * then duplicated onto stdout and stderr. */
924 if (open_terminal("dev/console", O_RDWR) != STDIN_FILENO ||
925 dup2(STDIN_FILENO, STDOUT_FILENO) != STDOUT_FILENO ||
926 dup2(STDIN_FILENO, STDERR_FILENO) != STDERR_FILENO)
/* Move the container directory onto "/", then chroot into the current
 * directory and reset the working directory. */
929 if (mount(arg_directory, "/", "bind", MS_BIND|MS_MOVE, NULL) < 0) {
930 log_error("mount(MS_MOVE) failed: %m");
934 if (chroot(".") < 0) {
935 log_error("chroot() failed: %m");
939 if (chdir("/") < 0) {
940 log_error("chdir() failed: %m");
948 if (drop_capabilities() < 0)
/* User switch; presumably guarded by a check of arg_user that is not visible
 * in this excerpt.
 * NOTE(review): the get_user_creds() failure is logged with %m, whereas other
 * helpers in this file return a negative error code reported via strerror() -
 * confirm that errno is meaningful here. */
953 if (get_user_creds((const char**)&arg_user, &uid, &gid, &home) < 0) {
954 log_error("get_user_creds() failed: %m");
958 if (mkdir_parents(home, 0775) < 0) {
959 log_error("mkdir_parents() failed: %m");
963 if (safe_mkdir(home, 0775, uid, gid) < 0) {
964 log_error("safe_mkdir() failed: %m");
968 if (initgroups((const char*)arg_user, gid) < 0) {
969 log_error("initgroups() failed: %m");
/* NOTE(review): the two messages below name setregid()/setreuid(), but the
 * calls are setresgid()/setresuid(). */
973 if (setresgid(gid, gid, gid) < 0) {
974 log_error("setregid() failed: %m");
978 if (setresuid(uid, uid, uid) < 0) {
979 log_error("setreuid() failed: %m");
/* Fill envp[3..5]; "/root" and "root" are the fallbacks when no home or user
 * is known. */
984 if ((asprintf((char**)(envp + 3), "HOME=%s", home? home: "/root") < 0) ||
985 (asprintf((char**)(envp + 4), "USER=%s", arg_user? arg_user : "root") < 0) ||
986 (asprintf((char**)(envp + 5), "LOGNAME=%s", arg_user? arg_user : "root") < 0)) {
987 log_error("Out of memory");
/* Host name taken from arg_directory via file_name_from_path() (presumably
 * its last path component); the sethostname() result is ignored. */
991 if ((hn = file_name_from_path(arg_directory)))
992 sethostname(hn, strlen(hn));
/* Run the command given on the command line, or a login bash from the home
 * directory; the condition choosing between the two is not visible here. */
995 execvpe(argv[optind], argv + optind, (char**) envp);
997 chdir(home ? home : "/root");
998 execle("/bin/bash", "-bash", NULL, (char**) envp);
/* Only reached when exec failed. */
1001 log_error("execv() failed: %m");
1004 _exit(EXIT_FAILURE);
/* --- Parent side: relay terminal I/O to and from the container's pty. --- */
1007 if (process_pty(master, &mask) < 0)
/* Restore the terminal before waiting, and clear the flag so that the cleanup
 * code below does not restore it a second time. */
1010 if (saved_attr_valid) {
1011 tcsetattr(STDIN_FILENO, TCSANOW, &saved_attr);
1012 saved_attr_valid = false;
1015 r = wait_for_terminate_and_warn(argc > optind ? argv[optind] : "bash", pid);
/* Cleanup (the label and the guarding conditions are not visible here):
 * restore the terminal, close descriptors, move back to the original cgroup,
 * kill whatever is left in the container cgroup and free the settings. */
1021 if (saved_attr_valid)
1022 tcsetattr(STDIN_FILENO, TCSANOW, &saved_attr);
1025 close_nointr_nofail(master);
1027 close_pipe(kmsg_socket_pair);
1030 cg_attach(SYSTEMD_CGROUP_CONTROLLER, oldcg, 0);
1033 cg_kill_recursive_and_wait(SYSTEMD_CGROUP_CONTROLLER, newcg, true);
1035 free(arg_directory);
1036 strv_free(arg_controllers);