1 /*-*- Mode: C; c-basic-offset: 8; indent-tabs-mode: nil -*-*/
4 This file is part of systemd.
6 Copyright 2010 Lennart Poettering
8 systemd is free software; you can redistribute it and/or modify it
9 under the terms of the GNU Lesser General Public License as published by
10 the Free Software Foundation; either version 2.1 of the License, or
11 (at your option) any later version.
13 systemd is distributed in the hope that it will be useful, but
14 WITHOUT ANY WARRANTY; without even the implied warranty of
15 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
16 Lesser General Public License for more details.
18 You should have received a copy of the GNU Lesser General Public License
19 along with systemd; If not, see <http://www.gnu.org/licenses/>.
25 #include <sys/types.h>
26 #include <sys/syscall.h>
27 #include <sys/mount.h>
33 #include <sys/prctl.h>
34 #include <sys/capability.h>
36 #include <sys/epoll.h>
38 #include <sys/signalfd.h>
42 #include <sys/socket.h>
44 #include <systemd/sd-daemon.h>
51 #include "cgroup-util.h"
53 #include "loopback-setup.h"
/* Settings filled in by parse_argv() from the command line. */
/* Root directory of the container (-D/--directory); freed at the end of main(). */
55 static char *arg_directory = NULL;
/* User name or uid the command is run as (-u/--user). */
56 static char *arg_user = NULL;
/* String vector split from the comma-separated -C/--controllers list. */
57 static char **arg_controllers = NULL;
/* Set by --private-network; main() then adds CLONE_NEWNET to the clone flags. */
58 static bool arg_private_network = false;
/* Print the usage summary to stdout. The leading %s of the format string is
 * filled with program_invocation_short_name. */
60 static int help(void) {
62 printf("%s [OPTIONS...] [PATH] [ARGUMENTS...]\n\n"
63 "Spawn a minimal namespace container for debugging, testing and building.\n\n"
64 " -h --help Show this help\n"
65 " -D --directory=NAME Root directory for the container\n"
66 " -u --user=USER Run the command under specified user or uid\n"
67 " -C --controllers=LIST Put the container in specified comma-separated cgroup hierarchies\n"
68 " --private-network Disable network in container\n",
69 program_invocation_short_name);
/* Parse the command line with getopt_long() and store the results in the
 * arg_* globals. main() checks the result for a value <= 0. */
74 static int parse_argv(int argc, char *argv[]) {
/* Option code for the long-only --private-network switch; 0x100 lies outside
 * the range of the single-character codes used by the other options. */
77 ARG_PRIVATE_NETWORK = 0x100
80 static const struct option options[] = {
81 { "help", no_argument, NULL, 'h' },
82 { "directory", required_argument, NULL, 'D' },
83 { "user", required_argument, NULL, 'u' },
84 { "controllers", required_argument, NULL, 'C' },
85 { "private-network", no_argument, NULL, ARG_PRIVATE_NETWORK },
/* The leading + in the option string makes getopt stop at the first
 * non-option argument, so switches meant for the spawned command are left
 * alone. */
94 while ((c = getopt_long(argc, argv, "+hD:u:C:", options, NULL)) >= 0) {
/* Keep a private copy of the directory argument. */
104 if (!(arg_directory = strdup(optarg))) {
105 log_error("Failed to duplicate root directory.");
/* Keep a private copy of the user argument. */
113 if (!(arg_user = strdup(optarg))) {
114 log_error("Failed to duplicate user name.");
/* Release any list stored earlier before splitting the new one on commas. */
121 strv_free(arg_controllers);
122 arg_controllers = strv_split(optarg, ",");
123 if (!arg_controllers) {
124 log_error("Failed to split controllers list.");
/* Presumably removes duplicate controller names - confirm against strv_uniq(). */
127 strv_uniq(arg_controllers);
131 case ARG_PRIVATE_NETWORK:
132 arg_private_network = true;
139 log_error("Unknown option code %c", c);
/* Mount every entry of mount_table below the container root dest. */
147 static int mount_all(const char *dest) {
149 typedef struct MountPoint {
/* Each row lists: what to mount, where (relative to dest), a type string,
 * mount options, mount flags, and whether a failure is fatal. Read-only bind
 * mounts take two rows: the bind itself, then a remount with MS_RDONLY. */
158 static const MountPoint mount_table[] = {
159 { "proc", "/proc", "proc", NULL, MS_NOSUID|MS_NOEXEC|MS_NODEV, true },
160 { "/proc/sys", "/proc/sys", "bind", NULL, MS_BIND, true }, /* Bind mount first */
161 { "/proc/sys", "/proc/sys", "bind", NULL, MS_BIND|MS_RDONLY|MS_REMOUNT, true }, /* Then, make it r/o */
162 { "/sys", "/sys", "bind", NULL, MS_BIND, true }, /* Bind mount first */
163 { "/sys", "/sys", "bind", NULL, MS_BIND|MS_RDONLY|MS_REMOUNT, true }, /* Then, make it r/o */
164 { "tmpfs", "/dev", "tmpfs", "mode=755", MS_NOSUID|MS_STRICTATIME, true },
165 { "/dev/pts", "/dev/pts", "bind", NULL, MS_BIND, true },
166 { "tmpfs", "/run", "tmpfs", "mode=755", MS_NOSUID|MS_NODEV|MS_STRICTATIME, true },
168 { "/sys/fs/selinux", "/sys/fs/selinux", "bind", NULL, MS_BIND, false }, /* Bind mount first */
169 { "/sys/fs/selinux", "/sys/fs/selinux", "bind", NULL, MS_BIND|MS_RDONLY|MS_REMOUNT, false }, /* Then, make it r/o */
177 for (k = 0; k < ELEMENTSOF(mount_table); k++) {
/* Build the absolute target path. The table entries already start with a
 * slash, so the result contains a doubled slash after dest. */
180 if (asprintf(&where, "%s/%s", dest, mount_table[k].where) < 0) {
181 log_error("Out of memory");
/* t is passed to strerror(-t) below, i.e. failures are negative errno-style
 * values. */
189 t = path_is_mount_point(where, false);
191 log_error("Failed to detect whether %s is a mount point: %s", where, strerror(-t));
/* Create the mount point; the result is not checked. */
200 mkdir_p(where, 0755);
/* A failing mount() is reported only when the entry is marked fatal;
 * failures of non-fatal entries do not reach the error branch. */
202 if (mount(mount_table[k].what,
205 mount_table[k].flags,
206 mount_table[k].options) < 0 &&
207 mount_table[k].fatal) {
209 log_error("mount(%s) failed: %m", where);
/* Bind-mount the host files /etc/localtime and /etc/timezone onto the same
 * paths below dest and then remount each of them read-only. In the visible
 * code a failed bind mount merely skips the read-only remount, and the result
 * of the remount itself is not checked. */
221 static int setup_timezone(const char *dest) {
226 /* Fix the timezone, if possible */
227 if (asprintf(&where, "%s/etc/localtime", dest) < 0) {
228 log_error("Out of memory");
/* Two steps: bind first, then remount the bind with MS_RDONLY. */
232 if (mount("/etc/localtime", where, "bind", MS_BIND, NULL) >= 0)
233 mount("/etc/localtime", where, "bind", MS_BIND|MS_REMOUNT|MS_RDONLY, NULL);
/* Same procedure for /etc/timezone. */
237 if (asprintf(&where, "%s/etc/timezone", dest) < 0) {
238 log_error("Out of memory");
242 if (mount("/etc/timezone", where, "bind", MS_BIND, NULL) >= 0)
243 mount("/etc/timezone", where, "bind", MS_BIND|MS_REMOUNT|MS_RDONLY, NULL);
/* Recreate a fixed list of host device nodes below dest/dev. For every name
 * in devnodes the host node /dev/NAME is examined with stat() and, if it is a
 * character or block device, a node with the same mode and device number is
 * created at dest/dev/NAME with mknod(). */
250 static int copy_devnodes(const char *dest) {
/* NUL-separated list of device names, walked with NULSTR_FOREACH below. */
252 static const char devnodes[] =
270 NULSTR_FOREACH(d, devnodes) {
272 char *from = NULL, *to = NULL;
/* NOTE(review): the asprintf() results are not checked on these lines;
 * confirm that the surrounding code validates from and to before use. */
274 asprintf(&from, "/dev/%s", d);
275 asprintf(&to, "%s/dev/%s", dest, d);
278 log_error("Failed to allocate devnode path");
291 if (stat(from, &st) < 0) {
/* A device that does not exist on the host (ENOENT) is not reported. */
293 if (errno != ENOENT) {
294 log_error("Failed to stat %s: %m", from);
299 } else if (!S_ISCHR(st.st_mode) && !S_ISBLK(st.st_mode)) {
301 log_error("%s is not a char or block device, cannot copy.", from);
305 } else if (mknod(to, st.st_mode, st.st_rdev) < 0) {
/* Report the node path that mknod() was asked to create, not the
 * container root. */
307 log_error("mknod(%s) failed: %m", to);
/* Provide dest/dev/console inside the container.
 *
 * Steps visible in this function: check with stat() that console is a
 * character device, set its mode to 0600 and its owner to uid 0 / gid 0,
 * create a device node at dest/dev/console with the same file type and
 * device number but mode 0600, and finally bind-mount console over that
 * node.
 *
 * NOTE(review): the explanatory block comment further down is not closed
 * within the lines shown here - confirm its terminator against the full
 * file. */
321 static int setup_dev_console(const char *dest, const char *console) {
332 if (stat(console, &st) < 0) {
333 log_error("Failed to stat %s: %m", console);
337 } else if (!S_ISCHR(st.st_mode)) {
338 log_error("/dev/console is not a char device.");
/* r is passed to strerror(-r) below, i.e. failures are negative errno-style
 * values. */
343 r = chmod_and_chown(console, 0600, 0, 0);
345 log_error("Failed to correct access mode for TTY: %s", strerror(-r));
349 if (asprintf(&to, "%s/dev/console", dest) < 0) {
350 log_error("Out of memory");
355 /* We need to bind mount the right tty to /dev/console since
356 * ptys can only exist on pts file systems. To have something
357 * to bind mount things on we create a device node first, that
358 * has the right major/minor (note that the major minor
359 * doesn't actually matter here, since we mount it over
362 if (mknod(to, (st.st_mode & ~07777) | 0600, st.st_rdev) < 0) {
363 log_error("mknod() for /dev/console failed: %m");
368 if (mount(console, to, "bind", MS_BIND, NULL) < 0) {
369 log_error("Bind mount for /dev/console failed: %m");
/* Create a FIFO at dest/dev/kmsg, bind-mount it over dest/proc/kmsg, open it
 * and send the open file descriptor over kmsg_socket as SCM_RIGHTS ancillary
 * data. kmsg_socket must be a valid descriptor (asserted). */
381 static int setup_kmsg(const char *dest, int kmsg_socket) {
382 char *from = NULL, *to = NULL;
/* Control-message storage sized for exactly one int, i.e. one fd. */
386 struct cmsghdr cmsghdr;
387 uint8_t buf[CMSG_SPACE(sizeof(int))];
390 struct cmsghdr *cmsg;
393 assert(kmsg_socket >= 0);
397 if (asprintf(&from, "%s/dev/kmsg", dest) < 0) {
398 log_error("Out of memory");
403 if (asprintf(&to, "%s/proc/kmsg", dest) < 0) {
404 log_error("Out of memory");
409 if (mkfifo(from, 0600) < 0) {
410 log_error("mkfifo() for /dev/kmsg failed: %m");
/* Force mode 0600 and owner 0:0 on the FIFO; failures come back as negative
 * errno-style values (see strerror(-r)). */
415 r = chmod_and_chown(from, 0600, 0, 0);
417 log_error("Failed to correct access mode for /dev/kmsg: %s", strerror(-r));
421 if (mount(from, to, "bind", MS_BIND, NULL) < 0) {
422 log_error("Bind mount for /proc/kmsg failed: %m");
/* O_NDELAY keeps the open of the FIFO from blocking. */
427 fd = open(from, O_RDWR|O_NDELAY|O_CLOEXEC);
429 log_error("Failed to open fifo: %m");
/* Build a message whose only content is one SCM_RIGHTS control message
 * carrying fd. */
437 mh.msg_control = &control;
438 mh.msg_controllen = sizeof(control);
440 cmsg = CMSG_FIRSTHDR(&mh);
441 cmsg->cmsg_level = SOL_SOCKET;
442 cmsg->cmsg_type = SCM_RIGHTS;
443 cmsg->cmsg_len = CMSG_LEN(sizeof(int));
444 memcpy(CMSG_DATA(cmsg), &fd, sizeof(int));
/* Shrink the control length to what the single message actually uses. */
446 mh.msg_controllen = cmsg->cmsg_len;
448 /* Store away the fd in the socket, so that it stays open as
449 * long as we run the child */
450 k = sendmsg(kmsg_socket, &mh, MSG_DONTWAIT|MSG_NOSIGNAL);
/* The local descriptor is closed right after the send attempt. */
451 close_nointr_nofail(fd);
454 log_error("Failed to send FIFO fd: %m");
/* Walk all capability numbers from 0 up to and including cap_last_cap() and
 * remove from the capability bounding set (PR_CAPBSET_DROP) those that are
 * not kept by the retain list. */
467 static int drop_capabilities(void) {
/* Capabilities the container is allowed to keep. */
468 static const unsigned long retain[] = {
478 CAP_NET_BIND_SERVICE,
494 for (l = 0; l <= cap_last_cap(); l++) {
/* Scan retain for l. */
497 for (i = 0; i < ELEMENTSOF(retain); i++)
/* i still being in range means the scan stopped early - presumably because
 * l was found in retain (the loop body is not shown here; confirm). */
501 if (i < ELEMENTSOF(retain))
504 if (prctl(PR_CAPBSET_DROP, l) < 0) {
505 log_error("PR_CAPBSET_DROP failed: %m");
/* Heuristic test whether path holds an operating system tree, based on the
 * path/bin/sh flag file. Returns 1 when the check stored in r did not fail
 * and 0 when r is negative. main() refuses to start when the result is
 * <= 0. */
513 static int is_os_tree(const char *path) {
516 /* We use /bin/sh as flag file if something is an OS */
518 if (asprintf(&p, "%s/bin/sh", path) < 0)
/* Collapse the check result into a boolean. */
524 return r < 0 ? 0 : 1;
/* Relay data between our own stdin/stdout and the pty master.
 *
 * master: file descriptor of the pty master side.
 * mask:   signal set that is turned into a signalfd and polled together with
 *         the other descriptors.
 *
 * stdin, stdout and master are switched to non-blocking mode and registered
 * edge-triggered in an epoll instance. Because edge-triggered events are only
 * delivered on state changes, readiness is remembered in the four boolean
 * flags and cleared again once a read() or write() fails with
 * EAGAIN/EPIPE/ECONNRESET/EIO. Data read from stdin is buffered in in_buffer
 * and written to master; data read from master is buffered in out_buffer and
 * written to stdout. main() treats a negative return value as failure. */
527 static int process_pty(int master, sigset_t *mask) {
529 char in_buffer[LINE_MAX], out_buffer[LINE_MAX];
/* Number of valid bytes currently held in each buffer. */
530 size_t in_buffer_full = 0, out_buffer_full = 0;
531 struct epoll_event stdin_ev, stdout_ev, master_ev, signal_ev;
532 bool stdin_readable = false, stdout_writable = false, master_readable = false, master_writable = false;
533 int ep = -1, signal_fd = -1, r;
535 fd_nonblock(STDIN_FILENO, 1);
536 fd_nonblock(STDOUT_FILENO, 1);
537 fd_nonblock(master, 1);
/* Receive the signals in mask through a descriptor so they can be polled. */
539 if ((signal_fd = signalfd(-1, mask, SFD_NONBLOCK|SFD_CLOEXEC)) < 0) {
540 log_error("signalfd(): %m");
545 if ((ep = epoll_create1(EPOLL_CLOEXEC)) < 0) {
546 log_error("Failed to create epoll: %m");
/* stdin, stdout and master are edge-triggered (EPOLLET); the signal fd is
 * level-triggered. */
552 stdin_ev.events = EPOLLIN|EPOLLET;
553 stdin_ev.data.fd = STDIN_FILENO;
556 stdout_ev.events = EPOLLOUT|EPOLLET;
557 stdout_ev.data.fd = STDOUT_FILENO;
560 master_ev.events = EPOLLIN|EPOLLOUT|EPOLLET;
561 master_ev.data.fd = master;
564 signal_ev.events = EPOLLIN;
565 signal_ev.data.fd = signal_fd;
567 if (epoll_ctl(ep, EPOLL_CTL_ADD, STDIN_FILENO, &stdin_ev) < 0 ||
568 epoll_ctl(ep, EPOLL_CTL_ADD, STDOUT_FILENO, &stdout_ev) < 0 ||
569 epoll_ctl(ep, EPOLL_CTL_ADD, master, &master_ev) < 0 ||
570 epoll_ctl(ep, EPOLL_CTL_ADD, signal_fd, &signal_ev) < 0) {
571 log_error("Failed to register fds in epoll: %m");
577 struct epoll_event ev[16];
/* Block until at least one descriptor has an event. */
581 if ((nfds = epoll_wait(ep, ev, ELEMENTSOF(ev), -1)) < 0) {
/* Interruptions are not treated as errors. */
583 if (errno == EINTR || errno == EAGAIN)
586 log_error("epoll_wait(): %m");
/* Translate the events into the readiness flags; EPOLLHUP counts as ready
 * so that the following read()/write() observes the hangup. */
593 for (i = 0; i < nfds; i++) {
594 if (ev[i].data.fd == STDIN_FILENO) {
596 if (ev[i].events & (EPOLLIN|EPOLLHUP))
597 stdin_readable = true;
599 } else if (ev[i].data.fd == STDOUT_FILENO) {
601 if (ev[i].events & (EPOLLOUT|EPOLLHUP))
602 stdout_writable = true;
604 } else if (ev[i].data.fd == master) {
606 if (ev[i].events & (EPOLLIN|EPOLLHUP))
607 master_readable = true;
609 if (ev[i].events & (EPOLLOUT|EPOLLHUP))
610 master_writable = true;
612 } else if (ev[i].data.fd == signal_fd) {
613 struct signalfd_siginfo sfsi;
/* A signalfd read must return exactly one full siginfo record. */
616 if ((n = read(signal_fd, &sfsi, sizeof(sfsi))) != sizeof(sfsi)) {
619 log_error("Failed to read from signalfd: invalid block size");
624 if (errno != EINTR && errno != EAGAIN) {
625 log_error("Failed to read from signalfd: %m");
631 if (sfsi.ssi_signo == SIGWINCH) {
634 /* The window size changed, let's forward that. */
635 if (ioctl(STDIN_FILENO, TIOCGWINSZ, &ws) >= 0)
636 ioctl(master, TIOCSWINSZ, &ws);
/* Keep shovelling while any direction can make progress: an empty buffer
 * with a readable source, or a non-empty buffer with a writable sink. */
645 while ((stdin_readable && in_buffer_full <= 0) ||
646 (master_writable && in_buffer_full > 0) ||
647 (master_readable && out_buffer_full <= 0) ||
648 (stdout_writable && out_buffer_full > 0)) {
/* stdin -> in_buffer */
650 if (stdin_readable && in_buffer_full < LINE_MAX) {
652 if ((k = read(STDIN_FILENO, in_buffer + in_buffer_full, LINE_MAX - in_buffer_full)) < 0) {
654 if (errno == EAGAIN || errno == EPIPE || errno == ECONNRESET || errno == EIO)
655 stdin_readable = false;
657 log_error("read(): %m");
662 in_buffer_full += (size_t) k;
/* in_buffer -> master */
665 if (master_writable && in_buffer_full > 0) {
667 if ((k = write(master, in_buffer, in_buffer_full)) < 0) {
669 if (errno == EAGAIN || errno == EPIPE || errno == ECONNRESET || errno == EIO)
670 master_writable = false;
672 log_error("write(): %m");
/* Move the bytes that were not written to the front of the buffer.
 * NOTE(review): the matching decrement of in_buffer_full is not among the
 * lines shown here (the out_buffer path below has one) - confirm. */
678 assert(in_buffer_full >= (size_t) k);
679 memmove(in_buffer, in_buffer + k, in_buffer_full - k);
/* master -> out_buffer */
684 if (master_readable && out_buffer_full < LINE_MAX) {
686 if ((k = read(master, out_buffer + out_buffer_full, LINE_MAX - out_buffer_full)) < 0) {
688 if (errno == EAGAIN || errno == EPIPE || errno == ECONNRESET || errno == EIO)
689 master_readable = false;
691 log_error("read(): %m");
696 out_buffer_full += (size_t) k;
/* out_buffer -> stdout */
699 if (stdout_writable && out_buffer_full > 0) {
701 if ((k = write(STDOUT_FILENO, out_buffer, out_buffer_full)) < 0) {
703 if (errno == EAGAIN || errno == EPIPE || errno == ECONNRESET || errno == EIO)
704 stdout_writable = false;
706 log_error("write(): %m");
/* Drop the k bytes that were written and keep the remainder. */
712 assert(out_buffer_full >= (size_t) k);
713 memmove(out_buffer, out_buffer + k, out_buffer_full - k);
714 out_buffer_full -= k;
/* Cleanup of the two descriptors created by this function. */
722 close_nointr_nofail(ep);
725 close_nointr_nofail(signal_fd);
/* Program entry point.
 *
 * Outline of the visible steps: parse the command line, validate the
 * environment (root privileges, systemd host, target is not / and looks like
 * an OS tree), move into a fresh cgroup named after our pid, allocate a pty
 * and put the controlling terminal into raw mode, then clone() a child in new
 * namespaces. The child builds the container file system view, drops
 * privileges and executes the requested command or a login bash. The parent
 * relays terminal I/O through process_pty(), waits for the child and cleans
 * up. */
730 int main(int argc, char *argv[]) {
732 int r = EXIT_FAILURE, k;
733 char *oldcg = NULL, *newcg = NULL;
734 char **controller = NULL;
736 const char *console = NULL;
737 struct termios saved_attr, raw_attr;
/* True while saved_attr holds terminal settings that must be restored. */
739 bool saved_attr_valid = false;
741 int kmsg_socket_pair[2] = { -1, -1 };
743 log_parse_environment();
746 if ((r = parse_argv(argc, argv)) <= 0)
/* Resolve the container directory; the current directory is used when none
 * is usable from the command line. */
752 p = path_make_absolute_cwd(arg_directory);
756 arg_directory = get_current_dir_name();
758 if (!arg_directory) {
759 log_error("Failed to determine path");
763 path_kill_slashes(arg_directory);
/* Preconditions. */
765 if (geteuid() != 0) {
766 log_error("Need to be root.");
770 if (sd_booted() <= 0) {
771 log_error("Not running on a systemd system.");
775 if (path_equal(arg_directory, "/")) {
776 log_error("Spawning container on root directory not supported.");
780 if (is_os_tree(arg_directory) <= 0) {
781 log_error("Directory %s doesn't look like an OS root directory. Refusing.", arg_directory);
/* Remember the current cgroup and create a child group nspawn-PID below it. */
785 if ((k = cg_get_by_pid(SYSTEMD_CGROUP_CONTROLLER, 0, &oldcg)) < 0) {
786 log_error("Failed to determine current cgroup: %s", strerror(-k));
790 if (asprintf(&newcg, "%s/nspawn-%lu", oldcg, (unsigned long) getpid()) < 0) {
791 log_error("Failed to allocate cgroup path.");
795 k = cg_create_and_attach(SYSTEMD_CGROUP_CONTROLLER, newcg, 0);
797 log_error("Failed to create cgroup: %s", strerror(-k));
/* Additional controllers requested with -C only produce a warning on
 * failure. */
801 STRV_FOREACH(controller,arg_controllers) {
802 k = cg_create_and_attach(*controller, newcg, 0);
804 log_warning("Failed to create cgroup in controller %s: %s", *controller, strerror(-k));
/* Allocate the pty that becomes the console of the container. */
807 if ((master = posix_openpt(O_RDWR|O_NOCTTY|O_CLOEXEC|O_NDELAY)) < 0) {
808 log_error("Failed to acquire pseudo tty: %m");
812 if (!(console = ptsname(master))) {
813 log_error("Failed to determine tty name: %m");
817 log_info("Spawning namespace container on %s (console is %s).", arg_directory, console);
/* Copy the window size of our terminal to the pty. */
819 if (ioctl(STDIN_FILENO, TIOCGWINSZ, &ws) >= 0)
820 ioctl(master, TIOCSWINSZ, &ws);
822 if (unlockpt(master) < 0) {
823 log_error("Failed to unlock tty: %m");
/* Save the terminal settings, then switch to raw mode without echo. */
827 if (tcgetattr(STDIN_FILENO, &saved_attr) < 0) {
828 log_error("Failed to get terminal attributes: %m");
832 saved_attr_valid = true;
834 raw_attr = saved_attr;
835 cfmakeraw(&raw_attr);
836 raw_attr.c_lflag &= ~ECHO;
838 if (tcsetattr(STDIN_FILENO, TCSANOW, &raw_attr) < 0) {
839 log_error("Failed to set terminal attributes: %m");
/* Socket pair through which setup_kmsg() passes the kmsg FIFO fd. */
843 if (socketpair(AF_UNIX, SOCK_DGRAM|SOCK_NONBLOCK|SOCK_CLOEXEC, 0, kmsg_socket_pair) < 0) {
844 log_error("Failed to create kmsg socket pair");
/* Block the signals that process_pty() later reads through a signalfd. */
848 assert_se(sigemptyset(&mask) == 0);
849 sigset_add_many(&mask, SIGCHLD, SIGWINCH, SIGTERM, SIGINT, -1);
850 assert_se(sigprocmask(SIG_BLOCK, &mask, NULL) == 0);
/* Raw clone() with new IPC, mount, PID and UTS namespaces, plus a new
 * network namespace when --private-network was given. */
852 pid = syscall(__NR_clone, SIGCHLD|CLONE_NEWIPC|CLONE_NEWNS|CLONE_NEWPID|CLONE_NEWUTS|(arg_private_network ? CLONE_NEWNET : 0), NULL);
855 log_error("clone() failed, do you have namespace support enabled in your kernel? (You need UTS, IPC, PID and NET namespacing built in): %m");
857 log_error("clone() failed: %m");
/* The following section ends in _exit() and sets up the container; it is
 * presumably the child branch of the clone() above (the branch condition is
 * not among the lines shown). */
866 const char *home = NULL;
867 uid_t uid = (uid_t) -1;
868 gid_t gid = (gid_t) -1;
/* Environment for the command; further slots are filled in below (TERM at
 * index 2, HOME, USER and LOGNAME at indices 3 to 5). */
869 const char *envp[] = {
870 "PATH=/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin",
871 "container=systemd-nspawn", /* LXC sets container=lxc, so follow the scheme here */
879 envp[2] = strv_find_prefix(environ, "TERM=");
881 close_nointr_nofail(master);
/* Close the standard descriptors so that dev/console can take their place
 * further down. */
883 close_nointr(STDIN_FILENO);
884 close_nointr(STDOUT_FILENO);
885 close_nointr(STDERR_FILENO);
/* Presumably closes every other descriptor except the kmsg socket passed as
 * exception - confirm against close_all_fds(). */
887 close_all_fds(&kmsg_socket_pair[1], 1);
889 reset_all_signal_handlers();
/* Undo the signal blocking set up before clone(). */
891 assert_se(sigemptyset(&mask) == 0);
892 assert_se(sigprocmask(SIG_SETMASK, &mask, NULL) == 0);
/* Ask the kernel to send SIGKILL to this process when its parent dies. */
897 if (prctl(PR_SET_PDEATHSIG, SIGKILL) < 0)
900 /* Mark / as private, in case somebody marked it shared */
901 if (mount(NULL, "/", NULL, MS_PRIVATE|MS_REC, NULL) < 0)
/* Populate the container tree: API file systems, device nodes, console,
 * kmsg FIFO and timezone files. */
904 if (mount_all(arg_directory) < 0)
907 if (copy_devnodes(arg_directory) < 0)
910 if (setup_dev_console(arg_directory, console) < 0)
913 if (setup_kmsg(arg_directory, kmsg_socket_pair[1]) < 0)
916 close_nointr_nofail(kmsg_socket_pair[1]);
918 if (setup_timezone(arg_directory) < 0)
921 if (chdir(arg_directory) < 0) {
922 log_error("chdir(%s) failed: %m", arg_directory);
/* With descriptors 0 to 2 closed, the console must open as descriptor 0; it
 * is then duplicated onto stdout and stderr. */
926 if (open_terminal("dev/console", O_RDWR) != STDIN_FILENO ||
927 dup2(STDIN_FILENO, STDOUT_FILENO) != STDOUT_FILENO ||
928 dup2(STDIN_FILENO, STDERR_FILENO) != STDERR_FILENO)
/* Bind-mount the container directory over / and enter it. The message names
 * the MS_BIND operation that is actually performed. */
931 if (mount(arg_directory, "/", "bind", MS_BIND, NULL) < 0) {
932 log_error("mount(MS_BIND) failed: %m");
936 if (chroot(".") < 0) {
937 log_error("chroot() failed: %m");
941 if (chdir("/") < 0) {
942 log_error("chdir() failed: %m");
950 if (drop_capabilities() < 0)
/* Switch to the requested user.
 * NOTE(review): several helpers in this file report failures as negative
 * return codes (see the strerror(-k) calls above). If get_user_creds(),
 * mkdir_parents() and safe_mkdir() do the same, %m may print an unrelated
 * errno here - confirm. */
955 if (get_user_creds((const char**)&arg_user, &uid, &gid, &home) < 0) {
956 log_error("get_user_creds() failed: %m");
960 if (mkdir_parents(home, 0775) < 0) {
961 log_error("mkdir_parents() failed: %m");
965 if (safe_mkdir(home, 0775, uid, gid) < 0) {
966 log_error("safe_mkdir() failed: %m");
970 if (initgroups((const char*)arg_user, gid) < 0) {
971 log_error("initgroups() failed: %m");
/* Group first, then user; the messages name the calls actually made. */
975 if (setresgid(gid, gid, gid) < 0) {
976 log_error("setresgid() failed: %m");
980 if (setresuid(uid, uid, uid) < 0) {
981 log_error("setresuid() failed: %m");
/* Fill the remaining environment slots, defaulting to root. */
986 if ((asprintf((char**)(envp + 3), "HOME=%s", home? home: "/root") < 0) ||
987 (asprintf((char**)(envp + 4), "USER=%s", arg_user? arg_user : "root") < 0) ||
988 (asprintf((char**)(envp + 5), "LOGNAME=%s", arg_user? arg_user : "root") < 0)) {
989 log_error("Out of memory");
/* Hostname is derived from the container directory path; the call result is
 * ignored. */
993 if ((hn = file_name_from_path(arg_directory)))
994 sethostname(hn, strlen(hn));
/* Run the command given on the command line ... */
997 execvpe(argv[optind], argv + optind, (char**) envp);
/* ... or a login bash (argv[0] is "-bash") started from the home directory. */
999 chdir(home ? home : "/root");
1000 execle("/bin/bash", "-bash", NULL, (char**) envp);
/* Only reached when exec failed. */
1003 log_error("execv() failed: %m");
1006 _exit(EXIT_FAILURE);
/* Relay terminal traffic between our stdio and the pty master. */
1009 if (process_pty(master, &mask) < 0)
/* Restore the terminal before reporting how the child terminated. */
1012 if (saved_attr_valid) {
1013 tcsetattr(STDIN_FILENO, TCSANOW, &saved_attr);
1014 saved_attr_valid = false;
1017 r = wait_for_terminate_and_warn(argc > optind ? argv[optind] : "bash", pid);
/* Common cleanup: terminal settings, descriptors, cgroups and heap data. */
1023 if (saved_attr_valid)
1024 tcsetattr(STDIN_FILENO, TCSANOW, &saved_attr);
1027 close_nointr_nofail(master);
1029 close_pipe(kmsg_socket_pair);
/* Move back to the original cgroup, then kill whatever is left in ours. */
1032 cg_attach(SYSTEMD_CGROUP_CONTROLLER, oldcg, 0);
1035 cg_kill_recursive_and_wait(SYSTEMD_CGROUP_CONTROLLER, newcg, true);
1037 free(arg_directory);
1038 strv_free(arg_controllers);