1 /*-*- Mode: C; c-basic-offset: 8; indent-tabs-mode: nil -*-*/
4 This file is part of systemd.
6 Copyright 2010 Lennart Poettering
8 systemd is free software; you can redistribute it and/or modify it
9 under the terms of the GNU Lesser General Public License as published by
10 the Free Software Foundation; either version 2.1 of the License, or
11 (at your option) any later version.
13 systemd is distributed in the hope that it will be useful, but
14 WITHOUT ANY WARRANTY; without even the implied warranty of
15 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
16 Lesser General Public License for more details.
18 You should have received a copy of the GNU Lesser General Public License
19 along with systemd; If not, see <http://www.gnu.org/licenses/>.
25 #include <sys/types.h>
26 #include <sys/syscall.h>
27 #include <sys/mount.h>
33 #include <sys/prctl.h>
34 #include <sys/capability.h>
36 #include <sys/epoll.h>
38 #include <sys/signalfd.h>
42 #include <sys/socket.h>
44 #include <systemd/sd-daemon.h>
52 #include "cgroup-util.h"
54 #include "path-util.h"
55 #include "loopback-setup.h"
57 #include "dev-setup.h"
59 typedef enum LinkJournal {
/* Option state shared across this file. parse_argv() fills these in from the
 * command line; the setup helpers and main() read them. */
66 static char *arg_directory = NULL;
67 static char *arg_user = NULL;
68 static char **arg_controllers = NULL;
69 static char *arg_uuid = NULL;
70 static bool arg_private_network = false;
71 static bool arg_read_only = false;
72 static bool arg_boot = false;
73 static LinkJournal arg_link_journal = LINK_AUTO;
/* Capability mask with bit N set for capability number N. --capability= ORs
 * further bits in, and drop_capabilities() passes the complement of this mask
 * to capability_bounding_set_drop().
 * NOTE(review): some original lines inside this initializer are not shown
 * here, so the default set may be larger than what is listed. */
74 static uint64_t arg_retain =
76 (1ULL << CAP_DAC_OVERRIDE) |
77 (1ULL << CAP_DAC_READ_SEARCH) |
78 (1ULL << CAP_FOWNER) |
79 (1ULL << CAP_FSETID) |
80 (1ULL << CAP_IPC_OWNER) |
83 (1ULL << CAP_LINUX_IMMUTABLE) |
84 (1ULL << CAP_NET_BIND_SERVICE) |
85 (1ULL << CAP_NET_BROADCAST) |
86 (1ULL << CAP_NET_RAW) |
87 (1ULL << CAP_SETGID) |
88 (1ULL << CAP_SETFCAP) |
89 (1ULL << CAP_SETPCAP) |
90 (1ULL << CAP_SETUID) |
91 (1ULL << CAP_SYS_ADMIN) |
92 (1ULL << CAP_SYS_CHROOT) |
93 (1ULL << CAP_SYS_NICE) |
94 (1ULL << CAP_SYS_PTRACE) |
95 (1ULL << CAP_SYS_TTY_CONFIG) |
96 (1ULL << CAP_SYS_RESOURCE) |
97 (1ULL << CAP_SYS_BOOT);
/* Print the usage summary for this program to stdout.
 *
 * The program name comes from program_invocation_short_name. The -j line has
 * to stay in sync with parse_argv(), which selects LINK_GUEST for -j, so the
 * text names the "guest" mode.
 *
 * Always returns 0. */
static int help(void) {

        printf("%s [OPTIONS...] [PATH] [ARGUMENTS...]\n\n"
               "Spawn a minimal namespace container for debugging, testing and building.\n\n"
               " -h --help Show this help\n"
               " -D --directory=NAME Root directory for the container\n"
               " -b --boot Boot up full system (i.e. invoke init)\n"
               " -u --user=USER Run the command under specified user or uid\n"
               " -C --controllers=LIST Put the container in specified comma-separated cgroup hierarchies\n"
               " --uuid=UUID Set a specific machine UUID for the container\n"
               " --private-network Disable network in container\n"
               " --read-only Mount the root directory read-only\n"
               " --capability=CAP In addition to the default, retain specified capability\n"
               " --link-journal=MODE Link up guest journal, one of no, auto, guest, host\n"
               " -j Equivalent to --link-journal=guest\n",
               program_invocation_short_name);

        return 0;
}
/* Parse the command line into the arg_* globals.
 *
 * argc/argv are handed straight to getopt_long(). Short options come from the
 * "+hD:u:C:bj" string; the leading '+' makes getopt stop at the first
 * non-option argument, so the arguments of the command to run are not
 * interpreted as options of this program. Long-only options use ARG_* codes
 * (ARG_PRIVATE_NETWORK is 0x100, above any character value).
 *
 * NOTE(review): most case labels and all return statements are not visible
 * here — the return convention is presumably negative on error; confirm. */
119 static int parse_argv(int argc, char *argv[]) {
122 ARG_PRIVATE_NETWORK = 0x100,
129 static const struct option options[] = {
130 { "help", no_argument, NULL, 'h' },
131 { "directory", required_argument, NULL, 'D' },
132 { "user", required_argument, NULL, 'u' },
133 { "controllers", required_argument, NULL, 'C' },
134 { "private-network", no_argument, NULL, ARG_PRIVATE_NETWORK },
135 { "boot", no_argument, NULL, 'b' },
136 { "uuid", required_argument, NULL, ARG_UUID },
137 { "read-only", no_argument, NULL, ARG_READ_ONLY },
138 { "capability", required_argument, NULL, ARG_CAPABILITY },
139 { "link-journal", required_argument, NULL, ARG_LINK_JOURNAL },
148 while ((c = getopt_long(argc, argv, "+hD:u:C:bj", options, NULL)) >= 0) {
/* Root directory: stored as a canonicalized, heap-allocated path. */
158 arg_directory = canonicalize_file_name(optarg);
159 if (!arg_directory) {
160 log_error("Failed to canonicalize root directory.");
168 if (!(arg_user = strdup(optarg))) {
169 log_error("Failed to duplicate user name.");
/* Controllers: replace any earlier list with the comma-split argument, then
 * run strv_uniq() over it. */
176 strv_free(arg_controllers);
177 arg_controllers = strv_split(optarg, ",");
178 if (!arg_controllers) {
179 log_error("Failed to split controllers list.");
182 strv_uniq(arg_controllers);
186 case ARG_PRIVATE_NETWORK:
187 arg_private_network = true;
199 arg_read_only = true;
202 case ARG_CAPABILITY: {
/* Each comma-separated word is a capability name; its bit is ORed into
 * arg_retain. */
206 FOREACH_WORD_SEPARATOR(word, length, optarg, ",", state) {
210 t = strndup(word, length);
214 if (cap_from_name(t, &cap) < 0) {
215 log_error("Failed to parse capability %s.", t);
221 arg_retain |= 1ULL << (uint64_t) cap;
/* Presumably the -j case (its label is not shown): selects LINK_GUEST. */
228 arg_link_journal = LINK_GUEST;
231 case ARG_LINK_JOURNAL:
232 if (streq(optarg, "auto"))
233 arg_link_journal = LINK_AUTO;
234 else if (streq(optarg, "no"))
235 arg_link_journal = LINK_NO;
236 else if (streq(optarg, "guest"))
237 arg_link_journal = LINK_GUEST;
238 else if (streq(optarg, "host"))
239 arg_link_journal = LINK_HOST;
241 log_error("Failed to parse link journal mode %s", optarg);
251 log_error("Unknown option code %c", c);
/* Mount the entries of mount_table beneath the container root `dest`.
 *
 * The rows are used as follows by the visible code: .what is the mount source
 * (NULL marks a remount row), .where is the target path relative to dest,
 * .flags and .options go to mount(2), and .fatal says whether a failed mount
 * is reported as an error.
 * NOTE(review): the struct's field list is not shown; the third column looks
 * like the file system type — confirm. */
259 static int mount_all(const char *dest) {
261 typedef struct MountPoint {
270 static const MountPoint mount_table[] = {
271 { "proc", "/proc", "proc", NULL, MS_NOSUID|MS_NOEXEC|MS_NODEV, true },
272 { "/proc/sys", "/proc/sys", NULL, NULL, MS_BIND, true }, /* Bind mount first */
273 { NULL, "/proc/sys", NULL, NULL, MS_BIND|MS_RDONLY|MS_REMOUNT, true }, /* Then, make it r/o */
274 { "sysfs", "/sys", "sysfs", NULL, MS_RDONLY|MS_NOSUID|MS_NOEXEC|MS_NODEV, true },
275 { "tmpfs", "/dev", "tmpfs", "mode=755", MS_NOSUID|MS_STRICTATIME, true },
276 { "/dev/pts", "/dev/pts", NULL, NULL, MS_BIND, true },
277 { "tmpfs", "/dev/shm", "tmpfs", "mode=1777", MS_NOSUID|MS_NODEV|MS_STRICTATIME, true },
278 { "tmpfs", "/run", "tmpfs", "mode=755", MS_NOSUID|MS_NODEV|MS_STRICTATIME, true },
280 { "/sys/fs/selinux", "/sys/fs/selinux", NULL, NULL, MS_BIND, false }, /* Bind mount first */
281 { NULL, "/sys/fs/selinux", NULL, NULL, MS_BIND|MS_RDONLY|MS_REMOUNT, false }, /* Then, make it r/o */
288 for (k = 0; k < ELEMENTSOF(mount_table); k++) {
289 char _cleanup_free_ *where = NULL;
/* Target path is dest + "/" + the row's .where. */
292 if (asprintf(&where, "%s/%s", dest, mount_table[k].where) < 0) {
/* Find out whether the target already is a mount point. */
301 t = path_is_mount_point(where, true);
303 log_error("Failed to detect whether %s is a mount point: %s", where, strerror(-t));
311 /* Skip this entry if it is not a remount. */
312 if (mount_table[k].what && t > 0)
/* Best effort: the result of mkdir_p_label() is not checked. */
315 mkdir_p_label(where, 0755);
/* A failed mount is only logged when the row is marked fatal. */
317 if (mount(mount_table[k].what,
320 mount_table[k].flags,
321 mount_table[k].options) < 0 &&
322 mount_table[k].fatal) {
324 log_error("mount(%s) failed: %m", where);
/* Make the container's /etc/localtime name the same time zone as the host's.
 *
 * The zone name is whatever follows "../usr/share/zoneinfo/" or
 * "/usr/share/zoneinfo/" in the host's /etc/localtime symlink. If the
 * container's link (dest/etc/localtime) already names that zone nothing more
 * is needed; otherwise the zone file must exist below dest, and a relative
 * symlink "../usr/share/zoneinfo/<zone>" is created at dest/etc/localtime.
 * Host-side problems and a zone missing in the container are logged as
 * warnings; a failing symlink() is logged as an error.
 * NOTE(review): return statements are not visible here. */
334 static int setup_timezone(const char *dest) {
335 _cleanup_free_ char *where = NULL, *p = NULL, *q = NULL, *check = NULL, *what = NULL;
341 /* Fix the timezone, if possible */
342 r = readlink_malloc("/etc/localtime", &p);
344 log_warning("/etc/localtime is not a symlink, not updating container timezone.");
/* z: zone name taken from the host link (relative form first, then absolute). */
348 z = path_startswith(p, "../usr/share/zoneinfo/");
350 z = path_startswith(p, "/usr/share/zoneinfo/");
352 log_warning("/etc/localtime does not point into /usr/share/zoneinfo/, not updating container timezone.");
356 where = strappend(dest, "/etc/localtime");
/* y: zone name currently configured inside the container, if any. */
360 r = readlink_malloc(where, &q);
362 y = path_startswith(q, "../usr/share/zoneinfo/");
364 y = path_startswith(q, "/usr/share/zoneinfo/");
367 /* Already pointing to the right place? Then do nothing .. */
368 if (y && streq(y, z))
372 check = strjoin(dest, "/usr/share/zoneinfo/", z, NULL);
376 if (access(check, F_OK) < 0) {
377 log_warning("Timezone %s does not exist in container, not updating container timezone.", z);
381 what = strappend("../usr/share/zoneinfo/", z);
386 if (symlink(what, where) < 0) {
387 log_error("Failed to correct timezone of container: %m");
/* Expose the host's /etc/resolv.conf inside the container by bind mounting it
 * over dest/etc/resolv.conf and then remounting that bind mount read-only.
 * Both mount() calls are best effort: their failure is neither logged nor
 * treated as an error on the lines shown.
 * NOTE(review): the arg_private_network check presumably returns early (the
 * statement under the `if` is not visible). */
394 static int setup_resolv_conf(const char *dest) {
399 if (arg_private_network)
402 /* Fix resolv.conf, if possible */
403 where = strappend(dest, "/etc/resolv.conf");
407 /* We don't really care for the results of this really. If it
408 * fails, it fails, but meh... */
409 if (mount("/etc/resolv.conf", where, "bind", MS_BIND, NULL) >= 0)
410 mount("/etc/resolv.conf", where, "bind", MS_BIND|MS_REMOUNT|MS_RDONLY, NULL);
/* Give the container its own boot ID.
 *
 * A random 128-bit ID is generated with sd_id128_randomize(), formatted in
 * UUID notation, written to dest/dev/proc-sys-kernel-random-boot-id, and that
 * file is bind mounted over dest/proc/sys/kernel/random/boot_id. The final
 * read-only remount is attempted without checking its result. */
417 static int setup_boot_id(const char *dest) {
418 char _cleanup_free_ *from = NULL, *to = NULL;
425 /* Generate a new randomized boot ID, so that each boot-up of
426 * the container gets a new one */
428 from = strappend(dest, "/dev/proc-sys-kernel-random-boot-id");
429 to = strappend(dest, "/proc/sys/kernel/random/boot_id");
433 r = sd_id128_randomize(&rnd);
435 log_error("Failed to generate random boot id: %s", strerror(-r));
/* Render the 16 bytes as 8-4-4-4-12 hex digits. */
439 snprintf(as_uuid, sizeof(as_uuid),
440 "%02x%02x%02x%02x-%02x%02x-%02x%02x-%02x%02x-%02x%02x%02x%02x%02x%02x",
441 SD_ID128_FORMAT_VAL(rnd));
442 char_array_0(as_uuid);
444 r = write_one_line_file(from, as_uuid);
446 log_error("Failed to write boot id: %s", strerror(-r));
450 if (mount(from, to, "bind", MS_BIND, NULL) < 0) {
451 log_error("Failed to bind mount boot id: %m");
454 mount(from, to, "bind", MS_BIND|MS_REMOUNT|MS_RDONLY, NULL);
/* Populate dest/dev with copies of host device nodes.
 *
 * For every entry of devnodes (iterated with NULSTR_FOREACH) the host node
 * /dev/<name> is stat()ed and, if it is a character or block device,
 * dest/dev/<name> is created with the same mode and device number via
 * mknod(). A host node that does not exist (ENOENT) is skipped without a
 * message; other failures are logged.
 *
 * NOTE(review): the asprintf() results are not checked on the lines shown;
 * confirm that from/to are validated before use. */
460 static int copy_devnodes(const char *dest) {
462 static const char devnodes[] =
473 mode_t _cleanup_umask_ u;
479 NULSTR_FOREACH(d, devnodes) {
481 char _cleanup_free_ *from = NULL, *to = NULL;
483 asprintf(&from, "/dev/%s", d);
484 asprintf(&to, "%s/dev/%s", dest, d);
495 if (stat(from, &st) < 0) {
497 if (errno != ENOENT) {
498 log_error("Failed to stat %s: %m", from);
503 } else if (!S_ISCHR(st.st_mode) && !S_ISBLK(st.st_mode)) {
505 log_error("%s is not a char or block device, cannot copy", from);
509 } else if (mknod(to, st.st_mode, st.st_rdev) < 0) {
/* Name the node mknod() was asked to create, not the container root. */
511 log_error("mknod(%s) failed: %m", to);
/* Make the tty `console` available as dest/dev/console.
 *
 * `console` must be a character device. Its access mode and ownership are
 * set to 0600, uid 0, gid 0; then a device node with the same file type and
 * device number (mode 0600) is created at dest/dev/console and `console` is
 * bind mounted on top of it. */
520 static int setup_dev_console(const char *dest, const char *console) {
522 char _cleanup_free_ *to = NULL;
524 mode_t _cleanup_umask_ u;
531 if (stat(console, &st) < 0) {
532 log_error("Failed to stat %s: %m", console);
535 } else if (!S_ISCHR(st.st_mode)) {
536 log_error("/dev/console is not a char device");
540 r = chmod_and_chown(console, 0600, 0, 0);
542 log_error("Failed to correct access mode for TTY: %s", strerror(-r));
546 if (asprintf(&to, "%s/dev/console", dest) < 0)
549 /* We need to bind mount the right tty to /dev/console since
550 * ptys can only exist on pts file systems. To have something
551 * to bind mount things on we create a device node first, that
552 * has the right major/minor (note that the major minor
553 * doesn't actually matter here, since we mount it over
556 if (mknod(to, (st.st_mode & ~07777) | 0600, st.st_rdev) < 0) {
557 log_error("mknod() for /dev/console failed: %m");
561 if (mount(console, to, "bind", MS_BIND, NULL) < 0) {
562 log_error("Bind mount for /dev/console failed: %m");
/* Provide a /proc/kmsg inside the container that is backed by a FIFO.
 *
 * A FIFO is created at dest/dev/kmsg (mode 0600, owner 0:0), bind mounted
 * onto dest/proc/kmsg and opened; the open fd is then sent over kmsg_socket
 * as an SCM_RIGHTS message so that it stays referenced while the child runs.
 * The local copy of the fd is closed right after sendmsg().
 *
 * kmsg_socket must be a valid (non-negative) socket fd. */
569 static int setup_kmsg(const char *dest, int kmsg_socket) {
570 char _cleanup_free_ *from = NULL, *to = NULL;
572 mode_t _cleanup_umask_ u;
574 struct cmsghdr cmsghdr;
575 uint8_t buf[CMSG_SPACE(sizeof(int))];
578 struct cmsghdr *cmsg;
581 assert(kmsg_socket >= 0);
585 /* We create the kmsg FIFO as /dev/kmsg, but immediately
586 * delete it after bind mounting it to /proc/kmsg. While FIFOs
587 * on the reading side behave very similar to /proc/kmsg,
588 * their writing side behaves differently from /dev/kmsg in
589 * that writing blocks when nothing is reading. In order to
590 * avoid any problems with containers deadlocking due to this
591 * we simply make /dev/kmsg unavailable to the container. */
592 if (asprintf(&from, "%s/dev/kmsg", dest) < 0 ||
593 asprintf(&to, "%s/proc/kmsg", dest) < 0)
596 if (mkfifo(from, 0600) < 0) {
597 log_error("mkfifo() for /dev/kmsg failed: %m");
601 r = chmod_and_chown(from, 0600, 0, 0);
603 log_error("Failed to correct access mode for /dev/kmsg: %s", strerror(-r));
607 if (mount(from, to, "bind", MS_BIND, NULL) < 0) {
608 log_error("Bind mount for /proc/kmsg failed: %m");
612 fd = open(from, O_RDWR|O_NDELAY|O_CLOEXEC);
614 log_error("Failed to open fifo: %m");
/* Build a control message that carries exactly one fd (SCM_RIGHTS). */
621 mh.msg_control = &control;
622 mh.msg_controllen = sizeof(control);
624 cmsg = CMSG_FIRSTHDR(&mh);
625 cmsg->cmsg_level = SOL_SOCKET;
626 cmsg->cmsg_type = SCM_RIGHTS;
627 cmsg->cmsg_len = CMSG_LEN(sizeof(int));
628 memcpy(CMSG_DATA(cmsg), &fd, sizeof(int));
630 mh.msg_controllen = cmsg->cmsg_len;
632 /* Store away the fd in the socket, so that it stays open as
633 * long as we run the child */
634 k = sendmsg(kmsg_socket, &mh, MSG_DONTWAIT|MSG_NOSIGNAL);
635 close_nointr_nofail(fd);
638 log_error("Failed to send FIFO fd: %m");
642 /* And now make the FIFO unavailable as /dev/kmsg... */
/* Set the host name from the last path component of arg_directory, after
 * passing it through hostname_cleanup().
 * NOTE(review): path_get_file_name() is given arg_directory itself; if
 * hostname_cleanup() edits its argument in place, hn has to be a private copy
 * by then — the copy is not visible on the lines shown; confirm. */
647 static int setup_hostname(void) {
651 hn = path_get_file_name(arg_directory);
657 hostname_cleanup(hn);
660 if (sethostname(hn, strlen(hn)) < 0)
/* Connect the container's journal directory with the host's, as selected by
 * arg_link_journal.
 *
 * The machine ID is read from directory/etc/machine-id and validated with
 * sd_id128_from_string(). From it two paths are built: p, the host-side
 * /var/log/journal/<id>, and q, the same path below `directory`.
 *  - LINK_NO: nothing is set up.
 *  - LINK_GUEST: p is made a symlink to q (and q is created).
 *  - LINK_HOST: p is created; then p is bind mounted onto q, which must be
 *    an empty directory.
 *  - LINK_AUTO: tolerant variant — a missing or empty machine ID and paths
 *    that already are mount points are not reported as errors, and the bind
 *    mount is only attempted when p exists.
 *
 * NOTE(review): several messages use %m after helpers that report failure
 * through their return value (mkdir_p, readlink_and_make_absolute); %m prints
 * errno, so confirm errno is still meaningful at those points.
 * NOTE(review): p is assigned twice (machine-id path, then journal path);
 * confirm the first string is released before the second assignment. */
669 static int setup_journal(const char *directory) {
670 sd_id128_t machine_id;
671 char _cleanup_free_ *p = NULL, *b = NULL, *q = NULL, *d = NULL;
675 if (arg_link_journal == LINK_NO)
678 p = strappend(directory, "/etc/machine-id");
682 r = read_one_line_file(p, &b);
683 if (r == -ENOENT && arg_link_journal == LINK_AUTO)
686 log_error("Failed to read machine ID from %s: %s", p, strerror(-r));
691 if (isempty(id) && arg_link_journal == LINK_AUTO)
694 /* Verify validity */
695 r = sd_id128_from_string(id, &machine_id);
697 log_error("Failed to parse machine ID from %s: %s", p, strerror(-r));
/* p: journal directory on the host, q: journal directory inside the container. */
702 p = strappend("/var/log/journal/", id);
703 q = strjoin(directory, "/var/log/journal/", id, NULL);
707 if (path_is_mount_point(p, false) > 0) {
708 if (arg_link_journal != LINK_AUTO) {
709 log_error("%s: already a mount point, refusing to use for journal", p);
716 if (path_is_mount_point(q, false) > 0) {
717 if (arg_link_journal != LINK_AUTO) {
718 log_error("%s: already a mount point, refusing to use for journal", q);
/* Inspect what currently sits at p: a symlink, something else, or nothing. */
725 r = readlink_and_make_absolute(p, &d);
727 if ((arg_link_journal == LINK_GUEST ||
728 arg_link_journal == LINK_AUTO) &&
731 r = mkdir_p(q, 0755);
733 log_warning("failed to create directory %s: %m", q);
738 log_error("Failed to remove symlink %s: %m", p);
741 } else if (r == -EINVAL) {
743 if (arg_link_journal == LINK_GUEST &&
746 if (errno == ENOTDIR) {
747 log_error("%s already exists and is neither a symlink nor a directory", p);
750 log_error("Failed to remove %s: %m", p);
754 } else if (r != -ENOENT) {
755 log_error("readlink(%s) failed: %m", p);
759 if (arg_link_journal == LINK_GUEST) {
761 if (symlink(q, p) < 0) {
762 log_error("Failed to symlink %s to %s: %m", q, p);
766 r = mkdir_p(q, 0755);
768 log_warning("failed to create directory %s: %m", q);
772 if (arg_link_journal == LINK_HOST) {
773 r = mkdir_p(p, 0755);
775 log_error("Failed to create %s: %m", p);
779 } else if (access(p, F_OK) < 0)
/* The bind mount target inside the container has to be empty. */
782 if (dir_is_empty(q) == 0) {
783 log_error("%s not empty.", q);
787 r = mkdir_p(q, 0755);
789 log_error("Failed to create %s: %m", q);
793 if (mount(p, q, "bind", MS_BIND, NULL) < 0) {
794 log_error("Failed to bind mount journal from host into guest: %m");
801 static int drop_capabilities(void) {
802 return capability_bounding_set_drop(~arg_retain, false);
/* Heuristic test whether `path` is the root of an OS tree: the file
 * path/bin/sh is used as the flag file.
 *
 * Returns 1 if path/bin/sh is accessible, 0 if it is not, and -ENOMEM if the
 * path buffer cannot be allocated. */
static int is_os_tree(const char *path) {
        size_t size;
        char *flag_file;
        int found;

        /* sizeof() of the literal includes room for the terminating NUL. */
        size = strlen(path) + sizeof("/bin/sh");
        flag_file = malloc(size);
        if (!flag_file)
                return -ENOMEM;

        snprintf(flag_file, size, "%s/bin/sh", path);

        found = access(flag_file, F_OK) >= 0;
        free(flag_file);

        return found ? 1 : 0;
}
/* Relay data between our own stdin/stdout and the pty `master`.
 *
 * STDIN, STDOUT and master are switched to non-blocking mode and watched with
 * edge-triggered epoll; the signals in `mask` are received through a
 * signalfd registered in the same epoll set. Bytes read from stdin are
 * buffered in in_buffer and written to master; bytes read from master are
 * buffered in out_buffer and written to stdout. On SIGWINCH the window size
 * of stdin is copied to master.
 *
 * NOTE(review): the loop exit conditions and return statements are not
 * visible on the lines shown. */
819 static int process_pty(int master, sigset_t *mask) {
821 char in_buffer[LINE_MAX], out_buffer[LINE_MAX];
822 size_t in_buffer_full = 0, out_buffer_full = 0;
823 struct epoll_event stdin_ev, stdout_ev, master_ev, signal_ev;
824 bool stdin_readable = false, stdout_writable = false, master_readable = false, master_writable = false;
825 int ep = -1, signal_fd = -1, r;
827 fd_nonblock(STDIN_FILENO, 1);
828 fd_nonblock(STDOUT_FILENO, 1);
829 fd_nonblock(master, 1);
831 signal_fd = signalfd(-1, mask, SFD_NONBLOCK|SFD_CLOEXEC);
833 log_error("signalfd(): %m");
838 ep = epoll_create1(EPOLL_CLOEXEC);
840 log_error("Failed to create epoll: %m");
845 /* We read from STDIN only if this is actually a TTY,
846 * otherwise we assume non-interactivity. */
847 if (isatty(STDIN_FILENO)) {
849 stdin_ev.events = EPOLLIN|EPOLLET;
850 stdin_ev.data.fd = STDIN_FILENO;
852 if (epoll_ctl(ep, EPOLL_CTL_ADD, STDIN_FILENO, &stdin_ev) < 0) {
853 log_error("Failed to register STDIN in epoll: %m");
860 stdout_ev.events = EPOLLOUT|EPOLLET;
861 stdout_ev.data.fd = STDOUT_FILENO;
864 master_ev.events = EPOLLIN|EPOLLOUT|EPOLLET;
865 master_ev.data.fd = master;
/* The signalfd is the only fd registered without EPOLLET. */
868 signal_ev.events = EPOLLIN;
869 signal_ev.data.fd = signal_fd;
871 if (epoll_ctl(ep, EPOLL_CTL_ADD, STDOUT_FILENO, &stdout_ev) < 0 ||
872 epoll_ctl(ep, EPOLL_CTL_ADD, master, &master_ev) < 0 ||
873 epoll_ctl(ep, EPOLL_CTL_ADD, signal_fd, &signal_ev) < 0) {
874 log_error("Failed to register fds in epoll: %m");
880 struct epoll_event ev[16];
884 nfds = epoll_wait(ep, ev, ELEMENTSOF(ev), -1);
887 if (errno == EINTR || errno == EAGAIN)
890 log_error("epoll_wait(): %m");
/* Record readiness in the *_readable / *_writable flags; with edge-triggered
 * events a flag stays set until the corresponding read()/write() fails. */
897 for (i = 0; i < nfds; i++) {
898 if (ev[i].data.fd == STDIN_FILENO) {
900 if (ev[i].events & (EPOLLIN|EPOLLHUP))
901 stdin_readable = true;
903 } else if (ev[i].data.fd == STDOUT_FILENO) {
905 if (ev[i].events & (EPOLLOUT|EPOLLHUP))
906 stdout_writable = true;
908 } else if (ev[i].data.fd == master) {
910 if (ev[i].events & (EPOLLIN|EPOLLHUP))
911 master_readable = true;
913 if (ev[i].events & (EPOLLOUT|EPOLLHUP))
914 master_writable = true;
916 } else if (ev[i].data.fd == signal_fd) {
917 struct signalfd_siginfo sfsi;
920 n = read(signal_fd, &sfsi, sizeof(sfsi));
921 if (n != sizeof(sfsi)) {
924 log_error("Failed to read from signalfd: invalid block size");
929 if (errno != EINTR && errno != EAGAIN) {
930 log_error("Failed to read from signalfd: %m");
936 if (sfsi.ssi_signo == SIGWINCH) {
939 /* The window size changed, let's forward that. */
940 if (ioctl(STDIN_FILENO, TIOCGWINSZ, &ws) >= 0)
941 ioctl(master, TIOCSWINSZ, &ws);
/* Keep transferring while any direction can make progress: an empty buffer
 * with a readable source, or a non-empty buffer with a writable sink. */
950 while ((stdin_readable && in_buffer_full <= 0) ||
951 (master_writable && in_buffer_full > 0) ||
952 (master_readable && out_buffer_full <= 0) ||
953 (stdout_writable && out_buffer_full > 0)) {
955 if (stdin_readable && in_buffer_full < LINE_MAX) {
957 k = read(STDIN_FILENO, in_buffer + in_buffer_full, LINE_MAX - in_buffer_full);
960 if (errno == EAGAIN || errno == EPIPE || errno == ECONNRESET || errno == EIO)
961 stdin_readable = false;
963 log_error("read(): %m");
968 in_buffer_full += (size_t) k;
971 if (master_writable && in_buffer_full > 0) {
973 k = write(master, in_buffer, in_buffer_full);
976 if (errno == EAGAIN || errno == EPIPE || errno == ECONNRESET || errno == EIO)
977 master_writable = false;
979 log_error("write(): %m");
/* Partial write: shift the unwritten tail to the front of the buffer. */
985 assert(in_buffer_full >= (size_t) k);
986 memmove(in_buffer, in_buffer + k, in_buffer_full - k);
/* NOTE(review): no `in_buffer_full -= k` is shown after this memmove, unlike
 * the stdout path below — confirm it exists in the full source, otherwise
 * already-written bytes would be sent to master again. */
991 if (master_readable && out_buffer_full < LINE_MAX) {
993 k = read(master, out_buffer + out_buffer_full, LINE_MAX - out_buffer_full);
996 if (errno == EAGAIN || errno == EPIPE || errno == ECONNRESET || errno == EIO)
997 master_readable = false;
999 log_error("read(): %m");
1004 out_buffer_full += (size_t) k;
1007 if (stdout_writable && out_buffer_full > 0) {
1009 k = write(STDOUT_FILENO, out_buffer, out_buffer_full);
1012 if (errno == EAGAIN || errno == EPIPE || errno == ECONNRESET || errno == EIO)
1013 stdout_writable = false;
1015 log_error("write(): %m");
1021 assert(out_buffer_full >= (size_t) k);
1022 memmove(out_buffer, out_buffer + k, out_buffer_full - k);
1023 out_buffer_full -= k;
1031 close_nointr_nofail(ep);
1034 close_nointr_nofail(signal_fd);
/* Entry point: set up and run a container rooted at arg_directory.
 *
 * Flow, as far as it is visible here:
 *  1. Parse options; make arg_directory absolute (default: the current
 *     directory) and normalise its slashes.
 *  2. Refuse to run unless we are root, the host runs systemd, the directory
 *     is not "/" and it passes is_os_tree().
 *  3. Create a cgroup "<current>/nspawn-<pid>" and attach to it, also in every
 *     controller given with --controllers.
 *  4. Allocate a pty, copy the window size to it, switch our terminal to raw
 *     mode, create the kmsg socket pair and block SIGCHLD, SIGWINCH, SIGTERM
 *     and SIGINT.
 *  5. clone() a child into new IPC, mount, PID and UTS namespaces (plus a
 *     network namespace with --private-network).
 *     Child: attaches its stdio to the pty, prepares the mounts, device
 *     nodes, console, kmsg, boot id, timezone, resolv.conf and journal, moves
 *     the tree to "/", chroots, drops capabilities, optionally switches
 *     user, builds the environment and executes init, the given command, or
 *     /bin/bash.
 *     Parent: relays the pty through process_pty(), restores the terminal,
 *     waits for the child and logs how it ended.
 *  6. Cleanup: restore the terminal, close fds, move back to the old cgroup
 *     and kill whatever is left in the new one.
 *
 * The credential-switching error messages name the calls that are actually
 * made (setresgid()/setresuid()), and the socketpair() failure message
 * includes the errno text. */
1039 int main(int argc, char *argv[]) {
1041 int r = EXIT_FAILURE, k;
1042 char *oldcg = NULL, *newcg = NULL;
1043 char **controller = NULL;
1045 const char *console = NULL;
1046 struct termios saved_attr, raw_attr;
1048 bool saved_attr_valid = false;
1050 int kmsg_socket_pair[2] = { -1, -1 };
1052 log_parse_environment();
1055 r = parse_argv(argc, argv);
1059 if (arg_directory) {
1062 p = path_make_absolute_cwd(arg_directory);
1063 free(arg_directory);
1066 arg_directory = get_current_dir_name();
1068 if (!arg_directory) {
1069 log_error("Failed to determine path");
1073 path_kill_slashes(arg_directory);
1075 if (geteuid() != 0) {
1076 log_error("Need to be root.");
1080 if (sd_booted() <= 0) {
1081 log_error("Not running on a systemd system.");
1085 if (path_equal(arg_directory, "/")) {
1086 log_error("Spawning container on root directory not supported.");
1090 if (is_os_tree(arg_directory) <= 0) {
1091 log_error("Directory %s doesn't look like an OS root directory. Refusing.", arg_directory);
1095 k = cg_get_by_pid(SYSTEMD_CGROUP_CONTROLLER, 0, &oldcg);
1097 log_error("Failed to determine current cgroup: %s", strerror(-k));
1101 if (asprintf(&newcg, "%s/nspawn-%lu", oldcg, (unsigned long) getpid()) < 0) {
1102 log_error("Failed to allocate cgroup path.");
1106 k = cg_create_and_attach(SYSTEMD_CGROUP_CONTROLLER, newcg, 0);
1108 log_error("Failed to create cgroup: %s", strerror(-k));
/* Extra controllers are best effort: a failure only produces a warning. */
1112 STRV_FOREACH(controller, arg_controllers) {
1113 k = cg_create_and_attach(*controller, newcg, 0);
1115 log_warning("Failed to create cgroup in controller %s: %s", *controller, strerror(-k));
1118 master = posix_openpt(O_RDWR|O_NOCTTY|O_CLOEXEC|O_NDELAY);
1120 log_error("Failed to acquire pseudo tty: %m");
1124 console = ptsname(master);
1126 log_error("Failed to determine tty name: %m");
1130 log_info("Spawning namespace container on %s (console is %s).", arg_directory, console);
1132 if (ioctl(STDIN_FILENO, TIOCGWINSZ, &ws) >= 0)
1133 ioctl(master, TIOCSWINSZ, &ws);
1135 if (unlockpt(master) < 0) {
1136 log_error("Failed to unlock tty: %m");
/* Remember the terminal settings so they can be restored on exit. */
1140 if (tcgetattr(STDIN_FILENO, &saved_attr) >= 0) {
1141 saved_attr_valid = true;
1143 raw_attr = saved_attr;
1144 cfmakeraw(&raw_attr);
1145 raw_attr.c_lflag &= ~ECHO;
1148 if (socketpair(AF_UNIX, SOCK_DGRAM|SOCK_NONBLOCK|SOCK_CLOEXEC, 0, kmsg_socket_pair) < 0) {
1149 log_error("Failed to create kmsg socket pair: %m");
1153 assert_se(sigemptyset(&mask) == 0);
1154 sigset_add_many(&mask, SIGCHLD, SIGWINCH, SIGTERM, SIGINT, -1);
1155 assert_se(sigprocmask(SIG_BLOCK, &mask, NULL) == 0);
1160 if (saved_attr_valid) {
1161 if (tcsetattr(STDIN_FILENO, TCSANOW, &raw_attr) < 0) {
1162 log_error("Failed to set terminal attributes: %m");
1167 pid = syscall(__NR_clone, SIGCHLD|CLONE_NEWIPC|CLONE_NEWNS|CLONE_NEWPID|CLONE_NEWUTS|(arg_private_network ? CLONE_NEWNET : 0), NULL);
1169 if (errno == EINVAL)
1170 log_error("clone() failed, do you have namespace support enabled in your kernel? (You need UTS, IPC, PID and NET namespacing built in): %m");
1172 log_error("clone() failed: %m");
1180 const char *home = NULL;
1181 uid_t uid = (uid_t) -1;
1182 gid_t gid = (gid_t) -1;
1183 const char *envp[] = {
1184 "PATH=/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin",
1185 "container=systemd-nspawn", /* LXC sets container=lxc, so follow the scheme here */
1190 NULL, /* container_uuid */
1194 envp[2] = strv_find_prefix(environ, "TERM=");
1196 close_nointr_nofail(master);
1198 close_nointr(STDIN_FILENO);
1199 close_nointr(STDOUT_FILENO);
1200 close_nointr(STDERR_FILENO);
1202 close_all_fds(&kmsg_socket_pair[1], 1);
1204 reset_all_signal_handlers();
1206 assert_se(sigemptyset(&mask) == 0);
1207 assert_se(sigprocmask(SIG_SETMASK, &mask, NULL) == 0);
1209 if (open_terminal(console, O_RDWR) != STDIN_FILENO ||
1210 dup2(STDIN_FILENO, STDOUT_FILENO) != STDOUT_FILENO ||
1211 dup2(STDIN_FILENO, STDERR_FILENO) != STDERR_FILENO)
1215 log_error("setsid() failed: %m");
1219 if (prctl(PR_SET_PDEATHSIG, SIGKILL) < 0) {
1220 log_error("PR_SET_PDEATHSIG failed: %m");
1224 /* Mark everything as slave, so that we still
1225 * receive mounts from the real root, but don't
1226 * propagate mounts to the real root. */
1227 if (mount(NULL, "/", NULL, MS_SLAVE|MS_REC, NULL) < 0) {
1228 log_error("MS_SLAVE|MS_REC failed: %m");
1232 /* Turn directory into bind mount */
1233 if (mount(arg_directory, arg_directory, "bind", MS_BIND|MS_REC, NULL) < 0) {
1234 log_error("Failed to make bind mount.");
1239 if (mount(arg_directory, arg_directory, "bind", MS_BIND|MS_REMOUNT|MS_RDONLY|MS_REC, NULL) < 0) {
1240 log_error("Failed to make read-only.");
1244 if (mount_all(arg_directory) < 0)
1247 if (copy_devnodes(arg_directory) < 0)
1250 dev_setup(arg_directory);
1252 if (setup_dev_console(arg_directory, console) < 0)
1255 if (setup_kmsg(arg_directory, kmsg_socket_pair[1]) < 0)
1258 close_nointr_nofail(kmsg_socket_pair[1]);
1260 if (setup_boot_id(arg_directory) < 0)
1263 if (setup_timezone(arg_directory) < 0)
1266 if (setup_resolv_conf(arg_directory) < 0)
1269 if (setup_journal(arg_directory) < 0)
1272 if (chdir(arg_directory) < 0) {
1273 log_error("chdir(%s) failed: %m", arg_directory);
1277 if (mount(arg_directory, "/", NULL, MS_MOVE, NULL) < 0) {
1278 log_error("mount(MS_MOVE) failed: %m");
1282 if (chroot(".") < 0) {
1283 log_error("chroot() failed: %m");
1287 if (chdir("/") < 0) {
1288 log_error("chdir() failed: %m");
1296 if (drop_capabilities() < 0) {
1297 log_error("drop_capabilities() failed: %m");
1303 /* Note that this resolves user names
1304 * inside the container, and hence
1305 * accesses the NSS modules from the
1306 * container and not the host. This is
1309 if (get_user_creds((const char**)&arg_user, &uid, &gid, &home, NULL) < 0) {
1310 log_error("get_user_creds() failed: %m");
1314 if (mkdir_parents_label(home, 0775) < 0) {
1315 log_error("mkdir_parents_label() failed: %m");
1319 if (mkdir_safe_label(home, 0775, uid, gid) < 0) {
1320 log_error("mkdir_safe_label() failed: %m");
1324 if (initgroups((const char*)arg_user, gid) < 0) {
1325 log_error("initgroups() failed: %m");
1329 if (setresgid(gid, gid, gid) < 0) {
1330 log_error("setresgid() failed: %m");
1334 if (setresuid(uid, uid, uid) < 0) {
1335 log_error("setresuid() failed: %m");
1339 /* Reset everything fully to 0, just in case */
1341 if (setgroups(0, NULL) < 0) {
1342 log_error("setgroups() failed: %m");
1346 if (setresgid(0, 0, 0) < 0) {
1347 log_error("setresgid() failed: %m");
1351 if (setresuid(0, 0, 0) < 0) {
1352 log_error("setresuid() failed: %m");
1357 if ((asprintf((char**)(envp + 3), "HOME=%s", home ? home: "/root") < 0) ||
1358 (asprintf((char**)(envp + 4), "USER=%s", arg_user ? arg_user : "root") < 0) ||
1359 (asprintf((char**)(envp + 5), "LOGNAME=%s", arg_user ? arg_user : "root") < 0)) {
1365 if (asprintf((char**)(envp + 6), "container_uuid=%s", arg_uuid) < 0) {
1377 /* Automatically search for the init system */
1379 l = 1 + argc - optind;
1380 a = newa(char*, l + 1);
1381 memcpy(a + 1, argv + optind, l * sizeof(char*));
1383 a[0] = (char*) "/usr/lib/systemd/systemd";
1384 execve(a[0], a, (char**) envp);
1386 a[0] = (char*) "/lib/systemd/systemd";
1387 execve(a[0], a, (char**) envp);
1389 a[0] = (char*) "/sbin/init";
1390 execve(a[0], a, (char**) envp);
1391 } else if (argc > optind)
1392 execvpe(argv[optind], argv + optind, (char**) envp);
1394 chdir(home ? home : "/root");
1395 execle("/bin/bash", "-bash", NULL, (char**) envp);
1398 log_error("execv() failed: %m");
1401 _exit(EXIT_FAILURE);
1404 if (process_pty(master, &mask) < 0)
1408 if (saved_attr_valid)
1409 tcsetattr(STDIN_FILENO, TCSANOW, &saved_attr);
1411 r = wait_for_terminate(pid, &status);
1417 if (status.si_code == CLD_EXITED) {
1418 if (status.si_status != 0) {
1419 log_error("Container failed with error code %i.", status.si_status);
1420 r = status.si_status;
1424 log_debug("Container exited successfully.");
1426 } else if (status.si_code == CLD_KILLED &&
1427 status.si_status == SIGINT) {
1428 log_info("Container has been shut down.");
1431 } else if (status.si_code == CLD_KILLED &&
1432 status.si_status == SIGHUP) {
1433 log_info("Container is being rebooted.");
1435 } else if (status.si_code == CLD_KILLED ||
1436 status.si_code == CLD_DUMPED) {
1438 log_error("Container terminated by signal %s.", signal_to_string(status.si_status));
1442 log_error("Container failed due to unknown reason.");
1449 if (saved_attr_valid)
1450 tcsetattr(STDIN_FILENO, TCSANOW, &saved_attr);
1453 close_nointr_nofail(master);
1455 close_pipe(kmsg_socket_pair);
1458 cg_attach(SYSTEMD_CGROUP_CONTROLLER, oldcg, 0);
1461 cg_kill_recursive_and_wait(SYSTEMD_CGROUP_CONTROLLER, newcg, true);
1463 free(arg_directory);
1464 strv_free(arg_controllers);