1 /*-*- Mode: C; c-basic-offset: 8; indent-tabs-mode: nil -*-*/
4 This file is part of systemd.
6 Copyright 2010 Lennart Poettering
8 systemd is free software; you can redistribute it and/or modify it
9 under the terms of the GNU Lesser General Public License as published by
10 the Free Software Foundation; either version 2.1 of the License, or
11 (at your option) any later version.
13 systemd is distributed in the hope that it will be useful, but
14 WITHOUT ANY WARRANTY; without even the implied warranty of
15 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
16 Lesser General Public License for more details.
18 You should have received a copy of the GNU Lesser General Public License
19 along with systemd; If not, see <http://www.gnu.org/licenses/>.
25 #include <sys/types.h>
26 #include <sys/syscall.h>
27 #include <sys/mount.h>
33 #include <sys/prctl.h>
34 #include <sys/capability.h>
36 #include <sys/epoll.h>
38 #include <sys/signalfd.h>
42 #include <sys/socket.h>
44 #include <systemd/sd-daemon.h>
52 #include "cgroup-util.h"
54 #include "path-util.h"
55 #include "loopback-setup.h"
57 #include "dev-setup.h"
/* Journal linking mode chosen on the command line. The enumerators are not
 * visible in this view; LINK_NO, LINK_AUTO, LINK_GUEST and LINK_HOST are the
 * values referenced further down in this file. */
59 typedef enum LinkJournal {
/* Option state. Each variable starts at the default given here; parse_argv()
 * updates the ones whose assignments are visible in this file. */
/* Container root directory; set from the directory option via
 * canonicalize_file_name(), and main() can also fill it from
 * get_current_dir_name(). */
66 static char *arg_directory = NULL;
/* User name (or uid string) to run as inside the container; strdup() copy of
 * the user option argument. */
67 static char *arg_user = NULL;
/* String vector of extra cgroup controllers, split on commas from the
 * controllers option. */
68 static char **arg_controllers = NULL;
/* Exported to the payload as container_uuid in main(). Presumably set by the
 * uuid option; the assignment is not visible here. */
69 static char *arg_uuid = NULL;
/* When true, main() adds CLONE_NEWNET to the clone flags and
 * setup_resolv_conf() tests it before touching resolv.conf. */
70 static bool arg_private_network = false;
/* Presumably gates the read-only bind remount of the root in main() -- the
 * guarding condition is not visible, TODO confirm. */
71 static bool arg_read_only = false;
/* Presumably selects the init-system search in main() -- the guarding
 * condition is not visible, TODO confirm. */
72 static bool arg_boot = false;
/* Journal linking mode consumed by setup_journal(). */
73 static LinkJournal arg_link_journal = LINK_AUTO;
/* Bit mask (one bit per CAP_* number) of capabilities kept for the container.
 * drop_capabilities() hands the complement of this mask to
 * capability_bounding_set_drop(), and the capability option ORs in more bits. */
74 static uint64_t arg_retain =
76 (1ULL << CAP_DAC_OVERRIDE) |
77 (1ULL << CAP_DAC_READ_SEARCH) |
78 (1ULL << CAP_FOWNER) |
79 (1ULL << CAP_FSETID) |
80 (1ULL << CAP_IPC_OWNER) |
83 (1ULL << CAP_LINUX_IMMUTABLE) |
84 (1ULL << CAP_NET_BIND_SERVICE) |
85 (1ULL << CAP_NET_BROADCAST) |
86 (1ULL << CAP_NET_RAW) |
87 (1ULL << CAP_SETGID) |
88 (1ULL << CAP_SETFCAP) |
89 (1ULL << CAP_SETPCAP) |
90 (1ULL << CAP_SETUID) |
91 (1ULL << CAP_SYS_ADMIN) |
92 (1ULL << CAP_SYS_CHROOT) |
93 (1ULL << CAP_SYS_NICE) |
94 (1ULL << CAP_SYS_PTRACE) |
95 (1ULL << CAP_SYS_TTY_CONFIG) |
96 (1ULL << CAP_SYS_RESOURCE) |
97 (1ULL << CAP_SYS_BOOT);
/* Print the usage summary to stdout. The single format argument is
 * program_invocation_short_name, which fills the leading %s. */
/* NOTE(review): the -j line below advertises --link-journal=host, while the
 * assignment just before the link-journal case in parse_argv() sets
 * LINK_GUEST. If that assignment belongs to the -j case (its label is not
 * visible here), the help text and the code disagree -- confirm. */
99 static int help(void) {
101 printf("%s [OPTIONS...] [PATH] [ARGUMENTS...]\n\n"
102 "Spawn a minimal namespace container for debugging, testing and building.\n\n"
103 " -h --help Show this help\n"
104 " -D --directory=NAME Root directory for the container\n"
105 " -b --boot Boot up full system (i.e. invoke init)\n"
106 " -u --user=USER Run the command under specified user or uid\n"
107 " -C --controllers=LIST Put the container in specified comma-separated cgroup hierarchies\n"
108 " --uuid=UUID Set a specific machine UUID for the container\n"
109 " --private-network Disable network in container\n"
110 " --read-only Mount the root directory read-only\n"
111 " --capability=CAP In addition to the default, retain specified capability\n"
112 " --link-journal=MODE Link up guest journal, one of no, auto, guest, host\n"
113 " -j Equivalent to --link-journal=host\n",
114 program_invocation_short_name);
/* Parse the command line with getopt_long() and store the results in the
 * file-level arg_* variables.
 *
 * argc, argv: the vector received by main().
 *
 * Several case labels, break statements and return statements are not
 * visible in this view, so the comments below only describe the statements
 * that are shown. */
119 static int parse_argv(int argc, char *argv[]) {
/* Codes for long-only options start at 0x100. */
122 ARG_PRIVATE_NETWORK = 0x100,
/* Long option table; short-option equivalents reuse their character code. */
129 static const struct option options[] = {
130 { "help", no_argument, NULL, 'h' },
131 { "directory", required_argument, NULL, 'D' },
132 { "user", required_argument, NULL, 'u' },
133 { "controllers", required_argument, NULL, 'C' },
134 { "private-network", no_argument, NULL, ARG_PRIVATE_NETWORK },
135 { "boot", no_argument, NULL, 'b' },
136 { "uuid", required_argument, NULL, ARG_UUID },
137 { "read-only", no_argument, NULL, ARG_READ_ONLY },
138 { "capability", required_argument, NULL, ARG_CAPABILITY },
139 { "link-journal", required_argument, NULL, ARG_LINK_JOURNAL },
148 while ((c = getopt_long(argc, argv, "+hD:u:C:bj", options, NULL)) >= 0) {
/* Directory option: resolve the argument to a canonical path. */
158 arg_directory = canonicalize_file_name(optarg);
159 if (!arg_directory) {
160 log_error("Failed to canonicalize root directory.");
/* User option: keep a private copy of the argument. */
168 if (!(arg_user = strdup(optarg))) {
169 log_error("Failed to duplicate user name.");
/* Controllers option: drop any earlier list, split the new one on commas
 * and run strv_uniq() over it. */
176 strv_free(arg_controllers);
177 arg_controllers = strv_split(optarg, ",");
178 if (!arg_controllers) {
179 log_error("Failed to split controllers list.");
182 strv_uniq(arg_controllers);
186 case ARG_PRIVATE_NETWORK:
187 arg_private_network = true;
199 arg_read_only = true;
/* Capability option: each comma-separated word is looked up with
 * cap_from_name() and its bit is added to arg_retain. */
202 case ARG_CAPABILITY: {
206 FOREACH_WORD_SEPARATOR(word, length, optarg, ",", state) {
210 t = strndup(word, length);
214 if (cap_from_name(t, &cap) < 0) {
215 log_error("Failed to parse capability %s.", t);
221 arg_retain |= 1ULL << (uint64_t) cap;
/* NOTE(review): the case label for this assignment is not visible;
 * presumably it is the -j short option -- confirm. */
228 arg_link_journal = LINK_GUEST;
/* Link-journal option: accepts exactly auto, no, guest or host. */
231 case ARG_LINK_JOURNAL:
232 if (streq(optarg, "auto"))
233 arg_link_journal = LINK_AUTO;
234 else if (streq(optarg, "no"))
235 arg_link_journal = LINK_NO;
236 else if (streq(optarg, "guest"))
237 arg_link_journal = LINK_GUEST;
238 else if (streq(optarg, "host"))
239 arg_link_journal = LINK_HOST;
241 log_error("Failed to parse link journal mode %s", optarg);
251 log_error("Unknown option code %c", c);
/* Walk a fixed table of mounts and apply each one below the container root.
 *
 * dest: container root directory; every table target is appended to it.
 *
 * The MountPoint field declarations are not visible here. The code reads the
 * fields what, where, flags, options and fatal; the third column of each
 * table row looks like the file system type -- TODO confirm. */
259 static int mount_all(const char *dest) {
261 typedef struct MountPoint {
270 static const MountPoint mount_table[] = {
271 { "proc", "/proc", "proc", NULL, MS_NOSUID|MS_NOEXEC|MS_NODEV, true },
272 { "/proc/sys", "/proc/sys", NULL, NULL, MS_BIND, true }, /* Bind mount first */
273 { NULL, "/proc/sys", NULL, NULL, MS_BIND|MS_RDONLY|MS_REMOUNT, true }, /* Then, make it r/o */
274 { "sysfs", "/sys", "sysfs", NULL, MS_RDONLY|MS_NOSUID|MS_NOEXEC|MS_NODEV, true },
275 { "tmpfs", "/dev", "tmpfs", "mode=755", MS_NOSUID|MS_STRICTATIME, true },
276 { "/dev/pts", "/dev/pts", NULL, NULL, MS_BIND, true },
277 { "tmpfs", "/dev/shm", "tmpfs", "mode=1777", MS_NOSUID|MS_NODEV|MS_STRICTATIME, true },
278 { "tmpfs", "/run", "tmpfs", "mode=755", MS_NOSUID|MS_NODEV|MS_STRICTATIME, true },
280 { "/sys/fs/selinux", "/sys/fs/selinux", NULL, NULL, MS_BIND, false }, /* Bind mount first */
281 { NULL, "/sys/fs/selinux", NULL, NULL, MS_BIND|MS_RDONLY|MS_REMOUNT, false }, /* Then, make it r/o */
288 for (k = 0; k < ELEMENTSOF(mount_table); k++) {
/* where is released automatically at the end of each iteration
 * through the _cleanup_free_ attribute. */
289 char _cleanup_free_ *where = NULL;
/* Target path is the container root joined with the table entry. */
292 if (asprintf(&where, "%s/%s", dest, mount_table[k].where) < 0) {
301 t = path_is_mount_point(where, true);
303 log_error("Failed to detect whether %s is a mount point: %s", where, strerror(-t));
/* Rows with a NULL source are the remount rows of the table, so only
 * rows that have a source are tested against an existing mount point. */
311 /* Skip this entry if it is not a remount. */
312 if (mount_table[k].what && t > 0)
/* The result of mkdir_p_label() is not checked. */
315 mkdir_p_label(where, 0755);
/* A failing mount() is only reported when the row is marked fatal. */
317 if (mount(mount_table[k].what,
320 mount_table[k].flags,
321 mount_table[k].options) < 0 &&
322 mount_table[k].fatal) {
324 log_error("mount(%s) failed: %m", where);
/* Point the etc/localtime symlink of the container at the same zoneinfo entry
 * that the host etc/localtime symlink uses.
 *
 * dest: container root directory.
 *
 * Every string below is declared with _cleanup_free_, so it is released
 * automatically when the function returns. */
334 static int setup_timezone(const char *dest) {
335 _cleanup_free_ char *where = NULL, *p = NULL, *q = NULL, *check = NULL, *what = NULL;
341 /* Fix the timezone, if possible */
/* p receives the target of the host symlink. */
342 r = readlink_malloc("/etc/localtime", &p);
344 log_warning("/etc/localtime is not a symlink, not updating container timezone.");
/* z is the part of the host link target after the zoneinfo prefix; both
 * the relative and the absolute prefix are tried. */
348 z = path_startswith(p, "../usr/share/zoneinfo/");
350 z = path_startswith(p, "/usr/share/zoneinfo/");
352 log_warning("/etc/localtime does not point into /usr/share/zoneinfo/, not updating container timezone.");
/* where is the localtime path inside the container; q and y are the
 * container-side counterparts of p and z. */
356 where = strappend(dest, "/etc/localtime");
360 r = readlink_malloc(where, &q);
362 y = path_startswith(q, "../usr/share/zoneinfo/");
364 y = path_startswith(q, "/usr/share/zoneinfo/");
367 /* Already pointing to the right place? Then do nothing .. */
368 if (y && streq(y, z))
/* The zone file has to exist inside the container before relinking. */
372 check = strjoin(dest, "/usr/share/zoneinfo/", z, NULL);
376 if (access(check, F_OK) < 0) {
377 log_warning("Timezone %s does not exist in container, not updating container timezone.", z);
/* The new link target always uses the relative prefix. */
381 what = strappend("../usr/share/zoneinfo/", z);
386 if (symlink(what, where) < 0) {
387 log_error("Failed to correct timezone of container: %m");
/* Make the host resolv.conf visible inside the container through a bind
 * mount, then remount that bind mount read-only.
 *
 * dest: container root directory. */
394 static int setup_resolv_conf(const char *dest) {
/* arg_private_network is tested first; the statement it guards is not
 * visible here (presumably an early return). */
399 if (arg_private_network)
402 /* Fix resolv.conf, if possible */
403 where = strappend(dest, "/etc/resolv.conf");
/* The read-only remount is only attempted when the bind mount worked, and
 * its own result is not checked. */
407 if (mount("/etc/resolv.conf", where, "bind", MS_BIND, NULL) >= 0)
408 mount("/etc/resolv.conf", where, "bind", MS_BIND|MS_REMOUNT|MS_RDONLY, NULL);
/* Give the container its own boot ID: write a random 128 bit ID formatted as
 * a UUID string into a file below dev in the container, then bind mount that
 * file over proc/sys/kernel/random/boot_id in the container.
 *
 * dest: container root directory. */
415 static int setup_boot_id(const char *dest) {
416 char _cleanup_free_ *from = NULL, *to = NULL;
423 /* Generate a new randomized boot ID, so that each boot-up of
424 * the container gets a new one */
/* from is the backing file, to is the mount target. */
426 from = strappend(dest, "/dev/proc-sys-kernel-random-boot-id");
427 to = strappend(dest, "/proc/sys/kernel/random/boot_id");
431 r = sd_id128_randomize(&rnd);
433 log_error("Failed to generate random boot id: %s", strerror(-r));
/* Format the 16 bytes in 8-4-4-4-12 hex digit groups. */
437 snprintf(as_uuid, sizeof(as_uuid),
438 "%02x%02x%02x%02x-%02x%02x-%02x%02x-%02x%02x-%02x%02x%02x%02x%02x%02x",
439 SD_ID128_FORMAT_VAL(rnd));
440 char_array_0(as_uuid);
442 r = write_one_line_file(from, as_uuid);
444 log_error("Failed to write boot id: %s", strerror(-r));
448 if (mount(from, to, "bind", MS_BIND, NULL) < 0) {
449 log_error("Failed to bind mount boot id: %m");
/* Best effort: the result of the read-only remount is not checked. */
452 mount(from, to, "bind", MS_BIND|MS_REMOUNT|MS_RDONLY, NULL);
/* Recreate a fixed list of host device nodes below dev in the container.
 *
 * dest: container root directory.
 *
 * The entries of the devnodes list are not visible in this view; it is a
 * NUL-separated string list iterated with NULSTR_FOREACH(). */
458 static int copy_devnodes(const char *dest) {
460 static const char devnodes[] =
/* u is declared with _cleanup_umask_; its assignment is not visible here. */
471 mode_t _cleanup_umask_ u;
477 NULSTR_FOREACH(d, devnodes) {
479 char _cleanup_free_ *from = NULL, *to = NULL;
/* from is the host node, to is the node to create in the container.
 * NOTE(review): the asprintf() results are not checked on these two
 * lines; a later check may exist on lines that are not visible. */
481 asprintf(&from, "/dev/%s", d);
482 asprintf(&to, "%s/dev/%s", dest, d);
/* A stat() failure is only reported when it is not ENOENT. */
493 if (stat(from, &st) < 0) {
495 if (errno != ENOENT) {
496 log_error("Failed to stat %s: %m", from);
501 } else if (!S_ISCHR(st.st_mode) && !S_ISBLK(st.st_mode)) {
503 log_error("%s is not a char or block device, cannot copy", from);
/* The new node reuses the mode and device number of the host node. */
507 } else if (mknod(to, st.st_mode, st.st_rdev) < 0) {
/* NOTE(review): the message prints dest although the failing call
 * used to -- confirm which path was meant. */
509 log_error("mknod(%s) failed: %m", dest);
/* Make the given tty available as dev/console inside the container.
 *
 * dest:    container root directory.
 * console: path of the tty to expose; it must be a character device.
 *
 * Steps shown below: stat the tty, set it to mode 0600 with owner and group 0,
 * create a device node at dest/dev/console carrying the same file type bits
 * and device number with permission bits 0600, then bind mount the tty over
 * that node. */
518 static int setup_dev_console(const char *dest, const char *console) {
520 char _cleanup_free_ *to = NULL;
522 mode_t _cleanup_umask_ u;
529 if (stat(console, &st) < 0) {
530 log_error("Failed to stat %s: %m", console);
533 } else if (!S_ISCHR(st.st_mode)) {
534 log_error("/dev/console is not a char device");
538 r = chmod_and_chown(console, 0600, 0, 0);
540 log_error("Failed to correct access mode for TTY: %s", strerror(-r));
544 if (asprintf(&to, "%s/dev/console", dest) < 0)
/* NOTE(review): the closing line of the comment below is not visible in this
 * view, so it reads as unterminated here -- confirm against the full file. */
547 /* We need to bind mount the right tty to /dev/console since
548 * ptys can only exist on pts file systems. To have something
549 * to bind mount things on we create a device node first, that
550 * has the right major/minor (note that the major minor
551 * doesn't actually matter here, since we mount it over
554 if (mknod(to, (st.st_mode & ~07777) | 0600, st.st_rdev) < 0) {
555 log_error("mknod() for /dev/console failed: %m");
559 if (mount(console, to, "bind", MS_BIND, NULL) < 0) {
560 log_error("Bind mount for /dev/console failed: %m");
567 static int setup_kmsg(const char *dest, int kmsg_socket) {
568 char _cleanup_free_ *from = NULL, *to = NULL;
570 mode_t _cleanup_umask_ u;
572 struct cmsghdr cmsghdr;
573 uint8_t buf[CMSG_SPACE(sizeof(int))];
576 struct cmsghdr *cmsg;
579 assert(kmsg_socket >= 0);
583 /* We create the kmsg FIFO as /dev/kmsg, but immediately
584 * delete it after bind mounting it to /proc/kmsg. While FIFOs
585 * on the reading side behave very similar to /proc/kmsg,
586 * their writing side behaves differently from /dev/kmsg in
587 * that writing blocks when nothing is reading. In order to
588 * avoid any problems with containers deadlocking due to this
589 * we simply make /dev/kmsg unavailable to the container. */
/* Summary of setup_kmsg(dest, kmsg_socket):
 *   dest:        container root directory.
 *   kmsg_socket: socket (asserted to be >= 0) that receives the FIFO file
 *                descriptor as an SCM_RIGHTS control message.
 * The function creates a FIFO at dest/dev/kmsg, bind mounts it onto
 * dest/proc/kmsg, opens it and passes the descriptor over the socket. */
/* from is the FIFO path, to is the bind mount target. */
590 if (asprintf(&from, "%s/dev/kmsg", dest) < 0 ||
591 asprintf(&to, "%s/proc/kmsg", dest) < 0)
594 if (mkfifo(from, 0600) < 0) {
595 log_error("mkfifo() for /dev/kmsg failed: %m");
599 r = chmod_and_chown(from, 0600, 0, 0);
601 log_error("Failed to correct access mode for /dev/kmsg: %s", strerror(-r));
605 if (mount(from, to, "bind", MS_BIND, NULL) < 0) {
606 log_error("Bind mount for /proc/kmsg failed: %m");
/* Opened read-write and non-blocking. */
610 fd = open(from, O_RDWR|O_NDELAY|O_CLOEXEC);
612 log_error("Failed to open fifo: %m");
/* Build one control message of type SCM_RIGHTS that carries fd. */
619 mh.msg_control = &control;
620 mh.msg_controllen = sizeof(control);
622 cmsg = CMSG_FIRSTHDR(&mh);
623 cmsg->cmsg_level = SOL_SOCKET;
624 cmsg->cmsg_type = SCM_RIGHTS;
625 cmsg->cmsg_len = CMSG_LEN(sizeof(int));
626 memcpy(CMSG_DATA(cmsg), &fd, sizeof(int));
/* Shrink the control length to the single message actually filled in. */
628 mh.msg_controllen = cmsg->cmsg_len;
630 /* Store away the fd in the socket, so that it stays open as
631 * long as we run the child */
632 k = sendmsg(kmsg_socket, &mh, MSG_DONTWAIT|MSG_NOSIGNAL);
/* The local descriptor is closed right after sendmsg(), before the
 * result k is reported. */
633 close_nointr_nofail(fd);
636 log_error("Failed to send FIFO fd: %m");
640 /* And now make the FIFO unavailable as /dev/kmsg... */
/* Derive a host name from the last path component of arg_directory and set it
 * with sethostname(). The name is passed through hostname_cleanup() first. */
645 static int setup_hostname(void) {
649 hn = path_get_file_name(arg_directory);
655 hostname_cleanup(hn);
658 if (sethostname(hn, strlen(hn)) < 0)
/* Connect the journal directory of the container with the host according to
 * arg_link_journal.
 *
 * directory: container root directory.
 *
 * Paths used below: p is first the machine-id file of the container and is
 * later reassigned to the per-machine journal directory on the host; q is the
 * same journal directory inside the container. In LINK_GUEST mode p becomes a
 * symlink to q; the bind mount of p onto q at the end covers the remaining
 * modes (the exact branching between the two is only partly visible here). */
667 static int setup_journal(const char *directory) {
668 sd_id128_t machine_id;
669 char *p = NULL, *b = NULL, *l, *q = NULL, *d = NULL;
/* LINK_NO is tested first; the guarded statement is not visible here. */
672 if (arg_link_journal == LINK_NO)
675 p = strappend(directory, "/etc/machine-id");
/* b receives the first line of the machine-id file. */
681 r = read_one_line_file(p, &b);
682 if (r == -ENOENT && arg_link_journal == LINK_AUTO) {
686 log_error("Failed to read machine ID: %s", strerror(-r));
/* l presumably points into b; its assignment is not visible -- TODO confirm. */
691 if (isempty(l) && arg_link_journal == LINK_AUTO) {
696 /* Verify validity */
697 r = sd_id128_from_string(l, &machine_id);
699 log_error("Failed to parse machine ID: %s", strerror(-r));
704 p = strappend("/var/log/journal/", l);
705 q = strjoin(directory, "/var/log/journal/", l, NULL);
/* An existing mount point on either side is an error unless the mode is
 * LINK_AUTO. */
711 if (path_is_mount_point(p, false) > 0 ||
712 path_is_mount_point(q, false) > 0) {
713 if (arg_link_journal != LINK_AUTO) {
714 log_error("Journal already a mount point, refusing.");
/* Inspect what currently exists at the host path p: d receives the link
 * target when p is a symlink. */
723 r = readlink_and_make_absolute(p, &d);
725 if ((arg_link_journal == LINK_GUEST ||
726 arg_link_journal == LINK_AUTO) &&
736 log_error("Failed to remove symlink %s: %m", p);
/* -EINVAL from the readlink helper is handled separately from -ENOENT. */
740 } else if (r == -EINVAL) {
742 if (arg_link_journal == LINK_GUEST &&
745 if (errno == ENOTDIR)
746 log_error("%s already exists and is neither symlink nor directory.", p);
748 log_error("Failed to remove %s: %m", p);
754 } else if (r != -ENOENT) {
755 log_error("readlink(%s) failed: %m", p);
/* Guest mode: the host path p becomes a symlink to the container path q. */
759 if (arg_link_journal == LINK_GUEST) {
761 if (symlink(q, p) < 0) {
762 log_error("Failed to symlink %s to %s: %m", q, p);
/* Host mode: make sure the host directory exists. */
773 if (arg_link_journal == LINK_HOST) {
774 r = mkdir_p(p, 0755);
776 log_error("Failed to create %s: %m", p);
780 } else if (access(p, F_OK) < 0) {
/* dir_is_empty() returning 0 is reported as the guest directory not being
 * empty. */
785 if (dir_is_empty(q) == 0) {
786 log_error("%s not empty.", q);
791 r = mkdir_p(q, 0755);
793 log_error("Failed to create %s: %m", q);
/* Bind mount the host journal directory over the container one. */
797 if (mount(p, q, "bind", MS_BIND, NULL) < 0) {
798 log_error("Failed to bind mount journal from host into guest: %m");
/* Drop every capability whose bit is not set in arg_retain by passing the
 * complement of the mask to capability_bounding_set_drop(); returns the
 * result of that call. */
814 static int drop_capabilities(void) {
815 return capability_bounding_set_drop(~arg_retain, false);
/* Heuristic check whether path looks like an OS root: builds the path of
 * bin/sh below it and reports 1 or 0 depending on a check whose call is not
 * visible in this view.
 *
 * path: directory to test. */
818 static int is_os_tree(const char *path) {
821 /* We use /bin/sh as flag file if something is an OS */
823 if (asprintf(&p, "%s/bin/sh", path) < 0)
/* A negative r maps to 0, anything else to 1. */
829 return r < 0 ? 0 : 1;
/* Relay data between the calling terminal and the pty master of the container
 * and react to signals delivered through a signalfd.
 *
 * master: file descriptor of the pty master.
 * mask:   signal set handed to signalfd().
 *
 * Data flow: stdin is read into in_buffer and written to master; master is
 * read into out_buffer and written to stdout. Each buffer holds at most
 * LINE_MAX bytes. */
832 static int process_pty(int master, sigset_t *mask) {
834 char in_buffer[LINE_MAX], out_buffer[LINE_MAX];
835 size_t in_buffer_full = 0, out_buffer_full = 0;
836 struct epoll_event stdin_ev, stdout_ev, master_ev, signal_ev;
/* Readiness flags: set when epoll reports an event and cleared again when a
 * read or write on that descriptor fails with one of the errno values
 * checked in the transfer loop. */
837 bool stdin_readable = false, stdout_writable = false, master_readable = false, master_writable = false;
838 int ep = -1, signal_fd = -1, r;
/* All three data descriptors are switched to non-blocking mode. */
840 fd_nonblock(STDIN_FILENO, 1);
841 fd_nonblock(STDOUT_FILENO, 1);
842 fd_nonblock(master, 1);
844 signal_fd = signalfd(-1, mask, SFD_NONBLOCK|SFD_CLOEXEC);
846 log_error("signalfd(): %m");
851 ep = epoll_create1(EPOLL_CLOEXEC);
853 log_error("Failed to create epoll: %m");
/* The data descriptors are registered with EPOLLET; the signal descriptor
 * is registered without it. */
859 stdin_ev.events = EPOLLIN|EPOLLET;
860 stdin_ev.data.fd = STDIN_FILENO;
863 stdout_ev.events = EPOLLOUT|EPOLLET;
864 stdout_ev.data.fd = STDOUT_FILENO;
867 master_ev.events = EPOLLIN|EPOLLOUT|EPOLLET;
868 master_ev.data.fd = master;
871 signal_ev.events = EPOLLIN;
872 signal_ev.data.fd = signal_fd;
874 if (epoll_ctl(ep, EPOLL_CTL_ADD, STDIN_FILENO, &stdin_ev) < 0 ||
875 epoll_ctl(ep, EPOLL_CTL_ADD, STDOUT_FILENO, &stdout_ev) < 0 ||
876 epoll_ctl(ep, EPOLL_CTL_ADD, master, &master_ev) < 0 ||
877 epoll_ctl(ep, EPOLL_CTL_ADD, signal_fd, &signal_ev) < 0) {
/* NOTE(review): the word regiser in this message looks like a typo for
 * register; left untouched because it is a runtime string. */
878 log_error("Failed to regiser fds in epoll: %m");
/* Up to 16 events are fetched per epoll_wait() call, with no timeout. */
884 struct epoll_event ev[16];
888 nfds = epoll_wait(ep, ev, ELEMENTSOF(ev), -1);
/* EINTR and EAGAIN are not treated as failures. */
891 if (errno == EINTR || errno == EAGAIN)
894 log_error("epoll_wait(): %m");
/* Translate the reported events into readiness flags; EPOLLHUP counts as
 * ready in every direction. */
901 for (i = 0; i < nfds; i++) {
902 if (ev[i].data.fd == STDIN_FILENO) {
904 if (ev[i].events & (EPOLLIN|EPOLLHUP))
905 stdin_readable = true;
907 } else if (ev[i].data.fd == STDOUT_FILENO) {
909 if (ev[i].events & (EPOLLOUT|EPOLLHUP))
910 stdout_writable = true;
912 } else if (ev[i].data.fd == master) {
914 if (ev[i].events & (EPOLLIN|EPOLLHUP))
915 master_readable = true;
917 if (ev[i].events & (EPOLLOUT|EPOLLHUP))
918 master_writable = true;
920 } else if (ev[i].data.fd == signal_fd) {
921 struct signalfd_siginfo sfsi;
/* One signalfd_siginfo record is read per event. */
924 n = read(signal_fd, &sfsi, sizeof(sfsi));
925 if (n != sizeof(sfsi)) {
928 log_error("Failed to read from signalfd: invalid block size");
933 if (errno != EINTR && errno != EAGAIN) {
934 log_error("Failed to read from signalfd: %m");
940 if (sfsi.ssi_signo == SIGWINCH) {
943 /* The window size changed, let's forward that. */
944 if (ioctl(STDIN_FILENO, TIOCGWINSZ, &ws) >= 0)
945 ioctl(master, TIOCSWINSZ, &ws);
/* Transfer loop: keep going while any direction can make progress, that
 * is a readable source with an empty buffer or a writable sink with a
 * non-empty buffer. */
954 while ((stdin_readable && in_buffer_full <= 0) ||
955 (master_writable && in_buffer_full > 0) ||
956 (master_readable && out_buffer_full <= 0) ||
957 (stdout_writable && out_buffer_full > 0)) {
/* stdin to in_buffer: append after the bytes already buffered. */
959 if (stdin_readable && in_buffer_full < LINE_MAX) {
961 k = read(STDIN_FILENO, in_buffer + in_buffer_full, LINE_MAX - in_buffer_full);
964 if (errno == EAGAIN || errno == EPIPE || errno == ECONNRESET || errno == EIO)
965 stdin_readable = false;
967 log_error("read(): %m");
972 in_buffer_full += (size_t) k;
/* in_buffer to master: after a partial write the unwritten tail is
 * moved to the front of the buffer. */
975 if (master_writable && in_buffer_full > 0) {
977 k = write(master, in_buffer, in_buffer_full);
980 if (errno == EAGAIN || errno == EPIPE || errno == ECONNRESET || errno == EIO)
981 master_writable = false;
983 log_error("write(): %m");
989 assert(in_buffer_full >= (size_t) k);
990 memmove(in_buffer, in_buffer + k, in_buffer_full - k);
/* master to out_buffer. */
995 if (master_readable && out_buffer_full < LINE_MAX) {
997 k = read(master, out_buffer + out_buffer_full, LINE_MAX - out_buffer_full);
1000 if (errno == EAGAIN || errno == EPIPE || errno == ECONNRESET || errno == EIO)
1001 master_readable = false;
1003 log_error("read(): %m");
1008 out_buffer_full += (size_t) k;
/* out_buffer to stdout, with the same compaction as above. */
1011 if (stdout_writable && out_buffer_full > 0) {
1013 k = write(STDOUT_FILENO, out_buffer, out_buffer_full);
1016 if (errno == EAGAIN || errno == EPIPE || errno == ECONNRESET || errno == EIO)
1017 stdout_writable = false;
1019 log_error("write(): %m");
1025 assert(out_buffer_full >= (size_t) k);
1026 memmove(out_buffer, out_buffer + k, out_buffer_full - k);
1027 out_buffer_full -= k;
/* Cleanup: close the epoll and signal descriptors created above. */
1035 close_nointr_nofail(ep);
1038 close_nointr_nofail(signal_fd);
/* Program entry point.
 *
 * Visible flow: parse options, validate the container directory and the
 * environment, create a cgroup for the container, allocate a pty, switch the
 * terminal to raw mode, clone a child into new namespaces, let the child
 * prepare the container file system and exec the payload, relay the pty in
 * the parent, and finally report how the container ended.
 *
 * Many braces, goto statements and error checks are not visible in this view;
 * comments below describe only the statements that are shown. */
1043 int main(int argc, char *argv[]) {
/* r holds the process exit status and starts out as failure. */
1045 int r = EXIT_FAILURE, k;
1046 char *oldcg = NULL, *newcg = NULL;
1047 char **controller = NULL;
1049 const char *console = NULL;
1050 struct termios saved_attr, raw_attr;
/* True once saved_attr holds the original terminal settings, so that they
 * are only restored when they were actually read. */
1052 bool saved_attr_valid = false;
1054 int kmsg_socket_pair[2] = { -1, -1 };
1056 log_parse_environment();
1059 r = parse_argv(argc, argv);
/* Make the container directory absolute, or use the current directory. */
1063 if (arg_directory) {
1066 p = path_make_absolute_cwd(arg_directory);
1067 free(arg_directory);
1070 arg_directory = get_current_dir_name();
1072 if (!arg_directory) {
1073 log_error("Failed to determine path");
1077 path_kill_slashes(arg_directory);
/* Preconditions: effective uid 0, sd_booted() positive, directory is not
 * the host root and passes is_os_tree(). */
1079 if (geteuid() != 0) {
1080 log_error("Need to be root.");
1084 if (sd_booted() <= 0) {
1085 log_error("Not running on a systemd system.");
1089 if (path_equal(arg_directory, "/")) {
1090 log_error("Spawning container on root directory not supported.");
1094 if (is_os_tree(arg_directory) <= 0) {
1095 log_error("Directory %s doesn't look like an OS root directory. Refusing.", arg_directory);
/* Create a child cgroup named nspawn-PID below the current cgroup and
 * attach to it. */
1099 k = cg_get_by_pid(SYSTEMD_CGROUP_CONTROLLER, 0, &oldcg);
1101 log_error("Failed to determine current cgroup: %s", strerror(-k));
1105 if (asprintf(&newcg, "%s/nspawn-%lu", oldcg, (unsigned long) getpid()) < 0) {
1106 log_error("Failed to allocate cgroup path.");
1110 k = cg_create_and_attach(SYSTEMD_CGROUP_CONTROLLER, newcg, 0);
1112 log_error("Failed to create cgroup: %s", strerror(-k));
/* Extra controllers from the command line only produce a warning when
 * they fail. */
1116 STRV_FOREACH(controller, arg_controllers) {
1117 k = cg_create_and_attach(*controller, newcg, 0);
1119 log_warning("Failed to create cgroup in controller %s: %s", *controller, strerror(-k));
/* Allocate the pty; console is the name of its slave side. */
1122 master = posix_openpt(O_RDWR|O_NOCTTY|O_CLOEXEC|O_NDELAY);
1124 log_error("Failed to acquire pseudo tty: %m");
1128 console = ptsname(master);
1130 log_error("Failed to determine tty name: %m");
1134 log_info("Spawning namespace container on %s (console is %s).", arg_directory, console);
/* Copy the current window size to the pty when it can be queried. */
1136 if (ioctl(STDIN_FILENO, TIOCGWINSZ, &ws) >= 0)
1137 ioctl(master, TIOCSWINSZ, &ws);
1139 if (unlockpt(master) < 0) {
1140 log_error("Failed to unlock tty: %m");
/* Save the terminal settings and derive a raw variant with echo off. */
1144 if (tcgetattr(STDIN_FILENO, &saved_attr) < 0) {
1145 log_error("Failed to get terminal attributes: %m");
1149 saved_attr_valid = true;
1151 raw_attr = saved_attr;
1152 cfmakeraw(&raw_attr);
1153 raw_attr.c_lflag &= ~ECHO;
/* Socket pair used by setup_kmsg() in the child to pass a file descriptor. */
1155 if (socketpair(AF_UNIX, SOCK_DGRAM|SOCK_NONBLOCK|SOCK_CLOEXEC, 0, kmsg_socket_pair) < 0) {
1156 log_error("Failed to create kmsg socket pair");
/* Block the signals that process_pty() later reads through its signalfd. */
1160 assert_se(sigemptyset(&mask) == 0);
1161 sigset_add_many(&mask, SIGCHLD, SIGWINCH, SIGTERM, SIGINT, -1);
1162 assert_se(sigprocmask(SIG_BLOCK, &mask, NULL) == 0);
1167 if (tcsetattr(STDIN_FILENO, TCSANOW, &raw_attr) < 0) {
1168 log_error("Failed to set terminal attributes: %m");
/* Raw clone syscall with new IPC, mount, PID and UTS namespaces; a network
 * namespace is added when arg_private_network is set. */
1172 pid = syscall(__NR_clone, SIGCHLD|CLONE_NEWIPC|CLONE_NEWNS|CLONE_NEWPID|CLONE_NEWUTS|(arg_private_network ? CLONE_NEWNET : 0), NULL);
1174 if (errno == EINVAL)
1175 log_error("clone() failed, do you have namespace support enabled in your kernel? (You need UTS, IPC, PID and NET namespacing built in): %m");
1177 log_error("clone() failed: %m");
/* Child side starts here (the enclosing condition is not visible). */
1185 const char *home = NULL;
1186 uid_t uid = (uid_t) -1;
1187 gid_t gid = (gid_t) -1;
/* Environment of the payload. Slots are addressed by index below: 2 is
 * TERM, 3 HOME, 4 USER, 5 LOGNAME, 6 container_uuid. */
1188 const char *envp[] = {
1189 "PATH=/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin",
1190 "container=systemd-nspawn", /* LXC sets container=lxc, so follow the scheme here */
1195 NULL, /* container_uuid */
1199 envp[2] = strv_find_prefix(environ, "TERM=");
/* Drop the descriptors inherited from the parent; only the child end of
 * the kmsg socket pair is kept by close_all_fds(). */
1201 close_nointr_nofail(master);
1203 close_nointr(STDIN_FILENO);
1204 close_nointr(STDOUT_FILENO);
1205 close_nointr(STDERR_FILENO);
1207 close_all_fds(&kmsg_socket_pair[1], 1);
1209 reset_all_signal_handlers();
/* Start the payload with an empty signal mask. */
1211 assert_se(sigemptyset(&mask) == 0);
1212 assert_se(sigprocmask(SIG_SETMASK, &mask, NULL) == 0);
/* Reopen the pty slave as stdin and duplicate it to stdout and stderr. */
1214 if (open_terminal(console, O_RDWR) != STDIN_FILENO ||
1215 dup2(STDIN_FILENO, STDOUT_FILENO) != STDOUT_FILENO ||
1216 dup2(STDIN_FILENO, STDERR_FILENO) != STDERR_FILENO)
1220 log_error("setsid() failed: %m");
/* Ask the kernel to send SIGKILL to the child when the parent dies. */
1224 if (prctl(PR_SET_PDEATHSIG, SIGKILL) < 0) {
1225 log_error("PR_SET_PDEATHSIG failed: %m");
1229 /* Mark everything as slave, so that we still
1230 * receive mounts from the real root, but don't
1231 * propagate mounts to the real root. */
1232 if (mount(NULL, "/", NULL, MS_SLAVE|MS_REC, NULL) < 0) {
1233 log_error("MS_SLAVE|MS_REC failed: %m");
1237 /* Turn directory into bind mount */
1238 if (mount(arg_directory, arg_directory, "bind", MS_BIND|MS_REC, NULL) < 0) {
1239 log_error("Failed to make bind mount.");
/* Read-only remount of the container root; its guarding condition is
 * not visible here (presumably arg_read_only). */
1244 if (mount(arg_directory, arg_directory, "bind", MS_BIND|MS_REMOUNT|MS_RDONLY|MS_REC, NULL) < 0) {
1245 log_error("Failed to make read-only.");
/* Populate the container file system through the setup helpers, in
 * this order. */
1249 if (mount_all(arg_directory) < 0)
1252 if (copy_devnodes(arg_directory) < 0)
1255 dev_setup(arg_directory);
1257 if (setup_dev_console(arg_directory, console) < 0)
1260 if (setup_kmsg(arg_directory, kmsg_socket_pair[1]) < 0)
1263 close_nointr_nofail(kmsg_socket_pair[1]);
1265 if (setup_boot_id(arg_directory) < 0)
1268 if (setup_timezone(arg_directory) < 0)
1271 if (setup_resolv_conf(arg_directory) < 0)
1274 if (setup_journal(arg_directory) < 0)
/* Switch the root: enter the directory, move its mount to the root
 * path, chroot into it and reset the working directory. */
1277 if (chdir(arg_directory) < 0) {
1278 log_error("chdir(%s) failed: %m", arg_directory);
1282 if (mount(arg_directory, "/", NULL, MS_MOVE, NULL) < 0) {
1283 log_error("mount(MS_MOVE) failed: %m");
1287 if (chroot(".") < 0) {
1288 log_error("chroot() failed: %m");
1292 if (chdir("/") < 0) {
1293 log_error("chdir() failed: %m");
1301 if (drop_capabilities() < 0) {
1302 log_error("drop_capabilities() failed: %m");
/* Resolve the requested user, prepare its home directory and switch
 * groups before switching the uid. The guarding condition for this
 * section is not visible (presumably arg_user being set). */
1308 if (get_user_creds((const char**)&arg_user, &uid, &gid, &home, NULL) < 0) {
1309 log_error("get_user_creds() failed: %m");
1313 if (mkdir_parents_label(home, 0775) < 0) {
1314 log_error("mkdir_parents_label() failed: %m");
1318 if (mkdir_safe_label(home, 0775, uid, gid) < 0) {
1319 log_error("mkdir_safe_label() failed: %m");
1323 if (initgroups((const char*)arg_user, gid) < 0) {
1324 log_error("initgroups() failed: %m");
/* NOTE(review): the two messages below name setregid and setreuid while
 * the calls are setresgid and setresuid. */
1328 if (setresgid(gid, gid, gid) < 0) {
1329 log_error("setregid() failed: %m");
1333 if (setresuid(uid, uid, uid) < 0) {
1334 log_error("setreuid() failed: %m");
/* Fill the remaining environment slots; root defaults apply when no
 * user or home is known. */
1339 if ((asprintf((char**)(envp + 3), "HOME=%s", home ? home: "/root") < 0) ||
1340 (asprintf((char**)(envp + 4), "USER=%s", arg_user ? arg_user : "root") < 0) ||
1341 (asprintf((char**)(envp + 5), "LOGNAME=%s", arg_user ? arg_user : "root") < 0)) {
1347 if (asprintf((char**)(envp + 6), "container_uuid=%s", arg_uuid) < 0) {
1359 /* Automatically search for the init system */
/* Build an argument vector whose slot 0 is reserved for the init path
 * and whose remaining slots are the extra command line arguments, then
 * try three init locations in order; a later execve() only runs when
 * the earlier one returned, that is failed. */
1361 l = 1 + argc - optind;
1362 a = newa(char*, l + 1);
1363 memcpy(a + 1, argv + optind, l * sizeof(char*));
1365 a[0] = (char*) "/usr/lib/systemd/systemd";
1366 execve(a[0], a, (char**) envp);
1368 a[0] = (char*) "/lib/systemd/systemd";
1369 execve(a[0], a, (char**) envp);
1371 a[0] = (char*) "/sbin/init";
1372 execve(a[0], a, (char**) envp);
/* Otherwise run the command given on the command line, if any. */
1373 } else if (argc > optind)
1374 execvpe(argv[optind], argv + optind, (char**) envp);
/* Fallback: a bash started from the home directory with -bash as
 * argument zero. */
1376 chdir(home ? home : "/root");
1377 execle("/bin/bash", "-bash", NULL, (char**) envp);
1380 log_error("execv() failed: %m");
1383 _exit(EXIT_FAILURE);
/* Parent side: relay terminal traffic until process_pty() returns. */
1386 if (process_pty(master, &mask) < 0)
1390 if (saved_attr_valid)
1391 tcsetattr(STDIN_FILENO, TCSANOW, &saved_attr);
1393 r = wait_for_terminate(pid, &status);
/* Interpret the exit information of the child: a non-zero exit code
 * becomes the return value, SIGINT is logged as a shutdown and SIGHUP as a
 * reboot, and any other signal is logged as an error. */
1399 if (status.si_code == CLD_EXITED) {
1400 if (status.si_status != 0) {
1401 log_error("Container failed with error code %i.", status.si_status);
1402 r = status.si_status;
1406 log_debug("Container exited successfully.");
1408 } else if (status.si_code == CLD_KILLED &&
1409 status.si_status == SIGINT) {
1410 log_info("Container has been shut down.");
1413 } else if (status.si_code == CLD_KILLED &&
1414 status.si_status == SIGHUP) {
1415 log_info("Container is being rebooted.");
1417 } else if (status.si_code == CLD_KILLED ||
1418 status.si_code == CLD_DUMPED) {
1420 log_error("Container terminated by signal %s.", signal_to_string(status.si_status));
1424 log_error("Container failed due to unknown reason.");
/* Common cleanup: restore the terminal, close descriptors, move back to
 * the original cgroup, kill what is left in the container cgroup and free
 * the option strings. */
1431 if (saved_attr_valid)
1432 tcsetattr(STDIN_FILENO, TCSANOW, &saved_attr);
1435 close_nointr_nofail(master);
1437 close_pipe(kmsg_socket_pair);
1440 cg_attach(SYSTEMD_CGROUP_CONTROLLER, oldcg, 0);
1443 cg_kill_recursive_and_wait(SYSTEMD_CGROUP_CONTROLLER, newcg, true);
1445 free(arg_directory);
1446 strv_free(arg_controllers);