1 /*-*- Mode: C; c-basic-offset: 8; indent-tabs-mode: nil -*-*/
4 This file is part of systemd.
6 Copyright 2010 Lennart Poettering
8 systemd is free software; you can redistribute it and/or modify it
9 under the terms of the GNU Lesser General Public License as published by
10 the Free Software Foundation; either version 2.1 of the License, or
11 (at your option) any later version.
13 systemd is distributed in the hope that it will be useful, but
14 WITHOUT ANY WARRANTY; without even the implied warranty of
15 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
16 Lesser General Public License for more details.
18 You should have received a copy of the GNU Lesser General Public License
19 along with systemd; If not, see <http://www.gnu.org/licenses/>.
25 #include <sys/types.h>
26 #include <sys/syscall.h>
27 #include <sys/mount.h>
33 #include <sys/prctl.h>
34 #include <sys/capability.h>
36 #include <sys/epoll.h>
38 #include <sys/signalfd.h>
42 #include <sys/socket.h>
44 #include <systemd/sd-daemon.h>
52 #include "cgroup-util.h"
54 #include "path-util.h"
55 #include "loopback-setup.h"
57 #include "dev-setup.h"
/* Journal linking policy selected with --link-journal=. The enumerators used
 * elsewhere in this file are LINK_NO, LINK_AUTO, LINK_GUEST and LINK_HOST. */
59 typedef enum LinkJournal {
/* Command line settings; parse_argv() fills these in. */
/* Container root directory (--directory=). */
66 static char *arg_directory = NULL;
/* User to run the payload as (--user=). */
67 static char *arg_user = NULL;
/* Additional cgroup controllers (--controllers=), as a string list. */
68 static char **arg_controllers = NULL;
/* Machine UUID (--uuid=); exported to the payload as container_uuid=. */
69 static char *arg_uuid = NULL;
/* When set, main() adds CLONE_NEWNET and setup_resolv_conf() is skipped. */
70 static bool arg_private_network = false;
71 static bool arg_read_only = false;
72 static bool arg_boot = false;
/* Journal linking mode; consulted by setup_journal(). */
73 static LinkJournal arg_link_journal = LINK_AUTO;
/* Bit mask of capabilities the container retains, one bit per CAP_* number.
 * --capability= ORs further bits in, and drop_capabilities() hands the
 * complement of this mask to capability_bounding_set_drop(). */
74 static uint64_t arg_retain =
76 (1ULL << CAP_DAC_OVERRIDE) |
77 (1ULL << CAP_DAC_READ_SEARCH) |
78 (1ULL << CAP_FOWNER) |
79 (1ULL << CAP_FSETID) |
80 (1ULL << CAP_IPC_OWNER) |
83 (1ULL << CAP_LINUX_IMMUTABLE) |
84 (1ULL << CAP_NET_BIND_SERVICE) |
85 (1ULL << CAP_NET_BROADCAST) |
86 (1ULL << CAP_NET_RAW) |
87 (1ULL << CAP_SETGID) |
88 (1ULL << CAP_SETFCAP) |
89 (1ULL << CAP_SETPCAP) |
90 (1ULL << CAP_SETUID) |
91 (1ULL << CAP_SYS_ADMIN) |
92 (1ULL << CAP_SYS_CHROOT) |
93 (1ULL << CAP_SYS_NICE) |
94 (1ULL << CAP_SYS_PTRACE) |
95 (1ULL << CAP_SYS_TTY_CONFIG) |
96 (1ULL << CAP_SYS_RESOURCE) |
97 (1ULL << CAP_SYS_BOOT);
/* Print the usage summary to standard output. The single %s in the format is
 * filled with program_invocation_short_name.
 *
 * The -j line states "guest" because parse_argv() assigns LINK_GUEST for that
 * switch; the text has to describe what the parser really does. */
99 static int help(void) {
101 printf("%s [OPTIONS...] [PATH] [ARGUMENTS...]\n\n"
102 "Spawn a minimal namespace container for debugging, testing and building.\n\n"
103 " -h --help Show this help\n"
104 " -D --directory=NAME Root directory for the container\n"
105 " -b --boot Boot up full system (i.e. invoke init)\n"
106 " -u --user=USER Run the command under specified user or uid\n"
107 " -C --controllers=LIST Put the container in specified comma-separated cgroup hierarchies\n"
108 " --uuid=UUID Set a specific machine UUID for the container\n"
109 " --private-network Disable network in container\n"
110 " --read-only Mount the root directory read-only\n"
111 " --capability=CAP In addition to the default, retain specified capability\n"
112 " --link-journal=MODE Link up guest journal, one of no, auto, guest, host\n"
113 " -j Equivalent to --link-journal=guest\n",
114 program_invocation_short_name);
/* Parse the command line into the arg_* settings.
 *
 * Options are read with getopt_long(). The leading '+' in the short option
 * string makes parsing stop at the first non-option argument, so the payload
 * command line that follows is not interpreted here. Every failure visible in
 * this function is reported with log_error(). */
119 static int parse_argv(int argc, char *argv[]) {
/* Codes for long-only options start above the range of plain characters. */
122 ARG_PRIVATE_NETWORK = 0x100,
129 static const struct option options[] = {
130 { "help", no_argument, NULL, 'h' },
131 { "directory", required_argument, NULL, 'D' },
132 { "user", required_argument, NULL, 'u' },
133 { "controllers", required_argument, NULL, 'C' },
134 { "private-network", no_argument, NULL, ARG_PRIVATE_NETWORK },
135 { "boot", no_argument, NULL, 'b' },
136 { "uuid", required_argument, NULL, ARG_UUID },
137 { "read-only", no_argument, NULL, ARG_READ_ONLY },
138 { "capability", required_argument, NULL, ARG_CAPABILITY },
139 { "link-journal", required_argument, NULL, ARG_LINK_JOURNAL },
148 while ((c = getopt_long(argc, argv, "+hD:u:C:bj", options, NULL)) >= 0) {
/* Root directory: keep the canonicalized form of the argument. */
158 arg_directory = canonicalize_file_name(optarg);
159 if (!arg_directory) {
160 log_error("Failed to canonicalize root directory.");
168 if (!(arg_user = strdup(optarg))) {
169 log_error("Failed to duplicate user name.");
/* A repeated controller option replaces the previous list. */
176 strv_free(arg_controllers);
177 arg_controllers = strv_split(optarg, ",");
178 if (!arg_controllers) {
179 log_error("Failed to split controllers list.");
182 strv_uniq(arg_controllers);
186 case ARG_PRIVATE_NETWORK:
187 arg_private_network = true;
199 arg_read_only = true;
202 case ARG_CAPABILITY: {
/* The argument is a comma-separated list of capability names; each one
 * that cap_from_name() recognizes is ORed into arg_retain. */
206 FOREACH_WORD_SEPARATOR(word, length, optarg, ",", state) {
210 t = strndup(word, length);
214 if (cap_from_name(t, &cap) < 0) {
215 log_error("Failed to parse capability %s.", t);
221 arg_retain |= 1ULL << (uint64_t) cap;
/* NOTE(review): presumably the handler for -j; its case label is not part
 * of this listing - confirm. */
228 arg_link_journal = LINK_GUEST;
231 case ARG_LINK_JOURNAL:
232 if (streq(optarg, "auto"))
233 arg_link_journal = LINK_AUTO;
234 else if (streq(optarg, "no"))
235 arg_link_journal = LINK_NO;
236 else if (streq(optarg, "guest"))
237 arg_link_journal = LINK_GUEST;
238 else if (streq(optarg, "host"))
239 arg_link_journal = LINK_HOST;
241 log_error("Failed to parse link journal mode %s", optarg);
251 log_error("Unknown option code %c", c);
/* Mount the entries of mount_table below the container root `dest`.
 *
 * Each entry's target path is appended to dest. A failing mount() is logged
 * only when the entry's last field (fatal) is true. */
259 static int mount_all(const char *dest) {
261 typedef struct MountPoint {
/* Initializer order, as used by the loop below: what, where, (third field),
 * options, flags, fatal. NOTE(review): the third field is presumably the
 * file system type handed to mount() - the struct body is not visible here. */
270 static const MountPoint mount_table[] = {
271 { "proc", "/proc", "proc", NULL, MS_NOSUID|MS_NOEXEC|MS_NODEV, true },
272 { "/proc/sys", "/proc/sys", NULL, NULL, MS_BIND, true }, /* Bind mount first */
273 { NULL, "/proc/sys", NULL, NULL, MS_BIND|MS_RDONLY|MS_REMOUNT, true }, /* Then, make it r/o */
274 { "sysfs", "/sys", "sysfs", NULL, MS_RDONLY|MS_NOSUID|MS_NOEXEC|MS_NODEV, true },
275 { "tmpfs", "/dev", "tmpfs", "mode=755", MS_NOSUID|MS_STRICTATIME, true },
276 { "/dev/pts", "/dev/pts", NULL, NULL, MS_BIND, true },
277 { "tmpfs", "/dev/shm", "tmpfs", "mode=1777", MS_NOSUID|MS_NODEV|MS_STRICTATIME, true },
278 { "tmpfs", "/run", "tmpfs", "mode=755", MS_NOSUID|MS_NODEV|MS_STRICTATIME, true },
280 { "/sys/fs/selinux", "/sys/fs/selinux", NULL, NULL, MS_BIND, false }, /* Bind mount first */
281 { NULL, "/sys/fs/selinux", NULL, NULL, MS_BIND|MS_RDONLY|MS_REMOUNT, false }, /* Then, make it r/o */
288 for (k = 0; k < ELEMENTSOF(mount_table); k++) {
/* Released automatically at the end of each iteration. */
289 char _cleanup_free_ *where = NULL;
292 if (asprintf(&where, "%s/%s", dest, mount_table[k].where) < 0) {
301 t = path_is_mount_point(where, true);
303 log_error("Failed to detect whether %s is a mount point: %s", where, strerror(-t));
/* Entries without a source (what == NULL) are the remount steps of the
 * table; those are the ones still applied to an existing mount point. */
311 /* Skip this entry if it is not a remount. */
312 if (mount_table[k].what && t > 0)
/* Best effort: the result of creating the mount point is not checked. */
315 mkdir_p_label(where, 0755);
317 if (mount(mount_table[k].what,
320 mount_table[k].flags,
321 mount_table[k].options) < 0 &&
322 mount_table[k].fatal) {
324 log_error("mount(%s) failed: %m", where);
/* Make <dest>/etc/localtime refer to the same zone as the host's
 * /etc/localtime, where that is possible.
 *
 * Situations in which the host link cannot be used, or the zone is missing
 * inside the container, only produce a log_warning(); a failing symlink() is
 * logged as an error. */
334 static int setup_timezone(const char *dest) {
/* All five strings are released automatically when the function returns. */
335 _cleanup_free_ char *where = NULL, *p = NULL, *q = NULL, *check = NULL, *what = NULL;
341 /* Fix the timezone, if possible */
342 r = readlink_malloc("/etc/localtime", &p);
344 log_warning("/etc/localtime is not a symlink, not updating container timezone.");
/* z: zone name of the host link; both the relative and the absolute form of
 * the link target are tried. */
348 z = path_startswith(p, "../usr/share/zoneinfo/");
350 z = path_startswith(p, "/usr/share/zoneinfo/");
352 log_warning("/etc/localtime does not point into /usr/share/zoneinfo/, not updating container timezone.");
356 where = strappend(dest, "/etc/localtime");
/* y: zone name the container's link currently refers to, found the same way. */
360 r = readlink_malloc(where, &q);
362 y = path_startswith(q, "../usr/share/zoneinfo/");
364 y = path_startswith(q, "/usr/share/zoneinfo/");
367 /* Already pointing to the right place? Then do nothing .. */
368 if (y && streq(y, z))
/* The zone file has to exist inside the container tree. */
372 check = strjoin(dest, "/usr/share/zoneinfo/", z, NULL);
376 if (access(check, F_OK) < 0) {
377 log_warning("Timezone %s does not exist in container, not updating container timezone.", z);
/* The new link uses the relative form of the target. */
381 what = strappend("../usr/share/zoneinfo/", z);
386 if (symlink(what, where) < 0) {
387 log_error("Failed to correct timezone of container: %m");
/* Bind mount the host's /etc/resolv.conf over <dest>/etc/resolv.conf and, if
 * that worked, remount the bind mount read-only. The arg_private_network
 * check comes first, before any path is built. */
394 static int setup_resolv_conf(const char *dest) {
399 if (arg_private_network)
402 /* Fix resolv.conf, if possible */
403 where = strappend(dest, "/etc/resolv.conf");
407 /* This is best effort: the results of the mount() calls below are
408 * deliberately not treated as errors. */
409 if (mount("/etc/resolv.conf", where, "bind", MS_BIND, NULL) >= 0)
410 mount("/etc/resolv.conf", where, "bind", MS_BIND|MS_REMOUNT|MS_RDONLY, NULL);
/* Give the container a boot ID of its own.
 *
 * A random ID is formatted as a UUID string and written to
 * <dest>/dev/proc-sys-kernel-random-boot-id; that file is then bind mounted
 * over <dest>/proc/sys/kernel/random/boot_id. */
417 static int setup_boot_id(const char *dest) {
/* Both paths are released automatically when the function returns. */
418 char _cleanup_free_ *from = NULL, *to = NULL;
425 /* Generate a new randomized boot ID, so that each boot-up of
426 * the container gets a new one */
428 from = strappend(dest, "/dev/proc-sys-kernel-random-boot-id");
429 to = strappend(dest, "/proc/sys/kernel/random/boot_id");
433 r = sd_id128_randomize(&rnd);
435 log_error("Failed to generate random boot id: %s", strerror(-r));
/* 16 bytes printed as hex, grouped 4-2-2-2-6 in the usual UUID layout. */
439 snprintf(as_uuid, sizeof(as_uuid),
440 "%02x%02x%02x%02x-%02x%02x-%02x%02x-%02x%02x-%02x%02x%02x%02x%02x%02x",
441 SD_ID128_FORMAT_VAL(rnd));
442 char_array_0(as_uuid);
444 r = write_one_line_file(from, as_uuid);
446 log_error("Failed to write boot id: %s", strerror(-r));
450 if (mount(from, to, "bind", MS_BIND, NULL) < 0) {
451 log_error("Failed to bind mount boot id: %m");
/* Read-only remount; its result is not checked. */
454 mount(from, to, "bind", MS_BIND|MS_REMOUNT|MS_RDONLY, NULL);
/* Create, below <dest>/dev, a device node for every name in devnodes, taking
 * the file mode and device number from the host node of the same name.
 *
 * A host node that does not exist (ENOENT) is not logged; any other stat()
 * failure, a host entry that is neither a character nor a block device, and a
 * failing mknod() are logged with log_error(). */
460 static int copy_devnodes(const char *dest) {
462 static const char devnodes[] =
473 mode_t _cleanup_umask_ u;
479 NULSTR_FOREACH(d, devnodes) {
/* Both paths are released automatically at the end of each iteration. */
481 char _cleanup_free_ *from = NULL, *to = NULL;
/* from: node on the host, to: node inside the container tree. */
483 asprintf(&from, "/dev/%s", d);
484 asprintf(&to, "%s/dev/%s", dest, d);
495 if (stat(from, &st) < 0) {
497 if (errno != ENOENT) {
498 log_error("Failed to stat %s: %m", from);
503 } else if (!S_ISCHR(st.st_mode) && !S_ISBLK(st.st_mode)) {
505 log_error("%s is not a char or block device, cannot copy", from);
509 } else if (mknod(to, st.st_mode, st.st_rdev) < 0) {
/* Report the path mknod() was called with, not the container root. */
511 log_error("mknod(%s) failed: %m", to);
/* Make the terminal `console` available as <dest>/dev/console.
 *
 * The terminal has to be a character device. Its access mode is set to 0600
 * with owner and group 0, a device node is created at <dest>/dev/console and
 * the terminal is bind mounted on top of that node. Every failure is logged
 * with log_error(). */
520 static int setup_dev_console(const char *dest, const char *console) {
/* Released automatically when the function returns. */
522 char _cleanup_free_ *to = NULL;
524 mode_t _cleanup_umask_ u;
531 if (stat(console, &st) < 0) {
532 log_error("Failed to stat %s: %m", console);
535 } else if (!S_ISCHR(st.st_mode)) {
536 log_error("/dev/console is not a char device");
540 r = chmod_and_chown(console, 0600, 0, 0);
542 log_error("Failed to correct access mode for TTY: %s", strerror(-r));
546 if (asprintf(&to, "%s/dev/console", dest) < 0)
549 /* We need to bind mount the right tty to /dev/console since
550 * ptys can only exist on pts file systems. To have something
551 * to bind mount things on we create a device node first, that
552 * has the right major/minor (note that the major minor
553 * doesn't actually matter here, since we mount it over
556 if (mknod(to, (st.st_mode & ~07777) | 0600, st.st_rdev) < 0) {
557 log_error("mknod() for /dev/console failed: %m");
561 if (mount(console, to, "bind", MS_BIND, NULL) < 0) {
562 log_error("Bind mount for /dev/console failed: %m");
// Provide <dest>/proc/kmsg to the container, backed by a FIFO.
//
// A FIFO is created at <dest>/dev/kmsg (mode 0600, owner and group 0) and
// bind mounted to <dest>/proc/kmsg. It is then opened, and the resulting file
// descriptor is sent over kmsg_socket as an SCM_RIGHTS control message. The
// local copy of the descriptor is closed right after the sendmsg() call.
569 static int setup_kmsg(const char *dest, int kmsg_socket) {
570 char _cleanup_free_ *from = NULL, *to = NULL;
572 mode_t _cleanup_umask_ u;
574 struct cmsghdr cmsghdr;
575 uint8_t buf[CMSG_SPACE(sizeof(int))];
578 struct cmsghdr *cmsg;
581 assert(kmsg_socket >= 0);
585 /* We create the kmsg FIFO as /dev/kmsg, but immediately
586 * delete it after bind mounting it to /proc/kmsg. While FIFOs
587 * on the reading side behave very similar to /proc/kmsg,
588 * their writing side behaves differently from /dev/kmsg in
589 * that writing blocks when nothing is reading. In order to
590 * avoid any problems with containers deadlocking due to this
591 * we simply make /dev/kmsg unavailable to the container. */
592 if (asprintf(&from, "%s/dev/kmsg", dest) < 0 ||
593 asprintf(&to, "%s/proc/kmsg", dest) < 0)
596 if (mkfifo(from, 0600) < 0) {
597 log_error("mkfifo() for /dev/kmsg failed: %m");
601 r = chmod_and_chown(from, 0600, 0, 0);
603 log_error("Failed to correct access mode for /dev/kmsg: %s", strerror(-r));
607 if (mount(from, to, "bind", MS_BIND, NULL) < 0) {
608 log_error("Bind mount for /proc/kmsg failed: %m");
/* O_NDELAY keeps this open() from blocking on the FIFO. */
612 fd = open(from, O_RDWR|O_NDELAY|O_CLOEXEC);
614 log_error("Failed to open fifo: %m");
/* Build a message whose only content is one control message carrying fd. */
621 mh.msg_control = &control;
622 mh.msg_controllen = sizeof(control);
624 cmsg = CMSG_FIRSTHDR(&mh);
625 cmsg->cmsg_level = SOL_SOCKET;
626 cmsg->cmsg_type = SCM_RIGHTS;
627 cmsg->cmsg_len = CMSG_LEN(sizeof(int));
628 memcpy(CMSG_DATA(cmsg), &fd, sizeof(int));
630 mh.msg_controllen = cmsg->cmsg_len;
632 /* Store away the fd in the socket, so that it stays open as
633 * long as we run the child */
634 k = sendmsg(kmsg_socket, &mh, MSG_DONTWAIT|MSG_NOSIGNAL);
635 close_nointr_nofail(fd);
638 log_error("Failed to send FIFO fd: %m");
642 /* And now make the FIFO unavailable as /dev/kmsg... */
/* Set the host name from the last path component of arg_directory, after
 * passing it through hostname_cleanup(). */
647 static int setup_hostname(void) {
651 hn = path_get_file_name(arg_directory);
657 hostname_cleanup(hn);
660 if (sethostname(hn, strlen(hn)) < 0)
/* Connect the journal directory of the container with the one on the host,
 * as requested by arg_link_journal. Nothing is done for LINK_NO.
 *
 * The machine ID is read from <directory>/etc/machine-id. From it two paths
 * are derived: p, /var/log/journal/<id> on the host, and q, the same path
 * below `directory`. With LINK_GUEST, p becomes a symlink to q. The bind
 * mount of p onto q at the end is reached with LINK_HOST, which creates p
 * first. NOTE(review): presumably LINK_AUTO also reaches it when p already
 * exists - the branch bodies are not fully visible here. */
669 static int setup_journal(const char *directory) {
670 sd_id128_t machine_id;
671 char *p = NULL, *b = NULL, *l, *q = NULL, *d = NULL;
674 if (arg_link_journal == LINK_NO)
677 p = strappend(directory, "/etc/machine-id");
683 r = read_one_line_file(p, &b);
/* In LINK_AUTO mode a missing machine-id file gets its own branch, ahead of
 * the error report below. */
684 if (r == -ENOENT && arg_link_journal == LINK_AUTO) {
688 log_error("Failed to read machine ID: %s", strerror(-r));
693 if (isempty(l) && arg_link_journal == LINK_AUTO) {
698 /* Verify validity */
699 r = sd_id128_from_string(l, &machine_id);
701 log_error("Failed to parse machine ID: %s", strerror(-r));
/* p: journal directory on the host, q: the same directory in the guest tree. */
706 p = strappend("/var/log/journal/", l);
707 q = strjoin(directory, "/var/log/journal/", l, NULL);
713 if (path_is_mount_point(p, false) > 0 ||
714 path_is_mount_point(q, false) > 0) {
715 if (arg_link_journal != LINK_AUTO) {
716 log_error("Journal already a mount point, refusing.");
/* Find out what currently sits at p: a symlink, something that is not a
 * symlink (-EINVAL), or nothing at all (-ENOENT). */
725 r = readlink_and_make_absolute(p, &d);
727 if ((arg_link_journal == LINK_GUEST ||
728 arg_link_journal == LINK_AUTO) &&
738 log_error("Failed to remove symlink %s: %m", p);
742 } else if (r == -EINVAL) {
744 if (arg_link_journal == LINK_GUEST &&
747 if (errno == ENOTDIR)
748 log_error("%s already exists and is neither symlink nor directory.", p);
750 log_error("Failed to remove %s: %m", p);
756 } else if (r != -ENOENT) {
757 log_error("readlink(%s) failed: %m", p);
761 if (arg_link_journal == LINK_GUEST) {
/* The host path becomes a symlink to the guest's directory. */
763 if (symlink(q, p) < 0) {
764 log_error("Failed to symlink %s to %s: %m", q, p);
775 if (arg_link_journal == LINK_HOST) {
776 r = mkdir_p(p, 0755);
778 log_error("Failed to create %s: %m", p);
782 } else if (access(p, F_OK) < 0) {
/* A guest directory that already holds files is reported as an error. */
787 if (dir_is_empty(q) == 0) {
788 log_error("%s not empty.", q);
793 r = mkdir_p(q, 0755);
795 log_error("Failed to create %s: %m", q);
799 if (mount(p, q, "bind", MS_BIND, NULL) < 0) {
800 log_error("Failed to bind mount journal from host into guest: %m");
/* Hand the complement of arg_retain to capability_bounding_set_drop() and
 * return its result. */
816 static int drop_capabilities(void) {
817 return capability_bounding_set_drop(~arg_retain, false);
/* Heuristic check whether `path` holds an operating system tree, based on
 * <path>/bin/sh. Returns 1 when the check stored in r succeeded and 0 when
 * it failed. NOTE(review): the statement assigning r is not visible here -
 * presumably an access() call on the path built below. */
820 static int is_os_tree(const char *path) {
823 /* We use /bin/sh as flag file if something is an OS */
825 if (asprintf(&p, "%s/bin/sh", path) < 0)
831 return r < 0 ? 0 : 1;
/* Relay data between our own stdin/stdout and the pty `master`.
 *
 * stdin is copied to the master through in_buffer and the master is copied
 * to stdout through out_buffer. Readiness is tracked with edge-triggered
 * epoll events in four flags. Signals in `mask` are received through a
 * signalfd; SIGWINCH causes the window size of stdin to be applied to the
 * master. */
834 static int process_pty(int master, sigset_t *mask) {
836 char in_buffer[LINE_MAX], out_buffer[LINE_MAX];
/* Number of bytes currently held in each buffer. */
837 size_t in_buffer_full = 0, out_buffer_full = 0;
838 struct epoll_event stdin_ev, stdout_ev, master_ev, signal_ev;
839 bool stdin_readable = false, stdout_writable = false, master_readable = false, master_writable = false;
840 int ep = -1, signal_fd = -1, r;
/* All three descriptors are switched to non-blocking mode, so the reads and
 * writes below report EAGAIN instead of blocking. */
842 fd_nonblock(STDIN_FILENO, 1);
843 fd_nonblock(STDOUT_FILENO, 1);
844 fd_nonblock(master, 1);
846 signal_fd = signalfd(-1, mask, SFD_NONBLOCK|SFD_CLOEXEC);
848 log_error("signalfd(): %m");
853 ep = epoll_create1(EPOLL_CLOEXEC);
855 log_error("Failed to create epoll: %m");
/* The data descriptors are registered edge-triggered (EPOLLET); the signal
 * descriptor is not. */
861 stdin_ev.events = EPOLLIN|EPOLLET;
862 stdin_ev.data.fd = STDIN_FILENO;
865 stdout_ev.events = EPOLLOUT|EPOLLET;
866 stdout_ev.data.fd = STDOUT_FILENO;
869 master_ev.events = EPOLLIN|EPOLLOUT|EPOLLET;
870 master_ev.data.fd = master;
873 signal_ev.events = EPOLLIN;
874 signal_ev.data.fd = signal_fd;
876 if (epoll_ctl(ep, EPOLL_CTL_ADD, STDIN_FILENO, &stdin_ev) < 0 ||
877 epoll_ctl(ep, EPOLL_CTL_ADD, STDOUT_FILENO, &stdout_ev) < 0 ||
878 epoll_ctl(ep, EPOLL_CTL_ADD, master, &master_ev) < 0 ||
879 epoll_ctl(ep, EPOLL_CTL_ADD, signal_fd, &signal_ev) < 0) {
/* NOTE(review): "regiser" in the message below is a typo for "register";
 * it is runtime text and therefore left untouched here. */
880 log_error("Failed to regiser fds in epoll: %m");
886 struct epoll_event ev[16];
890 nfds = epoll_wait(ep, ev, ELEMENTSOF(ev), -1);
893 if (errno == EINTR || errno == EAGAIN)
896 log_error("epoll_wait(): %m");
/* Translate the reported events into the four readiness flags. A hang-up
 * sets the flag as well, so that the next read or write reports it. */
903 for (i = 0; i < nfds; i++) {
904 if (ev[i].data.fd == STDIN_FILENO) {
906 if (ev[i].events & (EPOLLIN|EPOLLHUP))
907 stdin_readable = true;
909 } else if (ev[i].data.fd == STDOUT_FILENO) {
911 if (ev[i].events & (EPOLLOUT|EPOLLHUP))
912 stdout_writable = true;
914 } else if (ev[i].data.fd == master) {
916 if (ev[i].events & (EPOLLIN|EPOLLHUP))
917 master_readable = true;
919 if (ev[i].events & (EPOLLOUT|EPOLLHUP))
920 master_writable = true;
922 } else if (ev[i].data.fd == signal_fd) {
923 struct signalfd_siginfo sfsi;
926 n = read(signal_fd, &sfsi, sizeof(sfsi));
927 if (n != sizeof(sfsi)) {
930 log_error("Failed to read from signalfd: invalid block size");
935 if (errno != EINTR && errno != EAGAIN) {
936 log_error("Failed to read from signalfd: %m");
942 if (sfsi.ssi_signo == SIGWINCH) {
945 /* The window size changed, let's forward that. */
946 if (ioctl(STDIN_FILENO, TIOCGWINSZ, &ws) >= 0)
947 ioctl(master, TIOCSWINSZ, &ws);
/* Keep going while at least one direction can make progress: an empty
 * buffer with a readable source, or pending bytes with a writable sink. */
956 while ((stdin_readable && in_buffer_full <= 0) ||
957 (master_writable && in_buffer_full > 0) ||
958 (master_readable && out_buffer_full <= 0) ||
959 (stdout_writable && out_buffer_full > 0)) {
961 if (stdin_readable && in_buffer_full < LINE_MAX) {
963 k = read(STDIN_FILENO, in_buffer + in_buffer_full, LINE_MAX - in_buffer_full);
/* These errno values only clear the readiness flag; anything else is
 * logged. The same pattern is used for all four transfers. */
966 if (errno == EAGAIN || errno == EPIPE || errno == ECONNRESET || errno == EIO)
967 stdin_readable = false;
969 log_error("read(): %m");
974 in_buffer_full += (size_t) k;
977 if (master_writable && in_buffer_full > 0) {
979 k = write(master, in_buffer, in_buffer_full);
982 if (errno == EAGAIN || errno == EPIPE || errno == ECONNRESET || errno == EIO)
983 master_writable = false;
985 log_error("write(): %m");
/* Move the part that was not written to the front of the buffer. */
991 assert(in_buffer_full >= (size_t) k);
992 memmove(in_buffer, in_buffer + k, in_buffer_full - k);
997 if (master_readable && out_buffer_full < LINE_MAX) {
999 k = read(master, out_buffer + out_buffer_full, LINE_MAX - out_buffer_full);
1002 if (errno == EAGAIN || errno == EPIPE || errno == ECONNRESET || errno == EIO)
1003 master_readable = false;
1005 log_error("read(): %m");
1010 out_buffer_full += (size_t) k;
1013 if (stdout_writable && out_buffer_full > 0) {
1015 k = write(STDOUT_FILENO, out_buffer, out_buffer_full);
1018 if (errno == EAGAIN || errno == EPIPE || errno == ECONNRESET || errno == EIO)
1019 stdout_writable = false;
1021 log_error("write(): %m");
1027 assert(out_buffer_full >= (size_t) k);
1028 memmove(out_buffer, out_buffer + k, out_buffer_full - k);
1029 out_buffer_full -= k;
1037 close_nointr_nofail(ep);
1040 close_nointr_nofail(signal_fd);
/* Program entry point: spawn a namespace container on arg_directory.
 *
 * Order of work, as far as it is visible here: parse the command line and
 * settle the container directory; refuse to run unless we are root, on a
 * systemd system, on a directory other than "/" that looks like an OS tree;
 * create and enter a cgroup named nspawn-<pid> below the current one; open a
 * pty and put our own terminal into raw mode; block the signals handled by
 * process_pty(); clone() a child into new namespaces. The child prepares the
 * container tree and executes the payload, the parent relays terminal data
 * with process_pty(), waits for the child and reports how it ended.
 *
 * Error messages name the call that actually failed and include the errno
 * text (%m) wherever the failing call sets errno. */
1045 int main(int argc, char *argv[]) {
1047 int r = EXIT_FAILURE, k;
1048 char *oldcg = NULL, *newcg = NULL;
1049 char **controller = NULL;
1051 const char *console = NULL;
1052 struct termios saved_attr, raw_attr;
1054 bool saved_attr_valid = false;
1056 int kmsg_socket_pair[2] = { -1, -1 };
1058 log_parse_environment();
1061 r = parse_argv(argc, argv);
/* Settle the container directory: the one given on the command line, made
 * absolute, or the current directory. */
1065 if (arg_directory) {
1068 p = path_make_absolute_cwd(arg_directory);
1069 free(arg_directory);
1072 arg_directory = get_current_dir_name();
1074 if (!arg_directory) {
1075 log_error("Failed to determine path");
1079 path_kill_slashes(arg_directory);
1081 if (geteuid() != 0) {
1082 log_error("Need to be root.");
1086 if (sd_booted() <= 0) {
1087 log_error("Not running on a systemd system.");
1091 if (path_equal(arg_directory, "/")) {
1092 log_error("Spawning container on root directory not supported.");
1096 if (is_os_tree(arg_directory) <= 0) {
1097 log_error("Directory %s doesn't look like an OS root directory. Refusing.", arg_directory);
/* Create a cgroup of our own below the one we are currently in, and move
 * into it. */
1101 k = cg_get_by_pid(SYSTEMD_CGROUP_CONTROLLER, 0, &oldcg);
1103 log_error("Failed to determine current cgroup: %s", strerror(-k));
1107 if (asprintf(&newcg, "%s/nspawn-%lu", oldcg, (unsigned long) getpid()) < 0) {
1108 log_error("Failed to allocate cgroup path.");
1112 k = cg_create_and_attach(SYSTEMD_CGROUP_CONTROLLER, newcg, 0);
1114 log_error("Failed to create cgroup: %s", strerror(-k));
/* Controllers requested on the command line only produce a warning when
 * they fail. */
1118 STRV_FOREACH(controller, arg_controllers) {
1119 k = cg_create_and_attach(*controller, newcg, 0);
1121 log_warning("Failed to create cgroup in controller %s: %s", *controller, strerror(-k));
/* The pty whose other end becomes the container's console. */
1124 master = posix_openpt(O_RDWR|O_NOCTTY|O_CLOEXEC|O_NDELAY);
1126 log_error("Failed to acquire pseudo tty: %m");
1130 console = ptsname(master);
1132 log_error("Failed to determine tty name: %m");
1136 log_info("Spawning namespace container on %s (console is %s).", arg_directory, console);
/* Start the pty with the window size of our own terminal. */
1138 if (ioctl(STDIN_FILENO, TIOCGWINSZ, &ws) >= 0)
1139 ioctl(master, TIOCSWINSZ, &ws);
1141 if (unlockpt(master) < 0) {
1142 log_error("Failed to unlock tty: %m");
/* Remember the terminal settings so that they can be restored later, and
 * derive the raw mode settings from them. */
1146 if (tcgetattr(STDIN_FILENO, &saved_attr) < 0) {
1147 log_error("Failed to get terminal attributes: %m");
1151 saved_attr_valid = true;
1153 raw_attr = saved_attr;
1154 cfmakeraw(&raw_attr);
1155 raw_attr.c_lflag &= ~ECHO;
1157 if (socketpair(AF_UNIX, SOCK_DGRAM|SOCK_NONBLOCK|SOCK_CLOEXEC, 0, kmsg_socket_pair) < 0) {
1158 log_error("Failed to create kmsg socket pair: %m");
/* Block the signals that process_pty() receives through its signalfd. */
1162 assert_se(sigemptyset(&mask) == 0);
1163 sigset_add_many(&mask, SIGCHLD, SIGWINCH, SIGTERM, SIGINT, -1);
1164 assert_se(sigprocmask(SIG_BLOCK, &mask, NULL) == 0);
1169 if (tcsetattr(STDIN_FILENO, TCSANOW, &raw_attr) < 0) {
1170 log_error("Failed to set terminal attributes: %m");
/* The network namespace is only unshared with --private-network. */
1174 pid = syscall(__NR_clone, SIGCHLD|CLONE_NEWIPC|CLONE_NEWNS|CLONE_NEWPID|CLONE_NEWUTS|(arg_private_network ? CLONE_NEWNET : 0), NULL);
1176 if (errno == EINVAL)
1177 log_error("clone() failed, do you have namespace support enabled in your kernel? (You need UTS, IPC, PID and NET namespacing built in): %m");
1179 log_error("clone() failed: %m");
1187 const char *home = NULL;
1188 uid_t uid = (uid_t) -1;
1189 gid_t gid = (gid_t) -1;
1190 const char *envp[] = {
1191 "PATH=/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin",
1192 "container=systemd-nspawn", /* LXC sets container=lxc, so follow the scheme here */
1197 NULL, /* container_uuid */
1201 envp[2] = strv_find_prefix(environ, "TERM=");
1203 close_nointr_nofail(master);
1205 close_nointr(STDIN_FILENO);
1206 close_nointr(STDOUT_FILENO);
1207 close_nointr(STDERR_FILENO);
/* Keep only our end of the kmsg socket pair open. */
1209 close_all_fds(&kmsg_socket_pair[1], 1);
1211 reset_all_signal_handlers();
1213 assert_se(sigemptyset(&mask) == 0);
1214 assert_se(sigprocmask(SIG_SETMASK, &mask, NULL) == 0);
/* The console pty becomes stdin, stdout and stderr. */
1216 if (open_terminal(console, O_RDWR) != STDIN_FILENO ||
1217 dup2(STDIN_FILENO, STDOUT_FILENO) != STDOUT_FILENO ||
1218 dup2(STDIN_FILENO, STDERR_FILENO) != STDERR_FILENO)
1222 log_error("setsid() failed: %m");
/* Ask for SIGKILL when the parent goes away. */
1226 if (prctl(PR_SET_PDEATHSIG, SIGKILL) < 0) {
1227 log_error("PR_SET_PDEATHSIG failed: %m");
1231 /* Mark everything as slave, so that we still
1232 * receive mounts from the real root, but don't
1233 * propagate mounts to the real root. */
1234 if (mount(NULL, "/", NULL, MS_SLAVE|MS_REC, NULL) < 0) {
1235 log_error("MS_SLAVE|MS_REC failed: %m");
1239 /* Turn directory into bind mount */
1240 if (mount(arg_directory, arg_directory, "bind", MS_BIND|MS_REC, NULL) < 0) {
1241 log_error("Failed to make bind mount: %m");
1246 if (mount(arg_directory, arg_directory, "bind", MS_BIND|MS_REMOUNT|MS_RDONLY|MS_REC, NULL) < 0) {
1247 log_error("Failed to make read-only: %m");
/* Populate the container tree; each helper logs its own errors. */
1251 if (mount_all(arg_directory) < 0)
1254 if (copy_devnodes(arg_directory) < 0)
1257 dev_setup(arg_directory);
1259 if (setup_dev_console(arg_directory, console) < 0)
1262 if (setup_kmsg(arg_directory, kmsg_socket_pair[1]) < 0)
1265 close_nointr_nofail(kmsg_socket_pair[1]);
1267 if (setup_boot_id(arg_directory) < 0)
1270 if (setup_timezone(arg_directory) < 0)
1273 if (setup_resolv_conf(arg_directory) < 0)
1276 if (setup_journal(arg_directory) < 0)
/* Make the container directory the new root. */
1279 if (chdir(arg_directory) < 0) {
1280 log_error("chdir(%s) failed: %m", arg_directory);
1284 if (mount(arg_directory, "/", NULL, MS_MOVE, NULL) < 0) {
1285 log_error("mount(MS_MOVE) failed: %m");
1289 if (chroot(".") < 0) {
1290 log_error("chroot() failed: %m");
1294 if (chdir("/") < 0) {
1295 log_error("chdir() failed: %m");
1303 if (drop_capabilities() < 0) {
1304 log_error("drop_capabilities() failed: %m");
1310 /* Note that this resolves user names
1311 * inside the container, and hence
1312 * accesses the NSS modules from the
1313 * container and not the host. This is
1316 if (get_user_creds((const char**)&arg_user, &uid, &gid, &home, NULL) < 0) {
1317 log_error("get_user_creds() failed: %m");
1321 if (mkdir_parents_label(home, 0775) < 0) {
1322 log_error("mkdir_parents_label() failed: %m");
1326 if (mkdir_safe_label(home, 0775, uid, gid) < 0) {
1327 log_error("mkdir_safe_label() failed: %m");
1331 if (initgroups((const char*)arg_user, gid) < 0) {
1332 log_error("initgroups() failed: %m");
1336 if (setresgid(gid, gid, gid) < 0) {
1337 log_error("setresgid() failed: %m");
1341 if (setresuid(uid, uid, uid) < 0) {
1342 log_error("setresuid() failed: %m");
1347 if ((asprintf((char**)(envp + 3), "HOME=%s", home ? home: "/root") < 0) ||
1348 (asprintf((char**)(envp + 4), "USER=%s", arg_user ? arg_user : "root") < 0) ||
1349 (asprintf((char**)(envp + 5), "LOGNAME=%s", arg_user ? arg_user : "root") < 0)) {
1355 if (asprintf((char**)(envp + 6), "container_uuid=%s", arg_uuid) < 0) {
1367 /* Automatically search for the init system */
/* a[0] is filled in per candidate below; the remaining command line
 * arguments follow it. */
1369 l = 1 + argc - optind;
1370 a = newa(char*, l + 1);
1371 memcpy(a + 1, argv + optind, l * sizeof(char*));
1373 a[0] = (char*) "/usr/lib/systemd/systemd";
1374 execve(a[0], a, (char**) envp);
1376 a[0] = (char*) "/lib/systemd/systemd";
1377 execve(a[0], a, (char**) envp);
1379 a[0] = (char*) "/sbin/init";
1380 execve(a[0], a, (char**) envp);
1381 } else if (argc > optind)
1382 execvpe(argv[optind], argv + optind, (char**) envp);
1384 chdir(home ? home : "/root");
1385 execle("/bin/bash", "-bash", NULL, (char**) envp);
1388 log_error("execv() failed: %m");
1391 _exit(EXIT_FAILURE);
/* Parent: relay terminal data until process_pty() returns. */
1394 if (process_pty(master, &mask) < 0)
1398 if (saved_attr_valid)
1399 tcsetattr(STDIN_FILENO, TCSANOW, &saved_attr);
1401 r = wait_for_terminate(pid, &status);
/* Report how the container ended. */
1407 if (status.si_code == CLD_EXITED) {
1408 if (status.si_status != 0) {
1409 log_error("Container failed with error code %i.", status.si_status);
1410 r = status.si_status;
1414 log_debug("Container exited successfully.");
1416 } else if (status.si_code == CLD_KILLED &&
1417 status.si_status == SIGINT) {
1418 log_info("Container has been shut down.");
1421 } else if (status.si_code == CLD_KILLED &&
1422 status.si_status == SIGHUP) {
1423 log_info("Container is being rebooted.");
1425 } else if (status.si_code == CLD_KILLED ||
1426 status.si_code == CLD_DUMPED) {
1428 log_error("Container terminated by signal %s.", signal_to_string(status.si_status));
1432 log_error("Container failed due to unknown reason.");
/* Cleanup: restore the terminal, close descriptors, go back to the original
 * cgroup and remove the one created above. */
1439 if (saved_attr_valid)
1440 tcsetattr(STDIN_FILENO, TCSANOW, &saved_attr);
1443 close_nointr_nofail(master);
1445 close_pipe(kmsg_socket_pair);
1448 cg_attach(SYSTEMD_CGROUP_CONTROLLER, oldcg, 0);
1451 cg_kill_recursive_and_wait(SYSTEMD_CGROUP_CONTROLLER, newcg, true);
1453 free(arg_directory);
1454 strv_free(arg_controllers);