1 /*-*- Mode: C; c-basic-offset: 8; indent-tabs-mode: nil -*-*/
4 This file is part of systemd.
6 Copyright 2010 Lennart Poettering
8 systemd is free software; you can redistribute it and/or modify it
9 under the terms of the GNU Lesser General Public License as published by
10 the Free Software Foundation; either version 2.1 of the License, or
11 (at your option) any later version.
13 systemd is distributed in the hope that it will be useful, but
14 WITHOUT ANY WARRANTY; without even the implied warranty of
15 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
16 Lesser General Public License for more details.
18 You should have received a copy of the GNU Lesser General Public License
19 along with systemd; If not, see <http://www.gnu.org/licenses/>.
25 #include <sys/types.h>
26 #include <sys/syscall.h>
27 #include <sys/mount.h>
33 #include <sys/prctl.h>
34 #include <sys/capability.h>
36 #include <sys/epoll.h>
38 #include <sys/signalfd.h>
42 #include <sys/socket.h>
44 #include <systemd/sd-daemon.h>
52 #include "cgroup-util.h"
54 #include "path-util.h"
55 #include "loopback-setup.h"
57 #include "dev-setup.h"
61 typedef enum LinkJournal {
/* Settings selected on the command line; see help() and parse_argv(). */
/* --directory: root directory of the container (canonicalized in parse_argv()). */
68 static char *arg_directory = NULL;
/* --user: user name or uid to run the command as inside the container. */
69 static char *arg_user = NULL;
/* --controllers: extra cgroup hierarchies, split from a comma-separated list. */
70 static char **arg_controllers = NULL;
/* --uuid: machine UUID; main() exports it as container_uuid=... */
71 static char *arg_uuid = NULL;
/* --private-network: main() adds CLONE_NEWNET to the clone flags when set. */
72 static bool arg_private_network = false;
/* --read-only: mount the container root read-only. */
73 static bool arg_read_only = false;
/* --boot: boot a full system (invoke init) instead of a single command. */
74 static bool arg_boot = false;
/* --link-journal / -j: journal linking mode, LINK_AUTO unless overridden. */
75 static LinkJournal arg_link_journal = LINK_AUTO;
/* Bit mask of capabilities the container keeps; bit number N stands for the
 * capability whose CAP_* constant is N. parse_argv() ORs in further bits for
 * --capability, and drop_capabilities() passes the complement (~arg_retain)
 * to capability_bounding_set_drop(). */
76 static uint64_t arg_retain =
78 (1ULL << CAP_DAC_OVERRIDE) |
79 (1ULL << CAP_DAC_READ_SEARCH) |
80 (1ULL << CAP_FOWNER) |
81 (1ULL << CAP_FSETID) |
82 (1ULL << CAP_IPC_OWNER) |
85 (1ULL << CAP_LINUX_IMMUTABLE) |
86 (1ULL << CAP_NET_BIND_SERVICE) |
87 (1ULL << CAP_NET_BROADCAST) |
88 (1ULL << CAP_NET_RAW) |
89 (1ULL << CAP_SETGID) |
90 (1ULL << CAP_SETFCAP) |
91 (1ULL << CAP_SETPCAP) |
92 (1ULL << CAP_SETUID) |
93 (1ULL << CAP_SYS_ADMIN) |
94 (1ULL << CAP_SYS_CHROOT) |
95 (1ULL << CAP_SYS_NICE) |
96 (1ULL << CAP_SYS_PTRACE) |
97 (1ULL << CAP_SYS_TTY_CONFIG) |
98 (1ULL << CAP_SYS_RESOURCE) |
99 (1ULL << CAP_SYS_BOOT) |
100 (1ULL << CAP_AUDIT_WRITE) |
101 (1ULL << CAP_AUDIT_CONTROL);
/* Prints the usage text to stdout; the single %s is replaced by
 * program_invocation_short_name.
 *
 * NOTE(review): the "-j" line says it is equivalent to --link-journal=host,
 * but the assignment in parse_argv() that appears to belong to the -j case
 * stores LINK_GUEST. Confirm which one is intended and align text and code. */
103 static int help(void) {
105 printf("%s [OPTIONS...] [PATH] [ARGUMENTS...]\n\n"
106 "Spawn a minimal namespace container for debugging, testing and building.\n\n"
107 " -h --help Show this help\n"
108 " --version Print version string\n"
109 " -D --directory=NAME Root directory for the container\n"
110 " -b --boot Boot up full system (i.e. invoke init)\n"
111 " -u --user=USER Run the command under specified user or uid\n"
112 " -C --controllers=LIST Put the container in specified comma-separated cgroup hierarchies\n"
113 " --uuid=UUID Set a specific machine UUID for the container\n"
114 " --private-network Disable network in container\n"
115 " --read-only Mount the root directory read-only\n"
116 " --capability=CAP In addition to the default, retain specified capability\n"
117 " --link-journal=MODE Link up guest journal, one of no, auto, guest, host\n"
118 " -j Equivalent to --link-journal=host\n",
119 program_invocation_short_name);
/* Parses the command line with getopt_long() and stores the results in the
 * file-scope arg_* variables. Errors are reported with log_error(). */
124 static int parse_argv(int argc, char *argv[]) {
/* Long options. The last field is the value getopt_long() returns for the
 * option: the equivalent short option character or an ARG_* constant. */
135 static const struct option options[] = {
136 { "help", no_argument, NULL, 'h' },
137 { "version", no_argument, NULL, ARG_VERSION },
138 { "directory", required_argument, NULL, 'D' },
139 { "user", required_argument, NULL, 'u' },
140 { "controllers", required_argument, NULL, 'C' },
141 { "private-network", no_argument, NULL, ARG_PRIVATE_NETWORK },
142 { "boot", no_argument, NULL, 'b' },
143 { "uuid", required_argument, NULL, ARG_UUID },
144 { "read-only", no_argument, NULL, ARG_READ_ONLY },
145 { "capability", required_argument, NULL, ARG_CAPABILITY },
146 { "link-journal", required_argument, NULL, ARG_LINK_JOURNAL },
/* Short options: -D, -u and -C take an argument (trailing ':'). With glibc a
 * leading '+' stops option parsing at the first non-option argument, so the
 * options of the command to run inside the container are left alone. */
155 while ((c = getopt_long(argc, argv, "+hD:u:C:bj", options, NULL)) >= 0) {
/* --version output. */
164 puts(PACKAGE_STRING);
165 puts(SYSTEMD_FEATURES);
/* -D: store the canonical form of the given directory. */
170 arg_directory = canonicalize_file_name(optarg);
171 if (!arg_directory) {
172 log_error("Failed to canonicalize root directory.");
/* -u: keep a private copy of the user name. */
180 if (!(arg_user = strdup(optarg))) {
181 log_error("Failed to duplicate user name.");
/* -C: a repeated option replaces the previous list, hence the free first. */
188 strv_free(arg_controllers);
189 arg_controllers = strv_split(optarg, ",");
190 if (!arg_controllers) {
191 log_error("Failed to split controllers list.");
/* presumably drops duplicate controller names -- confirm in strv.c */
194 strv_uniq(arg_controllers);
198 case ARG_PRIVATE_NETWORK:
199 arg_private_network = true;
211 arg_read_only = true;
/* --capability: comma-separated list; every word is copied, looked up with
 * cap_from_name() and its bit added to the arg_retain mask. */
214 case ARG_CAPABILITY: {
218 FOREACH_WORD_SEPARATOR(word, length, optarg, ",", state) {
222 t = strndup(word, length);
226 if (cap_from_name(t, &cap) < 0) {
227 log_error("Failed to parse capability %s.", t);
233 arg_retain |= 1ULL << (uint64_t) cap;
/* NOTE(review): this looks like the body of the -j case; it selects
 * LINK_GUEST while help() documents -j as --link-journal=host. Confirm. */
240 arg_link_journal = LINK_GUEST;
/* --link-journal=MODE: accepts exactly "auto", "no", "guest" or "host". */
243 case ARG_LINK_JOURNAL:
244 if (streq(optarg, "auto"))
245 arg_link_journal = LINK_AUTO;
246 else if (streq(optarg, "no"))
247 arg_link_journal = LINK_NO;
248 else if (streq(optarg, "guest"))
249 arg_link_journal = LINK_GUEST;
250 else if (streq(optarg, "host"))
251 arg_link_journal = LINK_HOST;
253 log_error("Failed to parse link journal mode %s", optarg);
/* Any option code without a handler above ends up here. */
263 log_error("Unknown option code %c", c);
/* Mounts every entry of mount_table below the container root `dest`. */
271 static int mount_all(const char *dest) {
273 typedef struct MountPoint {
/* Columns as used below: what (source, NULL for a remount step), where
 * (target relative to dest), a third string that appears to be the file
 * system type (its use is not visible here -- confirm), options, flags and
 * fatal. Only entries with fatal == true get a failed mount() reported. */
282 static const MountPoint mount_table[] = {
283 { "proc", "/proc", "proc", NULL, MS_NOSUID|MS_NOEXEC|MS_NODEV, true },
284 { "/proc/sys", "/proc/sys", NULL, NULL, MS_BIND, true }, /* Bind mount first */
285 { NULL, "/proc/sys", NULL, NULL, MS_BIND|MS_RDONLY|MS_REMOUNT, true }, /* Then, make it r/o */
286 { "sysfs", "/sys", "sysfs", NULL, MS_RDONLY|MS_NOSUID|MS_NOEXEC|MS_NODEV, true },
287 { "tmpfs", "/dev", "tmpfs", "mode=755", MS_NOSUID|MS_STRICTATIME, true },
288 { "/dev/pts", "/dev/pts", NULL, NULL, MS_BIND, true },
289 { "tmpfs", "/dev/shm", "tmpfs", "mode=1777", MS_NOSUID|MS_NODEV|MS_STRICTATIME, true },
290 { "tmpfs", "/run", "tmpfs", "mode=755", MS_NOSUID|MS_NODEV|MS_STRICTATIME, true },
292 { "/sys/fs/selinux", "/sys/fs/selinux", NULL, NULL, MS_BIND, false }, /* Bind mount first */
293 { NULL, "/sys/fs/selinux", NULL, NULL, MS_BIND|MS_RDONLY|MS_REMOUNT, false }, /* Then, make it r/o */
300 for (k = 0; k < ELEMENTSOF(mount_table); k++) {
/* `where` is freed automatically at the end of each iteration. */
301 char _cleanup_free_ *where = NULL;
/* Absolute target path: container root plus the table's relative path. */
304 if (asprintf(&where, "%s/%s", dest, mount_table[k].where) < 0) {
313 t = path_is_mount_point(where, true);
315 log_error("Failed to detect whether %s is a mount point: %s", where, strerror(-t));
/* t > 0 means the target is already a mount point. Remount steps
 * (what == NULL) are never skipped by this test. */
323 /* Skip this entry if it is not a remount. */
324 if (mount_table[k].what && t > 0)
/* The result of creating the mount point directory is not checked;
 * a missing directory shows up as a mount() failure below. */
327 mkdir_p_label(where, 0755);
329 if (mount(mount_table[k].what,
332 mount_table[k].flags,
333 mount_table[k].options) < 0 &&
334 mount_table[k].fatal) {
336 log_error("mount(%s) failed: %m", where);
/* Makes <dest>/etc/localtime point at the same zone as the host's
 * /etc/localtime, provided the host file is a symlink into
 * /usr/share/zoneinfo/ and the zone exists inside the container. */
346 static int setup_timezone(const char *dest) {
/* All five strings are released automatically when the function returns. */
347 _cleanup_free_ char *where = NULL, *p = NULL, *q = NULL, *check = NULL, *what = NULL;
353 /* Fix the timezone, if possible */
354 r = readlink_malloc("/etc/localtime", &p);
356 log_warning("/etc/localtime is not a symlink, not updating container timezone.");
/* z: zone name of the host, i.e. the link target with the zoneinfo prefix
 * (relative or absolute form) stripped. */
360 z = path_startswith(p, "../usr/share/zoneinfo/");
362 z = path_startswith(p, "/usr/share/zoneinfo/");
364 log_warning("/etc/localtime does not point into /usr/share/zoneinfo/, not updating container timezone.");
368 where = strappend(dest, "/etc/localtime");
/* y: zone name the container's /etc/localtime currently points to. */
372 r = readlink_malloc(where, &q);
374 y = path_startswith(q, "../usr/share/zoneinfo/");
376 y = path_startswith(q, "/usr/share/zoneinfo/");
379 /* Already pointing to the right place? Then do nothing .. */
380 if (y && streq(y, z))
/* Only link to zone files that actually exist inside the container. */
384 check = strjoin(dest, "/usr/share/zoneinfo/", z, NULL);
388 if (access(check, F_OK) < 0) {
389 log_warning("Timezone %s does not exist in container, not updating container timezone.", z);
/* The new link uses the relative "../usr/share/zoneinfo/<zone>" form. */
393 what = strappend("../usr/share/zoneinfo/", z);
398 if (symlink(what, where) < 0) {
399 log_error("Failed to correct timezone of container: %m");
/* Bind mounts the host's /etc/resolv.conf over <dest>/etc/resolv.conf and
 * then remounts it read-only. Not done when --private-network is in effect. */
406 static int setup_resolv_conf(const char *dest) {
411 if (arg_private_network)
414 /* Fix resolv.conf, if possible */
415 where = strappend(dest, "/etc/resolv.conf");
/* Best effort: a failing bind mount is not reported, and the read-only
 * remount is only attempted after a successful bind mount. */
419 /* We don't really care for the results of this really. If it
420 * fails, it fails, but meh... */
421 if (mount("/etc/resolv.conf", where, "bind", MS_BIND, NULL) >= 0)
422 mount("/etc/resolv.conf", where, "bind", MS_BIND|MS_REMOUNT|MS_RDONLY, NULL);
/* Gives the container its own boot ID: a random 128 bit ID is written to a
 * file below <dest>/dev and that file is bind mounted over
 * <dest>/proc/sys/kernel/random/boot_id. */
429 static int setup_boot_id(const char *dest) {
430 char _cleanup_free_ *from = NULL, *to = NULL;
437 /* Generate a new randomized boot ID, so that each boot-up of
438 * the container gets a new one */
/* from: file holding the generated ID; to: the path it is mounted over. */
440 from = strappend(dest, "/dev/proc-sys-kernel-random-boot-id");
441 to = strappend(dest, "/proc/sys/kernel/random/boot_id");
445 r = sd_id128_randomize(&rnd);
447 log_error("Failed to generate random boot id: %s", strerror(-r));
/* Format the 16 bytes as a UUID string with 8-4-4-4-12 hex digit groups. */
451 snprintf(as_uuid, sizeof(as_uuid),
452 "%02x%02x%02x%02x-%02x%02x-%02x%02x-%02x%02x-%02x%02x%02x%02x%02x%02x",
453 SD_ID128_FORMAT_VAL(rnd));
454 char_array_0(as_uuid);
456 r = write_one_line_file(from, as_uuid);
458 log_error("Failed to write boot id: %s", strerror(-r));
462 if (mount(from, to, "bind", MS_BIND, NULL) < 0) {
463 log_error("Failed to bind mount boot id: %m");
/* Make the bind mount read-only; the result of this call is not checked. */
466 mount(from, to, "bind", MS_BIND|MS_REMOUNT|MS_RDONLY, NULL);
/* Recreates a fixed list of host device nodes inside <dest>/dev: each
 * /dev/<name> is stat()ed and a node with the same mode and device number is
 * created with mknod(). */
472 static int copy_devnodes(const char *dest) {
/* List of device names, walked with NULSTR_FOREACH below. */
474 static const char devnodes[] =
485 mode_t _cleanup_umask_ u;
491 NULSTR_FOREACH(d, devnodes) {
493 char _cleanup_free_ *from = NULL, *to = NULL;
/* from: node on the host, to: node to create inside the container.
 * NOTE(review): the asprintf() return values are not checked on these two
 * lines; confirm that from/to are tested for NULL before they are used. */
495 asprintf(&from, "/dev/%s", d);
496 asprintf(&to, "%s/dev/%s", dest, d);
507 if (stat(from, &st) < 0) {
/* A node that does not exist on the host (ENOENT) is not reported. */
509 if (errno != ENOENT) {
510 log_error("Failed to stat %s: %m", from);
515 } else if (!S_ISCHR(st.st_mode) && !S_ISBLK(st.st_mode)) {
517 log_error("%s is not a char or block device, cannot copy", from);
521 } else if (mknod(to, st.st_mode, st.st_rdev) < 0) {
/* NOTE(review): the message prints `dest` (the container root) although
 * the failing path is `to` -- probably should log `to`. */
523 log_error("mknod(%s) failed: %m", dest);
/* Provides <dest>/dev/console: the tty `console` must be a character device,
 * gets mode 0600 and owner 0:0, a placeholder device node is created at
 * <dest>/dev/console and `console` is then bind mounted on top of it. */
532 static int setup_dev_console(const char *dest, const char *console) {
534 char _cleanup_free_ *to = NULL;
536 mode_t _cleanup_umask_ u;
543 if (stat(console, &st) < 0) {
544 log_error("Failed to stat %s: %m", console);
547 } else if (!S_ISCHR(st.st_mode)) {
548 log_error("/dev/console is not a char device");
552 r = chmod_and_chown(console, 0600, 0, 0);
554 log_error("Failed to correct access mode for TTY: %s", strerror(-r));
/* The node below keeps the file type bits of the tty but forces the
 * permission bits to 0600: (st.st_mode & ~07777) | 0600. */
558 if (asprintf(&to, "%s/dev/console", dest) < 0)
561 /* We need to bind mount the right tty to /dev/console since
562 * ptys can only exist on pts file systems. To have something
563 * to bind mount things on we create a device node first, that
564 * has the right major/minor (note that the major minor
565 * doesn't actually matter here, since we mount it over
568 if (mknod(to, (st.st_mode & ~07777) | 0600, st.st_rdev) < 0) {
569 log_error("mknod() for /dev/console failed: %m");
573 if (mount(console, to, "bind", MS_BIND, NULL) < 0) {
574 log_error("Bind mount for /dev/console failed: %m");
581 static int setup_kmsg(const char *dest, int kmsg_socket) {
582 char _cleanup_free_ *from = NULL, *to = NULL;
584 mode_t _cleanup_umask_ u;
586 struct cmsghdr cmsghdr;
587 uint8_t buf[CMSG_SPACE(sizeof(int))];
590 struct cmsghdr *cmsg;
593 assert(kmsg_socket >= 0);
597 /* We create the kmsg FIFO as /dev/kmsg, but immediately
598 * delete it after bind mounting it to /proc/kmsg. While FIFOs
599 * on the reading side behave very similar to /proc/kmsg,
600 * their writing side behaves differently from /dev/kmsg in
601 * that writing blocks when nothing is reading. In order to
602 * avoid any problems with containers deadlocking due to this
603 * we simply make /dev/kmsg unavailable to the container. */
/* Summary of setup_kmsg(): create a FIFO at <dest>/dev/kmsg, bind mount it
 * to <dest>/proc/kmsg, open it and send the open file descriptor over
 * `kmsg_socket` as an SCM_RIGHTS control message. */
604 if (asprintf(&from, "%s/dev/kmsg", dest) < 0 ||
605 asprintf(&to, "%s/proc/kmsg", dest) < 0)
608 if (mkfifo(from, 0600) < 0) {
609 log_error("mkfifo() for /dev/kmsg failed: %m");
613 r = chmod_and_chown(from, 0600, 0, 0);
615 log_error("Failed to correct access mode for /dev/kmsg: %s", strerror(-r));
619 if (mount(from, to, "bind", MS_BIND, NULL) < 0) {
620 log_error("Bind mount for /proc/kmsg failed: %m");
/* O_RDWR together with O_NDELAY: the open does not wait for a peer on the
 * other end of the FIFO. */
624 fd = open(from, O_RDWR|O_NDELAY|O_CLOEXEC);
626 log_error("Failed to open fifo: %m");
/* Build one control message of type SCM_RIGHTS whose payload is the FIFO
 * fd; msg_controllen is trimmed to the length actually used. */
633 mh.msg_control = &control;
634 mh.msg_controllen = sizeof(control);
636 cmsg = CMSG_FIRSTHDR(&mh);
637 cmsg->cmsg_level = SOL_SOCKET;
638 cmsg->cmsg_type = SCM_RIGHTS;
639 cmsg->cmsg_len = CMSG_LEN(sizeof(int));
640 memcpy(CMSG_DATA(cmsg), &fd, sizeof(int));
642 mh.msg_controllen = cmsg->cmsg_len;
644 /* Store away the fd in the socket, so that it stays open as
645 * long as we run the child */
646 k = sendmsg(kmsg_socket, &mh, MSG_DONTWAIT|MSG_NOSIGNAL);
/* The local descriptor is closed whether or not sendmsg() succeeded. */
647 close_nointr_nofail(fd);
650 log_error("Failed to send FIFO fd: %m");
654 /* And now make the FIFO unavailable as /dev/kmsg... */
/* Sets the hostname from the last path component of arg_directory, after
 * passing it through hostname_cleanup(). */
659 static int setup_hostname(void) {
663 hn = path_get_file_name(arg_directory);
/* presumably strips characters that are not valid in a hostname -- confirm */
669 hostname_cleanup(hn);
672 if (sethostname(hn, strlen(hn)) < 0)
/* Connects the journal directories of host and container according to
 * arg_link_journal. Both directories are named after the container's machine
 * ID, read from <directory>/etc/machine-id:
 *   host side:  /var/log/journal/<id>              (p below)
 *   guest side: <directory>/var/log/journal/<id>   (q below)
 * LINK_GUEST symlinks the host path to the guest directory; otherwise the
 * host directory is bind mounted onto the guest path. */
681 static int setup_journal(const char *directory) {
682 sd_id128_t machine_id;
683 char _cleanup_free_ *p = NULL, *b = NULL, *q = NULL, *d = NULL;
/* --link-journal=no: nothing to set up. */
687 if (arg_link_journal == LINK_NO)
690 p = strappend(directory, "/etc/machine-id");
/* In LINK_AUTO mode a missing or empty machine-id file takes a separate
 * path from the error report below (the branch bodies are not visible in
 * this view -- confirm they return quietly). */
694 r = read_one_line_file(p, &b);
695 if (r == -ENOENT && arg_link_journal == LINK_AUTO)
698 log_error("Failed to read machine ID from %s: %s", p, strerror(-r));
703 if (isempty(id) && arg_link_journal == LINK_AUTO)
706 /* Verify validity */
707 r = sd_id128_from_string(id, &machine_id);
709 log_error("Failed to parse machine ID from %s: %s", p, strerror(-r));
/* From here on p is the host side path and q the guest side path.
 * NOTE(review): p is reassigned here; confirm the previous string is freed
 * first, otherwise it leaks. */
714 p = strappend("/var/log/journal/", id);
715 q = strjoin(directory, "/var/log/journal/", id, NULL);
/* Existing mount points are never reused; outside LINK_AUTO mode that is
 * reported as an error. */
719 if (path_is_mount_point(p, false) > 0) {
720 if (arg_link_journal != LINK_AUTO) {
721 log_error("%s: already a mount point, refusing to use for journal", p);
728 if (path_is_mount_point(q, false) > 0) {
729 if (arg_link_journal != LINK_AUTO) {
730 log_error("%s: already a mount point, refusing to use for journal", q);
/* Look at what currently exists at the host path: a symlink (its target is
 * stored in d), something that is not a symlink (-EINVAL), or nothing
 * (-ENOENT). */
737 r = readlink_and_make_absolute(p, &d);
739 if ((arg_link_journal == LINK_GUEST ||
740 arg_link_journal == LINK_AUTO) &&
743 r = mkdir_p(q, 0755);
745 log_warning("failed to create directory %s: %m", q);
750 log_error("Failed to remove symlink %s: %m", p);
753 } else if (r == -EINVAL) {
755 if (arg_link_journal == LINK_GUEST &&
758 if (errno == ENOTDIR) {
759 log_error("%s already exists and is neither a symlink nor a directory", p);
762 log_error("Failed to remove %s: %m", p);
766 } else if (r != -ENOENT) {
767 log_error("readlink(%s) failed: %m", p);
/* LINK_GUEST: the host path becomes a symlink to the guest directory. */
771 if (arg_link_journal == LINK_GUEST) {
773 if (symlink(q, p) < 0) {
774 log_error("Failed to symlink %s to %s: %m", q, p);
778 r = mkdir_p(q, 0755);
780 log_warning("failed to create directory %s: %m", q);
/* LINK_HOST: the journal lives on the host; create the host directory. In
 * the other remaining case the host directory has to exist already. */
784 if (arg_link_journal == LINK_HOST) {
785 r = mkdir_p(p, 0755);
787 log_error("Failed to create %s: %m", p);
791 } else if (access(p, F_OK) < 0)
/* dir_is_empty() returning 0 is reported as a non-empty guest directory. */
794 if (dir_is_empty(q) == 0) {
795 log_error("%s not empty.", q);
799 r = mkdir_p(q, 0755);
801 log_error("Failed to create %s: %m", q);
/* Make the host directory visible at the guest path. */
805 if (mount(p, q, "bind", MS_BIND, NULL) < 0) {
806 log_error("Failed to bind mount journal from host into guest: %m");
/* Drops every capability that is not set in arg_retain from the bounding
 * set, by handing the complement of the mask to
 * capability_bounding_set_drop(); its return value is passed through. */
813 static int drop_capabilities(void) {
814 return capability_bounding_set_drop(~arg_retain, false);
/* Heuristic check whether `path` is the root of an OS tree, based on
 * <path>/bin/sh. Returns 1 when the check on that path succeeded (r >= 0)
 * and 0 otherwise; main() refuses to start when the result is <= 0. */
817 static int is_os_tree(const char *path) {
820 /* We use /bin/sh as flag file if something is an OS */
822 if (asprintf(&p, "%s/bin/sh", path) < 0)
828 return r < 0 ? 0 : 1;
/* Relays data between the caller's STDIN/STDOUT and the pty `master` using
 * epoll, and handles the signals in `mask` through a signalfd.
 *
 *   master  pty master connected to the container's console
 *   pid     process that is sent SIGRTMIN+3 on the first SIGTERM in --boot mode
 *   mask    signal set the signalfd is created for
 *
 * Data flows stdin -> in_buffer -> master and master -> out_buffer -> stdout. */
831 static int process_pty(int master, pid_t pid, sigset_t *mask) {
/* One buffer per direction, plus the number of bytes currently held. */
833 char in_buffer[LINE_MAX], out_buffer[LINE_MAX];
834 size_t in_buffer_full = 0, out_buffer_full = 0;
835 struct epoll_event stdin_ev, stdout_ev, master_ev, signal_ev;
/* Readiness flags: set when epoll reports an event, cleared again when a
 * read()/write() fails with EAGAIN, EPIPE, ECONNRESET or EIO. */
836 bool stdin_readable = false, stdout_writable = false, master_readable = false, master_writable = false;
837 int ep = -1, signal_fd = -1, r;
838 bool tried_orderly_shutdown = false;
/* All three descriptors are switched to non-blocking mode; they are
 * registered edge-triggered (EPOLLET) below. */
844 fd_nonblock(STDIN_FILENO, 1);
845 fd_nonblock(STDOUT_FILENO, 1);
846 fd_nonblock(master, 1);
848 signal_fd = signalfd(-1, mask, SFD_NONBLOCK|SFD_CLOEXEC);
850 log_error("signalfd(): %m");
855 ep = epoll_create1(EPOLL_CLOEXEC);
857 log_error("Failed to create epoll: %m");
862 /* We read from STDIN only if this is actually a TTY,
863 * otherwise we assume non-interactivity. */
864 if (isatty(STDIN_FILENO)) {
866 stdin_ev.events = EPOLLIN|EPOLLET;
867 stdin_ev.data.fd = STDIN_FILENO;
869 if (epoll_ctl(ep, EPOLL_CTL_ADD, STDIN_FILENO, &stdin_ev) < 0) {
870 log_error("Failed to register STDIN in epoll: %m");
877 stdout_ev.events = EPOLLOUT|EPOLLET;
878 stdout_ev.data.fd = STDOUT_FILENO;
881 master_ev.events = EPOLLIN|EPOLLOUT|EPOLLET;
882 master_ev.data.fd = master;
/* The signalfd is the only descriptor registered without EPOLLET. */
885 signal_ev.events = EPOLLIN;
886 signal_ev.data.fd = signal_fd;
/* EPERM from epoll_ctl() is tolerated for stdout: it is then treated as
 * always writable. */
888 if (epoll_ctl(ep, EPOLL_CTL_ADD, STDOUT_FILENO, &stdout_ev) < 0) {
889 if (errno != EPERM) {
890 log_error("Failed to register stdout in epoll: %m");
894 /* stdout without epoll support. Likely redirected to regular file. */
895 stdout_writable = true;
898 if (epoll_ctl(ep, EPOLL_CTL_ADD, master, &master_ev) < 0 ||
899 epoll_ctl(ep, EPOLL_CTL_ADD, signal_fd, &signal_ev) < 0) {
900 log_error("Failed to register fds in epoll: %m");
906 struct epoll_event ev[16];
/* Block without timeout until at least one descriptor has an event. */
910 nfds = epoll_wait(ep, ev, ELEMENTSOF(ev), -1);
913 if (errno == EINTR || errno == EAGAIN)
916 log_error("epoll_wait(): %m");
/* First pass: translate the reported events into readiness flags.
 * EPOLLHUP sets the flag as well, so the read()/write() further down is
 * attempted and reports the condition. */
923 for (i = 0; i < nfds; i++) {
924 if (ev[i].data.fd == STDIN_FILENO) {
926 if (ev[i].events & (EPOLLIN|EPOLLHUP))
927 stdin_readable = true;
929 } else if (ev[i].data.fd == STDOUT_FILENO) {
931 if (ev[i].events & (EPOLLOUT|EPOLLHUP))
932 stdout_writable = true;
934 } else if (ev[i].data.fd == master) {
936 if (ev[i].events & (EPOLLIN|EPOLLHUP))
937 master_readable = true;
939 if (ev[i].events & (EPOLLOUT|EPOLLHUP))
940 master_writable = true;
942 } else if (ev[i].data.fd == signal_fd) {
943 struct signalfd_siginfo sfsi;
/* A signalfd read must deliver exactly one signalfd_siginfo record. */
946 n = read(signal_fd, &sfsi, sizeof(sfsi));
947 if (n != sizeof(sfsi)) {
950 log_error("Failed to read from signalfd: invalid block size");
955 if (errno != EINTR && errno != EAGAIN) {
956 log_error("Failed to read from signalfd: %m");
962 if (sfsi.ssi_signo == SIGWINCH) {
965 /* The window size changed, let's forward that. */
966 if (ioctl(STDIN_FILENO, TIOCGWINSZ, &ws) >= 0)
967 ioctl(master, TIOCSWINSZ, &ws);
/* Only the first SIGTERM in --boot mode takes this branch;
 * tried_orderly_shutdown keeps later ones out of it. */
968 } else if (sfsi.ssi_signo == SIGTERM && arg_boot && !tried_orderly_shutdown) {
970 log_info("Trying to halt container. Send SIGTERM again to trigger immediate termination.");
972 /* This only works for systemd... */
973 tried_orderly_shutdown = true;
974 kill(pid, SIGRTMIN+3);
/* Second pass: keep shovelling for as long as some step can make
 * progress, i.e. a readable source with an empty buffer or a writable
 * sink with a non-empty buffer. */
984 while ((stdin_readable && in_buffer_full <= 0) ||
985 (master_writable && in_buffer_full > 0) ||
986 (master_readable && out_buffer_full <= 0) ||
987 (stdout_writable && out_buffer_full > 0)) {
/* stdin -> in_buffer: read into the free space behind the data. */
989 if (stdin_readable && in_buffer_full < LINE_MAX) {
991 k = read(STDIN_FILENO, in_buffer + in_buffer_full, LINE_MAX - in_buffer_full);
994 if (errno == EAGAIN || errno == EPIPE || errno == ECONNRESET || errno == EIO)
995 stdin_readable = false;
997 log_error("read(): %m");
1002 in_buffer_full += (size_t) k;
/* in_buffer -> master: after a partial write the unwritten remainder is
 * moved to the front of the buffer. */
1005 if (master_writable && in_buffer_full > 0) {
1007 k = write(master, in_buffer, in_buffer_full);
1010 if (errno == EAGAIN || errno == EPIPE || errno == ECONNRESET || errno == EIO)
1011 master_writable = false;
1013 log_error("write(): %m");
1019 assert(in_buffer_full >= (size_t) k);
1020 memmove(in_buffer, in_buffer + k, in_buffer_full - k);
1021 in_buffer_full -= k;
/* master -> out_buffer */
1025 if (master_readable && out_buffer_full < LINE_MAX) {
1027 k = read(master, out_buffer + out_buffer_full, LINE_MAX - out_buffer_full);
1030 if (errno == EAGAIN || errno == EPIPE || errno == ECONNRESET || errno == EIO)
1031 master_readable = false;
1033 log_error("read(): %m");
1038 out_buffer_full += (size_t) k;
/* out_buffer -> stdout, same partial-write handling as above. */
1041 if (stdout_writable && out_buffer_full > 0) {
1043 k = write(STDOUT_FILENO, out_buffer, out_buffer_full);
1046 if (errno == EAGAIN || errno == EPIPE || errno == ECONNRESET || errno == EIO)
1047 stdout_writable = false;
1049 log_error("write(): %m");
1055 assert(out_buffer_full >= (size_t) k);
1056 memmove(out_buffer, out_buffer + k, out_buffer_full - k);
1057 out_buffer_full -= k;
/* Cleanup: release the epoll instance and the signalfd. */
1065 close_nointr_nofail(ep);
1068 close_nointr_nofail(signal_fd);
/* Entry point. In outline:
 *   1. parse options, determine and sanity-check the container directory;
 *   2. collect passed-in socket activation fds, create a cgroup for the
 *      container, allocate the pty used as console;
 *   3. clone() a child into new namespaces; the child sets up mounts, device
 *      nodes, console, kmsg, boot id, timezone, resolv.conf and journal,
 *      moves into the new root, adjusts capabilities and credentials, builds
 *      the environment and executes init or the requested command;
 *   4. the parent relays the console via process_pty(), waits for the child
 *      and reports how the container ended. */
1073 int main(int argc, char *argv[]) {
1075 int r = EXIT_FAILURE, k;
1076 char *oldcg = NULL, *newcg = NULL;
1077 char **controller = NULL;
1078 int master = -1, n_fd_passed;
1079 const char *console = NULL;
1080 struct termios saved_attr, raw_attr;
/* True once saved_attr holds the original terminal settings of stdin. */
1082 bool saved_attr_valid = false;
/* Socket pair used to hand the kmsg FIFO fd from the child to the parent;
 * the child closes end [0] and gives end [1] to setup_kmsg(). */
1084 int kmsg_socket_pair[2] = { -1, -1 };
1087 log_parse_environment();
1090 r = parse_argv(argc, argv);
/* Use the directory given with -D (made absolute), or the current working
 * directory when none was given. */
1094 if (arg_directory) {
1097 p = path_make_absolute_cwd(arg_directory);
1098 free(arg_directory);
1101 arg_directory = get_current_dir_name();
1103 if (!arg_directory) {
1104 log_error("Failed to determine path");
1108 path_kill_slashes(arg_directory);
/* Preconditions: root privileges, a systemd-booted host, a directory other
 * than "/" that passes the is_os_tree() check. */
1110 if (geteuid() != 0) {
1111 log_error("Need to be root.");
1115 if (sd_booted() <= 0) {
1116 log_error("Not running on a systemd system.");
1120 if (path_equal(arg_directory, "/")) {
1121 log_error("Spawning container on root directory not supported.");
1125 if (is_os_tree(arg_directory) <= 0) {
1126 log_error("Directory %s doesn't look like an OS root directory. Refusing.", arg_directory);
/* File descriptors passed in via socket activation are collected in `fds`
 * so they can be handed on to the container later. */
1131 n_fd_passed = sd_listen_fds(false);
1132 if (n_fd_passed > 0) {
1133 k = fdset_new_listen_fds(&fds, false);
1135 log_error("Failed to collect file descriptors: %s", strerror(-k));
/* presumably closes every other open descriptor -- confirm in fdset.c */
1139 fdset_close_others(fds);
/* The container gets its own cgroup "<current cgroup>/nspawn-<pid>"; the
 * original cgroup is remembered in oldcg for the cleanup at the end. */
1142 k = cg_get_by_pid(SYSTEMD_CGROUP_CONTROLLER, 0, &oldcg);
1144 log_error("Failed to determine current cgroup: %s", strerror(-k));
1148 if (asprintf(&newcg, "%s/nspawn-%lu", oldcg, (unsigned long) getpid()) < 0) {
1149 log_error("Failed to allocate cgroup path.");
1153 k = cg_create_and_attach(SYSTEMD_CGROUP_CONTROLLER, newcg, 0);
1155 log_error("Failed to create cgroup: %s", strerror(-k));
/* Controllers requested with -C: a failure here only produces a warning. */
1159 STRV_FOREACH(controller, arg_controllers) {
1160 k = cg_create_and_attach(*controller, newcg, 0);
1162 log_warning("Failed to create cgroup in controller %s: %s", *controller, strerror(-k));
/* Allocate the pty; its slave side (`console`) becomes the container's
 * /dev/console, the master side is served by process_pty(). */
1165 master = posix_openpt(O_RDWR|O_NOCTTY|O_CLOEXEC|O_NDELAY);
1167 log_error("Failed to acquire pseudo tty: %m");
1171 console = ptsname(master);
1173 log_error("Failed to determine tty name: %m");
1177 log_info("Spawning namespace container on %s (console is %s).", arg_directory, console);
/* Copy the current window size of stdin to the pty, if it can be read. */
1179 if (ioctl(STDIN_FILENO, TIOCGWINSZ, &ws) >= 0)
1180 ioctl(master, TIOCSWINSZ, &ws);
1182 if (unlockpt(master) < 0) {
1183 log_error("Failed to unlock tty: %m");
/* When stdin is a terminal, remember its settings and prepare a raw-mode
 * copy with echo disabled; it is applied right before clone(). */
1187 if (tcgetattr(STDIN_FILENO, &saved_attr) >= 0) {
1188 saved_attr_valid = true;
1190 raw_attr = saved_attr;
1191 cfmakeraw(&raw_attr);
1192 raw_attr.c_lflag &= ~ECHO;
/* NOTE(review): unlike the neighbouring messages this one does not include
 * the errno text (%m). */
1195 if (socketpair(AF_UNIX, SOCK_DGRAM|SOCK_NONBLOCK|SOCK_CLOEXEC, 0, kmsg_socket_pair) < 0) {
1196 log_error("Failed to create kmsg socket pair");
/* Block the signals that process_pty() reads through its signalfd. */
1200 assert_se(sigemptyset(&mask) == 0);
1201 sigset_add_many(&mask, SIGCHLD, SIGWINCH, SIGTERM, SIGINT, -1);
1202 assert_se(sigprocmask(SIG_BLOCK, &mask, NULL) == 0);
1207 if (saved_attr_valid) {
1208 if (tcsetattr(STDIN_FILENO, TCSANOW, &raw_attr) < 0) {
1209 log_error("Failed to set terminal attributes: %m");
/* Raw clone() system call: new IPC, mount, PID and UTS namespaces, plus a
 * new network namespace with --private-network; SIGCHLD is the exit signal. */
1214 pid = syscall(__NR_clone, SIGCHLD|CLONE_NEWIPC|CLONE_NEWNS|CLONE_NEWPID|CLONE_NEWUTS|(arg_private_network ? CLONE_NEWNET : 0), NULL);
1216 if (errno == EINVAL)
1217 log_error("clone() failed, do you have namespace support enabled in your kernel? (You need UTS, IPC, PID and NET namespacing built in): %m");
1219 log_error("clone() failed: %m");
/* ---- child process ---- */
1227 const char *home = NULL;
1228 uid_t uid = (uid_t) -1;
1229 gid_t gid = (gid_t) -1;
/* Environment for the payload. The NULL slots are filled in further down
 * through envp + n_env++. */
1231 const char *envp[] = {
1232 "PATH=/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin",
1233 "container=systemd-nspawn", /* LXC sets container=lxc, so follow the scheme here */
1238 NULL, /* container_uuid */
1239 NULL, /* LISTEN_FDS */
1240 NULL, /* LISTEN_PID */
/* TERM is taken over from our own environment. */
1244 envp[2] = strv_find_prefix(environ, "TERM=");
/* The child does not need the pty master, the inherited stdio or the
 * parent's end of the kmsg socket pair. */
1247 close_nointr_nofail(master);
1250 close_nointr(STDIN_FILENO);
1251 close_nointr(STDOUT_FILENO);
1252 close_nointr(STDERR_FILENO);
1254 close_nointr_nofail(kmsg_socket_pair[0]);
1255 kmsg_socket_pair[0] = -1;
/* Start from default signal dispositions and an empty signal mask. */
1257 reset_all_signal_handlers();
1259 assert_se(sigemptyset(&mask) == 0);
1260 assert_se(sigprocmask(SIG_SETMASK, &mask, NULL) == 0);
/* With fds 0-2 closed above, the console is expected to be opened as
 * fd 0; it is then duplicated to stdout and stderr. */
1262 k = open_terminal(console, O_RDWR);
1263 if (k != STDIN_FILENO) {
1265 close_nointr_nofail(k);
1269 log_error("Failed to open console: %s", strerror(-k));
1273 if (dup2(STDIN_FILENO, STDOUT_FILENO) != STDOUT_FILENO ||
1274 dup2(STDIN_FILENO, STDERR_FILENO) != STDERR_FILENO) {
1275 log_error("Failed to duplicate console: %m");
1280 log_error("setsid() failed: %m");
/* Have the kernel send SIGKILL to the child when the parent dies. */
1284 if (prctl(PR_SET_PDEATHSIG, SIGKILL) < 0) {
1285 log_error("PR_SET_PDEATHSIG failed: %m");
1289 /* Mark everything as slave, so that we still
1290 * receive mounts from the real root, but don't
1291 * propagate mounts to the real root. */
1292 if (mount(NULL, "/", NULL, MS_SLAVE|MS_REC, NULL) < 0) {
1293 log_error("MS_SLAVE|MS_REC failed: %m");
1297 /* Turn directory into bind mount */
1298 if (mount(arg_directory, arg_directory, "bind", MS_BIND|MS_REC, NULL) < 0) {
1299 log_error("Failed to make bind mount.");
/* Read-only remount of the container root. */
1304 if (mount(arg_directory, arg_directory, "bind", MS_BIND|MS_REMOUNT|MS_RDONLY|MS_REC, NULL) < 0) {
1305 log_error("Failed to make read-only.");
/* Populate the container root; each helper logs its own errors. */
1309 if (mount_all(arg_directory) < 0)
1312 if (copy_devnodes(arg_directory) < 0)
1315 dev_setup(arg_directory);
1317 if (setup_dev_console(arg_directory, console) < 0)
1320 if (setup_kmsg(arg_directory, kmsg_socket_pair[1]) < 0)
/* The FIFO fd has been sent; the child's socket end can go. */
1323 close_nointr_nofail(kmsg_socket_pair[1]);
1324 kmsg_socket_pair[1] = -1;
1326 if (setup_boot_id(arg_directory) < 0)
1329 if (setup_timezone(arg_directory) < 0)
1332 if (setup_resolv_conf(arg_directory) < 0)
1335 if (setup_journal(arg_directory) < 0)
/* Switch root: enter the directory, move its mount onto "/", chroot into
 * it and reset the working directory. */
1338 if (chdir(arg_directory) < 0) {
1339 log_error("chdir(%s) failed: %m", arg_directory);
1343 if (mount(arg_directory, "/", NULL, MS_MOVE, NULL) < 0) {
1344 log_error("mount(MS_MOVE) failed: %m");
1348 if (chroot(".") < 0) {
1349 log_error("chroot() failed: %m");
1353 if (chdir("/") < 0) {
1354 log_error("chdir() failed: %m");
/* Shrink the capability bounding set to arg_retain. */
1362 if (drop_capabilities() < 0) {
1363 log_error("drop_capabilities() failed: %m");
1369 /* Note that this resolves user names
1370 * inside the container, and hence
1371 * accesses the NSS modules from the
1372 * container and not the host. This is
1375 if (get_user_creds((const char**)&arg_user, &uid, &gid, &home, NULL) < 0) {
1376 log_error("get_user_creds() failed: %m");
1380 if (mkdir_parents_label(home, 0775) < 0) {
1381 log_error("mkdir_parents_label() failed: %m");
1385 if (mkdir_safe_label(home, 0775, uid, gid) < 0) {
1386 log_error("mkdir_safe_label() failed: %m");
1390 if (initgroups((const char*)arg_user, gid) < 0) {
1391 log_error("initgroups() failed: %m");
1395 if (setresgid(gid, gid, gid) < 0) {
1396 log_error("setregid() failed: %m");
1400 if (setresuid(uid, uid, uid) < 0) {
1401 log_error("setreuid() failed: %m");
1405 /* Reset everything fully to 0, just in case */
1407 if (setgroups(0, NULL) < 0) {
1408 log_error("setgroups() failed: %m");
/* NOTE(review): here and in the -u branch above the calls are
 * setresgid()/setresuid(), but the messages name setregid()/setreuid(). */
1412 if (setresgid(0, 0, 0) < 0) {
1413 log_error("setregid() failed: %m");
1417 if (setresuid(0, 0, 0) < 0) {
1418 log_error("setreuid() failed: %m");
/* Fill the free envp slots; the defaults are /root and root when no user
 * was given. */
1423 if ((asprintf((char**)(envp + n_env++), "HOME=%s", home ? home: "/root") < 0) ||
1424 (asprintf((char**)(envp + n_env++), "USER=%s", arg_user ? arg_user : "root") < 0) ||
1425 (asprintf((char**)(envp + n_env++), "LOGNAME=%s", arg_user ? arg_user : "root") < 0)) {
1431 if (asprintf((char**)(envp + n_env++), "container_uuid=%s", arg_uuid) < 0) {
/* Hand socket activation fds on to the payload: clear O_CLOEXEC and
 * export the LISTEN_FDS / LISTEN_PID variables. */
1437 if (fdset_size(fds) > 0) {
1438 k = fdset_cloexec(fds, false);
1440 log_error("Failed to unset O_CLOEXEC for file descriptors.");
1444 if ((asprintf((char **)(envp + n_env++), "LISTEN_FDS=%u", n_fd_passed) < 0) ||
1445 (asprintf((char **)(envp + n_env++), "LISTEN_PID=%lu", (unsigned long) getpid()) < 0)) {
/* Init search: a[0] is set to one candidate init path after the other,
 * a[1..] are the remaining command line arguments. l counts those
 * arguments plus argv's terminating NULL, which memcpy() copies along. */
1457 /* Automatically search for the init system */
1459 l = 1 + argc - optind;
1460 a = newa(char*, l + 1);
1461 memcpy(a + 1, argv + optind, l * sizeof(char*));
/* execve() only returns on failure, so the next candidate is tried. */
1463 a[0] = (char*) "/usr/lib/systemd/systemd";
1464 execve(a[0], a, (char**) envp);
1466 a[0] = (char*) "/lib/systemd/systemd";
1467 execve(a[0], a, (char**) envp);
1469 a[0] = (char*) "/sbin/init";
1470 execve(a[0], a, (char**) envp);
/* An explicit command was given: look it up via PATH and run it. */
1471 } else if (argc > optind)
1472 execvpe(argv[optind], argv + optind, (char**) envp);
/* Default payload: bash with argv[0] "-bash", started from the home
 * directory. */
1474 chdir(home ? home : "/root");
1475 execle("/bin/bash", "-bash", NULL, (char**) envp);
/* Only reached when none of the exec calls succeeded. */
1478 log_error("execv() failed: %m");
1481 _exit(EXIT_FAILURE);
/* ---- parent process ---- */
1487 if (process_pty(master, pid, &mask) < 0)
1490 if (saved_attr_valid)
1491 tcsetattr(STDIN_FILENO, TCSANOW, &saved_attr);
1493 r = wait_for_terminate(pid, &status);
/* Report how the container ended; a non-zero exit status of the payload
 * is stored in r. */
1499 if (status.si_code == CLD_EXITED) {
1500 if (status.si_status != 0) {
1501 log_error("Container failed with error code %i.", status.si_status);
1502 r = status.si_status;
1506 log_debug("Container exited successfully.");
/* Termination by SIGINT is reported as a shutdown, by SIGHUP as a reboot. */
1508 } else if (status.si_code == CLD_KILLED &&
1509 status.si_status == SIGINT) {
1510 log_info("Container has been shut down.");
1513 } else if (status.si_code == CLD_KILLED &&
1514 status.si_status == SIGHUP) {
1515 log_info("Container is being rebooted.");
1517 } else if (status.si_code == CLD_KILLED ||
1518 status.si_code == CLD_DUMPED) {
1520 log_error("Container terminated by signal %s.", signal_to_string(status.si_status));
1524 log_error("Container failed due to unknown reason.");
/* Cleanup: restore the terminal, close descriptors, move back to the
 * original cgroup, kill what is left in the container's cgroup and free
 * the option strings. */
1531 if (saved_attr_valid)
1532 tcsetattr(STDIN_FILENO, TCSANOW, &saved_attr);
1535 close_nointr_nofail(master);
1537 close_pipe(kmsg_socket_pair);
1540 cg_attach(SYSTEMD_CGROUP_CONTROLLER, oldcg, 0);
1543 cg_kill_recursive_and_wait(SYSTEMD_CGROUP_CONTROLLER, newcg, true);
1545 free(arg_directory);
1546 strv_free(arg_controllers);