1 /*-*- Mode: C; c-basic-offset: 8; indent-tabs-mode: nil -*-*/
4 This file is part of systemd.
6 Copyright 2010 Lennart Poettering
8 systemd is free software; you can redistribute it and/or modify it
9 under the terms of the GNU Lesser General Public License as published by
10 the Free Software Foundation; either version 2.1 of the License, or
11 (at your option) any later version.
13 systemd is distributed in the hope that it will be useful, but
14 WITHOUT ANY WARRANTY; without even the implied warranty of
15 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
16 Lesser General Public License for more details.
18 You should have received a copy of the GNU Lesser General Public License
19 along with systemd; If not, see <http://www.gnu.org/licenses/>.
25 #include <sys/types.h>
26 #include <sys/syscall.h>
27 #include <sys/mount.h>
33 #include <sys/prctl.h>
34 #include <sys/capability.h>
37 #include <sys/epoll.h>
39 #include <sys/signalfd.h>
43 #include <sys/socket.h>
45 #include <systemd/sd-daemon.h>
53 #include "cgroup-util.h"
55 #include "path-util.h"
56 #include "loopback-setup.h"
58 #include "dev-setup.h"
/* Journal link modes; the members referenced in this file are LINK_NO,
 * LINK_AUTO, LINK_GUEST and LINK_HOST. */
67 typedef enum LinkJournal {
/* Command line settings, written by parse_argv(). main() appears to fill in
 * defaults for arg_directory and arg_machine when they are still unset. */
74 static char *arg_directory = NULL;
75 static char *arg_user = NULL;
76 static char **arg_controllers = NULL;
77 static char *arg_uuid = NULL;
78 static char *arg_machine = NULL;
79 static bool arg_private_network = false;
80 static bool arg_read_only = false;
81 static bool arg_boot = false;
82 static LinkJournal arg_link_journal = LINK_AUTO;
/* Bitmask with one (1ULL << CAP_*) bit per capability. --capability= ORs
 * further bits in, and drop_capabilities() passes the complement of this
 * mask to capability_bounding_set_drop(). */
83 static uint64_t arg_retain =
85 (1ULL << CAP_DAC_OVERRIDE) |
86 (1ULL << CAP_DAC_READ_SEARCH) |
87 (1ULL << CAP_FOWNER) |
88 (1ULL << CAP_FSETID) |
89 (1ULL << CAP_IPC_OWNER) |
92 (1ULL << CAP_LINUX_IMMUTABLE) |
93 (1ULL << CAP_NET_BIND_SERVICE) |
94 (1ULL << CAP_NET_BROADCAST) |
95 (1ULL << CAP_NET_RAW) |
96 (1ULL << CAP_SETGID) |
97 (1ULL << CAP_SETFCAP) |
98 (1ULL << CAP_SETPCAP) |
99 (1ULL << CAP_SETUID) |
100 (1ULL << CAP_SYS_ADMIN) |
101 (1ULL << CAP_SYS_CHROOT) |
102 (1ULL << CAP_SYS_NICE) |
103 (1ULL << CAP_SYS_PTRACE) |
104 (1ULL << CAP_SYS_TTY_CONFIG) |
105 (1ULL << CAP_SYS_RESOURCE) |
106 (1ULL << CAP_SYS_BOOT) |
107 (1ULL << CAP_AUDIT_WRITE) |
108 (1ULL << CAP_AUDIT_CONTROL);
/* Flat string lists holding source/destination pairs collected from --bind=
 * and --bind-ro=; mount_binds() walks them pairwise. */
109 static char **arg_bind = NULL;
110 static char **arg_bind_ro = NULL;
/* Prints the usage summary to stdout, using program_invocation_short_name
 * as the program name.
 *
 * NOTE(review): the "-j" line below says it is equivalent to
 * --link-journal=host, while the only visible assignment in parse_argv()
 * between the --capability and --link-journal cases sets LINK_GUEST
 * (presumably the 'j' case) -- confirm which one is intended. */
112 static int help(void) {
114 printf("%s [OPTIONS...] [PATH] [ARGUMENTS...]\n\n"
115 "Spawn a minimal namespace container for debugging, testing and building.\n\n"
116 " -h --help Show this help\n"
117 " --version Print version string\n"
118 " -D --directory=NAME Root directory for the container\n"
119 " -b --boot Boot up full system (i.e. invoke init)\n"
120 " -u --user=USER Run the command under specified user or uid\n"
121 " -C --controllers=LIST Put the container in specified comma-separated\n"
122 " cgroup hierarchies\n"
123 " --uuid=UUID Set a specific machine UUID for the container\n"
124 " -M --machine=NAME Set the machine name for the container\n"
125 " --private-network Disable network in container\n"
126 " --read-only Mount the root directory read-only\n"
127 " --capability=CAP In addition to the default, retain specified\n"
129 " --link-journal=MODE Link up guest journal, one of no, auto, guest, host\n"
130 " -j Equivalent to --link-journal=host\n"
131 " --bind=PATH[:PATH] Bind mount a file or directory from the host into\n"
133 " --bind-ro=PATH[:PATH] Similar, but creates a read-only bind mount\n",
134 program_invocation_short_name);
/* Parses the command line with getopt_long() and stores the results in the
 * arg_* globals. The return convention is not visible in this excerpt; the
 * caller in main() stores the result in r. */
139 static int parse_argv(int argc, char *argv[]) {
152 static const struct option options[] = {
153 { "help", no_argument, NULL, 'h' },
154 { "version", no_argument, NULL, ARG_VERSION },
155 { "directory", required_argument, NULL, 'D' },
156 { "user", required_argument, NULL, 'u' },
157 { "controllers", required_argument, NULL, 'C' },
158 { "private-network", no_argument, NULL, ARG_PRIVATE_NETWORK },
159 { "boot", no_argument, NULL, 'b' },
160 { "uuid", required_argument, NULL, ARG_UUID },
161 { "read-only", no_argument, NULL, ARG_READ_ONLY },
162 { "capability", required_argument, NULL, ARG_CAPABILITY },
163 { "link-journal", required_argument, NULL, ARG_LINK_JOURNAL },
164 { "bind", required_argument, NULL, ARG_BIND },
165 { "bind-ro", required_argument, NULL, ARG_BIND_RO },
166 { "machine", required_argument, NULL, 'M' },
/* NOTE(review): the short option string below contains no "M:" although the
 * table above maps --machine to 'M' and help() advertises "-M". As written,
 * getopt_long() would not accept the short form -M NAME -- confirm and add
 * "M:" if the short option is meant to work. */
175 while ((c = getopt_long(argc, argv, "+hD:u:C:bj", options, NULL)) >= 0) {
184 puts(PACKAGE_STRING);
185 puts(SYSTEMD_FEATURES);
/* Root directory (presumably the 'D' case): keep the canonicalized path. */
190 arg_directory = canonicalize_file_name(optarg);
191 if (!arg_directory) {
192 log_error("Failed to canonicalize root directory.");
200 arg_user = strdup(optarg);
/* Controller list: drop any earlier list, then split the new one at commas. */
207 strv_free(arg_controllers);
208 arg_controllers = strv_split(optarg, ",");
209 if (!arg_controllers)
212 cg_shorten_controllers(arg_controllers);
215 case ARG_PRIVATE_NETWORK:
216 arg_private_network = true;
/* Machine name: must pass hostname_is_valid() before a copy is stored. */
228 if (!hostname_is_valid(optarg)) {
229 log_error("Invalid machine name: %s", optarg);
234 arg_machine = strdup(optarg);
241 arg_read_only = true;
244 case ARG_CAPABILITY: {
/* Each comma-separated word is looked up with cap_from_name() and its bit
 * is added to arg_retain. */
248 FOREACH_WORD_SEPARATOR(word, length, optarg, ",", state) {
252 t = strndup(word, length);
256 if (cap_from_name(t, &cap) < 0) {
257 log_error("Failed to parse capability %s.", t);
263 arg_retain |= 1ULL << (uint64_t) cap;
/* NOTE(review): presumably the 'j' case (its label is not visible here).
 * It selects LINK_GUEST, whereas help() documents -j as equivalent to
 * --link-journal=host -- confirm. */
270 arg_link_journal = LINK_GUEST;
273 case ARG_LINK_JOURNAL:
274 if (streq(optarg, "auto"))
275 arg_link_journal = LINK_AUTO;
276 else if (streq(optarg, "no"))
277 arg_link_journal = LINK_NO;
278 else if (streq(optarg, "guest"))
279 arg_link_journal = LINK_GUEST;
280 else if (streq(optarg, "host"))
281 arg_link_journal = LINK_HOST;
283 log_error("Failed to parse link journal mode %s", optarg);
/* --bind / --bind-ro: x selects the list matching the option. The text in
 * front of ':' becomes a (how b is derived is not visible in this excerpt).
 * Both must be absolute paths; a and b are then appended in that order. */
291 _cleanup_free_ char *a = NULL, *b = NULL;
296 x = c == ARG_BIND ? &arg_bind : &arg_bind_ro;
298 e = strchr(optarg, ':');
300 a = strndup(optarg, e - optarg);
310 if (!path_is_absolute(a) || !path_is_absolute(b)) {
311 log_error("Invalid bind mount specification: %s", optarg);
315 r = strv_extend(x, a);
319 r = strv_extend(x, b);
330 log_error("Unknown option code %c", c);
/* Mounts the entries of mount_table below the container root dest.
 *
 * The MountPoint fields used by the loop are what, where, flags, options and
 * fatal. Entries whose first field is NULL carry MS_REMOUNT and follow the
 * bind mount they turn read-only. */
338 static int mount_all(const char *dest) {
340 typedef struct MountPoint {
349 static const MountPoint mount_table[] = {
350 { "proc", "/proc", "proc", NULL, MS_NOSUID|MS_NOEXEC|MS_NODEV, true },
351 { "/proc/sys", "/proc/sys", NULL, NULL, MS_BIND, true }, /* Bind mount first */
352 { NULL, "/proc/sys", NULL, NULL, MS_BIND|MS_RDONLY|MS_REMOUNT, true }, /* Then, make it r/o */
353 { "sysfs", "/sys", "sysfs", NULL, MS_RDONLY|MS_NOSUID|MS_NOEXEC|MS_NODEV, true },
354 { "tmpfs", "/dev", "tmpfs", "mode=755", MS_NOSUID|MS_STRICTATIME, true },
355 { "devpts", "/dev/pts", "devpts","newinstance,ptmxmode=0666,mode=620,gid=" STRINGIFY(TTY_GID), MS_NOSUID|MS_NOEXEC, true },
356 { "tmpfs", "/dev/shm", "tmpfs", "mode=1777", MS_NOSUID|MS_NODEV|MS_STRICTATIME, true },
357 { "tmpfs", "/run", "tmpfs", "mode=755", MS_NOSUID|MS_NODEV|MS_STRICTATIME, true },
359 { "/sys/fs/selinux", "/sys/fs/selinux", NULL, NULL, MS_BIND, false }, /* Bind mount first */
360 { NULL, "/sys/fs/selinux", NULL, NULL, MS_BIND|MS_RDONLY|MS_REMOUNT, false }, /* Then, make it r/o */
367 for (k = 0; k < ELEMENTSOF(mount_table); k++) {
368 char _cleanup_free_ *where = NULL;
/* Target path inside the container tree. */
371 where = strjoin(dest, "/", mount_table[k].where, NULL);
/* t > 0 means where is already a mount point; a negative t is an error
 * code (it is formatted with strerror(-t) below). */
375 t = path_is_mount_point(where, true);
377 log_error("Failed to detect whether %s is a mount point: %s", where, strerror(-t));
385 /* Skip this entry if it is not a remount. */
386 if (mount_table[k].what && t > 0)
389 mkdir_p(where, 0755);
/* A failing mount() is only reported for entries marked fatal. */
391 if (mount(mount_table[k].what,
394 mount_table[k].flags,
395 mount_table[k].options) < 0 &&
396 mount_table[k].fatal) {
398 log_error("mount(%s) failed: %m", where);
/* Bind mounts every (source, destination) pair of the list l into the
 * container tree: *x is mounted onto <dest>/<*y>. If flags is non-zero the
 * new mount is remounted with MS_REMOUNT|MS_BIND|flags (main() passes
 * MS_RDONLY for the --bind-ro list). */
408 static int mount_binds(const char *dest, char **l, unsigned long flags) {
411 STRV_FOREACH_PAIR(x, y, l) {
412 _cleanup_free_ char *where = NULL;
414 where = strjoin(dest, "/", *y, NULL);
/* Make sure the mount target exists; the result is not checked here. */
418 mkdir_p_label(where, 0755);
420 if (mount(*x, where, "bind", MS_BIND, NULL) < 0) {
421 log_error("mount(%s) failed: %m", where);
425 if (flags && mount(NULL, where, NULL, MS_REMOUNT|MS_BIND|flags, NULL) < 0) {
426 log_error("mount(%s) failed: %m", where);
/* Points <dest>/etc/localtime at the same zone the host's /etc/localtime
 * symlink refers to, provided that zone file exists inside the container.
 * Most problems are only logged as warnings; a failing symlink() is logged
 * as an error. Return values are not visible in this excerpt. */
434 static int setup_timezone(const char *dest) {
435 _cleanup_free_ char *where = NULL, *p = NULL, *q = NULL, *check = NULL, *what = NULL;
441 /* Fix the timezone, if possible */
442 r = readlink_malloc("/etc/localtime", &p);
444 log_warning("/etc/localtime is not a symlink, not updating container timezone.");
/* z: zone name relative to /usr/share/zoneinfo/ as configured on the host
 * (both the relative and the absolute link form are tried). */
448 z = path_startswith(p, "../usr/share/zoneinfo/");
450 z = path_startswith(p, "/usr/share/zoneinfo/");
452 log_warning("/etc/localtime does not point into /usr/share/zoneinfo/, not updating container timezone.");
456 where = strappend(dest, "/etc/localtime");
/* y: the same, for the link currently present in the container. */
460 r = readlink_malloc(where, &q);
462 y = path_startswith(q, "../usr/share/zoneinfo/");
464 y = path_startswith(q, "/usr/share/zoneinfo/");
467 /* Already pointing to the right place? Then do nothing .. */
468 if (y && streq(y, z))
/* Only switch if the zone file is present in the container tree. */
472 check = strjoin(dest, "/usr/share/zoneinfo/", z, NULL);
476 if (access(check, F_OK) < 0) {
477 log_warning("Timezone %s does not exist in container, not updating container timezone.", z);
/* The new link uses the relative "../usr/share/zoneinfo/" form. */
481 what = strappend("../usr/share/zoneinfo/", z);
486 if (symlink(what, where) < 0) {
487 log_error("Failed to correct timezone of container: %m");
/* Bind mounts the host's /etc/resolv.conf over <dest>/etc/resolv.conf and
 * then tries to remount it read-only. The mount results are deliberately
 * not treated as errors. When arg_private_network is set the function takes
 * an early branch (its body is not visible here; presumably it returns
 * without mounting). */
494 static int setup_resolv_conf(const char *dest) {
499 if (arg_private_network)
502 /* Fix resolv.conf, if possible */
503 where = strappend(dest, "/etc/resolv.conf");
507 /* We don't really care for the results of this really. If it
508 * fails, it fails, but meh... */
509 if (mount("/etc/resolv.conf", where, "bind", MS_BIND, NULL) >= 0)
510 mount("/etc/resolv.conf", where, "bind", MS_BIND|MS_REMOUNT|MS_RDONLY, NULL);
/* Gives the container its own boot ID: a random 128 bit ID is formatted as a
 * UUID string, written to <dest>/dev/proc-sys-kernel-random-boot-id and that
 * file is bind mounted over <dest>/proc/sys/kernel/random/boot_id. Failing to
 * make the mount read-only only produces a warning. */
517 static int setup_boot_id(const char *dest) {
518 char _cleanup_free_ *from = NULL, *to = NULL;
525 /* Generate a new randomized boot ID, so that each boot-up of
526 * the container gets a new one */
528 from = strappend(dest, "/dev/proc-sys-kernel-random-boot-id");
529 to = strappend(dest, "/proc/sys/kernel/random/boot_id");
533 r = sd_id128_randomize(&rnd);
535 log_error("Failed to generate random boot id: %s", strerror(-r));
/* Render the 16 bytes in the 8-4-4-4-12 hex digit UUID layout. */
539 snprintf(as_uuid, sizeof(as_uuid),
540 "%02x%02x%02x%02x-%02x%02x-%02x%02x-%02x%02x-%02x%02x%02x%02x%02x%02x",
541 SD_ID128_FORMAT_VAL(rnd));
542 char_array_0(as_uuid);
544 r = write_string_file(from, as_uuid);
546 log_error("Failed to write boot id: %s", strerror(-r));
550 if (mount(from, to, "bind", MS_BIND, NULL) < 0) {
551 log_error("Failed to bind mount boot id: %m");
553 } else if (mount(from, to, "bind", MS_BIND|MS_REMOUNT|MS_RDONLY, NULL))
554 log_warning("Failed to make boot id read-only: %m");
/* Recreates a fixed set of host device nodes inside the container: for each
 * name in devnodes (the list itself is not visible in this excerpt) the host
 * node /dev/<name> is stat()ed and a node with the same mode and device
 * number is created at <dest>/dev/<name>. */
560 static int copy_devnodes(const char *dest) {
562 static const char devnodes[] =
572 mode_t _cleanup_umask_ u;
578 NULSTR_FOREACH(d, devnodes) {
580 char _cleanup_free_ *from = NULL, *to = NULL;
/* NOTE(review): the asprintf() results are not checked on the visible
 * lines; a check may exist on lines missing from this excerpt. */
582 asprintf(&from, "/dev/%s", d);
583 asprintf(&to, "%s/dev/%s", dest, d);
594 if (stat(from, &st) < 0) {
/* A node missing on the host (ENOENT) is not reported as an error. */
596 if (errno != ENOENT) {
597 log_error("Failed to stat %s: %m", from);
602 } else if (!S_ISCHR(st.st_mode) && !S_ISBLK(st.st_mode)) {
604 log_error("%s is not a char or block device, cannot copy", from);
608 } else if (mknod(to, st.st_mode, st.st_rdev) < 0) {
/* NOTE(review): the message prints dest although the failing path is to. */
610 log_error("mknod(%s) failed: %m", dest);
/* Creates <dest>/dev/ptmx as a symlink to the relative target "pts/ptmx". */
619 static int setup_ptmx(const char *dest) {
620 _cleanup_free_ char *p = NULL;
622 p = strappend(dest, "/dev/ptmx");
626 if (symlink("pts/ptmx", p) < 0) {
627 log_error("Failed to create /dev/ptmx symlink: %m");
/* Makes the tty named by console available as <dest>/dev/console.
 *
 * console must be a character device. Its access mode and ownership are set
 * to 0600, uid 0, gid 0, then a character device node is created at
 * <dest>/dev/console and console is bind mounted on top of it. */
634 static int setup_dev_console(const char *dest, const char *console) {
636 char _cleanup_free_ *to = NULL;
638 mode_t _cleanup_umask_ u;
645 if (stat(console, &st) < 0) {
646 log_error("Failed to stat %s: %m", console);
649 } else if (!S_ISCHR(st.st_mode)) {
650 log_error("/dev/console is not a char device");
654 r = chmod_and_chown(console, 0600, 0, 0);
656 log_error("Failed to correct access mode for TTY: %s", strerror(-r));
660 if (asprintf(&to, "%s/dev/console", dest) < 0)
663 /* We need to bind mount the right tty to /dev/console since
664 * ptys can only exist on pts file systems. To have something
665 * to bind mount things on we create a device node first, that
666 * has the right major/minor (note that the major minor
667 * doesn't actually matter here, since we mount it over
670 if (mknod(to, (st.st_mode & ~07777) | 0600, st.st_rdev) < 0) {
671 log_error("mknod() for /dev/console failed: %m");
675 if (mount(console, to, "bind", MS_BIND, NULL) < 0) {
676 log_error("Bind mount for /dev/console failed: %m");
683 static int setup_kmsg(const char *dest, int kmsg_socket) {
684 char _cleanup_free_ *from = NULL, *to = NULL;
686 mode_t _cleanup_umask_ u;
688 struct cmsghdr cmsghdr;
689 uint8_t buf[CMSG_SPACE(sizeof(int))];
692 .msg_control = &control,
693 .msg_controllen = sizeof(control),
695 struct cmsghdr *cmsg;
698 assert(kmsg_socket >= 0);
702 /* We create the kmsg FIFO as /dev/kmsg, but immediately
703 * delete it after bind mounting it to /proc/kmsg. While FIFOs
704 * on the reading side behave very similar to /proc/kmsg,
705 * their writing side behaves differently from /dev/kmsg in
706 * that writing blocks when nothing is reading. In order to
707 * avoid any problems with containers deadlocking due to this
708 * we simply make /dev/kmsg unavailable to the container. */
/* from is the FIFO path <dest>/dev/kmsg, to the bind mount target
 * <dest>/proc/kmsg. */
709 if (asprintf(&from, "%s/dev/kmsg", dest) < 0 ||
710 asprintf(&to, "%s/proc/kmsg", dest) < 0)
713 if (mkfifo(from, 0600) < 0) {
714 log_error("mkfifo() for /dev/kmsg failed: %m");
718 r = chmod_and_chown(from, 0600, 0, 0);
720 log_error("Failed to correct access mode for /dev/kmsg: %s", strerror(-r));
724 if (mount(from, to, "bind", MS_BIND, NULL) < 0) {
725 log_error("Bind mount for /proc/kmsg failed: %m");
/* Open the FIFO read/write and non-blocking. */
729 fd = open(from, O_RDWR|O_NDELAY|O_CLOEXEC);
731 log_error("Failed to open fifo: %m");
/* Wrap the fd in a single SCM_RIGHTS control message. */
735 cmsg = CMSG_FIRSTHDR(&mh);
736 cmsg->cmsg_level = SOL_SOCKET;
737 cmsg->cmsg_type = SCM_RIGHTS;
738 cmsg->cmsg_len = CMSG_LEN(sizeof(int));
739 memcpy(CMSG_DATA(cmsg), &fd, sizeof(int));
741 mh.msg_controllen = cmsg->cmsg_len;
743 /* Store away the fd in the socket, so that it stays open as
744 * long as we run the child */
745 k = sendmsg(kmsg_socket, &mh, MSG_DONTWAIT|MSG_NOSIGNAL);
/* The local descriptor is closed whether or not sendmsg() succeeded. */
746 close_nointr_nofail(fd);
749 log_error("Failed to send FIFO fd: %m");
753 /* And now make the FIFO unavailable as /dev/kmsg... */
/* Sets the host name to arg_machine with sethostname(). The handling of a
 * failure is not visible in this excerpt. */
758 static int setup_hostname(void) {
760 if (sethostname(arg_machine, strlen(arg_machine)) < 0)
/* Connects the journal directory of the container with the host according to
 * arg_link_journal. Nothing is done for LINK_NO. Otherwise the machine ID is
 * read from <directory>/etc/machine-id and two paths are derived from it:
 *   p = /var/log/journal/<id>              (on the host)
 *   q = <directory>/var/log/journal/<id>   (inside the container tree)
 * Several conditions and early exits are on lines missing from this excerpt,
 * so the comments below describe the visible statements only. */
766 static int setup_journal(const char *directory) {
767 sd_id128_t machine_id;
768 char _cleanup_free_ *p = NULL, *b = NULL, *q = NULL, *d = NULL;
772 if (arg_link_journal == LINK_NO)
775 p = strappend(directory, "/etc/machine-id");
779 r = read_one_line_file(p, &b);
/* A missing machine-id file is special-cased in LINK_AUTO mode. */
780 if (r == -ENOENT && arg_link_journal == LINK_AUTO)
783 log_error("Failed to read machine ID from %s: %s", p, strerror(-r));
/* id is presumably derived from the line read into b (the assignment is
 * not visible here); an empty ID is special-cased in LINK_AUTO mode. */
788 if (isempty(id) && arg_link_journal == LINK_AUTO)
791 /* Verify validity */
792 r = sd_id128_from_string(id, &machine_id);
794 log_error("Failed to parse machine ID from %s: %s", p, strerror(-r));
/* From here on p names the host side journal directory. */
799 p = strappend("/var/log/journal/", id);
800 q = strjoin(directory, "/var/log/journal/", id, NULL);
/* If either path is already a mount point, that is reported as an error
 * unless the mode is LINK_AUTO. */
804 if (path_is_mount_point(p, false) > 0) {
805 if (arg_link_journal != LINK_AUTO) {
806 log_error("%s: already a mount point, refusing to use for journal", p);
813 if (path_is_mount_point(q, false) > 0) {
814 if (arg_link_journal != LINK_AUTO) {
815 log_error("%s: already a mount point, refusing to use for journal", q);
/* Inspect what currently exists at p: the branches below distinguish a
 * readable symlink, -EINVAL, -ENOENT and other errors. */
822 r = readlink_and_make_absolute(p, &d);
824 if ((arg_link_journal == LINK_GUEST ||
825 arg_link_journal == LINK_AUTO) &&
828 r = mkdir_p(q, 0755);
830 log_warning("failed to create directory %s: %m", q);
835 log_error("Failed to remove symlink %s: %m", p);
838 } else if (r == -EINVAL) {
840 if (arg_link_journal == LINK_GUEST &&
843 if (errno == ENOTDIR) {
844 log_error("%s already exists and is neither a symlink nor a directory", p);
847 log_error("Failed to remove %s: %m", p);
851 } else if (r != -ENOENT) {
852 log_error("readlink(%s) failed: %m", p);
/* LINK_GUEST: the host path p becomes a symlink to the guest directory q. */
856 if (arg_link_journal == LINK_GUEST) {
858 if (symlink(q, p) < 0) {
859 log_error("Failed to symlink %s to %s: %m", q, p);
863 r = mkdir_p(q, 0755);
865 log_warning("failed to create directory %s: %m", q);
/* LINK_HOST: make sure the host directory p exists. */
869 if (arg_link_journal == LINK_HOST) {
870 r = mkdir_p(p, 0755);
872 log_error("Failed to create %s: %m", p);
876 } else if (access(p, F_OK) < 0)
/* dir_is_empty(q) == 0 is reported as an error: q holds entries. */
879 if (dir_is_empty(q) == 0) {
880 log_error("%s not empty.", q);
884 r = mkdir_p(q, 0755);
886 log_error("Failed to create %s: %m", q);
/* Finally the host directory p is bind mounted onto the guest path q. */
890 if (mount(p, q, "bind", MS_BIND, NULL) < 0) {
891 log_error("Failed to bind mount journal from host into guest: %m");
/* Calls cg_create_and_attach() for path in SYSTEMD_CGROUP_CONTROLLER and then
 * in every controller listed in arg_controllers. A failure in the former is
 * logged as an error, a failure in one of the latter only as a warning. */
898 static int setup_cgroup(const char *path) {
902 r = cg_create_and_attach(SYSTEMD_CGROUP_CONTROLLER, path, 1);
904 log_error("Failed to create cgroup: %s", strerror(-r));
908 STRV_FOREACH(c, arg_controllers) {
909 r = cg_create_and_attach(*c, path, 1);
911 log_warning("Failed to create cgroup in controller %s: %s", *c, strerror(-r));
/* Passes the complement of arg_retain to capability_bounding_set_drop() and
 * returns that function's result. */
917 static int drop_capabilities(void) {
918 return capability_bounding_set_drop(~arg_retain, false);
/* Heuristic check whether path looks like an OS root directory. The check is
 * based on <path>/bin/sh; the call that sets r is not visible here
 * (presumably an existence test). Returns 1 if r is non-negative, otherwise
 * 0; the value returned when asprintf() fails is not visible in this
 * excerpt. */
921 static int is_os_tree(const char *path) {
924 /* We use /bin/sh as flag file if something is an OS */
926 if (asprintf(&p, "%s/bin/sh", path) < 0)
932 return r < 0 ? 0 : 1;
/* Relays data between our stdin/stdout and the pty master of the container
 * and reacts to signals delivered through a signalfd.
 *
 * master: pty master file descriptor
 * pid:    process that receives SIGRTMIN+3 on the first SIGTERM in boot mode
 * mask:   signal set handed to signalfd()
 *
 * The conditions that end the loop and the return values are on lines that
 * are missing from this excerpt; main() treats a negative result as failure. */
935 static int process_pty(int master, pid_t pid, sigset_t *mask) {
937 char in_buffer[LINE_MAX], out_buffer[LINE_MAX];
938 size_t in_buffer_full = 0, out_buffer_full = 0;
939 struct epoll_event stdin_ev, stdout_ev, master_ev, signal_ev;
940 bool stdin_readable = false, stdout_writable = false, master_readable = false, master_writable = false;
941 int ep = -1, signal_fd = -1, r;
942 bool tried_orderly_shutdown = false;
/* All three data descriptors are switched to non-blocking mode. */
948 fd_nonblock(STDIN_FILENO, 1);
949 fd_nonblock(STDOUT_FILENO, 1);
950 fd_nonblock(master, 1);
952 signal_fd = signalfd(-1, mask, SFD_NONBLOCK|SFD_CLOEXEC);
954 log_error("signalfd(): %m");
959 ep = epoll_create1(EPOLL_CLOEXEC);
961 log_error("Failed to create epoll: %m");
966 /* We read from STDIN only if this is actually a TTY,
967 * otherwise we assume non-interactivity. */
968 if (isatty(STDIN_FILENO)) {
970 stdin_ev.events = EPOLLIN|EPOLLET;
971 stdin_ev.data.fd = STDIN_FILENO;
973 if (epoll_ctl(ep, EPOLL_CTL_ADD, STDIN_FILENO, &stdin_ev) < 0) {
974 log_error("Failed to register STDIN in epoll: %m");
/* The data descriptors are registered edge-triggered (EPOLLET); the
 * *_readable / *_writable flags remember readiness until a read() or
 * write() error such as EAGAIN clears them again. The signalfd is
 * registered without EPOLLET. */
981 stdout_ev.events = EPOLLOUT|EPOLLET;
982 stdout_ev.data.fd = STDOUT_FILENO;
985 master_ev.events = EPOLLIN|EPOLLOUT|EPOLLET;
986 master_ev.data.fd = master;
989 signal_ev.events = EPOLLIN;
990 signal_ev.data.fd = signal_fd;
992 if (epoll_ctl(ep, EPOLL_CTL_ADD, STDOUT_FILENO, &stdout_ev) < 0) {
993 if (errno != EPERM) {
994 log_error("Failed to register stdout in epoll: %m");
998 /* stdout without epoll support. Likely redirected to regular file. */
999 stdout_writable = true;
1002 if (epoll_ctl(ep, EPOLL_CTL_ADD, master, &master_ev) < 0 ||
1003 epoll_ctl(ep, EPOLL_CTL_ADD, signal_fd, &signal_ev) < 0) {
1004 log_error("Failed to register fds in epoll: %m");
1010 struct epoll_event ev[16];
/* Wait without timeout; EINTR and EAGAIN are not treated as errors. */
1014 nfds = epoll_wait(ep, ev, ELEMENTSOF(ev), -1);
1017 if (errno == EINTR || errno == EAGAIN)
1020 log_error("epoll_wait(): %m");
/* Translate the reported events into the readiness flags. */
1027 for (i = 0; i < nfds; i++) {
1028 if (ev[i].data.fd == STDIN_FILENO) {
1030 if (ev[i].events & (EPOLLIN|EPOLLHUP))
1031 stdin_readable = true;
1033 } else if (ev[i].data.fd == STDOUT_FILENO) {
1035 if (ev[i].events & (EPOLLOUT|EPOLLHUP))
1036 stdout_writable = true;
1038 } else if (ev[i].data.fd == master) {
1040 if (ev[i].events & (EPOLLIN|EPOLLHUP))
1041 master_readable = true;
1043 if (ev[i].events & (EPOLLOUT|EPOLLHUP))
1044 master_writable = true;
1046 } else if (ev[i].data.fd == signal_fd) {
1047 struct signalfd_siginfo sfsi;
/* A read that does not return exactly one signalfd_siginfo is
 * reported as an error, except that EINTR and EAGAIN are exempt
 * from the errno-based error message below. */
1050 n = read(signal_fd, &sfsi, sizeof(sfsi));
1051 if (n != sizeof(sfsi)) {
1054 log_error("Failed to read from signalfd: invalid block size");
1059 if (errno != EINTR && errno != EAGAIN) {
1060 log_error("Failed to read from signalfd: %m");
1066 if (sfsi.ssi_signo == SIGWINCH) {
1069 /* The window size changed, let's forward that. */
1070 if (ioctl(STDIN_FILENO, TIOCGWINSZ, &ws) >= 0)
1071 ioctl(master, TIOCSWINSZ, &ws);
/* Only the first SIGTERM in boot mode takes this branch: the flag
 * tried_orderly_shutdown is set before SIGRTMIN+3 is sent to pid. */
1072 } else if (sfsi.ssi_signo == SIGTERM && arg_boot && !tried_orderly_shutdown) {
1074 log_info("Trying to halt container. Send SIGTERM again to trigger immediate termination.");
1076 /* This only works for systemd... */
1077 tried_orderly_shutdown = true;
1078 kill(pid, SIGRTMIN+3);
/* Relay loop: keeps going while at least one of the four transfers
 * (stdin -> in_buffer -> master, master -> out_buffer -> stdout) can make
 * progress. */
1088 while ((stdin_readable && in_buffer_full <= 0) ||
1089 (master_writable && in_buffer_full > 0) ||
1090 (master_readable && out_buffer_full <= 0) ||
1091 (stdout_writable && out_buffer_full > 0)) {
1093 if (stdin_readable && in_buffer_full < LINE_MAX) {
1095 k = read(STDIN_FILENO, in_buffer + in_buffer_full, LINE_MAX - in_buffer_full);
/* These errno values only clear the readiness flag; other errors
 * are logged. */
1098 if (errno == EAGAIN || errno == EPIPE || errno == ECONNRESET || errno == EIO)
1099 stdin_readable = false;
1101 log_error("read(): %m");
1106 in_buffer_full += (size_t) k;
1109 if (master_writable && in_buffer_full > 0) {
1111 k = write(master, in_buffer, in_buffer_full);
1114 if (errno == EAGAIN || errno == EPIPE || errno == ECONNRESET || errno == EIO)
1115 master_writable = false;
1117 log_error("write(): %m");
/* After a (possibly partial) write the unwritten remainder is moved
 * to the front of the buffer. */
1123 assert(in_buffer_full >= (size_t) k);
1124 memmove(in_buffer, in_buffer + k, in_buffer_full - k);
1125 in_buffer_full -= k;
1129 if (master_readable && out_buffer_full < LINE_MAX) {
1131 k = read(master, out_buffer + out_buffer_full, LINE_MAX - out_buffer_full);
1134 if (errno == EAGAIN || errno == EPIPE || errno == ECONNRESET || errno == EIO)
1135 master_readable = false;
1137 log_error("read(): %m");
1142 out_buffer_full += (size_t) k;
1145 if (stdout_writable && out_buffer_full > 0) {
1147 k = write(STDOUT_FILENO, out_buffer, out_buffer_full);
1150 if (errno == EAGAIN || errno == EPIPE || errno == ECONNRESET || errno == EIO)
1151 stdout_writable = false;
1153 log_error("write(): %m");
1159 assert(out_buffer_full >= (size_t) k);
1160 memmove(out_buffer, out_buffer + k, out_buffer_full - k);
1161 out_buffer_full -= k;
/* Cleanup: release the epoll instance and the signalfd. */
1169 close_nointr_nofail(ep);
1172 close_nointr_nofail(signal_fd);
/* Entry point: validates the environment, allocates a pty, clones a child
 * into new namespaces, lets the child build the container file system and
 * exec the payload, and relays the console in the parent until the child
 * terminates.
 *
 * Many conditions, labels and jumps are on lines that are missing from this
 * excerpt; the comments below describe the visible statements only. */
1177 int main(int argc, char *argv[]) {
1179 int r = EXIT_FAILURE, k;
1180 _cleanup_free_ char *machine_root = NULL, *newcg = NULL;
1181 _cleanup_close_ int master = -1;
1183 const char *console = NULL;
1184 struct termios saved_attr, raw_attr;
1186 bool saved_attr_valid = false;
1188 int kmsg_socket_pair[2] = { -1, -1 };
1191 log_parse_environment();
1194 r = parse_argv(argc, argv);
/* A directory given on the command line is made absolute; otherwise the
 * current working directory is used (presumably in the else branch). */
1198 if (arg_directory) {
1201 p = path_make_absolute_cwd(arg_directory);
1202 free(arg_directory);
1205 arg_directory = get_current_dir_name();
1207 if (!arg_directory) {
1208 log_error("Failed to determine path");
1212 path_kill_slashes(arg_directory);
/* Derive a machine name from the last path component of the directory
 * (presumably only when none was given on the command line). */
1215 arg_machine = strdup(path_get_file_name(arg_directory));
1221 hostname_cleanup(arg_machine);
1222 if (isempty(arg_machine)) {
1223 log_error("Failed to determine machine name automatically, please use -M.");
/* Preconditions: effective uid 0, systemd as init, not the host root
 * directory, and a tree that passes is_os_tree(). */
1228 if (geteuid() != 0) {
1229 log_error("Need to be root.");
1233 if (sd_booted() <= 0) {
1234 log_error("Not running on a systemd system.");
1238 if (path_equal(arg_directory, "/")) {
1239 log_error("Spawning container on root directory not supported.");
1243 if (is_os_tree(arg_directory) <= 0) {
1244 log_error("Directory %s doesn't look like an OS root directory. Refusing.", arg_directory);
/* File descriptors received via sd_listen_fds() are collected in fds so
 * that they can be handed on to the payload later. */
1249 n_fd_passed = sd_listen_fds(false);
1250 if (n_fd_passed > 0) {
1251 k = fdset_new_listen_fds(&fds, false);
1253 log_error("Failed to collect file descriptors: %s", strerror(-k));
1257 fdset_close_others(fds);
/* The container's cgroup path is <machine_root>/<arg_machine>. */
1260 k = cg_get_machine_path(&machine_root);
1262 log_error("Failed to determine machine cgroup path: %s", strerror(-k));
1266 newcg = strjoin(machine_root, "/", arg_machine, NULL);
1268 log_error("Failed to allocate cgroup path.");
/* Any result <= 0 other than -ENOENT is reported as "already running". */
1272 r = cg_is_empty_recursive(SYSTEMD_CGROUP_CONTROLLER, newcg, false);
1273 if (r <= 0 && r != -ENOENT) {
1274 log_error("Container already running.");
/* Allocate the pty whose slave side becomes the container console. */
1282 master = posix_openpt(O_RDWR|O_NOCTTY|O_CLOEXEC|O_NDELAY);
1284 log_error("Failed to acquire pseudo tty: %m");
1288 console = ptsname(master);
1290 log_error("Failed to determine tty name: %m");
1294 log_info("Spawning namespace container on %s (console is %s).", arg_directory, console);
/* Copy our window size to the pty, if stdin has one. */
1296 if (ioctl(STDIN_FILENO, TIOCGWINSZ, &ws) >= 0)
1297 ioctl(master, TIOCSWINSZ, &ws);
1299 if (unlockpt(master) < 0) {
1300 log_error("Failed to unlock tty: %m");
/* Remember the terminal settings of stdin and prepare a raw, non-echoing
 * copy; saved_attr_valid gates every later tcsetattr() call. */
1304 if (tcgetattr(STDIN_FILENO, &saved_attr) >= 0) {
1305 saved_attr_valid = true;
1307 raw_attr = saved_attr;
1308 cfmakeraw(&raw_attr);
1309 raw_attr.c_lflag &= ~ECHO;
/* Socket pair used by setup_kmsg() in the child to pass the kmsg FIFO fd. */
1312 if (socketpair(AF_UNIX, SOCK_DGRAM|SOCK_NONBLOCK|SOCK_CLOEXEC, 0, kmsg_socket_pair) < 0) {
1313 log_error("Failed to create kmsg socket pair.");
/* Block the signals that process_pty() later reads through its signalfd. */
1317 assert_se(sigemptyset(&mask) == 0);
1318 sigset_add_many(&mask, SIGCHLD, SIGWINCH, SIGTERM, SIGINT, -1);
1319 assert_se(sigprocmask(SIG_BLOCK, &mask, NULL) == 0);
/* Pipe used for synchronisation: the child waits for POLLHUP on the read
 * end, the parent closes both ends after it has logged the child's PID. */
1325 if (pipe2(pipefd, O_NONBLOCK|O_CLOEXEC) < 0) {
1326 log_error("pipe2(): %m");
/* Raw clone() with new IPC, mount, PID and UTS namespaces, plus a new
 * network namespace when --private-network was given. */
1330 pid = syscall(__NR_clone, SIGCHLD|CLONE_NEWIPC|CLONE_NEWNS|CLONE_NEWPID|CLONE_NEWUTS|(arg_private_network ? CLONE_NEWNET : 0), NULL);
1332 if (errno == EINVAL)
1333 log_error("clone() failed, do you have namespace support enabled in your kernel? (You need UTS, IPC, PID and NET namespacing built in): %m");
1335 log_error("clone() failed: %m");
/* What follows is presumably the child branch (its condition is not
 * visible here); it ends in _exit(EXIT_FAILURE) if no exec succeeds. */
1342 const char *home = NULL;
1343 uid_t uid = (uid_t) -1;
1344 gid_t gid = (gid_t) -1;
/* Environment for the payload; the trailing NULL slots are filled in
 * further down via envp + n_env++. */
1346 const char *envp[] = {
1347 "PATH=/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin",
1348 "container=systemd-nspawn", /* LXC sets container=lxc, so follow the scheme here */
1353 NULL, /* container_uuid */
1354 NULL, /* LISTEN_FDS */
1355 NULL, /* LISTEN_PID */
1359 envp[n_env] = strv_find_prefix(environ, "TERM=");
/* Wait until the parent has closed its ends of the pipe. */
1363 close_nointr_nofail(pipefd[1]);
1364 fd_wait_for_event(pipefd[0], POLLHUP, -1);
1365 close_nointr_nofail(pipefd[0]);
1367 close_nointr_nofail(master);
1370 if (saved_attr_valid) {
1371 if (tcsetattr(STDIN_FILENO, TCSANOW, &raw_attr) < 0) {
1372 log_error("Failed to set terminal attributes: %m");
/* Drop the inherited stdio; the console is reopened below as stdin and
 * duplicated onto stdout and stderr. */
1377 close_nointr(STDIN_FILENO);
1378 close_nointr(STDOUT_FILENO);
1379 close_nointr(STDERR_FILENO);
1381 close_nointr_nofail(kmsg_socket_pair[0]);
1382 kmsg_socket_pair[0] = -1;
1384 reset_all_signal_handlers();
1386 assert_se(sigemptyset(&mask) == 0);
1387 assert_se(sigprocmask(SIG_SETMASK, &mask, NULL) == 0);
/* open_terminal() is expected to return STDIN_FILENO here. */
1389 k = open_terminal(console, O_RDWR);
1390 if (k != STDIN_FILENO) {
1392 close_nointr_nofail(k);
1396 log_error("Failed to open console: %s", strerror(-k));
1400 if (dup2(STDIN_FILENO, STDOUT_FILENO) != STDOUT_FILENO ||
1401 dup2(STDIN_FILENO, STDERR_FILENO) != STDERR_FILENO) {
1402 log_error("Failed to duplicate console: %m");
1407 log_error("setsid() failed: %m");
/* Ask the kernel to send SIGKILL to this process when its parent dies. */
1411 if (prctl(PR_SET_PDEATHSIG, SIGKILL) < 0) {
1412 log_error("PR_SET_PDEATHSIG failed: %m");
1416 if (setup_cgroup(newcg) < 0)
1419 /* Mark everything as slave, so that we still
1420 * receive mounts from the real root, but don't
1421 * propagate mounts to the real root. */
1422 if (mount(NULL, "/", NULL, MS_SLAVE|MS_REC, NULL) < 0) {
1423 log_error("MS_SLAVE|MS_REC failed: %m");
1427 /* Turn directory into bind mount */
1428 if (mount(arg_directory, arg_directory, "bind", MS_BIND|MS_REC, NULL) < 0) {
1429 log_error("Failed to make bind mount.");
/* Read-only remount (presumably only when --read-only was given; the
 * condition is not visible here). */
1434 if (mount(arg_directory, arg_directory, "bind", MS_BIND|MS_REMOUNT|MS_RDONLY|MS_REC, NULL) < 0) {
1435 log_error("Failed to make read-only.");
/* Populate the container tree; each step is checked for a negative
 * result. */
1439 if (mount_all(arg_directory) < 0)
1442 if (copy_devnodes(arg_directory) < 0)
1445 if (setup_ptmx(arg_directory) < 0)
1448 dev_setup(arg_directory);
1450 if (setup_dev_console(arg_directory, console) < 0)
1453 if (setup_kmsg(arg_directory, kmsg_socket_pair[1]) < 0)
1456 close_nointr_nofail(kmsg_socket_pair[1]);
1457 kmsg_socket_pair[1] = -1;
1459 if (setup_boot_id(arg_directory) < 0)
1462 if (setup_timezone(arg_directory) < 0)
1465 if (setup_resolv_conf(arg_directory) < 0)
1468 if (setup_journal(arg_directory) < 0)
1471 if (mount_binds(arg_directory, arg_bind, 0) < 0)
1474 if (mount_binds(arg_directory, arg_bind_ro, MS_RDONLY) < 0)
/* Switch the root: enter the directory, move its mount onto "/", chroot
 * into it and reset the working directory. */
1477 if (chdir(arg_directory) < 0) {
1478 log_error("chdir(%s) failed: %m", arg_directory);
1482 if (mount(arg_directory, "/", NULL, MS_MOVE, NULL) < 0) {
1483 log_error("mount(MS_MOVE) failed: %m");
1487 if (chroot(".") < 0) {
1488 log_error("chroot() failed: %m");
1492 if (chdir("/") < 0) {
1493 log_error("chdir() failed: %m");
1501 if (drop_capabilities() < 0) {
1502 log_error("drop_capabilities() failed: %m");
1508 /* Note that this resolves user names
1509 * inside the container, and hence
1510 * accesses the NSS modules from the
1511 * container and not the host. This is
1514 if (get_user_creds((const char**)&arg_user, &uid, &gid, &home, NULL) < 0) {
1515 log_error("get_user_creds() failed: %m");
1519 if (mkdir_parents_label(home, 0775) < 0) {
1520 log_error("mkdir_parents_label() failed: %m");
1524 if (mkdir_safe_label(home, 0775, uid, gid) < 0) {
1525 log_error("mkdir_safe_label() failed: %m");
1529 if (initgroups((const char*)arg_user, gid) < 0) {
1530 log_error("initgroups() failed: %m");
1534 if (setresgid(gid, gid, gid) < 0) {
1535 log_error("setregid() failed: %m");
1539 if (setresuid(uid, uid, uid) < 0) {
1540 log_error("setreuid() failed: %m");
1544 /* Reset everything fully to 0, just in case */
/* NOTE(review): the failure messages for setresgid()/setresuid(), here and
 * in the user branch above, name setregid()/setreuid() -- confirm whether
 * the messages should be corrected. */
1546 if (setgroups(0, NULL) < 0) {
1547 log_error("setgroups() failed: %m");
1551 if (setresgid(0, 0, 0) < 0) {
1552 log_error("setregid() failed: %m");
1556 if (setresuid(0, 0, 0) < 0) {
1557 log_error("setreuid() failed: %m");
/* Fill further envp slots; HOME defaults to /root and USER/LOGNAME to
 * root when no user was resolved. */
1562 if ((asprintf((char**)(envp + n_env++), "HOME=%s", home ? home: "/root") < 0) ||
1563 (asprintf((char**)(envp + n_env++), "USER=%s", arg_user ? arg_user : "root") < 0) ||
1564 (asprintf((char**)(envp + n_env++), "LOGNAME=%s", arg_user ? arg_user : "root") < 0)) {
/* container_uuid (presumably only when --uuid was given). */
1570 if (asprintf((char**)(envp + n_env++), "container_uuid=%s", arg_uuid) < 0) {
/* Passed-in descriptors: clear O_CLOEXEC so they survive the exec and
 * announce them via LISTEN_FDS; LISTEN_PID is hard-coded to 1. */
1576 if (fdset_size(fds) > 0) {
1577 k = fdset_cloexec(fds, false);
1579 log_error("Failed to unset O_CLOEXEC for file descriptors.");
1583 if ((asprintf((char **)(envp + n_env++), "LISTEN_FDS=%u", n_fd_passed) < 0) ||
1584 (asprintf((char **)(envp + n_env++), "LISTEN_PID=%lu", (unsigned long) 1) < 0)) {
1596 /* Automatically search for the init system */
/* a[] gets one extra leading slot for the init path; the remaining
 * command line arguments are copied behind it. The three candidates are
 * tried in order, each execve() only returning on failure. */
1598 l = 1 + argc - optind;
1599 a = newa(char*, l + 1);
1600 memcpy(a + 1, argv + optind, l * sizeof(char*));
1602 a[0] = (char*) "/usr/lib/systemd/systemd";
1603 execve(a[0], a, (char**) envp);
1605 a[0] = (char*) "/lib/systemd/systemd";
1606 execve(a[0], a, (char**) envp);
1608 a[0] = (char*) "/sbin/init";
1609 execve(a[0], a, (char**) envp);
/* Otherwise run the command given on the command line, if any ... */
1610 } else if (argc > optind)
1611 execvpe(argv[optind], argv + optind, (char**) envp);
/* ... or fall back to a shell started from the home directory. */
1613 chdir(home ? home : "/root");
1614 execle("/bin/bash", "-bash", NULL, (char**) envp);
1617 log_error("execv() failed: %m");
1620 _exit(EXIT_FAILURE);
/* Parent side: closing both pipe ends releases the waiting child. */
1623 log_info("Init process in the container running as PID %lu.", (unsigned long) pid);
1624 close_nointr_nofail(pipefd[0]);
1625 close_nointr_nofail(pipefd[1]);
/* Relay the console until process_pty() returns. */
1630 if (process_pty(master, pid, &mask) < 0)
1633 if (saved_attr_valid)
1634 tcsetattr(STDIN_FILENO, TCSANOW, &saved_attr);
1636 r = wait_for_terminate(pid, &status);
/* Interpret how the child ended: a non-zero exit status becomes our own
 * result; death by SIGINT is logged as a shutdown and death by SIGHUP as
 * a reboot; any other signal or a core dump is an error. */
1642 if (status.si_code == CLD_EXITED) {
1643 if (status.si_status != 0) {
1644 log_error("Container failed with error code %i.", status.si_status);
1645 r = status.si_status;
1649 log_debug("Container exited successfully.");
1651 } else if (status.si_code == CLD_KILLED &&
1652 status.si_status == SIGINT) {
1653 log_info("Container has been shut down.");
1656 } else if (status.si_code == CLD_KILLED &&
1657 status.si_status == SIGHUP) {
1658 log_info("Container is being rebooted.");
1660 } else if (status.si_code == CLD_KILLED ||
1661 status.si_code == CLD_DUMPED) {
1663 log_error("Container terminated by signal %s.", signal_to_string(status.si_status));
1667 log_error("Container failed due to unknown reason.");
/* Common cleanup: restore the terminal, close the kmsg socket pair, kill
 * whatever is left in the container cgroup and free the argument
 * storage. */
1674 if (saved_attr_valid)
1675 tcsetattr(STDIN_FILENO, TCSANOW, &saved_attr);
1677 close_pipe(kmsg_socket_pair);
1680 cg_kill_recursive_and_wait(SYSTEMD_CGROUP_CONTROLLER, newcg, true);
1682 free(arg_directory);
1684 strv_free(arg_controllers);