1 /*-*- Mode: C; c-basic-offset: 8; indent-tabs-mode: nil -*-*/
4 This file is part of systemd.
6 Copyright 2010 Lennart Poettering
8 systemd is free software; you can redistribute it and/or modify it
9 under the terms of the GNU Lesser General Public License as published by
10 the Free Software Foundation; either version 2.1 of the License, or
11 (at your option) any later version.
13 systemd is distributed in the hope that it will be useful, but
14 WITHOUT ANY WARRANTY; without even the implied warranty of
15 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
16 Lesser General Public License for more details.
18 You should have received a copy of the GNU Lesser General Public License
19 along with systemd; If not, see <http://www.gnu.org/licenses/>.
25 #include <sys/types.h>
26 #include <sys/syscall.h>
27 #include <sys/mount.h>
33 #include <sys/prctl.h>
34 #include <sys/capability.h>
37 #include <sys/epoll.h>
39 #include <sys/signalfd.h>
43 #include <sys/socket.h>
45 #include <systemd/sd-daemon.h>
53 #include "cgroup-util.h"
55 #include "path-util.h"
56 #include "loopback-setup.h"
58 #include "dev-setup.h"
/* Journal-linking mode. The values used elsewhere in this file are LINK_NO,
 * LINK_AUTO, LINK_GUEST and LINK_HOST (the enumerator list itself is not part
 * of this listing). */
67 typedef enum LinkJournal {
/* Command-line settings. parse_argv() assigns them; main() additionally
 * defaults arg_directory and arg_machine and frees arg_directory and
 * arg_controllers on exit. */
74 static char *arg_directory = NULL;
75 static char *arg_user = NULL;
76 static char **arg_controllers = NULL;
77 static char *arg_uuid = NULL;
78 static char *arg_machine = NULL;
79 static bool arg_private_network = false;
80 static bool arg_read_only = false;
81 static bool arg_boot = false;
82 static LinkJournal arg_link_journal = LINK_AUTO;
/* Bitmask (one bit per CAP_* number) of capabilities the container keeps.
 * drop_capabilities() hands the complement to capability_bounding_set_drop();
 * --capability= ORs further bits into it. Some lines of the initializer are
 * elided in this listing. */
83 static uint64_t arg_retain =
85 (1ULL << CAP_DAC_OVERRIDE) |
86 (1ULL << CAP_DAC_READ_SEARCH) |
87 (1ULL << CAP_FOWNER) |
88 (1ULL << CAP_FSETID) |
89 (1ULL << CAP_IPC_OWNER) |
92 (1ULL << CAP_LINUX_IMMUTABLE) |
93 (1ULL << CAP_NET_BIND_SERVICE) |
94 (1ULL << CAP_NET_BROADCAST) |
95 (1ULL << CAP_NET_RAW) |
96 (1ULL << CAP_SETGID) |
97 (1ULL << CAP_SETFCAP) |
98 (1ULL << CAP_SETPCAP) |
99 (1ULL << CAP_SETUID) |
100 (1ULL << CAP_SYS_ADMIN) |
101 (1ULL << CAP_SYS_CHROOT) |
102 (1ULL << CAP_SYS_NICE) |
103 (1ULL << CAP_SYS_PTRACE) |
104 (1ULL << CAP_SYS_TTY_CONFIG) |
105 (1ULL << CAP_SYS_RESOURCE) |
106 (1ULL << CAP_SYS_BOOT) |
107 (1ULL << CAP_AUDIT_WRITE) |
108 (1ULL << CAP_AUDIT_CONTROL);
/* Bind-mount lists. parse_argv() appends two strings per option (source, then
 * target) and mount_binds() walks them pairwise; arg_bind_ro is mounted with
 * MS_RDONLY by main(). */
109 static char **arg_bind = NULL;
110 static char **arg_bind_ro = NULL;
/* Print the usage summary with a single printf(); the leading %s is filled
 * with program_invocation_short_name. The end of the function (including its
 * return statement) and a few continuation lines of the text are not part of
 * this listing.
 *
 * NOTE(review): the text documents -j as "--link-journal=host", but the
 * assignment visible in parse_argv() directly before the ARG_LINK_JOURNAL case
 * stores LINK_GUEST (its case label is elided) — confirm which is intended. */
112 static int help(void) {
114 printf("%s [OPTIONS...] [PATH] [ARGUMENTS...]\n\n"
115 "Spawn a minimal namespace container for debugging, testing and building.\n\n"
116 " -h --help Show this help\n"
117 " --version Print version string\n"
118 " -D --directory=NAME Root directory for the container\n"
119 " -b --boot Boot up full system (i.e. invoke init)\n"
120 " -u --user=USER Run the command under specified user or uid\n"
121 " -C --controllers=LIST Put the container in specified comma-separated\n"
122 " cgroup hierarchies\n"
123 " --uuid=UUID Set a specific machine UUID for the container\n"
124 " -M --machine=NAME Set the machine name for the container\n"
125 " --private-network Disable network in container\n"
126 " --read-only Mount the root directory read-only\n"
127 " --capability=CAP In addition to the default, retain specified\n"
129 " --link-journal=MODE Link up guest journal, one of no, auto, guest, host\n"
130 " -j Equivalent to --link-journal=host\n"
131 " --bind=PATH[:PATH] Bind mount a file or directory from the host into\n"
133 " --bind-ro=PATH[:PATH] Similar, but creates a read-only bind mount\n",
134 program_invocation_short_name);
/* Parse the command line with getopt_long() and store the results in the
 * arg_* globals. Long options are described by options[]; options without a
 * short form use ARG_* codes (their definitions are elided here). Most case
 * labels, error returns and the function's return value are not part of this
 * listing, so the notes below describe only the visible statements. */
139 static int parse_argv(int argc, char *argv[]) {
152 static const struct option options[] = {
153 { "help", no_argument, NULL, 'h' },
154 { "version", no_argument, NULL, ARG_VERSION },
155 { "directory", required_argument, NULL, 'D' },
156 { "user", required_argument, NULL, 'u' },
157 { "controllers", required_argument, NULL, 'C' },
158 { "private-network", no_argument, NULL, ARG_PRIVATE_NETWORK },
159 { "boot", no_argument, NULL, 'b' },
160 { "uuid", required_argument, NULL, ARG_UUID },
161 { "read-only", no_argument, NULL, ARG_READ_ONLY },
162 { "capability", required_argument, NULL, ARG_CAPABILITY },
163 { "link-journal", required_argument, NULL, ARG_LINK_JOURNAL },
164 { "bind", required_argument, NULL, ARG_BIND },
165 { "bind-ro", required_argument, NULL, ARG_BIND_RO },
166 { "machine", required_argument, NULL, 'M' },
/* NOTE(review): the short-option string below has no "M:" although the table
 * above maps --machine to 'M' and help() advertises "-M"; as written
 * getopt_long() would presumably reject "-M NAME" — confirm and add "M:" if
 * the short form is intended. */
175 while ((c = getopt_long(argc, argv, "+hD:u:C:bj", options, NULL)) >= 0) {
184 puts(PACKAGE_STRING);
185 puts(SYSTEMD_FEATURES);
/* Root directory: store the canonicalized form of the argument; a NULL result
 * is reported as an error. */
190 arg_directory = canonicalize_file_name(optarg);
191 if (!arg_directory) {
192 log_error("Failed to canonicalize root directory.");
200 arg_user = strdup(optarg);
/* Controllers: drop any previously parsed list and replace it with the
 * comma-split argument. */
207 strv_free(arg_controllers);
208 arg_controllers = strv_split(optarg, ",");
209 if (!arg_controllers)
212 cg_shorten_controllers(arg_controllers);
215 case ARG_PRIVATE_NETWORK:
216 arg_private_network = true;
/* Machine name: must pass hostname_is_valid() before it is copied. */
228 if (!hostname_is_valid(optarg)) {
229 log_error("Invalid machine name: %s", optarg);
234 arg_machine = strdup(optarg);
241 arg_read_only = true;
244 case ARG_CAPABILITY: {
/* Each comma-separated word is copied, looked up with cap_from_name() and its
 * bit is ORed into arg_retain. */
248 FOREACH_WORD_SEPARATOR(word, length, optarg, ",", state) {
252 t = strndup(word, length);
256 if (cap_from_name(t, &cap) < 0) {
257 log_error("Failed to parse capability %s.", t);
263 arg_retain |= 1ULL << (uint64_t) cap;
/* NOTE(review): the case label for this assignment is elided; if it is 'j',
 * it disagrees with help(), which documents -j as --link-journal=host. */
270 arg_link_journal = LINK_GUEST;
273 case ARG_LINK_JOURNAL:
274 if (streq(optarg, "auto"))
275 arg_link_journal = LINK_AUTO;
276 else if (streq(optarg, "no"))
277 arg_link_journal = LINK_NO;
278 else if (streq(optarg, "guest"))
279 arg_link_journal = LINK_GUEST;
280 else if (streq(optarg, "host"))
281 arg_link_journal = LINK_HOST;
283 log_error("Failed to parse link journal mode %s", optarg);
/* Bind options: x selects arg_bind or arg_bind_ro depending on the option
 * code. The argument is searched for its first ':'; the part before it becomes
 * a (how b is derived is elided). Both paths must be absolute before they are
 * appended to the list, a first and then b. */
291 _cleanup_free_ char *a = NULL, *b = NULL;
296 x = c == ARG_BIND ? &arg_bind : &arg_bind_ro;
298 e = strchr(optarg, ':');
300 a = strndup(optarg, e - optarg);
310 if (!path_is_absolute(a) || !path_is_absolute(b)) {
311 log_error("Invalid bind mount specification: %s", optarg);
315 r = strv_extend(x, a);
319 r = strv_extend(x, b);
330 log_error("Unknown option code %c", c);
/* Walk mount_table and mount each entry below dest. For every row the target
 * is built as dest + "/" + where. Rows that name a source (what) are skipped
 * when the target is already a mount point; rows with a NULL source are the
 * remount steps and are always attempted. A failing mount() is only logged
 * when the row's fatal flag is set.
 *
 * The MountPoint struct body is elided; from the uses below its columns
 * include what, where, options, flags and fatal (the third column is
 * presumably the file system type — confirm against the full definition).
 * Return statements are not part of this listing. */
338 static int mount_all(const char *dest) {
340 typedef struct MountPoint {
349 static const MountPoint mount_table[] = {
350 { "proc", "/proc", "proc", NULL, MS_NOSUID|MS_NOEXEC|MS_NODEV, true },
351 { "/proc/sys", "/proc/sys", NULL, NULL, MS_BIND, true }, /* Bind mount first */
352 { NULL, "/proc/sys", NULL, NULL, MS_BIND|MS_RDONLY|MS_REMOUNT, true }, /* Then, make it r/o */
353 { "sysfs", "/sys", "sysfs", NULL, MS_RDONLY|MS_NOSUID|MS_NOEXEC|MS_NODEV, true },
354 { "tmpfs", "/dev", "tmpfs", "mode=755", MS_NOSUID|MS_STRICTATIME, true },
355 { "devpts", "/dev/pts", "devpts","newinstance,ptmxmode=0666,mode=620,gid=" STRINGIFY(TTY_GID), MS_NOSUID|MS_NOEXEC, true },
356 { "tmpfs", "/dev/shm", "tmpfs", "mode=1777", MS_NOSUID|MS_NODEV|MS_STRICTATIME, true },
357 { "tmpfs", "/run", "tmpfs", "mode=755", MS_NOSUID|MS_NODEV|MS_STRICTATIME, true },
359 { "/sys/fs/selinux", "/sys/fs/selinux", NULL, NULL, MS_BIND, false }, /* Bind mount first */
360 { NULL, "/sys/fs/selinux", NULL, NULL, MS_BIND|MS_RDONLY|MS_REMOUNT, false }, /* Then, make it r/o */
367 for (k = 0; k < ELEMENTSOF(mount_table); k++) {
368 char _cleanup_free_ *where = NULL;
371 where = strjoin(dest, "/", mount_table[k].where, NULL);
/* t receives the mount-point check result; the visible error message prints
 * strerror(-t), i.e. a failure is carried as a negative errno value. */
375 t = path_is_mount_point(where, true);
377 log_error("Failed to detect whether %s is a mount point: %s", where, strerror(-t));
385 /* Skip this entry if it is not a remount. */
386 if (mount_table[k].what && t > 0)
/* Make sure the target exists; the result of mkdir_p() is not checked here. */
389 mkdir_p(where, 0755);
391 if (mount(mount_table[k].what,
394 mount_table[k].flags,
395 mount_table[k].options) < 0 &&
396 mount_table[k].fatal) {
398 log_error("mount(%s) failed: %m", where);
/* Bind-mount every (source, target) pair of the string list l into the
 * container tree at dest.
 *
 * dest:  container root; the target is built as dest + "/" + *y.
 * l:     list consumed two entries at a time (x = source, y = target).
 * flags: extra mount flags; when non-zero, a second mount() call with
 *        MS_REMOUNT|MS_BIND|flags applies them to the fresh bind mount.
 *
 * The target is created with mkdir_p_label() (result not checked on the
 * visible lines). Both mount failures are logged; the return statements are
 * not part of this listing. */
408 static int mount_binds(const char *dest, char **l, unsigned long flags) {
411 STRV_FOREACH_PAIR(x, y, l) {
412 _cleanup_free_ char *where = NULL;
414 where = strjoin(dest, "/", *y, NULL);
418 mkdir_p_label(where, 0755);
420 if (mount(*x, where, "bind", MS_BIND, NULL) < 0) {
421 log_error("mount(%s) failed: %m", where);
425 if (flags && mount(NULL, where, NULL, MS_REMOUNT|MS_BIND|flags, NULL) < 0) {
426 log_error("mount(%s) failed: %m", where);
/* Try to point dest/etc/localtime at the same zone as the host's
 * /etc/localtime.
 *
 * The host link target p must start with "../usr/share/zoneinfo/" or
 * "/usr/share/zoneinfo/"; z is the remainder (the zone name). The container's
 * current link q is parsed the same way into y, and nothing is changed when y
 * already equals z. Otherwise the zone file must exist under
 * dest/usr/share/zoneinfo/ before a relative symlink
 * "../usr/share/zoneinfo/<z>" is created at dest/etc/localtime.
 *
 * The conditions guarding several statements, and all return statements, are
 * elided in this listing. */
434 static int setup_timezone(const char *dest) {
435 _cleanup_free_ char *where = NULL, *p = NULL, *q = NULL, *check = NULL, *what = NULL;
441 /* Fix the timezone, if possible */
442 r = readlink_malloc("/etc/localtime", &p);
444 log_warning("/etc/localtime is not a symlink, not updating container timezone.");
448 z = path_startswith(p, "../usr/share/zoneinfo/");
450 z = path_startswith(p, "/usr/share/zoneinfo/");
452 log_warning("/etc/localtime does not point into /usr/share/zoneinfo/, not updating container timezone.");
456 where = strappend(dest, "/etc/localtime");
460 r = readlink_malloc(where, &q);
462 y = path_startswith(q, "../usr/share/zoneinfo/");
464 y = path_startswith(q, "/usr/share/zoneinfo/");
467 /* Already pointing to the right place? Then do nothing .. */
468 if (y && streq(y, z))
472 check = strjoin(dest, "/usr/share/zoneinfo/", z, NULL);
476 if (access(check, F_OK) < 0) {
477 log_warning("Timezone %s does not exist in container, not updating container timezone.", z);
481 what = strappend("../usr/share/zoneinfo/", z);
/* NOTE(review): whether an existing dest/etc/localtime is removed before this
 * symlink() is not visible here — confirm against the full source. */
486 if (symlink(what, where) < 0) {
487 log_error("Failed to correct timezone of container: %m");
/* Make the host's /etc/resolv.conf visible at dest/etc/resolv.conf by bind
 * mounting it, then try to remount that bind mount read-only. The remount is
 * only attempted when the first mount() succeeded, and its own result is
 * ignored.
 *
 * The statement controlled by the arg_private_network check is elided;
 * presumably the function returns early in that case — confirm. */
494 static int setup_resolv_conf(const char *dest) {
499 if (arg_private_network)
502 /* Fix resolv.conf, if possible */
503 where = strappend(dest, "/etc/resolv.conf");
507 /* We don't really care for the results of this really. If it
508 * fails, it fails, but meh... */
509 if (mount("/etc/resolv.conf", where, "bind", MS_BIND, NULL) >= 0)
510 mount("/etc/resolv.conf", where, "bind", MS_BIND|MS_REMOUNT|MS_RDONLY, NULL);
/* Give the container its own boot ID: obtain a random 128-bit ID, format it
 * as a UUID string, write it to dest/dev/proc-sys-kernel-random-boot-id and
 * bind mount that file over dest/proc/sys/kernel/random/boot_id. A failed
 * read-only remount only produces a warning, whereas a failed bind mount is
 * logged as an error.
 *
 * Declarations of rnd/as_uuid, the allocation checks and the return
 * statements are elided in this listing. */
517 static int setup_boot_id(const char *dest) {
518 char _cleanup_free_ *from = NULL, *to = NULL;
525 /* Generate a new randomized boot ID, so that each boot-up of
526 * the container gets a new one */
528 from = strappend(dest, "/dev/proc-sys-kernel-random-boot-id");
529 to = strappend(dest, "/proc/sys/kernel/random/boot_id");
533 r = sd_id128_randomize(&rnd);
535 log_error("Failed to generate random boot id: %s", strerror(-r));
/* Render the 16 bytes in 8-4-4-4-12 hex groups. */
539 snprintf(as_uuid, sizeof(as_uuid),
540 "%02x%02x%02x%02x-%02x%02x-%02x%02x-%02x%02x-%02x%02x%02x%02x%02x%02x",
541 SD_ID128_FORMAT_VAL(rnd));
542 char_array_0(as_uuid);
544 r = write_string_file(from, as_uuid);
546 log_error("Failed to write boot id: %s", strerror(-r));
550 if (mount(from, to, "bind", MS_BIND, NULL) < 0) {
551 log_error("Failed to bind mount boot id: %m");
553 } else if (mount(from, to, "bind", MS_BIND|MS_REMOUNT|MS_RDONLY, NULL))
554 log_warning("Failed to make boot id read-only: %m");
/* Recreate a fixed set of host device nodes inside the container's /dev.
 *
 * For each name d in the NUL-separated list devnodes (its contents are elided
 * here) the host node /dev/<d> is stat()ed and a node with the same st_mode
 * and st_rdev is created at <dest>/dev/<d>. A stat() failure is logged unless
 * errno is ENOENT; a source that is neither a character nor a block device is
 * logged as an error.
 *
 * NOTE(review): the results of the two asprintf() calls are not checked on
 * the visible lines — a check may live on an elided line; confirm.
 * NOTE(review): the mknod() error message prints dest, not the node path
 * `to` that was actually passed to mknod() — looks like the wrong variable. */
560 static int copy_devnodes(const char *dest) {
562 static const char devnodes[] =
572 mode_t _cleanup_umask_ u;
578 NULSTR_FOREACH(d, devnodes) {
580 char _cleanup_free_ *from = NULL, *to = NULL;
582 asprintf(&from, "/dev/%s", d);
583 asprintf(&to, "%s/dev/%s", dest, d);
594 if (stat(from, &st) < 0) {
596 if (errno != ENOENT) {
597 log_error("Failed to stat %s: %m", from);
602 } else if (!S_ISCHR(st.st_mode) && !S_ISBLK(st.st_mode)) {
604 log_error("%s is not a char or block device, cannot copy", from);
608 } else if (mknod(to, st.st_mode, st.st_rdev) < 0) {
610 log_error("mknod(%s) failed: %m", dest);
/* Create <dest>/dev/ptmx as a relative symlink to "pts/ptmx". A symlink()
 * failure is logged; the allocation check and return statements are elided in
 * this listing. */
619 static int setup_ptmx(const char *dest) {
620 _cleanup_free_ char *p = NULL;
622 p = strappend(dest, "/dev/ptmx");
626 if (symlink("pts/ptmx", p) < 0) {
627 log_error("Failed to create /dev/ptmx symlink: %m");
/* Expose the tty `console` as <dest>/dev/console.
 *
 * Steps visible below: stat() the console and require a character device;
 * set its mode/ownership with chmod_and_chown(console, 0600, 0, 0); create a
 * node at <dest>/dev/console that keeps the console's file-type bits and
 * st_rdev but uses permission bits 0600; finally bind mount the console over
 * that node. Every failure is logged.
 *
 * The original explanatory comment inside the body has lost its closing line
 * in this listing, as have the return statements. */
634 static int setup_dev_console(const char *dest, const char *console) {
636 char _cleanup_free_ *to = NULL;
638 mode_t _cleanup_umask_ u;
645 if (stat(console, &st) < 0) {
646 log_error("Failed to stat %s: %m", console);
649 } else if (!S_ISCHR(st.st_mode)) {
650 log_error("/dev/console is not a char device");
654 r = chmod_and_chown(console, 0600, 0, 0);
656 log_error("Failed to correct access mode for TTY: %s", strerror(-r));
660 if (asprintf(&to, "%s/dev/console", dest) < 0)
663 /* We need to bind mount the right tty to /dev/console since
664 * ptys can only exist on pts file systems. To have something
665 * to bind mount things on we create a device node first, that
666 * has the right major/minor (note that the major minor
667 * doesn't actually matter here, since we mount it over
670 if (mknod(to, (st.st_mode & ~07777) | 0600, st.st_rdev) < 0) {
671 log_error("mknod() for /dev/console failed: %m");
675 if (mount(console, to, "bind", MS_BIND, NULL) < 0) {
676 log_error("Bind mount for /dev/console failed: %m");
683 static int setup_kmsg(const char *dest, int kmsg_socket) {
684 char _cleanup_free_ *from = NULL, *to = NULL;
686 mode_t _cleanup_umask_ u;
688 struct cmsghdr cmsghdr;
689 uint8_t buf[CMSG_SPACE(sizeof(int))];
692 .msg_control = &control,
693 .msg_controllen = sizeof(control),
695 struct cmsghdr *cmsg;
698 assert(kmsg_socket >= 0);
702 /* We create the kmsg FIFO as /dev/kmsg, but immediately
703 * delete it after bind mounting it to /proc/kmsg. While FIFOs
704 * on the reading side behave very similar to /proc/kmsg,
705 * their writing side behaves differently from /dev/kmsg in
706 * that writing blocks when nothing is reading. In order to
707 * avoid any problems with containers deadlocking due to this
708 * we simply make /dev/kmsg unavailable to the container. */
/* Summary of the visible steps:
 *   dest        - container root; the FIFO is created at <dest>/dev/kmsg and
 *                 bind mounted onto <dest>/proc/kmsg.
 *   kmsg_socket - socket (asserted >= 0) that receives an fd for the FIFO.
 * The FIFO is created with mode 0600 and passed through
 * chmod_and_chown(from, 0600, 0, 0), bind mounted, opened
 * O_RDWR|O_NDELAY|O_CLOEXEC, and the resulting fd is sent over kmsg_socket as
 * one SCM_RIGHTS control message before the local copy is closed.
 * The enclosing declarations of `control` and `mh`, the removal of the FIFO
 * path announced by the last comment, and the return statements are elided in
 * this listing. */
709 if (asprintf(&from, "%s/dev/kmsg", dest) < 0 ||
710 asprintf(&to, "%s/proc/kmsg", dest) < 0)
713 if (mkfifo(from, 0600) < 0) {
714 log_error("mkfifo() for /dev/kmsg failed: %m");
718 r = chmod_and_chown(from, 0600, 0, 0);
720 log_error("Failed to correct access mode for /dev/kmsg: %s", strerror(-r));
724 if (mount(from, to, "bind", MS_BIND, NULL) < 0) {
725 log_error("Bind mount for /proc/kmsg failed: %m");
729 fd = open(from, O_RDWR|O_NDELAY|O_CLOEXEC);
731 log_error("Failed to open fifo: %m");
/* Fill the single control-message header with the fd to pass. */
735 cmsg = CMSG_FIRSTHDR(&mh);
736 cmsg->cmsg_level = SOL_SOCKET;
737 cmsg->cmsg_type = SCM_RIGHTS;
738 cmsg->cmsg_len = CMSG_LEN(sizeof(int));
739 memcpy(CMSG_DATA(cmsg), &fd, sizeof(int));
/* Shrink the advertised control length to the one message actually used. */
741 mh.msg_controllen = cmsg->cmsg_len;
743 /* Store away the fd in the socket, so that it stays open as
744 * long as we run the child */
745 k = sendmsg(kmsg_socket, &mh, MSG_DONTWAIT|MSG_NOSIGNAL);
746 close_nointr_nofail(fd);
749 log_error("Failed to send FIFO fd: %m");
753 /* And now make the FIFO unavailable as /dev/kmsg... */
/* Set the hostname to arg_machine via sethostname(), passing
 * strlen(arg_machine) as the length. The statement controlled by the failure
 * check and the final return are elided in this listing. */
758 static int setup_hostname(void) {
760 if (sethostname(arg_machine, strlen(arg_machine)) < 0)
/* Connect the guest's journal directory with the host's, as selected by
 * arg_link_journal (nothing is linked for LINK_NO; the controlled statement is
 * elided).
 *
 * The machine ID is read from <directory>/etc/machine-id and validated with
 * sd_id128_from_string(). From then on
 *   p = /var/log/journal/<id>              (host side)
 *   q = <directory>/var/log/journal/<id>   (guest side)
 * Either path being a mount point already is an error unless the mode is
 * LINK_AUTO. In LINK_GUEST mode p is made a symlink to q; in the host/auto
 * path p is bind mounted onto q after q has been checked with dir_is_empty().
 *
 * Many conditions, the unlink/rmdir calls the error messages refer to, the
 * definition of `id` and all return statements are elided in this listing;
 * the inline notes describe only what the visible statements do.
 *
 * NOTE(review): several messages use %m directly after mkdir_p(), whose
 * result is taken in r; %m formats errno, so the text is only right if
 * mkdir_p() also leaves errno set — confirm. */
766 static int setup_journal(const char *directory) {
767 sd_id128_t machine_id;
768 char _cleanup_free_ *p = NULL, *b = NULL, *q = NULL, *d = NULL;
772 if (arg_link_journal == LINK_NO)
775 p = strappend(directory, "/etc/machine-id");
779 r = read_one_line_file(p, &b);
780 if (r == -ENOENT && arg_link_journal == LINK_AUTO)
783 log_error("Failed to read machine ID from %s: %s", p, strerror(-r));
788 if (isempty(id) && arg_link_journal == LINK_AUTO)
791 /* Verify validity */
792 r = sd_id128_from_string(id, &machine_id);
794 log_error("Failed to parse machine ID from %s: %s", p, strerror(-r));
/* NOTE(review): p is reassigned here while it still names the machine-id
 * path; whether the old string is freed first is on an elided line — confirm. */
799 p = strappend("/var/log/journal/", id);
800 q = strjoin(directory, "/var/log/journal/", id, NULL);
804 if (path_is_mount_point(p, false) > 0) {
805 if (arg_link_journal != LINK_AUTO) {
806 log_error("%s: already a mount point, refusing to use for journal", p);
813 if (path_is_mount_point(q, false) > 0) {
814 if (arg_link_journal != LINK_AUTO) {
815 log_error("%s: already a mount point, refusing to use for journal", q);
/* Inspect what currently sits at the host path: d receives the link target
 * when p is a symlink; -EINVAL and -ENOENT results are handled separately
 * below. */
822 r = readlink_and_make_absolute(p, &d);
824 if ((arg_link_journal == LINK_GUEST ||
825 arg_link_journal == LINK_AUTO) &&
828 r = mkdir_p(q, 0755);
830 log_warning("failed to create directory %s: %m", q);
835 log_error("Failed to remove symlink %s: %m", p);
838 } else if (r == -EINVAL) {
840 if (arg_link_journal == LINK_GUEST &&
843 if (errno == ENOTDIR) {
844 log_error("%s already exists and is neither a symlink nor a directory", p);
847 log_error("Failed to remove %s: %m", p);
851 } else if (r != -ENOENT) {
852 log_error("readlink(%s) failed: %m", p);
/* Guest mode: the host path p becomes a symlink pointing at the guest
 * directory q, which is then created. */
856 if (arg_link_journal == LINK_GUEST) {
858 if (symlink(q, p) < 0) {
859 log_error("Failed to symlink %s to %s: %m", q, p);
863 r = mkdir_p(q, 0755);
865 log_warning("failed to create directory %s: %m", q);
/* Host mode creates p; otherwise p merely has to exist already (the statement
 * controlled by the access() check is elided). */
869 if (arg_link_journal == LINK_HOST) {
870 r = mkdir_p(p, 0755);
872 log_error("Failed to create %s: %m", p);
876 } else if (access(p, F_OK) < 0)
879 if (dir_is_empty(q) == 0) {
880 log_error("%s not empty.", q);
884 r = mkdir_p(q, 0755);
886 log_error("Failed to create %s: %m", q);
/* Make the host journal directory visible inside the guest. */
890 if (mount(p, q, "bind", MS_BIND, NULL) < 0) {
891 log_error("Failed to bind mount journal from host into guest: %m");
/* Call cg_create_and_attach() for the cgroup `path`, first in
 * SYSTEMD_CGROUP_CONTROLLER and then in every controller listed in
 * arg_controllers. A failure in the systemd hierarchy is logged as an error;
 * a failure in one of the extra controllers only as a warning.
 * The third argument is the literal 1 (presumably the PID to attach — confirm
 * against cg_create_and_attach()). Return statements are elided here. */
898 static int setup_cgroup(const char *path) {
902 r = cg_create_and_attach(SYSTEMD_CGROUP_CONTROLLER, path, 1);
904 log_error("Failed to create cgroup: %s", strerror(-r));
908 STRV_FOREACH(c, arg_controllers) {
909 r = cg_create_and_attach(*c, path, 1);
911 log_warning("Failed to create cgroup in controller %s: %s", *c, strerror(-r));
/* Pass the complement of arg_retain (i.e. every capability bit that is not
 * marked for retention) to capability_bounding_set_drop() and return its
 * result unchanged. */
917 static int drop_capabilities(void) {
918 return capability_bounding_set_drop(~arg_retain, false);
/* Relay terminal traffic between this process's stdin/stdout and the pty
 * `master`, and react to signals delivered through a signalfd.
 *
 *   master - pty master file descriptor.
 *   pid    - process that receives SIGRTMIN+3 on the first SIGTERM in
 *            --boot mode.
 *   mask   - signal set handed to signalfd().
 *
 * Data flows stdin -> in_buffer -> master and master -> out_buffer -> stdout,
 * each buffer LINE_MAX bytes. Readiness is tracked in four flags that are set
 * from epoll events and cleared when a read()/write() fails with EAGAIN,
 * EPIPE, ECONNRESET or EIO; any other errno is logged as an error.
 *
 * The outer loop header, the handling of other signals, the cleanup label and
 * all return statements are elided in this listing. */
921 static int process_pty(int master, pid_t pid, sigset_t *mask) {
923 char in_buffer[LINE_MAX], out_buffer[LINE_MAX];
924 size_t in_buffer_full = 0, out_buffer_full = 0;
925 struct epoll_event stdin_ev, stdout_ev, master_ev, signal_ev;
926 bool stdin_readable = false, stdout_writable = false, master_readable = false, master_writable = false;
927 int ep = -1, signal_fd = -1, r;
928 bool tried_orderly_shutdown = false;
/* fd_nonblock(fd, 1) — presumably switches the descriptor to non-blocking
 * mode, which the edge-triggered (EPOLLET) registrations below rely on. */
934 fd_nonblock(STDIN_FILENO, 1);
935 fd_nonblock(STDOUT_FILENO, 1);
936 fd_nonblock(master, 1);
938 signal_fd = signalfd(-1, mask, SFD_NONBLOCK|SFD_CLOEXEC);
940 log_error("signalfd(): %m");
945 ep = epoll_create1(EPOLL_CLOEXEC);
947 log_error("Failed to create epoll: %m");
952 /* We read from STDIN only if this is actually a TTY,
953 * otherwise we assume non-interactivity. */
954 if (isatty(STDIN_FILENO)) {
956 stdin_ev.events = EPOLLIN|EPOLLET;
957 stdin_ev.data.fd = STDIN_FILENO;
959 if (epoll_ctl(ep, EPOLL_CTL_ADD, STDIN_FILENO, &stdin_ev) < 0) {
960 log_error("Failed to register STDIN in epoll: %m");
967 stdout_ev.events = EPOLLOUT|EPOLLET;
968 stdout_ev.data.fd = STDOUT_FILENO;
971 master_ev.events = EPOLLIN|EPOLLOUT|EPOLLET;
972 master_ev.data.fd = master;
/* The signalfd registration is the only one without EPOLLET. */
975 signal_ev.events = EPOLLIN;
976 signal_ev.data.fd = signal_fd;
978 if (epoll_ctl(ep, EPOLL_CTL_ADD, STDOUT_FILENO, &stdout_ev) < 0) {
979 if (errno != EPERM) {
980 log_error("Failed to register stdout in epoll: %m");
984 /* stdout without epoll support. Likely redirected to regular file. */
985 stdout_writable = true;
988 if (epoll_ctl(ep, EPOLL_CTL_ADD, master, &master_ev) < 0 ||
989 epoll_ctl(ep, EPOLL_CTL_ADD, signal_fd, &signal_ev) < 0) {
990 log_error("Failed to register fds in epoll: %m");
996 struct epoll_event ev[16];
/* Wait without timeout; EINTR and EAGAIN are not treated as errors. */
1000 nfds = epoll_wait(ep, ev, ELEMENTSOF(ev), -1);
1003 if (errno == EINTR || errno == EAGAIN)
1006 log_error("epoll_wait(): %m");
/* Translate events into readiness flags; EPOLLHUP counts as ready so that the
 * following read()/write() observes the hang-up. */
1013 for (i = 0; i < nfds; i++) {
1014 if (ev[i].data.fd == STDIN_FILENO) {
1016 if (ev[i].events & (EPOLLIN|EPOLLHUP))
1017 stdin_readable = true;
1019 } else if (ev[i].data.fd == STDOUT_FILENO) {
1021 if (ev[i].events & (EPOLLOUT|EPOLLHUP))
1022 stdout_writable = true;
1024 } else if (ev[i].data.fd == master) {
1026 if (ev[i].events & (EPOLLIN|EPOLLHUP))
1027 master_readable = true;
1029 if (ev[i].events & (EPOLLOUT|EPOLLHUP))
1030 master_writable = true;
1032 } else if (ev[i].data.fd == signal_fd) {
1033 struct signalfd_siginfo sfsi;
1036 n = read(signal_fd, &sfsi, sizeof(sfsi));
1037 if (n != sizeof(sfsi)) {
1040 log_error("Failed to read from signalfd: invalid block size");
1045 if (errno != EINTR && errno != EAGAIN) {
1046 log_error("Failed to read from signalfd: %m");
1052 if (sfsi.ssi_signo == SIGWINCH) {
1055 /* The window size changed, let's forward that. */
1056 if (ioctl(STDIN_FILENO, TIOCGWINSZ, &ws) >= 0)
1057 ioctl(master, TIOCSWINSZ, &ws);
1058 } else if (sfsi.ssi_signo == SIGTERM && arg_boot && !tried_orderly_shutdown) {
1060 log_info("Trying to halt container. Send SIGTERM again to trigger immediate termination.");
1062 /* This only works for systemd... */
1063 tried_orderly_shutdown = true;
1064 kill(pid, SIGRTMIN+3);
/* Pump data for as long as one of the four transfers can make progress:
 * a readable source with an empty buffer, or a writable sink with buffered
 * data. */
1074 while ((stdin_readable && in_buffer_full <= 0) ||
1075 (master_writable && in_buffer_full > 0) ||
1076 (master_readable && out_buffer_full <= 0) ||
1077 (stdout_writable && out_buffer_full > 0)) {
1079 if (stdin_readable && in_buffer_full < LINE_MAX) {
1081 k = read(STDIN_FILENO, in_buffer + in_buffer_full, LINE_MAX - in_buffer_full);
1084 if (errno == EAGAIN || errno == EPIPE || errno == ECONNRESET || errno == EIO)
1085 stdin_readable = false;
1087 log_error("read(): %m");
1092 in_buffer_full += (size_t) k;
1095 if (master_writable && in_buffer_full > 0) {
1097 k = write(master, in_buffer, in_buffer_full);
1100 if (errno == EAGAIN || errno == EPIPE || errno == ECONNRESET || errno == EIO)
1101 master_writable = false;
1103 log_error("write(): %m");
/* Partial write: shift the unwritten tail to the front of the buffer. */
1109 assert(in_buffer_full >= (size_t) k);
1110 memmove(in_buffer, in_buffer + k, in_buffer_full - k);
1111 in_buffer_full -= k;
1115 if (master_readable && out_buffer_full < LINE_MAX) {
1117 k = read(master, out_buffer + out_buffer_full, LINE_MAX - out_buffer_full);
1120 if (errno == EAGAIN || errno == EPIPE || errno == ECONNRESET || errno == EIO)
1121 master_readable = false;
1123 log_error("read(): %m");
1128 out_buffer_full += (size_t) k;
1131 if (stdout_writable && out_buffer_full > 0) {
1133 k = write(STDOUT_FILENO, out_buffer, out_buffer_full);
1136 if (errno == EAGAIN || errno == EPIPE || errno == ECONNRESET || errno == EIO)
1137 stdout_writable = false;
1139 log_error("write(): %m");
1145 assert(out_buffer_full >= (size_t) k);
1146 memmove(out_buffer, out_buffer + k, out_buffer_full - k);
1147 out_buffer_full -= k;
/* Cleanup: release the epoll instance and the signalfd (the guards around
 * these calls are elided). */
1155 close_nointr_nofail(ep);
1158 close_nointr_nofail(signal_fd);
/* Program entry point. Visible flow:
 *   1. Parse options, settle the container root (arg_directory) and machine
 *      name, and run sanity checks (root privileges, systemd host, root is
 *      not "/", directory looks like an OS tree).
 *   2. Collect passed-in listen fds, compute the container cgroup path and
 *      refuse to start if that cgroup is in use.
 *   3. Allocate a pty, save the terminal attributes, create the kmsg socket
 *      pair, block signals, and clone() a child into new namespaces.
 *   4. Child: attach to the pty, set up cgroup, mounts, /dev, journal and
 *      bind mounts, move into the new root, drop capabilities and
 *      credentials, build the environment and exec init, the given command
 *      or /bin/bash.
 *   5. Parent: relay the pty via process_pty(), wait for the child, map its
 *      exit status to log output, then clean up.
 *
 * Many conditions, goto/return statements and declarations are elided in
 * this listing; inline notes describe only the visible statements.
 *
 * NOTE(review): the error messages after the setresgid()/setresuid() calls
 * read "setregid() failed"/"setreuid() failed", naming different functions
 * than the ones called.
 * NOTE(review): the clone() EINVAL message lists NET namespacing although
 * CLONE_NEWNET is only requested when arg_private_network is set. */
1163 int main(int argc, char *argv[]) {
1165 int r = EXIT_FAILURE, k;
1166 _cleanup_free_ char *machine_root = NULL, *newcg = NULL;
1167 _cleanup_close_ int master = -1;
1169 const char *console = NULL;
1170 struct termios saved_attr, raw_attr;
1172 bool saved_attr_valid = false;
1174 int kmsg_socket_pair[2] = { -1, -1 };
1177 log_parse_environment();
1180 r = parse_argv(argc, argv);
/* Container root: an explicitly given directory is made absolute relative to
 * the cwd; the get_current_dir_name() call is apparently the fallback when no
 * directory was given (the else line is elided). */
1184 if (arg_directory) {
1187 p = path_make_absolute_cwd(arg_directory);
1188 free(arg_directory);
1191 arg_directory = get_current_dir_name();
1193 if (!arg_directory) {
1194 log_error("Failed to determine path");
1198 path_kill_slashes(arg_directory);
/* Machine name taken from the last component of the root path (the guarding
 * condition is elided), then cleaned up; an empty result is an error. */
1201 arg_machine = strdup(path_get_file_name(arg_directory));
1207 hostname_cleanup(arg_machine);
1208 if (isempty(arg_machine)) {
1209 log_error("Failed to determine machine name automatically, please use -M.");
1214 if (geteuid() != 0) {
1215 log_error("Need to be root.");
1219 if (sd_booted() <= 0) {
1220 log_error("Not running on a systemd system.");
1224 if (path_equal(arg_directory, "/")) {
1225 log_error("Spawning container on root directory not supported.");
1229 if (path_is_os_tree(arg_directory) <= 0) {
1230 log_error("Directory %s doesn't look like an OS root directory. Refusing.", arg_directory);
/* Socket-activation fds handed to us are gathered into fds so they can be
 * passed on to the container payload (see LISTEN_FDS below). */
1235 n_fd_passed = sd_listen_fds(false);
1236 if (n_fd_passed > 0) {
1237 k = fdset_new_listen_fds(&fds, false);
1239 log_error("Failed to collect file descriptors: %s", strerror(-k));
1243 fdset_close_others(fds);
1246 k = cg_get_machine_path(&machine_root);
1248 log_error("Failed to determine machine cgroup path: %s", strerror(-k));
1252 newcg = strjoin(machine_root, "/", arg_machine, NULL);
1254 log_error("Failed to allocate cgroup path.");
/* A result of 0, or any negative result other than -ENOENT, is reported as
 * "Container already running". */
1258 r = cg_is_empty_recursive(SYSTEMD_CGROUP_CONTROLLER, newcg, false);
1259 if (r <= 0 && r != -ENOENT) {
1260 log_error("Container already running.");
/* Allocate the pty that becomes the container's console. */
1268 master = posix_openpt(O_RDWR|O_NOCTTY|O_CLOEXEC|O_NDELAY);
1270 log_error("Failed to acquire pseudo tty: %m");
1274 console = ptsname(master);
1276 log_error("Failed to determine tty name: %m");
1280 log_info("Spawning namespace container on %s (console is %s).", arg_directory, console);
/* Copy the current window size of stdin to the pty, if it can be read. */
1282 if (ioctl(STDIN_FILENO, TIOCGWINSZ, &ws) >= 0)
1283 ioctl(master, TIOCSWINSZ, &ws);
1285 if (unlockpt(master) < 0) {
1286 log_error("Failed to unlock tty: %m");
/* Remember the terminal settings for restoration and derive a raw, no-echo
 * variant from them. */
1290 if (tcgetattr(STDIN_FILENO, &saved_attr) >= 0) {
1291 saved_attr_valid = true;
1293 raw_attr = saved_attr;
1294 cfmakeraw(&raw_attr);
1295 raw_attr.c_lflag &= ~ECHO;
1298 if (socketpair(AF_UNIX, SOCK_DGRAM|SOCK_NONBLOCK|SOCK_CLOEXEC, 0, kmsg_socket_pair) < 0) {
1299 log_error("Failed to create kmsg socket pair.");
/* Block the signals that process_pty() later consumes through its signalfd. */
1303 assert_se(sigemptyset(&mask) == 0);
1304 sigset_add_many(&mask, SIGCHLD, SIGWINCH, SIGTERM, SIGINT, -1);
1305 assert_se(sigprocmask(SIG_BLOCK, &mask, NULL) == 0);
1311 if (pipe2(pipefd, O_NONBLOCK|O_CLOEXEC) < 0) {
1312 log_error("pipe2(): %m");
/* Raw clone(2): new IPC, mount, PID and UTS namespaces, plus a network
 * namespace when arg_private_network is set. */
1316 pid = syscall(__NR_clone, SIGCHLD|CLONE_NEWIPC|CLONE_NEWNS|CLONE_NEWPID|CLONE_NEWUTS|(arg_private_network ? CLONE_NEWNET : 0), NULL);
1318 if (errno == EINVAL)
1319 log_error("clone() failed, do you have namespace support enabled in your kernel? (You need UTS, IPC, PID and NET namespacing built in): %m");
1321 log_error("clone() failed: %m");
1328 const char *home = NULL;
1329 uid_t uid = (uid_t) -1;
1330 gid_t gid = (gid_t) -1;
1332 const char *envp[] = {
1333 "PATH=/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin",
1334 "container=systemd-nspawn", /* LXC sets container=lxc, so follow the scheme here */
1339 NULL, /* container_uuid */
1340 NULL, /* LISTEN_FDS */
1341 NULL, /* LISTEN_PID */
1345 envp[n_env] = strv_find_prefix(environ, "TERM=");
/* Child side: close the write end and wait for POLLHUP on the read end, i.e.
 * until the parent has closed its copies of the pipe. */
1349 close_nointr_nofail(pipefd[1]);
1350 fd_wait_for_event(pipefd[0], POLLHUP, -1);
1351 close_nointr_nofail(pipefd[0]);
1353 close_nointr_nofail(master);
1356 if (saved_attr_valid) {
1357 if (tcsetattr(STDIN_FILENO, TCSANOW, &raw_attr) < 0) {
1358 log_error("Failed to set terminal attributes: %m");
/* Release the inherited stdio so the console can be opened as fd 0 below. */
1363 close_nointr(STDIN_FILENO);
1364 close_nointr(STDOUT_FILENO);
1365 close_nointr(STDERR_FILENO);
1367 close_nointr_nofail(kmsg_socket_pair[0]);
1368 kmsg_socket_pair[0] = -1;
1370 reset_all_signal_handlers();
1372 assert_se(sigemptyset(&mask) == 0);
1373 assert_se(sigprocmask(SIG_SETMASK, &mask, NULL) == 0);
/* The console must land on STDIN_FILENO; it is then duplicated onto stdout
 * and stderr. */
1375 k = open_terminal(console, O_RDWR);
1376 if (k != STDIN_FILENO) {
1378 close_nointr_nofail(k);
1382 log_error("Failed to open console: %s", strerror(-k));
1386 if (dup2(STDIN_FILENO, STDOUT_FILENO) != STDOUT_FILENO ||
1387 dup2(STDIN_FILENO, STDERR_FILENO) != STDERR_FILENO) {
1388 log_error("Failed to duplicate console: %m");
1393 log_error("setsid() failed: %m");
/* Ask the kernel to SIGKILL this child when its parent dies. */
1397 if (prctl(PR_SET_PDEATHSIG, SIGKILL) < 0) {
1398 log_error("PR_SET_PDEATHSIG failed: %m");
1402 if (setup_cgroup(newcg) < 0)
1405 /* Mark everything as slave, so that we still
1406 * receive mounts from the real root, but don't
1407 * propagate mounts to the real root. */
1408 if (mount(NULL, "/", NULL, MS_SLAVE|MS_REC, NULL) < 0) {
1409 log_error("MS_SLAVE|MS_REC failed: %m");
1413 /* Turn directory into bind mount */
1414 if (mount(arg_directory, arg_directory, "bind", MS_BIND|MS_REC, NULL) < 0) {
1415 log_error("Failed to make bind mount.");
/* Read-only remount of the root; the guarding condition is elided
 * (presumably arg_read_only — confirm). */
1420 if (mount(arg_directory, arg_directory, "bind", MS_BIND|MS_REMOUNT|MS_RDONLY|MS_REC, NULL) < 0) {
1421 log_error("Failed to make read-only.");
/* Populate the container tree; each helper's negative result aborts the
 * child (the controlled statements are elided). */
1425 if (mount_all(arg_directory) < 0)
1428 if (copy_devnodes(arg_directory) < 0)
1431 if (setup_ptmx(arg_directory) < 0)
1434 dev_setup(arg_directory);
1436 if (setup_dev_console(arg_directory, console) < 0)
1439 if (setup_kmsg(arg_directory, kmsg_socket_pair[1]) < 0)
1442 close_nointr_nofail(kmsg_socket_pair[1]);
1443 kmsg_socket_pair[1] = -1;
1445 if (setup_boot_id(arg_directory) < 0)
1448 if (setup_timezone(arg_directory) < 0)
1451 if (setup_resolv_conf(arg_directory) < 0)
1454 if (setup_journal(arg_directory) < 0)
1457 if (mount_binds(arg_directory, arg_bind, 0) < 0)
1460 if (mount_binds(arg_directory, arg_bind_ro, MS_RDONLY) < 0)
/* Switch root: enter the directory, move its mount onto "/", chroot to the
 * current directory and chdir to the new "/". */
1463 if (chdir(arg_directory) < 0) {
1464 log_error("chdir(%s) failed: %m", arg_directory);
1468 if (mount(arg_directory, "/", NULL, MS_MOVE, NULL) < 0) {
1469 log_error("mount(MS_MOVE) failed: %m");
1473 if (chroot(".") < 0) {
1474 log_error("chroot() failed: %m");
1478 if (chdir("/") < 0) {
1479 log_error("chdir() failed: %m");
1487 if (drop_capabilities() < 0) {
1488 log_error("drop_capabilities() failed: %m");
1494 /* Note that this resolves user names
1495 * inside the container, and hence
1496 * accesses the NSS modules from the
1497 * container and not the host. This is
1500 if (get_user_creds((const char**)&arg_user, &uid, &gid, &home, NULL) < 0) {
1501 log_error("get_user_creds() failed: %m");
1505 if (mkdir_parents_label(home, 0775) < 0) {
1506 log_error("mkdir_parents_label() failed: %m");
1510 if (mkdir_safe_label(home, 0775, uid, gid) < 0) {
1511 log_error("mkdir_safe_label() failed: %m");
1515 if (initgroups((const char*)arg_user, gid) < 0) {
1516 log_error("initgroups() failed: %m");
1520 if (setresgid(gid, gid, gid) < 0) {
1521 log_error("setregid() failed: %m");
1525 if (setresuid(uid, uid, uid) < 0) {
1526 log_error("setreuid() failed: %m");
1530 /* Reset everything fully to 0, just in case */
1532 if (setgroups(0, NULL) < 0) {
1533 log_error("setgroups() failed: %m");
1537 if (setresgid(0, 0, 0) < 0) {
1538 log_error("setregid() failed: %m");
1542 if (setresuid(0, 0, 0) < 0) {
1543 log_error("setreuid() failed: %m");
/* Fill the free envp slots; each asprintf() writes into the next slot and
 * advances n_env. */
1548 if ((asprintf((char**)(envp + n_env++), "HOME=%s", home ? home: "/root") < 0) ||
1549 (asprintf((char**)(envp + n_env++), "USER=%s", arg_user ? arg_user : "root") < 0) ||
1550 (asprintf((char**)(envp + n_env++), "LOGNAME=%s", arg_user ? arg_user : "root") < 0)) {
1556 if (asprintf((char**)(envp + n_env++), "container_uuid=%s", arg_uuid) < 0) {
/* Passed-in fds must survive exec, so O_CLOEXEC is cleared; LISTEN_PID is
 * hard-coded to 1. */
1562 if (fdset_size(fds) > 0) {
1563 k = fdset_cloexec(fds, false);
1565 log_error("Failed to unset O_CLOEXEC for file descriptors.");
1569 if ((asprintf((char **)(envp + n_env++), "LISTEN_FDS=%u", n_fd_passed) < 0) ||
1570 (asprintf((char **)(envp + n_env++), "LISTEN_PID=%lu", (unsigned long) 1) < 0)) {
1582 /* Automatically search for the init system */
/* a[0] is reserved for the init path; the remaining command-line arguments
 * follow it. Each execve() only returns on failure, so the candidates are
 * tried in order. */
1584 l = 1 + argc - optind;
1585 a = newa(char*, l + 1);
1586 memcpy(a + 1, argv + optind, l * sizeof(char*));
1588 a[0] = (char*) "/usr/lib/systemd/systemd";
1589 execve(a[0], a, (char**) envp);
1591 a[0] = (char*) "/lib/systemd/systemd";
1592 execve(a[0], a, (char**) envp);
1594 a[0] = (char*) "/sbin/init";
1595 execve(a[0], a, (char**) envp);
1596 } else if (argc > optind)
1597 execvpe(argv[optind], argv + optind, (char**) envp);
1599 chdir(home ? home : "/root");
1600 execle("/bin/bash", "-bash", NULL, (char**) envp);
1603 log_error("execv() failed: %m");
1606 _exit(EXIT_FAILURE);
/* Parent side: closing both pipe ends produces the POLLHUP the child waits
 * for. */
1609 log_info("Init process in the container running as PID %lu.", (unsigned long) pid);
1610 close_nointr_nofail(pipefd[0]);
1611 close_nointr_nofail(pipefd[1]);
1616 if (process_pty(master, pid, &mask) < 0)
1619 if (saved_attr_valid)
1620 tcsetattr(STDIN_FILENO, TCSANOW, &saved_attr);
1622 r = wait_for_terminate(pid, &status);
/* Interpret how the child ended: a normal exit propagates a non-zero status
 * into r; death by SIGINT is logged as shutdown and by SIGHUP as reboot; any
 * other signal or core dump is an error. */
1628 if (status.si_code == CLD_EXITED) {
1629 if (status.si_status != 0) {
1630 log_error("Container failed with error code %i.", status.si_status);
1631 r = status.si_status;
1635 log_debug("Container exited successfully.");
1637 } else if (status.si_code == CLD_KILLED &&
1638 status.si_status == SIGINT) {
1639 log_info("Container has been shut down.");
1642 } else if (status.si_code == CLD_KILLED &&
1643 status.si_status == SIGHUP) {
1644 log_info("Container is being rebooted.");
1646 } else if (status.si_code == CLD_KILLED ||
1647 status.si_code == CLD_DUMPED) {
1649 log_error("Container terminated by signal %s.", signal_to_string(status.si_status));
1653 log_error("Container failed due to unknown reason.");
/* Common cleanup: restore the terminal, close the kmsg socket pair, kill
 * whatever is left in the container cgroup and free the globals. */
1660 if (saved_attr_valid)
1661 tcsetattr(STDIN_FILENO, TCSANOW, &saved_attr);
1663 close_pipe(kmsg_socket_pair);
1666 cg_kill_recursive_and_wait(SYSTEMD_CGROUP_CONTROLLER, newcg, true);
1668 free(arg_directory);
1670 strv_free(arg_controllers);