1 /*-*- Mode: C; c-basic-offset: 8; indent-tabs-mode: nil -*-*/
4 This file is part of systemd.
6 Copyright 2010 Lennart Poettering
8 systemd is free software; you can redistribute it and/or modify it
9 under the terms of the GNU Lesser General Public License as published by
10 the Free Software Foundation; either version 2.1 of the License, or
11 (at your option) any later version.
13 systemd is distributed in the hope that it will be useful, but
14 WITHOUT ANY WARRANTY; without even the implied warranty of
15 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
16 Lesser General Public License for more details.
18 You should have received a copy of the GNU Lesser General Public License
19 along with systemd; If not, see <http://www.gnu.org/licenses/>.
25 #include <sys/types.h>
26 #include <sys/syscall.h>
27 #include <sys/mount.h>
33 #include <sys/prctl.h>
34 #include <sys/capability.h>
37 #include <sys/epoll.h>
39 #include <sys/signalfd.h>
43 #include <sys/socket.h>
45 #include <systemd/sd-daemon.h>
53 #include "cgroup-util.h"
55 #include "path-util.h"
56 #include "loopback-setup.h"
58 #include "dev-setup.h"
67 typedef enum LinkJournal {
/* Command-line configuration. These file-scope variables are filled in by
 * parse_argv() and read by the setup helpers further down. */

/* Container root; set from -D/--directory via canonicalize_file_name(). */
74 static char *arg_directory = NULL;
/* User name or uid from -u/--user (copied with strdup()). */
75 static char *arg_user = NULL;
/* Extra cgroup controllers from -C/--controllers, split at commas. */
76 static char **arg_controllers = NULL;
/* Machine UUID for --uuid; exported to the payload as container_uuid=. */
77 static char *arg_uuid = NULL;
/* Machine name from --machine; passed to sethostname() in setup_hostname(). */
78 static char *arg_machine = NULL;
/* Boolean switches for --private-network, --read-only and -b/--boot. */
79 static bool arg_private_network = false;
80 static bool arg_read_only = false;
81 static bool arg_boot = false;
/* Journal linking mode selected with --link-journal (default: LINK_AUTO). */
82 static LinkJournal arg_link_journal = LINK_AUTO;
/* Bit mask (one bit per CAP_* number) of capabilities to retain;
 * drop_capabilities() passes the complement of this mask to
 * capability_bounding_set_drop(), and --capability= ORs further bits in.
 * NOTE(review): the embedded numbering skips a few lines inside this
 * initializer, so the mask shown here may be incomplete -- verify against
 * the full file. */
83 static uint64_t arg_retain =
85 (1ULL << CAP_DAC_OVERRIDE) |
86 (1ULL << CAP_DAC_READ_SEARCH) |
87 (1ULL << CAP_FOWNER) |
88 (1ULL << CAP_FSETID) |
89 (1ULL << CAP_IPC_OWNER) |
92 (1ULL << CAP_LINUX_IMMUTABLE) |
93 (1ULL << CAP_NET_BIND_SERVICE) |
94 (1ULL << CAP_NET_BROADCAST) |
95 (1ULL << CAP_NET_RAW) |
96 (1ULL << CAP_SETGID) |
97 (1ULL << CAP_SETFCAP) |
98 (1ULL << CAP_SETPCAP) |
99 (1ULL << CAP_SETUID) |
100 (1ULL << CAP_SYS_ADMIN) |
101 (1ULL << CAP_SYS_CHROOT) |
102 (1ULL << CAP_SYS_NICE) |
103 (1ULL << CAP_SYS_PTRACE) |
104 (1ULL << CAP_SYS_TTY_CONFIG) |
105 (1ULL << CAP_SYS_RESOURCE) |
106 (1ULL << CAP_SYS_BOOT) |
107 (1ULL << CAP_AUDIT_WRITE) |
108 (1ULL << CAP_AUDIT_CONTROL);
/* Bind mount lists for --bind= and --bind-ro=. parse_argv() appends two
 * entries per option (source, then destination) and mount_binds() walks
 * them pairwise. */
109 static char **arg_bind = NULL;
110 static char **arg_bind_ro = NULL;
/* Prints the usage text to stdout; the leading %s is replaced with
 * program_invocation_short_name.
 *
 * NOTE(review): "-j" is documented here as "--link-journal=host", but the
 * only unconditional LINK_* assignment visible in parse_argv() stores
 * LINK_GUEST -- confirm which of the two is intended and align them.
 * NOTE(review): "-M" is advertised as a short option, yet the option string
 * handed to getopt_long() in parse_argv() ("+hD:u:C:bj") has no 'M'. */
112 static int help(void) {
114 printf("%s [OPTIONS...] [PATH] [ARGUMENTS...]\n\n"
115 "Spawn a minimal namespace container for debugging, testing and building.\n\n"
116 " -h --help Show this help\n"
117 " --version Print version string\n"
118 " -D --directory=NAME Root directory for the container\n"
119 " -b --boot Boot up full system (i.e. invoke init)\n"
120 " -u --user=USER Run the command under specified user or uid\n"
121 " -C --controllers=LIST Put the container in specified comma-separated\n"
122 " cgroup hierarchies\n"
123 " --uuid=UUID Set a specific machine UUID for the container\n"
124 " -M --machine=NAME Set the machine name for the container\n"
125 " --private-network Disable network in container\n"
126 " --read-only Mount the root directory read-only\n"
127 " --capability=CAP In addition to the default, retain specified\n"
129 " --link-journal=MODE Link up guest journal, one of no, auto, guest, host\n"
130 " -j Equivalent to --link-journal=host\n"
131 " --bind=PATH[:PATH] Bind mount a file or directory from the host into\n"
133 " --bind-ro=PATH[:PATH] Similar, but creates a read-only bind mount\n",
134 program_invocation_short_name);
/* Parses the command line with getopt_long() and stores the results in the
 * arg_* globals. Several case labels and return statements are not visible
 * in this listing, so the notes below describe only the statements shown. */
139 static int parse_argv(int argc, char *argv[]) {
/* Long option table. Every entry has a NULL flag pointer, so getopt_long()
 * returns the value in the last column (a short option character or one of
 * the ARG_* codes). */
152 static const struct option options[] = {
153 { "help", no_argument, NULL, 'h' },
154 { "version", no_argument, NULL, ARG_VERSION },
155 { "directory", required_argument, NULL, 'D' },
156 { "user", required_argument, NULL, 'u' },
157 { "controllers", required_argument, NULL, 'C' },
158 { "private-network", no_argument, NULL, ARG_PRIVATE_NETWORK },
159 { "boot", no_argument, NULL, 'b' },
160 { "uuid", required_argument, NULL, ARG_UUID },
161 { "read-only", no_argument, NULL, ARG_READ_ONLY },
162 { "capability", required_argument, NULL, ARG_CAPABILITY },
163 { "link-journal", required_argument, NULL, ARG_LINK_JOURNAL },
164 { "bind", required_argument, NULL, ARG_BIND },
165 { "bind-ro", required_argument, NULL, ARG_BIND_RO },
166 { "machine", required_argument, NULL, 'M' },
/* The leading '+' makes getopt_long() stop at the first non-option, so the
 * arguments of the container payload are left untouched.
 * NOTE(review): 'M' is missing from this option string although the table
 * above maps --machine to 'M' and help() advertises "-M"; "M:" probably
 * needs to be added. */
175 while ((c = getopt_long(argc, argv, "+hD:u:C:bj", options, NULL)) >= 0) {
/* Version output: package string followed by the compiled-in feature list. */
184 puts(PACKAGE_STRING);
185 puts(SYSTEMD_FEATURES);
/* Root directory: resolved to a canonical absolute path. */
190 arg_directory = canonicalize_file_name(optarg);
191 if (!arg_directory) {
192 log_error("Failed to canonicalize root directory.");
200 arg_user = strdup(optarg);
/* Controllers: any previous list is dropped, the argument is split at
 * commas and the result is handed to cg_shorten_controllers(). */
207 strv_free(arg_controllers);
208 arg_controllers = strv_split(optarg, ",");
209 if (!arg_controllers)
212 cg_shorten_controllers(arg_controllers);
215 case ARG_PRIVATE_NETWORK:
216 arg_private_network = true;
/* Machine name: must pass hostname_is_valid() before it is copied. */
228 if (!hostname_is_valid(optarg)) {
229 log_error("Invalid machine name: %s", optarg);
234 arg_machine = strdup(optarg);
241 arg_read_only = true;
/* Capabilities: each comma-separated word is copied, translated with
 * cap_from_name() and its bit is ORed into arg_retain. */
244 case ARG_CAPABILITY: {
248 FOREACH_WORD_SEPARATOR(word, length, optarg, ",", state) {
252 t = strndup(word, length);
256 if (cap_from_name(t, &cap) < 0) {
257 log_error("Failed to parse capability %s.", t);
263 arg_retain |= 1ULL << (uint64_t) cap;
/* Presumably the 'j' case (its label is not visible here): selects
 * LINK_GUEST, which disagrees with the help text -- TODO confirm. */
270 arg_link_journal = LINK_GUEST;
/* --link-journal= accepts exactly "auto", "no", "guest" or "host". */
273 case ARG_LINK_JOURNAL:
274 if (streq(optarg, "auto"))
275 arg_link_journal = LINK_AUTO;
276 else if (streq(optarg, "no"))
277 arg_link_journal = LINK_NO;
278 else if (streq(optarg, "guest"))
279 arg_link_journal = LINK_GUEST;
280 else if (streq(optarg, "host"))
281 arg_link_journal = LINK_HOST;
283 log_error("Failed to parse link journal mode %s", optarg);
/* --bind= / --bind-ro=: the target list is chosen by option code, the
 * argument is split at the first ':' and both paths must be absolute. The
 * two paths are appended to the list as consecutive entries. */
291 _cleanup_free_ char *a = NULL, *b = NULL;
296 x = c == ARG_BIND ? &arg_bind : &arg_bind_ro;
298 e = strchr(optarg, ':');
300 a = strndup(optarg, e - optarg);
310 if (!path_is_absolute(a) || !path_is_absolute(b)) {
311 log_error("Invalid bind mount specification: %s", optarg);
315 r = strv_extend(x, a);
319 r = strv_extend(x, b);
330 log_error("Unknown option code %c", c);
/* Mounts the entries of a built-in table (proc, sysfs, tmpfs instances,
 * devpts, ...) below the directory `dest`. */
338 static int mount_all(const char *dest) {
340 typedef struct MountPoint {
/* Each row is used below through the members what, where, flags, options
 * and fatal; the third column is presumably the file system type (the
 * struct members themselves are not visible in this listing). Rows whose
 * first column is NULL carry MS_REMOUNT and follow a bind mount of the same
 * path, turning it read-only. */
349 static const MountPoint mount_table[] = {
350 { "proc", "/proc", "proc", NULL, MS_NOSUID|MS_NOEXEC|MS_NODEV, true },
351 { "/proc/sys", "/proc/sys", NULL, NULL, MS_BIND, true }, /* Bind mount first */
352 { NULL, "/proc/sys", NULL, NULL, MS_BIND|MS_RDONLY|MS_REMOUNT, true }, /* Then, make it r/o */
353 { "sysfs", "/sys", "sysfs", NULL, MS_RDONLY|MS_NOSUID|MS_NOEXEC|MS_NODEV, true },
354 { "tmpfs", "/dev", "tmpfs", "mode=755", MS_NOSUID|MS_STRICTATIME, true },
355 { "devpts", "/dev/pts", "devpts","newinstance,ptmxmode=0666,mode=620,gid=" STRINGIFY(TTY_GID), MS_NOSUID|MS_NOEXEC, true },
356 { "tmpfs", "/dev/shm", "tmpfs", "mode=1777", MS_NOSUID|MS_NODEV|MS_STRICTATIME, true },
357 { "tmpfs", "/run", "tmpfs", "mode=755", MS_NOSUID|MS_NODEV|MS_STRICTATIME, true },
359 { "/sys/fs/selinux", "/sys/fs/selinux", NULL, NULL, MS_BIND, false }, /* Bind mount first */
360 { NULL, "/sys/fs/selinux", NULL, NULL, MS_BIND|MS_RDONLY|MS_REMOUNT, false }, /* Then, make it r/o */
367 for (k = 0; k < ELEMENTSOF(mount_table); k++) {
368 _cleanup_free_ char *where = NULL;
/* Target path inside the container tree: dest + "/" + table path. */
371 where = strjoin(dest, "/", mount_table[k].where, NULL);
375 t = path_is_mount_point(where, true);
377 log_error("Failed to detect whether %s is a mount point: %s", where, strerror(-t));
385 /* Skip this entry if it is not a remount. */
386 if (mount_table[k].what && t > 0)
/* Make sure the mount point exists; the result of mkdir_p() is ignored. */
389 mkdir_p(where, 0755);
/* A failing mount() is only reported when the row is marked fatal. */
391 if (mount(mount_table[k].what,
394 mount_table[k].flags,
395 mount_table[k].options) < 0 &&
396 mount_table[k].fatal) {
398 log_error("mount(%s) failed: %m", where);
/* Applies the bind mounts in `l`, which is walked two entries at a time
 * (x = source, y = destination). Each destination is created below `dest`
 * and the source is bind mounted on it. If `flags` is non-zero, the mount
 * is then remounted with MS_REMOUNT|MS_BIND|flags. */
408 static int mount_binds(const char *dest, char **l, unsigned long flags) {
411 STRV_FOREACH_PAIR(x, y, l) {
412 _cleanup_free_ char *where = NULL;
414 where = strjoin(dest, "/", *y, NULL);
/* Result ignored: a missing directory surfaces as a mount() error below. */
418 mkdir_p_label(where, 0755);
420 if (mount(*x, where, "bind", MS_BIND, NULL) < 0) {
421 log_error("mount(%s) failed: %m", where);
/* Second pass applies the extra flags, since they are not part of the
 * initial MS_BIND call above. */
425 if (flags && mount(NULL, where, NULL, MS_REMOUNT|MS_BIND|flags, NULL) < 0) {
426 log_error("mount(%s) failed: %m", where);
/* Makes <dest>/etc/localtime a symlink to the same zone the host's
 * /etc/localtime points to, provided the host link points into
 * /usr/share/zoneinfo/ and the zone file exists inside the container. */
434 static int setup_timezone(const char *dest) {
435 _cleanup_free_ char *where = NULL, *p = NULL, *q = NULL, *check = NULL, *what = NULL;
441 /* Fix the timezone, if possible */
/* p: target of the host's /etc/localtime symlink. */
442 r = readlink_malloc("/etc/localtime", &p);
444 log_warning("/etc/localtime is not a symlink, not updating container timezone.");
/* z: zone name, i.e. what follows the zoneinfo prefix; the relative and
 * the absolute form of the prefix are both tried. */
448 z = path_startswith(p, "../usr/share/zoneinfo/");
450 z = path_startswith(p, "/usr/share/zoneinfo/");
452 log_warning("/etc/localtime does not point into /usr/share/zoneinfo/, not updating container timezone.");
456 where = strappend(dest, "/etc/localtime");
/* q/y: the same lookup for the container's current link. */
460 r = readlink_malloc(where, &q);
462 y = path_startswith(q, "../usr/share/zoneinfo/");
464 y = path_startswith(q, "/usr/share/zoneinfo/");
467 /* Already pointing to the right place? Then do nothing .. */
468 if (y && streq(y, z))
/* The zone file must exist in the container tree before linking to it. */
472 check = strjoin(dest, "/usr/share/zoneinfo/", z, NULL);
476 if (access(check, F_OK) < 0) {
477 log_warning("Timezone %s does not exist in container, not updating container timezone.", z);
/* The link target is relative, so it resolves inside the container. */
481 what = strappend("../usr/share/zoneinfo/", z);
486 if (symlink(what, where) < 0) {
487 log_error("Failed to correct timezone of container: %m");
/* Bind mounts the host's /etc/resolv.conf over <dest>/etc/resolv.conf and
 * remounts it read-only. The arg_private_network check at the top suggests
 * the step is skipped for --private-network (the statement guarded by that
 * check is not visible in this listing). */
494 static int setup_resolv_conf(const char *dest) {
495 char _cleanup_free_ *where = NULL;
496 _cleanup_close_ int fd = -1;
500 if (arg_private_network)
503 /* Fix resolv.conf, if possible */
504 where = strappend(dest, "/etc/resolv.conf");
/* Creates the target file if it does not exist yet (O_CREAT|O_EXCL) so
 * there is something to mount on; the result is not checked on the visible
 * lines. */
508 fd = open(where, O_WRONLY|O_CREAT|O_EXCL|O_CLOEXEC|O_NOCTTY|O_NOFOLLOW, 0644);
510 /* We don't really care for the results of this really. If it
511 * fails, it fails, but meh... */
512 if (mount("/etc/resolv.conf", where, "bind", MS_BIND, NULL) < 0)
513 log_warning("Failed to bind mount /etc/resolv.conf: %m");
/* Unlike the bind mount above (warning only), a failing read-only remount
 * is logged as an error. */
515 if (mount("/etc/resolv.conf", where, "bind",
516 MS_BIND|MS_REMOUNT|MS_RDONLY, NULL) < 0) {
517 log_error("Failed to remount /etc/resolv.conf readonly: %m");
/* Gives the container its own boot ID: a random 128-bit ID is formatted as
 * a UUID string, written to a file below <dest>/dev and bind mounted over
 * <dest>/proc/sys/kernel/random/boot_id. */
524 static int setup_boot_id(const char *dest) {
525 _cleanup_free_ char *from = NULL, *to = NULL;
532 /* Generate a new randomized boot ID, so that each boot-up of
533 * the container gets a new one */
/* from: file holding the generated ID; to: the proc file it will cover. */
535 from = strappend(dest, "/dev/proc-sys-kernel-random-boot-id");
536 to = strappend(dest, "/proc/sys/kernel/random/boot_id");
540 r = sd_id128_randomize(&rnd);
542 log_error("Failed to generate random boot id: %s", strerror(-r));
/* Format the 16 bytes in the 8-4-4-4-12 UUID layout. */
546 snprintf(as_uuid, sizeof(as_uuid),
547 "%02x%02x%02x%02x-%02x%02x-%02x%02x-%02x%02x-%02x%02x%02x%02x%02x%02x",
548 SD_ID128_FORMAT_VAL(rnd));
549 char_array_0(as_uuid);
551 r = write_string_file(from, as_uuid);
553 log_error("Failed to write boot id: %s", strerror(-r));
/* A failing bind mount is an error; failing to make it read-only afterwards
 * only produces a warning. */
557 if (mount(from, to, "bind", MS_BIND, NULL) < 0) {
558 log_error("Failed to bind mount boot id: %m");
560 } else if (mount(from, to, "bind", MS_BIND|MS_REMOUNT|MS_RDONLY, NULL))
561 log_warning("Failed to make boot id read-only: %m");
/* Recreates a fixed list of host device nodes below <dest>/dev: every name
 * in `devnodes` is stat()ed under /dev and, if it is a character or block
 * device, created with the same mode and device number via mknod(). The
 * list entries themselves are not visible in this listing. */
567 static int copy_devnodes(const char *dest) {
569 static const char devnodes[] =
579 _cleanup_umask_ mode_t u;
585 NULSTR_FOREACH(d, devnodes) {
587 _cleanup_free_ char *from = NULL, *to = NULL;
/* from: node on the host; to: node to create inside the container tree.
 * The asprintf() return values are not inspected on these two lines. */
589 asprintf(&from, "/dev/%s", d);
590 asprintf(&to, "%s/dev/%s", dest, d);
/* A host node that does not exist (ENOENT) is not reported as an error. */
601 if (stat(from, &st) < 0) {
603 if (errno != ENOENT) {
604 log_error("Failed to stat %s: %m", from);
609 } else if (!S_ISCHR(st.st_mode) && !S_ISBLK(st.st_mode)) {
611 log_error("%s is not a char or block device, cannot copy", from);
/* NOTE(review): the message below prints `dest`, while the path handed to
 * mknod() is `to` -- logging `to` would identify the failing node. */
615 } else if (mknod(to, st.st_mode, st.st_rdev) < 0) {
617 log_error("mknod(%s) failed: %m", dest);
/* Creates <dest>/dev/ptmx as a symlink to "pts/ptmx", i.e. to the ptmx node
 * of the file system mounted on <dest>/dev/pts. */
626 static int setup_ptmx(const char *dest) {
627 _cleanup_free_ char *p = NULL;
629 p = strappend(dest, "/dev/ptmx");
/* The target is relative, so it resolves next to the link in <dest>/dev. */
633 if (symlink("pts/ptmx", p) < 0) {
634 log_error("Failed to create /dev/ptmx symlink: %m");
/* Makes the TTY `console` available as <dest>/dev/console. The TTY must be
 * a character device; its mode and ownership are set to 0600, 0:0. A device
 * node with the TTY's device number and mode 0600 is then created at
 * <dest>/dev/console and the TTY is bind mounted on top of it. */
641 static int setup_dev_console(const char *dest, const char *console) {
643 _cleanup_free_ char *to = NULL;
645 _cleanup_umask_ mode_t u;
652 if (stat(console, &st) < 0) {
653 log_error("Failed to stat %s: %m", console);
656 } else if (!S_ISCHR(st.st_mode)) {
657 log_error("/dev/console is not a char device");
661 r = chmod_and_chown(console, 0600, 0, 0);
663 log_error("Failed to correct access mode for TTY: %s", strerror(-r));
667 if (asprintf(&to, "%s/dev/console", dest) < 0)
670 /* We need to bind mount the right tty to /dev/console since
671 * ptys can only exist on pts file systems. To have something
672 * to bind mount things on we create a device node first, that
673 * has the right major/minor (note that the major minor
674 * doesn't actually matter here, since we mount it over
677 if (mknod(to, (st.st_mode & ~07777) | 0600, st.st_rdev) < 0) {
678 log_error("mknod() for /dev/console failed: %m");
682 if (mount(console, to, "bind", MS_BIND, NULL) < 0) {
683 log_error("Bind mount for /dev/console failed: %m");
690 static int setup_kmsg(const char *dest, int kmsg_socket) {
691 _cleanup_free_ char *from = NULL, *to = NULL;
693 _cleanup_umask_ mode_t u;
695 struct cmsghdr cmsghdr;
696 uint8_t buf[CMSG_SPACE(sizeof(int))];
699 .msg_control = &control,
700 .msg_controllen = sizeof(control),
702 struct cmsghdr *cmsg;
705 assert(kmsg_socket >= 0);
709 /* We create the kmsg FIFO as /dev/kmsg, but immediately
710 * delete it after bind mounting it to /proc/kmsg. While FIFOs
711 * on the reading side behave very similar to /proc/kmsg,
712 * their writing side behaves differently from /dev/kmsg in
713 * that writing blocks when nothing is reading. In order to
714 * avoid any problems with containers deadlocking due to this
715 * we simply make /dev/kmsg unavailable to the container. */
/* Summary of setup_kmsg(): a FIFO is created at <dest>/dev/kmsg, bind
 * mounted onto <dest>/proc/kmsg, opened, and the open descriptor is sent
 * over `kmsg_socket` as SCM_RIGHTS ancillary data. */
716 if (asprintf(&from, "%s/dev/kmsg", dest) < 0 ||
717 asprintf(&to, "%s/proc/kmsg", dest) < 0)
720 if (mkfifo(from, 0600) < 0) {
721 log_error("mkfifo() for /dev/kmsg failed: %m");
725 r = chmod_and_chown(from, 0600, 0, 0);
727 log_error("Failed to correct access mode for /dev/kmsg: %s", strerror(-r));
731 if (mount(from, to, "bind", MS_BIND, NULL) < 0) {
732 log_error("Bind mount for /proc/kmsg failed: %m");
/* O_NDELAY keeps this open() from blocking on the FIFO. */
736 fd = open(from, O_RDWR|O_NDELAY|O_CLOEXEC);
738 log_error("Failed to open fifo: %m");
/* Build a single control message carrying the descriptor. */
742 cmsg = CMSG_FIRSTHDR(&mh);
743 cmsg->cmsg_level = SOL_SOCKET;
744 cmsg->cmsg_type = SCM_RIGHTS;
745 cmsg->cmsg_len = CMSG_LEN(sizeof(int));
746 memcpy(CMSG_DATA(cmsg), &fd, sizeof(int));
748 mh.msg_controllen = cmsg->cmsg_len;
750 /* Store away the fd in the socket, so that it stays open as
751 * long as we run the child */
752 k = sendmsg(kmsg_socket, &mh, MSG_DONTWAIT|MSG_NOSIGNAL);
/* The local copy is closed right after sending, whatever sendmsg() returned. */
753 close_nointr_nofail(fd);
756 log_error("Failed to send FIFO fd: %m");
760 /* And now make the FIFO unavailable as /dev/kmsg... */
/* Sets the host name to arg_machine via sethostname(). What happens on
 * failure is not visible in this listing. */
765 static int setup_hostname(void) {
767 if (sethostname(arg_machine, strlen(arg_machine)) < 0)
/* Connects the journal directory of the container with the one on the host,
 * according to arg_link_journal. The machine ID is read from
 * <directory>/etc/machine-id; it names the per-machine journal directory on
 * both sides (/var/log/journal/<id>). Many branch bodies are missing from
 * this listing, so only the visible statements are described. */
773 static int setup_journal(const char *directory) {
774 sd_id128_t machine_id;
775 _cleanup_free_ char *p = NULL, *b = NULL, *q = NULL, *d = NULL;
779 if (arg_link_journal == LINK_NO)
782 p = strappend(directory, "/etc/machine-id");
/* In LINK_AUTO mode a missing or empty machine-id file gets its own check
 * (see the two LINK_AUTO conditions); other read errors are logged. */
786 r = read_one_line_file(p, &b);
787 if (r == -ENOENT && arg_link_journal == LINK_AUTO)
790 log_error("Failed to read machine ID from %s: %s", p, strerror(-r));
795 if (isempty(id) && arg_link_journal == LINK_AUTO)
798 /* Verify validity */
799 r = sd_id128_from_string(id, &machine_id);
801 log_error("Failed to parse machine ID from %s: %s", p, strerror(-r));
/* From here on: p = journal directory on the host, q = journal directory
 * inside the container tree. */
806 p = strappend("/var/log/journal/", id);
807 q = strjoin(directory, "/var/log/journal/", id, NULL);
/* Existing mount points on either side are only reported as errors when a
 * mode other than LINK_AUTO was requested. */
811 if (path_is_mount_point(p, false) > 0) {
812 if (arg_link_journal != LINK_AUTO) {
813 log_error("%s: already a mount point, refusing to use for journal", p);
820 if (path_is_mount_point(q, false) > 0) {
821 if (arg_link_journal != LINK_AUTO) {
822 log_error("%s: already a mount point, refusing to use for journal", q);
/* Inspect what currently exists at the host path: a symlink (target stored
 * in d), something that is not a symlink (-EINVAL), nothing (-ENOENT), or
 * another error. */
829 r = readlink_and_make_absolute(p, &d);
831 if ((arg_link_journal == LINK_GUEST ||
832 arg_link_journal == LINK_AUTO) &&
835 r = mkdir_p(q, 0755);
837 log_warning("failed to create directory %s: %m", q);
842 log_error("Failed to remove symlink %s: %m", p);
845 } else if (r == -EINVAL) {
847 if (arg_link_journal == LINK_GUEST &&
850 if (errno == ENOTDIR) {
851 log_error("%s already exists and is neither a symlink nor a directory", p);
854 log_error("Failed to remove %s: %m", p);
858 } else if (r != -ENOENT) {
859 log_error("readlink(%s) failed: %m", p);
/* LINK_GUEST: the host path becomes a symlink to the container's journal
 * directory, which is created as well. */
863 if (arg_link_journal == LINK_GUEST) {
865 if (symlink(q, p) < 0) {
866 log_error("Failed to symlink %s to %s: %m", q, p);
870 r = mkdir_p(q, 0755);
872 log_warning("failed to create directory %s: %m", q);
/* LINK_HOST: the host directory is created; in the remaining case its
 * existence is merely probed with access(). */
876 if (arg_link_journal == LINK_HOST) {
877 r = mkdir_p(p, 0755);
879 log_error("Failed to create %s: %m", p);
883 } else if (access(p, F_OK) < 0)
/* The container-side directory must not already hold data, because the
 * bind mount below would hide it. */
886 if (dir_is_empty(q) == 0) {
887 log_error("%s not empty.", q);
891 r = mkdir_p(q, 0755);
893 log_error("Failed to create %s: %m", q);
/* Finally expose the host directory inside the container. */
897 if (mount(p, q, "bind", MS_BIND, NULL) < 0) {
898 log_error("Failed to bind mount journal from host into guest: %m");
/* Calls cg_create_and_attach() for `path`, first in the
 * SYSTEMD_CGROUP_CONTROLLER hierarchy and then in every controller listed
 * in arg_controllers. The third argument is the literal 1 in all calls --
 * presumably the PID to attach; confirm against cg_create_and_attach(). */
905 static int setup_cgroup(const char *path) {
909 r = cg_create_and_attach(SYSTEMD_CGROUP_CONTROLLER, path, 1);
911 log_error("Failed to create cgroup: %s", strerror(-r));
/* Failures in the additional controllers are logged as warnings only. */
915 STRV_FOREACH(c, arg_controllers) {
916 r = cg_create_and_attach(*c, path, 1);
918 log_warning("Failed to create cgroup in controller %s: %s", *c, strerror(-r));
/* Drops every capability whose bit is not set in arg_retain by passing the
 * complement of that mask to capability_bounding_set_drop(), and returns
 * that function's result. */
924 static int drop_capabilities(void) {
925 return capability_bounding_set_drop(~arg_retain, false);
/* Shuttles data between the caller's stdin/stdout and the pty `master`, and
 * handles the signals in `mask` through a signalfd, until the loop is left.
 *
 *   master  pty master file descriptor
 *   pid     process that receives SIGRTMIN+3 on the first SIGTERM in boot mode
 *   mask    signal set handed to signalfd()
 *
 * The loop exits and the return statements are not visible in this listing. */
928 static int process_pty(int master, pid_t pid, sigset_t *mask) {
/* in_buffer: stdin -> master; out_buffer: master -> stdout. The *_full
 * counters hold the number of pending bytes in each buffer. */
930 char in_buffer[LINE_MAX], out_buffer[LINE_MAX];
931 size_t in_buffer_full = 0, out_buffer_full = 0;
932 struct epoll_event stdin_ev, stdout_ev, master_ev, signal_ev;
/* Readiness flags: set from epoll events and cleared again when the
 * matching read()/write() reports EAGAIN, EPIPE, ECONNRESET or EIO. */
933 bool stdin_readable = false, stdout_writable = false, master_readable = false, master_writable = false;
934 int ep = -1, signal_fd = -1, r;
935 bool tried_orderly_shutdown = false;
/* All three descriptors are switched to non-blocking mode, which the
 * edge-triggered (EPOLLET) registrations below rely on. */
941 fd_nonblock(STDIN_FILENO, 1);
942 fd_nonblock(STDOUT_FILENO, 1);
943 fd_nonblock(master, 1);
945 signal_fd = signalfd(-1, mask, SFD_NONBLOCK|SFD_CLOEXEC);
947 log_error("signalfd(): %m");
952 ep = epoll_create1(EPOLL_CLOEXEC);
954 log_error("Failed to create epoll: %m");
959 /* We read from STDIN only if this is actually a TTY,
960 * otherwise we assume non-interactivity. */
961 if (isatty(STDIN_FILENO)) {
963 stdin_ev.events = EPOLLIN|EPOLLET;
964 stdin_ev.data.fd = STDIN_FILENO;
966 if (epoll_ctl(ep, EPOLL_CTL_ADD, STDIN_FILENO, &stdin_ev) < 0) {
967 log_error("Failed to register STDIN in epoll: %m");
974 stdout_ev.events = EPOLLOUT|EPOLLET;
975 stdout_ev.data.fd = STDOUT_FILENO;
978 master_ev.events = EPOLLIN|EPOLLOUT|EPOLLET;
979 master_ev.data.fd = master;
/* The signalfd is registered without EPOLLET (level-triggered). */
982 signal_ev.events = EPOLLIN;
983 signal_ev.data.fd = signal_fd;
985 if (epoll_ctl(ep, EPOLL_CTL_ADD, STDOUT_FILENO, &stdout_ev) < 0) {
986 if (errno != EPERM) {
987 log_error("Failed to register stdout in epoll: %m");
991 /* stdout without epoll support. Likely redirected to regular file. */
992 stdout_writable = true;
995 if (epoll_ctl(ep, EPOLL_CTL_ADD, master, &master_ev) < 0 ||
996 epoll_ctl(ep, EPOLL_CTL_ADD, signal_fd, &signal_ev) < 0) {
997 log_error("Failed to register fds in epoll: %m");
1003 struct epoll_event ev[16];
/* Block until at least one descriptor has an event; EINTR and EAGAIN are
 * tested separately from other failures, which are logged. */
1007 nfds = epoll_wait(ep, ev, ELEMENTSOF(ev), -1);
1010 if (errno == EINTR || errno == EAGAIN)
1013 log_error("epoll_wait(): %m");
/* Translate the reported events into readiness flags. EPOLLHUP counts as
 * ready too, so the following read()/write() can observe the hang-up. */
1020 for (i = 0; i < nfds; i++) {
1021 if (ev[i].data.fd == STDIN_FILENO) {
1023 if (ev[i].events & (EPOLLIN|EPOLLHUP))
1024 stdin_readable = true;
1026 } else if (ev[i].data.fd == STDOUT_FILENO) {
1028 if (ev[i].events & (EPOLLOUT|EPOLLHUP))
1029 stdout_writable = true;
1031 } else if (ev[i].data.fd == master) {
1033 if (ev[i].events & (EPOLLIN|EPOLLHUP))
1034 master_readable = true;
1036 if (ev[i].events & (EPOLLOUT|EPOLLHUP))
1037 master_writable = true;
1039 } else if (ev[i].data.fd == signal_fd) {
1040 struct signalfd_siginfo sfsi;
/* Exactly one siginfo record is expected per read(). */
1043 n = read(signal_fd, &sfsi, sizeof(sfsi));
1044 if (n != sizeof(sfsi)) {
1047 log_error("Failed to read from signalfd: invalid block size");
1052 if (errno != EINTR && errno != EAGAIN) {
1053 log_error("Failed to read from signalfd: %m");
1059 if (sfsi.ssi_signo == SIGWINCH) {
1062 /* The window size changed, let's forward that. */
1063 if (ioctl(STDIN_FILENO, TIOCGWINSZ, &ws) >= 0)
1064 ioctl(master, TIOCSWINSZ, &ws);
/* First SIGTERM in boot mode: send SIGRTMIN+3 to `pid` instead of
 * terminating right away; the flag makes this a one-time attempt. */
1065 } else if (sfsi.ssi_signo == SIGTERM && arg_boot && !tried_orderly_shutdown) {
1067 log_info("Trying to halt container. Send SIGTERM again to trigger immediate termination.");
1069 /* This only works for systemd... */
1070 tried_orderly_shutdown = true;
1071 kill(pid, SIGRTMIN+3);
/* Keep transferring while any direction can make progress: a readable
 * source with an empty buffer, or a writable sink with pending bytes. */
1081 while ((stdin_readable && in_buffer_full <= 0) ||
1082 (master_writable && in_buffer_full > 0) ||
1083 (master_readable && out_buffer_full <= 0) ||
1084 (stdout_writable && out_buffer_full > 0)) {
/* stdin -> in_buffer (appends after the bytes already pending). */
1086 if (stdin_readable && in_buffer_full < LINE_MAX) {
1088 k = read(STDIN_FILENO, in_buffer + in_buffer_full, LINE_MAX - in_buffer_full);
1091 if (errno == EAGAIN || errno == EPIPE || errno == ECONNRESET || errno == EIO)
1092 stdin_readable = false;
1094 log_error("read(): %m");
1099 in_buffer_full += (size_t) k;
/* in_buffer -> master; after a partial write the unwritten tail is moved
 * to the front of the buffer. */
1102 if (master_writable && in_buffer_full > 0) {
1104 k = write(master, in_buffer, in_buffer_full);
1107 if (errno == EAGAIN || errno == EPIPE || errno == ECONNRESET || errno == EIO)
1108 master_writable = false;
1110 log_error("write(): %m");
1116 assert(in_buffer_full >= (size_t) k);
1117 memmove(in_buffer, in_buffer + k, in_buffer_full - k);
1118 in_buffer_full -= k;
/* master -> out_buffer. */
1122 if (master_readable && out_buffer_full < LINE_MAX) {
1124 k = read(master, out_buffer + out_buffer_full, LINE_MAX - out_buffer_full);
1127 if (errno == EAGAIN || errno == EPIPE || errno == ECONNRESET || errno == EIO)
1128 master_readable = false;
1130 log_error("read(): %m");
1135 out_buffer_full += (size_t) k;
/* out_buffer -> stdout, with the same partial-write handling as above. */
1138 if (stdout_writable && out_buffer_full > 0) {
1140 k = write(STDOUT_FILENO, out_buffer, out_buffer_full);
1143 if (errno == EAGAIN || errno == EPIPE || errno == ECONNRESET || errno == EIO)
1144 stdout_writable = false;
1146 log_error("write(): %m");
1152 assert(out_buffer_full >= (size_t) k);
1153 memmove(out_buffer, out_buffer + k, out_buffer_full - k);
1154 out_buffer_full -= k;
/* Release the epoll instance and the signalfd. */
1162 close_nointr_nofail(ep);
1165 close_nointr_nofail(signal_fd);
1170 int main(int argc, char *argv[]) {
1172 int r = EXIT_FAILURE, k;
1173 _cleanup_free_ char *machine_root = NULL, *newcg = NULL;
1174 _cleanup_close_ int master = -1;
1176 const char *console = NULL;
1177 struct termios saved_attr, raw_attr;
1179 bool saved_attr_valid = false;
1181 int kmsg_socket_pair[2] = { -1, -1 };
1184 log_parse_environment();
1187 r = parse_argv(argc, argv);
1191 if (arg_directory) {
1194 p = path_make_absolute_cwd(arg_directory);
1195 free(arg_directory);
1198 arg_directory = get_current_dir_name();
1200 if (!arg_directory) {
1201 log_error("Failed to determine path");
1205 path_kill_slashes(arg_directory);
1208 arg_machine = strdup(path_get_file_name(arg_directory));
1214 hostname_cleanup(arg_machine);
1215 if (isempty(arg_machine)) {
1216 log_error("Failed to determine machine name automatically, please use -M.");
1221 if (geteuid() != 0) {
1222 log_error("Need to be root.");
1226 if (sd_booted() <= 0) {
1227 log_error("Not running on a systemd system.");
1231 if (path_equal(arg_directory, "/")) {
1232 log_error("Spawning container on root directory not supported.");
1236 if (path_is_os_tree(arg_directory) <= 0) {
1237 log_error("Directory %s doesn't look like an OS root directory. Refusing.", arg_directory);
1242 n_fd_passed = sd_listen_fds(false);
1243 if (n_fd_passed > 0) {
1244 k = fdset_new_listen_fds(&fds, false);
1246 log_error("Failed to collect file descriptors: %s", strerror(-k));
1250 fdset_close_others(fds);
1253 k = cg_get_machine_path(&machine_root);
1255 log_error("Failed to determine machine cgroup path: %s", strerror(-k));
1259 newcg = strjoin(machine_root, "/", arg_machine, NULL);
1261 log_error("Failed to allocate cgroup path.");
1265 r = cg_is_empty_recursive(SYSTEMD_CGROUP_CONTROLLER, newcg, false);
1266 if (r <= 0 && r != -ENOENT) {
1267 log_error("Container already running.");
1275 master = posix_openpt(O_RDWR|O_NOCTTY|O_CLOEXEC|O_NDELAY);
1277 log_error("Failed to acquire pseudo tty: %m");
1281 console = ptsname(master);
1283 log_error("Failed to determine tty name: %m");
1287 log_info("Spawning namespace container on %s (console is %s).", arg_directory, console);
1289 if (ioctl(STDIN_FILENO, TIOCGWINSZ, &ws) >= 0)
1290 ioctl(master, TIOCSWINSZ, &ws);
1292 if (unlockpt(master) < 0) {
1293 log_error("Failed to unlock tty: %m");
1297 if (tcgetattr(STDIN_FILENO, &saved_attr) >= 0) {
1298 saved_attr_valid = true;
1300 raw_attr = saved_attr;
1301 cfmakeraw(&raw_attr);
1302 raw_attr.c_lflag &= ~ECHO;
1305 if (socketpair(AF_UNIX, SOCK_DGRAM|SOCK_NONBLOCK|SOCK_CLOEXEC, 0, kmsg_socket_pair) < 0) {
1306 log_error("Failed to create kmsg socket pair.");
1310 assert_se(sigemptyset(&mask) == 0);
1311 sigset_add_many(&mask, SIGCHLD, SIGWINCH, SIGTERM, SIGINT, -1);
1312 assert_se(sigprocmask(SIG_BLOCK, &mask, NULL) == 0);
1318 if (pipe2(pipefd, O_NONBLOCK|O_CLOEXEC) < 0) {
1319 log_error("pipe2(): %m");
1323 pid = syscall(__NR_clone, SIGCHLD|CLONE_NEWIPC|CLONE_NEWNS|CLONE_NEWPID|CLONE_NEWUTS|(arg_private_network ? CLONE_NEWNET : 0), NULL);
1325 if (errno == EINVAL)
1326 log_error("clone() failed, do you have namespace support enabled in your kernel? (You need UTS, IPC, PID and NET namespacing built in): %m");
1328 log_error("clone() failed: %m");
1335 const char *home = NULL;
1336 uid_t uid = (uid_t) -1;
1337 gid_t gid = (gid_t) -1;
1339 const char *envp[] = {
1340 "PATH=/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin",
1341 "container=systemd-nspawn", /* LXC sets container=lxc, so follow the scheme here */
1346 NULL, /* container_uuid */
1347 NULL, /* LISTEN_FDS */
1348 NULL, /* LISTEN_PID */
1352 envp[n_env] = strv_find_prefix(environ, "TERM=");
1356 close_nointr_nofail(pipefd[1]);
1357 fd_wait_for_event(pipefd[0], POLLHUP, -1);
1358 close_nointr_nofail(pipefd[0]);
1360 close_nointr_nofail(master);
1363 if (saved_attr_valid) {
1364 if (tcsetattr(STDIN_FILENO, TCSANOW, &raw_attr) < 0) {
1365 log_error("Failed to set terminal attributes: %m");
1370 close_nointr(STDIN_FILENO);
1371 close_nointr(STDOUT_FILENO);
1372 close_nointr(STDERR_FILENO);
1374 close_nointr_nofail(kmsg_socket_pair[0]);
1375 kmsg_socket_pair[0] = -1;
1377 reset_all_signal_handlers();
1379 assert_se(sigemptyset(&mask) == 0);
1380 assert_se(sigprocmask(SIG_SETMASK, &mask, NULL) == 0);
1382 k = open_terminal(console, O_RDWR);
1383 if (k != STDIN_FILENO) {
1385 close_nointr_nofail(k);
1389 log_error("Failed to open console: %s", strerror(-k));
1393 if (dup2(STDIN_FILENO, STDOUT_FILENO) != STDOUT_FILENO ||
1394 dup2(STDIN_FILENO, STDERR_FILENO) != STDERR_FILENO) {
1395 log_error("Failed to duplicate console: %m");
1400 log_error("setsid() failed: %m");
1404 if (prctl(PR_SET_PDEATHSIG, SIGKILL) < 0) {
1405 log_error("PR_SET_PDEATHSIG failed: %m");
1409 if (setup_cgroup(newcg) < 0)
1412 /* Mark everything as slave, so that we still
1413 * receive mounts from the real root, but don't
1414 * propagate mounts to the real root. */
1415 if (mount(NULL, "/", NULL, MS_SLAVE|MS_REC, NULL) < 0) {
1416 log_error("MS_SLAVE|MS_REC failed: %m");
1420 /* Turn directory into bind mount */
1421 if (mount(arg_directory, arg_directory, "bind", MS_BIND|MS_REC, NULL) < 0) {
1422 log_error("Failed to make bind mount.");
1427 if (mount(arg_directory, arg_directory, "bind", MS_BIND|MS_REMOUNT|MS_RDONLY|MS_REC, NULL) < 0) {
1428 log_error("Failed to make read-only.");
1432 if (mount_all(arg_directory) < 0)
1435 if (copy_devnodes(arg_directory) < 0)
1438 if (setup_ptmx(arg_directory) < 0)
1441 dev_setup(arg_directory);
1443 if (setup_dev_console(arg_directory, console) < 0)
1446 if (setup_kmsg(arg_directory, kmsg_socket_pair[1]) < 0)
1449 close_nointr_nofail(kmsg_socket_pair[1]);
1450 kmsg_socket_pair[1] = -1;
1452 if (setup_boot_id(arg_directory) < 0)
1455 if (setup_timezone(arg_directory) < 0)
1458 if (setup_resolv_conf(arg_directory) < 0)
1461 if (setup_journal(arg_directory) < 0)
1464 if (mount_binds(arg_directory, arg_bind, 0) < 0)
1467 if (mount_binds(arg_directory, arg_bind_ro, MS_RDONLY) < 0)
1470 if (chdir(arg_directory) < 0) {
1471 log_error("chdir(%s) failed: %m", arg_directory);
1475 if (mount(arg_directory, "/", NULL, MS_MOVE, NULL) < 0) {
1476 log_error("mount(MS_MOVE) failed: %m");
1480 if (chroot(".") < 0) {
1481 log_error("chroot() failed: %m");
1485 if (chdir("/") < 0) {
1486 log_error("chdir() failed: %m");
1494 if (drop_capabilities() < 0) {
1495 log_error("drop_capabilities() failed: %m");
1501 /* Note that this resolves user names
1502 * inside the container, and hence
1503 * accesses the NSS modules from the
1504 * container and not the host. This is
1507 if (get_user_creds((const char**)&arg_user, &uid, &gid, &home, NULL) < 0) {
1508 log_error("get_user_creds() failed: %m");
1512 if (mkdir_parents_label(home, 0775) < 0) {
1513 log_error("mkdir_parents_label() failed: %m");
1517 if (mkdir_safe_label(home, 0775, uid, gid) < 0) {
1518 log_error("mkdir_safe_label() failed: %m");
1522 if (initgroups((const char*)arg_user, gid) < 0) {
1523 log_error("initgroups() failed: %m");
1527 if (setresgid(gid, gid, gid) < 0) {
1528 log_error("setregid() failed: %m");
1532 if (setresuid(uid, uid, uid) < 0) {
1533 log_error("setreuid() failed: %m");
1537 /* Reset everything fully to 0, just in case */
1539 if (setgroups(0, NULL) < 0) {
1540 log_error("setgroups() failed: %m");
1544 if (setresgid(0, 0, 0) < 0) {
1545 log_error("setregid() failed: %m");
1549 if (setresuid(0, 0, 0) < 0) {
1550 log_error("setreuid() failed: %m");
1555 if ((asprintf((char**)(envp + n_env++), "HOME=%s", home ? home: "/root") < 0) ||
1556 (asprintf((char**)(envp + n_env++), "USER=%s", arg_user ? arg_user : "root") < 0) ||
1557 (asprintf((char**)(envp + n_env++), "LOGNAME=%s", arg_user ? arg_user : "root") < 0)) {
1563 if (asprintf((char**)(envp + n_env++), "container_uuid=%s", arg_uuid) < 0) {
1569 if (fdset_size(fds) > 0) {
1570 k = fdset_cloexec(fds, false);
1572 log_error("Failed to unset O_CLOEXEC for file descriptors.");
1576 if ((asprintf((char **)(envp + n_env++), "LISTEN_FDS=%u", n_fd_passed) < 0) ||
1577 (asprintf((char **)(envp + n_env++), "LISTEN_PID=%lu", (unsigned long) 1) < 0)) {
1589 /* Automatically search for the init system */
1591 l = 1 + argc - optind;
1592 a = newa(char*, l + 1);
1593 memcpy(a + 1, argv + optind, l * sizeof(char*));
1595 a[0] = (char*) "/usr/lib/systemd/systemd";
1596 execve(a[0], a, (char**) envp);
1598 a[0] = (char*) "/lib/systemd/systemd";
1599 execve(a[0], a, (char**) envp);
1601 a[0] = (char*) "/sbin/init";
1602 execve(a[0], a, (char**) envp);
1603 } else if (argc > optind)
1604 execvpe(argv[optind], argv + optind, (char**) envp);
1606 chdir(home ? home : "/root");
1607 execle("/bin/bash", "-bash", NULL, (char**) envp);
1610 log_error("execv() failed: %m");
1613 _exit(EXIT_FAILURE);
1616 log_info("Init process in the container running as PID %lu.", (unsigned long) pid);
1617 close_nointr_nofail(pipefd[0]);
1618 close_nointr_nofail(pipefd[1]);
1623 if (process_pty(master, pid, &mask) < 0)
1626 if (saved_attr_valid)
1627 tcsetattr(STDIN_FILENO, TCSANOW, &saved_attr);
1629 r = wait_for_terminate(pid, &status);
1635 if (status.si_code == CLD_EXITED) {
1636 if (status.si_status != 0) {
1637 log_error("Container failed with error code %i.", status.si_status);
1638 r = status.si_status;
1642 log_debug("Container exited successfully.");
1644 } else if (status.si_code == CLD_KILLED &&
1645 status.si_status == SIGINT) {
1646 log_info("Container has been shut down.");
1649 } else if (status.si_code == CLD_KILLED &&
1650 status.si_status == SIGHUP) {
1651 log_info("Container is being rebooted.");
1653 } else if (status.si_code == CLD_KILLED ||
1654 status.si_code == CLD_DUMPED) {
1656 log_error("Container terminated by signal %s.", signal_to_string(status.si_status));
1660 log_error("Container failed due to unknown reason.");
1667 if (saved_attr_valid)
1668 tcsetattr(STDIN_FILENO, TCSANOW, &saved_attr);
1670 close_pipe(kmsg_socket_pair);
1673 cg_kill_recursive_and_wait(SYSTEMD_CGROUP_CONTROLLER, newcg, true);
1675 free(arg_directory);
1677 strv_free(arg_controllers);