1 /*-*- Mode: C; c-basic-offset: 8; indent-tabs-mode: nil -*-*/
4 This file is part of systemd.
6 Copyright 2010 Lennart Poettering
8 systemd is free software; you can redistribute it and/or modify it
9 under the terms of the GNU Lesser General Public License as published by
10 the Free Software Foundation; either version 2.1 of the License, or
11 (at your option) any later version.
13 systemd is distributed in the hope that it will be useful, but
14 WITHOUT ANY WARRANTY; without even the implied warranty of
15 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
16 Lesser General Public License for more details.
18 You should have received a copy of the GNU Lesser General Public License
19 along with systemd; If not, see <http://www.gnu.org/licenses/>.
25 #include <sys/types.h>
26 #include <sys/syscall.h>
27 #include <sys/mount.h>
33 #include <sys/prctl.h>
34 #include <sys/capability.h>
37 #include <sys/epoll.h>
39 #include <sys/signalfd.h>
43 #include <sys/socket.h>
44 #include <linux/netlink.h>
46 #include <systemd/sd-daemon.h>
47 #include <systemd/sd-bus.h>
55 #include "cgroup-util.h"
57 #include "path-util.h"
58 #include "loopback-setup.h"
60 #include "dev-setup.h"
64 #include "bus-internal.h"
65 #include "bus-message.h"
/* Journal linking mode for the container; parse_argv() selects it from
 * --link-journal= and setup_journal() acts on it. */
71 typedef enum LinkJournal {
/* Option values. Most are assigned from the command line in parse_argv(). */
78 static char *arg_directory = NULL;
79 static char *arg_user = NULL;
80 static sd_id128_t arg_uuid = {};
81 static char *arg_machine = NULL;
82 static const char *arg_slice = NULL;
83 static bool arg_private_network = false;
84 static bool arg_read_only = false;
85 static bool arg_boot = false;
86 static LinkJournal arg_link_journal = LINK_AUTO;
/* Bit mask (1ULL << CAP_*) of the capabilities retained by default.
 * --capability= ORs further bits in, and drop_capabilities() hands the
 * complement of this mask to capability_bounding_set_drop(). */
87 static uint64_t arg_retain =
89 (1ULL << CAP_DAC_OVERRIDE) |
90 (1ULL << CAP_DAC_READ_SEARCH) |
91 (1ULL << CAP_FOWNER) |
92 (1ULL << CAP_FSETID) |
93 (1ULL << CAP_IPC_OWNER) |
96 (1ULL << CAP_LINUX_IMMUTABLE) |
97 (1ULL << CAP_NET_BIND_SERVICE) |
98 (1ULL << CAP_NET_BROADCAST) |
99 (1ULL << CAP_NET_RAW) |
100 (1ULL << CAP_SETGID) |
101 (1ULL << CAP_SETFCAP) |
102 (1ULL << CAP_SETPCAP) |
103 (1ULL << CAP_SETUID) |
104 (1ULL << CAP_SYS_ADMIN) |
105 (1ULL << CAP_SYS_CHROOT) |
106 (1ULL << CAP_SYS_NICE) |
107 (1ULL << CAP_SYS_PTRACE) |
108 (1ULL << CAP_SYS_TTY_CONFIG) |
109 (1ULL << CAP_SYS_RESOURCE) |
110 (1ULL << CAP_SYS_BOOT) |
111 (1ULL << CAP_AUDIT_WRITE) |
112 (1ULL << CAP_AUDIT_CONTROL);
/* Bind mount lists filled by --bind= and --bind-ro=. parse_argv() appends
 * two entries per option and mount_binds() consumes them pairwise. */
113 static char **arg_bind = NULL;
114 static char **arg_bind_ro = NULL;
/* Print the usage summary for the program to stdout.
 *
 * The text lists the options accepted by parse_argv(). The -j line says
 * "guest" because LINK_GUEST is the mode parse_argv() assigns for the short
 * journal option; the previous wording ("host") contradicted the code. */
116 static int help(void) {
118 printf("%s [OPTIONS...] [PATH] [ARGUMENTS...]\n\n"
119 "Spawn a minimal namespace container for debugging, testing and building.\n\n"
120 " -h --help Show this help\n"
121 " --version Print version string\n"
122 " -D --directory=NAME Root directory for the container\n"
123 " -b --boot Boot up full system (i.e. invoke init)\n"
124 " -u --user=USER Run the command under specified user or uid\n"
125 " --uuid=UUID Set a specific machine UUID for the container\n"
126 " -M --machine=NAME Set the machine name for the container\n"
127 " -S --slice=SLICE Place the container in the specified slice\n"
128 " --private-network Disable network in container\n"
129 " --read-only Mount the root directory read-only\n"
130 " --capability=CAP In addition to the default, retain specified\n"
132 " --link-journal=MODE Link up guest journal, one of no, auto, guest, host\n"
133 " -j Equivalent to --link-journal=guest\n"
134 " --bind=PATH[:PATH] Bind mount a file or directory from the host into\n"
136 " --bind-ro=PATH[:PATH] Similar, but creates a read-only bind mount\n",
137 program_invocation_short_name);
/* Parse the command line with getopt_long() and store the results in the
 * arg_* globals. Invalid option values are reported through log_error(). */
142 static int parse_argv(int argc, char *argv[]) {
/* Long options; those without a short form are mapped to ARG_* codes. */
155 static const struct option options[] = {
156 { "help", no_argument, NULL, 'h' },
157 { "version", no_argument, NULL, ARG_VERSION },
158 { "directory", required_argument, NULL, 'D' },
159 { "user", required_argument, NULL, 'u' },
160 { "private-network", no_argument, NULL, ARG_PRIVATE_NETWORK },
161 { "boot", no_argument, NULL, 'b' },
162 { "uuid", required_argument, NULL, ARG_UUID },
163 { "read-only", no_argument, NULL, ARG_READ_ONLY },
164 { "capability", required_argument, NULL, ARG_CAPABILITY },
165 { "link-journal", required_argument, NULL, ARG_LINK_JOURNAL },
166 { "bind", required_argument, NULL, ARG_BIND },
167 { "bind-ro", required_argument, NULL, ARG_BIND_RO },
168 { "machine", required_argument, NULL, 'M' },
169 { "slice", required_argument, NULL, 'S' },
/* The leading '+' in the optstring makes getopt stop at the first
 * non-option argument (see getopt(3)), so the options of the command to run
 * in the container are not consumed here. */
178 while ((c = getopt_long(argc, argv, "+hD:u:bM:jS:", options, NULL)) >= 0) {
187 puts(PACKAGE_STRING);
188 puts(SYSTEMD_FEATURES);
/* Root directory: a NULL result means the path could not be canonicalized. */
193 arg_directory = canonicalize_file_name(optarg);
194 if (!arg_directory) {
195 log_error("Failed to canonicalize root directory.");
203 arg_user = strdup(optarg);
209 case ARG_PRIVATE_NETWORK:
210 arg_private_network = true;
218 r = sd_id128_from_string(optarg, &arg_uuid);
220 log_error("Invalid UUID: %s", optarg);
/* NOTE(review): no NULL check for this strdup() result is visible — confirm. */
226 arg_slice = strdup(optarg);
/* The machine name is only stored if hostname_is_valid() accepts it. */
230 if (!hostname_is_valid(optarg)) {
231 log_error("Invalid machine name: %s", optarg);
236 arg_machine = strdup(optarg);
243 arg_read_only = true;
/* Comma-separated capability names; each one that cap_from_name() resolves
 * has its bit OR-ed into arg_retain. */
246 case ARG_CAPABILITY: {
250 FOREACH_WORD_SEPARATOR(word, length, optarg, ",", state) {
254 t = strndup(word, length);
258 if (cap_from_name(t, &cap) < 0) {
259 log_error("Failed to parse capability %s.", t);
265 arg_retain |= 1ULL << (uint64_t) cap;
272 arg_link_journal = LINK_GUEST;
/* Accepted modes: auto, no, guest, host; anything else is an error. */
275 case ARG_LINK_JOURNAL:
276 if (streq(optarg, "auto"))
277 arg_link_journal = LINK_AUTO;
278 else if (streq(optarg, "no"))
279 arg_link_journal = LINK_NO;
280 else if (streq(optarg, "guest"))
281 arg_link_journal = LINK_GUEST;
282 else if (streq(optarg, "host"))
283 arg_link_journal = LINK_HOST;
285 log_error("Failed to parse link journal mode %s", optarg);
/* Bind mount options: x selects the list to extend (read-write or
 * read-only). Both paths must be absolute and are appended as a pair. */
293 _cleanup_free_ char *a = NULL, *b = NULL;
297 x = c == ARG_BIND ? &arg_bind : &arg_bind_ro;
299 e = strchr(optarg, ':');
301 a = strndup(optarg, e - optarg);
311 if (!path_is_absolute(a) || !path_is_absolute(b)) {
312 log_error("Invalid bind mount specification: %s", optarg);
316 r = strv_extend(x, a);
320 r = strv_extend(x, b);
331 log_error("Unknown option code %c", c);
/* Mount the file systems listed in mount_table below the container root
 * `dest`.
 *
 * Each table row is used as: what (source), where (path below dest), file
 * system type, options, mount flags, fatal. Rows whose `what` is NULL are
 * the remount steps that make the preceding bind mount read-only. */
339 static int mount_all(const char *dest) {
341 typedef struct MountPoint {
350 static const MountPoint mount_table[] = {
351 { "proc", "/proc", "proc", NULL, MS_NOSUID|MS_NOEXEC|MS_NODEV, true },
352 { "/proc/sys", "/proc/sys", NULL, NULL, MS_BIND, true }, /* Bind mount first */
353 { NULL, "/proc/sys", NULL, NULL, MS_BIND|MS_RDONLY|MS_REMOUNT, true }, /* Then, make it r/o */
354 { "sysfs", "/sys", "sysfs", NULL, MS_RDONLY|MS_NOSUID|MS_NOEXEC|MS_NODEV, true },
355 { "tmpfs", "/dev", "tmpfs", "mode=755", MS_NOSUID|MS_STRICTATIME, true },
356 { "devpts", "/dev/pts", "devpts","newinstance,ptmxmode=0666,mode=620,gid=" STRINGIFY(TTY_GID), MS_NOSUID|MS_NOEXEC, true },
357 { "tmpfs", "/dev/shm", "tmpfs", "mode=1777", MS_NOSUID|MS_NODEV|MS_STRICTATIME, true },
358 { "tmpfs", "/run", "tmpfs", "mode=755", MS_NOSUID|MS_NODEV|MS_STRICTATIME, true },
360 { "/sys/fs/selinux", "/sys/fs/selinux", NULL, NULL, MS_BIND, false }, /* Bind mount first */
361 { NULL, "/sys/fs/selinux", NULL, NULL, MS_BIND|MS_RDONLY|MS_REMOUNT, false }, /* Then, make it r/o */
368 for (k = 0; k < ELEMENTSOF(mount_table); k++) {
369 _cleanup_free_ char *where = NULL;
/* Absolute target path: the table entry relative to the container root. */
372 where = strjoin(dest, "/", mount_table[k].where, NULL);
376 t = path_is_mount_point(where, true);
378 log_error("Failed to detect whether %s is a mount point: %s", where, strerror(-t));
386 /* Skip this entry if it is not a remount. */
387 if (mount_table[k].what && t > 0)
390 mkdir_p(where, 0755);
/* A failed mount() is only treated as an error for entries marked fatal. */
392 if (mount(mount_table[k].what,
395 mount_table[k].flags,
396 mount_table[k].options) < 0 &&
397 mount_table[k].fatal) {
399 log_error("mount(%s) failed: %m", where);
/* Set up the user-requested bind mounts.
 *
 * `l` is walked in pairs (x, y): *x is the source path and *y the
 * destination path below the container root `dest`. When `flags` is
 * non-zero the new mount is remounted with MS_REMOUNT|MS_BIND|flags; main()
 * passes MS_RDONLY for the --bind-ro list and 0 for the --bind list. */
409 static int mount_binds(const char *dest, char **l, unsigned long flags) {
412 STRV_FOREACH_PAIR(x, y, l) {
413 _cleanup_free_ char *where = NULL;
415 where = strjoin(dest, "/", *y, NULL);
/* Make sure the mount target exists; the result is not checked here. */
419 mkdir_p_label(where, 0755);
421 if (mount(*x, where, "bind", MS_BIND, NULL) < 0) {
422 log_error("mount(%s) failed: %m", where);
426 if (flags && mount(NULL, where, NULL, MS_REMOUNT|MS_BIND|flags, NULL) < 0) {
427 log_error("mount(%s) failed: %m", where);
/* Point <dest>/etc/localtime at the same zone as the host's /etc/localtime.
 *
 * The host link must point into /usr/share/zoneinfo/ (relative or absolute
 * form) and the zone file must exist inside the container; otherwise a
 * warning is logged. z is the host's zone name relative to the zoneinfo
 * directory, y the one the container's link currently points to. */
435 static int setup_timezone(const char *dest) {
436 _cleanup_free_ char *where = NULL, *p = NULL, *q = NULL, *check = NULL, *what = NULL;
442 /* Fix the timezone, if possible */
443 r = readlink_malloc("/etc/localtime", &p);
445 log_warning("/etc/localtime is not a symlink, not updating container timezone.");
449 z = path_startswith(p, "../usr/share/zoneinfo/");
451 z = path_startswith(p, "/usr/share/zoneinfo/");
453 log_warning("/etc/localtime does not point into /usr/share/zoneinfo/, not updating container timezone.");
457 where = strappend(dest, "/etc/localtime");
461 r = readlink_malloc(where, &q);
463 y = path_startswith(q, "../usr/share/zoneinfo/");
465 y = path_startswith(q, "/usr/share/zoneinfo/");
468 /* Already pointing to the right place? Then do nothing .. */
469 if (y && streq(y, z))
/* Verify that the zone file is available inside the container tree. */
473 check = strjoin(dest, "/usr/share/zoneinfo/", z, NULL);
477 if (access(check, F_OK) < 0) {
478 log_warning("Timezone %s does not exist in container, not updating container timezone.", z);
/* The new link uses the relative "../usr/share/zoneinfo/<zone>" form. */
482 what = strappend("../usr/share/zoneinfo/", z);
487 if (symlink(what, where) < 0) {
488 log_error("Failed to correct timezone of container: %m");
/* Make the host's /etc/resolv.conf visible in the container by bind
 * mounting it over <dest>/etc/resolv.conf and remounting it read-only.
 *
 * arg_private_network is tested first; presumably the function returns
 * early in that case — confirm. A failing bind mount is only a warning,
 * a failing read-only remount is logged as an error. */
495 static int setup_resolv_conf(const char *dest) {
496 char _cleanup_free_ *where = NULL;
497 _cleanup_close_ int fd = -1;
501 if (arg_private_network)
504 /* Fix resolv.conf, if possible */
505 where = strappend(dest, "/etc/resolv.conf");
/* O_CREAT|O_EXCL: create an empty file to mount onto if none exists yet. */
509 fd = open(where, O_WRONLY|O_CREAT|O_EXCL|O_CLOEXEC|O_NOCTTY|O_NOFOLLOW, 0644);
511 /* We don't really care for the results of this really. If it
512 * fails, it fails, but meh... */
513 if (mount("/etc/resolv.conf", where, "bind", MS_BIND, NULL) < 0)
514 log_warning("Failed to bind mount /etc/resolv.conf: %m");
516 if (mount("/etc/resolv.conf", where, "bind",
517 MS_BIND|MS_REMOUNT|MS_RDONLY, NULL) < 0) {
518 log_error("Failed to remount /etc/resolv.conf readonly: %m");
/* Give the container its own boot ID.
 *
 * A random 128-bit ID is formatted as a UUID string, written to the helper
 * file <dest>/dev/proc-sys-kernel-random-boot-id and bind mounted over
 * <dest>/proc/sys/kernel/random/boot_id. Failing to make that mount
 * read-only afterwards is only logged as a warning. */
525 static int setup_boot_id(const char *dest) {
526 _cleanup_free_ char *from = NULL, *to = NULL;
533 /* Generate a new randomized boot ID, so that each boot-up of
534 * the container gets a new one */
536 from = strappend(dest, "/dev/proc-sys-kernel-random-boot-id");
537 to = strappend(dest, "/proc/sys/kernel/random/boot_id");
541 r = sd_id128_randomize(&rnd);
543 log_error("Failed to generate random boot id: %s", strerror(-r));
/* Render the ID in the dashed 8-4-4-4-12 UUID layout. */
547 snprintf(as_uuid, sizeof(as_uuid),
548 "%02x%02x%02x%02x-%02x%02x-%02x%02x-%02x%02x-%02x%02x%02x%02x%02x%02x",
549 SD_ID128_FORMAT_VAL(rnd));
550 char_array_0(as_uuid);
552 r = write_string_file(from, as_uuid);
554 log_error("Failed to write boot id: %s", strerror(-r));
558 if (mount(from, to, "bind", MS_BIND, NULL) < 0) {
559 log_error("Failed to bind mount boot id: %m");
561 } else if (mount(from, to, "bind", MS_BIND|MS_REMOUNT|MS_RDONLY, NULL))
562 log_warning("Failed to make boot id read-only: %m");
/* Recreate a fixed set of host device nodes inside the container.
 *
 * For every name in the NUL-separated `devnodes` list, /dev/<name> on the
 * host is stat()ed and a node with the same mode and device number is
 * created at <dest>/dev/<name>. A host node that does not exist (ENOENT)
 * is not reported as an error; anything that is neither a character nor a
 * block device is rejected. */
568 static int copy_devnodes(const char *dest) {
570 static const char devnodes[] =
580 _cleanup_umask_ mode_t u;
586 NULSTR_FOREACH(d, devnodes) {
588 _cleanup_free_ char *from = NULL, *to = NULL;
/* from: node on the host; to: node to create below the container root. */
590 asprintf(&from, "/dev/%s", d);
591 asprintf(&to, "%s/dev/%s", dest, d);
602 if (stat(from, &st) < 0) {
604 if (errno != ENOENT) {
605 log_error("Failed to stat %s: %m", from);
610 } else if (!S_ISCHR(st.st_mode) && !S_ISBLK(st.st_mode)) {
612 log_error("%s is not a char or block device, cannot copy", from);
616 } else if (mknod(to, st.st_mode, st.st_rdev) < 0) {
/* Report the node that could not be created, not the container root. */
618 log_error("mknod(%s) failed: %m", to);
/* Create <dest>/dev/ptmx as a symlink to "pts/ptmx", i.e. to the ptmx node
 * of the devpts file system mounted at /dev/pts by mount_all(). */
627 static int setup_ptmx(const char *dest) {
628 _cleanup_free_ char *p = NULL;
630 p = strappend(dest, "/dev/ptmx");
634 if (symlink("pts/ptmx", p) < 0) {
635 log_error("Failed to create /dev/ptmx symlink: %m");
/* Make the terminal `console` available as <dest>/dev/console.
 *
 * `console` must be a character device. Its access mode and ownership are
 * set to 0600, uid 0, gid 0; then a placeholder device node is created at
 * <dest>/dev/console and `console` is bind mounted on top of it. */
642 static int setup_dev_console(const char *dest, const char *console) {
644 _cleanup_free_ char *to = NULL;
646 _cleanup_umask_ mode_t u;
653 if (stat(console, &st) < 0) {
654 log_error("Failed to stat %s: %m", console);
657 } else if (!S_ISCHR(st.st_mode)) {
658 log_error("/dev/console is not a char device");
662 r = chmod_and_chown(console, 0600, 0, 0);
664 log_error("Failed to correct access mode for TTY: %s", strerror(-r));
668 if (asprintf(&to, "%s/dev/console", dest) < 0)
671 /* We need to bind mount the right tty to /dev/console since
672 * ptys can only exist on pts file systems. To have something
673 * to bind mount things on we create a device node first, that
674 * has the right major/minor (note that the major minor
675 * doesn't actually matter here, since we mount it over
678 if (mknod(to, (st.st_mode & ~07777) | 0600, st.st_rdev) < 0) {
679 log_error("mknod() for /dev/console failed: %m");
683 if (mount(console, to, "bind", MS_BIND, NULL) < 0) {
684 log_error("Bind mount for /dev/console failed: %m");
/* Provide a fake /proc/kmsg for the container.
 *
 * A FIFO is created at <dest>/dev/kmsg, bind mounted over <dest>/proc/kmsg
 * and opened; the resulting fd is sent over `kmsg_socket` as an SCM_RIGHTS
 * control message so that it stays open while the child runs. */
691 static int setup_kmsg(const char *dest, int kmsg_socket) {
692 _cleanup_free_ char *from = NULL, *to = NULL;
694 _cleanup_umask_ mode_t u;
696 struct cmsghdr cmsghdr;
697 uint8_t buf[CMSG_SPACE(sizeof(int))];
700 .msg_control = &control,
701 .msg_controllen = sizeof(control),
703 struct cmsghdr *cmsg;
706 assert(kmsg_socket >= 0);
710 /* We create the kmsg FIFO as /dev/kmsg, but immediately
711 * delete it after bind mounting it to /proc/kmsg. While FIFOs
712 * on the reading side behave very similar to /proc/kmsg,
713 * their writing side behaves differently from /dev/kmsg in
714 * that writing blocks when nothing is reading. In order to
715 * avoid any problems with containers deadlocking due to this
716 * we simply make /dev/kmsg unavailable to the container. */
717 if (asprintf(&from, "%s/dev/kmsg", dest) < 0 ||
718 asprintf(&to, "%s/proc/kmsg", dest) < 0)
721 if (mkfifo(from, 0600) < 0) {
722 log_error("mkfifo() for /dev/kmsg failed: %m");
726 r = chmod_and_chown(from, 0600, 0, 0);
728 log_error("Failed to correct access mode for /dev/kmsg: %s", strerror(-r));
732 if (mount(from, to, "bind", MS_BIND, NULL) < 0) {
733 log_error("Bind mount for /proc/kmsg failed: %m");
737 fd = open(from, O_RDWR|O_NDELAY|O_CLOEXEC);
739 log_error("Failed to open fifo: %m");
/* Fill in one SCM_RIGHTS control message whose payload is the FIFO fd. */
743 cmsg = CMSG_FIRSTHDR(&mh);
744 cmsg->cmsg_level = SOL_SOCKET;
745 cmsg->cmsg_type = SCM_RIGHTS;
746 cmsg->cmsg_len = CMSG_LEN(sizeof(int));
747 memcpy(CMSG_DATA(cmsg), &fd, sizeof(int));
/* Shrink the control length to the part that was actually filled in. */
749 mh.msg_controllen = cmsg->cmsg_len;
751 /* Store away the fd in the socket, so that it stays open as
752 * long as we run the child */
753 k = sendmsg(kmsg_socket, &mh, MSG_DONTWAIT|MSG_NOSIGNAL);
754 close_nointr_nofail(fd);
757 log_error("Failed to send FIFO fd: %m");
761 /* And now make the FIFO unavailable as /dev/kmsg... */
/* Set the hostname to the machine name stored in arg_machine, using
 * sethostname(). */
766 static int setup_hostname(void) {
768 if (sethostname(arg_machine, strlen(arg_machine)) < 0)
/* Connect the container's journal directory with the host's, as selected by
 * arg_link_journal (LINK_NO is tested first).
 *
 * The machine ID is read from <directory>/etc/machine-id and validated with
 * sd_id128_from_string(). From then on p is /var/log/journal/<id> on the
 * host and q is the same path below `directory`. For LINK_GUEST the host
 * path p is made a symlink to q; the bind mount at the end mounts p onto q.
 *
 * NOTE(review): `id` is presumably the stripped content of `b` — confirm. */
774 static int setup_journal(const char *directory) {
775 sd_id128_t machine_id;
776 _cleanup_free_ char *p = NULL, *b = NULL, *q = NULL, *d = NULL;
780 if (arg_link_journal == LINK_NO)
783 p = strappend(directory, "/etc/machine-id");
787 r = read_one_line_file(p, &b);
788 if (r == -ENOENT && arg_link_journal == LINK_AUTO)
791 log_error("Failed to read machine ID from %s: %s", p, strerror(-r));
796 if (isempty(id) && arg_link_journal == LINK_AUTO)
799 /* Verify validity */
800 r = sd_id128_from_string(id, &machine_id);
802 log_error("Failed to parse machine ID from %s: %s", p, strerror(-r));
/* p is reused from here on: host-side journal directory for this machine. */
807 p = strappend("/var/log/journal/", id);
808 q = strjoin(directory, "/var/log/journal/", id, NULL);
/* Neither path may already be a mount point; outside of auto mode this is
 * logged as an error. */
812 if (path_is_mount_point(p, false) > 0) {
813 if (arg_link_journal != LINK_AUTO) {
814 log_error("%s: already a mount point, refusing to use for journal", p);
821 if (path_is_mount_point(q, false) > 0) {
822 if (arg_link_journal != LINK_AUTO) {
823 log_error("%s: already a mount point, refusing to use for journal", q);
/* Inspect what currently exists at the host path p. */
830 r = readlink_and_make_absolute(p, &d);
832 if ((arg_link_journal == LINK_GUEST ||
833 arg_link_journal == LINK_AUTO) &&
836 r = mkdir_p(q, 0755);
838 log_warning("failed to create directory %s: %m", q);
843 log_error("Failed to remove symlink %s: %m", p);
846 } else if (r == -EINVAL) {
848 if (arg_link_journal == LINK_GUEST &&
851 if (errno == ENOTDIR) {
852 log_error("%s already exists and is neither a symlink nor a directory", p);
855 log_error("Failed to remove %s: %m", p);
859 } else if (r != -ENOENT) {
860 log_error("readlink(%s) failed: %m", p);
/* Guest mode: the host path becomes a symlink into the container tree. */
864 if (arg_link_journal == LINK_GUEST) {
866 if (symlink(q, p) < 0) {
867 log_error("Failed to symlink %s to %s: %m", q, p);
871 r = mkdir_p(q, 0755);
873 log_warning("failed to create directory %s: %m", q);
/* Host mode: make sure the host directory exists. */
877 if (arg_link_journal == LINK_HOST) {
878 r = mkdir_p(p, 0755);
880 log_error("Failed to create %s: %m", p);
884 } else if (access(p, F_OK) < 0)
887 if (dir_is_empty(q) == 0) {
888 log_error("%s not empty.", q);
892 r = mkdir_p(q, 0755);
894 log_error("Failed to create %s: %m", q);
/* Make the host's journal directory visible at the guest path. */
898 if (mount(p, q, "bind", MS_BIND, NULL) < 0) {
899 log_error("Failed to bind mount journal from host into guest: %m");
/* Drop every capability that is not in arg_retain by passing the complement
 * of that mask to capability_bounding_set_drop(); returns its result. */
906 static int drop_capabilities(void) {
907 return capability_bounding_set_drop(~arg_retain, false);
/* Forward data between our stdin/stdout and the pty `master`, and handle
 * the signals in `mask` through a signalfd.
 *
 * stdin -> in_buffer -> master and master -> out_buffer -> stdout are
 * driven by epoll. The *_readable / *_writable flags remember readiness
 * between wake-ups, because the data fds are registered with EPOLLET.
 * SIGWINCH copies the window size of stdin to the master. The first SIGTERM
 * while arg_boot is set sends SIGRTMIN+3 to `pid` to request a shutdown. */
910 static int process_pty(int master, pid_t pid, sigset_t *mask) {
912 char in_buffer[LINE_MAX], out_buffer[LINE_MAX];
913 size_t in_buffer_full = 0, out_buffer_full = 0;
914 struct epoll_event stdin_ev, stdout_ev, master_ev, signal_ev;
915 bool stdin_readable = false, stdout_writable = false, master_readable = false, master_writable = false;
916 int ep = -1, signal_fd = -1, r;
917 bool tried_orderly_shutdown = false;
/* fd_nonblock(fd, 1) presumably switches the fd to non-blocking mode. */
923 fd_nonblock(STDIN_FILENO, 1);
924 fd_nonblock(STDOUT_FILENO, 1);
925 fd_nonblock(master, 1);
927 signal_fd = signalfd(-1, mask, SFD_NONBLOCK|SFD_CLOEXEC);
929 log_error("signalfd(): %m");
934 ep = epoll_create1(EPOLL_CLOEXEC);
936 log_error("Failed to create epoll: %m");
941 /* We read from STDIN only if this is actually a TTY,
942 * otherwise we assume non-interactivity. */
943 if (isatty(STDIN_FILENO)) {
945 stdin_ev.events = EPOLLIN|EPOLLET;
946 stdin_ev.data.fd = STDIN_FILENO;
948 if (epoll_ctl(ep, EPOLL_CTL_ADD, STDIN_FILENO, &stdin_ev) < 0) {
949 log_error("Failed to register STDIN in epoll: %m");
956 stdout_ev.events = EPOLLOUT|EPOLLET;
957 stdout_ev.data.fd = STDOUT_FILENO;
960 master_ev.events = EPOLLIN|EPOLLOUT|EPOLLET;
961 master_ev.data.fd = master;
/* The signalfd is the only fd registered without EPOLLET. */
964 signal_ev.events = EPOLLIN;
965 signal_ev.data.fd = signal_fd;
967 if (epoll_ctl(ep, EPOLL_CTL_ADD, STDOUT_FILENO, &stdout_ev) < 0) {
968 if (errno != EPERM) {
969 log_error("Failed to register stdout in epoll: %m");
973 /* stdout without epoll support. Likely redirected to regular file. */
974 stdout_writable = true;
977 if (epoll_ctl(ep, EPOLL_CTL_ADD, master, &master_ev) < 0 ||
978 epoll_ctl(ep, EPOLL_CTL_ADD, signal_fd, &signal_ev) < 0) {
979 log_error("Failed to register fds in epoll: %m");
985 struct epoll_event ev[16];
989 nfds = epoll_wait(ep, ev, ELEMENTSOF(ev), -1);
992 if (errno == EINTR || errno == EAGAIN)
995 log_error("epoll_wait(): %m");
/* First pass: only record which fds became ready; the I/O happens below. */
1002 for (i = 0; i < nfds; i++) {
1003 if (ev[i].data.fd == STDIN_FILENO) {
1005 if (ev[i].events & (EPOLLIN|EPOLLHUP))
1006 stdin_readable = true;
1008 } else if (ev[i].data.fd == STDOUT_FILENO) {
1010 if (ev[i].events & (EPOLLOUT|EPOLLHUP))
1011 stdout_writable = true;
1013 } else if (ev[i].data.fd == master) {
1015 if (ev[i].events & (EPOLLIN|EPOLLHUP))
1016 master_readable = true;
1018 if (ev[i].events & (EPOLLOUT|EPOLLHUP))
1019 master_writable = true;
1021 } else if (ev[i].data.fd == signal_fd) {
1022 struct signalfd_siginfo sfsi;
1025 n = read(signal_fd, &sfsi, sizeof(sfsi));
1026 if (n != sizeof(sfsi)) {
1029 log_error("Failed to read from signalfd: invalid block size");
1034 if (errno != EINTR && errno != EAGAIN) {
1035 log_error("Failed to read from signalfd: %m");
1041 if (sfsi.ssi_signo == SIGWINCH) {
1044 /* The window size changed, let's forward that. */
1045 if (ioctl(STDIN_FILENO, TIOCGWINSZ, &ws) >= 0)
1046 ioctl(master, TIOCSWINSZ, &ws);
1047 } else if (sfsi.ssi_signo == SIGTERM && arg_boot && !tried_orderly_shutdown) {
1049 log_info("Trying to halt container. Send SIGTERM again to trigger immediate termination.");
1051 /* This only works for systemd... */
1052 tried_orderly_shutdown = true;
1053 kill(pid, SIGRTMIN+3);
/* Keep pumping while any direction can make progress: a readable source
 * with an empty buffer, or a writable sink with buffered data. */
1063 while ((stdin_readable && in_buffer_full <= 0) ||
1064 (master_writable && in_buffer_full > 0) ||
1065 (master_readable && out_buffer_full <= 0) ||
1066 (stdout_writable && out_buffer_full > 0)) {
1068 if (stdin_readable && in_buffer_full < LINE_MAX) {
1070 k = read(STDIN_FILENO, in_buffer + in_buffer_full, LINE_MAX - in_buffer_full);
/* These errno values only clear the readiness flag; others are logged. */
1073 if (errno == EAGAIN || errno == EPIPE || errno == ECONNRESET || errno == EIO)
1074 stdin_readable = false;
1076 log_error("read(): %m");
1081 in_buffer_full += (size_t) k;
1084 if (master_writable && in_buffer_full > 0) {
1086 k = write(master, in_buffer, in_buffer_full);
1089 if (errno == EAGAIN || errno == EPIPE || errno == ECONNRESET || errno == EIO)
1090 master_writable = false;
1092 log_error("write(): %m");
/* Discard the k bytes just written and move the rest to the front. */
1098 assert(in_buffer_full >= (size_t) k);
1099 memmove(in_buffer, in_buffer + k, in_buffer_full - k);
1100 in_buffer_full -= k;
1104 if (master_readable && out_buffer_full < LINE_MAX) {
1106 k = read(master, out_buffer + out_buffer_full, LINE_MAX - out_buffer_full);
1109 if (errno == EAGAIN || errno == EPIPE || errno == ECONNRESET || errno == EIO)
1110 master_readable = false;
1112 log_error("read(): %m");
1117 out_buffer_full += (size_t) k;
1120 if (stdout_writable && out_buffer_full > 0) {
1122 k = write(STDOUT_FILENO, out_buffer, out_buffer_full);
1125 if (errno == EAGAIN || errno == EPIPE || errno == ECONNRESET || errno == EIO)
1126 stdout_writable = false;
1128 log_error("write(): %m");
1134 assert(out_buffer_full >= (size_t) k);
1135 memmove(out_buffer, out_buffer + k, out_buffer_full - k);
1136 out_buffer_full -= k;
/* Cleanup: release the epoll instance and the signalfd. */
1144 close_nointr_nofail(ep);
1147 close_nointr_nofail(signal_fd);
/* Register the container with the org.freedesktop.machine1.Manager
 * interface on the system bus.
 *
 * The call arguments visible here include the machine UUID (arg_uuid), the
 * root directory (arg_directory) and one property, "Slice", set to
 * arg_slice; unset strings are sent as "" via strempty(). Failures are
 * logged with the bus error message if there is one, else strerror(-r). */
1152 static int register_machine(void) {
1153 _cleanup_bus_error_free_ sd_bus_error error = SD_BUS_ERROR_NULL;
1154 _cleanup_bus_unref_ sd_bus *bus = NULL;
1157 r = sd_bus_open_system(&bus);
1159 log_error("Failed to open system bus: %s", strerror(-r));
1163 r = sd_bus_call_method(
1165 "org.freedesktop.machine1",
1166 "/org/freedesktop/machine1",
1167 "org.freedesktop.machine1.Manager",
1173 SD_BUS_APPEND_ID128(arg_uuid),
1177 strempty(arg_directory),
1178 1, "Slice", "s", strempty(arg_slice));
1180 log_error("Failed to register machine: %s", error.message ? error.message : strerror(-r));
/* Probe for the kernel audit subsystem by opening a NETLINK_AUDIT netlink
 * socket; the probe fd is closed again right away. main() uses the result
 * to warn before booting a container while auditing is available. */
1187 static bool audit_enabled(void) {
1190 fd = socket(AF_NETLINK, SOCK_RAW, NETLINK_AUDIT);
1192 close_nointr_nofail(fd);
1198 int main(int argc, char *argv[]) {
1200 int r = EXIT_FAILURE, k;
1201 _cleanup_close_ int master = -1;
1203 const char *console = NULL;
1204 struct termios saved_attr, raw_attr;
1206 bool saved_attr_valid = false;
1208 int kmsg_socket_pair[2] = { -1, -1 };
1211 log_parse_environment();
1214 k = parse_argv(argc, argv);
1222 if (arg_directory) {
1225 p = path_make_absolute_cwd(arg_directory);
1226 free(arg_directory);
1229 arg_directory = get_current_dir_name();
1231 if (!arg_directory) {
1232 log_error("Failed to determine path, please use -D.");
1236 path_kill_slashes(arg_directory);
1239 arg_machine = strdup(path_get_file_name(arg_directory));
1245 hostname_cleanup(arg_machine, false);
1246 if (isempty(arg_machine)) {
1247 log_error("Failed to determine machine name automatically, please use -M.");
1252 if (geteuid() != 0) {
1253 log_error("Need to be root.");
1257 if (sd_booted() <= 0) {
1258 log_error("Not running on a systemd system.");
1262 if (arg_boot && audit_enabled()) {
1263 log_warning("The kernel auditing subsystem is known to be incompatible with containers.\n"
1264 "Please make sure to turn off auditing with 'audit=0' on the kernel command\n"
1265 "line before using systemd-nspawn. Sleeping for 5s...\n");
1269 if (path_equal(arg_directory, "/")) {
1270 log_error("Spawning container on root directory not supported.");
1274 if (path_is_os_tree(arg_directory) <= 0) {
1275 log_error("Directory %s doesn't look like an OS root directory (/etc/os-release is missing). Refusing.", arg_directory);
1280 n_fd_passed = sd_listen_fds(false);
1281 if (n_fd_passed > 0) {
1282 k = fdset_new_listen_fds(&fds, false);
1284 log_error("Failed to collect file descriptors: %s", strerror(-k));
1288 fdset_close_others(fds);
1291 master = posix_openpt(O_RDWR|O_NOCTTY|O_CLOEXEC|O_NDELAY);
1293 log_error("Failed to acquire pseudo tty: %m");
1297 console = ptsname(master);
1299 log_error("Failed to determine tty name: %m");
1303 log_info("Spawning namespace container on %s (console is %s).", arg_directory, console);
1305 if (ioctl(STDIN_FILENO, TIOCGWINSZ, &ws) >= 0)
1306 ioctl(master, TIOCSWINSZ, &ws);
1308 if (unlockpt(master) < 0) {
1309 log_error("Failed to unlock tty: %m");
1313 if (tcgetattr(STDIN_FILENO, &saved_attr) >= 0) {
1314 saved_attr_valid = true;
1316 raw_attr = saved_attr;
1317 cfmakeraw(&raw_attr);
1318 raw_attr.c_lflag &= ~ECHO;
1321 if (socketpair(AF_UNIX, SOCK_DGRAM|SOCK_NONBLOCK|SOCK_CLOEXEC, 0, kmsg_socket_pair) < 0) {
1322 log_error("Failed to create kmsg socket pair.");
1326 sd_notify(0, "READY=1");
1328 assert_se(sigemptyset(&mask) == 0);
1329 sigset_add_many(&mask, SIGCHLD, SIGWINCH, SIGTERM, SIGINT, -1);
1330 assert_se(sigprocmask(SIG_BLOCK, &mask, NULL) == 0);
1334 int pipefd[2], pipefd2[2];
1336 if (pipe2(pipefd, O_NONBLOCK|O_CLOEXEC) < 0) {
1337 log_error("pipe2(): %m");
1341 if (pipe2(pipefd2, O_NONBLOCK|O_CLOEXEC) < 0) {
1342 log_error("pipe2(): %m");
1347 pid = syscall(__NR_clone, SIGCHLD|CLONE_NEWIPC|CLONE_NEWNS|CLONE_NEWPID|CLONE_NEWUTS|(arg_private_network ? CLONE_NEWNET : 0), NULL);
1349 if (errno == EINVAL)
1350 log_error("clone() failed, do you have namespace support enabled in your kernel? (You need UTS, IPC, PID and NET namespacing built in): %m");
1352 log_error("clone() failed: %m");
1359 const char *home = NULL;
1360 uid_t uid = (uid_t) -1;
1361 gid_t gid = (gid_t) -1;
1363 const char *envp[] = {
1364 "PATH=/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin",
1365 "container=systemd-nspawn", /* LXC sets container=lxc, so follow the scheme here */
1370 NULL, /* container_uuid */
1371 NULL, /* LISTEN_FDS */
1372 NULL, /* LISTEN_PID */
1376 envp[n_env] = strv_find_prefix(environ, "TERM=");
1380 /* Wait for the parent process to log our PID */
1381 close_nointr_nofail(pipefd[1]);
1382 fd_wait_for_event(pipefd[0], POLLHUP, -1);
1383 close_nointr_nofail(pipefd[0]);
1385 close_nointr_nofail(master);
1388 if (saved_attr_valid) {
1389 if (tcsetattr(STDIN_FILENO, TCSANOW, &raw_attr) < 0) {
1390 log_error("Failed to set terminal attributes: %m");
1395 close_nointr(STDIN_FILENO);
1396 close_nointr(STDOUT_FILENO);
1397 close_nointr(STDERR_FILENO);
1399 close_nointr_nofail(kmsg_socket_pair[0]);
1400 kmsg_socket_pair[0] = -1;
1402 reset_all_signal_handlers();
1404 assert_se(sigemptyset(&mask) == 0);
1405 assert_se(sigprocmask(SIG_SETMASK, &mask, NULL) == 0);
1407 k = open_terminal(console, O_RDWR);
1408 if (k != STDIN_FILENO) {
1410 close_nointr_nofail(k);
1414 log_error("Failed to open console: %s", strerror(-k));
1418 if (dup2(STDIN_FILENO, STDOUT_FILENO) != STDOUT_FILENO ||
1419 dup2(STDIN_FILENO, STDERR_FILENO) != STDERR_FILENO) {
1420 log_error("Failed to duplicate console: %m");
1425 log_error("setsid() failed: %m");
1429 if (prctl(PR_SET_PDEATHSIG, SIGKILL) < 0) {
1430 log_error("PR_SET_PDEATHSIG failed: %m");
1434 close_pipe(pipefd2);
1436 r = register_machine();
1440 /* Mark everything as slave, so that we still
1441 * receive mounts from the real root, but don't
1442 * propagate mounts to the real root. */
1443 if (mount(NULL, "/", NULL, MS_SLAVE|MS_REC, NULL) < 0) {
1444 log_error("MS_SLAVE|MS_REC failed: %m");
1448 /* Turn directory into bind mount */
1449 if (mount(arg_directory, arg_directory, "bind", MS_BIND|MS_REC, NULL) < 0) {
1450 log_error("Failed to make bind mount.");
1455 if (mount(arg_directory, arg_directory, "bind", MS_BIND|MS_REMOUNT|MS_RDONLY|MS_REC, NULL) < 0) {
1456 log_error("Failed to make read-only.");
1460 if (mount_all(arg_directory) < 0)
1463 if (copy_devnodes(arg_directory) < 0)
1466 if (setup_ptmx(arg_directory) < 0)
1469 dev_setup(arg_directory);
1471 if (setup_dev_console(arg_directory, console) < 0)
1474 if (setup_kmsg(arg_directory, kmsg_socket_pair[1]) < 0)
1477 close_nointr_nofail(kmsg_socket_pair[1]);
1478 kmsg_socket_pair[1] = -1;
1480 if (setup_boot_id(arg_directory) < 0)
1483 if (setup_timezone(arg_directory) < 0)
1486 if (setup_resolv_conf(arg_directory) < 0)
1489 if (setup_journal(arg_directory) < 0)
1492 if (mount_binds(arg_directory, arg_bind, 0) < 0)
1495 if (mount_binds(arg_directory, arg_bind_ro, MS_RDONLY) < 0)
1498 if (chdir(arg_directory) < 0) {
1499 log_error("chdir(%s) failed: %m", arg_directory);
1503 if (mount(arg_directory, "/", NULL, MS_MOVE, NULL) < 0) {
1504 log_error("mount(MS_MOVE) failed: %m");
1508 if (chroot(".") < 0) {
1509 log_error("chroot() failed: %m");
1513 if (chdir("/") < 0) {
1514 log_error("chdir() failed: %m");
1522 if (drop_capabilities() < 0) {
1523 log_error("drop_capabilities() failed: %m");
1529 /* Note that this resolves user names
1530 * inside the container, and hence
1531 * accesses the NSS modules from the
1532 * container and not the host. This is
1535 if (get_user_creds((const char**)&arg_user, &uid, &gid, &home, NULL) < 0) {
1536 log_error("get_user_creds() failed: %m");
1540 if (mkdir_parents_label(home, 0775) < 0) {
1541 log_error("mkdir_parents_label() failed: %m");
1545 if (mkdir_safe_label(home, 0775, uid, gid) < 0) {
1546 log_error("mkdir_safe_label() failed: %m");
1550 if (initgroups((const char*)arg_user, gid) < 0) {
1551 log_error("initgroups() failed: %m");
1555 if (setresgid(gid, gid, gid) < 0) {
1556 log_error("setregid() failed: %m");
1560 if (setresuid(uid, uid, uid) < 0) {
1561 log_error("setreuid() failed: %m");
1565 /* Reset everything fully to 0, just in case */
1567 if (setgroups(0, NULL) < 0) {
1568 log_error("setgroups() failed: %m");
1572 if (setresgid(0, 0, 0) < 0) {
1573 log_error("setregid() failed: %m");
1577 if (setresuid(0, 0, 0) < 0) {
1578 log_error("setreuid() failed: %m");
1583 if ((asprintf((char**)(envp + n_env++), "HOME=%s", home ? home: "/root") < 0) ||
1584 (asprintf((char**)(envp + n_env++), "USER=%s", arg_user ? arg_user : "root") < 0) ||
1585 (asprintf((char**)(envp + n_env++), "LOGNAME=%s", arg_user ? arg_user : "root") < 0)) {
1590 if (!sd_id128_equal(arg_uuid, SD_ID128_NULL)) {
1591 if (asprintf((char**)(envp + n_env++), "container_uuid=" SD_ID128_FORMAT_STR, SD_ID128_FORMAT_VAL(arg_uuid)) < 0) {
1597 if (fdset_size(fds) > 0) {
1598 k = fdset_cloexec(fds, false);
1600 log_error("Failed to unset O_CLOEXEC for file descriptors.");
1604 if ((asprintf((char **)(envp + n_env++), "LISTEN_FDS=%u", n_fd_passed) < 0) ||
1605 (asprintf((char **)(envp + n_env++), "LISTEN_PID=1") < 0)) {
1617 /* Automatically search for the init system */
1619 l = 1 + argc - optind;
1620 a = newa(char*, l + 1);
1621 memcpy(a + 1, argv + optind, l * sizeof(char*));
1623 a[0] = (char*) "/usr/lib/systemd/systemd";
1624 execve(a[0], a, (char**) envp);
1626 a[0] = (char*) "/lib/systemd/systemd";
1627 execve(a[0], a, (char**) envp);
1629 a[0] = (char*) "/sbin/init";
1630 execve(a[0], a, (char**) envp);
1631 } else if (argc > optind)
1632 execvpe(argv[optind], argv + optind, (char**) envp);
1634 chdir(home ? home : "/root");
1635 execle("/bin/bash", "-bash", NULL, (char**) envp);
1638 log_error("execv() failed: %m");
1641 _exit(EXIT_FAILURE);
1644 log_info("Init process in the container running as PID %lu.", (unsigned long) pid);
1645 close_nointr_nofail(pipefd[0]);
1646 close_nointr_nofail(pipefd[1]);
1648 /* Wait for the child process to establish cgroup hierarchy */
1649 close_nointr_nofail(pipefd2[1]);
1650 fd_wait_for_event(pipefd2[0], POLLHUP, -1);
1651 close_nointr_nofail(pipefd2[0]);
1656 if (process_pty(master, pid, &mask) < 0)
1659 if (saved_attr_valid)
1660 tcsetattr(STDIN_FILENO, TCSANOW, &saved_attr);
1662 k = wait_for_terminate(pid, &status);
1668 if (status.si_code == CLD_EXITED) {
1669 r = status.si_status;
1670 if (status.si_status != 0) {
1671 log_error("Container failed with error code %i.", status.si_status);
1675 log_debug("Container exited successfully.");
1677 } else if (status.si_code == CLD_KILLED &&
1678 status.si_status == SIGINT) {
1679 log_info("Container has been shut down.");
1682 } else if (status.si_code == CLD_KILLED &&
1683 status.si_status == SIGHUP) {
1684 log_info("Container is being rebooted.");
1686 } else if (status.si_code == CLD_KILLED ||
1687 status.si_code == CLD_DUMPED) {
1689 log_error("Container terminated by signal %s.", signal_to_string(status.si_status));
1693 log_error("Container failed due to unknown reason.");
1700 if (saved_attr_valid)
1701 tcsetattr(STDIN_FILENO, TCSANOW, &saved_attr);
1703 close_pipe(kmsg_socket_pair);
1708 free(arg_directory);