1 /*-*- Mode: C; c-basic-offset: 8; indent-tabs-mode: nil -*-*/
4 This file is part of systemd.
6 Copyright 2010 Lennart Poettering
8 systemd is free software; you can redistribute it and/or modify it
9 under the terms of the GNU Lesser General Public License as published by
10 the Free Software Foundation; either version 2.1 of the License, or
11 (at your option) any later version.
13 systemd is distributed in the hope that it will be useful, but
14 WITHOUT ANY WARRANTY; without even the implied warranty of
15 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
16 Lesser General Public License for more details.
18 You should have received a copy of the GNU Lesser General Public License
19 along with systemd; If not, see <http://www.gnu.org/licenses/>.
25 #include <sys/types.h>
26 #include <sys/syscall.h>
27 #include <sys/mount.h>
33 #include <sys/prctl.h>
34 #include <sys/capability.h>
36 #include <sys/epoll.h>
38 #include <sys/signalfd.h>
42 #include <sys/socket.h>
44 #include <systemd/sd-daemon.h>
52 #include "cgroup-util.h"
54 #include "path-util.h"
55 #include "loopback-setup.h"
57 #include "dev-setup.h"
/* Journal linking modes. parse_argv() stores one of LINK_NO, LINK_AUTO,
 * LINK_GUEST or LINK_HOST in arg_link_journal. */
60 typedef enum LinkJournal {
/* Command line settings, written by parse_argv() and read by the setup
 * helpers and main(). */
/* Root directory of the container; canonicalized when parsed and made
 * absolute again in main(). */
67 static char *arg_directory = NULL;
/* User name duplicated from the option argument. */
68 static char *arg_user = NULL;
/* Comma-separated controller list split into a string vector. */
69 static char **arg_controllers = NULL;
/* Exported to the container as container_uuid= in its environment. */
70 static char *arg_uuid = NULL;
/* When set, CLONE_NEWNET is added to the clone flags in main(). */
71 static bool arg_private_network = false;
72 static bool arg_read_only = false;
/* Consulted by process_pty() when a SIGTERM arrives. */
73 static bool arg_boot = false;
/* Journal linking defaults to LINK_AUTO unless an option overrides it. */
74 static LinkJournal arg_link_journal = LINK_AUTO;
/* Bit mask of capabilities the container keeps: bit N stands for
 * capability number N. drop_capabilities() hands the complement of this
 * mask to capability_bounding_set_drop(), and the capability option in
 * parse_argv() ORs further bits in. */
75 static uint64_t arg_retain =
77 (1ULL << CAP_DAC_OVERRIDE) |
78 (1ULL << CAP_DAC_READ_SEARCH) |
79 (1ULL << CAP_FOWNER) |
80 (1ULL << CAP_FSETID) |
81 (1ULL << CAP_IPC_OWNER) |
84 (1ULL << CAP_LINUX_IMMUTABLE) |
85 (1ULL << CAP_NET_BIND_SERVICE) |
86 (1ULL << CAP_NET_BROADCAST) |
87 (1ULL << CAP_NET_RAW) |
88 (1ULL << CAP_SETGID) |
89 (1ULL << CAP_SETFCAP) |
90 (1ULL << CAP_SETPCAP) |
91 (1ULL << CAP_SETUID) |
92 (1ULL << CAP_SYS_ADMIN) |
93 (1ULL << CAP_SYS_CHROOT) |
94 (1ULL << CAP_SYS_NICE) |
95 (1ULL << CAP_SYS_PTRACE) |
96 (1ULL << CAP_SYS_TTY_CONFIG) |
97 (1ULL << CAP_SYS_RESOURCE) |
98 (1ULL << CAP_SYS_BOOT);
/* Print the usage text to standard output; the %s placeholder is filled
 * with program_invocation_short_name.
 * NOTE(review): the -j line below says host, while the assignment that
 * appears to handle -j in parse_argv() selects LINK_GUEST; confirm which
 * of the two is intended. */
100 static int help(void) {
102 printf("%s [OPTIONS...] [PATH] [ARGUMENTS...]\n\n"
103 "Spawn a minimal namespace container for debugging, testing and building.\n\n"
104 " -h --help Show this help\n"
105 " -D --directory=NAME Root directory for the container\n"
106 " -b --boot Boot up full system (i.e. invoke init)\n"
107 " -u --user=USER Run the command under specified user or uid\n"
108 " -C --controllers=LIST Put the container in specified comma-separated cgroup hierarchies\n"
109 " --uuid=UUID Set a specific machine UUID for the container\n"
110 " --private-network Disable network in container\n"
111 " --read-only Mount the root directory read-only\n"
112 " --capability=CAP In addition to the default, retain specified capability\n"
113 " --link-journal=MODE Link up guest journal, one of no, auto, guest, host\n"
114 " -j Equivalent to --link-journal=host\n",
115 program_invocation_short_name);
/* Parse the command line with getopt_long() and store the results in the
 * arg_* globals declared at the top of the file. */
120 static int parse_argv(int argc, char *argv[]) {
/* Options without a short form get codes starting at 0x100. */
123 ARG_PRIVATE_NETWORK = 0x100,
130 static const struct option options[] = {
131 { "help", no_argument, NULL, 'h' },
132 { "directory", required_argument, NULL, 'D' },
133 { "user", required_argument, NULL, 'u' },
134 { "controllers", required_argument, NULL, 'C' },
135 { "private-network", no_argument, NULL, ARG_PRIVATE_NETWORK },
136 { "boot", no_argument, NULL, 'b' },
137 { "uuid", required_argument, NULL, ARG_UUID },
138 { "read-only", no_argument, NULL, ARG_READ_ONLY },
139 { "capability", required_argument, NULL, ARG_CAPABILITY },
140 { "link-journal", required_argument, NULL, ARG_LINK_JOURNAL },
/* With GNU getopt a leading + in the option string stops parsing at the
 * first non-option argument, so the arguments of the command to run are
 * left alone. */
149 while ((c = getopt_long(argc, argv, "+hD:u:C:bj", options, NULL)) >= 0) {
/* The root directory is resolved to a canonical path right away. */
159 arg_directory = canonicalize_file_name(optarg);
160 if (!arg_directory) {
161 log_error("Failed to canonicalize root directory.");
169 if (!(arg_user = strdup(optarg))) {
170 log_error("Failed to duplicate user name.");
/* A repeated controller option replaces the earlier list; duplicates
 * inside the new list are removed. */
177 strv_free(arg_controllers);
178 arg_controllers = strv_split(optarg, ",");
179 if (!arg_controllers) {
180 log_error("Failed to split controllers list.");
183 strv_uniq(arg_controllers);
187 case ARG_PRIVATE_NETWORK:
188 arg_private_network = true;
200 arg_read_only = true;
/* Each comma-separated name is resolved with cap_from_name() and the
 * matching bit is added to arg_retain. */
203 case ARG_CAPABILITY: {
207 FOREACH_WORD_SEPARATOR(word, length, optarg, ",", state) {
211 t = strndup(word, length);
215 if (cap_from_name(t, &cap) < 0) {
216 log_error("Failed to parse capability %s.", t);
222 arg_retain |= 1ULL << (uint64_t) cap;
/* NOTE(review): presumably the handler for -j. It selects LINK_GUEST,
 * whereas the usage text describes -j as the host mode; confirm. */
229 arg_link_journal = LINK_GUEST;
232 case ARG_LINK_JOURNAL:
233 if (streq(optarg, "auto"))
234 arg_link_journal = LINK_AUTO;
235 else if (streq(optarg, "no"))
236 arg_link_journal = LINK_NO;
237 else if (streq(optarg, "guest"))
238 arg_link_journal = LINK_GUEST;
239 else if (streq(optarg, "host"))
240 arg_link_journal = LINK_HOST;
242 log_error("Failed to parse link journal mode %s", optarg);
252 log_error("Unknown option code %c", c);
/* Mount the file systems listed in mount_table below the directory dest. */
260 static int mount_all(const char *dest) {
262 typedef struct MountPoint {
/* Columns are presumably what, where, type, options, flags, fatal, in
 * that order - TODO confirm against the struct definition. Entries with
 * a NULL first column are remounts of the entry right before them. */
271 static const MountPoint mount_table[] = {
272 { "proc", "/proc", "proc", NULL, MS_NOSUID|MS_NOEXEC|MS_NODEV, true },
273 { "/proc/sys", "/proc/sys", NULL, NULL, MS_BIND, true }, /* Bind mount first */
274 { NULL, "/proc/sys", NULL, NULL, MS_BIND|MS_RDONLY|MS_REMOUNT, true }, /* Then, make it r/o */
275 { "sysfs", "/sys", "sysfs", NULL, MS_RDONLY|MS_NOSUID|MS_NOEXEC|MS_NODEV, true },
276 { "tmpfs", "/dev", "tmpfs", "mode=755", MS_NOSUID|MS_STRICTATIME, true },
277 { "/dev/pts", "/dev/pts", NULL, NULL, MS_BIND, true },
278 { "tmpfs", "/dev/shm", "tmpfs", "mode=1777", MS_NOSUID|MS_NODEV|MS_STRICTATIME, true },
279 { "tmpfs", "/run", "tmpfs", "mode=755", MS_NOSUID|MS_NODEV|MS_STRICTATIME, true },
281 { "/sys/fs/selinux", "/sys/fs/selinux", NULL, NULL, MS_BIND, false }, /* Bind mount first */
282 { NULL, "/sys/fs/selinux", NULL, NULL, MS_BIND|MS_RDONLY|MS_REMOUNT, false }, /* Then, make it r/o */
289 for (k = 0; k < ELEMENTSOF(mount_table); k++) {
290 char _cleanup_free_ *where = NULL;
/* Build the target path by prefixing the table entry with dest. */
293 if (asprintf(&where, "%s/%s", dest, mount_table[k].where) < 0) {
302 t = path_is_mount_point(where, true);
304 log_error("Failed to detect whether %s is a mount point: %s", where, strerror(-t));
/* Only entries with a source are tested against an existing mount
 * point; remount entries have no source and do not match this test. */
312 /* Skip this entry if it is not a remount. */
313 if (mount_table[k].what && t > 0)
/* The result of creating the target directory is not checked here. */
316 mkdir_p_label(where, 0755);
/* A failing mount is only reported for entries marked fatal. */
318 if (mount(mount_table[k].what,
321 mount_table[k].flags,
322 mount_table[k].options) < 0 &&
323 mount_table[k].fatal) {
325 log_error("mount(%s) failed: %m", where);
/* Make etc/localtime below dest a symlink to the same zone that the
 * /etc/localtime symlink of the host names, provided the zone file exists
 * inside the container tree. */
335 static int setup_timezone(const char *dest) {
336 _cleanup_free_ char *where = NULL, *p = NULL, *q = NULL, *check = NULL, *what = NULL;
342 /* Fix the timezone, if possible */
343 r = readlink_malloc("/etc/localtime", &p);
345 log_warning("/etc/localtime is not a symlink, not updating container timezone.");
/* z becomes the zone name, i.e. the part of the host link target after
 * the zoneinfo directory, in relative or absolute form. */
349 z = path_startswith(p, "../usr/share/zoneinfo/");
351 z = path_startswith(p, "/usr/share/zoneinfo/");
353 log_warning("/etc/localtime does not point into /usr/share/zoneinfo/, not updating container timezone.");
357 where = strappend(dest, "/etc/localtime");
/* y is the zone name the container link currently points to, if any. */
361 r = readlink_malloc(where, &q);
363 y = path_startswith(q, "../usr/share/zoneinfo/");
365 y = path_startswith(q, "/usr/share/zoneinfo/");
368 /* Already pointing to the right place? Then do nothing .. */
369 if (y && streq(y, z))
/* The zone file has to be present inside the container tree. */
373 check = strjoin(dest, "/usr/share/zoneinfo/", z, NULL);
377 if (access(check, F_OK) < 0) {
378 log_warning("Timezone %s does not exist in container, not updating container timezone.", z);
/* The new link target is written in relative form. */
382 what = strappend("../usr/share/zoneinfo/", z);
387 if (symlink(what, where) < 0) {
388 log_error("Failed to correct timezone of container: %m");
/* Bind mount /etc/resolv.conf of the host over etc/resolv.conf below
 * dest and then try to remount it read-only. arg_private_network is
 * checked first; presumably nothing is done in that case - TODO confirm. */
395 static int setup_resolv_conf(const char *dest) {
400 if (arg_private_network)
403 /* Fix resolv.conf, if possible */
404 where = strappend(dest, "/etc/resolv.conf");
408 /* We don't really care for the results of this really. If it
409 * fails, it fails, but meh... */
/* The read-only remount is only attempted when the bind mount worked,
 * and its own result is ignored. */
410 if (mount("/etc/resolv.conf", where, "bind", MS_BIND, NULL) >= 0)
411 mount("/etc/resolv.conf", where, "bind", MS_BIND|MS_REMOUNT|MS_RDONLY, NULL);
/* Write a freshly randomized ID, formatted as a UUID string, to a file
 * below dest/dev and bind mount that file over
 * proc/sys/kernel/random/boot_id below dest. */
418 static int setup_boot_id(const char *dest) {
419 char _cleanup_free_ *from = NULL, *to = NULL;
426 /* Generate a new randomized boot ID, so that each boot-up of
427 * the container gets a new one */
/* from is the backing file, to is the path it gets mounted over. */
429 from = strappend(dest, "/dev/proc-sys-kernel-random-boot-id");
430 to = strappend(dest, "/proc/sys/kernel/random/boot_id");
434 r = sd_id128_randomize(&rnd);
436 log_error("Failed to generate random boot id: %s", strerror(-r));
/* Format the 16 bytes in the 8-4-4-4-12 hex digit UUID layout. */
440 snprintf(as_uuid, sizeof(as_uuid),
441 "%02x%02x%02x%02x-%02x%02x-%02x%02x-%02x%02x-%02x%02x%02x%02x%02x%02x",
442 SD_ID128_FORMAT_VAL(rnd));
443 char_array_0(as_uuid);
445 r = write_one_line_file(from, as_uuid);
447 log_error("Failed to write boot id: %s", strerror(-r));
451 if (mount(from, to, "bind", MS_BIND, NULL) < 0) {
452 log_error("Failed to bind mount boot id: %m");
/* The result of the read-only remount is not checked. */
455 mount(from, to, "bind", MS_BIND|MS_REMOUNT|MS_RDONLY, NULL);
/* For every name in the devnodes list, look at /dev/NAME on the host and
 * create a device node with the same mode and device number as
 * dest/dev/NAME. */
461 static int copy_devnodes(const char *dest) {
463 static const char devnodes[] =
474 mode_t _cleanup_umask_ u;
480 NULSTR_FOREACH(d, devnodes) {
482 char _cleanup_free_ *from = NULL, *to = NULL;
/* from is the host node, to the node to create below dest. */
484 asprintf(&from, "/dev/%s", d);
485 asprintf(&to, "%s/dev/%s", dest, d);
/* A host node that does not exist (ENOENT) is not reported as an
 * error; other stat failures are logged. */
496 if (stat(from, &st) < 0) {
498 if (errno != ENOENT) {
499 log_error("Failed to stat %s: %m", from);
504 } else if (!S_ISCHR(st.st_mode) && !S_ISBLK(st.st_mode)) {
506 log_error("%s is not a char or block device, cannot copy", from);
/* NOTE(review): the message below prints dest although the node that
 * failed is to; presumably to was meant - confirm. */
510 } else if (mknod(to, st.st_mode, st.st_rdev) < 0) {
512 log_error("mknod(%s) failed: %m", dest);
/* Provide dev/console below dest: after checking that console is a
 * character device and setting it to mode 0600 owned by 0:0, a device
 * node with the same device number is created at dest/dev/console and
 * console is bind mounted over it. */
521 static int setup_dev_console(const char *dest, const char *console) {
523 char _cleanup_free_ *to = NULL;
525 mode_t _cleanup_umask_ u;
532 if (stat(console, &st) < 0) {
533 log_error("Failed to stat %s: %m", console);
536 } else if (!S_ISCHR(st.st_mode)) {
537 log_error("/dev/console is not a char device");
541 r = chmod_and_chown(console, 0600, 0, 0);
543 log_error("Failed to correct access mode for TTY: %s", strerror(-r));
547 if (asprintf(&to, "%s/dev/console", dest) < 0)
550 /* We need to bind mount the right tty to /dev/console since
551 * ptys can only exist on pts file systems. To have something
552 * to bind mount things on we create a device node first, that
553 * has the right major/minor (note that the major minor
554 * doesn't actually matter here, since we mount it over
557 if (mknod(to, (st.st_mode & ~07777) | 0600, st.st_rdev) < 0) {
558 log_error("mknod() for /dev/console failed: %m");
562 if (mount(console, to, "bind", MS_BIND, NULL) < 0) {
563 log_error("Bind mount for /dev/console failed: %m");
570 static int setup_kmsg(const char *dest, int kmsg_socket) {
571 char _cleanup_free_ *from = NULL, *to = NULL;
573 mode_t _cleanup_umask_ u;
575 struct cmsghdr cmsghdr;
576 uint8_t buf[CMSG_SPACE(sizeof(int))];
579 struct cmsghdr *cmsg;
582 assert(kmsg_socket >= 0);
586 /* We create the kmsg FIFO as /dev/kmsg, but immediately
587 * delete it after bind mounting it to /proc/kmsg. While FIFOs
588 * on the reading side behave very similar to /proc/kmsg,
589 * their writing side behaves differently from /dev/kmsg in
590 * that writing blocks when nothing is reading. In order to
591 * avoid any problems with containers deadlocking due to this
592 * we simply make /dev/kmsg unavailable to the container. */
/* Summary of the steps below: create a FIFO at dest/dev/kmsg with mode
 * 0600 owned by 0:0, bind mount it over dest/proc/kmsg, open it, and
 * pass the open descriptor over kmsg_socket as SCM_RIGHTS ancillary
 * data. from is the FIFO path, to is the mount target. */
593 if (asprintf(&from, "%s/dev/kmsg", dest) < 0 ||
594 asprintf(&to, "%s/proc/kmsg", dest) < 0)
597 if (mkfifo(from, 0600) < 0) {
598 log_error("mkfifo() for /dev/kmsg failed: %m");
602 r = chmod_and_chown(from, 0600, 0, 0);
604 log_error("Failed to correct access mode for /dev/kmsg: %s", strerror(-r));
608 if (mount(from, to, "bind", MS_BIND, NULL) < 0) {
609 log_error("Bind mount for /proc/kmsg failed: %m");
613 fd = open(from, O_RDWR|O_NDELAY|O_CLOEXEC);
615 log_error("Failed to open fifo: %m");
/* Build a message whose only control header carries the descriptor. */
622 mh.msg_control = &control;
623 mh.msg_controllen = sizeof(control);
625 cmsg = CMSG_FIRSTHDR(&mh);
626 cmsg->cmsg_level = SOL_SOCKET;
627 cmsg->cmsg_type = SCM_RIGHTS;
628 cmsg->cmsg_len = CMSG_LEN(sizeof(int));
629 memcpy(CMSG_DATA(cmsg), &fd, sizeof(int));
/* Shrink the control length to the single header that was filled in. */
631 mh.msg_controllen = cmsg->cmsg_len;
633 /* Store away the fd in the socket, so that it stays open as
634 * long as we run the child */
/* The local descriptor is closed right after sending, whatever the
 * result of sendmsg() was. */
635 k = sendmsg(kmsg_socket, &mh, MSG_DONTWAIT|MSG_NOSIGNAL);
636 close_nointr_nofail(fd);
639 log_error("Failed to send FIFO fd: %m");
643 /* And now make the FIFO unavailable as /dev/kmsg... */
/* Set the host name from the last path component of arg_directory, after
 * passing it through hostname_cleanup(). */
648 static int setup_hostname(void) {
652 hn = path_get_file_name(arg_directory);
658 hostname_cleanup(hn);
661 if (sethostname(hn, strlen(hn)) < 0)
/* Connect the journal directory of the container with the one of the
 * host, as selected by arg_link_journal. Both directories are named after
 * the machine ID read from etc/machine-id below directory. */
670 static int setup_journal(const char *directory) {
671 sd_id128_t machine_id;
672 char _cleanup_free_ *p = NULL, *b = NULL, *q = NULL, *d = NULL;
676 if (arg_link_journal == LINK_NO)
/* Read the machine ID of the container tree. */
679 p = strappend(directory, "/etc/machine-id");
683 r = read_one_line_file(p, &b);
684 if (r == -ENOENT && arg_link_journal == LINK_AUTO)
687 log_error("Failed to read machine ID from %s: %s", p, strerror(-r));
692 if (isempty(id) && arg_link_journal == LINK_AUTO)
695 /* Verify validity */
696 r = sd_id128_from_string(id, &machine_id);
698 log_error("Failed to parse machine ID from %s: %s", p, strerror(-r));
/* From here on p is the journal directory on the host and q the one
 * inside the container tree. */
703 p = strappend("/var/log/journal/", id);
704 q = strjoin(directory, "/var/log/journal/", id, NULL);
/* If either directory already is a mount point, an error is logged
 * unless the mode is LINK_AUTO. */
708 if (path_is_mount_point(p, false) > 0) {
709 if (arg_link_journal != LINK_AUTO) {
710 log_error("%s: already a mount point, refusing to use for journal", p);
717 if (path_is_mount_point(q, false) > 0) {
718 if (arg_link_journal != LINK_AUTO) {
719 log_error("%s: already a mount point, refusing to use for journal", q);
/* Look at what exists at p right now. The branches below distinguish
 * a readable symlink, -EINVAL (presumably: exists but is no symlink)
 * and -ENOENT. */
726 r = readlink_and_make_absolute(p, &d);
728 if ((arg_link_journal == LINK_GUEST ||
729 arg_link_journal == LINK_AUTO) &&
732 r = mkdir_p(q, 0755);
734 log_warning("failed to create directory %s: %m", q);
739 log_error("Failed to remove symlink %s: %m", p);
742 } else if (r == -EINVAL) {
744 if (arg_link_journal == LINK_GUEST &&
747 if (errno == ENOTDIR) {
748 log_error("%s already exists and is neither a symlink nor a directory", p);
751 log_error("Failed to remove %s: %m", p);
755 } else if (r != -ENOENT) {
756 log_error("readlink(%s) failed: %m", p);
/* LINK_GUEST: p on the host becomes a symlink to q, and q is created. */
760 if (arg_link_journal == LINK_GUEST) {
762 if (symlink(q, p) < 0) {
763 log_error("Failed to symlink %s to %s: %m", q, p);
767 r = mkdir_p(q, 0755);
769 log_warning("failed to create directory %s: %m", q);
/* LINK_HOST creates p on the host; the other remaining modes only
 * test whether p exists. */
773 if (arg_link_journal == LINK_HOST) {
774 r = mkdir_p(p, 0755);
776 log_error("Failed to create %s: %m", p);
780 } else if (access(p, F_OK) < 0)
/* dir_is_empty() returning 0 is treated as q having content. */
783 if (dir_is_empty(q) == 0) {
784 log_error("%s not empty.", q);
788 r = mkdir_p(q, 0755);
790 log_error("Failed to create %s: %m", q);
/* Finally the host directory is bind mounted onto the guest one. */
794 if (mount(p, q, "bind", MS_BIND, NULL) < 0) {
795 log_error("Failed to bind mount journal from host into guest: %m");
/* Hand the complement of arg_retain, i.e. every capability bit that is
 * not marked for retention, to capability_bounding_set_drop() and return
 * its result. */
802 static int drop_capabilities(void) {
803 return capability_bounding_set_drop(~arg_retain, false);
/* Report whether path looks like the root of an OS tree. Returns 1 or 0
 * depending on r, which is presumably the result of checking for
 * path/bin/sh - TODO confirm. */
806 static int is_os_tree(const char *path) {
809 /* We use /bin/sh as flag file if something is an OS */
811 if (asprintf(&p, "%s/bin/sh", path) < 0)
/* A negative r maps to 0, anything else to 1. */
817 return r < 0 ? 0 : 1;
/* Shovel data between our standard input/output and the pty master, and
 * handle the signals in mask through a signalfd.
 *   master: file descriptor of the pty master
 *   pid:    process that is sent SIGRTMIN+3 on the first SIGTERM when
 *           arg_boot is set
 *   mask:   signal set used to create the signalfd */
820 static int process_pty(int master, pid_t pid, sigset_t *mask) {
/* in_buffer holds bytes on their way from stdin to the master,
 * out_buffer bytes on their way from the master to stdout. The *_full
 * counters give the number of bytes currently buffered. */
822 char in_buffer[LINE_MAX], out_buffer[LINE_MAX];
823 size_t in_buffer_full = 0, out_buffer_full = 0;
824 struct epoll_event stdin_ev, stdout_ev, master_ev, signal_ev;
/* Readiness flags remembered across loop iterations; they are set from
 * epoll events and cleared when a read or write reports EAGAIN and
 * similar errors. */
825 bool stdin_readable = false, stdout_writable = false, master_readable = false, master_writable = false;
826 int ep = -1, signal_fd = -1, r;
827 bool tried_orderly_shutdown = false;
/* Presumably switches the three descriptors to non-blocking mode. */
833 fd_nonblock(STDIN_FILENO, 1);
834 fd_nonblock(STDOUT_FILENO, 1);
835 fd_nonblock(master, 1);
837 signal_fd = signalfd(-1, mask, SFD_NONBLOCK|SFD_CLOEXEC);
839 log_error("signalfd(): %m");
844 ep = epoll_create1(EPOLL_CLOEXEC);
846 log_error("Failed to create epoll: %m");
851 /* We read from STDIN only if this is actually a TTY,
852 * otherwise we assume non-interactivity. */
853 if (isatty(STDIN_FILENO)) {
855 stdin_ev.events = EPOLLIN|EPOLLET;
856 stdin_ev.data.fd = STDIN_FILENO;
858 if (epoll_ctl(ep, EPOLL_CTL_ADD, STDIN_FILENO, &stdin_ev) < 0) {
859 log_error("Failed to register STDIN in epoll: %m");
/* stdout and the master are watched edge-triggered, the signalfd
 * level-triggered. */
866 stdout_ev.events = EPOLLOUT|EPOLLET;
867 stdout_ev.data.fd = STDOUT_FILENO;
870 master_ev.events = EPOLLIN|EPOLLOUT|EPOLLET;
871 master_ev.data.fd = master;
874 signal_ev.events = EPOLLIN;
875 signal_ev.data.fd = signal_fd;
877 if (epoll_ctl(ep, EPOLL_CTL_ADD, STDOUT_FILENO, &stdout_ev) < 0 ||
878 epoll_ctl(ep, EPOLL_CTL_ADD, master, &master_ev) < 0 ||
879 epoll_ctl(ep, EPOLL_CTL_ADD, signal_fd, &signal_ev) < 0) {
880 log_error("Failed to register fds in epoll: %m");
886 struct epoll_event ev[16];
/* Wait without timeout for up to ELEMENTSOF(ev) events. */
890 nfds = epoll_wait(ep, ev, ELEMENTSOF(ev), -1);
893 if (errno == EINTR || errno == EAGAIN)
896 log_error("epoll_wait(): %m");
/* First pass: translate the events into readiness flags and handle
 * signals. EPOLLHUP counts as ready in both directions. */
903 for (i = 0; i < nfds; i++) {
904 if (ev[i].data.fd == STDIN_FILENO) {
906 if (ev[i].events & (EPOLLIN|EPOLLHUP))
907 stdin_readable = true;
909 } else if (ev[i].data.fd == STDOUT_FILENO) {
911 if (ev[i].events & (EPOLLOUT|EPOLLHUP))
912 stdout_writable = true;
914 } else if (ev[i].data.fd == master) {
916 if (ev[i].events & (EPOLLIN|EPOLLHUP))
917 master_readable = true;
919 if (ev[i].events & (EPOLLOUT|EPOLLHUP))
920 master_writable = true;
922 } else if (ev[i].data.fd == signal_fd) {
923 struct signalfd_siginfo sfsi;
/* A read that does not return one complete siginfo record is
 * handled as a failure or as a transient condition below. */
926 n = read(signal_fd, &sfsi, sizeof(sfsi));
927 if (n != sizeof(sfsi)) {
930 log_error("Failed to read from signalfd: invalid block size");
935 if (errno != EINTR && errno != EAGAIN) {
936 log_error("Failed to read from signalfd: %m");
942 if (sfsi.ssi_signo == SIGWINCH) {
945 /* The window size changed, let's forward that. */
946 if (ioctl(STDIN_FILENO, TIOCGWINSZ, &ws) >= 0)
947 ioctl(master, TIOCSWINSZ, &ws);
/* Only the first SIGTERM in boot mode takes this branch, since
 * tried_orderly_shutdown is set here. */
948 } else if (sfsi.ssi_signo == SIGTERM && arg_boot && !tried_orderly_shutdown) {
950 log_info("Trying to halt container. Send SIGTERM again to trigger immediate termination.");
952 /* This only works for systemd... */
953 tried_orderly_shutdown = true;
954 kill(pid, SIGRTMIN+3);
/* Second pass: keep shovelling while some direction can make
 * progress, i.e. a readable source with an empty buffer or a writable
 * sink with buffered data. */
964 while ((stdin_readable && in_buffer_full <= 0) ||
965 (master_writable && in_buffer_full > 0) ||
966 (master_readable && out_buffer_full <= 0) ||
967 (stdout_writable && out_buffer_full > 0)) {
/* stdin to in_buffer: append after the bytes already buffered. */
969 if (stdin_readable && in_buffer_full < LINE_MAX) {
971 k = read(STDIN_FILENO, in_buffer + in_buffer_full, LINE_MAX - in_buffer_full);
/* These errno values only clear the readiness flag. */
974 if (errno == EAGAIN || errno == EPIPE || errno == ECONNRESET || errno == EIO)
975 stdin_readable = false;
977 log_error("read(): %m");
982 in_buffer_full += (size_t) k;
/* in_buffer to master. */
985 if (master_writable && in_buffer_full > 0) {
987 k = write(master, in_buffer, in_buffer_full);
990 if (errno == EAGAIN || errno == EPIPE || errno == ECONNRESET || errno == EIO)
991 master_writable = false;
993 log_error("write(): %m");
/* Drop the k bytes that were written from the front of the buffer. */
999 assert(in_buffer_full >= (size_t) k);
1000 memmove(in_buffer, in_buffer + k, in_buffer_full - k);
1001 in_buffer_full -= k;
/* master to out_buffer. */
1005 if (master_readable && out_buffer_full < LINE_MAX) {
1007 k = read(master, out_buffer + out_buffer_full, LINE_MAX - out_buffer_full);
1010 if (errno == EAGAIN || errno == EPIPE || errno == ECONNRESET || errno == EIO)
1011 master_readable = false;
1013 log_error("read(): %m");
1018 out_buffer_full += (size_t) k;
/* out_buffer to stdout. */
1021 if (stdout_writable && out_buffer_full > 0) {
1023 k = write(STDOUT_FILENO, out_buffer, out_buffer_full);
1026 if (errno == EAGAIN || errno == EPIPE || errno == ECONNRESET || errno == EIO)
1027 stdout_writable = false;
1029 log_error("write(): %m");
/* Drop the k bytes that were written from the front of the buffer. */
1035 assert(out_buffer_full >= (size_t) k);
1036 memmove(out_buffer, out_buffer + k, out_buffer_full - k);
1037 out_buffer_full -= k;
/* Cleanup: release the epoll instance and the signalfd. */
1045 close_nointr_nofail(ep);
1048 close_nointr_nofail(signal_fd);
/* Entry point: parse the options, validate the container directory, set
 * up a cgroup and a pty, clone a process into new namespaces that builds
 * the container file system view and executes the payload, then relay
 * the pty and report how the container ended.
 * NOTE(review): the log messages after the setresgid() and setresuid()
 * calls below name setregid() and setreuid(), and the socketpair error
 * message carries no errno text; presumably oversights - confirm. */
1053 int main(int argc, char *argv[]) {
1055 int r = EXIT_FAILURE, k;
1056 char *oldcg = NULL, *newcg = NULL;
1057 char **controller = NULL;
1058 int master = -1, n_fd_passed;
1059 const char *console = NULL;
1060 struct termios saved_attr, raw_attr;
1062 bool saved_attr_valid = false;
1064 int kmsg_socket_pair[2] = { -1, -1 };
1067 log_parse_environment();
1070 r = parse_argv(argc, argv);
/* Determine the container root: the given directory made absolute, or
 * the current working directory. */
1074 if (arg_directory) {
1077 p = path_make_absolute_cwd(arg_directory);
1078 free(arg_directory);
1081 arg_directory = get_current_dir_name();
1083 if (!arg_directory) {
1084 log_error("Failed to determine path");
1088 path_kill_slashes(arg_directory);
/* Preconditions: effective uid 0, a systemd host, a root directory
 * other than / that contains bin/sh. */
1090 if (geteuid() != 0) {
1091 log_error("Need to be root.");
1095 if (sd_booted() <= 0) {
1096 log_error("Not running on a systemd system.");
1100 if (path_equal(arg_directory, "/")) {
1101 log_error("Spawning container on root directory not supported.");
1105 if (is_os_tree(arg_directory) <= 0) {
1106 log_error("Directory %s doesn't look like an OS root directory. Refusing.", arg_directory);
/* Collect file descriptors passed in through socket activation so that
 * they can be handed on to the payload. */
1111 n_fd_passed = sd_listen_fds(false);
1112 if (n_fd_passed > 0) {
1113 k = fdset_new_listen_fds(&fds, false);
1115 log_error("Failed to collect file descriptors: %s", strerror(-k));
1119 fdset_close_others(fds);
/* Create a child cgroup named nspawn-PID below the current one and move
 * into it; failures for the extra controllers are only warnings. */
1122 k = cg_get_by_pid(SYSTEMD_CGROUP_CONTROLLER, 0, &oldcg);
1124 log_error("Failed to determine current cgroup: %s", strerror(-k));
1128 if (asprintf(&newcg, "%s/nspawn-%lu", oldcg, (unsigned long) getpid()) < 0) {
1129 log_error("Failed to allocate cgroup path.");
1133 k = cg_create_and_attach(SYSTEMD_CGROUP_CONTROLLER, newcg, 0);
1135 log_error("Failed to create cgroup: %s", strerror(-k));
1139 STRV_FOREACH(controller, arg_controllers) {
1140 k = cg_create_and_attach(*controller, newcg, 0);
1142 log_warning("Failed to create cgroup in controller %s: %s", *controller, strerror(-k));
/* Allocate the pty that becomes the console of the container. */
1145 master = posix_openpt(O_RDWR|O_NOCTTY|O_CLOEXEC|O_NDELAY);
1147 log_error("Failed to acquire pseudo tty: %m");
1151 console = ptsname(master);
1153 log_error("Failed to determine tty name: %m");
1157 log_info("Spawning namespace container on %s (console is %s).", arg_directory, console);
/* Copy the current window size of our terminal to the pty. */
1159 if (ioctl(STDIN_FILENO, TIOCGWINSZ, &ws) >= 0)
1160 ioctl(master, TIOCSWINSZ, &ws);
1162 if (unlockpt(master) < 0) {
1163 log_error("Failed to unlock tty: %m");
/* Remember the terminal settings and prepare a raw variant with echo
 * turned off; saved_attr_valid records that restoring is possible. */
1167 if (tcgetattr(STDIN_FILENO, &saved_attr) >= 0) {
1168 saved_attr_valid = true;
1170 raw_attr = saved_attr;
1171 cfmakeraw(&raw_attr);
1172 raw_attr.c_lflag &= ~ECHO;
/* Socket pair used by setup_kmsg() to park the kmsg FIFO descriptor. */
1175 if (socketpair(AF_UNIX, SOCK_DGRAM|SOCK_NONBLOCK|SOCK_CLOEXEC, 0, kmsg_socket_pair) < 0) {
1176 log_error("Failed to create kmsg socket pair");
/* Block the signals that process_pty() later reads via signalfd. */
1180 assert_se(sigemptyset(&mask) == 0);
1181 sigset_add_many(&mask, SIGCHLD, SIGWINCH, SIGTERM, SIGINT, -1);
1182 assert_se(sigprocmask(SIG_BLOCK, &mask, NULL) == 0);
1187 if (saved_attr_valid) {
1188 if (tcsetattr(STDIN_FILENO, TCSANOW, &raw_attr) < 0) {
1189 log_error("Failed to set terminal attributes: %m");
/* Raw clone() into new IPC, mount, PID and UTS namespaces, plus a new
 * network namespace when arg_private_network is set. */
1194 pid = syscall(__NR_clone, SIGCHLD|CLONE_NEWIPC|CLONE_NEWNS|CLONE_NEWPID|CLONE_NEWUTS|(arg_private_network ? CLONE_NEWNET : 0), NULL);
1196 if (errno == EINVAL)
1197 log_error("clone() failed, do you have namespace support enabled in your kernel? (You need UTS, IPC, PID and NET namespacing built in): %m");
1199 log_error("clone() failed: %m");
/* What follows up to _exit() is presumably the child side of the
 * clone - TODO confirm. */
1207 const char *home = NULL;
1208 uid_t uid = (uid_t) -1;
1209 gid_t gid = (gid_t) -1;
/* Environment for the payload; the NULL slots are filled in below. */
1211 const char *envp[] = {
1212 "PATH=/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin",
1213 "container=systemd-nspawn", /* LXC sets container=lxc, so follow the scheme here */
1218 NULL, /* container_uuid */
1219 NULL, /* LISTEN_FDS */
1220 NULL, /* LISTEN_PID */
/* TERM is taken over from our own environment. */
1224 envp[2] = strv_find_prefix(environ, "TERM=");
1227 close_nointr_nofail(master);
1230 close_nointr(STDIN_FILENO);
1231 close_nointr(STDOUT_FILENO);
1232 close_nointr(STDERR_FILENO);
1234 close_nointr_nofail(kmsg_socket_pair[0]);
1235 kmsg_socket_pair[0] = -1;
1237 reset_all_signal_handlers();
1239 assert_se(sigemptyset(&mask) == 0);
1240 assert_se(sigprocmask(SIG_SETMASK, &mask, NULL) == 0);
/* Reopen the pty slave; it has to end up as descriptor 0 and is then
 * duplicated to stdout and stderr. */
1242 k = open_terminal(console, O_RDWR);
1243 if (k != STDIN_FILENO) {
1245 close_nointr_nofail(k);
1249 log_error("Failed to open console: %s", strerror(-k));
1253 if (dup2(STDIN_FILENO, STDOUT_FILENO) != STDOUT_FILENO ||
1254 dup2(STDIN_FILENO, STDERR_FILENO) != STDERR_FILENO) {
1255 log_error("Failed to duplicate console: %m");
1260 log_error("setsid() failed: %m");
/* Have the kernel send SIGKILL to this process when its parent dies. */
1264 if (prctl(PR_SET_PDEATHSIG, SIGKILL) < 0) {
1265 log_error("PR_SET_PDEATHSIG failed: %m");
1269 /* Mark everything as slave, so that we still
1270 * receive mounts from the real root, but don't
1271 * propagate mounts to the real root. */
1272 if (mount(NULL, "/", NULL, MS_SLAVE|MS_REC, NULL) < 0) {
1273 log_error("MS_SLAVE|MS_REC failed: %m");
1277 /* Turn directory into bind mount */
1278 if (mount(arg_directory, arg_directory, "bind", MS_BIND|MS_REC, NULL) < 0) {
1279 log_error("Failed to make bind mount.");
1284 if (mount(arg_directory, arg_directory, "bind", MS_BIND|MS_REMOUNT|MS_RDONLY|MS_REC, NULL) < 0) {
1285 log_error("Failed to make read-only.");
/* Populate the container tree with mounts, device nodes and the other
 * per-container files using the helpers above. */
1289 if (mount_all(arg_directory) < 0)
1292 if (copy_devnodes(arg_directory) < 0)
1295 dev_setup(arg_directory);
1297 if (setup_dev_console(arg_directory, console) < 0)
1300 if (setup_kmsg(arg_directory, kmsg_socket_pair[1]) < 0)
1303 close_nointr_nofail(kmsg_socket_pair[1]);
1304 kmsg_socket_pair[1] = -1;
1306 if (setup_boot_id(arg_directory) < 0)
1309 if (setup_timezone(arg_directory) < 0)
1312 if (setup_resolv_conf(arg_directory) < 0)
1315 if (setup_journal(arg_directory) < 0)
/* Switch the root: move the container mount over /, chroot into it
 * and reset the working directory. */
1318 if (chdir(arg_directory) < 0) {
1319 log_error("chdir(%s) failed: %m", arg_directory);
1323 if (mount(arg_directory, "/", NULL, MS_MOVE, NULL) < 0) {
1324 log_error("mount(MS_MOVE) failed: %m");
1328 if (chroot(".") < 0) {
1329 log_error("chroot() failed: %m");
1333 if (chdir("/") < 0) {
1334 log_error("chdir() failed: %m");
1342 if (drop_capabilities() < 0) {
1343 log_error("drop_capabilities() failed: %m");
1349 /* Note that this resolves user names
1350 * inside the container, and hence
1351 * accesses the NSS modules from the
1352 * container and not the host. This is
1355 if (get_user_creds((const char**)&arg_user, &uid, &gid, &home, NULL) < 0) {
1356 log_error("get_user_creds() failed: %m");
1360 if (mkdir_parents_label(home, 0775) < 0) {
1361 log_error("mkdir_parents_label() failed: %m");
1365 if (mkdir_safe_label(home, 0775, uid, gid) < 0) {
1366 log_error("mkdir_safe_label() failed: %m");
1370 if (initgroups((const char*)arg_user, gid) < 0) {
1371 log_error("initgroups() failed: %m");
1375 if (setresgid(gid, gid, gid) < 0) {
1376 log_error("setregid() failed: %m");
1380 if (setresuid(uid, uid, uid) < 0) {
1381 log_error("setreuid() failed: %m");
1385 /* Reset everything fully to 0, just in case */
1387 if (setgroups(0, NULL) < 0) {
1388 log_error("setgroups() failed: %m");
1392 if (setresgid(0, 0, 0) < 0) {
1393 log_error("setregid() failed: %m");
1397 if (setresuid(0, 0, 0) < 0) {
1398 log_error("setreuid() failed: %m");
/* Fill the free envp slots; n_env is the index of the next free one. */
1403 if ((asprintf((char**)(envp + n_env++), "HOME=%s", home ? home: "/root") < 0) ||
1404 (asprintf((char**)(envp + n_env++), "USER=%s", arg_user ? arg_user : "root") < 0) ||
1405 (asprintf((char**)(envp + n_env++), "LOGNAME=%s", arg_user ? arg_user : "root") < 0)) {
1411 if (asprintf((char**)(envp + n_env++), "container_uuid=%s", arg_uuid) < 0) {
/* Passed-in descriptors lose their close-on-exec flag and are
 * announced through LISTEN_FDS and LISTEN_PID. */
1417 if (fdset_size(fds) > 0) {
1418 k = fdset_cloexec(fds, false);
1420 log_error("Failed to unset O_CLOEXEC for file descriptors.");
1424 if ((asprintf((char **)(envp + n_env++), "LISTEN_FDS=%u", n_fd_passed) < 0) ||
1425 (asprintf((char **)(envp + n_env++), "LISTEN_PID=%lu", (unsigned long) getpid()) < 0)) {
1437 /* Automatically search for the init system */
/* The remaining arguments are copied behind a[0], which is then set
 * to each candidate init path in turn; execve() only returns on
 * failure, so the next candidate is tried. */
1439 l = 1 + argc - optind;
1440 a = newa(char*, l + 1);
1441 memcpy(a + 1, argv + optind, l * sizeof(char*));
1443 a[0] = (char*) "/usr/lib/systemd/systemd";
1444 execve(a[0], a, (char**) envp);
1446 a[0] = (char*) "/lib/systemd/systemd";
1447 execve(a[0], a, (char**) envp);
1449 a[0] = (char*) "/sbin/init";
1450 execve(a[0], a, (char**) envp);
1451 } else if (argc > optind)
1452 execvpe(argv[optind], argv + optind, (char**) envp);
1454 chdir(home ? home : "/root");
1455 execle("/bin/bash", "-bash", NULL, (char**) envp);
1458 log_error("execv() failed: %m");
1461 _exit(EXIT_FAILURE);
/* Presumably the parent side: relay the pty until done, restore the
 * terminal and collect the exit status of the container. */
1467 if (process_pty(master, pid, &mask) < 0)
1470 if (saved_attr_valid)
1471 tcsetattr(STDIN_FILENO, TCSANOW, &saved_attr);
1473 r = wait_for_terminate(pid, &status);
/* Translate the way the container ended into log output and, for a
 * non-zero exit status, into our own return value. */
1479 if (status.si_code == CLD_EXITED) {
1480 if (status.si_status != 0) {
1481 log_error("Container failed with error code %i.", status.si_status);
1482 r = status.si_status;
1486 log_debug("Container exited successfully.");
1488 } else if (status.si_code == CLD_KILLED &&
1489 status.si_status == SIGINT) {
1490 log_info("Container has been shut down.");
1493 } else if (status.si_code == CLD_KILLED &&
1494 status.si_status == SIGHUP) {
1495 log_info("Container is being rebooted.");
1497 } else if (status.si_code == CLD_KILLED ||
1498 status.si_code == CLD_DUMPED) {
1500 log_error("Container terminated by signal %s.", signal_to_string(status.si_status));
1504 log_error("Container failed due to unknown reason.");
/* Cleanup: restore the terminal, close descriptors, move back to the
 * original cgroup, kill what is left in the container cgroup and free
 * the option storage. */
1511 if (saved_attr_valid)
1512 tcsetattr(STDIN_FILENO, TCSANOW, &saved_attr);
1515 close_nointr_nofail(master);
1517 close_pipe(kmsg_socket_pair);
1520 cg_attach(SYSTEMD_CGROUP_CONTROLLER, oldcg, 0);
1523 cg_kill_recursive_and_wait(SYSTEMD_CGROUP_CONTROLLER, newcg, true);
1525 free(arg_directory);
1526 strv_free(arg_controllers);