1 /*-*- Mode: C; c-basic-offset: 8; indent-tabs-mode: nil -*-*/
4 This file is part of systemd.
6 Copyright 2010 Lennart Poettering
8 systemd is free software; you can redistribute it and/or modify it
9 under the terms of the GNU Lesser General Public License as published by
10 the Free Software Foundation; either version 2.1 of the License, or
11 (at your option) any later version.
13 systemd is distributed in the hope that it will be useful, but
14 WITHOUT ANY WARRANTY; without even the implied warranty of
15 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
16 Lesser General Public License for more details.
18 You should have received a copy of the GNU Lesser General Public License
19 along with systemd; If not, see <http://www.gnu.org/licenses/>.
25 #include <sys/types.h>
26 #include <sys/syscall.h>
27 #include <sys/mount.h>
33 #include <sys/prctl.h>
34 #include <sys/capability.h>
36 #include <sys/epoll.h>
38 #include <sys/signalfd.h>
42 #include <sys/socket.h>
44 #include <systemd/sd-daemon.h>
52 #include "cgroup-util.h"
54 #include "path-util.h"
55 #include "loopback-setup.h"
57 #include "dev-setup.h"
/* Journal linking mode of the container. The code in this file refers to
 * the values LINK_NO, LINK_AUTO, LINK_GUEST and LINK_HOST. */
61 typedef enum LinkJournal {
/* Settings for the container run. The visible parts of parse_argv() assign
 * several of them from the command line. */
68 static char *arg_directory = NULL;
69 static char *arg_user = NULL;
70 static char **arg_controllers = NULL;
71 static char *arg_uuid = NULL;
72 static bool arg_private_network = false;
73 static bool arg_read_only = false;
74 static bool arg_boot = false;
75 static LinkJournal arg_link_journal = LINK_AUTO;
/* Bit mask with one bit per CAP_* number. parse_argv() ORs further bits into
 * it for --capability=, and drop_capabilities() hands the complement of the
 * mask to capability_bounding_set_drop(). */
76 static uint64_t arg_retain =
78 (1ULL << CAP_DAC_OVERRIDE) |
79 (1ULL << CAP_DAC_READ_SEARCH) |
80 (1ULL << CAP_FOWNER) |
81 (1ULL << CAP_FSETID) |
82 (1ULL << CAP_IPC_OWNER) |
85 (1ULL << CAP_LINUX_IMMUTABLE) |
86 (1ULL << CAP_NET_BIND_SERVICE) |
87 (1ULL << CAP_NET_BROADCAST) |
88 (1ULL << CAP_NET_RAW) |
89 (1ULL << CAP_SETGID) |
90 (1ULL << CAP_SETFCAP) |
91 (1ULL << CAP_SETPCAP) |
92 (1ULL << CAP_SETUID) |
93 (1ULL << CAP_SYS_ADMIN) |
94 (1ULL << CAP_SYS_CHROOT) |
95 (1ULL << CAP_SYS_NICE) |
96 (1ULL << CAP_SYS_PTRACE) |
97 (1ULL << CAP_SYS_TTY_CONFIG) |
98 (1ULL << CAP_SYS_RESOURCE) |
99 (1ULL << CAP_SYS_BOOT);
/* Print the usage summary to stdout. The program name shown in the first
 * line is taken from program_invocation_short_name. */
101 static int help(void) {
103 printf("%s [OPTIONS...] [PATH] [ARGUMENTS...]\n\n"
104 "Spawn a minimal namespace container for debugging, testing and building.\n\n"
105 " -h --help Show this help\n"
106 " --version Print version string\n"
107 " -D --directory=NAME Root directory for the container\n"
108 " -b --boot Boot up full system (i.e. invoke init)\n"
109 " -u --user=USER Run the command under specified user or uid\n"
110 " -C --controllers=LIST Put the container in specified comma-separated cgroup hierarchies\n"
111 " --uuid=UUID Set a specific machine UUID for the container\n"
112 " --private-network Disable network in container\n"
113 " --read-only Mount the root directory read-only\n"
114 " --capability=CAP In addition to the default, retain specified capability\n"
115 " --link-journal=MODE Link up guest journal, one of no, auto, guest, host\n"
/* parse_argv() stores LINK_GUEST in the assignment that precedes its
 * ARG_LINK_JOURNAL case (apparently the -j handler), so -j is documented as
 * the guest mode. */
116 " -j Equivalent to --link-journal=guest\n",
117 program_invocation_short_name);
/* Parse the command line with getopt_long() and store the results in the
 * file-scope arg_* settings. Problems are reported through log_error(). */
122 static int parse_argv(int argc, char *argv[]) {
/* Long option table: options that also have a short form map to that
 * letter, the remaining ones map to ARG_* codes. */
133 static const struct option options[] = {
134 { "help", no_argument, NULL, 'h' },
135 { "version", no_argument, NULL, ARG_VERSION },
136 { "directory", required_argument, NULL, 'D' },
137 { "user", required_argument, NULL, 'u' },
138 { "controllers", required_argument, NULL, 'C' },
139 { "private-network", no_argument, NULL, ARG_PRIVATE_NETWORK },
140 { "boot", no_argument, NULL, 'b' },
141 { "uuid", required_argument, NULL, ARG_UUID },
142 { "read-only", no_argument, NULL, ARG_READ_ONLY },
143 { "capability", required_argument, NULL, ARG_CAPABILITY },
144 { "link-journal", required_argument, NULL, ARG_LINK_JOURNAL },
/* Short options are h, D, u, C, b and j; D, u and C take an argument. */
153 while ((c = getopt_long(argc, argv, "+hD:u:C:bj", options, NULL)) >= 0) {
/* Version output. */
162 puts(PACKAGE_STRING);
163 puts(SYSTEMD_FEATURES);
/* The root directory is stored in the form returned by
 * canonicalize_file_name(). */
168 arg_directory = canonicalize_file_name(optarg);
169 if (!arg_directory) {
170 log_error("Failed to canonicalize root directory.");
/* The user name is kept as a private copy of optarg. */
178 if (!(arg_user = strdup(optarg))) {
179 log_error("Failed to duplicate user name.");
/* A new controller list replaces any previous one: optarg is split at
 * commas and the result is passed through strv_uniq(). */
186 strv_free(arg_controllers);
187 arg_controllers = strv_split(optarg, ",");
188 if (!arg_controllers) {
189 log_error("Failed to split controllers list.");
192 strv_uniq(arg_controllers);
196 case ARG_PRIVATE_NETWORK:
197 arg_private_network = true;
209 arg_read_only = true;
/* Each comma-separated word of optarg is resolved with cap_from_name() and
 * the matching bit is added to arg_retain. */
212 case ARG_CAPABILITY: {
216 FOREACH_WORD_SEPARATOR(word, length, optarg, ",", state) {
220 t = strndup(word, length);
224 if (cap_from_name(t, &cap) < 0) {
225 log_error("Failed to parse capability %s.", t);
231 arg_retain |= 1ULL << (uint64_t) cap;
/* NOTE(review): the case label for this assignment is not visible here; it
 * is presumably the -j handler. Confirm against the full file. */
238 arg_link_journal = LINK_GUEST;
/* --link-journal= accepts auto, no, guest and host. */
241 case ARG_LINK_JOURNAL:
242 if (streq(optarg, "auto"))
243 arg_link_journal = LINK_AUTO;
244 else if (streq(optarg, "no"))
245 arg_link_journal = LINK_NO;
246 else if (streq(optarg, "guest"))
247 arg_link_journal = LINK_GUEST;
248 else if (streq(optarg, "host"))
249 arg_link_journal = LINK_HOST;
251 log_error("Failed to parse link journal mode %s", optarg);
261 log_error("Unknown option code %c", c);
/* Set up the mounts listed in mount_table below dest. Each target path is
 * built as dest joined with the where field of the entry. */
269 static int mount_all(const char *dest) {
/* NOTE(review): the struct members are not visible here; the loop below
 * reads what, where, flags, options and fatal. */
271 typedef struct MountPoint {
280 static const MountPoint mount_table[] = {
281 { "proc", "/proc", "proc", NULL, MS_NOSUID|MS_NOEXEC|MS_NODEV, true },
282 { "/proc/sys", "/proc/sys", NULL, NULL, MS_BIND, true }, /* Bind mount first */
283 { NULL, "/proc/sys", NULL, NULL, MS_BIND|MS_RDONLY|MS_REMOUNT, true }, /* Then, make it r/o */
284 { "sysfs", "/sys", "sysfs", NULL, MS_RDONLY|MS_NOSUID|MS_NOEXEC|MS_NODEV, true },
285 { "tmpfs", "/dev", "tmpfs", "mode=755", MS_NOSUID|MS_STRICTATIME, true },
286 { "/dev/pts", "/dev/pts", NULL, NULL, MS_BIND, true },
287 { "tmpfs", "/dev/shm", "tmpfs", "mode=1777", MS_NOSUID|MS_NODEV|MS_STRICTATIME, true },
288 { "tmpfs", "/run", "tmpfs", "mode=755", MS_NOSUID|MS_NODEV|MS_STRICTATIME, true },
290 { "/sys/fs/selinux", "/sys/fs/selinux", NULL, NULL, MS_BIND, false }, /* Bind mount first */
291 { NULL, "/sys/fs/selinux", NULL, NULL, MS_BIND|MS_RDONLY|MS_REMOUNT, false }, /* Then, make it r/o */
298 for (k = 0; k < ELEMENTSOF(mount_table); k++) {
299 char _cleanup_free_ *where = NULL;
302 if (asprintf(&where, "%s/%s", dest, mount_table[k].where) < 0) {
/* t is the result of the mount point check for the target; it is passed to
 * strerror() in negated form when the check fails. */
311 t = path_is_mount_point(where, true);
313 log_error("Failed to detect whether %s is a mount point: %s", where, strerror(-t));
321 /* Skip this entry if it is not a remount. */
322 if (mount_table[k].what && t > 0)
325 mkdir_p_label(where, 0755);
/* A failing mount() is only treated as an error here when the entry is
 * marked fatal. */
327 if (mount(mount_table[k].what,
330 mount_table[k].flags,
331 mount_table[k].options) < 0 &&
332 mount_table[k].fatal) {
334 log_error("mount(%s) failed: %m", where);
/* Make /etc/localtime below dest a symlink to the zone that the /etc/localtime
 * symlink of the host names, provided that zone exists in the container. */
344 static int setup_timezone(const char *dest) {
345 _cleanup_free_ char *where = NULL, *p = NULL, *q = NULL, *check = NULL, *what = NULL;
351 /* Fix the timezone, if possible */
352 r = readlink_malloc("/etc/localtime", &p);
354 log_warning("/etc/localtime is not a symlink, not updating container timezone.");
/* z is the zone name of the host; presumably path_startswith() yields the
 * part of the link target after the given prefix. The relative and the
 * absolute prefix are both tried. */
358 z = path_startswith(p, "../usr/share/zoneinfo/");
360 z = path_startswith(p, "/usr/share/zoneinfo/");
362 log_warning("/etc/localtime does not point into /usr/share/zoneinfo/, not updating container timezone.");
366 where = strappend(dest, "/etc/localtime");
/* y is derived the same way from the link that currently exists in the
 * container. */
370 r = readlink_malloc(where, &q);
372 y = path_startswith(q, "../usr/share/zoneinfo/");
374 y = path_startswith(q, "/usr/share/zoneinfo/");
377 /* Already pointing to the right place? Then do nothing .. */
378 if (y && streq(y, z))
/* The zone file has to exist inside the container tree. */
382 check = strjoin(dest, "/usr/share/zoneinfo/", z, NULL);
386 if (access(check, F_OK) < 0) {
387 log_warning("Timezone %s does not exist in container, not updating container timezone.", z);
/* The new link target uses the relative form. */
391 what = strappend("../usr/share/zoneinfo/", z);
396 if (symlink(what, where) < 0) {
397 log_error("Failed to correct timezone of container: %m");
/* Bind mount /etc/resolv.conf of the host onto the same path below dest and
 * then remount it read-only. arg_private_network is checked before anything
 * is mounted. */
404 static int setup_resolv_conf(const char *dest) {
409 if (arg_private_network)
412 /* Fix resolv.conf, if possible */
413 where = strappend(dest, "/etc/resolv.conf");
417 /* We don't really care for the results of this really. If it
418 * fails, it fails, but meh... */
419 if (mount("/etc/resolv.conf", where, "bind", MS_BIND, NULL) >= 0)
420 mount("/etc/resolv.conf", where, "bind", MS_BIND|MS_REMOUNT|MS_RDONLY, NULL);
/* Give the container its own boot ID: a random 128 bit ID is written in UUID
 * notation to a file below dest/dev, and that file is bind mounted over
 * proc/sys/kernel/random/boot_id below dest. */
427 static int setup_boot_id(const char *dest) {
428 char _cleanup_free_ *from = NULL, *to = NULL;
435 /* Generate a new randomized boot ID, so that each boot-up of
436 * the container gets a new one */
438 from = strappend(dest, "/dev/proc-sys-kernel-random-boot-id");
439 to = strappend(dest, "/proc/sys/kernel/random/boot_id");
443 r = sd_id128_randomize(&rnd);
445 log_error("Failed to generate random boot id: %s", strerror(-r));
/* Format the ID with dashes, in the 8-4-4-4-12 digit grouping. */
449 snprintf(as_uuid, sizeof(as_uuid),
450 "%02x%02x%02x%02x-%02x%02x-%02x%02x-%02x%02x-%02x%02x%02x%02x%02x%02x",
451 SD_ID128_FORMAT_VAL(rnd));
452 char_array_0(as_uuid);
454 r = write_one_line_file(from, as_uuid);
456 log_error("Failed to write boot id: %s", strerror(-r));
/* Only a failure of the first bind mount is logged; the result of the
 * read-only remount is not checked. */
460 if (mount(from, to, "bind", MS_BIND, NULL) < 0) {
461 log_error("Failed to bind mount boot id: %m");
464 mount(from, to, "bind", MS_BIND|MS_REMOUNT|MS_RDONLY, NULL);
/* Recreate the device nodes named in devnodes below dest/dev, using the mode
 * and device number of the node with the same name in /dev of the host. */
470 static int copy_devnodes(const char *dest) {
472 static const char devnodes[] =
483 mode_t _cleanup_umask_ u;
489 NULSTR_FOREACH(d, devnodes) {
/* from is the node on the host, to is the node to create in the container. */
491 char _cleanup_free_ *from = NULL, *to = NULL;
493 asprintf(&from, "/dev/%s", d);
494 asprintf(&to, "%s/dev/%s", dest, d);
/* A missing host node (ENOENT) is not reported. */
505 if (stat(from, &st) < 0) {
507 if (errno != ENOENT) {
508 log_error("Failed to stat %s: %m", from);
513 } else if (!S_ISCHR(st.st_mode) && !S_ISBLK(st.st_mode)) {
515 log_error("%s is not a char or block device, cannot copy", from);
519 } else if (mknod(to, st.st_mode, st.st_rdev) < 0) {
/* Report the path that was handed to mknod(), not the container root. */
521 log_error("mknod(%s) failed: %m", to);
/* Make the TTY named by console available as dev/console below dest: the TTY
 * is checked to be a character device, its access mode and ownership are set
 * to 0600 and 0:0, a device node is created at the target and the TTY is bind
 * mounted over it. */
530 static int setup_dev_console(const char *dest, const char *console) {
532 char _cleanup_free_ *to = NULL;
534 mode_t _cleanup_umask_ u;
541 if (stat(console, &st) < 0) {
542 log_error("Failed to stat %s: %m", console);
545 } else if (!S_ISCHR(st.st_mode)) {
546 log_error("/dev/console is not a char device");
550 r = chmod_and_chown(console, 0600, 0, 0);
552 log_error("Failed to correct access mode for TTY: %s", strerror(-r));
556 if (asprintf(&to, "%s/dev/console", dest) < 0)
559 /* We need to bind mount the right tty to /dev/console since
560 * ptys can only exist on pts file systems. To have something
561 * to bind mount things on we create a device node first, that
562 * has the right major/minor (note that the major minor
563 * doesn't actually matter here, since we mount it over
566 if (mknod(to, (st.st_mode & ~07777) | 0600, st.st_rdev) < 0) {
567 log_error("mknod() for /dev/console failed: %m");
571 if (mount(console, to, "bind", MS_BIND, NULL) < 0) {
572 log_error("Bind mount for /dev/console failed: %m");
// Create a FIFO at dev/kmsg below dest, bind mount it to proc/kmsg below dest
// and send an open descriptor for it over kmsg_socket.
579 static int setup_kmsg(const char *dest, int kmsg_socket) {
580 char _cleanup_free_ *from = NULL, *to = NULL;
582 mode_t _cleanup_umask_ u;
584 struct cmsghdr cmsghdr;
585 uint8_t buf[CMSG_SPACE(sizeof(int))];
588 struct cmsghdr *cmsg;
591 assert(kmsg_socket >= 0);
595 /* We create the kmsg FIFO as /dev/kmsg, but immediately
596 * delete it after bind mounting it to /proc/kmsg. While FIFOs
597 * on the reading side behave very similar to /proc/kmsg,
598 * their writing side behaves differently from /dev/kmsg in
599 * that writing blocks when nothing is reading. In order to
600 * avoid any problems with containers deadlocking due to this
601 * we simply make /dev/kmsg unavailable to the container. */
602 if (asprintf(&from, "%s/dev/kmsg", dest) < 0 ||
603 asprintf(&to, "%s/proc/kmsg", dest) < 0)
606 if (mkfifo(from, 0600) < 0) {
607 log_error("mkfifo() for /dev/kmsg failed: %m");
611 r = chmod_and_chown(from, 0600, 0, 0);
613 log_error("Failed to correct access mode for /dev/kmsg: %s", strerror(-r));
617 if (mount(from, to, "bind", MS_BIND, NULL) < 0) {
618 log_error("Bind mount for /proc/kmsg failed: %m");
622 fd = open(from, O_RDWR|O_NDELAY|O_CLOEXEC);
624 log_error("Failed to open fifo: %m");
/* Build a control message of type SCM_RIGHTS that carries fd. */
631 mh.msg_control = &control;
632 mh.msg_controllen = sizeof(control);
634 cmsg = CMSG_FIRSTHDR(&mh);
635 cmsg->cmsg_level = SOL_SOCKET;
636 cmsg->cmsg_type = SCM_RIGHTS;
637 cmsg->cmsg_len = CMSG_LEN(sizeof(int));
638 memcpy(CMSG_DATA(cmsg), &fd, sizeof(int));
640 mh.msg_controllen = cmsg->cmsg_len;
642 /* Store away the fd in the socket, so that it stays open as
643 * long as we run the child */
644 k = sendmsg(kmsg_socket, &mh, MSG_DONTWAIT|MSG_NOSIGNAL);
/* The local copy of the descriptor is closed right after sending. */
645 close_nointr_nofail(fd);
648 log_error("Failed to send FIFO fd: %m");
652 /* And now make the FIFO unavailable as /dev/kmsg... */
/* Set the host name from the last path component of arg_directory, after
 * passing it through hostname_cleanup(). */
657 static int setup_hostname(void) {
661 hn = path_get_file_name(arg_directory);
667 hostname_cleanup(hn);
670 if (sethostname(hn, strlen(hn)) < 0)
/* Connect the journal directory of the container with the host according to
 * arg_link_journal. p first names etc/machine-id below directory and later
 * /var/log/journal/ID on the host; q is /var/log/journal/ID below
 * directory. */
679 static int setup_journal(const char *directory) {
680 sd_id128_t machine_id;
681 char _cleanup_free_ *p = NULL, *b = NULL, *q = NULL, *d = NULL;
685 if (arg_link_journal == LINK_NO)
688 p = strappend(directory, "/etc/machine-id");
/* A missing or empty machine ID is tolerated in LINK_AUTO mode. */
692 r = read_one_line_file(p, &b);
693 if (r == -ENOENT && arg_link_journal == LINK_AUTO)
696 log_error("Failed to read machine ID from %s: %s", p, strerror(-r));
701 if (isempty(id) && arg_link_journal == LINK_AUTO)
704 /* Verify validity */
705 r = sd_id128_from_string(id, &machine_id);
707 log_error("Failed to parse machine ID from %s: %s", p, strerror(-r));
712 p = strappend("/var/log/journal/", id);
713 q = strjoin(directory, "/var/log/journal/", id, NULL);
/* Neither path may already be a mount point; in LINK_AUTO mode that case is
 * not logged as an error. */
717 if (path_is_mount_point(p, false) > 0) {
718 if (arg_link_journal != LINK_AUTO) {
719 log_error("%s: already a mount point, refusing to use for journal", p);
726 if (path_is_mount_point(q, false) > 0) {
727 if (arg_link_journal != LINK_AUTO) {
728 log_error("%s: already a mount point, refusing to use for journal", q);
/* Look at what exists at p on the host. The branches below distinguish a
 * successful readlink, -EINVAL (presumably: exists but is not a symlink)
 * and -ENOENT. */
735 r = readlink_and_make_absolute(p, &d);
737 if ((arg_link_journal == LINK_GUEST ||
738 arg_link_journal == LINK_AUTO) &&
741 r = mkdir_p(q, 0755);
743 log_warning("failed to create directory %s: %m", q);
748 log_error("Failed to remove symlink %s: %m", p);
751 } else if (r == -EINVAL) {
753 if (arg_link_journal == LINK_GUEST &&
756 if (errno == ENOTDIR) {
757 log_error("%s already exists and is neither a symlink nor a directory", p);
760 log_error("Failed to remove %s: %m", p);
764 } else if (r != -ENOENT) {
765 log_error("readlink(%s) failed: %m", p);
/* LINK_GUEST: p on the host becomes a symlink to q inside the container
 * tree. */
769 if (arg_link_journal == LINK_GUEST) {
771 if (symlink(q, p) < 0) {
772 log_error("Failed to symlink %s to %s: %m", q, p);
776 r = mkdir_p(q, 0755);
778 log_warning("failed to create directory %s: %m", q);
/* LINK_HOST creates p on the host if needed. */
782 if (arg_link_journal == LINK_HOST) {
783 r = mkdir_p(p, 0755);
785 log_error("Failed to create %s: %m", p);
789 } else if (access(p, F_OK) < 0)
/* q has to be empty before the host directory p is bind mounted onto it. */
792 if (dir_is_empty(q) == 0) {
793 log_error("%s not empty.", q);
797 r = mkdir_p(q, 0755);
799 log_error("Failed to create %s: %m", q);
803 if (mount(p, q, "bind", MS_BIND, NULL) < 0) {
804 log_error("Failed to bind mount journal from host into guest: %m");
/* Pass the complement of arg_retain to capability_bounding_set_drop() and
 * return its result. */
811 static int drop_capabilities(void) {
812 return capability_bounding_set_drop(~arg_retain, false);
/* Tell whether path looks like the root of an OS tree, judged by bin/sh
 * below it. The visible return statement maps a negative r to 0 and
 * everything else to 1. */
815 static int is_os_tree(const char *path) {
818 /* We use /bin/sh as flag file if something is an OS */
820 if (asprintf(&p, "%s/bin/sh", path) < 0)
826 return r < 0 ? 0 : 1;
/* Relay data between stdin/stdout and the pseudo TTY master using an
 * edge-triggered epoll loop, and handle the signals in mask through a
 * signalfd. pid is the process that receives SIGRTMIN+3 on the first SIGTERM
 * when arg_boot is set. */
829 static int process_pty(int master, pid_t pid, sigset_t *mask) {
/* in_buffer holds data on its way from stdin to master, out_buffer data on
 * its way from master to stdout; the *_full counters are the fill levels. */
831 char in_buffer[LINE_MAX], out_buffer[LINE_MAX];
832 size_t in_buffer_full = 0, out_buffer_full = 0;
833 struct epoll_event stdin_ev, stdout_ev, master_ev, signal_ev;
834 bool stdin_readable = false, stdout_writable = false, master_readable = false, master_writable = false;
835 int ep = -1, signal_fd = -1, r;
836 bool tried_orderly_shutdown = false;
842 fd_nonblock(STDIN_FILENO, 1);
843 fd_nonblock(STDOUT_FILENO, 1);
844 fd_nonblock(master, 1);
846 signal_fd = signalfd(-1, mask, SFD_NONBLOCK|SFD_CLOEXEC);
848 log_error("signalfd(): %m");
853 ep = epoll_create1(EPOLL_CLOEXEC);
855 log_error("Failed to create epoll: %m");
860 /* We read from STDIN only if this is actually a TTY,
861 * otherwise we assume non-interactivity. */
862 if (isatty(STDIN_FILENO)) {
864 stdin_ev.events = EPOLLIN|EPOLLET;
865 stdin_ev.data.fd = STDIN_FILENO;
867 if (epoll_ctl(ep, EPOLL_CTL_ADD, STDIN_FILENO, &stdin_ev) < 0) {
868 log_error("Failed to register STDIN in epoll: %m");
/* stdout and master are registered edge-triggered, the signalfd is not. */
875 stdout_ev.events = EPOLLOUT|EPOLLET;
876 stdout_ev.data.fd = STDOUT_FILENO;
879 master_ev.events = EPOLLIN|EPOLLOUT|EPOLLET;
880 master_ev.data.fd = master;
883 signal_ev.events = EPOLLIN;
884 signal_ev.data.fd = signal_fd;
886 if (epoll_ctl(ep, EPOLL_CTL_ADD, STDOUT_FILENO, &stdout_ev) < 0 ||
887 epoll_ctl(ep, EPOLL_CTL_ADD, master, &master_ev) < 0 ||
888 epoll_ctl(ep, EPOLL_CTL_ADD, signal_fd, &signal_ev) < 0) {
889 log_error("Failed to register fds in epoll: %m");
895 struct epoll_event ev[16];
899 nfds = epoll_wait(ep, ev, ELEMENTSOF(ev), -1);
902 if (errno == EINTR || errno == EAGAIN)
905 log_error("epoll_wait(): %m");
/* Record readiness in the four flags; the actual I/O happens further
 * down. */
912 for (i = 0; i < nfds; i++) {
913 if (ev[i].data.fd == STDIN_FILENO) {
915 if (ev[i].events & (EPOLLIN|EPOLLHUP))
916 stdin_readable = true;
918 } else if (ev[i].data.fd == STDOUT_FILENO) {
920 if (ev[i].events & (EPOLLOUT|EPOLLHUP))
921 stdout_writable = true;
923 } else if (ev[i].data.fd == master) {
925 if (ev[i].events & (EPOLLIN|EPOLLHUP))
926 master_readable = true;
928 if (ev[i].events & (EPOLLOUT|EPOLLHUP))
929 master_writable = true;
931 } else if (ev[i].data.fd == signal_fd) {
932 struct signalfd_siginfo sfsi;
/* One signalfd_siginfo record is read per event. */
935 n = read(signal_fd, &sfsi, sizeof(sfsi));
936 if (n != sizeof(sfsi)) {
939 log_error("Failed to read from signalfd: invalid block size");
944 if (errno != EINTR && errno != EAGAIN) {
945 log_error("Failed to read from signalfd: %m");
951 if (sfsi.ssi_signo == SIGWINCH) {
954 /* The window size changed, let's forward that. */
955 if (ioctl(STDIN_FILENO, TIOCGWINSZ, &ws) >= 0)
956 ioctl(master, TIOCSWINSZ, &ws);
957 } else if (sfsi.ssi_signo == SIGTERM && arg_boot && !tried_orderly_shutdown) {
959 log_info("Trying to halt container. Send SIGTERM again to trigger immediate termination.");
961 /* This only works for systemd... */
962 tried_orderly_shutdown = true;
963 kill(pid, SIGRTMIN+3);
/* Keep copying while some direction can make progress: a readable source
 * with an empty buffer, or a writable sink with pending data. */
973 while ((stdin_readable && in_buffer_full <= 0) ||
974 (master_writable && in_buffer_full > 0) ||
975 (master_readable && out_buffer_full <= 0) ||
976 (stdout_writable && out_buffer_full > 0)) {
/* stdin to in_buffer. */
978 if (stdin_readable && in_buffer_full < LINE_MAX) {
980 k = read(STDIN_FILENO, in_buffer + in_buffer_full, LINE_MAX - in_buffer_full);
/* These errno values only clear the readiness flag of the direction. */
983 if (errno == EAGAIN || errno == EPIPE || errno == ECONNRESET || errno == EIO)
984 stdin_readable = false;
986 log_error("read(): %m");
991 in_buffer_full += (size_t) k;
/* in_buffer to master. */
994 if (master_writable && in_buffer_full > 0) {
996 k = write(master, in_buffer, in_buffer_full);
999 if (errno == EAGAIN || errno == EPIPE || errno == ECONNRESET || errno == EIO)
1000 master_writable = false;
1002 log_error("write(): %m");
/* Drop the k bytes just written from the front of the buffer. */
1008 assert(in_buffer_full >= (size_t) k);
1009 memmove(in_buffer, in_buffer + k, in_buffer_full - k);
1010 in_buffer_full -= k;
/* master to out_buffer. */
1014 if (master_readable && out_buffer_full < LINE_MAX) {
1016 k = read(master, out_buffer + out_buffer_full, LINE_MAX - out_buffer_full);
1019 if (errno == EAGAIN || errno == EPIPE || errno == ECONNRESET || errno == EIO)
1020 master_readable = false;
1022 log_error("read(): %m");
1027 out_buffer_full += (size_t) k;
/* out_buffer to stdout. */
1030 if (stdout_writable && out_buffer_full > 0) {
1032 k = write(STDOUT_FILENO, out_buffer, out_buffer_full);
1035 if (errno == EAGAIN || errno == EPIPE || errno == ECONNRESET || errno == EIO)
1036 stdout_writable = false;
1038 log_error("write(): %m");
1044 assert(out_buffer_full >= (size_t) k);
1045 memmove(out_buffer, out_buffer + k, out_buffer_full - k);
1046 out_buffer_full -= k;
/* Cleanup of the epoll and signal descriptors. */
1054 close_nointr_nofail(ep);
1057 close_nointr_nofail(signal_fd);
/* Entry point. Parses the command line, validates the container directory,
 * sets up a cgroup and a pseudo TTY, and then clones a child into new
 * namespaces. The child prepares the container tree and executes the
 * payload; the parent relays TTY traffic through process_pty() and finally
 * evaluates how the child ended. */
1062 int main(int argc, char *argv[]) {
1064 int r = EXIT_FAILURE, k;
1065 char *oldcg = NULL, *newcg = NULL;
1066 char **controller = NULL;
1067 int master = -1, n_fd_passed;
1068 const char *console = NULL;
1069 struct termios saved_attr, raw_attr;
1071 bool saved_attr_valid = false;
1073 int kmsg_socket_pair[2] = { -1, -1 };
1076 log_parse_environment();
1079 r = parse_argv(argc, argv);
/* A directory from the command line is made absolute;
 * get_current_dir_name() supplies the directory in the other case. */
1083 if (arg_directory) {
1086 p = path_make_absolute_cwd(arg_directory);
1087 free(arg_directory);
1090 arg_directory = get_current_dir_name();
1092 if (!arg_directory) {
1093 log_error("Failed to determine path");
1097 path_kill_slashes(arg_directory);
/* Preconditions: effective uid 0, a systemd host, a directory other than /
 * that passes is_os_tree(). */
1099 if (geteuid() != 0) {
1100 log_error("Need to be root.");
1104 if (sd_booted() <= 0) {
1105 log_error("Not running on a systemd system.");
1109 if (path_equal(arg_directory, "/")) {
1110 log_error("Spawning container on root directory not supported.");
1114 if (is_os_tree(arg_directory) <= 0) {
1115 log_error("Directory %s doesn't look like an OS root directory. Refusing.", arg_directory);
/* File descriptors announced by sd_listen_fds() are collected in fds. */
1120 n_fd_passed = sd_listen_fds(false);
1121 if (n_fd_passed > 0) {
1122 k = fdset_new_listen_fds(&fds, false);
1124 log_error("Failed to collect file descriptors: %s", strerror(-k));
1128 fdset_close_others(fds);
/* Create a cgroup named nspawn-PID below the current one and attach to it,
 * also in every controller listed in arg_controllers (a failure there only
 * produces a warning). */
1131 k = cg_get_by_pid(SYSTEMD_CGROUP_CONTROLLER, 0, &oldcg);
1133 log_error("Failed to determine current cgroup: %s", strerror(-k));
1137 if (asprintf(&newcg, "%s/nspawn-%lu", oldcg, (unsigned long) getpid()) < 0) {
1138 log_error("Failed to allocate cgroup path.");
1142 k = cg_create_and_attach(SYSTEMD_CGROUP_CONTROLLER, newcg, 0);
1144 log_error("Failed to create cgroup: %s", strerror(-k));
1148 STRV_FOREACH(controller, arg_controllers) {
1149 k = cg_create_and_attach(*controller, newcg, 0);
1151 log_warning("Failed to create cgroup in controller %s: %s", *controller, strerror(-k));
/* Allocate the pseudo TTY whose slave side serves as the container
 * console, and copy the current window size to it. */
1154 master = posix_openpt(O_RDWR|O_NOCTTY|O_CLOEXEC|O_NDELAY);
1156 log_error("Failed to acquire pseudo tty: %m");
1160 console = ptsname(master);
1162 log_error("Failed to determine tty name: %m");
1166 log_info("Spawning namespace container on %s (console is %s).", arg_directory, console);
1168 if (ioctl(STDIN_FILENO, TIOCGWINSZ, &ws) >= 0)
1169 ioctl(master, TIOCSWINSZ, &ws);
1171 if (unlockpt(master) < 0) {
1172 log_error("Failed to unlock tty: %m");
/* Remember the terminal attributes of stdin and prepare a raw variant
 * without echo. */
1176 if (tcgetattr(STDIN_FILENO, &saved_attr) >= 0) {
1177 saved_attr_valid = true;
1179 raw_attr = saved_attr;
1180 cfmakeraw(&raw_attr);
1181 raw_attr.c_lflag &= ~ECHO;
/* Socket pair whose second end is handed to setup_kmsg() in the child. */
1184 if (socketpair(AF_UNIX, SOCK_DGRAM|SOCK_NONBLOCK|SOCK_CLOEXEC, 0, kmsg_socket_pair) < 0) {
1185 log_error("Failed to create kmsg socket pair");
/* Block the signals that process_pty() later receives through its
 * signalfd. */
1189 assert_se(sigemptyset(&mask) == 0);
1190 sigset_add_many(&mask, SIGCHLD, SIGWINCH, SIGTERM, SIGINT, -1);
1191 assert_se(sigprocmask(SIG_BLOCK, &mask, NULL) == 0);
1196 if (saved_attr_valid) {
1197 if (tcsetattr(STDIN_FILENO, TCSANOW, &raw_attr) < 0) {
1198 log_error("Failed to set terminal attributes: %m");
/* The child gets new IPC, mount, PID and UTS namespaces, plus a network
 * namespace when arg_private_network is set. */
1203 pid = syscall(__NR_clone, SIGCHLD|CLONE_NEWIPC|CLONE_NEWNS|CLONE_NEWPID|CLONE_NEWUTS|(arg_private_network ? CLONE_NEWNET : 0), NULL);
1205 if (errno == EINVAL)
1206 log_error("clone() failed, do you have namespace support enabled in your kernel? (You need UTS, IPC, PID and NET namespacing built in): %m");
1208 log_error("clone() failed: %m");
1216 const char *home = NULL;
1217 uid_t uid = (uid_t) -1;
1218 gid_t gid = (gid_t) -1;
/* Environment of the payload; the NULL slots are filled in further down. */
1220 const char *envp[] = {
1221 "PATH=/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin",
1222 "container=systemd-nspawn", /* LXC sets container=lxc, so follow the scheme here */
1227 NULL, /* container_uuid */
1228 NULL, /* LISTEN_FDS */
1229 NULL, /* LISTEN_PID */
1233 envp[2] = strv_find_prefix(environ, "TERM=");
/* Close the inherited descriptors, reset signal handling and open the
 * console as stdin, then duplicate it to stdout and stderr. */
1236 close_nointr_nofail(master);
1239 close_nointr(STDIN_FILENO);
1240 close_nointr(STDOUT_FILENO);
1241 close_nointr(STDERR_FILENO);
1243 close_nointr_nofail(kmsg_socket_pair[0]);
1244 kmsg_socket_pair[0] = -1;
1246 reset_all_signal_handlers();
1248 assert_se(sigemptyset(&mask) == 0);
1249 assert_se(sigprocmask(SIG_SETMASK, &mask, NULL) == 0);
1251 k = open_terminal(console, O_RDWR);
1252 if (k != STDIN_FILENO) {
1254 close_nointr_nofail(k);
1258 log_error("Failed to open console: %s", strerror(-k));
1262 if (dup2(STDIN_FILENO, STDOUT_FILENO) != STDOUT_FILENO ||
1263 dup2(STDIN_FILENO, STDERR_FILENO) != STDERR_FILENO) {
1264 log_error("Failed to duplicate console: %m");
1269 log_error("setsid() failed: %m");
/* Request SIGKILL for the child when its parent goes away. */
1273 if (prctl(PR_SET_PDEATHSIG, SIGKILL) < 0) {
1274 log_error("PR_SET_PDEATHSIG failed: %m");
1278 /* Mark everything as slave, so that we still
1279 * receive mounts from the real root, but don't
1280 * propagate mounts to the real root. */
1281 if (mount(NULL, "/", NULL, MS_SLAVE|MS_REC, NULL) < 0) {
1282 log_error("MS_SLAVE|MS_REC failed: %m");
1286 /* Turn directory into bind mount */
1287 if (mount(arg_directory, arg_directory, "bind", MS_BIND|MS_REC, NULL) < 0) {
1288 log_error("Failed to make bind mount.");
1293 if (mount(arg_directory, arg_directory, "bind", MS_BIND|MS_REMOUNT|MS_RDONLY|MS_REC, NULL) < 0) {
1294 log_error("Failed to make read-only.");
/* Populate the container tree through the setup helpers of this file. */
1298 if (mount_all(arg_directory) < 0)
1301 if (copy_devnodes(arg_directory) < 0)
1304 dev_setup(arg_directory);
1306 if (setup_dev_console(arg_directory, console) < 0)
1309 if (setup_kmsg(arg_directory, kmsg_socket_pair[1]) < 0)
1312 close_nointr_nofail(kmsg_socket_pair[1]);
1313 kmsg_socket_pair[1] = -1;
1315 if (setup_boot_id(arg_directory) < 0)
1318 if (setup_timezone(arg_directory) < 0)
1321 if (setup_resolv_conf(arg_directory) < 0)
1324 if (setup_journal(arg_directory) < 0)
/* Move the container tree to / and chroot into it. */
1327 if (chdir(arg_directory) < 0) {
1328 log_error("chdir(%s) failed: %m", arg_directory);
1332 if (mount(arg_directory, "/", NULL, MS_MOVE, NULL) < 0) {
1333 log_error("mount(MS_MOVE) failed: %m");
1337 if (chroot(".") < 0) {
1338 log_error("chroot() failed: %m");
1342 if (chdir("/") < 0) {
1343 log_error("chdir() failed: %m");
1351 if (drop_capabilities() < 0) {
1352 log_error("drop_capabilities() failed: %m");
1358 /* Note that this resolves user names
1359 * inside the container, and hence
1360 * accesses the NSS modules from the
1361 * container and not the host. This is
1364 if (get_user_creds((const char**)&arg_user, &uid, &gid, &home, NULL) < 0) {
1365 log_error("get_user_creds() failed: %m");
1369 if (mkdir_parents_label(home, 0775) < 0) {
1370 log_error("mkdir_parents_label() failed: %m");
1374 if (mkdir_safe_label(home, 0775, uid, gid) < 0) {
1375 log_error("mkdir_safe_label() failed: %m");
1379 if (initgroups((const char*)arg_user, gid) < 0) {
1380 log_error("initgroups() failed: %m");
1384 if (setresgid(gid, gid, gid) < 0) {
1385 log_error("setresgid() failed: %m");
1389 if (setresuid(uid, uid, uid) < 0) {
1390 log_error("setresuid() failed: %m");
1394 /* Reset everything fully to 0, just in case */
1396 if (setgroups(0, NULL) < 0) {
1397 log_error("setgroups() failed: %m");
1401 if (setresgid(0, 0, 0) < 0) {
1402 log_error("setresgid() failed: %m");
1406 if (setresuid(0, 0, 0) < 0) {
1407 log_error("setresuid() failed: %m");
/* Fill the reserved envp slots; n_env is the index of the next free one. */
1412 if ((asprintf((char**)(envp + n_env++), "HOME=%s", home ? home: "/root") < 0) ||
1413 (asprintf((char**)(envp + n_env++), "USER=%s", arg_user ? arg_user : "root") < 0) ||
1414 (asprintf((char**)(envp + n_env++), "LOGNAME=%s", arg_user ? arg_user : "root") < 0)) {
1420 if (asprintf((char**)(envp + n_env++), "container_uuid=%s", arg_uuid) < 0) {
/* Passed descriptors lose O_CLOEXEC and are announced to the payload through
 * LISTEN_FDS and LISTEN_PID. */
1426 if (fdset_size(fds) > 0) {
1427 k = fdset_cloexec(fds, false);
1429 log_error("Failed to unset O_CLOEXEC for file descriptors.");
1433 if ((asprintf((char **)(envp + n_env++), "LISTEN_FDS=%u", n_fd_passed) < 0) ||
1434 (asprintf((char **)(envp + n_env++), "LISTEN_PID=%lu", (unsigned long) getpid()) < 0)) {
1446 /* Automatically search for the init system */
/* a holds the init path followed by the remaining command line arguments;
 * three init locations are tried in turn. */
1448 l = 1 + argc - optind;
1449 a = newa(char*, l + 1);
1450 memcpy(a + 1, argv + optind, l * sizeof(char*));
1452 a[0] = (char*) "/usr/lib/systemd/systemd";
1453 execve(a[0], a, (char**) envp);
1455 a[0] = (char*) "/lib/systemd/systemd";
1456 execve(a[0], a, (char**) envp);
1458 a[0] = (char*) "/sbin/init";
1459 execve(a[0], a, (char**) envp);
/* A command given on the command line is executed as is. */
1460 } else if (argc > optind)
1461 execvpe(argv[optind], argv + optind, (char**) envp);
/* Otherwise /bin/bash is started from the home directory. */
1463 chdir(home ? home : "/root");
1464 execle("/bin/bash", "-bash", NULL, (char**) envp);
1467 log_error("execv() failed: %m");
1470 _exit(EXIT_FAILURE);
/* Parent: relay TTY traffic until process_pty() returns, restore the
 * terminal and wait for the child. */
1476 if (process_pty(master, pid, &mask) < 0)
1479 if (saved_attr_valid)
1480 tcsetattr(STDIN_FILENO, TCSANOW, &saved_attr);
1482 r = wait_for_terminate(pid, &status);
/* Translate how the child ended into a log message; a non-zero exit status
 * is stored in r. */
1488 if (status.si_code == CLD_EXITED) {
1489 if (status.si_status != 0) {
1490 log_error("Container failed with error code %i.", status.si_status);
1491 r = status.si_status;
1495 log_debug("Container exited successfully.");
1497 } else if (status.si_code == CLD_KILLED &&
1498 status.si_status == SIGINT) {
1499 log_info("Container has been shut down.");
1502 } else if (status.si_code == CLD_KILLED &&
1503 status.si_status == SIGHUP) {
1504 log_info("Container is being rebooted.");
1506 } else if (status.si_code == CLD_KILLED ||
1507 status.si_code == CLD_DUMPED) {
1509 log_error("Container terminated by signal %s.", signal_to_string(status.si_status));
1513 log_error("Container failed due to unknown reason.");
/* Cleanup: restore the terminal, close descriptors, attach to the old cgroup
 * again and call cg_kill_recursive_and_wait() on the new one. */
1520 if (saved_attr_valid)
1521 tcsetattr(STDIN_FILENO, TCSANOW, &saved_attr);
1524 close_nointr_nofail(master);
1526 close_pipe(kmsg_socket_pair);
1529 cg_attach(SYSTEMD_CGROUP_CONTROLLER, oldcg, 0);
1532 cg_kill_recursive_and_wait(SYSTEMD_CGROUP_CONTROLLER, newcg, true);
1534 free(arg_directory);
1535 strv_free(arg_controllers);