1 /*-*- Mode: C; c-basic-offset: 8; indent-tabs-mode: nil -*-*/
4 This file is part of systemd.
6 Copyright 2010 Lennart Poettering
8 systemd is free software; you can redistribute it and/or modify it
9 under the terms of the GNU Lesser General Public License as published by
10 the Free Software Foundation; either version 2.1 of the License, or
11 (at your option) any later version.
13 systemd is distributed in the hope that it will be useful, but
14 WITHOUT ANY WARRANTY; without even the implied warranty of
15 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
16 Lesser General Public License for more details.
18 You should have received a copy of the GNU Lesser General Public License
19 along with systemd; If not, see <http://www.gnu.org/licenses/>.
25 #include <sys/types.h>
26 #include <sys/syscall.h>
27 #include <sys/mount.h>
33 #include <sys/prctl.h>
34 #include <sys/capability.h>
36 #include <sys/epoll.h>
38 #include <sys/signalfd.h>
42 #include <sys/socket.h>
44 #include <systemd/sd-daemon.h>
51 #include "cgroup-util.h"
53 #include "path-util.h"
54 #include "loopback-setup.h"
/* Journal linking mode selected with --link-journal=. The enumerators are not
 * part of this excerpt; code below uses LINK_NO, LINK_AUTO, LINK_GUEST and
 * LINK_HOST. */
57 typedef enum LinkJournal {
/* Command-line settings; parse_argv() fills them in from the options that
 * help() lists. */
64 static char *arg_directory = NULL;
65 static char *arg_user = NULL;
66 static char **arg_controllers = NULL;
67 static char *arg_uuid = NULL;
68 static bool arg_private_network = false;
69 static bool arg_read_only = false;
70 static bool arg_boot = false;
71 static LinkJournal arg_link_journal = LINK_AUTO;
/* Bitmask of the capabilities the container keeps, one bit per capability
 * number. --capability= ORs further bits in, and drop_capabilities() hands the
 * complement of this mask to capability_bounding_set_drop(). */
72 static uint64_t arg_retain =
74 (1ULL << CAP_DAC_OVERRIDE) |
75 (1ULL << CAP_DAC_READ_SEARCH) |
76 (1ULL << CAP_FOWNER) |
77 (1ULL << CAP_FSETID) |
78 (1ULL << CAP_IPC_OWNER) |
81 (1ULL << CAP_LINUX_IMMUTABLE) |
82 (1ULL << CAP_NET_BIND_SERVICE) |
83 (1ULL << CAP_NET_BROADCAST) |
84 (1ULL << CAP_NET_RAW) |
85 (1ULL << CAP_SETGID) |
86 (1ULL << CAP_SETFCAP) |
87 (1ULL << CAP_SETPCAP) |
88 (1ULL << CAP_SETUID) |
89 (1ULL << CAP_SYS_ADMIN) |
90 (1ULL << CAP_SYS_CHROOT) |
91 (1ULL << CAP_SYS_NICE) |
92 (1ULL << CAP_SYS_PTRACE) |
93 (1ULL << CAP_SYS_TTY_CONFIG) |
94 (1ULL << CAP_SYS_RESOURCE);
/* Print the usage summary for the command-line options to stdout.
 * The program name shown is program_invocation_short_name. */
96 static int help(void) {
98 printf("%s [OPTIONS...] [PATH] [ARGUMENTS...]\n\n"
99 "Spawn a minimal namespace container for debugging, testing and building.\n\n"
100 " -h --help Show this help\n"
101 " -D --directory=NAME Root directory for the container\n"
102 " -b --boot Boot up full system (i.e. invoke init)\n"
103 " -u --user=USER Run the command under specified user or uid\n"
104 " -C --controllers=LIST Put the container in specified comma-separated cgroup hierarchies\n"
105 " --uuid=UUID Set a specific machine UUID for the container\n"
106 " --private-network Disable network in container\n"
107 " --read-only Mount the root directory read-only\n"
108 " --capability=CAP In addition to the default, retain specified capability\n"
109 " --link-journal=MODE Link up guest journal, one of no, auto, guest, host\n"
/* The argument-less journal shortcut in parse_argv() assigns LINK_GUEST, so
 * -j is described as the guest mode here. */
110 " -j Equivalent to --link-journal=guest\n",
111 program_invocation_short_name);
/* Parse the command line into the arg_* settings.
 *
 * argc/argv are the process arguments as received by main(). Allocation
 * failures and unparsable option values are reported with log_error().
 * NOTE(review): the return statements are outside this excerpt; main() stores
 * the result in r -- confirm the exact convention there. */
116 static int parse_argv(int argc, char *argv[]) {
/* ARG_PRIVATE_NETWORK is 0x100, above any single-character option code that
 * getopt_long() can return. */
119 ARG_PRIVATE_NETWORK = 0x100,
126 static const struct option options[] = {
127 { "help", no_argument, NULL, 'h' },
128 { "directory", required_argument, NULL, 'D' },
129 { "user", required_argument, NULL, 'u' },
130 { "controllers", required_argument, NULL, 'C' },
131 { "private-network", no_argument, NULL, ARG_PRIVATE_NETWORK },
132 { "boot", no_argument, NULL, 'b' },
133 { "uuid", required_argument, NULL, ARG_UUID },
134 { "read-only", no_argument, NULL, ARG_READ_ONLY },
135 { "capability", required_argument, NULL, ARG_CAPABILITY },
136 { "link-journal", required_argument, NULL, ARG_LINK_JOURNAL },
/* The leading '+' in the option string makes getopt_long() stop at the first
 * non-option argument, so everything after it is left untouched for main(),
 * which passes argv + optind to the container command. */
145 while ((c = getopt_long(argc, argv, "+hD:u:C:bj", options, NULL)) >= 0) {
/* Root directory: keep the canonicalized form of the argument. */
155 arg_directory = canonicalize_file_name(optarg);
156 if (!arg_directory) {
157 log_error("Failed to canonicalize root directory.");
/* User: keep a private copy of the argument. */
165 if (!(arg_user = strdup(optarg))) {
166 log_error("Failed to duplicate user name.");
/* Controllers: drop any earlier list, split the argument at commas, then run
 * the result through strv_uniq(). */
173 strv_free(arg_controllers);
174 arg_controllers = strv_split(optarg, ",");
175 if (!arg_controllers) {
176 log_error("Failed to split controllers list.");
179 strv_uniq(arg_controllers);
183 case ARG_PRIVATE_NETWORK:
184 arg_private_network = true;
196 arg_read_only = true;
199 case ARG_CAPABILITY: {
/* Each comma-separated word is looked up with cap_from_name() and its bit is
 * added to arg_retain. */
203 FOREACH_WORD_SEPARATOR(word, length, optarg, ",", state) {
207 t = strndup(word, length);
211 if (cap_from_name(t, &cap) < 0) {
212 log_error("Failed to parse capability %s.", t);
218 arg_retain |= 1ULL << (uint64_t) cap;
/* NOTE(review): the case label for this assignment is outside the excerpt;
 * presumably this is the argument-less -j shortcut. */
225 arg_link_journal = LINK_GUEST;
228 case ARG_LINK_JOURNAL:
229 if (streq(optarg, "auto"))
230 arg_link_journal = LINK_AUTO;
231 else if (streq(optarg, "no"))
232 arg_link_journal = LINK_NO;
233 else if (streq(optarg, "guest"))
234 arg_link_journal = LINK_GUEST;
235 else if (streq(optarg, "host"))
236 arg_link_journal = LINK_HOST;
/* Any other mode string is reported as an error. */
238 log_error("Failed to parse link journal mode %s", optarg);
248 log_error("Unknown option code %c", c);
/* Mount the file systems listed in mount_table inside the container tree
 * rooted at dest.
 *
 * Each entry is mounted at dest joined with the entry's target path. A failing
 * mount() is only reported when the entry is marked fatal.
 * NOTE(review): the return statements are outside this excerpt; main() treats
 * a negative result as failure. */
256 static int mount_all(const char *dest) {
258 typedef struct MountPoint {
/* Columns, as used by the loop below: what, where, (presumably) file system
 * type, options, flags, fatal -- the struct members themselves are outside
 * this excerpt.
 * Read-only bind mounts appear as two rows: a plain MS_BIND followed by an
 * MS_BIND|MS_REMOUNT|MS_RDONLY of the same target. */
267 static const MountPoint mount_table[] = {
268 { "proc", "/proc", "proc", NULL, MS_NOSUID|MS_NOEXEC|MS_NODEV, true },
269 { "/proc/sys", "/proc/sys", "bind", NULL, MS_BIND, true }, /* Bind mount first */
270 { "/proc/sys", "/proc/sys", "bind", NULL, MS_BIND|MS_RDONLY|MS_REMOUNT, true }, /* Then, make it r/o */
271 { "/sys", "/sys", "bind", NULL, MS_BIND, true }, /* Bind mount first */
272 { "/sys", "/sys", "bind", NULL, MS_BIND|MS_RDONLY|MS_REMOUNT, true }, /* Then, make it r/o */
273 { "tmpfs", "/dev", "tmpfs", "mode=755", MS_NOSUID|MS_STRICTATIME, true },
274 { "/dev/pts", "/dev/pts", "bind", NULL, MS_BIND, true },
275 { "tmpfs", "/run", "tmpfs", "mode=755", MS_NOSUID|MS_NODEV|MS_STRICTATIME, true },
277 { "/sys/fs/selinux", "/sys/fs/selinux", "bind", NULL, MS_BIND, false }, /* Bind mount first */
278 { "/sys/fs/selinux", "/sys/fs/selinux", "bind", NULL, MS_BIND|MS_RDONLY|MS_REMOUNT, false }, /* Then, make it r/o */
286 for (k = 0; k < ELEMENTSOF(mount_table); k++) {
/* where = dest + "/" + the entry's target path. */
289 if (asprintf(&where, "%s/%s", dest, mount_table[k].where) < 0) {
/* A failure is reported through strerror(-t), i.e. t carries a negated errno.
 * NOTE(review): what happens when the path already is a mount point is not
 * visible in this excerpt. */
298 t = path_is_mount_point(where, false);
300 log_error("Failed to detect whether %s is a mount point: %s", where, strerror(-t));
/* Create the mount point; the result of mkdir_p_label() is not checked. */
309 mkdir_p_label(where, 0755);
/* The error branch is only taken when the mount fails AND the entry is fatal. */
311 if (mount(mount_table[k].what,
314 mount_table[k].flags,
315 mount_table[k].options) < 0 &&
316 mount_table[k].fatal) {
318 log_error("mount(%s) failed: %m", where);
/* Bind mount the host's /etc/localtime and /etc/timezone over the same paths
 * below dest and, when that works, remount each bind mount read-only.
 * The mounts are best effort: a failing bind mount just skips the remount, and
 * the remount result is not checked. */
330 static int setup_timezone(const char *dest) {
335 /* Fix the timezone, if possible */
336 if (asprintf(&where, "%s/etc/localtime", dest) < 0)
339 if (mount("/etc/localtime", where, "bind", MS_BIND, NULL) >= 0)
340 mount("/etc/localtime", where, "bind", MS_BIND|MS_REMOUNT|MS_RDONLY, NULL);
344 if (asprintf(&where, "%s/etc/timezone", dest) < 0)
347 if (mount("/etc/timezone", where, "bind", MS_BIND, NULL) >= 0)
348 mount("/etc/timezone", where, "bind", MS_BIND|MS_REMOUNT|MS_RDONLY, NULL);
/* Bind mount the host's /etc/resolv.conf over dest/etc/resolv.conf and, when
 * that works, remount it read-only. Mount failures are not reported. */
355 static int setup_resolv_conf(const char *dest) {
/* NOTE(review): the statement guarded by this check is outside the excerpt;
 * presumably the function returns early when the container has a private
 * network. */
360 if (arg_private_network)
363 /* Fix resolv.conf, if possible */
364 if (asprintf(&where, "%s/etc/resolv.conf", dest) < 0) {
368 if (mount("/etc/resolv.conf", where, "bind", MS_BIND, NULL) >= 0)
369 mount("/etc/resolv.conf", where, "bind", MS_BIND|MS_REMOUNT|MS_RDONLY, NULL);
/* Recreate a fixed list of host device nodes below dest/dev.
 *
 * For every name in devnodes the host node /dev/<name> is stat()ed and a node
 * with the same mode and device number is created at dest/dev/<name>.
 * NOTE(review): the list contents and the return statements are outside this
 * excerpt; main() treats a negative result as failure. */
376 static int copy_devnodes(const char *dest) {
378 static const char devnodes[] =
396 NULSTR_FOREACH(d, devnodes) {
398 char *from = NULL, *to = NULL;
/* NOTE(review): the asprintf() return values are not checked here; the
 * allocation check that follows is outside this excerpt. */
400 asprintf(&from, "/dev/%s", d);
401 asprintf(&to, "%s/dev/%s", dest, d);
404 log_error("Failed to allocate devnode path");
/* A host node that does not exist (ENOENT) is not logged; any other stat()
 * failure is. */
417 if (stat(from, &st) < 0) {
419 if (errno != ENOENT) {
420 log_error("Failed to stat %s: %m", from);
425 } else if (!S_ISCHR(st.st_mode) && !S_ISBLK(st.st_mode)) {
427 log_error("%s is not a char or block device, cannot copy.", from);
431 } else if (mknod(to, st.st_mode, st.st_rdev) < 0) {
/* Name the node that could not be created, not the container root. */
433 log_error("mknod(%s) failed: %m", to);
/* Make the tty named by console available as dest/dev/console.
 *
 * console must be a character device. Its access mode is set to 0600 with
 * owner and group 0, a device node of the same type and device number (mode
 * 0600) is created at dest/dev/console, and console is then bind mounted over
 * that node. Every failing step is logged.
 * NOTE(review): the return statements are outside this excerpt; main() treats
 * a negative result as failure. */
447 static int setup_dev_console(const char *dest, const char *console) {
458 if (stat(console, &st) < 0) {
459 log_error("Failed to stat %s: %m", console);
463 } else if (!S_ISCHR(st.st_mode)) {
464 log_error("/dev/console is not a char device.");
469 r = chmod_and_chown(console, 0600, 0, 0);
471 log_error("Failed to correct access mode for TTY: %s", strerror(-r));
475 if (asprintf(&to, "%s/dev/console", dest) < 0) {
480 /* We need to bind mount the right tty to /dev/console since
481 * ptys can only exist on pts file systems. To have something
482 * to bind mount things on we create a device node first, that
483 * has the right major/minor (note that the major minor
484 * doesn't actually matter here, since we mount it over
487 if (mknod(to, (st.st_mode & ~07777) | 0600, st.st_rdev) < 0) {
488 log_error("mknod() for /dev/console failed: %m");
493 if (mount(console, to, "bind", MS_BIND, NULL) < 0) {
494 log_error("Bind mount for /dev/console failed: %m");
/* Provide a stand-in for /proc/kmsg inside the container.
 *
 * A FIFO is created at dest/dev/kmsg and bind mounted onto dest/proc/kmsg. The
 * FIFO is then opened and its descriptor is sent over kmsg_socket as an
 * SCM_RIGHTS message.
 * dest is the container root; kmsg_socket must be a valid socket descriptor.
 * NOTE(review): the return statements are outside this excerpt; main() treats
 * a negative result as failure. */
506 static int setup_kmsg(const char *dest, int kmsg_socket) {
507 char *from = NULL, *to = NULL;
/* Control-message buffer sized for exactly one file descriptor. */
511 struct cmsghdr cmsghdr;
512 uint8_t buf[CMSG_SPACE(sizeof(int))];
515 struct cmsghdr *cmsg;
518 assert(kmsg_socket >= 0);
522 /* We create the kmsg FIFO as /dev/kmsg, but immediately
523 * delete it after bind mounting it to /proc/kmsg. While FIFOs
524 * on the reading side behave very similar to /proc/kmsg,
525 * their writing side behaves differently from /dev/kmsg in
526 * that writing blocks when nothing is reading. In order to
527 * avoid any problems with containers deadlocking due to this
528 * we simply make /dev/kmsg unavailable to the container. */
529 if (asprintf(&from, "%s/dev/kmsg", dest) < 0) {
534 if (asprintf(&to, "%s/proc/kmsg", dest) < 0) {
539 if (mkfifo(from, 0600) < 0) {
540 log_error("mkfifo() for /dev/kmsg failed: %m");
545 r = chmod_and_chown(from, 0600, 0, 0);
547 log_error("Failed to correct access mode for /dev/kmsg: %s", strerror(-r));
551 if (mount(from, to, "bind", MS_BIND, NULL) < 0) {
552 log_error("Bind mount for /proc/kmsg failed: %m");
/* Opened read-write and non-blocking. */
557 fd = open(from, O_RDWR|O_NDELAY|O_CLOEXEC);
559 log_error("Failed to open fifo: %m");
/* Build a single SCM_RIGHTS control message that carries fd. */
567 mh.msg_control = &control;
568 mh.msg_controllen = sizeof(control);
570 cmsg = CMSG_FIRSTHDR(&mh);
571 cmsg->cmsg_level = SOL_SOCKET;
572 cmsg->cmsg_type = SCM_RIGHTS;
573 cmsg->cmsg_len = CMSG_LEN(sizeof(int));
574 memcpy(CMSG_DATA(cmsg), &fd, sizeof(int));
576 mh.msg_controllen = cmsg->cmsg_len;
578 /* Store away the fd in the socket, so that it stays open as
579 * long as we run the child */
580 k = sendmsg(kmsg_socket, &mh, MSG_DONTWAIT|MSG_NOSIGNAL);
/* The local descriptor is closed whether or not the send succeeded; k is
 * examined afterwards. */
581 close_nointr_nofail(fd);
584 log_error("Failed to send FIFO fd: %m");
589 /* And now make the FIFO unavailable as /dev/kmsg... */
/* Set the hostname from the name of the container directory.
 * The name comes from path_get_file_name(arg_directory) (presumably the last
 * path component), is passed through hostname_cleanup() and then handed to
 * sethostname().
 * NOTE(review): intermediate steps and return statements are outside this
 * excerpt. */
600 static int setup_hostname(void) {
604 hn = path_get_file_name(arg_directory);
610 hostname_cleanup(hn);
613 if (sethostname(hn, strlen(hn)) < 0)
/* Connect the journal directories of host and container according to
 * arg_link_journal.
 *
 * directory is the container root. The machine ID is read from
 * directory/etc/machine-id and names two paths:
 *   p = /var/log/journal/<id>              (on the host)
 *   q = directory/var/log/journal/<id>     (inside the container tree)
 * LINK_GUEST turns p into a symlink pointing at q. The final steps create q
 * and bind mount p onto it. In LINK_AUTO mode a missing or empty machine ID is
 * handled separately from the error path.
 * NOTE(review): several branches and all return statements are outside this
 * excerpt; main() treats a negative result as failure. */
622 static int setup_journal(const char *directory) {
623 sd_id128_t machine_id;
624 char *p = NULL, *b = NULL, *l, *q = NULL, *d = NULL;
627 if (arg_link_journal == LINK_NO)
630 p = strappend(directory, "/etc/machine-id");
636 r = read_one_line_file(p, &b);
637 if (r == -ENOENT && arg_link_journal == LINK_AUTO) {
641 log_error("Failed to read machine ID: %s", strerror(-r));
646 if (isempty(l) && arg_link_journal == LINK_AUTO) {
651 /* Verify validity */
652 r = sd_id128_from_string(l, &machine_id);
654 log_error("Failed to parse machine ID: %s", strerror(-r));
659 p = strappend("/var/log/journal/", l);
660 q = strjoin(directory, "/var/log/journal/", l, NULL);
/* An existing mount on either path is an error unless the mode is LINK_AUTO. */
666 if (path_is_mount_point(p, false) > 0 ||
667 path_is_mount_point(q, false) > 0) {
668 if (arg_link_journal != LINK_AUTO) {
669 log_error("Journal already a mount point, refusing.");
/* Inspect what currently exists at the host path p: the branches below
 * distinguish a readable symlink, -EINVAL, and errors other than -ENOENT. */
678 r = readlink_and_make_absolute(p, &d);
680 if ((arg_link_journal == LINK_GUEST ||
681 arg_link_journal == LINK_AUTO) &&
691 log_error("Failed to remove symlink %s: %m", p);
695 } else if (r == -EINVAL) {
697 if (arg_link_journal == LINK_GUEST &&
700 if (errno == ENOTDIR)
701 log_error("%s already exists and is neither symlink nor directory.", p);
703 log_error("Failed to remove %s: %m", p);
709 } else if (r != -ENOENT) {
710 log_error("readlink(%s) failed: %m", p);
/* Guest mode: the host path becomes a symlink to the container's directory. */
714 if (arg_link_journal == LINK_GUEST) {
716 if (symlink(q, p) < 0) {
717 log_error("Failed to symlink %s to %s: %m", q, p);
/* Host mode creates the host directory; otherwise its existence is probed
 * with access(). */
728 if (arg_link_journal == LINK_HOST) {
729 r = mkdir_p(p, 0755);
731 log_error("Failed to create %s: %m", p);
735 } else if (access(p, F_OK) < 0) {
/* A container journal directory for which dir_is_empty() returns 0 is
 * reported as not empty. */
740 if (dir_is_empty(q) == 0) {
741 log_error("%s not empty.", q);
746 r = mkdir_p(q, 0755);
748 log_error("Failed to create %s: %m", q);
752 if (mount(p, q, "bind", MS_BIND, NULL) < 0) {
753 log_error("Failed to bind mount journal from host into guest: %m");
/* Pass the complement of arg_retain, i.e. every capability bit that is not to
 * be retained, to capability_bounding_set_drop() and return its result. */
769 static int drop_capabilities(void) {
770 return capability_bounding_set_drop(~arg_retain, false);
/* Heuristic check whether path looks like the root of an OS installation.
 * Returns 1 or 0 depending on a check made on path/bin/sh.
 * NOTE(review): the check that sets r and the asprintf() failure return are
 * outside this excerpt. */
773 static int is_os_tree(const char *path) {
776 /* We use /bin/sh as flag file if something is an OS */
778 if (asprintf(&p, "%s/bin/sh", path) < 0)
/* A negative r maps to 0, anything else to 1. */
784 return r < 0 ? 0 : 1;
/* Relay data between our own stdin/stdout and the pty master, and forward
 * window size changes.
 *
 * master is the pty master descriptor; mask is the signal set that is turned
 * into a signalfd. stdin, stdout and master are switched to non-blocking mode
 * and watched edge-triggered through epoll; the signalfd is watched
 * level-triggered. Bytes read from stdin are buffered in in_buffer and written
 * to master; bytes read from master are buffered in out_buffer and written to
 * stdout.
 * NOTE(review): the loop exit conditions and return statements are outside
 * this excerpt; main() treats a negative result as failure. */
787 static int process_pty(int master, sigset_t *mask) {
789 char in_buffer[LINE_MAX], out_buffer[LINE_MAX];
790 size_t in_buffer_full = 0, out_buffer_full = 0;
791 struct epoll_event stdin_ev, stdout_ev, master_ev, signal_ev;
/* With edge-triggered epoll a readiness change is reported once, so these
 * flags remember it until a read/write fails with EAGAIN (or a similar
 * error) and clears the flag again. */
792 bool stdin_readable = false, stdout_writable = false, master_readable = false, master_writable = false;
793 int ep = -1, signal_fd = -1, r;
795 fd_nonblock(STDIN_FILENO, 1);
796 fd_nonblock(STDOUT_FILENO, 1);
797 fd_nonblock(master, 1);
799 signal_fd = signalfd(-1, mask, SFD_NONBLOCK|SFD_CLOEXEC);
801 log_error("signalfd(): %m");
806 ep = epoll_create1(EPOLL_CLOEXEC);
808 log_error("Failed to create epoll: %m");
814 stdin_ev.events = EPOLLIN|EPOLLET;
815 stdin_ev.data.fd = STDIN_FILENO;
818 stdout_ev.events = EPOLLOUT|EPOLLET;
819 stdout_ev.data.fd = STDOUT_FILENO;
822 master_ev.events = EPOLLIN|EPOLLOUT|EPOLLET;
823 master_ev.data.fd = master;
826 signal_ev.events = EPOLLIN;
827 signal_ev.data.fd = signal_fd;
829 if (epoll_ctl(ep, EPOLL_CTL_ADD, STDIN_FILENO, &stdin_ev) < 0 ||
830 epoll_ctl(ep, EPOLL_CTL_ADD, STDOUT_FILENO, &stdout_ev) < 0 ||
831 epoll_ctl(ep, EPOLL_CTL_ADD, master, &master_ev) < 0 ||
832 epoll_ctl(ep, EPOLL_CTL_ADD, signal_fd, &signal_ev) < 0) {
833 log_error("Failed to register fds in epoll: %m");
839 struct epoll_event ev[16];
843 nfds = epoll_wait(ep, ev, ELEMENTSOF(ev), -1);
846 if (errno == EINTR || errno == EAGAIN)
849 log_error("epoll_wait(): %m");
/* Translate the reported events into the readiness flags; a hang-up counts as
 * readable/writable. */
856 for (i = 0; i < nfds; i++) {
857 if (ev[i].data.fd == STDIN_FILENO) {
859 if (ev[i].events & (EPOLLIN|EPOLLHUP))
860 stdin_readable = true;
862 } else if (ev[i].data.fd == STDOUT_FILENO) {
864 if (ev[i].events & (EPOLLOUT|EPOLLHUP))
865 stdout_writable = true;
867 } else if (ev[i].data.fd == master) {
869 if (ev[i].events & (EPOLLIN|EPOLLHUP))
870 master_readable = true;
872 if (ev[i].events & (EPOLLOUT|EPOLLHUP))
873 master_writable = true;
875 } else if (ev[i].data.fd == signal_fd) {
876 struct signalfd_siginfo sfsi;
879 n = read(signal_fd, &sfsi, sizeof(sfsi));
880 if (n != sizeof(sfsi)) {
883 log_error("Failed to read from signalfd: invalid block size");
888 if (errno != EINTR && errno != EAGAIN) {
889 log_error("Failed to read from signalfd: %m");
895 if (sfsi.ssi_signo == SIGWINCH) {
898 /* The window size changed, let's forward that. */
899 if (ioctl(STDIN_FILENO, TIOCGWINSZ, &ws) >= 0)
900 ioctl(master, TIOCSWINSZ, &ws);
/* Keep shovelling while some transfer can make progress: a source is readable
 * and its buffer is empty, or a sink is writable and its buffer holds data. */
909 while ((stdin_readable && in_buffer_full <= 0) ||
910 (master_writable && in_buffer_full > 0) ||
911 (master_readable && out_buffer_full <= 0) ||
912 (stdout_writable && out_buffer_full > 0)) {
/* stdin -> in_buffer */
914 if (stdin_readable && in_buffer_full < LINE_MAX) {
916 k = read(STDIN_FILENO, in_buffer + in_buffer_full, LINE_MAX - in_buffer_full);
919 if (errno == EAGAIN || errno == EPIPE || errno == ECONNRESET || errno == EIO)
920 stdin_readable = false;
922 log_error("read(): %m");
927 in_buffer_full += (size_t) k;
/* in_buffer -> master; after a partial write the unwritten tail is moved to
 * the front of the buffer. */
930 if (master_writable && in_buffer_full > 0) {
932 k = write(master, in_buffer, in_buffer_full);
935 if (errno == EAGAIN || errno == EPIPE || errno == ECONNRESET || errno == EIO)
936 master_writable = false;
938 log_error("write(): %m");
944 assert(in_buffer_full >= (size_t) k);
945 memmove(in_buffer, in_buffer + k, in_buffer_full - k);
/* master -> out_buffer */
950 if (master_readable && out_buffer_full < LINE_MAX) {
952 k = read(master, out_buffer + out_buffer_full, LINE_MAX - out_buffer_full);
955 if (errno == EAGAIN || errno == EPIPE || errno == ECONNRESET || errno == EIO)
956 master_readable = false;
958 log_error("read(): %m");
963 out_buffer_full += (size_t) k;
/* out_buffer -> stdout; same partial-write handling as above. */
966 if (stdout_writable && out_buffer_full > 0) {
968 k = write(STDOUT_FILENO, out_buffer, out_buffer_full);
971 if (errno == EAGAIN || errno == EPIPE || errno == ECONNRESET || errno == EIO)
972 stdout_writable = false;
974 log_error("write(): %m");
980 assert(out_buffer_full >= (size_t) k);
981 memmove(out_buffer, out_buffer + k, out_buffer_full - k);
982 out_buffer_full -= k;
/* Cleanup: release the epoll instance and the signalfd. */
990 close_nointr_nofail(ep);
993 close_nointr_nofail(signal_fd);
/* Entry point: spawn a namespace container in arg_directory and relay its
 * console.
 *
 * Visible flow: parse options, validate the directory, create and join a
 * dedicated cgroup, allocate a pty, put our terminal into raw mode, then
 * clone() into new namespaces. The child prepares the container file system,
 * drops capabilities, optionally switches user and executes init, the given
 * command or a shell. The parent relays the pty through process_pty(), waits
 * for the child and cleans up.
 * NOTE(review): the branch separating child from parent, the cleanup label
 * and the final return are outside this excerpt. */
998 int main(int argc, char *argv[]) {
1000 int r = EXIT_FAILURE, k;
1001 char *oldcg = NULL, *newcg = NULL;
1002 char **controller = NULL;
1004 const char *console = NULL;
1005 struct termios saved_attr, raw_attr;
1007 bool saved_attr_valid = false;
1009 int kmsg_socket_pair[2] = { -1, -1 };
1011 log_parse_environment();
1014 r = parse_argv(argc, argv);
/* Make the directory absolute, or fall back to the current directory when
 * none was given. */
1018 if (arg_directory) {
1021 p = path_make_absolute_cwd(arg_directory);
1022 free(arg_directory);
1025 arg_directory = get_current_dir_name();
1027 if (!arg_directory) {
1028 log_error("Failed to determine path");
1032 path_kill_slashes(arg_directory);
/* Preconditions: root privileges, a systemd-booted host, and a directory that
 * is not "/" and passes is_os_tree(). */
1034 if (geteuid() != 0) {
1035 log_error("Need to be root.");
1039 if (sd_booted() <= 0) {
1040 log_error("Not running on a systemd system.");
1044 if (path_equal(arg_directory, "/")) {
1045 log_error("Spawning container on root directory not supported.");
1049 if (is_os_tree(arg_directory) <= 0) {
1050 log_error("Directory %s doesn't look like an OS root directory. Refusing.", arg_directory);
/* Create a child cgroup named nspawn-<pid> below our current one and move
 * into it. */
1054 k = cg_get_by_pid(SYSTEMD_CGROUP_CONTROLLER, 0, &oldcg);
1056 log_error("Failed to determine current cgroup: %s", strerror(-k));
1060 if (asprintf(&newcg, "%s/nspawn-%lu", oldcg, (unsigned long) getpid()) < 0) {
1061 log_error("Failed to allocate cgroup path.");
1065 k = cg_create_and_attach(SYSTEMD_CGROUP_CONTROLLER, newcg, 0);
1067 log_error("Failed to create cgroup: %s", strerror(-k));
/* Additional controllers from --controllers=: a failure only produces a
 * warning. */
1071 STRV_FOREACH(controller, arg_controllers) {
1072 k = cg_create_and_attach(*controller, newcg, 0);
1074 log_warning("Failed to create cgroup in controller %s: %s", *controller, strerror(-k));
/* Allocate the pty whose slave side becomes the container's console. */
1077 master = posix_openpt(O_RDWR|O_NOCTTY|O_CLOEXEC|O_NDELAY);
1079 log_error("Failed to acquire pseudo tty: %m");
1083 console = ptsname(master);
1085 log_error("Failed to determine tty name: %m");
1089 log_info("Spawning namespace container on %s (console is %s).", arg_directory, console);
/* Give the pty the same window size as our own terminal, if it has one. */
1091 if (ioctl(STDIN_FILENO, TIOCGWINSZ, &ws) >= 0)
1092 ioctl(master, TIOCSWINSZ, &ws);
1094 if (unlockpt(master) < 0) {
1095 log_error("Failed to unlock tty: %m");
/* Save our terminal settings so they can be restored later, then switch the
 * terminal to raw mode with echo off. */
1099 if (tcgetattr(STDIN_FILENO, &saved_attr) < 0) {
1100 log_error("Failed to get terminal attributes: %m");
1104 saved_attr_valid = true;
1106 raw_attr = saved_attr;
1107 cfmakeraw(&raw_attr);
1108 raw_attr.c_lflag &= ~ECHO;
1110 if (tcsetattr(STDIN_FILENO, TCSANOW, &raw_attr) < 0) {
1111 log_error("Failed to set terminal attributes: %m");
/* Socket pair used by setup_kmsg() to park the kmsg FIFO descriptor. */
1115 if (socketpair(AF_UNIX, SOCK_DGRAM|SOCK_NONBLOCK|SOCK_CLOEXEC, 0, kmsg_socket_pair) < 0) {
1116 log_error("Failed to create kmsg socket pair: %m");
/* Block these signals; process_pty() receives them through a signalfd built
 * from the same mask. */
1120 assert_se(sigemptyset(&mask) == 0);
1121 sigset_add_many(&mask, SIGCHLD, SIGWINCH, SIGTERM, SIGINT, -1);
1122 assert_se(sigprocmask(SIG_BLOCK, &mask, NULL) == 0);
/* Raw clone syscall with a NULL stack; a network namespace is only requested
 * with --private-network. */
1124 pid = syscall(__NR_clone, SIGCHLD|CLONE_NEWIPC|CLONE_NEWNS|CLONE_NEWPID|CLONE_NEWUTS|(arg_private_network ? CLONE_NEWNET : 0), NULL);
1126 if (errno == EINVAL)
1127 log_error("clone() failed, do you have namespace support enabled in your kernel? (You need UTS, IPC, PID and NET namespacing built in): %m");
1129 log_error("clone() failed: %m");
/* From here to _exit(): code that evidently runs in the child (it closes
 * master and the inherited stdio and ends in exec/_exit). */
1137 const char *home = NULL;
1138 uid_t uid = (uid_t) -1;
1139 gid_t gid = (gid_t) -1;
/* Environment for the payload; index 2 receives TERM and indices 3 to 6 are
 * filled in further down (HOME, USER, LOGNAME, container_uuid). */
1140 const char *envp[] = {
1141 "PATH=/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin",
1142 "container=systemd-nspawn", /* LXC sets container=lxc, so follow the scheme here */
1147 NULL, /* container_uuid */
1151 envp[2] = strv_find_prefix(environ, "TERM=");
1153 close_nointr_nofail(master);
1155 close_nointr(STDIN_FILENO);
1156 close_nointr(STDOUT_FILENO);
1157 close_nointr(STDERR_FILENO);
/* Presumably closes every descriptor except the one listed. */
1159 close_all_fds(&kmsg_socket_pair[1], 1);
1161 reset_all_signal_handlers();
1163 assert_se(sigemptyset(&mask) == 0);
1164 assert_se(sigprocmask(SIG_SETMASK, &mask, NULL) == 0);
/* stdin was just closed, so the console is expected to open as descriptor 0;
 * it is then duplicated onto stdout and stderr. */
1166 if (open_terminal(console, O_RDWR) != STDIN_FILENO ||
1167 dup2(STDIN_FILENO, STDOUT_FILENO) != STDOUT_FILENO ||
1168 dup2(STDIN_FILENO, STDERR_FILENO) != STDERR_FILENO)
1172 log_error("setsid() failed: %m");
/* Have the kernel send SIGKILL to this process when its parent dies. */
1176 if (prctl(PR_SET_PDEATHSIG, SIGKILL) < 0) {
1177 log_error("PR_SET_PDEATHSIG failed: %m");
1181 /* Mark / as private, in case somebody marked it shared */
1182 if (mount(NULL, "/", NULL, MS_PRIVATE|MS_REC, NULL) < 0) {
1183 log_error("MS_PRIVATE|MS_REC failed: %m");
1187 /* Turn directory into bind mount */
1188 if (mount(arg_directory, arg_directory, "bind", MS_BIND, NULL) < 0) {
1189 log_error("Failed to make bind mount: %m");
1194 if (mount(arg_directory, arg_directory, "bind", MS_BIND|MS_REMOUNT|MS_RDONLY, NULL) < 0) {
1195 log_error("Failed to make read-only: %m");
/* Populate the container tree; each helper logs its own failures. */
1199 if (mount_all(arg_directory) < 0)
1202 if (copy_devnodes(arg_directory) < 0)
1205 if (setup_dev_console(arg_directory, console) < 0)
1208 if (setup_kmsg(arg_directory, kmsg_socket_pair[1]) < 0)
1211 close_nointr_nofail(kmsg_socket_pair[1]);
1213 if (setup_timezone(arg_directory) < 0)
1216 if (setup_resolv_conf(arg_directory) < 0)
1219 if (setup_journal(arg_directory) < 0)
/* Switch the root: enter the directory, move its mount onto "/", chroot into
 * it and go to the new "/". */
1222 if (chdir(arg_directory) < 0) {
1223 log_error("chdir(%s) failed: %m", arg_directory);
1227 if (mount(arg_directory, "/", "bind", MS_MOVE, NULL) < 0) {
1228 log_error("mount(MS_MOVE) failed: %m");
1232 if (chroot(".") < 0) {
1233 log_error("chroot() failed: %m");
1237 if (chdir("/") < 0) {
1238 log_error("chdir() failed: %m");
1246 if (drop_capabilities() < 0) {
1247 log_error("drop_capabilities() failed: %m");
/* User switch: resolve credentials, create the home directory, then set
 * supplementary groups, gid and uid in that order.
 * NOTE(review): the condition guarding this part is outside the excerpt;
 * presumably it only applies when --user= was given. */
1253 if (get_user_creds((const char**)&arg_user, &uid, &gid, &home, NULL) < 0) {
1254 log_error("get_user_creds() failed: %m");
1258 if (mkdir_parents_label(home, 0775) < 0) {
1259 log_error("mkdir_parents_label() failed: %m");
1263 if (mkdir_safe_label(home, 0775, uid, gid) < 0) {
1264 log_error("mkdir_safe_label() failed: %m");
1268 if (initgroups((const char*)arg_user, gid) < 0) {
1269 log_error("initgroups() failed: %m");
1273 if (setresgid(gid, gid, gid) < 0) {
1274 log_error("setresgid() failed: %m");
1278 if (setresuid(uid, uid, uid) < 0) {
1279 log_error("setresuid() failed: %m");
/* Fill the remaining environment slots; root defaults apply when no user was
 * resolved. */
1284 if ((asprintf((char**)(envp + 3), "HOME=%s", home ? home: "/root") < 0) ||
1285 (asprintf((char**)(envp + 4), "USER=%s", arg_user ? arg_user : "root") < 0) ||
1286 (asprintf((char**)(envp + 5), "LOGNAME=%s", arg_user ? arg_user : "root") < 0)) {
1292 if (asprintf((char**)(envp + 6), "container_uuid=%s", arg_uuid) < 0) {
1304 /* Automatically search for the init system */
/* a[0] is reserved for the init path; the l pointers copied behind it are the
 * remaining arguments plus argv's terminating NULL. */
1306 l = 1 + argc - optind;
1307 a = newa(char*, l + 1);
1308 memcpy(a + 1, argv + optind, l * sizeof(char*));
/* execve() only returns on failure, so the candidates are tried in order. */
1310 a[0] = (char*) "/usr/lib/systemd/systemd";
1311 execve(a[0], a, (char**) envp);
1313 a[0] = (char*) "/lib/systemd/systemd";
1314 execve(a[0], a, (char**) envp);
1316 a[0] = (char*) "/sbin/init";
1317 execve(a[0], a, (char**) envp);
1318 } else if (argc > optind)
1319 execvpe(argv[optind], argv + optind, (char**) envp);
/* No command given: start bash with "-bash" as argv[0] from the home
 * directory. */
1321 chdir(home ? home : "/root");
1322 execle("/bin/bash", "-bash", NULL, (char**) envp);
1325 log_error("execv() failed: %m");
1328 _exit(EXIT_FAILURE);
/* Parent: relay the console, then restore our terminal before waiting for
 * the child. */
1331 if (process_pty(master, &mask) < 0)
1334 if (saved_attr_valid) {
1335 tcsetattr(STDIN_FILENO, TCSANOW, &saved_attr);
1336 saved_attr_valid = false;
1339 r = wait_for_terminate_and_warn(argc > optind ? argv[optind] : "bash", pid);
/* Cleanup: restore the terminal if that has not happened yet, close the
 * descriptors, return to the original cgroup and kill whatever is left in the
 * container's cgroup. */
1345 if (saved_attr_valid)
1346 tcsetattr(STDIN_FILENO, TCSANOW, &saved_attr);
1349 close_nointr_nofail(master);
1351 close_pipe(kmsg_socket_pair);
1354 cg_attach(SYSTEMD_CGROUP_CONTROLLER, oldcg, 0);
1357 cg_kill_recursive_and_wait(SYSTEMD_CGROUP_CONTROLLER, newcg, true);
1359 free(arg_directory);
1360 strv_free(arg_controllers);