1 /*-*- Mode: C; c-basic-offset: 8; indent-tabs-mode: nil -*-*/
4 This file is part of systemd.
6 Copyright 2010 Lennart Poettering
8 systemd is free software; you can redistribute it and/or modify it
9 under the terms of the GNU Lesser General Public License as published by
10 the Free Software Foundation; either version 2.1 of the License, or
11 (at your option) any later version.
13 systemd is distributed in the hope that it will be useful, but
14 WITHOUT ANY WARRANTY; without even the implied warranty of
15 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
16 Lesser General Public License for more details.
18 You should have received a copy of the GNU Lesser General Public License
19 along with systemd; If not, see <http://www.gnu.org/licenses/>.
25 #include <sys/types.h>
26 #include <sys/syscall.h>
27 #include <sys/mount.h>
33 #include <sys/prctl.h>
34 #include <sys/capability.h>
36 #include <sys/epoll.h>
38 #include <sys/signalfd.h>
42 #include <sys/socket.h>
44 #include <systemd/sd-daemon.h>
51 #include "cgroup-util.h"
53 #include "path-util.h"
54 #include "loopback-setup.h"
56 #include "dev-setup.h"
/* Journal linking mode: the type of arg_link_journal, chosen with
 * --link-journal= (LINK_NO, LINK_AUTO, LINK_GUEST or LINK_HOST).
 * NOTE(review): the enumerator list itself is not part of this excerpt. */
58 typedef enum LinkJournal {
/* Command line settings with their defaults. Most are visibly assigned in
 * parse_argv(); main() frees arg_directory and arg_controllers on exit. */
65 static char *arg_directory = NULL;
66 static char *arg_user = NULL;
67 static char **arg_controllers = NULL;
68 static char *arg_uuid = NULL;
69 static bool arg_private_network = false;
70 static bool arg_read_only = false;
71 static bool arg_boot = false;
72 static LinkJournal arg_link_journal = LINK_AUTO;
/* Bit mask (one bit per CAP_* number) of capabilities the container keeps:
 * drop_capabilities() passes the complement to the bounding-set drop, and
 * --capability= ORs further bits in.
 * NOTE(review): the embedded line numbers skip 74 and 80-81, so the mask
 * shown here may not be the complete default set. */
73 static uint64_t arg_retain =
75 (1ULL << CAP_DAC_OVERRIDE) |
76 (1ULL << CAP_DAC_READ_SEARCH) |
77 (1ULL << CAP_FOWNER) |
78 (1ULL << CAP_FSETID) |
79 (1ULL << CAP_IPC_OWNER) |
82 (1ULL << CAP_LINUX_IMMUTABLE) |
83 (1ULL << CAP_NET_BIND_SERVICE) |
84 (1ULL << CAP_NET_BROADCAST) |
85 (1ULL << CAP_NET_RAW) |
86 (1ULL << CAP_SETGID) |
87 (1ULL << CAP_SETFCAP) |
88 (1ULL << CAP_SETPCAP) |
89 (1ULL << CAP_SETUID) |
90 (1ULL << CAP_SYS_ADMIN) |
91 (1ULL << CAP_SYS_CHROOT) |
92 (1ULL << CAP_SYS_NICE) |
93 (1ULL << CAP_SYS_PTRACE) |
94 (1ULL << CAP_SYS_TTY_CONFIG) |
95 (1ULL << CAP_SYS_RESOURCE);
/* Print the usage summary for the program (named via
 * program_invocation_short_name) to stdout.
 *
 * The -j line documents --link-journal=guest: the only short-option handler
 * in parse_argv() that sets the journal mode stores LINK_GUEST, so the
 * earlier "host" wording described behaviour the option does not have.
 * NOTE(review): the return statement is not part of this excerpt. */
97 static int help(void) {
99 printf("%s [OPTIONS...] [PATH] [ARGUMENTS...]\n\n"
100 "Spawn a minimal namespace container for debugging, testing and building.\n\n"
101 " -h --help Show this help\n"
102 " -D --directory=NAME Root directory for the container\n"
103 " -b --boot Boot up full system (i.e. invoke init)\n"
104 " -u --user=USER Run the command under specified user or uid\n"
105 " -C --controllers=LIST Put the container in specified comma-separated cgroup hierarchies\n"
106 " --uuid=UUID Set a specific machine UUID for the container\n"
107 " --private-network Disable network in container\n"
108 " --read-only Mount the root directory read-only\n"
109 " --capability=CAP In addition to the default, retain specified capability\n"
110 " --link-journal=MODE Link up guest journal, one of no, auto, guest, host\n"
111 " -j Equivalent to --link-journal=guest\n",
112 program_invocation_short_name);
/* Parse the command line with getopt_long() and store the results in the
 * file-scope arg_* settings.
 *
 * Short options are "+hD:u:C:bj"; options that only have a long form use
 * ARG_* codes starting at 0x100.
 * NOTE(review): several case labels, break/return statements and error
 * checks are missing from this excerpt, so the handlers below are labelled
 * from the option table and their log messages where the label is not shown. */
117 static int parse_argv(int argc, char *argv[]) {
120 ARG_PRIVATE_NETWORK = 0x100,
127 static const struct option options[] = {
128 { "help", no_argument, NULL, 'h' },
129 { "directory", required_argument, NULL, 'D' },
130 { "user", required_argument, NULL, 'u' },
131 { "controllers", required_argument, NULL, 'C' },
132 { "private-network", no_argument, NULL, ARG_PRIVATE_NETWORK },
133 { "boot", no_argument, NULL, 'b' },
134 { "uuid", required_argument, NULL, ARG_UUID },
135 { "read-only", no_argument, NULL, ARG_READ_ONLY },
136 { "capability", required_argument, NULL, ARG_CAPABILITY },
137 { "link-journal", required_argument, NULL, ARG_LINK_JOURNAL },
146 while ((c = getopt_long(argc, argv, "+hD:u:C:bj", options, NULL)) >= 0) {
/* Root directory (presumably the 'D' case): stored in canonical form. */
156 arg_directory = canonicalize_file_name(optarg);
157 if (!arg_directory) {
158 log_error("Failed to canonicalize root directory.");
/* User name (presumably the 'u' case): kept as a private copy. */
166 if (!(arg_user = strdup(optarg))) {
167 log_error("Failed to duplicate user name.");
/* Controller list: any earlier list is freed, the argument is split on
 * commas and strv_uniq() is applied to the result. */
174 strv_free(arg_controllers);
175 arg_controllers = strv_split(optarg, ",");
176 if (!arg_controllers) {
177 log_error("Failed to split controllers list.");
180 strv_uniq(arg_controllers);
184 case ARG_PRIVATE_NETWORK:
185 arg_private_network = true;
197 arg_read_only = true;
200 case ARG_CAPABILITY: {
/* Each comma-separated word is copied, looked up with cap_from_name()
 * and its bit is added to arg_retain. */
204 FOREACH_WORD_SEPARATOR(word, length, optarg, ",", state) {
208 t = strndup(word, length);
212 if (cap_from_name(t, &cap) < 0) {
213 log_error("Failed to parse capability %s.", t);
219 arg_retain |= 1ULL << (uint64_t) cap;
/* NOTE(review): case label not shown; presumably the 'j' short option,
 * the only remaining short option that could select a journal mode. */
226 arg_link_journal = LINK_GUEST;
229 case ARG_LINK_JOURNAL:
230 if (streq(optarg, "auto"))
231 arg_link_journal = LINK_AUTO;
232 else if (streq(optarg, "no"))
233 arg_link_journal = LINK_NO;
234 else if (streq(optarg, "guest"))
235 arg_link_journal = LINK_GUEST;
236 else if (streq(optarg, "host"))
237 arg_link_journal = LINK_HOST;
239 log_error("Failed to parse link journal mode %s", optarg);
249 log_error("Unknown option code %c", c);
/* Mount the file systems listed in mount_table underneath `dest`.
 *
 * Each entry is mounted at "<dest>/<where>". Entries flagged MS_REMOUNT
 * with a NULL source follow a bind mount of the same target and switch it
 * to read-only. A failing mount() is only logged when the entry's `fatal`
 * field is true; the two /sys/fs/selinux entries are the non-fatal ones.
 * NOTE(review): the MountPoint field list, the handling of the
 * path_is_mount_point() result and the return statements are not part of
 * this excerpt. */
257 static int mount_all(const char *dest) {
259 typedef struct MountPoint {
268 static const MountPoint mount_table[] = {
269 { "proc", "/proc", "proc", NULL, MS_NOSUID|MS_NOEXEC|MS_NODEV, true },
270 { "/proc/sys", "/proc/sys", NULL, NULL, MS_BIND, true }, /* Bind mount first */
271 { NULL, "/proc/sys", NULL, NULL, MS_BIND|MS_RDONLY|MS_REMOUNT, true }, /* Then, make it r/o */
272 { "/sys", "/sys", NULL, NULL, MS_BIND, true }, /* Bind mount first */
273 { NULL, "/sys", NULL, NULL, MS_BIND|MS_RDONLY|MS_REMOUNT, true }, /* Then, make it r/o */
274 { "tmpfs", "/dev", "tmpfs", "mode=755", MS_NOSUID|MS_STRICTATIME, true },
275 { "/dev/pts", "/dev/pts", NULL, NULL, MS_BIND, true },
276 { "tmpfs", "/run", "tmpfs", "mode=755", MS_NOSUID|MS_NODEV|MS_STRICTATIME, true },
278 { "/sys/fs/selinux", "/sys/fs/selinux", NULL, NULL, MS_BIND, false }, /* Bind mount first */
279 { NULL, "/sys/fs/selinux", NULL, NULL, MS_BIND|MS_RDONLY|MS_REMOUNT, false }, /* Then, make it r/o */
287 for (k = 0; k < ELEMENTSOF(mount_table); k++) {
/* Target path inside the container tree. */
290 if (asprintf(&where, "%s/%s", dest, mount_table[k].where) < 0) {
299 t = path_is_mount_point(where, false);
301 log_error("Failed to detect whether %s is a mount point: %s", where, strerror(-t));
/* Create the mount point directory; the result is not checked here. */
310 mkdir_p_label(where, 0755);
312 if (mount(mount_table[k].what,
315 mount_table[k].flags,
316 mount_table[k].options) < 0 &&
317 mount_table[k].fatal) {
319 log_error("mount(%s) failed: %m", where);
/* Bind mount the host's /etc/localtime and /etc/timezone onto the same
 * paths below `dest`, then remount each one read-only.
 *
 * This is best effort: the read-only remount is only attempted when the
 * bind mount succeeded, and neither mount() failure is logged in the lines
 * shown.
 * NOTE(review): the asprintf() failure handling and the return statements
 * are not part of this excerpt. */
331 static int setup_timezone(const char *dest) {
336 /* Fix the timezone, if possible */
337 if (asprintf(&where, "%s/etc/localtime", dest) < 0)
340 if (mount("/etc/localtime", where, "bind", MS_BIND, NULL) >= 0)
341 mount("/etc/localtime", where, "bind", MS_BIND|MS_REMOUNT|MS_RDONLY, NULL);
345 if (asprintf(&where, "%s/etc/timezone", dest) < 0)
348 if (mount("/etc/timezone", where, "bind", MS_BIND, NULL) >= 0)
349 mount("/etc/timezone", where, "bind", MS_BIND|MS_REMOUNT|MS_RDONLY, NULL);
/* Bind mount the host's /etc/resolv.conf onto "<dest>/etc/resolv.conf" and
 * remount it read-only if the bind mount succeeded. mount() failures are
 * not logged in the lines shown.
 *
 * NOTE(review): the body of the arg_private_network branch is not part of
 * this excerpt; presumably it returns early because the container has no
 * network in that mode. */
356 static int setup_resolv_conf(const char *dest) {
361 if (arg_private_network)
364 /* Fix resolv.conf, if possible */
365 if (asprintf(&where, "%s/etc/resolv.conf", dest) < 0) {
369 if (mount("/etc/resolv.conf", where, "bind", MS_BIND, NULL) >= 0)
370 mount("/etc/resolv.conf", where, "bind", MS_BIND|MS_REMOUNT|MS_RDONLY, NULL);
/* Recreate a fixed set of host device nodes inside the container.
 *
 * For every name in the NUL-separated `devnodes` list, the host node
 * "/dev/<name>" is stat()ed and a node with the same mode and device number
 * is created at "<dest>/dev/<name>" with mknod(). A host node that does not
 * exist (ENOENT) is not reported as an error; anything that is neither a
 * character nor a block device is rejected.
 *
 * The mknod() failure message prints `to`, the path actually passed to
 * mknod(), rather than the container root `dest`.
 * NOTE(review): the devnodes list, the allocation check and the return
 * statements are not part of this excerpt. */
377 static int copy_devnodes(const char *dest) {
379 static const char devnodes[] =
397 NULSTR_FOREACH(d, devnodes) {
399 char *from = NULL, *to = NULL;
401 asprintf(&from, "/dev/%s", d);
402 asprintf(&to, "%s/dev/%s", dest, d);
405 log_error("Failed to allocate devnode path");
418 if (stat(from, &st) < 0) {
420 if (errno != ENOENT) {
421 log_error("Failed to stat %s: %m", from);
426 } else if (!S_ISCHR(st.st_mode) && !S_ISBLK(st.st_mode)) {
428 log_error("%s is not a char or block device, cannot copy.", from);
432 } else if (mknod(to, st.st_mode, st.st_rdev) < 0) {
434 log_error("mknod(%s) failed: %m", to);
/* Make the tty `console` appear as "<dest>/dev/console".
 *
 * The visible steps are: stat() `console` and require a character device,
 * set its mode to 0600 and owner to 0:0 via chmod_and_chown(), create a
 * device node at "<dest>/dev/console" with the same type and device number
 * (mode 0600), and bind mount `console` on top of that node.
 * NOTE(review): return statements, cleanup and the end of the in-body
 * comment are not part of this excerpt. */
448 static int setup_dev_console(const char *dest, const char *console) {
459 if (stat(console, &st) < 0) {
460 log_error("Failed to stat %s: %m", console);
464 } else if (!S_ISCHR(st.st_mode)) {
465 log_error("/dev/console is not a char device.");
470 r = chmod_and_chown(console, 0600, 0, 0);
472 log_error("Failed to correct access mode for TTY: %s", strerror(-r));
476 if (asprintf(&to, "%s/dev/console", dest) < 0) {
481 /* We need to bind mount the right tty to /dev/console since
482 * ptys can only exist on pts file systems. To have something
483 * to bind mount things on we create a device node first, that
484 * has the right major/minor (note that the major minor
485 * doesn't actually matter here, since we mount it over
488 if (mknod(to, (st.st_mode & ~07777) | 0600, st.st_rdev) < 0) {
489 log_error("mknod() for /dev/console failed: %m");
494 if (mount(console, to, "bind", MS_BIND, NULL) < 0) {
495 log_error("Bind mount for /dev/console failed: %m");
/* Provide a /proc/kmsg replacement inside the container.
 *
 * A FIFO is created at "<dest>/dev/kmsg", bind mounted onto
 * "<dest>/proc/kmsg", opened, and the resulting fd is sent over
 * `kmsg_socket` as an SCM_RIGHTS control message. The local copy of the fd
 * is closed right after the send.
 * NOTE(review): declarations of `control` and `mh`, the error checks after
 * open()/sendmsg(), the removal of the FIFO and the return statements are
 * not part of this excerpt. */
507 static int setup_kmsg(const char *dest, int kmsg_socket) {
508 char *from = NULL, *to = NULL;
512 struct cmsghdr cmsghdr;
513 uint8_t buf[CMSG_SPACE(sizeof(int))];
516 struct cmsghdr *cmsg;
519 assert(kmsg_socket >= 0);
523 /* We create the kmsg FIFO as /dev/kmsg, but immediately
524 * delete it after bind mounting it to /proc/kmsg. While FIFOs
525 * on the reading side behave very similar to /proc/kmsg,
526 * their writing side behaves differently from /dev/kmsg in
527 * that writing blocks when nothing is reading. In order to
528 * avoid any problems with containers deadlocking due to this
529 * we simply make /dev/kmsg unavailable to the container. */
530 if (asprintf(&from, "%s/dev/kmsg", dest) < 0) {
535 if (asprintf(&to, "%s/proc/kmsg", dest) < 0) {
540 if (mkfifo(from, 0600) < 0) {
541 log_error("mkfifo() for /dev/kmsg failed: %m");
546 r = chmod_and_chown(from, 0600, 0, 0);
548 log_error("Failed to correct access mode for /dev/kmsg: %s", strerror(-r));
552 if (mount(from, to, "bind", MS_BIND, NULL) < 0) {
553 log_error("Bind mount for /proc/kmsg failed: %m");
/* Non-blocking open of the FIFO for reading and writing. */
558 fd = open(from, O_RDWR|O_NDELAY|O_CLOEXEC);
560 log_error("Failed to open fifo: %m");
/* Build a control message carrying exactly one file descriptor. */
568 mh.msg_control = &control;
569 mh.msg_controllen = sizeof(control);
571 cmsg = CMSG_FIRSTHDR(&mh);
572 cmsg->cmsg_level = SOL_SOCKET;
573 cmsg->cmsg_type = SCM_RIGHTS;
574 cmsg->cmsg_len = CMSG_LEN(sizeof(int));
575 memcpy(CMSG_DATA(cmsg), &fd, sizeof(int));
577 mh.msg_controllen = cmsg->cmsg_len;
579 /* Store away the fd in the socket, so that it stays open as
580 * long as we run the child */
581 k = sendmsg(kmsg_socket, &mh, MSG_DONTWAIT|MSG_NOSIGNAL);
582 close_nointr_nofail(fd);
585 log_error("Failed to send FIFO fd: %m");
590 /* And now make the FIFO unavailable as /dev/kmsg... */
/* Set the hostname from the container directory: the name is taken from
 * path_get_file_name(arg_directory), passed through hostname_cleanup() and
 * applied with sethostname().
 * NOTE(review): any copying of the name, the handling of a sethostname()
 * failure and the return value are not part of this excerpt. */
601 static int setup_hostname(void) {
605 hn = path_get_file_name(arg_directory);
611 hostname_cleanup(hn);
614 if (sethostname(hn, strlen(hn)) < 0)
/* Connect the journal directories of host and container according to
 * arg_link_journal.
 *
 * The container's machine ID is read from "<directory>/etc/machine-id" and
 * validated. Two paths are then derived from it:
 *   p = "/var/log/journal/<id>"              (on the host)
 *   q = "<directory>/var/log/journal/<id>"   (inside the container tree)
 * LINK_GUEST makes p a symlink to q. Otherwise, LINK_HOST creates p if
 * needed and p is bind mounted onto q. In LINK_AUTO mode a missing or empty
 * machine-id file and an already-mounted journal are tolerated rather than
 * treated as errors (the branch bodies are not shown).
 * NOTE(review): many conditions, branch bodies, cleanup and return
 * statements are missing from this excerpt; the summary covers only the
 * visible statements. */
623 static int setup_journal(const char *directory) {
624 sd_id128_t machine_id;
625 char *p = NULL, *b = NULL, *l, *q = NULL, *d = NULL;
628 if (arg_link_journal == LINK_NO)
631 p = strappend(directory, "/etc/machine-id");
637 r = read_one_line_file(p, &b);
638 if (r == -ENOENT && arg_link_journal == LINK_AUTO) {
642 log_error("Failed to read machine ID: %s", strerror(-r));
647 if (isempty(l) && arg_link_journal == LINK_AUTO) {
652 /* Verify validity */
653 r = sd_id128_from_string(l, &machine_id);
655 log_error("Failed to parse machine ID: %s", strerror(-r));
660 p = strappend("/var/log/journal/", l);
661 q = strjoin(directory, "/var/log/journal/", l, NULL);
/* Refuse (except in auto mode) if either side is already a mount point. */
667 if (path_is_mount_point(p, false) > 0 ||
668 path_is_mount_point(q, false) > 0) {
669 if (arg_link_journal != LINK_AUTO) {
670 log_error("Journal already a mount point, refusing.");
/* Inspect what currently exists at the host path p. */
679 r = readlink_and_make_absolute(p, &d);
681 if ((arg_link_journal == LINK_GUEST ||
682 arg_link_journal == LINK_AUTO) &&
692 log_error("Failed to remove symlink %s: %m", p);
696 } else if (r == -EINVAL) {
698 if (arg_link_journal == LINK_GUEST &&
701 if (errno == ENOTDIR)
702 log_error("%s already exists and is neither symlink nor directory.", p);
704 log_error("Failed to remove %s: %m", p);
710 } else if (r != -ENOENT) {
711 log_error("readlink(%s) failed: %m", p);
715 if (arg_link_journal == LINK_GUEST) {
717 if (symlink(q, p) < 0) {
718 log_error("Failed to symlink %s to %s: %m", q, p);
729 if (arg_link_journal == LINK_HOST) {
730 r = mkdir_p(p, 0755);
732 log_error("Failed to create %s: %m", p);
736 } else if (access(p, F_OK) < 0) {
/* The guest-side directory must not already contain files. */
741 if (dir_is_empty(q) == 0) {
742 log_error("%s not empty.", q);
747 r = mkdir_p(q, 0755);
749 log_error("Failed to create %s: %m", q);
753 if (mount(p, q, "bind", MS_BIND, NULL) < 0) {
754 log_error("Failed to bind mount journal from host into guest: %m");
/* Hand the complement of arg_retain to capability_bounding_set_drop() and
 * return its result, so the capabilities recorded in arg_retain are the
 * ones not asked to be dropped. */
770 static int drop_capabilities(void) {
771 return capability_bounding_set_drop(~arg_retain, false);
/* Heuristic check whether `path` is the root of an OS tree, using
 * "<path>/bin/sh" as the marker file.
 *
 * Returns 1 when the check stored in r succeeded and 0 when it failed.
 * NOTE(review): the statement that sets r (presumably an access() call on
 * the built path) and the allocation-failure return are not part of this
 * excerpt. */
774 static int is_os_tree(const char *path) {
777 /* We use /bin/sh as flag file if something is an OS */
779 if (asprintf(&p, "%s/bin/sh", path) < 0)
785 return r < 0 ? 0 : 1;
/* Relay data between our own stdin/stdout and the pty `master`.
 *
 * stdin, stdout and the master are switched to non-blocking mode and
 * watched with edge-triggered epoll; the signals in `mask` are received
 * through a signalfd. in_buffer carries bytes read from stdin until they
 * are written to the master, out_buffer carries bytes read from the master
 * until they are written to stdout. On SIGWINCH the window size of stdin is
 * copied to the master.
 *
 * The only code change is the spelling of "register" in the epoll_ctl()
 * error message.
 * NOTE(review): the loop header, the handling of other signals, end-of-file
 * handling, the `k < 0` checks and the return statements are not part of
 * this excerpt. */
788 static int process_pty(int master, sigset_t *mask) {
790 char in_buffer[LINE_MAX], out_buffer[LINE_MAX];
791 size_t in_buffer_full = 0, out_buffer_full = 0;
792 struct epoll_event stdin_ev, stdout_ev, master_ev, signal_ev;
793 bool stdin_readable = false, stdout_writable = false, master_readable = false, master_writable = false;
794 int ep = -1, signal_fd = -1, r;
796 fd_nonblock(STDIN_FILENO, 1);
797 fd_nonblock(STDOUT_FILENO, 1);
798 fd_nonblock(master, 1);
800 signal_fd = signalfd(-1, mask, SFD_NONBLOCK|SFD_CLOEXEC);
802 log_error("signalfd(): %m");
807 ep = epoll_create1(EPOLL_CLOEXEC);
809 log_error("Failed to create epoll: %m");
/* The three data fds are edge-triggered, so readiness is latched in the
 * *_readable / *_writable flags below; the signalfd is level-triggered. */
815 stdin_ev.events = EPOLLIN|EPOLLET;
816 stdin_ev.data.fd = STDIN_FILENO;
819 stdout_ev.events = EPOLLOUT|EPOLLET;
820 stdout_ev.data.fd = STDOUT_FILENO;
823 master_ev.events = EPOLLIN|EPOLLOUT|EPOLLET;
824 master_ev.data.fd = master;
827 signal_ev.events = EPOLLIN;
828 signal_ev.data.fd = signal_fd;
830 if (epoll_ctl(ep, EPOLL_CTL_ADD, STDIN_FILENO, &stdin_ev) < 0 ||
831 epoll_ctl(ep, EPOLL_CTL_ADD, STDOUT_FILENO, &stdout_ev) < 0 ||
832 epoll_ctl(ep, EPOLL_CTL_ADD, master, &master_ev) < 0 ||
833 epoll_ctl(ep, EPOLL_CTL_ADD, signal_fd, &signal_ev) < 0) {
834 log_error("Failed to register fds in epoll: %m");
840 struct epoll_event ev[16];
844 nfds = epoll_wait(ep, ev, ELEMENTSOF(ev), -1);
847 if (errno == EINTR || errno == EAGAIN)
850 log_error("epoll_wait(): %m");
/* Record which directions became ready; a hang-up also counts so that
 * the following read()/write() reports the condition. */
857 for (i = 0; i < nfds; i++) {
858 if (ev[i].data.fd == STDIN_FILENO) {
860 if (ev[i].events & (EPOLLIN|EPOLLHUP))
861 stdin_readable = true;
863 } else if (ev[i].data.fd == STDOUT_FILENO) {
865 if (ev[i].events & (EPOLLOUT|EPOLLHUP))
866 stdout_writable = true;
868 } else if (ev[i].data.fd == master) {
870 if (ev[i].events & (EPOLLIN|EPOLLHUP))
871 master_readable = true;
873 if (ev[i].events & (EPOLLOUT|EPOLLHUP))
874 master_writable = true;
876 } else if (ev[i].data.fd == signal_fd) {
877 struct signalfd_siginfo sfsi;
880 n = read(signal_fd, &sfsi, sizeof(sfsi));
881 if (n != sizeof(sfsi)) {
884 log_error("Failed to read from signalfd: invalid block size");
889 if (errno != EINTR && errno != EAGAIN) {
890 log_error("Failed to read from signalfd: %m");
896 if (sfsi.ssi_signo == SIGWINCH) {
899 /* The window size changed, let's forward that. */
900 if (ioctl(STDIN_FILENO, TIOCGWINSZ, &ws) >= 0)
901 ioctl(master, TIOCSWINSZ, &ws);
/* Keep shovelling while any direction can make progress. */
910 while ((stdin_readable && in_buffer_full <= 0) ||
911 (master_writable && in_buffer_full > 0) ||
912 (master_readable && out_buffer_full <= 0) ||
913 (stdout_writable && out_buffer_full > 0)) {
915 if (stdin_readable && in_buffer_full < LINE_MAX) {
917 k = read(STDIN_FILENO, in_buffer + in_buffer_full, LINE_MAX - in_buffer_full);
920 if (errno == EAGAIN || errno == EPIPE || errno == ECONNRESET || errno == EIO)
921 stdin_readable = false;
923 log_error("read(): %m");
928 in_buffer_full += (size_t) k;
931 if (master_writable && in_buffer_full > 0) {
933 k = write(master, in_buffer, in_buffer_full);
936 if (errno == EAGAIN || errno == EPIPE || errno == ECONNRESET || errno == EIO)
937 master_writable = false;
939 log_error("write(): %m");
/* Shift the unwritten remainder to the front of the buffer. */
945 assert(in_buffer_full >= (size_t) k);
946 memmove(in_buffer, in_buffer + k, in_buffer_full - k);
951 if (master_readable && out_buffer_full < LINE_MAX) {
953 k = read(master, out_buffer + out_buffer_full, LINE_MAX - out_buffer_full);
956 if (errno == EAGAIN || errno == EPIPE || errno == ECONNRESET || errno == EIO)
957 master_readable = false;
959 log_error("read(): %m");
964 out_buffer_full += (size_t) k;
967 if (stdout_writable && out_buffer_full > 0) {
969 k = write(STDOUT_FILENO, out_buffer, out_buffer_full);
972 if (errno == EAGAIN || errno == EPIPE || errno == ECONNRESET || errno == EIO)
973 stdout_writable = false;
975 log_error("write(): %m");
981 assert(out_buffer_full >= (size_t) k);
982 memmove(out_buffer, out_buffer + k, out_buffer_full - k);
983 out_buffer_full -= k;
991 close_nointr_nofail(ep);
994 close_nointr_nofail(signal_fd);
/* Entry point of the container spawner.
 *
 * Visible sequence: parse the command line, resolve and sanity-check the
 * container directory (must be run as root, on a systemd system, not on
 * "/", and the directory must look like an OS tree), create a per-process
 * cgroup, allocate a pty, put the terminal into raw mode, create the kmsg
 * socket pair and block the handled signals. The process is then cloned
 * into new IPC, mount, PID and UTS namespaces (plus a network namespace
 * with --private-network). The child sets up mounts, device nodes, console,
 * kmsg, timezone, resolv.conf and journal, moves into the new root, drops
 * capabilities, optionally switches user, and executes init, the given
 * command or /bin/bash. The parent relays the pty, restores the terminal,
 * waits for the child and cleans up the cgroup.
 *
 * The setresgid()/setresuid() failure messages name the calls actually
 * made; they previously said setregid()/setreuid().
 * NOTE(review): error checks, goto/labels, several conditions and the
 * final return are missing from this excerpt, so control flow between the
 * visible statements is not fully shown. */
999 int main(int argc, char *argv[]) {
1001 int r = EXIT_FAILURE, k;
1002 char *oldcg = NULL, *newcg = NULL;
1003 char **controller = NULL;
1005 const char *console = NULL;
1006 struct termios saved_attr, raw_attr;
1008 bool saved_attr_valid = false;
1010 int kmsg_socket_pair[2] = { -1, -1 };
1012 log_parse_environment();
1015 r = parse_argv(argc, argv);
/* Use the given directory made absolute, or fall back to the current
 * working directory. */
1019 if (arg_directory) {
1022 p = path_make_absolute_cwd(arg_directory);
1023 free(arg_directory);
1026 arg_directory = get_current_dir_name();
1028 if (!arg_directory) {
1029 log_error("Failed to determine path");
1033 path_kill_slashes(arg_directory);
1035 if (geteuid() != 0) {
1036 log_error("Need to be root.");
1040 if (sd_booted() <= 0) {
1041 log_error("Not running on a systemd system.");
1045 if (path_equal(arg_directory, "/")) {
1046 log_error("Spawning container on root directory not supported.");
1050 if (is_os_tree(arg_directory) <= 0) {
1051 log_error("Directory %s doesn't look like an OS root directory. Refusing.", arg_directory);
/* Create "<current cgroup>/nspawn-<pid>" and move ourselves into it. */
1055 k = cg_get_by_pid(SYSTEMD_CGROUP_CONTROLLER, 0, &oldcg);
1057 log_error("Failed to determine current cgroup: %s", strerror(-k));
1061 if (asprintf(&newcg, "%s/nspawn-%lu", oldcg, (unsigned long) getpid()) < 0) {
1062 log_error("Failed to allocate cgroup path.");
1066 k = cg_create_and_attach(SYSTEMD_CGROUP_CONTROLLER, newcg, 0);
1068 log_error("Failed to create cgroup: %s", strerror(-k));
/* Additional controllers from --controllers= only produce a warning. */
1072 STRV_FOREACH(controller, arg_controllers) {
1073 k = cg_create_and_attach(*controller, newcg, 0);
1075 log_warning("Failed to create cgroup in controller %s: %s", *controller, strerror(-k));
1078 master = posix_openpt(O_RDWR|O_NOCTTY|O_CLOEXEC|O_NDELAY);
1080 log_error("Failed to acquire pseudo tty: %m");
1084 console = ptsname(master);
1086 log_error("Failed to determine tty name: %m");
1090 log_info("Spawning namespace container on %s (console is %s).", arg_directory, console);
/* Give the pty the same window size as our terminal, if it has one. */
1092 if (ioctl(STDIN_FILENO, TIOCGWINSZ, &ws) >= 0)
1093 ioctl(master, TIOCSWINSZ, &ws);
1095 if (unlockpt(master) < 0) {
1096 log_error("Failed to unlock tty: %m");
/* Remember the terminal settings so they can be restored, then switch
 * to raw mode without echo. */
1100 if (tcgetattr(STDIN_FILENO, &saved_attr) < 0) {
1101 log_error("Failed to get terminal attributes: %m");
1105 saved_attr_valid = true;
1107 raw_attr = saved_attr;
1108 cfmakeraw(&raw_attr);
1109 raw_attr.c_lflag &= ~ECHO;
1111 if (tcsetattr(STDIN_FILENO, TCSANOW, &raw_attr) < 0) {
1112 log_error("Failed to set terminal attributes: %m");
1116 if (socketpair(AF_UNIX, SOCK_DGRAM|SOCK_NONBLOCK|SOCK_CLOEXEC, 0, kmsg_socket_pair) < 0) {
1117 log_error("Failed to create kmsg socket pair");
/* Block the signals that process_pty() later reads through a signalfd. */
1121 assert_se(sigemptyset(&mask) == 0);
1122 sigset_add_many(&mask, SIGCHLD, SIGWINCH, SIGTERM, SIGINT, -1);
1123 assert_se(sigprocmask(SIG_BLOCK, &mask, NULL) == 0);
1125 pid = syscall(__NR_clone, SIGCHLD|CLONE_NEWIPC|CLONE_NEWNS|CLONE_NEWPID|CLONE_NEWUTS|(arg_private_network ? CLONE_NEWNET : 0), NULL);
1127 if (errno == EINVAL)
1128 log_error("clone() failed, do you have namespace support enabled in your kernel? (You need UTS, IPC, PID and NET namespacing built in): %m");
1130 log_error("clone() failed: %m");
1138 const char *home = NULL;
1139 uid_t uid = (uid_t) -1;
1140 gid_t gid = (gid_t) -1;
1141 const char *envp[] = {
1142 "PATH=/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin",
1143 "container=systemd-nspawn", /* LXC sets container=lxc, so follow the scheme here */
1148 NULL, /* container_uuid */
/* Slot 2 of the environment carries over our own TERM= setting. */
1152 envp[2] = strv_find_prefix(environ, "TERM=");
1154 close_nointr_nofail(master);
1156 close_nointr(STDIN_FILENO);
1157 close_nointr(STDOUT_FILENO);
1158 close_nointr(STDERR_FILENO);
/* Keep only the child's end of the kmsg socket pair open. */
1160 close_all_fds(&kmsg_socket_pair[1], 1);
1162 reset_all_signal_handlers();
1164 assert_se(sigemptyset(&mask) == 0);
1165 assert_se(sigprocmask(SIG_SETMASK, &mask, NULL) == 0);
/* Reopen the pty slave as stdin and duplicate it to stdout and stderr. */
1167 if (open_terminal(console, O_RDWR) != STDIN_FILENO ||
1168 dup2(STDIN_FILENO, STDOUT_FILENO) != STDOUT_FILENO ||
1169 dup2(STDIN_FILENO, STDERR_FILENO) != STDERR_FILENO)
1173 log_error("setsid() failed: %m");
1177 if (prctl(PR_SET_PDEATHSIG, SIGKILL) < 0) {
1178 log_error("PR_SET_PDEATHSIG failed: %m");
1182 /* Mark everything as slave, so that we still
1183 * receive mounts from the real root, but don't
1184 * propagate mounts to the real root. */
1185 if (mount(NULL, "/", NULL, MS_SLAVE|MS_REC, NULL) < 0) {
1186 log_error("MS_SLAVE|MS_REC failed: %m");
1190 /* Turn directory into bind mount */
1191 if (mount(arg_directory, arg_directory, "bind", MS_BIND|MS_REC, NULL) < 0) {
1192 log_error("Failed to make bind mount.");
1197 if (mount(arg_directory, arg_directory, "bind", MS_BIND|MS_REMOUNT|MS_RDONLY|MS_REC, NULL) < 0) {
1198 log_error("Failed to make read-only.");
1202 if (mount_all(arg_directory) < 0)
1205 if (copy_devnodes(arg_directory) < 0)
1208 dev_setup(arg_directory);
1210 if (setup_dev_console(arg_directory, console) < 0)
1213 if (setup_kmsg(arg_directory, kmsg_socket_pair[1]) < 0)
1216 close_nointr_nofail(kmsg_socket_pair[1]);
1218 if (setup_timezone(arg_directory) < 0)
1221 if (setup_resolv_conf(arg_directory) < 0)
1224 if (setup_journal(arg_directory) < 0)
/* Make the container directory the new root: move the mount onto "/",
 * chroot into it and reset the working directory. */
1227 if (chdir(arg_directory) < 0) {
1228 log_error("chdir(%s) failed: %m", arg_directory);
1232 if (mount(arg_directory, "/", NULL, MS_MOVE, NULL) < 0) {
1233 log_error("mount(MS_MOVE) failed: %m");
1237 if (chroot(".") < 0) {
1238 log_error("chroot() failed: %m");
1242 if (chdir("/") < 0) {
1243 log_error("chdir() failed: %m");
1251 if (drop_capabilities() < 0) {
1252 log_error("drop_capabilities() failed: %m");
/* Resolve the requested user, make sure its home directory exists, then
 * switch groups and user IDs. */
1258 if (get_user_creds((const char**)&arg_user, &uid, &gid, &home, NULL) < 0) {
1259 log_error("get_user_creds() failed: %m");
1263 if (mkdir_parents_label(home, 0775) < 0) {
1264 log_error("mkdir_parents_label() failed: %m");
1268 if (mkdir_safe_label(home, 0775, uid, gid) < 0) {
1269 log_error("mkdir_safe_label() failed: %m");
1273 if (initgroups((const char*)arg_user, gid) < 0) {
1274 log_error("initgroups() failed: %m");
1278 if (setresgid(gid, gid, gid) < 0) {
1279 log_error("setresgid() failed: %m");
1283 if (setresuid(uid, uid, uid) < 0) {
1284 log_error("setresuid() failed: %m");
/* Fill environment slots 3-5 (HOME, USER, LOGNAME) and slot 6
 * (container_uuid). */
1289 if ((asprintf((char**)(envp + 3), "HOME=%s", home ? home: "/root") < 0) ||
1290 (asprintf((char**)(envp + 4), "USER=%s", arg_user ? arg_user : "root") < 0) ||
1291 (asprintf((char**)(envp + 5), "LOGNAME=%s", arg_user ? arg_user : "root") < 0)) {
1297 if (asprintf((char**)(envp + 6), "container_uuid=%s", arg_uuid) < 0) {
1309 /* Automatically search for the init system */
1311 l = 1 + argc - optind;
1312 a = newa(char*, l + 1);
1313 memcpy(a + 1, argv + optind, l * sizeof(char*));
/* Try the known init locations in order; execve() only returns on
 * failure, so each later attempt is a fallback. */
1315 a[0] = (char*) "/usr/lib/systemd/systemd";
1316 execve(a[0], a, (char**) envp);
1318 a[0] = (char*) "/lib/systemd/systemd";
1319 execve(a[0], a, (char**) envp);
1321 a[0] = (char*) "/sbin/init";
1322 execve(a[0], a, (char**) envp);
1323 } else if (argc > optind)
1324 execvpe(argv[optind], argv + optind, (char**) envp);
1326 chdir(home ? home : "/root");
1327 execle("/bin/bash", "-bash", NULL, (char**) envp);
1330 log_error("execv() failed: %m");
1333 _exit(EXIT_FAILURE);
/* Parent: relay terminal traffic, then restore the terminal settings
 * before waiting for the child. */
1336 if (process_pty(master, &mask) < 0)
1339 if (saved_attr_valid) {
1340 tcsetattr(STDIN_FILENO, TCSANOW, &saved_attr);
1341 saved_attr_valid = false;
1344 r = wait_for_terminate_and_warn(argc > optind ? argv[optind] : "bash", pid);
1350 if (saved_attr_valid)
1351 tcsetattr(STDIN_FILENO, TCSANOW, &saved_attr);
1354 close_nointr_nofail(master);
1356 close_pipe(kmsg_socket_pair);
/* Return to the original cgroup and kill whatever is left in ours. */
1359 cg_attach(SYSTEMD_CGROUP_CONTROLLER, oldcg, 0);
1362 cg_kill_recursive_and_wait(SYSTEMD_CGROUP_CONTROLLER, newcg, true);
1364 free(arg_directory);
1365 strv_free(arg_controllers);