1 /*-*- Mode: C; c-basic-offset: 8; indent-tabs-mode: nil -*-*/
4 This file is part of systemd.
6 Copyright 2010 Lennart Poettering
8 systemd is free software; you can redistribute it and/or modify it
9 under the terms of the GNU Lesser General Public License as published by
10 the Free Software Foundation; either version 2.1 of the License, or
11 (at your option) any later version.
13 systemd is distributed in the hope that it will be useful, but
14 WITHOUT ANY WARRANTY; without even the implied warranty of
15 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
16 Lesser General Public License for more details.
18 You should have received a copy of the GNU Lesser General Public License
19 along with systemd; If not, see <http://www.gnu.org/licenses/>.
25 #include <sys/types.h>
26 #include <sys/syscall.h>
27 #include <sys/mount.h>
33 #include <sys/prctl.h>
34 #include <sys/capability.h>
36 #include <sys/epoll.h>
38 #include <sys/signalfd.h>
42 #include <sys/socket.h>
44 #include <systemd/sd-daemon.h>
51 #include "cgroup-util.h"
53 #include "path-util.h"
54 #include "loopback-setup.h"
56 #include "dev-setup.h"
/* Journal linking mode chosen with --link-journal= or -j. The values used in
 * this file are LINK_NO, LINK_AUTO, LINK_GUEST and LINK_HOST.
 * NOTE(review): the enumerator list itself is not visible in this excerpt. */
58 typedef enum LinkJournal {
/* Settings collected from the command line by parse_argv(). */
65 static char *arg_directory = NULL; /* container root; made absolute in main() */
66 static char *arg_user = NULL; /* user name or uid given with --user= */
67 static char **arg_controllers = NULL; /* extra cgroup controllers from --controllers= */
68 static char *arg_uuid = NULL; /* exported to the payload as container_uuid= */
69 static bool arg_private_network = false; /* adds CLONE_NEWNET to the clone() flags */
70 static bool arg_read_only = false; /* read-only root requested */
71 static bool arg_boot = false; /* presumably set by -b/--boot; assignment not visible here */
72 static LinkJournal arg_link_journal = LINK_AUTO; /* journal linking mode, see setup_journal() */
/* Bit mask of capabilities the container keeps: bit N stands for capability
 * number N. drop_capabilities() hands the complement of this mask to
 * capability_bounding_set_drop(), and --capability= ORs further bits in. */
73 static uint64_t arg_retain =
75 (1ULL << CAP_DAC_OVERRIDE) |
76 (1ULL << CAP_DAC_READ_SEARCH) |
77 (1ULL << CAP_FOWNER) |
78 (1ULL << CAP_FSETID) |
79 (1ULL << CAP_IPC_OWNER) |
82 (1ULL << CAP_LINUX_IMMUTABLE) |
83 (1ULL << CAP_NET_BIND_SERVICE) |
84 (1ULL << CAP_NET_BROADCAST) |
85 (1ULL << CAP_NET_RAW) |
86 (1ULL << CAP_SETGID) |
87 (1ULL << CAP_SETFCAP) |
88 (1ULL << CAP_SETPCAP) |
89 (1ULL << CAP_SETUID) |
90 (1ULL << CAP_SYS_ADMIN) |
91 (1ULL << CAP_SYS_CHROOT) |
92 (1ULL << CAP_SYS_NICE) |
93 (1ULL << CAP_SYS_PTRACE) |
94 (1ULL << CAP_SYS_TTY_CONFIG) |
95 (1ULL << CAP_SYS_RESOURCE);
/* Print the usage summary for the program to stdout. The program name is
 * taken from program_invocation_short_name.
 *
 * The -j line documents the mode that parse_argv() actually selects for that
 * switch (LINK_GUEST), so it reads "guest" rather than "host". */
97 static int help(void) {
99 printf("%s [OPTIONS...] [PATH] [ARGUMENTS...]\n\n"
100 "Spawn a minimal namespace container for debugging, testing and building.\n\n"
101 " -h --help Show this help\n"
102 " -D --directory=NAME Root directory for the container\n"
103 " -b --boot Boot up full system (i.e. invoke init)\n"
104 " -u --user=USER Run the command under specified user or uid\n"
105 " -C --controllers=LIST Put the container in specified comma-separated cgroup hierarchies\n"
106 " --uuid=UUID Set a specific machine UUID for the container\n"
107 " --private-network Disable network in container\n"
108 " --read-only Mount the root directory read-only\n"
109 " --capability=CAP In addition to the default, retain specified capability\n"
110 " --link-journal=MODE Link up guest journal, one of no, auto, guest, host\n"
111 " -j Equivalent to --link-journal=guest\n",
112 program_invocation_short_name);
/* Parse the command line into the arg_* globals declared at the top of the
 * file. Every failure visible here is reported through log_error().
 * NOTE(review): several case labels and all return statements are missing
 * from this excerpt, so the exact return values cannot be stated here. */
117 static int parse_argv(int argc, char *argv[]) {
/* Options without a short form get codes from 0x100 upwards, which cannot
 * collide with a single-character option. */
120 ARG_PRIVATE_NETWORK = 0x100,
127 static const struct option options[] = {
128 { "help", no_argument, NULL, 'h' },
129 { "directory", required_argument, NULL, 'D' },
130 { "user", required_argument, NULL, 'u' },
131 { "controllers", required_argument, NULL, 'C' },
132 { "private-network", no_argument, NULL, ARG_PRIVATE_NETWORK },
133 { "boot", no_argument, NULL, 'b' },
134 { "uuid", required_argument, NULL, ARG_UUID },
135 { "read-only", no_argument, NULL, ARG_READ_ONLY },
136 { "capability", required_argument, NULL, ARG_CAPABILITY },
137 { "link-journal", required_argument, NULL, ARG_LINK_JOURNAL },
/* The leading '+' stops option parsing at the first non-option argument, so
 * switches that belong to the container command are left alone. */
146 while ((c = getopt_long(argc, argv, "+hD:u:C:bj", options, NULL)) >= 0) {
156 arg_directory = canonicalize_file_name(optarg);
157 if (!arg_directory) {
158 log_error("Failed to canonicalize root directory.");
166 if (!(arg_user = strdup(optarg))) {
167 log_error("Failed to duplicate user name.");
/* A repeated controller list replaces the earlier one; duplicates are dropped. */
174 strv_free(arg_controllers);
175 arg_controllers = strv_split(optarg, ",");
176 if (!arg_controllers) {
177 log_error("Failed to split controllers list.");
180 strv_uniq(arg_controllers);
184 case ARG_PRIVATE_NETWORK:
185 arg_private_network = true;
197 arg_read_only = true;
/* Each comma-separated name is resolved with cap_from_name() and its bit is
 * added to the retained set. */
200 case ARG_CAPABILITY: {
204 FOREACH_WORD_SEPARATOR(word, length, optarg, ",", state) {
208 t = strndup(word, length);
212 if (cap_from_name(t, &cap) < 0) {
213 log_error("Failed to parse capability %s.", t);
219 arg_retain |= 1ULL << (uint64_t) cap;
/* NOTE(review): presumably the body of the -j case; its label is not visible
 * in this excerpt. */
226 arg_link_journal = LINK_GUEST;
229 case ARG_LINK_JOURNAL:
230 if (streq(optarg, "auto"))
231 arg_link_journal = LINK_AUTO;
232 else if (streq(optarg, "no"))
233 arg_link_journal = LINK_NO;
234 else if (streq(optarg, "guest"))
235 arg_link_journal = LINK_GUEST;
236 else if (streq(optarg, "host"))
237 arg_link_journal = LINK_HOST;
239 log_error("Failed to parse link journal mode %s", optarg);
249 log_error("Unknown option code %c", c);
/* Mount the file systems listed in mount_table underneath dest.
 *
 * Each table entry supplies, in order: the mount source (what), the target
 * relative to dest (where), the file system type, the mount options, the
 * mount flags and whether a failure is fatal. An entry whose source is NULL
 * is a remount of the entry before it. */
257 static int mount_all(const char *dest) {
259 typedef struct MountPoint {
268 static const MountPoint mount_table[] = {
269 { "proc", "/proc", "proc", NULL, MS_NOSUID|MS_NOEXEC|MS_NODEV, true },
270 { "/proc/sys", "/proc/sys", NULL, NULL, MS_BIND, true }, /* Bind mount first */
271 { NULL, "/proc/sys", NULL, NULL, MS_BIND|MS_RDONLY|MS_REMOUNT, true }, /* Then, make it r/o */
272 { "sysfs", "/sys", "sysfs", NULL, MS_RDONLY|MS_NOSUID|MS_NOEXEC|MS_NODEV, true },
273 { "tmpfs", "/dev", "tmpfs", "mode=755", MS_NOSUID|MS_STRICTATIME, true },
274 { "/dev/pts", "/dev/pts", NULL, NULL, MS_BIND, true },
275 { "tmpfs", "/run", "tmpfs", "mode=755", MS_NOSUID|MS_NODEV|MS_STRICTATIME, true },
277 { "/sys/fs/selinux", "/sys/fs/selinux", NULL, NULL, MS_BIND, false }, /* Bind mount first */
278 { NULL, "/sys/fs/selinux", NULL, NULL, MS_BIND|MS_RDONLY|MS_REMOUNT, false }, /* Then, make it r/o */
286 for (k = 0; k < ELEMENTSOF(mount_table); k++) {
/* Absolute target path inside the container tree. */
289 if (asprintf(&where, "%s/%s", dest, mount_table[k].where) < 0) {
298 t = path_is_mount_point(where, true);
300 log_error("Failed to detect whether %s is a mount point: %s", where, strerror(-t));
309 /* Skip this entry if it is not a remount. */
310 if (mount_table[k].what && t > 0)
/* The result of creating the target directory is not checked; a missing
 * directory shows up as a mount() failure below. */
313 mkdir_p_label(where, 0755);
/* A failed mount is only reported for entries marked fatal. */
315 if (mount(mount_table[k].what,
318 mount_table[k].flags,
319 mount_table[k].options) < 0 &&
320 mount_table[k].fatal) {
322 log_error("mount(%s) failed: %m", where);
/* Make the host's time zone configuration visible in the container by bind
 * mounting /etc/localtime and /etc/timezone over the same paths below dest.
 * After a successful bind mount the mount is remounted read-only. A failing
 * mount() is not reported, so this is best effort. */
334 static int setup_timezone(const char *dest) {
339 /* Fix the timezone, if possible */
340 if (asprintf(&where, "%s/etc/localtime", dest) < 0)
343 if (mount("/etc/localtime", where, "bind", MS_BIND, NULL) >= 0)
344 mount("/etc/localtime", where, "bind", MS_BIND|MS_REMOUNT|MS_RDONLY, NULL);
348 if (asprintf(&where, "%s/etc/timezone", dest) < 0)
351 if (mount("/etc/timezone", where, "bind", MS_BIND, NULL) >= 0)
352 mount("/etc/timezone", where, "bind", MS_BIND|MS_REMOUNT|MS_RDONLY, NULL);
/* Bind mount the host's /etc/resolv.conf over the same path below dest and
 * remount it read-only on success. A failing mount() is not reported. */
359 static int setup_resolv_conf(const char *dest) {
/* With --private-network the container gets a different treatment.
 * NOTE(review): the statement taken in that case (presumably an early
 * return) is not visible in this excerpt. */
364 if (arg_private_network)
367 /* Fix resolv.conf, if possible */
368 if (asprintf(&where, "%s/etc/resolv.conf", dest) < 0) {
372 if (mount("/etc/resolv.conf", where, "bind", MS_BIND, NULL) >= 0)
373 mount("/etc/resolv.conf", where, "bind", MS_BIND|MS_REMOUNT|MS_RDONLY, NULL);
/* Recreate a fixed list of host device nodes inside the container.
 *
 * For every name in devnodes the host node /dev/<name> is examined with
 * stat() and a node with the same mode and device number is created as
 * <dest>/dev/<name>. A host node that does not exist (ENOENT) is not logged
 * as an error; anything that is neither a character nor a block device is
 * rejected. */
380 static int copy_devnodes(const char *dest) {
382 static const char devnodes[] =
400 NULSTR_FOREACH(d, devnodes) {
402 char *from = NULL, *to = NULL;
/* from is the host node, to is the node to create below dest. */
404 asprintf(&from, "/dev/%s", d);
405 asprintf(&to, "%s/dev/%s", dest, d);
408 log_error("Failed to allocate devnode path");
421 if (stat(from, &st) < 0) {
423 if (errno != ENOENT) {
424 log_error("Failed to stat %s: %m", from);
429 } else if (!S_ISCHR(st.st_mode) && !S_ISBLK(st.st_mode)) {
431 log_error("%s is not a char or block device, cannot copy.", from);
435 } else if (mknod(to, st.st_mode, st.st_rdev) < 0) {
/* Report the node that could not be created, not just the container root. */
437 log_error("mknod(%s) failed: %m", to);
/* Provide <dest>/dev/console for the container.
 *
 * console is the path of the terminal to use; it must be a character device.
 * Its access mode and ownership are set to 0600, root:root, a device node
 * with the same device number is created as <dest>/dev/console, and console
 * is then bind mounted on top of that node. Failures are reported through
 * log_error().
 * NOTE(review): the explanatory block comment inside this function has lost
 * its closing line in this excerpt. */
451 static int setup_dev_console(const char *dest, const char *console) {
462 if (stat(console, &st) < 0) {
463 log_error("Failed to stat %s: %m", console);
467 } else if (!S_ISCHR(st.st_mode)) {
468 log_error("/dev/console is not a char device.");
473 r = chmod_and_chown(console, 0600, 0, 0);
475 log_error("Failed to correct access mode for TTY: %s", strerror(-r));
479 if (asprintf(&to, "%s/dev/console", dest) < 0) {
484 /* We need to bind mount the right tty to /dev/console since
485 * ptys can only exist on pts file systems. To have something
486 * to bind mount things on we create a device node first, that
487 * has the right major/minor (note that the major minor
488 * doesn't actually matter here, since we mount it over
491 if (mknod(to, (st.st_mode & ~07777) | 0600, st.st_rdev) < 0) {
492 log_error("mknod() for /dev/console failed: %m");
497 if (mount(console, to, "bind", MS_BIND, NULL) < 0) {
498 log_error("Bind mount for /dev/console failed: %m");
/* Give the container a /proc/kmsg that is backed by a FIFO.
 *
 * A FIFO is created as <dest>/dev/kmsg and bind mounted onto
 * <dest>/proc/kmsg. The FIFO is then opened and the descriptor is sent over
 * kmsg_socket as SCM_RIGHTS ancillary data, which keeps it open while the
 * child runs. kmsg_socket must be a valid descriptor (asserted). */
510 static int setup_kmsg(const char *dest, int kmsg_socket) {
511 char *from = NULL, *to = NULL;
515 struct cmsghdr cmsghdr;
516 uint8_t buf[CMSG_SPACE(sizeof(int))];
519 struct cmsghdr *cmsg;
522 assert(kmsg_socket >= 0);
526 /* We create the kmsg FIFO as /dev/kmsg, but immediately
527 * delete it after bind mounting it to /proc/kmsg. While FIFOs
528 * on the reading side behave very similar to /proc/kmsg,
529 * their writing side behaves differently from /dev/kmsg in
530 * that writing blocks when nothing is reading. In order to
531 * avoid any problems with containers deadlocking due to this
532 * we simply make /dev/kmsg unavailable to the container. */
533 if (asprintf(&from, "%s/dev/kmsg", dest) < 0) {
538 if (asprintf(&to, "%s/proc/kmsg", dest) < 0) {
543 if (mkfifo(from, 0600) < 0) {
544 log_error("mkfifo() for /dev/kmsg failed: %m");
549 r = chmod_and_chown(from, 0600, 0, 0);
551 log_error("Failed to correct access mode for /dev/kmsg: %s", strerror(-r));
555 if (mount(from, to, "bind", MS_BIND, NULL) < 0) {
556 log_error("Bind mount for /proc/kmsg failed: %m");
561 fd = open(from, O_RDWR|O_NDELAY|O_CLOEXEC);
563 log_error("Failed to open fifo: %m");
/* Build a control message that carries exactly one file descriptor. */
571 mh.msg_control = &control;
572 mh.msg_controllen = sizeof(control);
574 cmsg = CMSG_FIRSTHDR(&mh);
575 cmsg->cmsg_level = SOL_SOCKET;
576 cmsg->cmsg_type = SCM_RIGHTS;
577 cmsg->cmsg_len = CMSG_LEN(sizeof(int));
578 memcpy(CMSG_DATA(cmsg), &fd, sizeof(int));
580 mh.msg_controllen = cmsg->cmsg_len;
582 /* Store away the fd in the socket, so that it stays open as
583 * long as we run the child */
584 k = sendmsg(kmsg_socket, &mh, MSG_DONTWAIT|MSG_NOSIGNAL);
/* The local copy of the descriptor is closed right after the send attempt. */
585 close_nointr_nofail(fd);
588 log_error("Failed to send FIFO fd: %m");
593 /* And now make the FIFO unavailable as /dev/kmsg... */
/* Set the host name seen inside the container. The name is derived from the
 * file-name component of arg_directory, passed through hostname_cleanup() and
 * applied with sethostname().
 * NOTE(review): how hn is copied and released is not visible in this excerpt. */
604 static int setup_hostname(void) {
608 hn = path_get_file_name(arg_directory);
614 hostname_cleanup(hn);
617 if (sethostname(hn, strlen(hn)) < 0)
/* Connect the journal directory of the container with the one on the host,
 * according to arg_link_journal.
 *
 * directory is the container root. The machine ID is read from
 * <directory>/etc/machine-id and yields two paths:
 *   p = /var/log/journal/<id>              (on the host)
 *   q = <directory>/var/log/journal/<id>   (inside the container tree)
 *
 * LINK_NO skips the function body. For LINK_GUEST, p is made a symlink to q.
 * The bind mount of p onto q is the last step shown; LINK_HOST creates p
 * first if needed. In LINK_AUTO mode a missing or empty machine ID gets
 * special treatment; NOTE(review): the statements taken in that case are not
 * visible in this excerpt.
 *
 * Errors are reported through log_error(). Helpers that hand back a negative
 * errno value in r are reported with strerror(-r). */
626 static int setup_journal(const char *directory) {
627 sd_id128_t machine_id;
628 char *p = NULL, *b = NULL, *l, *q = NULL, *d = NULL;
631 if (arg_link_journal == LINK_NO)
634 p = strappend(directory, "/etc/machine-id");
640 r = read_one_line_file(p, &b);
641 if (r == -ENOENT && arg_link_journal == LINK_AUTO) {
645 log_error("Failed to read machine ID: %s", strerror(-r));
650 if (isempty(l) && arg_link_journal == LINK_AUTO) {
655 /* Verify validity */
656 r = sd_id128_from_string(l, &machine_id);
658 log_error("Failed to parse machine ID: %s", strerror(-r));
/* From here on p is the host side and q the container side of the journal. */
663 p = strappend("/var/log/journal/", l);
664 q = strjoin(directory, "/var/log/journal/", l, NULL);
670 if (path_is_mount_point(p, false) > 0 ||
671 path_is_mount_point(q, false) > 0) {
672 if (arg_link_journal != LINK_AUTO) {
673 log_error("Journal already a mount point, refusing.");
/* Find out what currently sits at p: a symlink, something else, or nothing. */
682 r = readlink_and_make_absolute(p, &d);
684 if ((arg_link_journal == LINK_GUEST ||
685 arg_link_journal == LINK_AUTO) &&
695 log_error("Failed to remove symlink %s: %m", p);
699 } else if (r == -EINVAL) {
701 if (arg_link_journal == LINK_GUEST &&
704 if (errno == ENOTDIR)
705 log_error("%s already exists and is neither symlink nor directory.", p);
707 log_error("Failed to remove %s: %m", p);
713 } else if (r != -ENOENT) {
/* The failure code is carried in r, so report it from there and not from errno. */
714 log_error("readlink(%s) failed: %s", p, strerror(-r));
718 if (arg_link_journal == LINK_GUEST) {
720 if (symlink(q, p) < 0) {
721 log_error("Failed to symlink %s to %s: %m", q, p);
732 if (arg_link_journal == LINK_HOST) {
733 r = mkdir_p(p, 0755);
735 log_error("Failed to create %s: %m", p);
739 } else if (access(p, F_OK) < 0) {
744 if (dir_is_empty(q) == 0) {
745 log_error("%s not empty.", q);
750 r = mkdir_p(q, 0755);
752 log_error("Failed to create %s: %m", q);
756 if (mount(p, q, "bind", MS_BIND, NULL) < 0) {
757 log_error("Failed to bind mount journal from host into guest: %m");
/* Remove every capability that is not in arg_retain from the bounding set by
 * passing the complement of the mask to capability_bounding_set_drop().
 * Returns whatever that helper returns. */
773 static int drop_capabilities(void) {
774 return capability_bounding_set_drop(~arg_retain, false);
/* Heuristic check whether path looks like the root of an operating system
 * tree, using <path>/bin/sh as the marker file. Returns 1 when r is not
 * negative and 0 otherwise.
 * NOTE(review): r presumably holds the result of probing that file; the
 * probing call is not visible in this excerpt. */
777 static int is_os_tree(const char *path) {
780 /* We use /bin/sh as flag file if something is an OS */
782 if (asprintf(&p, "%s/bin/sh", path) < 0)
788 return r < 0 ? 0 : 1;
/* Relay data between the invoking terminal and the container's pty.
 *
 * master is the pty master descriptor; mask is the set of signals that is
 * read through a signalfd. Bytes read from stdin are buffered in in_buffer
 * and written to master; bytes read from master are buffered in out_buffer
 * and written to stdout. A SIGWINCH copies the window size of stdin to
 * master.
 *
 * stdin, stdout and master are registered edge-triggered (EPOLLET), so
 * readiness is remembered in the *_readable / *_writable flags and each
 * direction is serviced until the matching call reports EAGAIN (or EPIPE,
 * ECONNRESET, EIO), which clears the flag again.
 * NOTE(review): the loop exit conditions and return statements are not
 * visible in this excerpt. */
791 static int process_pty(int master, sigset_t *mask) {
793 char in_buffer[LINE_MAX], out_buffer[LINE_MAX];
794 size_t in_buffer_full = 0, out_buffer_full = 0;
795 struct epoll_event stdin_ev, stdout_ev, master_ev, signal_ev;
796 bool stdin_readable = false, stdout_writable = false, master_readable = false, master_writable = false;
797 int ep = -1, signal_fd = -1, r;
799 fd_nonblock(STDIN_FILENO, 1);
800 fd_nonblock(STDOUT_FILENO, 1);
801 fd_nonblock(master, 1);
803 signal_fd = signalfd(-1, mask, SFD_NONBLOCK|SFD_CLOEXEC);
805 log_error("signalfd(): %m");
810 ep = epoll_create1(EPOLL_CLOEXEC);
812 log_error("Failed to create epoll: %m");
818 stdin_ev.events = EPOLLIN|EPOLLET;
819 stdin_ev.data.fd = STDIN_FILENO;
822 stdout_ev.events = EPOLLOUT|EPOLLET;
823 stdout_ev.data.fd = STDOUT_FILENO;
826 master_ev.events = EPOLLIN|EPOLLOUT|EPOLLET;
827 master_ev.data.fd = master;
/* The signalfd is the only descriptor registered level-triggered. */
830 signal_ev.events = EPOLLIN;
831 signal_ev.data.fd = signal_fd;
833 if (epoll_ctl(ep, EPOLL_CTL_ADD, STDIN_FILENO, &stdin_ev) < 0 ||
834 epoll_ctl(ep, EPOLL_CTL_ADD, STDOUT_FILENO, &stdout_ev) < 0 ||
835 epoll_ctl(ep, EPOLL_CTL_ADD, master, &master_ev) < 0 ||
836 epoll_ctl(ep, EPOLL_CTL_ADD, signal_fd, &signal_ev) < 0) {
837 log_error("Failed to register fds in epoll: %m");
843 struct epoll_event ev[16];
847 nfds = epoll_wait(ep, ev, ELEMENTSOF(ev), -1);
850 if (errno == EINTR || errno == EAGAIN)
853 log_error("epoll_wait(): %m");
/* Translate the reported events into the readiness flags. */
860 for (i = 0; i < nfds; i++) {
861 if (ev[i].data.fd == STDIN_FILENO) {
863 if (ev[i].events & (EPOLLIN|EPOLLHUP))
864 stdin_readable = true;
866 } else if (ev[i].data.fd == STDOUT_FILENO) {
868 if (ev[i].events & (EPOLLOUT|EPOLLHUP))
869 stdout_writable = true;
871 } else if (ev[i].data.fd == master) {
873 if (ev[i].events & (EPOLLIN|EPOLLHUP))
874 master_readable = true;
876 if (ev[i].events & (EPOLLOUT|EPOLLHUP))
877 master_writable = true;
879 } else if (ev[i].data.fd == signal_fd) {
880 struct signalfd_siginfo sfsi;
883 n = read(signal_fd, &sfsi, sizeof(sfsi));
884 if (n != sizeof(sfsi)) {
887 log_error("Failed to read from signalfd: invalid block size");
892 if (errno != EINTR && errno != EAGAIN) {
893 log_error("Failed to read from signalfd: %m");
899 if (sfsi.ssi_signo == SIGWINCH) {
902 /* The window size changed, let's forward that. */
903 if (ioctl(STDIN_FILENO, TIOCGWINSZ, &ws) >= 0)
904 ioctl(master, TIOCSWINSZ, &ws);
/* Keep shovelling while at least one direction can make progress. */
913 while ((stdin_readable && in_buffer_full <= 0) ||
914 (master_writable && in_buffer_full > 0) ||
915 (master_readable && out_buffer_full <= 0) ||
916 (stdout_writable && out_buffer_full > 0)) {
918 if (stdin_readable && in_buffer_full < LINE_MAX) {
920 k = read(STDIN_FILENO, in_buffer + in_buffer_full, LINE_MAX - in_buffer_full);
923 if (errno == EAGAIN || errno == EPIPE || errno == ECONNRESET || errno == EIO)
924 stdin_readable = false;
926 log_error("read(): %m");
931 in_buffer_full += (size_t) k;
934 if (master_writable && in_buffer_full > 0) {
936 k = write(master, in_buffer, in_buffer_full);
939 if (errno == EAGAIN || errno == EPIPE || errno == ECONNRESET || errno == EIO)
940 master_writable = false;
942 log_error("write(): %m");
/* Move the bytes that were not written to the front of the buffer. */
948 assert(in_buffer_full >= (size_t) k);
949 memmove(in_buffer, in_buffer + k, in_buffer_full - k);
954 if (master_readable && out_buffer_full < LINE_MAX) {
956 k = read(master, out_buffer + out_buffer_full, LINE_MAX - out_buffer_full);
959 if (errno == EAGAIN || errno == EPIPE || errno == ECONNRESET || errno == EIO)
960 master_readable = false;
962 log_error("read(): %m");
967 out_buffer_full += (size_t) k;
970 if (stdout_writable && out_buffer_full > 0) {
972 k = write(STDOUT_FILENO, out_buffer, out_buffer_full);
975 if (errno == EAGAIN || errno == EPIPE || errno == ECONNRESET || errno == EIO)
976 stdout_writable = false;
978 log_error("write(): %m");
984 assert(out_buffer_full >= (size_t) k);
985 memmove(out_buffer, out_buffer + k, out_buffer_full - k);
986 out_buffer_full -= k;
994 close_nointr_nofail(ep);
997 close_nointr_nofail(signal_fd);
/* Entry point: spawn a namespace container rooted at arg_directory.
 *
 * Outline of what is visible here:
 *  1. Parse options, resolve the container directory and run sanity checks
 *     (root privileges, systemd host, not "/", looks like an OS tree).
 *  2. Create a cgroup named <current>/nspawn-<pid> and attach to it, also in
 *     every controller listed in arg_controllers.
 *  3. Allocate a pty as the container console and switch the invoking
 *     terminal to raw mode.
 *  4. clone() a child in new IPC, mount, PID and UTS namespaces (plus a
 *     network namespace with --private-network).
 *  5. Child: set up the mounts and device nodes, move into the new root, drop
 *     capabilities, optionally switch user, then execute init, the given
 *     command or a login bash.
 *  6. Parent: relay the pty with process_pty(), restore the terminal, wait
 *     for the child and clean up the cgroup.
 *
 * The messages for failed setresgid()/setresuid() calls name those calls. */
1002 int main(int argc, char *argv[]) {
1004 int r = EXIT_FAILURE, k;
1005 char *oldcg = NULL, *newcg = NULL;
1006 char **controller = NULL;
1008 const char *console = NULL;
1009 struct termios saved_attr, raw_attr;
1011 bool saved_attr_valid = false;
1013 int kmsg_socket_pair[2] = { -1, -1 };
1015 log_parse_environment();
1018 r = parse_argv(argc, argv);
1022 if (arg_directory) {
1025 p = path_make_absolute_cwd(arg_directory);
1026 free(arg_directory);
1029 arg_directory = get_current_dir_name();
1031 if (!arg_directory) {
1032 log_error("Failed to determine path");
1036 path_kill_slashes(arg_directory);
1038 if (geteuid() != 0) {
1039 log_error("Need to be root.");
1043 if (sd_booted() <= 0) {
1044 log_error("Not running on a systemd system.");
1048 if (path_equal(arg_directory, "/")) {
1049 log_error("Spawning container on root directory not supported.");
1053 if (is_os_tree(arg_directory) <= 0) {
1054 log_error("Directory %s doesn't look like an OS root directory. Refusing.", arg_directory);
/* Create a dedicated cgroup below the current one and attach to it. */
1058 k = cg_get_by_pid(SYSTEMD_CGROUP_CONTROLLER, 0, &oldcg);
1060 log_error("Failed to determine current cgroup: %s", strerror(-k));
1064 if (asprintf(&newcg, "%s/nspawn-%lu", oldcg, (unsigned long) getpid()) < 0) {
1065 log_error("Failed to allocate cgroup path.");
1069 k = cg_create_and_attach(SYSTEMD_CGROUP_CONTROLLER, newcg, 0);
1071 log_error("Failed to create cgroup: %s", strerror(-k));
/* Failures in the additional controllers only produce a warning. */
1075 STRV_FOREACH(controller, arg_controllers) {
1076 k = cg_create_and_attach(*controller, newcg, 0);
1078 log_warning("Failed to create cgroup in controller %s: %s", *controller, strerror(-k));
/* The pty master stays with the parent; its slave becomes the container console. */
1081 master = posix_openpt(O_RDWR|O_NOCTTY|O_CLOEXEC|O_NDELAY);
1083 log_error("Failed to acquire pseudo tty: %m");
1087 console = ptsname(master);
1089 log_error("Failed to determine tty name: %m");
1093 log_info("Spawning namespace container on %s (console is %s).", arg_directory, console);
1095 if (ioctl(STDIN_FILENO, TIOCGWINSZ, &ws) >= 0)
1096 ioctl(master, TIOCSWINSZ, &ws);
1098 if (unlockpt(master) < 0) {
1099 log_error("Failed to unlock tty: %m");
/* Put the invoking terminal into raw mode; saved_attr is restored on exit. */
1103 if (tcgetattr(STDIN_FILENO, &saved_attr) < 0) {
1104 log_error("Failed to get terminal attributes: %m");
1108 saved_attr_valid = true;
1110 raw_attr = saved_attr;
1111 cfmakeraw(&raw_attr);
1112 raw_attr.c_lflag &= ~ECHO;
1114 if (tcsetattr(STDIN_FILENO, TCSANOW, &raw_attr) < 0) {
1115 log_error("Failed to set terminal attributes: %m");
/* Socket pair over which the child sends the kmsg FIFO descriptor, see setup_kmsg(). */
1119 if (socketpair(AF_UNIX, SOCK_DGRAM|SOCK_NONBLOCK|SOCK_CLOEXEC, 0, kmsg_socket_pair) < 0) {
1120 log_error("Failed to create kmsg socket pair");
/* Block these signals; process_pty() receives them through a signalfd instead. */
1124 assert_se(sigemptyset(&mask) == 0);
1125 sigset_add_many(&mask, SIGCHLD, SIGWINCH, SIGTERM, SIGINT, -1);
1126 assert_se(sigprocmask(SIG_BLOCK, &mask, NULL) == 0);
1128 pid = syscall(__NR_clone, SIGCHLD|CLONE_NEWIPC|CLONE_NEWNS|CLONE_NEWPID|CLONE_NEWUTS|(arg_private_network ? CLONE_NEWNET : 0), NULL);
1130 if (errno == EINVAL)
1131 log_error("clone() failed, do you have namespace support enabled in your kernel? (You need UTS, IPC, PID and NET namespacing built in): %m");
1133 log_error("clone() failed: %m");
/* Child side: environment handed to the container payload. */
1141 const char *home = NULL;
1142 uid_t uid = (uid_t) -1;
1143 gid_t gid = (gid_t) -1;
1144 const char *envp[] = {
1145 "PATH=/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin",
1146 "container=systemd-nspawn", /* LXC sets container=lxc, so follow the scheme here */
1151 NULL, /* container_uuid */
1155 envp[2] = strv_find_prefix(environ, "TERM=");
1157 close_nointr_nofail(master);
1159 close_nointr(STDIN_FILENO);
1160 close_nointr(STDOUT_FILENO);
1161 close_nointr(STDERR_FILENO);
/* Close everything except the child's end of the kmsg socket pair. */
1163 close_all_fds(&kmsg_socket_pair[1], 1);
1165 reset_all_signal_handlers();
1167 assert_se(sigemptyset(&mask) == 0);
1168 assert_se(sigprocmask(SIG_SETMASK, &mask, NULL) == 0);
/* Reopen the console as stdin and duplicate it onto stdout and stderr. */
1170 if (open_terminal(console, O_RDWR) != STDIN_FILENO ||
1171 dup2(STDIN_FILENO, STDOUT_FILENO) != STDOUT_FILENO ||
1172 dup2(STDIN_FILENO, STDERR_FILENO) != STDERR_FILENO)
1176 log_error("setsid() failed: %m");
/* The child is killed with SIGKILL when the parent dies. */
1180 if (prctl(PR_SET_PDEATHSIG, SIGKILL) < 0) {
1181 log_error("PR_SET_PDEATHSIG failed: %m");
1185 /* Mark everything as slave, so that we still
1186 * receive mounts from the real root, but don't
1187 * propagate mounts to the real root. */
1188 if (mount(NULL, "/", NULL, MS_SLAVE|MS_REC, NULL) < 0) {
1189 log_error("MS_SLAVE|MS_REC failed: %m");
1193 /* Turn directory into bind mount */
1194 if (mount(arg_directory, arg_directory, "bind", MS_BIND|MS_REC, NULL) < 0) {
1195 log_error("Failed to make bind mount.");
1200 if (mount(arg_directory, arg_directory, "bind", MS_BIND|MS_REMOUNT|MS_RDONLY|MS_REC, NULL) < 0) {
1201 log_error("Failed to make read-only.");
1205 if (mount_all(arg_directory) < 0)
1208 if (copy_devnodes(arg_directory) < 0)
1211 dev_setup(arg_directory);
1213 if (setup_dev_console(arg_directory, console) < 0)
1216 if (setup_kmsg(arg_directory, kmsg_socket_pair[1]) < 0)
1219 close_nointr_nofail(kmsg_socket_pair[1]);
1221 if (setup_timezone(arg_directory) < 0)
1224 if (setup_resolv_conf(arg_directory) < 0)
1227 if (setup_journal(arg_directory) < 0)
/* Make the container directory the new root: move the mount to "/" and chroot into it. */
1230 if (chdir(arg_directory) < 0) {
1231 log_error("chdir(%s) failed: %m", arg_directory);
1235 if (mount(arg_directory, "/", NULL, MS_MOVE, NULL) < 0) {
1236 log_error("mount(MS_MOVE) failed: %m");
1240 if (chroot(".") < 0) {
1241 log_error("chroot() failed: %m");
1245 if (chdir("/") < 0) {
1246 log_error("chdir() failed: %m");
1254 if (drop_capabilities() < 0) {
1255 log_error("drop_capabilities() failed: %m");
/* Resolve the requested user, create its home directory and switch
 * credentials: groups first, then gid, then uid. */
1261 if (get_user_creds((const char**)&arg_user, &uid, &gid, &home, NULL) < 0) {
1262 log_error("get_user_creds() failed: %m");
1266 if (mkdir_parents_label(home, 0775) < 0) {
1267 log_error("mkdir_parents_label() failed: %m");
1271 if (mkdir_safe_label(home, 0775, uid, gid) < 0) {
1272 log_error("mkdir_safe_label() failed: %m");
1276 if (initgroups((const char*)arg_user, gid) < 0) {
1277 log_error("initgroups() failed: %m");
1281 if (setresgid(gid, gid, gid) < 0) {
1282 log_error("setresgid() failed: %m");
1286 if (setresuid(uid, uid, uid) < 0) {
1287 log_error("setresuid() failed: %m");
/* Fill the remaining environment slots. */
1292 if ((asprintf((char**)(envp + 3), "HOME=%s", home ? home: "/root") < 0) ||
1293 (asprintf((char**)(envp + 4), "USER=%s", arg_user ? arg_user : "root") < 0) ||
1294 (asprintf((char**)(envp + 5), "LOGNAME=%s", arg_user ? arg_user : "root") < 0)) {
1300 if (asprintf((char**)(envp + 6), "container_uuid=%s", arg_uuid) < 0) {
1312 /* Automatically search for the init system */
1314 l = 1 + argc - optind;
1315 a = newa(char*, l + 1);
1316 memcpy(a + 1, argv + optind, l * sizeof(char*));
/* Try the known init locations in order; execve() only returns on failure. */
1318 a[0] = (char*) "/usr/lib/systemd/systemd";
1319 execve(a[0], a, (char**) envp);
1321 a[0] = (char*) "/lib/systemd/systemd";
1322 execve(a[0], a, (char**) envp);
1324 a[0] = (char*) "/sbin/init";
1325 execve(a[0], a, (char**) envp);
1326 } else if (argc > optind)
1327 execvpe(argv[optind], argv + optind, (char**) envp);
/* Otherwise start bash as a login shell from the home directory. */
1329 chdir(home ? home : "/root");
1330 execle("/bin/bash", "-bash", NULL, (char**) envp);
1333 log_error("execv() failed: %m");
1336 _exit(EXIT_FAILURE);
/* Parent side: relay terminal traffic until process_pty() returns. */
1339 if (process_pty(master, &mask) < 0)
1342 if (saved_attr_valid) {
1343 tcsetattr(STDIN_FILENO, TCSANOW, &saved_attr);
1344 saved_attr_valid = false;
1347 r = wait_for_terminate_and_warn(argc > optind ? argv[optind] : "bash", pid);
1353 if (saved_attr_valid)
1354 tcsetattr(STDIN_FILENO, TCSANOW, &saved_attr);
1357 close_nointr_nofail(master);
1359 close_pipe(kmsg_socket_pair);
/* Move back to the original cgroup, then kill whatever is left in the container's cgroup. */
1362 cg_attach(SYSTEMD_CGROUP_CONTROLLER, oldcg, 0);
1365 cg_kill_recursive_and_wait(SYSTEMD_CGROUP_CONTROLLER, newcg, true);
1367 free(arg_directory);
1368 strv_free(arg_controllers);