1 /*-*- Mode: C; c-basic-offset: 8; indent-tabs-mode: nil -*-*/
4 This file is part of systemd.
6 Copyright 2010 Lennart Poettering
8 systemd is free software; you can redistribute it and/or modify it
9 under the terms of the GNU Lesser General Public License as published by
10 the Free Software Foundation; either version 2.1 of the License, or
11 (at your option) any later version.
13 systemd is distributed in the hope that it will be useful, but
14 WITHOUT ANY WARRANTY; without even the implied warranty of
15 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
16 Lesser General Public License for more details.
18 You should have received a copy of the GNU Lesser General Public License
19 along with systemd; If not, see <http://www.gnu.org/licenses/>.
25 #include <sys/types.h>
26 #include <sys/syscall.h>
27 #include <sys/mount.h>
33 #include <sys/prctl.h>
34 #include <sys/capability.h>
37 #include <sys/signalfd.h>
41 #include <sys/socket.h>
42 #include <linux/netlink.h>
44 #include "sd-daemon.h"
53 #include "cgroup-util.h"
55 #include "path-util.h"
56 #include "loopback-setup.h"
57 #include "dev-setup.h"
62 #include "bus-error.h"
64 #include "bus-kernel.h"
/* Journal linking mode. The enumerators referenced elsewhere in this file are
 * LINK_NO, LINK_AUTO, LINK_GUEST and LINK_HOST. */
70 typedef enum LinkJournal {
/* Command line settings. parse_argv() fills most of these in; the setup
 * helpers and main() read them. */
77 static char *arg_directory = NULL;
78 static char *arg_user = NULL;
79 static sd_id128_t arg_uuid = {};
80 static char *arg_machine = NULL;
/* NOTE(review): declared const, yet parse_argv() stores a strdup() result
 * here. */
81 static const char *arg_slice = NULL;
82 static bool arg_private_network = false;
83 static bool arg_read_only = false;
84 static bool arg_boot = false;
85 static LinkJournal arg_link_journal = LINK_AUTO;
/* Bit mask of capabilities kept for the container, one bit per CAP_*
 * constant. drop_capabilities() hands the complement of this mask to
 * capability_bounding_set_drop(); parse_argv() sets or clears single bits
 * while handling the capability options. */
86 static uint64_t arg_retain =
88 (1ULL << CAP_DAC_OVERRIDE) |
89 (1ULL << CAP_DAC_READ_SEARCH) |
90 (1ULL << CAP_FOWNER) |
91 (1ULL << CAP_FSETID) |
92 (1ULL << CAP_IPC_OWNER) |
95 (1ULL << CAP_LINUX_IMMUTABLE) |
96 (1ULL << CAP_NET_BIND_SERVICE) |
97 (1ULL << CAP_NET_BROADCAST) |
98 (1ULL << CAP_NET_RAW) |
99 (1ULL << CAP_SETGID) |
100 (1ULL << CAP_SETFCAP) |
101 (1ULL << CAP_SETPCAP) |
102 (1ULL << CAP_SETUID) |
103 (1ULL << CAP_SYS_ADMIN) |
104 (1ULL << CAP_SYS_CHROOT) |
105 (1ULL << CAP_SYS_NICE) |
106 (1ULL << CAP_SYS_PTRACE) |
107 (1ULL << CAP_SYS_TTY_CONFIG) |
108 (1ULL << CAP_SYS_RESOURCE) |
109 (1ULL << CAP_SYS_BOOT) |
110 (1ULL << CAP_AUDIT_WRITE) |
111 (1ULL << CAP_AUDIT_CONTROL);
/* Flat string lists holding (source, destination) path pairs. parse_argv()
 * appends two entries per bind option and mount_binds() walks them pairwise;
 * arg_bind_ro is mounted with MS_RDONLY by main(). */
112 static char **arg_bind = NULL;
113 static char **arg_bind_ro = NULL;
/* Prints the usage summary with printf(), inserting
 * program_invocation_short_name as the program name.
 *
 * NOTE(review): the text for -j says it equals --link-journal=host, while
 * parse_argv() contains an assignment of LINK_GUEST next to the journal
 * options. Confirm which of the two is intended and align the other. */
115 static int help(void) {
117 printf("%s [OPTIONS...] [PATH] [ARGUMENTS...]\n\n"
118 "Spawn a minimal namespace container for debugging, testing and building.\n\n"
119 " -h --help Show this help\n"
120 " --version Print version string\n"
121 " -D --directory=NAME Root directory for the container\n"
122 " -b --boot Boot up full system (i.e. invoke init)\n"
123 " -u --user=USER Run the command under specified user or uid\n"
124 " --uuid=UUID Set a specific machine UUID for the container\n"
125 " -M --machine=NAME Set the machine name for the container\n"
126 " -S --slice=SLICE Place the container in the specified slice\n"
127 " --private-network Disable network in container\n"
128 " --read-only Mount the root directory read-only\n"
129 " --capability=CAP In addition to the default, retain specified\n"
131 " --drop-capability=CAP Drop the specified capability from the default set\n"
132 " --link-journal=MODE Link up guest journal, one of no, auto, guest, host\n"
133 " -j Equivalent to --link-journal=host\n"
134 " --bind=PATH[:PATH] Bind mount a file or directory from the host into\n"
136 " --bind-ro=PATH[:PATH] Similar, but creates a read-only bind mount\n",
137 program_invocation_short_name);
/* Parses the command line with getopt_long() and records the result in the
 * arg_* globals.
 *
 * argc, argv: the arguments as received by main().
 *
 * The return statements are not part of the visible text; main() stores the
 * result in an int. */
142 static int parse_argv(int argc, char *argv[]) {
/* Long options; entries without a short form map to ARG_* codes. */
156 static const struct option options[] = {
157 { "help", no_argument, NULL, 'h' },
158 { "version", no_argument, NULL, ARG_VERSION },
159 { "directory", required_argument, NULL, 'D' },
160 { "user", required_argument, NULL, 'u' },
161 { "private-network", no_argument, NULL, ARG_PRIVATE_NETWORK },
162 { "boot", no_argument, NULL, 'b' },
163 { "uuid", required_argument, NULL, ARG_UUID },
164 { "read-only", no_argument, NULL, ARG_READ_ONLY },
165 { "capability", required_argument, NULL, ARG_CAPABILITY },
166 { "drop-capability", required_argument, NULL, ARG_DROP_CAPABILITY },
167 { "link-journal", required_argument, NULL, ARG_LINK_JOURNAL },
168 { "bind", required_argument, NULL, ARG_BIND },
169 { "bind-ro", required_argument, NULL, ARG_BIND_RO },
170 { "machine", required_argument, NULL, 'M' },
171 { "slice", required_argument, NULL, 'S' },
/* The leading + in the short option string is the GNU getopt way of stopping
 * at the first non-option argument, so options that follow PATH are left
 * untouched for the command run inside the container. */
180 while ((c = getopt_long(argc, argv, "+hD:u:bM:jS:", options, NULL)) >= 0) {
188 puts(PACKAGE_STRING);
189 puts(SYSTEMD_FEATURES);
/* The directory must exist: canonicalize_file_name() resolves it and the
 * resulting string is owned by arg_directory (main() frees it). */
194 arg_directory = canonicalize_file_name(optarg);
195 if (!arg_directory) {
196 log_error("Invalid root directory: %m");
204 arg_user = strdup(optarg);
210 case ARG_PRIVATE_NETWORK:
211 arg_private_network = true;
219 r = sd_id128_from_string(optarg, &arg_uuid);
221 log_error("Invalid UUID: %s", optarg);
227 arg_slice = strdup(optarg);
/* The machine name has to pass hostname_is_valid() before it is copied. */
234 if (!hostname_is_valid(optarg)) {
235 log_error("Invalid machine name: %s", optarg);
240 arg_machine = strdup(optarg);
247 arg_read_only = true;
/* Capability options take a comma separated list of capability names. Each
 * name is copied, translated with cap_from_name() and then either added to or
 * removed from arg_retain depending on which option is being handled. */
251 case ARG_DROP_CAPABILITY: {
255 FOREACH_WORD_SEPARATOR(word, length, optarg, ",", state) {
259 t = strndup(word, length);
263 if (cap_from_name(t, &cap) < 0) {
264 log_error("Failed to parse capability %s.", t);
271 if (c == ARG_CAPABILITY)
272 arg_retain |= 1ULL << (uint64_t) cap;
274 arg_retain &= ~(1ULL << (uint64_t) cap);
/* NOTE(review): this assigns LINK_GUEST, whereas help() describes -j as the
 * equivalent of --link-journal=host. Presumably this is the -j case. */
281 arg_link_journal = LINK_GUEST;
284 case ARG_LINK_JOURNAL:
285 if (streq(optarg, "auto"))
286 arg_link_journal = LINK_AUTO;
287 else if (streq(optarg, "no"))
288 arg_link_journal = LINK_NO;
289 else if (streq(optarg, "guest"))
290 arg_link_journal = LINK_GUEST;
291 else if (streq(optarg, "host"))
292 arg_link_journal = LINK_HOST;
294 log_error("Failed to parse link journal mode %s", optarg);
/* Bind options: x selects the list to extend. The argument is split at the
 * first colon, a receiving the part in front of it. Both paths must be
 * absolute and are appended to the list as a pair, source first. */
302 _cleanup_free_ char *a = NULL, *b = NULL;
306 x = c == ARG_BIND ? &arg_bind : &arg_bind_ro;
308 e = strchr(optarg, ':');
310 a = strndup(optarg, e - optarg);
320 if (!path_is_absolute(a) || !path_is_absolute(b)) {
321 log_error("Invalid bind mount specification: %s", optarg);
325 r = strv_extend(x, a);
329 r = strv_extend(x, b);
340 assert_not_reached("Unhandled option");
/* Mounts the kernel API and temporary file systems listed in mount_table
 * underneath dest.
 *
 * dest: root directory of the container; every table target is joined to it. */
347 static int mount_all(const char *dest) {
349 typedef struct MountPoint {
/* Columns as used by the loop below: what (source, NULL for a remount),
 * where (target relative to dest), a file system type, options, mount flags
 * and finally fatal. Order matters: bind mounts are listed directly before
 * the remount entry that turns them read-only. */
358 static const MountPoint mount_table[] = {
359 { "proc", "/proc", "proc", NULL, MS_NOSUID|MS_NOEXEC|MS_NODEV, true },
360 { "/proc/sys", "/proc/sys", NULL, NULL, MS_BIND, true }, /* Bind mount first */
361 { NULL, "/proc/sys", NULL, NULL, MS_BIND|MS_RDONLY|MS_REMOUNT, true }, /* Then, make it r/o */
362 { "sysfs", "/sys", "sysfs", NULL, MS_RDONLY|MS_NOSUID|MS_NOEXEC|MS_NODEV, true },
363 { "tmpfs", "/dev", "tmpfs", "mode=755", MS_NOSUID|MS_STRICTATIME, true },
364 { "devpts", "/dev/pts", "devpts","newinstance,ptmxmode=0666,mode=620,gid=" STRINGIFY(TTY_GID), MS_NOSUID|MS_NOEXEC, true },
365 { "tmpfs", "/dev/shm", "tmpfs", "mode=1777", MS_NOSUID|MS_NODEV|MS_STRICTATIME, true },
366 { "tmpfs", "/run", "tmpfs", "mode=755", MS_NOSUID|MS_NODEV|MS_STRICTATIME, true },
368 { "/sys/fs/selinux", "/sys/fs/selinux", NULL, NULL, MS_BIND, false }, /* Bind mount first */
369 { NULL, "/sys/fs/selinux", NULL, NULL, MS_BIND|MS_RDONLY|MS_REMOUNT, false }, /* Then, make it r/o */
376 for (k = 0; k < ELEMENTSOF(mount_table); k++) {
377 _cleanup_free_ char *where = NULL;
380 where = strjoin(dest, "/", mount_table[k].where, NULL);
384 t = path_is_mount_point(where, true);
386 log_error("Failed to detect whether %s is a mount point: %s", where, strerror(-t));
/* Entries with a source are fresh mounts and are skipped when the target is
 * already a mount point; entries with a NULL source are remounts and always
 * proceed. */
394 /* Skip this entry if it is not a remount. */
395 if (mount_table[k].what && t > 0)
/* The result of mkdir_p() is not checked here; a missing target shows up as a
 * mount() failure instead. */
398 mkdir_p(where, 0755);
/* A failed mount() is only reported when the entry is marked fatal. */
400 if (mount(mount_table[k].what,
403 mount_table[k].flags,
404 mount_table[k].options) < 0 &&
405 mount_table[k].fatal) {
407 log_error("mount(%s) failed: %m", where);
/* Bind mounts host paths into the container.
 *
 * dest:  root directory of the container.
 * l:     flat list of (source, destination) pairs as built by parse_argv().
 * flags: extra mount flags; when non-zero the fresh bind mount is remounted
 *        with MS_REMOUNT|MS_BIND plus these flags (main() passes MS_RDONLY
 *        for the read-only list). */
417 static int mount_binds(const char *dest, char **l, unsigned long flags) {
420 STRV_FOREACH_PAIR(x, y, l) {
421 _cleanup_free_ char *where = NULL;
422 struct stat source_st, dest_st;
/* The source has to exist on the host. */
424 if (stat(*x, &source_st) < 0) {
425 log_error("failed to stat %s: %m", *x);
429 where = strjoin(dest, "/", *y, NULL);
/* An existing target must have the same file type as the source. */
433 if (stat(where, &dest_st) == 0) {
434 if ((source_st.st_mode & S_IFMT) != (dest_st.st_mode & S_IFMT)) {
435 log_error("The file types of %s and %s do not match. Refusing bind mount",
/* Otherwise a mount point of the matching type is created. The results of
 * the creation calls visible below are not checked. */
440 /* Create the mount point, but be conservative -- refuse to create block
441 * and char devices. */
442 if (S_ISDIR(source_st.st_mode))
443 mkdir_p_label(where, 0755);
444 else if (S_ISFIFO(source_st.st_mode))
446 else if (S_ISSOCK(source_st.st_mode))
447 mknod(where, 0644 | S_IFSOCK, 0);
448 else if (S_ISREG(source_st.st_mode))
451 log_error("Refusing to create mountpoint for file: %s", *x);
456 if (mount(*x, where, "bind", MS_BIND, NULL) < 0) {
457 log_error("mount(%s) failed: %m", where);
/* Extra flags are applied with a second, remounting call on the bind mount. */
461 if (flags && mount(NULL, where, NULL, MS_REMOUNT|MS_BIND|flags, NULL) < 0) {
462 log_error("mount(%s) failed: %m", where);
/* Points the /etc/localtime symlink of the container at the same time zone
 * the host uses, provided the host link leads into /usr/share/zoneinfo/ and
 * the zone file exists in the container.
 *
 * dest: root directory of the container. */
470 static int setup_timezone(const char *dest) {
471 _cleanup_free_ char *where = NULL, *p = NULL, *q = NULL, *check = NULL, *what = NULL;
477 /* Fix the timezone, if possible */
478 r = readlink_malloc("/etc/localtime", &p);
480 log_warning("/etc/localtime is not a symlink, not updating container timezone.");
/* z is the zone name of the host: the link target with the zoneinfo prefix
 * removed. Both the relative and the absolute prefix are tried. */
484 z = path_startswith(p, "../usr/share/zoneinfo/");
486 z = path_startswith(p, "/usr/share/zoneinfo/");
488 log_warning("/etc/localtime does not point into /usr/share/zoneinfo/, not updating container timezone.");
492 where = strappend(dest, "/etc/localtime");
/* y is the zone name the container link currently refers to, if any. */
496 r = readlink_malloc(where, &q);
498 y = path_startswith(q, "../usr/share/zoneinfo/");
500 y = path_startswith(q, "/usr/share/zoneinfo/");
503 /* Already pointing to the right place? Then do nothing .. */
504 if (y && streq(y, z))
/* The zone file is looked up inside the container tree before linking. */
508 check = strjoin(dest, "/usr/share/zoneinfo/", z, NULL);
512 if (access(check, F_OK) < 0) {
513 log_warning("Timezone %s does not exist in container, not updating container timezone.", z);
/* The new link target is relative so that it resolves inside the container. */
517 what = strappend("../usr/share/zoneinfo/", z);
522 if (symlink(what, where) < 0) {
523 log_error("Failed to correct timezone of container: %m");
/* Copies /etc/resolv.conf of the host over the one in the container on a
 * best-effort basis. The arg_private_network check presumably skips the copy
 * when the container has no network access; the statement it guards is not
 * visible here.
 *
 * dest: root directory of the container.
 *
 * NOTE(review): the declaration below puts the cleanup attribute after the
 * type, while the other declarations in this file put it first. */
530 static int setup_resolv_conf(const char *dest) {
531 char _cleanup_free_ *where = NULL;
535 if (arg_private_network)
538 /* Fix resolv.conf, if possible */
539 where = strappend(dest, "/etc/resolv.conf");
543 /* We don't really care for the results of this really. If it
544 * fails, it fails, but meh... */
545 copy_file("/etc/resolv.conf", where, O_TRUNC|O_NOFOLLOW);
/* Gives the container its own boot ID: a random 128 bit ID is written in UUID
 * notation to a file below dest/dev, and that file is bind mounted over
 * dest/proc/sys/kernel/random/boot_id.
 *
 * dest: root directory of the container. */
550 static int setup_boot_id(const char *dest) {
551 _cleanup_free_ char *from = NULL, *to = NULL;
558 /* Generate a new randomized boot ID, so that each boot-up of
559 * the container gets a new one */
561 from = strappend(dest, "/dev/proc-sys-kernel-random-boot-id");
562 to = strappend(dest, "/proc/sys/kernel/random/boot_id");
566 r = sd_id128_randomize(&rnd);
568 log_error("Failed to generate random boot id: %s", strerror(-r));
/* Formats the ID with dashes in the 8-4-4-4-12 UUID layout and makes sure the
 * buffer is NUL terminated. */
572 snprintf(as_uuid, sizeof(as_uuid),
573 "%02x%02x%02x%02x-%02x%02x-%02x%02x-%02x%02x-%02x%02x%02x%02x%02x%02x",
574 SD_ID128_FORMAT_VAL(rnd));
575 char_array_0(as_uuid);
577 r = write_string_file(from, as_uuid);
579 log_error("Failed to write boot id: %s", strerror(-r));
/* The bind mount failing is an error; failing to turn it read-only afterwards
 * only produces a warning. */
583 if (mount(from, to, "bind", MS_BIND, NULL) < 0) {
584 log_error("Failed to bind mount boot id: %m");
586 } else if (mount(from, to, "bind", MS_BIND|MS_REMOUNT|MS_RDONLY, NULL))
587 log_warning("Failed to make boot id read-only: %m");
/* Recreates a fixed set of host device nodes inside the container.
 *
 * dest: root directory of the container; nodes are created below dest/dev.
 *
 * For every name in devnodes the host node /dev/NAME is examined with stat().
 * A node missing on the host (ENOENT) is not treated as an error, anything
 * that is neither a character nor a block device is rejected, and otherwise a
 * node with the same mode and device number is created with mknod(). */
593 static int copy_devnodes(const char *dest) {
595 static const char devnodes[] =
605 _cleanup_umask_ mode_t u;
611 NULSTR_FOREACH(d, devnodes) {
613 _cleanup_free_ char *from = NULL, *to = NULL;
/* from is the host node, to the node to create inside the container. */
615 asprintf(&from, "/dev/%s", d);
616 asprintf(&to, "%s/dev/%s", dest, d);
627 if (stat(from, &st) < 0) {
629 if (errno != ENOENT) {
630 log_error("Failed to stat %s: %m", from);
635 } else if (!S_ISCHR(st.st_mode) && !S_ISBLK(st.st_mode)) {
637 log_error("%s is not a char or block device, cannot copy", from);
641 } else if (mknod(to, st.st_mode, st.st_rdev) < 0) {
/* Report the node that could not be created, not just the container root. */
643 log_error("mknod(%s) failed: %m", to);
/* Creates dest/dev/ptmx as a relative symlink to pts/ptmx, that is to the
 * multiplexer of the devpts instance mounted at dest/dev/pts by mount_all().
 *
 * dest: root directory of the container. */
652 static int setup_ptmx(const char *dest) {
653 _cleanup_free_ char *p = NULL;
655 p = strappend(dest, "/dev/ptmx");
659 if (symlink("pts/ptmx", p) < 0) {
660 log_error("Failed to create /dev/ptmx symlink: %m");
/* Makes the pseudo terminal allocated by main() appear as /dev/console inside
 * the container.
 *
 * dest:    root directory of the container.
 * console: path of the pty slave on the host.
 *
 * Steps visible below: console must be a character device; its access mode is
 * set to 0600 with owner and group 0; a device node with the same type and
 * device number and mode 0600 is created at dest/dev/console; console is then
 * bind mounted on top of that node.
 *
 * NOTE(review): the message for the character device check names
 * /dev/console although the path that was examined is console. */
667 static int setup_dev_console(const char *dest, const char *console) {
669 _cleanup_free_ char *to = NULL;
671 _cleanup_umask_ mode_t u;
678 if (stat(console, &st) < 0) {
679 log_error("Failed to stat %s: %m", console);
682 } else if (!S_ISCHR(st.st_mode)) {
683 log_error("/dev/console is not a char device");
687 r = chmod_and_chown(console, 0600, 0, 0);
689 log_error("Failed to correct access mode for TTY: %s", strerror(-r));
693 if (asprintf(&to, "%s/dev/console", dest) < 0)
696 /* We need to bind mount the right tty to /dev/console since
697 * ptys can only exist on pts file systems. To have something
698 * to bind mount things on we create a device node first, that
699 * has the right major/minor (note that the major minor
700 * doesn't actually matter here, since we mount it over
703 if (mknod(to, (st.st_mode & ~07777) | 0600, st.st_rdev) < 0) {
704 log_error("mknod() for /dev/console failed: %m");
708 if (mount(console, to, "bind", MS_BIND, NULL) < 0) {
709 log_error("Bind mount for /dev/console failed: %m");
716 static int setup_kmsg(const char *dest, int kmsg_socket) {
717 _cleanup_free_ char *from = NULL, *to = NULL;
719 _cleanup_umask_ mode_t u;
721 struct cmsghdr cmsghdr;
722 uint8_t buf[CMSG_SPACE(sizeof(int))];
725 .msg_control = &control,
726 .msg_controllen = sizeof(control),
728 struct cmsghdr *cmsg;
731 assert(kmsg_socket >= 0);
735 /* We create the kmsg FIFO as /dev/kmsg, but immediately
736 * delete it after bind mounting it to /proc/kmsg. While FIFOs
737 * on the reading side behave very similar to /proc/kmsg,
738 * their writing side behaves differently from /dev/kmsg in
739 * that writing blocks when nothing is reading. In order to
740 * avoid any problems with containers deadlocking due to this
741 * we simply make /dev/kmsg unavailable to the container. */
/* Summary of setup_kmsg(dest, kmsg_socket): dest is the root directory of the
 * container and kmsg_socket one end of the socket pair created by main(). The
 * steps are: create a FIFO at dest/dev/kmsg, force mode 0600 with owner and
 * group 0, bind mount it onto dest/proc/kmsg, open it, and pass the open file
 * descriptor through kmsg_socket as SCM_RIGHTS ancillary data. */
742 if (asprintf(&from, "%s/dev/kmsg", dest) < 0 ||
743 asprintf(&to, "%s/proc/kmsg", dest) < 0)
746 if (mkfifo(from, 0600) < 0) {
747 log_error("mkfifo() for /dev/kmsg failed: %m");
751 r = chmod_and_chown(from, 0600, 0, 0);
753 log_error("Failed to correct access mode for /dev/kmsg: %s", strerror(-r));
757 if (mount(from, to, "bind", MS_BIND, NULL) < 0) {
758 log_error("Bind mount for /proc/kmsg failed: %m");
/* Opened for reading and writing, and non-blocking, so that the open itself
 * does not wait for a peer on the FIFO. */
762 fd = open(from, O_RDWR|O_NDELAY|O_CLOEXEC);
764 log_error("Failed to open fifo: %m");
/* Build a single control message that carries fd, then shrink
 * msg_controllen to the length actually used. */
768 cmsg = CMSG_FIRSTHDR(&mh);
769 cmsg->cmsg_level = SOL_SOCKET;
770 cmsg->cmsg_type = SCM_RIGHTS;
771 cmsg->cmsg_len = CMSG_LEN(sizeof(int));
772 memcpy(CMSG_DATA(cmsg), &fd, sizeof(int));
774 mh.msg_controllen = cmsg->cmsg_len;
776 /* Store away the fd in the socket, so that it stays open as
777 * long as we run the child */
/* The local copy of fd is closed right after sendmsg(), before the result k
 * is examined; the reference queued in the socket keeps the FIFO open. */
778 k = sendmsg(kmsg_socket, &mh, MSG_DONTWAIT|MSG_NOSIGNAL);
779 close_nointr_nofail(fd);
782 log_error("Failed to send FIFO fd: %m");
786 /* And now make the FIFO unavailable as /dev/kmsg... */
/* Sets the host name to arg_machine with sethostname(). main() runs this in
 * the cloned child, which was created with CLONE_NEWUTS. */
791 static int setup_hostname(void) {
793 if (sethostname(arg_machine, strlen(arg_machine)) < 0)
/* Connects the journal directory of the container with the host according to
 * arg_link_journal.
 *
 * directory: root directory of the container.
 *
 * Two paths are involved, both named after the machine ID read from
 * directory/etc/machine-id:
 *   p = /var/log/journal/ID on the host
 *   q = directory/var/log/journal/ID inside the container tree
 * LINK_NO does nothing. LINK_GUEST makes p a symlink to q. LINK_HOST creates
 * p and bind mounts it onto q. LINK_AUTO tolerates a missing or empty machine
 * ID and existing mount points instead of failing.
 *
 * NOTE(review): several messages below print %m after helpers whose error is
 * returned in r (mkdir_p(), readlink_and_make_absolute()); verify that errno
 * really carries the same error at those points. */
799 static int setup_journal(const char *directory) {
800 sd_id128_t machine_id;
801 _cleanup_free_ char *p = NULL, *b = NULL, *q = NULL, *d = NULL;
805 if (arg_link_journal == LINK_NO)
808 p = strappend(directory, "/etc/machine-id");
812 r = read_one_line_file(p, &b);
813 if (r == -ENOENT && arg_link_journal == LINK_AUTO)
816 log_error("Failed to read machine ID from %s: %s", p, strerror(-r));
821 if (isempty(id) && arg_link_journal == LINK_AUTO)
824 /* Verify validity */
825 r = sd_id128_from_string(id, &machine_id);
827 log_error("Failed to parse machine ID from %s: %s", p, strerror(-r));
/* p is reused for the host journal path from here on. NOTE(review): no
 * release of the previous string is visible before this assignment. */
832 p = strappend("/var/log/journal/", id);
833 q = strjoin(directory, "/var/log/journal/", id, NULL);
/* Neither side may already be a mount point unless the mode is LINK_AUTO. */
837 if (path_is_mount_point(p, false) > 0) {
838 if (arg_link_journal != LINK_AUTO) {
839 log_error("%s: already a mount point, refusing to use for journal", p);
846 if (path_is_mount_point(q, false) > 0) {
847 if (arg_link_journal != LINK_AUTO) {
848 log_error("%s: already a mount point, refusing to use for journal", q);
/* Inspect what currently exists at p: a symlink (target stored in d),
 * something that is not a symlink (-EINVAL), or nothing (-ENOENT). */
855 r = readlink_and_make_absolute(p, &d);
857 if ((arg_link_journal == LINK_GUEST ||
858 arg_link_journal == LINK_AUTO) &&
861 r = mkdir_p(q, 0755);
863 log_warning("failed to create directory %s: %m", q);
868 log_error("Failed to remove symlink %s: %m", p);
871 } else if (r == -EINVAL) {
873 if (arg_link_journal == LINK_GUEST &&
876 if (errno == ENOTDIR) {
877 log_error("%s already exists and is neither a symlink nor a directory", p);
880 log_error("Failed to remove %s: %m", p);
884 } else if (r != -ENOENT) {
885 log_error("readlink(%s) failed: %m", p);
/* LINK_GUEST: the host path becomes a symlink into the container tree. */
889 if (arg_link_journal == LINK_GUEST) {
891 if (symlink(q, p) < 0) {
892 log_error("Failed to symlink %s to %s: %m", q, p);
896 r = mkdir_p(q, 0755);
898 log_warning("failed to create directory %s: %m", q);
/* LINK_HOST creates the host directory; the other remaining modes only go on
 * when it can be accessed. */
902 if (arg_link_journal == LINK_HOST) {
903 r = mkdir_p(p, 0755);
905 log_error("Failed to create %s: %m", p);
909 } else if (access(p, F_OK) < 0)
/* A result of 0 from dir_is_empty() is reported as the guest directory
 * holding data already. */
912 if (dir_is_empty(q) == 0) {
913 log_error("%s not empty.", q);
917 r = mkdir_p(q, 0755);
919 log_error("Failed to create %s: %m", q);
923 if (mount(p, q, "bind", MS_BIND, NULL) < 0) {
924 log_error("Failed to bind mount journal from host into guest: %m");
/* Creates the directory dest/dev/kdbus and bind mounts path onto it.
 *
 * dest: root directory of the container.
 * path: kdbus namespace path; main() passes kdbus_namespace, which stays NULL
 *       when creating the namespace failed. NOTE(review): confirm that the
 *       part of this function not shown here handles a NULL path. */
931 static int setup_kdbus(const char *dest, const char *path) {
937 p = strappenda(dest, "/dev/kdbus");
938 if (mkdir(p, 0755) < 0) {
939 log_error("Failed to create kdbus path: %m");
943 if (mount(path, p, "bind", MS_BIND, NULL) < 0) {
944 log_error("Failed to mount kdbus namespace path: %m");
/* Drops every capability that is not set in arg_retain by passing the
 * complement of that mask to capability_bounding_set_drop(), and returns the
 * result of that call. */
951 static int drop_capabilities(void) {
952 return capability_bounding_set_drop(~arg_retain, false);
/* Announces the container to the machine manager: opens a connection to the
 * system bus and calls a method on the org.freedesktop.machine1.Manager
 * interface. The method name and some arguments are not visible here; the
 * visible ones include arg_uuid, arg_directory and an optional Slice
 * property. */
955 static int register_machine(void) {
956 _cleanup_bus_error_free_ sd_bus_error error = SD_BUS_ERROR_NULL;
957 _cleanup_bus_unref_ sd_bus *bus = NULL;
960 r = sd_bus_open_system(&bus);
962 log_error("Failed to open system bus: %s", strerror(-r));
966 r = sd_bus_call_method(
968 "org.freedesktop.machine1",
969 "/org/freedesktop/machine1",
970 "org.freedesktop.machine1.Manager",
976 SD_BUS_MESSAGE_APPEND_ID128(arg_uuid),
/* The boolean in front of Slice evaluates to 1 only when arg_slice is set,
 * so presumably it acts as the number of properties that follow. */
980 strempty(arg_directory),
981 !isempty(arg_slice), "Slice", "s", arg_slice);
983 log_error("Failed to register machine: %s", bus_error_message(&error, r));
/* Asks the machine manager to terminate the machine that belongs to pid.
 *
 * pid: PID of the container child as returned by clone() in main().
 *
 * Two bus calls are made: one on the Manager interface whose reply carries an
 * object path, and one on the org.freedesktop.machine1.Machine interface.
 * Failures of either call are only logged at debug level. */
990 static int terminate_machine(pid_t pid) {
991 _cleanup_bus_error_free_ sd_bus_error error = SD_BUS_ERROR_NULL;
992 _cleanup_bus_message_unref_ sd_bus_message *reply = NULL;
993 _cleanup_bus_unref_ sd_bus *bus = NULL;
997 r = sd_bus_default_system(&bus);
999 log_error("Failed to open system bus: %s", strerror(-r));
1003 r = sd_bus_call_method(
1005 "org.freedesktop.machine1",
1006 "/org/freedesktop/machine1",
1007 "org.freedesktop.machine1.Manager",
1014 /* Note that the machine might already have been
1015 * cleaned up automatically, hence don't consider it a
1016 * failure if we cannot get the machine object. */
1017 log_debug("Failed to get machine: %s", bus_error_message(&error, r));
/* The reply holds a single object path, read into path. */
1021 r = sd_bus_message_read(reply, "o", &path);
1023 return bus_log_parse_error(r);
1025 r = sd_bus_call_method(
1027 "org.freedesktop.machine1",
1029 "org.freedesktop.machine1.Machine",
1035 log_debug("Failed to terminate machine: %s", bus_error_message(&error, r));
/* Probes for the kernel audit subsystem by opening a raw NETLINK_AUDIT
 * socket and closing it again. The return statements are not visible here;
 * presumably the result reflects whether the socket could be opened. */
1042 static bool audit_enabled(void) {
1045 fd = socket(AF_NETLINK, SOCK_RAW, NETLINK_AUDIT);
1047 close_nointr_nofail(fd);
/* Program entry point. Parses the options, validates the target directory,
 * allocates a pseudo terminal, clones a child into new namespaces which sets
 * up the container file system and executes init or the requested command,
 * then relays the terminal and reports how the container ended. */
1053 int main(int argc, char *argv[]) {
/* r is the exit status of the program and starts out as failure. */
1055 int r = EXIT_FAILURE, k;
1056 _cleanup_close_ int master = -1, kdbus_fd = -1;
1058 const char *console = NULL;
1060 _cleanup_close_pipe_ int kmsg_socket_pair[2] = { -1, -1 };
1061 _cleanup_fdset_free_ FDSet *fds = NULL;
1062 _cleanup_free_ char *kdbus_namespace = NULL;
1064 log_parse_environment();
1067 k = parse_argv(argc, argv);
/* A directory given on the command line is made absolute; without one the
 * current working directory is used. */
1075 if (arg_directory) {
1078 p = path_make_absolute_cwd(arg_directory);
1079 free(arg_directory);
1082 arg_directory = get_current_dir_name();
1084 if (!arg_directory) {
1085 log_error("Failed to determine path, please use -D.");
1089 path_kill_slashes(arg_directory);
/* The machine name is derived from the last component of the directory. */
1092 arg_machine = strdup(path_get_file_name(arg_directory));
1098 hostname_cleanup(arg_machine, false);
1099 if (isempty(arg_machine)) {
1100 log_error("Failed to determine machine name automatically, please use -M.");
/* Preconditions: effective root, a host booted with systemd, a target that is
 * not the host root and that looks like an OS tree. */
1105 if (geteuid() != 0) {
1106 log_error("Need to be root.");
1110 if (sd_booted() <= 0) {
1111 log_error("Not running on a systemd system.");
1115 if (arg_boot && audit_enabled()) {
1116 log_warning("The kernel auditing subsystem is known to be incompatible with containers.\n"
1117 "Please make sure to turn off auditing with 'audit=0' on the kernel command\n"
1118 "line before using systemd-nspawn. Sleeping for 5s...\n");
1122 if (path_equal(arg_directory, "/")) {
1123 log_error("Spawning container on root directory not supported.");
1127 if (path_is_os_tree(arg_directory) <= 0) {
1128 log_error("Directory %s doesn't look like an OS root directory (/etc/os-release is missing). Refusing.", arg_directory);
/* File descriptors received through socket activation are collected in fds
 * so that they can be handed on to the container. */
1133 n_fd_passed = sd_listen_fds(false);
1134 if (n_fd_passed > 0) {
1135 k = fdset_new_listen_fds(&fds, false);
1137 log_error("Failed to collect file descriptors: %s", strerror(-k));
1141 fdset_close_others(fds);
/* Allocate the pty master; console becomes the path of its slave side. */
1144 master = posix_openpt(O_RDWR|O_NOCTTY|O_CLOEXEC|O_NDELAY);
1146 log_error("Failed to acquire pseudo tty: %m");
1150 console = ptsname(master);
1152 log_error("Failed to determine tty name: %m");
1156 log_info("Spawning container %s on %s. Press ^] three times within 1s to abort execution.", arg_machine, arg_directory);
1158 if (unlockpt(master) < 0) {
1159 log_error("Failed to unlock tty: %m");
/* NOTE(review): the result of the call is stored in kdbus_fd, yet the
 * failure message formats strerror(-r); r still holds its initial value
 * here. Check the condition line as well, which is not visible. */
1163 kdbus_fd = bus_kernel_create_namespace(arg_machine, &kdbus_namespace);
1165 log_debug("Failed to create kdbus namespace: %s", strerror(-r));
1167 log_debug("Successfully created kdbus namespace as %s", kdbus_namespace);
/* Socket pair used by setup_kmsg() to park the FIFO file descriptor. */
1169 if (socketpair(AF_UNIX, SOCK_DGRAM|SOCK_NONBLOCK|SOCK_CLOEXEC, 0, kmsg_socket_pair) < 0) {
1170 log_error("Failed to create kmsg socket pair.");
1174 sd_notify(0, "READY=1");
/* These signals are blocked; the mask is later passed to process_pty(). */
1176 assert_se(sigemptyset(&mask) == 0);
1177 sigset_add_many(&mask, SIGCHLD, SIGWINCH, SIGTERM, SIGINT, -1);
1178 assert_se(sigprocmask(SIG_BLOCK, &mask, NULL) == 0);
/* Raw clone system call creating new IPC, mount, PID and UTS namespaces; a
 * network namespace is added only when arg_private_network is set. */
1183 pid = syscall(__NR_clone, SIGCHLD|CLONE_NEWIPC|CLONE_NEWNS|CLONE_NEWPID|CLONE_NEWUTS|(arg_private_network ? CLONE_NEWNET : 0), NULL);
1185 if (errno == EINVAL)
1186 log_error("clone() failed, do you have namespace support enabled in your kernel? (You need UTS, IPC, PID and NET namespacing built in): %m");
1188 log_error("clone() failed: %m");
1195 const char *home = NULL;
1196 uid_t uid = (uid_t) -1;
1197 gid_t gid = (gid_t) -1;
/* Environment of the container payload. The NULL slots are filled in further
 * down, with n_env indexing the next free slot. */
1199 const char *envp[] = {
1200 "PATH=/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin",
1201 "container=systemd-nspawn", /* LXC sets container=lxc, so follow the scheme here */
1206 NULL, /* container_uuid */
1207 NULL, /* LISTEN_FDS */
1208 NULL, /* LISTEN_PID */
/* TERM is taken over from the environment of this process. */
1212 envp[n_env] = strv_find_prefix(environ, "TERM=");
1216 close_nointr_nofail(master);
1219 close_nointr(STDIN_FILENO);
1220 close_nointr(STDOUT_FILENO);
1221 close_nointr(STDERR_FILENO);
1223 close_nointr_nofail(kmsg_socket_pair[0]);
1224 kmsg_socket_pair[0] = -1;
1226 reset_all_signal_handlers();
1228 assert_se(sigemptyset(&mask) == 0);
1229 assert_se(sigprocmask(SIG_SETMASK, &mask, NULL) == 0);
/* With the standard descriptors closed, opening the pty slave is expected to
 * yield descriptor 0; it is then duplicated onto stdout and stderr. */
1231 k = open_terminal(console, O_RDWR);
1232 if (k != STDIN_FILENO) {
1234 close_nointr_nofail(k);
1238 log_error("Failed to open console: %s", strerror(-k));
1242 if (dup2(STDIN_FILENO, STDOUT_FILENO) != STDOUT_FILENO ||
1243 dup2(STDIN_FILENO, STDERR_FILENO) != STDERR_FILENO) {
1244 log_error("Failed to duplicate console: %m");
1249 log_error("setsid() failed: %m");
/* Request SIGKILL for this process when its parent dies. */
1253 if (prctl(PR_SET_PDEATHSIG, SIGKILL) < 0) {
1254 log_error("PR_SET_PDEATHSIG failed: %m");
1258 r = register_machine();
1262 /* Mark everything as slave, so that we still
1263 * receive mounts from the real root, but don't
1264 * propagate mounts to the real root. */
1265 if (mount(NULL, "/", NULL, MS_SLAVE|MS_REC, NULL) < 0) {
1266 log_error("MS_SLAVE|MS_REC failed: %m");
1270 /* Turn directory into bind mount */
1271 if (mount(arg_directory, arg_directory, "bind", MS_BIND|MS_REC, NULL) < 0) {
1272 log_error("Failed to make bind mount.");
/* Read-only remount of the container root; presumably guarded by
 * arg_read_only on a line that is not visible here. */
1277 if (mount(arg_directory, arg_directory, "bind", MS_BIND|MS_REMOUNT|MS_RDONLY|MS_REC, NULL) < 0) {
1278 log_error("Failed to make read-only.");
/* Populate the container tree: API file systems, device nodes, console,
 * kmsg, boot ID, time zone, resolv.conf, journal, bind mounts and kdbus. */
1282 if (mount_all(arg_directory) < 0)
1285 if (copy_devnodes(arg_directory) < 0)
1288 if (setup_ptmx(arg_directory) < 0)
1291 dev_setup(arg_directory);
1293 if (setup_dev_console(arg_directory, console) < 0)
1296 if (setup_kmsg(arg_directory, kmsg_socket_pair[1]) < 0)
1299 close_nointr_nofail(kmsg_socket_pair[1]);
1300 kmsg_socket_pair[1] = -1;
1302 if (setup_boot_id(arg_directory) < 0)
1305 if (setup_timezone(arg_directory) < 0)
1308 if (setup_resolv_conf(arg_directory) < 0)
1311 if (setup_journal(arg_directory) < 0)
1314 if (mount_binds(arg_directory, arg_bind, 0) < 0)
1317 if (mount_binds(arg_directory, arg_bind_ro, MS_RDONLY) < 0)
1320 if (setup_kdbus(arg_directory, kdbus_namespace) < 0)
/* Switch the root: enter the directory, move its mount onto /, chroot into
 * it and reset the working directory. */
1323 if (chdir(arg_directory) < 0) {
1324 log_error("chdir(%s) failed: %m", arg_directory);
1328 if (mount(arg_directory, "/", NULL, MS_MOVE, NULL) < 0) {
1329 log_error("mount(MS_MOVE) failed: %m");
1333 if (chroot(".") < 0) {
1334 log_error("chroot() failed: %m");
1338 if (chdir("/") < 0) {
1339 log_error("chdir() failed: %m");
1347 if (drop_capabilities() < 0) {
1348 log_error("drop_capabilities() failed: %m");
1354 /* Note that this resolves user names
1355 * inside the container, and hence
1356 * accesses the NSS modules from the
1357 * container and not the host. This is
1360 if (get_user_creds((const char**)&arg_user, &uid, &gid, &home, NULL) < 0) {
1361 log_error("get_user_creds() failed: %m");
1365 if (mkdir_parents_label(home, 0775) < 0) {
1366 log_error("mkdir_parents_label() failed: %m");
1370 if (mkdir_safe_label(home, 0775, uid, gid) < 0) {
1371 log_error("mkdir_safe_label() failed: %m");
1375 if (initgroups((const char*)arg_user, gid) < 0) {
1376 log_error("initgroups() failed: %m");
1380 if (setresgid(gid, gid, gid) < 0) {
1381 log_error("setregid() failed: %m");
1385 if (setresuid(uid, uid, uid) < 0) {
1386 log_error("setreuid() failed: %m");
1390 /* Reset everything fully to 0, just in case */
/* NOTE(review): here and in the user branch before this point the calls are
 * setresgid() and setresuid(), while the error messages name setregid() and
 * setreuid(). */
1392 if (setgroups(0, NULL) < 0) {
1393 log_error("setgroups() failed: %m");
1397 if (setresgid(0, 0, 0) < 0) {
1398 log_error("setregid() failed: %m");
1402 if (setresuid(0, 0, 0) < 0) {
1403 log_error("setreuid() failed: %m");
/* Fill the remaining environment slots; each asprintf() writes straight into
 * the next free envp entry. */
1408 if ((asprintf((char**)(envp + n_env++), "HOME=%s", home ? home: "/root") < 0) ||
1409 (asprintf((char**)(envp + n_env++), "USER=%s", arg_user ? arg_user : "root") < 0) ||
1410 (asprintf((char**)(envp + n_env++), "LOGNAME=%s", arg_user ? arg_user : "root") < 0)) {
1415 if (!sd_id128_equal(arg_uuid, SD_ID128_NULL)) {
1416 if (asprintf((char**)(envp + n_env++), "container_uuid=" SD_ID128_FORMAT_STR, SD_ID128_FORMAT_VAL(arg_uuid)) < 0) {
/* Passed file descriptors must survive exec, so close-on-exec is cleared and
 * the count is announced with LISTEN_PID fixed to 1. NOTE(review):
 * n_fd_passed is printed with %u; confirm its declared type. */
1422 if (fdset_size(fds) > 0) {
1423 k = fdset_cloexec(fds, false);
1425 log_error("Failed to unset O_CLOEXEC for file descriptors.");
1429 if ((asprintf((char **)(envp + n_env++), "LISTEN_FDS=%u", n_fd_passed) < 0) ||
1430 (asprintf((char **)(envp + n_env++), "LISTEN_PID=1") < 0)) {
/* Boot mode: a[0] is set to each init candidate in turn, and the entries
 * after it are the remaining command line arguments including the NULL that
 * terminates argv. A later candidate is only reached if the exec before it
 * returned. */
1442 /* Automatically search for the init system */
1444 l = 1 + argc - optind;
1445 a = newa(char*, l + 1);
1446 memcpy(a + 1, argv + optind, l * sizeof(char*));
1448 a[0] = (char*) "/usr/lib/systemd/systemd";
1449 execve(a[0], a, (char**) envp);
1451 a[0] = (char*) "/lib/systemd/systemd";
1452 execve(a[0], a, (char**) envp);
1454 a[0] = (char*) "/sbin/init";
1455 execve(a[0], a, (char**) envp);
/* Otherwise run the command given on the command line or, presumably when
 * none was given, bash from the home directory with argv[0] set to -bash. */
1456 } else if (argc > optind)
1457 execvpe(argv[optind], argv + optind, (char**) envp);
1459 chdir(home ? home : "/root");
1460 execle("/bin/bash", "-bash", NULL, (char**) envp);
1463 log_error("execv() failed: %m");
1466 _exit(EXIT_FAILURE);
/* Parent side: process_pty() receives the pty master and the signal mask;
 * for booted containers it also gets the child PID, plus SIGRTMIN+3. The
 * meaning of the last two arguments is defined by that helper. */
1472 k = process_pty(master, &mask, arg_boot ? pid : 0, SIGRTMIN+3);
1480 /* Kill if it is not dead yet anyway */
1481 terminate_machine(pid);
1483 /* Redundant, but better safe than sorry */
1486 k = wait_for_terminate(pid, &status);
/* Interpret how the child ended: a normal exit hands its status on in r,
 * death by SIGINT is logged as a shutdown, death by SIGHUP as a reboot, and
 * any other signal or core dump as an error. */
1494 if (status.si_code == CLD_EXITED) {
1495 r = status.si_status;
1496 if (status.si_status != 0) {
1497 log_error("Container %s failed with error code %i.", arg_machine, status.si_status);
1501 log_debug("Container %s exited successfully.", arg_machine);
1503 } else if (status.si_code == CLD_KILLED &&
1504 status.si_status == SIGINT) {
1505 log_info("Container %s has been shut down.", arg_machine);
1508 } else if (status.si_code == CLD_KILLED &&
1509 status.si_status == SIGHUP) {
1510 log_info("Container %s is being rebooted.", arg_machine);
1512 } else if (status.si_code == CLD_KILLED ||
1513 status.si_code == CLD_DUMPED) {
1515 log_error("Container %s terminated by signal %s.", arg_machine, signal_to_string(status.si_status));
1519 log_error("Container %s failed due to unknown reason.", arg_machine);
1529 free(arg_directory);