1 /*-*- Mode: C; c-basic-offset: 8; indent-tabs-mode: nil -*-*/
4 This file is part of systemd.
6 Copyright 2010 Lennart Poettering
8 systemd is free software; you can redistribute it and/or modify it
9 under the terms of the GNU Lesser General Public License as published by
10 the Free Software Foundation; either version 2.1 of the License, or
11 (at your option) any later version.
13 systemd is distributed in the hope that it will be useful, but
14 WITHOUT ANY WARRANTY; without even the implied warranty of
15 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
16 Lesser General Public License for more details.
18 You should have received a copy of the GNU Lesser General Public License
19 along with systemd; If not, see <http://www.gnu.org/licenses/>.
25 #include <sys/types.h>
26 #include <sys/syscall.h>
27 #include <sys/mount.h>
33 #include <sys/prctl.h>
34 #include <sys/capability.h>
37 #include <sys/signalfd.h>
41 #include <sys/socket.h>
42 #include <linux/netlink.h>
44 #include "sd-daemon.h"
53 #include "cgroup-util.h"
55 #include "path-util.h"
56 #include "loopback-setup.h"
57 #include "dev-setup.h"
62 #include "bus-error.h"
64 #include "bus-kernel.h"
/* Journal linking mode. The enumerators referenced in this file are
 * LINK_NO, LINK_AUTO, LINK_GUEST and LINK_HOST (see parse_argv() and
 * setup_journal()). */
71 typedef enum LinkJournal {
/* Settings filled in by parse_argv() from the command line and read by
 * the setup_*() helpers and main(). */
78 static char *arg_directory = NULL;
79 static char *arg_user = NULL;
80 static sd_id128_t arg_uuid = {};
81 static char *arg_machine = NULL;
/* NOTE(review): declared const char*, but parse_argv() stores the result
 * of strdup() here -- confirm who is supposed to own/free it. */
82 static const char *arg_slice = NULL;
83 static bool arg_private_network = false;
84 static bool arg_read_only = false;
85 static bool arg_boot = false;
86 static LinkJournal arg_link_journal = LINK_AUTO;
/* Bit mask of capabilities, one bit per CAP_* number. parse_argv() ORs
 * bits in for --capability and clears them for --drop-capability;
 * drop_capabilities() hands the complement of this mask to
 * capability_bounding_set_drop(). */
87 static uint64_t arg_retain =
89 (1ULL << CAP_DAC_OVERRIDE) |
90 (1ULL << CAP_DAC_READ_SEARCH) |
91 (1ULL << CAP_FOWNER) |
92 (1ULL << CAP_FSETID) |
93 (1ULL << CAP_IPC_OWNER) |
96 (1ULL << CAP_LINUX_IMMUTABLE) |
97 (1ULL << CAP_NET_BIND_SERVICE) |
98 (1ULL << CAP_NET_BROADCAST) |
99 (1ULL << CAP_NET_RAW) |
100 (1ULL << CAP_SETGID) |
101 (1ULL << CAP_SETFCAP) |
102 (1ULL << CAP_SETPCAP) |
103 (1ULL << CAP_SETUID) |
104 (1ULL << CAP_SYS_ADMIN) |
105 (1ULL << CAP_SYS_CHROOT) |
106 (1ULL << CAP_SYS_NICE) |
107 (1ULL << CAP_SYS_PTRACE) |
108 (1ULL << CAP_SYS_TTY_CONFIG) |
109 (1ULL << CAP_SYS_RESOURCE) |
110 (1ULL << CAP_SYS_BOOT) |
111 (1ULL << CAP_AUDIT_WRITE) |
112 (1ULL << CAP_AUDIT_CONTROL);
/* String vectors extended by --bind, --bind-ro and --setenv. The two bind
 * vectors hold (source, destination) pairs, consumed pairwise by
 * mount_binds(). */
113 static char **arg_bind = NULL;
114 static char **arg_bind_ro = NULL;
115 static char **arg_setenv = NULL;
/* Print the usage summary with printf(). The single %s in the format is
 * filled with program_invocation_short_name.
 *
 * NOTE(review): the "-j" line below says it equals --link-journal=host,
 * but in parse_argv() the only direct assignment that sits right before
 * "case ARG_LINK_JOURNAL" sets LINK_GUEST. If that assignment is the -j
 * handler (its case label is not visible in this listing), help text and
 * behaviour disagree -- confirm and fix whichever is wrong. */
117 static int help(void) {
119 printf("%s [OPTIONS...] [PATH] [ARGUMENTS...]\n\n"
120 "Spawn a minimal namespace container for debugging, testing and building.\n\n"
121 " -h --help Show this help\n"
122 " --version Print version string\n"
123 " -D --directory=NAME Root directory for the container\n"
124 " -b --boot Boot up full system (i.e. invoke init)\n"
125 " -u --user=USER Run the command under specified user or uid\n"
126 " --uuid=UUID Set a specific machine UUID for the container\n"
127 " -M --machine=NAME Set the machine name for the container\n"
128 " -S --slice=SLICE Place the container in the specified slice\n"
129 " --private-network Disable network in container\n"
130 " --read-only Mount the root directory read-only\n"
131 " --capability=CAP In addition to the default, retain specified\n"
133 " --drop-capability=CAP Drop the specified capability from the default set\n"
134 " --link-journal=MODE Link up guest journal, one of no, auto, guest, host\n"
135 " -j Equivalent to --link-journal=host\n"
136 " --bind=PATH[:PATH] Bind mount a file or directory from the host into\n"
138 " --bind-ro=PATH[:PATH] Similar, but creates a read-only bind mount\n"
139 " --setenv=NAME=VALUE Pass an environment variable to PID 1\n",
140 program_invocation_short_name);
/* Parse the command line with getopt_long() and store the results in the
 * file-scope arg_* variables.
 *
 * Several case labels, return statements and allocation checks are not
 * visible in this listing; the comments below describe only what the
 * visible lines do. */
145 static int parse_argv(int argc, char *argv[]) {
160 static const struct option options[] = {
161 { "help", no_argument, NULL, 'h' },
162 { "version", no_argument, NULL, ARG_VERSION },
163 { "directory", required_argument, NULL, 'D' },
164 { "user", required_argument, NULL, 'u' },
165 { "private-network", no_argument, NULL, ARG_PRIVATE_NETWORK },
166 { "boot", no_argument, NULL, 'b' },
167 { "uuid", required_argument, NULL, ARG_UUID },
168 { "read-only", no_argument, NULL, ARG_READ_ONLY },
169 { "capability", required_argument, NULL, ARG_CAPABILITY },
170 { "drop-capability", required_argument, NULL, ARG_DROP_CAPABILITY },
171 { "link-journal", required_argument, NULL, ARG_LINK_JOURNAL },
172 { "bind", required_argument, NULL, ARG_BIND },
173 { "bind-ro", required_argument, NULL, ARG_BIND_RO },
174 { "machine", required_argument, NULL, 'M' },
175 { "slice", required_argument, NULL, 'S' },
176 { "setenv", required_argument, NULL, ARG_SETENV },
/* The leading '+' in the short-option string makes glibc's getopt stop at
 * the first non-option argument, so options meant for the command run in
 * the container are left alone. */
185 while ((c = getopt_long(argc, argv, "+hD:u:bM:jS:", options, NULL)) >= 0) {
193 puts(PACKAGE_STRING);
194 puts(SYSTEMD_FEATURES);
/* Root directory: resolved with canonicalize_file_name(); a NULL result
 * is reported as an invalid directory. */
199 arg_directory = canonicalize_file_name(optarg);
200 if (!arg_directory) {
201 log_error("Invalid root directory: %m");
209 arg_user = strdup(optarg);
215 case ARG_PRIVATE_NETWORK:
216 arg_private_network = true;
224 r = sd_id128_from_string(optarg, &arg_uuid);
226 log_error("Invalid UUID: %s", optarg);
/* NOTE(review): arg_slice is a const char*; the strdup() result stored
 * here is never visibly freed. */
232 arg_slice = strdup(optarg);
/* The machine name must pass hostname_is_valid() before it is copied. */
239 if (!hostname_is_valid(optarg)) {
240 log_error("Invalid machine name: %s", optarg);
245 arg_machine = strdup(optarg);
252 arg_read_only = true;
/* Capability lists are comma separated. Each word is copied, resolved
 * with cap_from_name(), and then either set in (ARG_CAPABILITY) or
 * cleared from (otherwise) the arg_retain mask. */
256 case ARG_DROP_CAPABILITY: {
260 FOREACH_WORD_SEPARATOR(word, length, optarg, ",", state) {
264 t = strndup(word, length);
268 if (cap_from_name(t, &cap) < 0) {
269 log_error("Failed to parse capability %s.", t);
276 if (c == ARG_CAPABILITY)
277 arg_retain |= 1ULL << (uint64_t) cap;
279 arg_retain &= ~(1ULL << (uint64_t) cap);
/* NOTE(review): presumably the -j handler (label not visible). It selects
 * LINK_GUEST, while help() documents -j as --link-journal=host. */
286 arg_link_journal = LINK_GUEST;
289 case ARG_LINK_JOURNAL:
290 if (streq(optarg, "auto"))
291 arg_link_journal = LINK_AUTO;
292 else if (streq(optarg, "no"))
293 arg_link_journal = LINK_NO;
294 else if (streq(optarg, "guest"))
295 arg_link_journal = LINK_GUEST;
296 else if (streq(optarg, "host"))
297 arg_link_journal = LINK_HOST;
299 log_error("Failed to parse link journal mode %s", optarg);
/* --bind / --bind-ro: the argument is split at ':' into a and b, both
 * must be absolute paths, and the pair is appended to arg_bind or
 * arg_bind_ro respectively. How b is filled when no ':' is present is not
 * visible here. */
307 _cleanup_free_ char *a = NULL, *b = NULL;
311 x = c == ARG_BIND ? &arg_bind : &arg_bind_ro;
313 e = strchr(optarg, ':');
315 a = strndup(optarg, e - optarg);
325 if (!path_is_absolute(a) || !path_is_absolute(b)) {
326 log_error("Invalid bind mount specification: %s", optarg);
330 r = strv_extend(x, a);
334 r = strv_extend(x, b);
/* --setenv: the assignment is validated, merged into arg_setenv through
 * strv_env_set() (result in n), and the old vector is freed. */
344 if (!env_assignment_is_valid(optarg)) {
345 log_error("Environment variable assignment '%s' is not valid.", optarg);
349 n = strv_env_set(arg_setenv, optarg);
353 strv_free(arg_setenv);
362 assert_not_reached("Unhandled option");
/* Mount the fixed set of API file systems from mount_table below
 * underneath dest.
 *
 * The table entries are referenced through the fields .what, .where,
 * .options, .flags and .fatal. Entries with a NULL "what" are remounts of
 * the entry before them (see the inline table comments). */
369 static int mount_all(const char *dest) {
371 typedef struct MountPoint {
380 static const MountPoint mount_table[] = {
381 { "proc", "/proc", "proc", NULL, MS_NOSUID|MS_NOEXEC|MS_NODEV, true },
382 { "/proc/sys", "/proc/sys", NULL, NULL, MS_BIND, true }, /* Bind mount first */
383 { NULL, "/proc/sys", NULL, NULL, MS_BIND|MS_RDONLY|MS_REMOUNT, true }, /* Then, make it r/o */
384 { "sysfs", "/sys", "sysfs", NULL, MS_RDONLY|MS_NOSUID|MS_NOEXEC|MS_NODEV, true },
385 { "tmpfs", "/dev", "tmpfs", "mode=755", MS_NOSUID|MS_STRICTATIME, true },
386 { "devpts", "/dev/pts", "devpts","newinstance,ptmxmode=0666,mode=620,gid=" STRINGIFY(TTY_GID), MS_NOSUID|MS_NOEXEC, true },
387 { "tmpfs", "/dev/shm", "tmpfs", "mode=1777", MS_NOSUID|MS_NODEV|MS_STRICTATIME, true },
388 { "tmpfs", "/run", "tmpfs", "mode=755", MS_NOSUID|MS_NODEV|MS_STRICTATIME, true },
390 { "/sys/fs/selinux", "/sys/fs/selinux", NULL, NULL, MS_BIND, false }, /* Bind mount first */
391 { NULL, "/sys/fs/selinux", NULL, NULL, MS_BIND|MS_RDONLY|MS_REMOUNT, false }, /* Then, make it r/o */
398 for (k = 0; k < ELEMENTSOF(mount_table); k++) {
399 _cleanup_free_ char *where = NULL;
/* The table paths already start with '/', so the joined path contains a
 * double slash (dest + "/" + "/proc"). */
402 where = strjoin(dest, "/", mount_table[k].where, NULL);
406 t = path_is_mount_point(where, true);
408 log_error("Failed to detect whether %s is a mount point: %s", where, strerror(-t));
416 /* Skip this entry if it is not a remount. */
417 if (mount_table[k].what && t > 0)
/* NOTE(review): the mkdir_p() result is not checked; a failure would only
 * surface through the mount() call below. */
420 mkdir_p(where, 0755);
/* A failed mount is only reported when the entry is marked fatal. */
422 if (mount(mount_table[k].what,
425 mount_table[k].flags,
426 mount_table[k].options) < 0 &&
427 mount_table[k].fatal) {
429 log_error("mount(%s) failed: %m", where);
/* Bind mount every (source, destination) pair of the string vector l
 * into the tree at dest.
 *
 * dest:  root directory of the container tree
 * l:     vector iterated pairwise; *x is the host path, *y the path
 *        appended to dest
 * flags: extra mount flags; when non-zero each mount is remounted with
 *        MS_REMOUNT|MS_BIND|flags after the initial bind */
439 static int mount_binds(const char *dest, char **l, unsigned long flags) {
442 STRV_FOREACH_PAIR(x, y, l) {
444 struct stat source_st, dest_st;
447 if (stat(*x, &source_st) < 0) {
448 log_error("failed to stat %s: %m", *x);
452 where = strappenda(dest, *y);
453 r = stat(where, &dest_st);
/* An existing destination must have the same file type as the source. */
455 if ((source_st.st_mode & S_IFMT) != (dest_st.st_mode & S_IFMT)) {
456 log_error("The file types of %s and %s do not match. Refusing bind mount",
/* A missing destination gets its parent directories created; any other
 * stat() error is reported. */
460 } else if (errno == ENOENT) {
461 r = mkdir_parents_label(where, 0755);
463 log_error("Failed to bind mount %s: %s", *x, strerror(-r));
467 log_error("Failed to bind mount %s: %s", *x, strerror(errno));
470 /* Create the mount point, but be conservative -- refuse to create block
471 * and char devices. */
/* NOTE(review): the mkdir_label() and mknod() results on the visible
 * lines are ignored; a failure shows up only as a failing mount() below. */
472 if (S_ISDIR(source_st.st_mode))
473 mkdir_label(where, 0755);
474 else if (S_ISFIFO(source_st.st_mode))
476 else if (S_ISSOCK(source_st.st_mode))
477 mknod(where, 0644 | S_IFSOCK, 0);
478 else if (S_ISREG(source_st.st_mode))
481 log_error("Refusing to create mountpoint for file: %s", *x);
/* First a plain bind mount, then (only if flags is non-zero) a remount
 * that applies the extra flags, e.g. MS_RDONLY for --bind-ro. */
485 if (mount(*x, where, "bind", MS_BIND, NULL) < 0) {
486 log_error("mount(%s) failed: %m", where);
490 if (flags && mount(NULL, where, NULL, MS_REMOUNT|MS_BIND|flags, NULL) < 0) {
491 log_error("mount(%s) failed: %m", where);
/* Make dest/etc/localtime point at the same zone as the host's
 * /etc/localtime symlink.
 *
 * Only symlink targets below "../usr/share/zoneinfo/" or
 * "/usr/share/zoneinfo/" are understood. The zone name (z) is the part
 * after that prefix; y is the same for the container's existing link. */
499 static int setup_timezone(const char *dest) {
500 _cleanup_free_ char *where = NULL, *p = NULL, *q = NULL, *check = NULL, *what = NULL;
506 /* Fix the timezone, if possible */
507 r = readlink_malloc("/etc/localtime", &p);
509 log_warning("/etc/localtime is not a symlink, not updating container timezone.");
513 z = path_startswith(p, "../usr/share/zoneinfo/");
515 z = path_startswith(p, "/usr/share/zoneinfo/");
517 log_warning("/etc/localtime does not point into /usr/share/zoneinfo/, not updating container timezone.");
521 where = strappend(dest, "/etc/localtime");
525 r = readlink_malloc(where, &q);
527 y = path_startswith(q, "../usr/share/zoneinfo/");
529 y = path_startswith(q, "/usr/share/zoneinfo/");
532 /* Already pointing to the right place? Then do nothing .. */
533 if (y && streq(y, z))
/* The zone file must exist inside the container tree before the link is
 * rewritten. */
537 check = strjoin(dest, "/usr/share/zoneinfo/", z, NULL);
541 if (access(check, F_OK) < 0) {
542 log_warning("Timezone %s does not exist in container, not updating container timezone.", z);
/* The new link is always written in the relative "../usr/share/zoneinfo/"
 * form. */
546 what = strappend("../usr/share/zoneinfo/", z);
551 if (symlink(what, where) < 0) {
552 log_error("Failed to correct timezone of container: %m");
/* Copy the host's /etc/resolv.conf to dest/etc/resolv.conf. The
 * arg_private_network flag is tested first; the statement guarded by that
 * test is not visible in this listing (presumably an early return). */
559 static int setup_resolv_conf(const char *dest) {
/* NOTE(review): every other declaration in this file writes
 * "_cleanup_free_ char *"; the attribute position here is the odd one out. */
560 char _cleanup_free_ *where = NULL;
564 if (arg_private_network)
567 /* Fix resolv.conf, if possible */
568 where = strappend(dest, "/etc/resolv.conf");
572 /* We don't really care for the results of this really. If it
573 * fails, it fails, but meh... */
574 copy_file("/etc/resolv.conf", where, O_TRUNC|O_NOFOLLOW);
/* Give the container its own boot ID: a random 128-bit ID is written in
 * UUID text form to a file under dest/dev and that file is bind mounted
 * over dest/proc/sys/kernel/random/boot_id. */
579 static int setup_boot_id(const char *dest) {
580 _cleanup_free_ char *from = NULL, *to = NULL;
587 /* Generate a new randomized boot ID, so that each boot-up of
588 * the container gets a new one */
590 from = strappend(dest, "/dev/proc-sys-kernel-random-boot-id");
591 to = strappend(dest, "/proc/sys/kernel/random/boot_id");
595 r = sd_id128_randomize(&rnd);
597 log_error("Failed to generate random boot id: %s", strerror(-r));
/* Format the ID with dashes (8-4-4-4-12 hex digits) and force NUL
 * termination of the buffer. */
601 snprintf(as_uuid, sizeof(as_uuid),
602 "%02x%02x%02x%02x-%02x%02x-%02x%02x-%02x%02x-%02x%02x%02x%02x%02x%02x",
603 SD_ID128_FORMAT_VAL(rnd));
604 char_array_0(as_uuid);
606 r = write_string_file(from, as_uuid);
608 log_error("Failed to write boot id: %s", strerror(-r));
/* A failing bind mount is logged as an error; a failing read-only remount
 * only produces a warning. The second call is tested for non-zero rather
 * than "< 0" like the first. */
612 if (mount(from, to, "bind", MS_BIND, NULL) < 0) {
613 log_error("Failed to bind mount boot id: %m");
615 } else if (mount(from, to, "bind", MS_BIND|MS_REMOUNT|MS_RDONLY, NULL))
616 log_warning("Failed to make boot id read-only: %m");
/* Recreate a fixed list of host device nodes below dest/dev. Each name in
 * the NUL-separated devnodes list is looked up as /dev/<name> on the host
 * and recreated with mknod() using the host node's mode and device
 * number. */
622 static int copy_devnodes(const char *dest) {
624 static const char devnodes[] =
634 _cleanup_umask_ mode_t u;
640 NULSTR_FOREACH(d, devnodes) {
642 _cleanup_free_ char *from = NULL, *to = NULL;
/* NOTE(review): the asprintf() return values are not checked on the
 * visible lines -- confirm the hidden lines test from/to before use. */
644 asprintf(&from, "/dev/%s", d);
645 asprintf(&to, "%s/dev/%s", dest, d);
/* A host node that does not exist (ENOENT) is not reported as an error;
 * other stat() failures are. */
656 if (stat(from, &st) < 0) {
658 if (errno != ENOENT) {
659 log_error("Failed to stat %s: %m", from);
664 } else if (!S_ISCHR(st.st_mode) && !S_ISBLK(st.st_mode)) {
666 log_error("%s is not a char or block device, cannot copy", from);
/* NOTE(review): the message below prints dest, but the path passed to
 * mknod() is "to" -- the log names the wrong path. */
670 } else if (mknod(to, st.st_mode, st.st_rdev) < 0) {
672 log_error("mknod(%s) failed: %m", dest);
/* Create dest/dev/ptmx as a symlink to "pts/ptmx". */
681 static int setup_ptmx(const char *dest) {
682 _cleanup_free_ char *p = NULL;
684 p = strappend(dest, "/dev/ptmx");
688 if (symlink("pts/ptmx", p) < 0) {
689 log_error("Failed to create /dev/ptmx symlink: %m");
/* Make the tty named by console available as dest/dev/console.
 *
 * The tty must be a character device. It is switched to mode 0600 with
 * owner and group 0, a device node with the same device number is created
 * at dest/dev/console, and the tty is bind mounted on top of that node. */
696 static int setup_dev_console(const char *dest, const char *console) {
698 _cleanup_free_ char *to = NULL;
700 _cleanup_umask_ mode_t u;
707 if (stat(console, &st) < 0) {
708 log_error("Failed to stat %s: %m", console);
711 } else if (!S_ISCHR(st.st_mode)) {
712 log_error("/dev/console is not a char device");
716 r = chmod_and_chown(console, 0600, 0, 0);
718 log_error("Failed to correct access mode for TTY: %s", strerror(-r));
722 if (asprintf(&to, "%s/dev/console", dest) < 0)
725 /* We need to bind mount the right tty to /dev/console since
726 * ptys can only exist on pts file systems. To have something
727 * to bind mount things on we create a device node first, that
728 * has the right major/minor (note that the major minor
729 * doesn't actually matter here, since we mount it over
732 if (mknod(to, (st.st_mode & ~07777) | 0600, st.st_rdev) < 0) {
733 log_error("mknod() for /dev/console failed: %m");
737 if (mount(console, to, "bind", MS_BIND, NULL) < 0) {
738 log_error("Bind mount for /dev/console failed: %m");
745 static int setup_kmsg(const char *dest, int kmsg_socket) {
746 _cleanup_free_ char *from = NULL, *to = NULL;
748 _cleanup_umask_ mode_t u;
750 struct cmsghdr cmsghdr;
751 uint8_t buf[CMSG_SPACE(sizeof(int))];
754 .msg_control = &control,
755 .msg_controllen = sizeof(control),
757 struct cmsghdr *cmsg;
760 assert(kmsg_socket >= 0);
764 /* We create the kmsg FIFO as /dev/kmsg, but immediately
765 * delete it after bind mounting it to /proc/kmsg. While FIFOs
766 * on the reading side behave very similar to /proc/kmsg,
767 * their writing side behaves differently from /dev/kmsg in
768 * that writing blocks when nothing is reading. In order to
769 * avoid any problems with containers deadlocking due to this
770 * we simply make /dev/kmsg unavailable to the container. */
/* Summary of the visible steps: create a FIFO at dest/dev/kmsg, set it to
 * mode 0600 with owner and group 0, bind mount it onto dest/proc/kmsg,
 * open it, and pass the open descriptor over kmsg_socket as SCM_RIGHTS
 * ancillary data.
 *
 * dest:        root directory of the container tree
 * kmsg_socket: socket the FIFO descriptor is sent to; asserted >= 0 */
771 if (asprintf(&from, "%s/dev/kmsg", dest) < 0 ||
772 asprintf(&to, "%s/proc/kmsg", dest) < 0)
775 if (mkfifo(from, 0600) < 0) {
776 log_error("mkfifo() for /dev/kmsg failed: %m");
780 r = chmod_and_chown(from, 0600, 0, 0);
782 log_error("Failed to correct access mode for /dev/kmsg: %s", strerror(-r));
786 if (mount(from, to, "bind", MS_BIND, NULL) < 0) {
787 log_error("Bind mount for /proc/kmsg failed: %m");
/* Opened read/write and non-blocking so that the open() itself cannot
 * block waiting for a peer on the FIFO. */
791 fd = open(from, O_RDWR|O_NDELAY|O_CLOEXEC);
793 log_error("Failed to open fifo: %m");
/* Build a single SCM_RIGHTS control message carrying fd. */
797 cmsg = CMSG_FIRSTHDR(&mh);
798 cmsg->cmsg_level = SOL_SOCKET;
799 cmsg->cmsg_type = SCM_RIGHTS;
800 cmsg->cmsg_len = CMSG_LEN(sizeof(int));
801 memcpy(CMSG_DATA(cmsg), &fd, sizeof(int));
803 mh.msg_controllen = cmsg->cmsg_len;
805 /* Store away the fd in the socket, so that it stays open as
806 * long as we run the child */
807 k = sendmsg(kmsg_socket, &mh, MSG_DONTWAIT|MSG_NOSIGNAL);
/* The local copy is closed before the sendmsg() result is examined. */
808 close_nointr_nofail(fd);
811 log_error("Failed to send FIFO fd: %m");
815 /* And now make the FIFO unavailable as /dev/kmsg... */
/* Set the host name of the calling process's UTS namespace to
 * arg_machine via sethostname(). */
820 static int setup_hostname(void) {
822 if (sethostname(arg_machine, strlen(arg_machine)) < 0)
/* Connect the container's journal directory with the host according to
 * arg_link_journal.
 *
 * The machine ID is read from directory/etc/machine-id. Two paths are
 * then derived from it:
 *   p = /var/log/journal/<id>             on the host
 *   q = directory/var/log/journal/<id>    inside the container tree
 * Depending on the mode, p becomes a symlink to q (LINK_GUEST) or p is
 * bind mounted onto q (the tail of the function).
 *
 * Many branch bodies and returns are missing from this listing; the
 * comments describe only the visible conditions.
 *
 * NOTE(review): several messages below use %m after helpers whose error
 * is stored in r (mkdir_p(), readlink_and_make_absolute()). Whether errno
 * is still meaningful at those points cannot be confirmed from here. */
828 static int setup_journal(const char *directory) {
829 sd_id128_t machine_id, this_id;
830 _cleanup_free_ char *p = NULL, *b = NULL, *q = NULL, *d = NULL;
834 p = strappend(directory, "/etc/machine-id");
838 r = read_one_line_file(p, &b);
/* In LINK_AUTO mode a missing or empty machine-id file is special-cased
 * (the statement taken is not visible). */
839 if (r == -ENOENT && arg_link_journal == LINK_AUTO)
842 log_error("Failed to read machine ID from %s: %s", p, strerror(-r));
847 if (isempty(id) && arg_link_journal == LINK_AUTO)
850 /* Verify validity */
851 r = sd_id128_from_string(id, &machine_id);
853 log_error("Failed to parse machine ID from %s: %s", p, strerror(-r));
857 r = sd_id128_get_machine(&this_id);
859 log_error("Failed to retrieve machine ID: %s", strerror(-r));
/* Container and host must not share a machine ID; the log level is a
 * warning in LINK_AUTO mode and an error otherwise. */
863 if (sd_id128_equal(machine_id, this_id)) {
864 log_full(arg_link_journal == LINK_AUTO ? LOG_WARNING : LOG_ERR,
865 "Host and machine ids are equal (%s): refusing to link journals", id);
866 if (arg_link_journal == LINK_AUTO)
872 if (arg_link_journal == LINK_NO)
/* From here on p is reused for the host-side journal path. */
876 p = strappend("/var/log/journal/", id);
877 q = strjoin(directory, "/var/log/journal/", id, NULL);
/* Neither side may already be a mount point; this is only logged as an
 * error when the mode is not LINK_AUTO. */
881 if (path_is_mount_point(p, false) > 0) {
882 if (arg_link_journal != LINK_AUTO) {
883 log_error("%s: already a mount point, refusing to use for journal", p);
890 if (path_is_mount_point(q, false) > 0) {
891 if (arg_link_journal != LINK_AUTO) {
892 log_error("%s: already a mount point, refusing to use for journal", q);
/* Inspect what currently sits at p: r distinguishes an existing symlink
 * (success), something that is not a symlink (-EINVAL), a missing path
 * (-ENOENT) and other errors. */
899 r = readlink_and_make_absolute(p, &d);
901 if ((arg_link_journal == LINK_GUEST ||
902 arg_link_journal == LINK_AUTO) &&
905 r = mkdir_p(q, 0755);
907 log_warning("failed to create directory %s: %m", q);
912 log_error("Failed to remove symlink %s: %m", p);
915 } else if (r == -EINVAL) {
917 if (arg_link_journal == LINK_GUEST &&
920 if (errno == ENOTDIR) {
921 log_error("%s already exists and is neither a symlink nor a directory", p);
924 log_error("Failed to remove %s: %m", p);
928 } else if (r != -ENOENT) {
929 log_error("readlink(%s) failed: %m", p);
/* LINK_GUEST: the host path becomes a symlink into the container tree. */
933 if (arg_link_journal == LINK_GUEST) {
935 if (symlink(q, p) < 0) {
936 log_error("Failed to symlink %s to %s: %m", q, p);
940 r = mkdir_p(q, 0755);
942 log_warning("failed to create directory %s: %m", q);
/* LINK_HOST: the host directory is created if necessary; in the other
 * remaining case its existence is only probed with access(). */
946 if (arg_link_journal == LINK_HOST) {
947 r = mkdir_p(p, 0755);
949 log_error("Failed to create %s: %m", p);
953 } else if (access(p, F_OK) < 0)
/* The container-side directory must be empty before the host directory
 * is bind mounted over it. */
956 if (dir_is_empty(q) == 0) {
957 log_error("%s not empty.", q);
961 r = mkdir_p(q, 0755);
963 log_error("Failed to create %s: %m", q);
967 if (mount(p, q, "bind", MS_BIND, NULL) < 0) {
968 log_error("Failed to bind mount journal from host into guest: %m");
/* Create the directory dest/dev/kdbus and bind mount path onto it.
 *
 * NOTE(review): main() passes kdbus_namespace here and only logs at debug
 * level when creating the namespace fails, so path may be NULL unless a
 * line hidden from this listing guards against that -- confirm. */
975 static int setup_kdbus(const char *dest, const char *path) {
981 p = strappenda(dest, "/dev/kdbus");
982 if (mkdir(p, 0755) < 0) {
983 log_error("Failed to create kdbus path: %m");
987 if (mount(path, p, "bind", MS_BIND, NULL) < 0) {
988 log_error("Failed to mount kdbus namespace path: %m");
/* Drop every capability whose bit is NOT set in arg_retain by passing the
 * complemented mask to capability_bounding_set_drop(). Returns that
 * function's result unchanged. */
995 static int drop_capabilities(void) {
996 return capability_bounding_set_drop(~arg_retain, false);
/* Announce the container to the org.freedesktop.machine1 service with a
 * method call on its Manager interface over the system bus.
 *
 * The method name and most arguments are not visible in this listing.
 * The visible arguments are arg_uuid, arg_directory (empty string when
 * NULL) and an optional "Slice" string property. */
999 static int register_machine(void) {
1000 _cleanup_bus_error_free_ sd_bus_error error = SD_BUS_ERROR_NULL;
1001 _cleanup_bus_unref_ sd_bus *bus = NULL;
1004 r = sd_bus_open_system(&bus);
1006 log_error("Failed to open system bus: %s", strerror(-r));
1010 r = sd_bus_call_method(
1012 "org.freedesktop.machine1",
1013 "/org/freedesktop/machine1",
1014 "org.freedesktop.machine1.Manager",
1020 SD_BUS_MESSAGE_APPEND_ID128(arg_uuid),
1024 strempty(arg_directory),
/* !isempty(arg_slice) evaluates to 1 or 0; presumably this is the number
 * of property entries that follow -- confirm against the hidden signature
 * string. */
1025 !isempty(arg_slice), "Slice", "s", arg_slice);
1027 log_error("Failed to register machine: %s", bus_error_message(&error, r));
/* Ask the org.freedesktop.machine1 service to terminate the machine that
 * belongs to pid.
 *
 * Two bus calls are made: one on the Manager interface whose reply holds
 * an object path, then one on the Machine interface. The method names and
 * the object the second call is sent to are not visible in this listing.
 * Failures of either call are logged at debug level only. */
1034 static int terminate_machine(pid_t pid) {
1035 _cleanup_bus_error_free_ sd_bus_error error = SD_BUS_ERROR_NULL;
1036 _cleanup_bus_message_unref_ sd_bus_message *reply = NULL;
1037 _cleanup_bus_unref_ sd_bus *bus = NULL;
/* Uses sd_bus_default_system() here, whereas register_machine() uses
 * sd_bus_open_system(). */
1041 r = sd_bus_default_system(&bus);
1043 log_error("Failed to open system bus: %s", strerror(-r));
1047 r = sd_bus_call_method(
1049 "org.freedesktop.machine1",
1050 "/org/freedesktop/machine1",
1051 "org.freedesktop.machine1.Manager",
1058 /* Note that the machine might already have been
1059 * cleaned up automatically, hence don't consider it a
1060 * failure if we cannot get the machine object. */
1061 log_debug("Failed to get machine: %s", bus_error_message(&error, r));
/* The reply carries a single object path ("o"). */
1065 r = sd_bus_message_read(reply, "o", &path);
1067 return bus_log_parse_error(r);
1069 r = sd_bus_call_method(
1071 "org.freedesktop.machine1",
1073 "org.freedesktop.machine1.Machine",
1079 log_debug("Failed to terminate machine: %s", bus_error_message(&error, r));
/* Probe for kernel audit support by opening a raw NETLINK_AUDIT socket;
 * the socket is closed again right away. The return statements are not
 * visible in this listing; presumably a successful socket() means
 * "enabled". */
1086 static bool audit_enabled(void) {
1089 fd = socket(AF_NETLINK, SOCK_RAW, NETLINK_AUDIT);
1091 close_nointr_nofail(fd);
/* Entry point: validate the environment, allocate a pty, clone a child
 * into new namespaces, build the container file system view in the child
 * and exec the payload, then relay the pty and collect the exit status.
 *
 * Most "goto"/"return" lines, the "pid == 0" test and several closing
 * braces are missing from this listing; comments describe the visible
 * statements only. */
1097 int main(int argc, char *argv[]) {
1099 int r = EXIT_FAILURE, k;
1100 _cleanup_close_ int master = -1, kdbus_fd = -1;
1102 const char *console = NULL;
1104 _cleanup_close_pipe_ int kmsg_socket_pair[2] = { -1, -1 };
1105 _cleanup_fdset_free_ FDSet *fds = NULL;
1106 _cleanup_free_ char *kdbus_namespace = NULL;
1109 log_parse_environment();
1112 k = parse_argv(argc, argv);
/* Normalize the root directory: make a given path absolute, otherwise
 * fall back to the current working directory. */
1120 if (arg_directory) {
1123 p = path_make_absolute_cwd(arg_directory);
1124 free(arg_directory);
1127 arg_directory = get_current_dir_name();
1129 if (!arg_directory) {
1130 log_error("Failed to determine path, please use -D.");
1134 path_kill_slashes(arg_directory);
/* Derive a machine name from the last path component; it is cleaned with
 * hostname_cleanup() and must not end up empty. The condition guarding
 * this derivation is not visible. */
1137 arg_machine = strdup(basename(arg_directory));
1143 hostname_cleanup(arg_machine, false);
1144 if (isempty(arg_machine)) {
1145 log_error("Failed to determine machine name automatically, please use -M.");
/* Preconditions: effective uid 0 and a systemd-booted host. */
1150 if (geteuid() != 0) {
1151 log_error("Need to be root.");
1155 if (sd_booted() <= 0) {
1156 log_error("Not running on a systemd system.");
1160 if (arg_boot && audit_enabled()) {
1161 log_warning("The kernel auditing subsystem is known to be incompatible with containers.\n"
1162 "Please make sure to turn off auditing with 'audit=0' on the kernel command\n"
1163 "line before using systemd-nspawn. Sleeping for 5s...\n");
1167 if (path_equal(arg_directory, "/")) {
1168 log_error("Spawning container on root directory not supported.");
1172 if (path_is_os_tree(arg_directory) <= 0) {
1173 log_error("Directory %s doesn't look like an OS root directory (/etc/os-release is missing). Refusing.", arg_directory);
/* Collect file descriptors passed in via socket activation so they can be
 * handed on to the container; fdset_close_others() is then called with
 * that set. */
1178 n_fd_passed = sd_listen_fds(false);
1179 if (n_fd_passed > 0) {
1180 k = fdset_new_listen_fds(&fds, false);
1182 log_error("Failed to collect file descriptors: %s", strerror(-k));
1186 fdset_close_others(fds);
/* Allocate the pty pair; console is the slave name from ptsname(). */
1189 master = posix_openpt(O_RDWR|O_NOCTTY|O_CLOEXEC|O_NDELAY);
1191 log_error("Failed to acquire pseudo tty: %m");
1195 console = ptsname(master);
1197 log_error("Failed to determine tty name: %m");
1201 log_info("Spawning container %s on %s. Press ^] three times within 1s to abort execution.", arg_machine, arg_directory);
1203 if (unlockpt(master) < 0) {
1204 log_error("Failed to unlock tty: %m");
/* NOTE(review): the result of bus_kernel_create_namespace() is stored in
 * kdbus_fd, but the failure message formats strerror(-r). r is not set by
 * this call, so the logged error text is unrelated to the failure;
 * -kdbus_fd looks like the intended value. */
1208 ns = strappenda("machine-", arg_machine);
1209 kdbus_fd = bus_kernel_create_namespace(ns, &kdbus_namespace);
1211 log_debug("Failed to create kdbus namespace: %s", strerror(-r));
1213 log_debug("Successfully created kdbus namespace as %s", kdbus_namespace);
/* Socket pair used by setup_kmsg() to park the kmsg FIFO descriptor. */
1215 if (socketpair(AF_UNIX, SOCK_DGRAM|SOCK_NONBLOCK|SOCK_CLOEXEC, 0, kmsg_socket_pair) < 0) {
1216 log_error("Failed to create kmsg socket pair.");
1220 sd_notify(0, "READY=1");
/* Block the signals listed below; mask is later handed to process_pty(). */
1222 assert_se(sigemptyset(&mask) == 0);
1223 sigset_add_many(&mask, SIGCHLD, SIGWINCH, SIGTERM, SIGINT, -1);
1224 assert_se(sigprocmask(SIG_BLOCK, &mask, NULL) == 0);
/* Raw clone syscall with new IPC, mount, PID and UTS namespaces, plus a
 * new network namespace when --private-network was given. */
1229 pid = syscall(__NR_clone, SIGCHLD|CLONE_NEWIPC|CLONE_NEWNS|CLONE_NEWPID|CLONE_NEWUTS|(arg_private_network ? CLONE_NEWNET : 0), NULL);
1231 if (errno == EINVAL)
1232 log_error("clone() failed, do you have namespace support enabled in your kernel? (You need UTS, IPC, PID and NET namespacing built in): %m");
1234 log_error("clone() failed: %m");
/* Presumably the child branch starts here (the pid test is not visible):
 * it closes the pty master, rewires stdio and ends in exec or _exit(). */
1241 const char *home = NULL;
1242 uid_t uid = (uid_t) -1;
1243 gid_t gid = (gid_t) -1;
1245 const char *envp[] = {
1246 "PATH=" DEFAULT_PATH_SPLIT_USR,
1247 "container=systemd-nspawn", /* LXC sets container=lxc, so follow the scheme here */
1252 NULL, /* container_uuid */
1253 NULL, /* LISTEN_FDS */
1254 NULL, /* LISTEN_PID */
/* TERM is taken over from the invoking environment. */
1259 envp[n_env] = strv_find_prefix(environ, "TERM=");
1263 close_nointr_nofail(master);
1266 close_nointr(STDIN_FILENO);
1267 close_nointr(STDOUT_FILENO);
1268 close_nointr(STDERR_FILENO);
1270 close_nointr_nofail(kmsg_socket_pair[0]);
1271 kmsg_socket_pair[0] = -1;
1273 reset_all_signal_handlers();
1275 assert_se(sigemptyset(&mask) == 0);
1276 assert_se(sigprocmask(SIG_SETMASK, &mask, NULL) == 0);
/* With fds 0-2 closed above, opening the console is expected to yield
 * fd 0; any other result is handled as a failure path. */
1278 k = open_terminal(console, O_RDWR);
1279 if (k != STDIN_FILENO) {
1281 close_nointr_nofail(k);
1285 log_error("Failed to open console: %s", strerror(-k));
1289 if (dup2(STDIN_FILENO, STDOUT_FILENO) != STDOUT_FILENO ||
1290 dup2(STDIN_FILENO, STDERR_FILENO) != STDERR_FILENO) {
1291 log_error("Failed to duplicate console: %m");
1296 log_error("setsid() failed: %m");
/* SIGKILL is requested as the parent-death signal. */
1300 if (prctl(PR_SET_PDEATHSIG, SIGKILL) < 0) {
1301 log_error("PR_SET_PDEATHSIG failed: %m");
1305 r = register_machine();
1309 /* Mark everything as slave, so that we still
1310 * receive mounts from the real root, but don't
1311 * propagate mounts to the real root. */
1312 if (mount(NULL, "/", NULL, MS_SLAVE|MS_REC, NULL) < 0) {
1313 log_error("MS_SLAVE|MS_REC failed: %m");
1317 /* Turn directory into bind mount */
1318 if (mount(arg_directory, arg_directory, "bind", MS_BIND|MS_REC, NULL) < 0) {
1319 log_error("Failed to make bind mount.");
1324 if (mount(arg_directory, arg_directory, "bind", MS_BIND|MS_REMOUNT|MS_RDONLY|MS_REC, NULL) < 0) {
1325 log_error("Failed to make read-only.");
/* Populate the container tree. Each helper returns a negative value on
 * failure. */
1329 if (mount_all(arg_directory) < 0)
1332 if (copy_devnodes(arg_directory) < 0)
1335 if (setup_ptmx(arg_directory) < 0)
1338 dev_setup(arg_directory);
1340 if (setup_dev_console(arg_directory, console) < 0)
1343 if (setup_kmsg(arg_directory, kmsg_socket_pair[1]) < 0)
1346 close_nointr_nofail(kmsg_socket_pair[1]);
1347 kmsg_socket_pair[1] = -1;
1349 if (setup_boot_id(arg_directory) < 0)
1352 if (setup_timezone(arg_directory) < 0)
1355 if (setup_resolv_conf(arg_directory) < 0)
1358 if (setup_journal(arg_directory) < 0)
1361 if (mount_binds(arg_directory, arg_bind, 0) < 0)
1364 if (mount_binds(arg_directory, arg_bind_ro, MS_RDONLY) < 0)
1367 if (setup_kdbus(arg_directory, kdbus_namespace) < 0)
/* Switch root: move the prepared tree onto "/", chroot into it and reset
 * the working directory. */
1370 if (chdir(arg_directory) < 0) {
1371 log_error("chdir(%s) failed: %m", arg_directory);
1375 if (mount(arg_directory, "/", NULL, MS_MOVE, NULL) < 0) {
1376 log_error("mount(MS_MOVE) failed: %m");
1380 if (chroot(".") < 0) {
1381 log_error("chroot() failed: %m");
1385 if (chdir("/") < 0) {
1386 log_error("chdir() failed: %m");
/* NOTE(review): drop_capabilities() returns the result of
 * capability_bounding_set_drop(); whether errno is valid for %m here
 * cannot be confirmed from this file. */
1394 if (drop_capabilities() < 0) {
1395 log_error("drop_capabilities() failed: %m");
1401 /* Note that this resolves user names
1402 * inside the container, and hence
1403 * accesses the NSS modules from the
1404 * container and not the host. This is
1407 if (get_user_creds((const char**)&arg_user, &uid, &gid, &home, NULL) < 0) {
1408 log_error("get_user_creds() failed: %m");
1412 if (mkdir_parents_label(home, 0775) < 0) {
1413 log_error("mkdir_parents_label() failed: %m");
1417 if (mkdir_safe_label(home, 0775, uid, gid) < 0) {
1418 log_error("mkdir_safe_label() failed: %m");
1422 if (initgroups((const char*)arg_user, gid) < 0) {
1423 log_error("initgroups() failed: %m");
1427 if (setresgid(gid, gid, gid) < 0) {
1428 log_error("setregid() failed: %m");
1432 if (setresuid(uid, uid, uid) < 0) {
1433 log_error("setreuid() failed: %m");
1437 /* Reset everything fully to 0, just in case */
/* NOTE(review): in this branch and in the user branch above, the calls
 * are setresgid()/setresuid() but the messages say "setregid()" and
 * "setreuid()". */
1439 if (setgroups(0, NULL) < 0) {
1440 log_error("setgroups() failed: %m");
1444 if (setresgid(0, 0, 0) < 0) {
1445 log_error("setregid() failed: %m");
1449 if (setresuid(0, 0, 0) < 0) {
1450 log_error("setreuid() failed: %m");
/* Fill the reserved envp slots. n_env is advanced even when an asprintf()
 * fails; the failure branch itself is not visible. */
1455 if ((asprintf((char**)(envp + n_env++), "HOME=%s", home ? home: "/root") < 0) ||
1456 (asprintf((char**)(envp + n_env++), "USER=%s", arg_user ? arg_user : "root") < 0) ||
1457 (asprintf((char**)(envp + n_env++), "LOGNAME=%s", arg_user ? arg_user : "root") < 0)) {
1462 if (!sd_id128_equal(arg_uuid, SD_ID128_NULL)) {
1463 if (asprintf((char**)(envp + n_env++), "container_uuid=" SD_ID128_FORMAT_STR, SD_ID128_FORMAT_VAL(arg_uuid)) < 0) {
/* Passed-in descriptors get O_CLOEXEC cleared so they survive exec, and
 * LISTEN_FDS/LISTEN_PID are exported. LISTEN_PID is fixed to 1. */
1469 if (fdset_size(fds) > 0) {
1470 k = fdset_cloexec(fds, false);
1472 log_error("Failed to unset O_CLOEXEC for file descriptors.");
1476 if ((asprintf((char **)(envp + n_env++), "LISTEN_FDS=%u", n_fd_passed) < 0) ||
1477 (asprintf((char **)(envp + n_env++), "LISTEN_PID=1") < 0)) {
/* --setenv assignments are merged on top of the built-in environment;
 * otherwise envp is used as is. */
1485 if (!strv_isempty(arg_setenv)) {
1488 n = strv_env_merge(2, envp, arg_setenv);
1496 env_use = (char**) envp;
1502 /* Automatically search for the init system */
/* Build argv with a free slot 0, then try three init paths in order; each
 * execve() only returns on failure. */
1504 l = 1 + argc - optind;
1505 a = newa(char*, l + 1);
1506 memcpy(a + 1, argv + optind, l * sizeof(char*));
1508 a[0] = (char*) "/usr/lib/systemd/systemd";
1509 execve(a[0], a, env_use);
1511 a[0] = (char*) "/lib/systemd/systemd";
1512 execve(a[0], a, env_use);
1514 a[0] = (char*) "/sbin/init";
1515 execve(a[0], a, env_use);
1516 } else if (argc > optind)
1517 execvpe(argv[optind], argv + optind, env_use);
/* No command given: start a login shell ("-bash") from the home
 * directory. The chdir() result is ignored. */
1519 chdir(home ? home : "/root");
1520 execle("/bin/bash", "-bash", NULL, env_use);
/* Reached only when every exec attempt above failed. The message says
 * "execv()" regardless of which exec variant was tried. */
1523 log_error("execv() failed: %m");
1526 _exit(EXIT_FAILURE);
/* Presumably the parent side from here on: relay the pty until the
 * container ends. When booting, the child pid and SIGRTMIN+3 are handed
 * to process_pty(). */
1532 k = process_pty(master, &mask, arg_boot ? pid : 0, SIGRTMIN+3);
1540 /* Kill if it is not dead yet anyway */
1541 terminate_machine(pid);
1543 /* Redundant, but better safe than sorry */
1546 k = wait_for_terminate(pid, &status);
/* Map the child's termination status to a log message. A normal exit
 * stores the child's exit code in r. */
1554 if (status.si_code == CLD_EXITED) {
1555 r = status.si_status;
1556 if (status.si_status != 0) {
1557 log_error("Container %s failed with error code %i.", arg_machine, status.si_status);
1561 log_debug("Container %s exited successfully.", arg_machine);
/* Death by SIGINT is reported as a shutdown and death by SIGHUP as a
 * reboot request. */
1563 } else if (status.si_code == CLD_KILLED &&
1564 status.si_status == SIGINT) {
1565 log_info("Container %s has been shut down.", arg_machine);
1568 } else if (status.si_code == CLD_KILLED &&
1569 status.si_status == SIGHUP) {
1570 log_info("Container %s is being rebooted.", arg_machine);
1572 } else if (status.si_code == CLD_KILLED ||
1573 status.si_code == CLD_DUMPED) {
1575 log_error("Container %s terminated by signal %s.", arg_machine, signal_to_string(status.si_status));
1579 log_error("Container %s failed due to unknown reason.", arg_machine);
1589 free(arg_directory);