1 /*-*- Mode: C; c-basic-offset: 8; indent-tabs-mode: nil -*-*/
4 This file is part of systemd.
6 Copyright 2010 Lennart Poettering
8 systemd is free software; you can redistribute it and/or modify it
9 under the terms of the GNU Lesser General Public License as published by
10 the Free Software Foundation; either version 2.1 of the License, or
11 (at your option) any later version.
13 systemd is distributed in the hope that it will be useful, but
14 WITHOUT ANY WARRANTY; without even the implied warranty of
15 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
16 Lesser General Public License for more details.
18 You should have received a copy of the GNU Lesser General Public License
19 along with systemd; If not, see <http://www.gnu.org/licenses/>.
25 #include <sys/types.h>
26 #include <sys/syscall.h>
27 #include <sys/mount.h>
33 #include <sys/prctl.h>
34 #include <sys/capability.h>
37 #include <sys/signalfd.h>
41 #include <sys/socket.h>
42 #include <linux/netlink.h>
43 #include <sys/eventfd.h>
45 #include "sd-daemon.h"
54 #include "cgroup-util.h"
56 #include "path-util.h"
57 #include "loopback-setup.h"
58 #include "dev-setup.h"
63 #include "bus-error.h"
65 #include "bus-kernel.h"
69 typedef enum LinkJournal {
/* Command line settings. They are assigned in parse_argv() and read by the
 * setup helpers and main() further down in this file. */
76 static char *arg_directory = NULL;
77 static char *arg_user = NULL;
78 static sd_id128_t arg_uuid = {};
79 static char *arg_machine = NULL;
/* NOTE(review): declared const, but parse_argv() stores a strdup() result
 * here -- confirm the intended ownership. */
80 static const char *arg_slice = NULL;
81 static bool arg_private_network = false;
82 static bool arg_read_only = false;
83 static bool arg_boot = false;
84 static LinkJournal arg_link_journal = LINK_AUTO;
/* Bit mask of capabilities (one bit per CAP_* number). --capability= sets
 * bits and --drop-capability= clears them in parse_argv(); the complement
 * of the mask is handed to capability_bounding_set_drop() in
 * drop_capabilities(). */
85 static uint64_t arg_retain =
87 (1ULL << CAP_DAC_OVERRIDE) |
88 (1ULL << CAP_DAC_READ_SEARCH) |
89 (1ULL << CAP_FOWNER) |
90 (1ULL << CAP_FSETID) |
91 (1ULL << CAP_IPC_OWNER) |
94 (1ULL << CAP_LINUX_IMMUTABLE) |
95 (1ULL << CAP_NET_BIND_SERVICE) |
96 (1ULL << CAP_NET_BROADCAST) |
97 (1ULL << CAP_NET_RAW) |
98 (1ULL << CAP_SETGID) |
99 (1ULL << CAP_SETFCAP) |
100 (1ULL << CAP_SETPCAP) |
101 (1ULL << CAP_SETUID) |
102 (1ULL << CAP_SYS_ADMIN) |
103 (1ULL << CAP_SYS_CHROOT) |
104 (1ULL << CAP_SYS_NICE) |
105 (1ULL << CAP_SYS_PTRACE) |
106 (1ULL << CAP_SYS_TTY_CONFIG) |
107 (1ULL << CAP_SYS_RESOURCE) |
108 (1ULL << CAP_SYS_BOOT) |
109 (1ULL << CAP_AUDIT_WRITE) |
110 (1ULL << CAP_AUDIT_CONTROL) |
/* String vectors. The two bind vectors receive two entries per option
 * (see the strv_extend() calls in parse_argv()) and are walked pairwise by
 * mount_binds(). */
112 static char **arg_bind = NULL;
113 static char **arg_bind_ro = NULL;
114 static char **arg_setenv = NULL;
/* Print the usage summary with printf(); the leading %s is filled with
 * program_invocation_short_name.
 *
 * NOTE(review): the text below documents -j as equivalent to
 * --link-journal=host, while parse_argv() shows an assignment of LINK_GUEST
 * immediately before its ARG_LINK_JOURNAL case. If that assignment is the
 * -j handler, the help text and the code disagree -- confirm which one is
 * intended. */
116 static int help(void) {
118 printf("%s [OPTIONS...] [PATH] [ARGUMENTS...]\n\n"
119 "Spawn a minimal namespace container for debugging, testing and building.\n\n"
120 " -h --help Show this help\n"
121 " --version Print version string\n"
122 " -D --directory=NAME Root directory for the container\n"
123 " -b --boot Boot up full system (i.e. invoke init)\n"
124 " -u --user=USER Run the command under specified user or uid\n"
125 " --uuid=UUID Set a specific machine UUID for the container\n"
126 " -M --machine=NAME Set the machine name for the container\n"
127 " -S --slice=SLICE Place the container in the specified slice\n"
128 " --private-network Disable network in container\n"
129 " --read-only Mount the root directory read-only\n"
130 " --capability=CAP In addition to the default, retain specified\n"
132 " --drop-capability=CAP Drop the specified capability from the default set\n"
133 " --link-journal=MODE Link up guest journal, one of no, auto, guest, host\n"
134 " -j Equivalent to --link-journal=host\n"
135 " --bind=PATH[:PATH] Bind mount a file or directory from the host into\n"
137 " --bind-ro=PATH[:PATH] Similar, but creates a read-only bind mount\n"
138 " --setenv=NAME=VALUE Pass an environment variable to PID 1\n",
139 program_invocation_short_name);
/* Parse the command line with getopt_long() and store the results in the
 * file-scope arg_* variables. String arguments that are kept are copied
 * (strdup(), canonicalize_file_name()); invalid values are reported with
 * log_error(). */
144 static int parse_argv(int argc, char *argv[]) {
/* Long options; entries with a character in the last column share the
 * handler of the corresponding short option. */
159 static const struct option options[] = {
160 { "help", no_argument, NULL, 'h' },
161 { "version", no_argument, NULL, ARG_VERSION },
162 { "directory", required_argument, NULL, 'D' },
163 { "user", required_argument, NULL, 'u' },
164 { "private-network", no_argument, NULL, ARG_PRIVATE_NETWORK },
165 { "boot", no_argument, NULL, 'b' },
166 { "uuid", required_argument, NULL, ARG_UUID },
167 { "read-only", no_argument, NULL, ARG_READ_ONLY },
168 { "capability", required_argument, NULL, ARG_CAPABILITY },
169 { "drop-capability", required_argument, NULL, ARG_DROP_CAPABILITY },
170 { "link-journal", required_argument, NULL, ARG_LINK_JOURNAL },
171 { "bind", required_argument, NULL, ARG_BIND },
172 { "bind-ro", required_argument, NULL, ARG_BIND_RO },
173 { "machine", required_argument, NULL, 'M' },
174 { "slice", required_argument, NULL, 'S' },
175 { "setenv", required_argument, NULL, ARG_SETENV },
/* With glibc getopt, a leading '+' in the short option string stops option
 * processing at the first non-option argument, so options that belong to
 * the command run in the container are not consumed here. */
184 while ((c = getopt_long(argc, argv, "+hD:u:bM:jS:", options, NULL)) >= 0) {
192 puts(PACKAGE_STRING);
193 puts(SYSTEMD_FEATURES);
/* canonicalize_file_name() returns NULL on failure and sets errno, which
 * the %m below prints. */
198 arg_directory = canonicalize_file_name(optarg);
199 if (!arg_directory) {
200 log_error("Invalid root directory: %m");
208 arg_user = strdup(optarg);
214 case ARG_PRIVATE_NETWORK:
215 arg_private_network = true;
223 r = sd_id128_from_string(optarg, &arg_uuid);
225 log_error("Invalid UUID: %s", optarg);
231 arg_slice = strdup(optarg);
/* The machine name is validated before it is copied. */
238 if (!hostname_is_valid(optarg)) {
239 log_error("Invalid machine name: %s", optarg);
244 arg_machine = strdup(optarg);
251 arg_read_only = true;
/* Capability options: optarg is split at ',' and each word is resolved with
 * cap_from_name(). The test on c below shows this code also runs for
 * ARG_CAPABILITY, which sets the bit instead of clearing it. */
255 case ARG_DROP_CAPABILITY: {
259 FOREACH_WORD_SEPARATOR(word, length, optarg, ",", state) {
263 t = strndup(word, length);
267 if (cap_from_name(t, &cap) < 0) {
268 log_error("Failed to parse capability %s.", t);
275 if (c == ARG_CAPABILITY)
276 arg_retain |= 1ULL << (uint64_t) cap;
278 arg_retain &= ~(1ULL << (uint64_t) cap);
/* NOTE(review): the case label for this assignment is not visible in this
 * excerpt; presumably it is the -j handler. help() describes -j as "host",
 * but LINK_GUEST is assigned here -- confirm. */
285 arg_link_journal = LINK_GUEST;
288 case ARG_LINK_JOURNAL:
289 if (streq(optarg, "auto"))
290 arg_link_journal = LINK_AUTO;
291 else if (streq(optarg, "no"))
292 arg_link_journal = LINK_NO;
293 else if (streq(optarg, "guest"))
294 arg_link_journal = LINK_GUEST;
295 else if (streq(optarg, "host"))
296 arg_link_journal = LINK_HOST;
298 log_error("Failed to parse link journal mode %s", optarg);
/* Bind mount options: ARG_BIND and ARG_BIND_RO differ only in the vector
 * the two paths are appended to. */
306 _cleanup_free_ char *a = NULL, *b = NULL;
310 x = c == ARG_BIND ? &arg_bind : &arg_bind_ro;
/* a receives the text in front of the first ':' (the assignment of b is
 * not visible in this excerpt). */
312 e = strchr(optarg, ':');
314 a = strndup(optarg, e - optarg);
/* Both paths must be absolute. */
324 if (!path_is_absolute(a) || !path_is_absolute(b)) {
325 log_error("Invalid bind mount specification: %s", optarg);
/* Stored as two consecutive entries: a first, then b. */
329 r = strv_extend(x, a);
333 r = strv_extend(x, b);
343 if (!env_assignment_is_valid(optarg)) {
344 log_error("Environment variable assignment '%s' is not valid.", optarg);
/* strv_env_set() hands back a vector in n; the previous arg_setenv is
 * released afterwards. */
348 n = strv_env_set(arg_setenv, optarg);
352 strv_free(arg_setenv);
361 assert_not_reached("Unhandled option");
/* Walk the static mount table and mount every entry below dest.
 *
 * dest: root directory of the container tree; each entry's "where" path is
 *       appended to it.
 *
 * A mount() failure is only logged as an error for entries whose fatal
 * flag is set. */
368 static int mount_all(const char *dest) {
370 typedef struct MountPoint {
/* Columns: source ("what"), mount point ("where"), a third string that
 * holds file system type names such as "proc" or "tmpfs", mount options,
 * mount flags, and the fatal flag. Rows with a NULL source are the
 * read-only remount step for the bind mount in the row above them. */
379 static const MountPoint mount_table[] = {
380 { "proc", "/proc", "proc", NULL, MS_NOSUID|MS_NOEXEC|MS_NODEV, true },
381 { "/proc/sys", "/proc/sys", NULL, NULL, MS_BIND, true }, /* Bind mount first */
382 { NULL, "/proc/sys", NULL, NULL, MS_BIND|MS_RDONLY|MS_REMOUNT, true }, /* Then, make it r/o */
383 { "sysfs", "/sys", "sysfs", NULL, MS_RDONLY|MS_NOSUID|MS_NOEXEC|MS_NODEV, true },
384 { "tmpfs", "/dev", "tmpfs", "mode=755", MS_NOSUID|MS_STRICTATIME, true },
385 { "devpts", "/dev/pts", "devpts","newinstance,ptmxmode=0666,mode=620,gid=" STRINGIFY(TTY_GID), MS_NOSUID|MS_NOEXEC, true },
386 { "tmpfs", "/dev/shm", "tmpfs", "mode=1777", MS_NOSUID|MS_NODEV|MS_STRICTATIME, true },
387 { "tmpfs", "/run", "tmpfs", "mode=755", MS_NOSUID|MS_NODEV|MS_STRICTATIME, true },
389 { "/sys/fs/selinux", "/sys/fs/selinux", NULL, NULL, MS_BIND, false }, /* Bind mount first */
390 { NULL, "/sys/fs/selinux", NULL, NULL, MS_BIND|MS_RDONLY|MS_REMOUNT, false }, /* Then, make it r/o */
397 for (k = 0; k < ELEMENTSOF(mount_table); k++) {
398 _cleanup_free_ char *where = NULL;
/* Absolute path of the mount point inside the container tree. */
401 where = strjoin(dest, "/", mount_table[k].where, NULL);
/* t < 0 is reported as an error below; t > 0 means "where" already is a
 * mount point. */
405 t = path_is_mount_point(where, true);
407 log_error("Failed to detect whether %s is a mount point: %s", where, strerror(-t));
415 /* Skip this entry if it is not a remount. */
416 if (mount_table[k].what && t > 0)
/* The result of mkdir_p() is not checked here. */
419 mkdir_p(where, 0755);
421 if (mount(mount_table[k].what,
424 mount_table[k].flags,
425 mount_table[k].options) < 0 &&
426 mount_table[k].fatal) {
428 log_error("mount(%s) failed: %m", where);
/* Bind mount every (source, destination) pair of l into the container.
 *
 * dest:  root directory of the container tree; the destination of each
 *        pair is appended to it.
 * l:     string vector that is walked in pairs (*x = source, *y =
 *        destination).
 * flags: if non-zero, the fresh bind mount is remounted with
 *        MS_REMOUNT|MS_BIND|flags (main() passes MS_RDONLY for the
 *        read-only list). */
438 static int mount_binds(const char *dest, char **l, unsigned long flags) {
441 STRV_FOREACH_PAIR(x, y, l) {
443 struct stat source_st, dest_st;
446 if (stat(*x, &source_st) < 0) {
447 log_error("failed to stat %s: %m", *x);
451 where = strappenda(dest, *y);
/* If the destination already exists its file type must match the source;
 * if it does not exist (ENOENT) the parent directories and then the mount
 * point itself are created. */
452 r = stat(where, &dest_st);
454 if ((source_st.st_mode & S_IFMT) != (dest_st.st_mode & S_IFMT)) {
455 log_error("The file types of %s and %s do not match. Refusing bind mount",
459 } else if (errno == ENOENT) {
460 r = mkdir_parents_label(where, 0755);
462 log_error("Failed to bind mount %s: %s", *x, strerror(-r));
466 log_error("Failed to bind mount %s: %s", *x, strerror(errno));
469 /* Create the mount point, but be conservative -- refuse to create block
470 * and char devices. */
471 if (S_ISDIR(source_st.st_mode))
472 mkdir_label(where, 0755);
473 else if (S_ISFIFO(source_st.st_mode))
475 else if (S_ISSOCK(source_st.st_mode))
476 mknod(where, 0644 | S_IFSOCK, 0);
477 else if (S_ISREG(source_st.st_mode))
480 log_error("Refusing to create mountpoint for file: %s", *x);
/* Plain bind mount first ... */
484 if (mount(*x, where, "bind", MS_BIND, NULL) < 0) {
485 log_error("mount(%s) failed: %m", where);
/* ... then a remount that applies the extra flags, if any were given. */
489 if (flags && mount(NULL, where, NULL, MS_REMOUNT|MS_BIND|flags, NULL) < 0) {
490 log_error("mount(%s) failed: %m", where);
/* Point the container's /etc/localtime at the zone the host's
 * /etc/localtime refers to.
 *
 * dest: root directory of the container tree.
 *
 * Problems with the host link or a zone missing in the container are
 * logged as warnings; only a failing symlink() is logged as an error. */
498 static int setup_timezone(const char *dest) {
499 _cleanup_free_ char *where = NULL, *p = NULL, *q = NULL, *check = NULL, *what = NULL;
505 /* Fix the timezone, if possible */
506 r = readlink_malloc("/etc/localtime", &p);
508 log_warning("/etc/localtime is not a symlink, not updating container timezone.");
/* z is the zone name of the host link, i.e. what follows the zoneinfo
 * prefix; both the relative and the absolute prefix are tried. */
512 z = path_startswith(p, "../usr/share/zoneinfo/");
514 z = path_startswith(p, "/usr/share/zoneinfo/");
516 log_warning("/etc/localtime does not point into /usr/share/zoneinfo/, not updating container timezone.");
520 where = strappend(dest, "/etc/localtime");
/* y is the zone name the container's link currently points to, derived
 * the same way. */
524 r = readlink_malloc(where, &q);
526 y = path_startswith(q, "../usr/share/zoneinfo/");
528 y = path_startswith(q, "/usr/share/zoneinfo/");
531 /* Already pointing to the right place? Then do nothing .. */
532 if (y && streq(y, z))
/* The zone file has to exist inside the container tree. */
536 check = strjoin(dest, "/usr/share/zoneinfo/", z, NULL);
540 if (access(check, F_OK) < 0) {
541 log_warning("Timezone %s does not exist in container, not updating container timezone.", z);
/* The new link target is written in the relative form. */
545 what = strappend("../usr/share/zoneinfo/", z);
550 if (symlink(what, where) < 0) {
551 log_error("Failed to correct timezone of container: %m");
/* Copy the host's /etc/resolv.conf to the same path below dest.
 *
 * dest: root directory of the container tree.
 *
 * The result of the copy is deliberately ignored. The arg_private_network
 * test guards the copy; its body is not visible in this excerpt
 * (presumably an early return -- confirm). */
558 static int setup_resolv_conf(const char *dest) {
559 char _cleanup_free_ *where = NULL;
563 if (arg_private_network)
566 /* Fix resolv.conf, if possible */
567 where = strappend(dest, "/etc/resolv.conf");
571 /* We don't really care for the results of this really. If it
572 * fails, it fails, but meh... */
573 copy_file("/etc/resolv.conf", where, O_TRUNC|O_NOFOLLOW);
/* Give the container its own random boot ID.
 *
 * dest: root directory of the container tree.
 *
 * A random 128-bit ID is formatted as a UUID string, written to a file
 * below dest/dev and bind mounted over dest/proc/sys/kernel/random/boot_id.
 * The read-only remount afterwards is best effort: its failure is only
 * logged as a warning. */
578 static int setup_boot_id(const char *dest) {
579 _cleanup_free_ char *from = NULL, *to = NULL;
586 /* Generate a new randomized boot ID, so that each boot-up of
587 * the container gets a new one */
/* from: file holding the generated ID; to: path it is mounted over. */
589 from = strappend(dest, "/dev/proc-sys-kernel-random-boot-id");
590 to = strappend(dest, "/proc/sys/kernel/random/boot_id");
594 r = sd_id128_randomize(&rnd);
596 log_error("Failed to generate random boot id: %s", strerror(-r));
/* Dashed 8-4-4-4-12 hex layout. */
600 snprintf(as_uuid, sizeof(as_uuid),
601 "%02x%02x%02x%02x-%02x%02x-%02x%02x-%02x%02x-%02x%02x%02x%02x%02x%02x",
602 SD_ID128_FORMAT_VAL(rnd));
603 char_array_0(as_uuid);
605 r = write_string_file(from, as_uuid);
607 log_error("Failed to write boot id: %s", strerror(-r));
611 if (mount(from, to, "bind", MS_BIND, NULL) < 0) {
612 log_error("Failed to bind mount boot id: %m");
614 } else if (mount(from, to, "bind", MS_BIND|MS_REMOUNT|MS_RDONLY, NULL))
615 log_warning("Failed to make boot id read-only: %m");
/* Re-create a fixed list of host device nodes inside the container.
 *
 * dest: root directory of the container tree.
 *
 * For every name in the devnodes list, the host's /dev/<name> is examined
 * with stat() and a node with the same mode and device number is created
 * as dest/dev/<name>. */
621 static int copy_devnodes(const char *dest) {
623 static const char devnodes[] =
633 _cleanup_umask_ mode_t u;
639 NULSTR_FOREACH(d, devnodes) {
640 _cleanup_free_ char *from = NULL, *to = NULL;
643 from = strappend("/dev/", d);
644 to = strjoin(dest, "/dev/", d, NULL);
648 if (stat(from, &st) < 0) {
/* Only errors other than ENOENT are reported: a node that does not exist
 * on the host is not treated as a failure here. */
650 if (errno != ENOENT) {
651 log_error("Failed to stat %s: %m", from);
655 } else if (!S_ISCHR(st.st_mode) && !S_ISBLK(st.st_mode)) {
657 log_error("%s is not a char or block device, cannot copy", from);
/* NOTE(review): the node is created at "to", but the error message below
 * prints "dest" -- looks like the wrong variable; confirm. */
660 } else if (mknod(to, st.st_mode, st.st_rdev) < 0) {
662 log_error("mknod(%s) failed: %m", dest);
/* Create dest/dev/ptmx as a symbolic link with the relative target
 * "pts/ptmx".
 *
 * dest: root directory of the container tree. */
670 static int setup_ptmx(const char *dest) {
671 _cleanup_free_ char *p = NULL;
673 p = strappend(dest, "/dev/ptmx");
677 if (symlink("pts/ptmx", p) < 0) {
678 log_error("Failed to create /dev/ptmx symlink: %m");
/* Make the given terminal available as /dev/console in the container.
 *
 * dest:    root directory of the container tree.
 * console: path of the terminal device on the host side (main() passes the
 *          ptsname() of its pseudo terminal master).
 *
 * Visible steps: check that console is a character device, adjust its
 * access mode and ownership with chmod_and_chown(console, 0600, 0, 0),
 * create a device node at dest/dev/console with the same device number and
 * mode 0600, and bind mount console on top of that node. */
685 static int setup_dev_console(const char *dest, const char *console) {
687 _cleanup_free_ char *to = NULL;
689 _cleanup_umask_ mode_t u;
696 if (stat(console, &st) < 0) {
697 log_error("Failed to stat %s: %m", console);
/* NOTE(review): the file tested here is "console", while the message names
 * /dev/console. */
700 } else if (!S_ISCHR(st.st_mode)) {
701 log_error("/dev/console is not a char device");
705 r = chmod_and_chown(console, 0600, 0, 0);
707 log_error("Failed to correct access mode for TTY: %s", strerror(-r));
711 if (asprintf(&to, "%s/dev/console", dest) < 0)
714 /* We need to bind mount the right tty to /dev/console since
715 * ptys can only exist on pts file systems. To have something
716 * to bind mount things on we create a device node first, that
717 * has the right major/minor (note that the major minor
718 * doesn't actually matter here, since we mount it over
721 if (mknod(to, (st.st_mode & ~07777) | 0600, st.st_rdev) < 0) {
722 log_error("mknod() for /dev/console failed: %m");
726 if (mount(console, to, "bind", MS_BIND, NULL) < 0) {
727 log_error("Bind mount for /dev/console failed: %m");
734 static int setup_kmsg(const char *dest, int kmsg_socket) {
735 _cleanup_free_ char *from = NULL, *to = NULL;
737 _cleanup_umask_ mode_t u;
739 struct cmsghdr cmsghdr;
740 uint8_t buf[CMSG_SPACE(sizeof(int))];
743 .msg_control = &control,
744 .msg_controllen = sizeof(control),
746 struct cmsghdr *cmsg;
749 assert(kmsg_socket >= 0);
753 /* We create the kmsg FIFO as /dev/kmsg, but immediately
754 * delete it after bind mounting it to /proc/kmsg. While FIFOs
755 * on the reading side behave very similar to /proc/kmsg,
756 * their writing side behaves differently from /dev/kmsg in
757 * that writing blocks when nothing is reading. In order to
758 * avoid any problems with containers deadlocking due to this
759 * we simply make /dev/kmsg unavailable to the container.
 *
 * Summary of this function: dest is the root directory of the
 * container tree and kmsg_socket is a socket (asserted to be >= 0)
 * over which a file descriptor of the FIFO is sent. The steps
 * visible below are: create the FIFO at dest/dev/kmsg, fix its mode
 * and owner, bind mount it to dest/proc/kmsg, open it and send the
 * resulting descriptor through kmsg_socket as SCM_RIGHTS data. */
760 if (asprintf(&from, "%s/dev/kmsg", dest) < 0 ||
761 asprintf(&to, "%s/proc/kmsg", dest) < 0)
764 if (mkfifo(from, 0600) < 0) {
765 log_error("mkfifo() for /dev/kmsg failed: %m");
769 r = chmod_and_chown(from, 0600, 0, 0);
771 log_error("Failed to correct access mode for /dev/kmsg: %s", strerror(-r));
775 if (mount(from, to, "bind", MS_BIND, NULL) < 0) {
776 log_error("Bind mount for /proc/kmsg failed: %m");
/* Opened read-write with O_NDELAY, so this open() does not wait for a
 * peer on the FIFO. */
780 fd = open(from, O_RDWR|O_NDELAY|O_CLOEXEC);
782 log_error("Failed to open fifo: %m");
/* Build one control message of type SCM_RIGHTS that carries fd. */
786 cmsg = CMSG_FIRSTHDR(&mh);
787 cmsg->cmsg_level = SOL_SOCKET;
788 cmsg->cmsg_type = SCM_RIGHTS;
789 cmsg->cmsg_len = CMSG_LEN(sizeof(int));
790 memcpy(CMSG_DATA(cmsg), &fd, sizeof(int));
792 mh.msg_controllen = cmsg->cmsg_len;
794 /* Store away the fd in the socket, so that it stays open as
795 * long as we run the child */
796 k = sendmsg(kmsg_socket, &mh, MSG_DONTWAIT|MSG_NOSIGNAL);
/* The local descriptor is closed right after the send attempt, before the
 * result k is evaluated. */
797 close_nointr_nofail(fd);
800 log_error("Failed to send FIFO fd: %m");
804 /* And now make the FIFO unavailable as /dev/kmsg... */
/* Set the host name of the calling process' environment to arg_machine
 * with sethostname(). */
809 static int setup_hostname(void) {
811 if (sethostname(arg_machine, strlen(arg_machine)) < 0)
/* Connect the container's journal directory with the host's, as selected
 * by arg_link_journal.
 *
 * directory: root directory of the container tree.
 *
 * The machine ID is read from directory/etc/machine-id and must differ
 * from the host's ID. Two paths are then derived:
 *   p = /var/log/journal/<id> on the host,
 *   q = the same path below directory.
 * In LINK_GUEST mode p is made a symlink that points to q; in LINK_HOST
 * mode p is created as a directory and, further down, bind mounted onto q.
 * In LINK_AUTO mode several failure conditions are tolerated that are
 * errors in the explicit modes.
 *
 * NOTE(review): several messages below use %m after helpers whose result
 * is stored in r (mkdir_p(), readlink_and_make_absolute()); whether errno
 * is meaningful at those points cannot be told from this excerpt. */
817 static int setup_journal(const char *directory) {
818 sd_id128_t machine_id, this_id;
819 _cleanup_free_ char *p = NULL, *b = NULL, *q = NULL, *d = NULL;
823 p = strappend(directory, "/etc/machine-id");
827 r = read_one_line_file(p, &b);
828 if (r == -ENOENT && arg_link_journal == LINK_AUTO)
831 log_error("Failed to read machine ID from %s: %s", p, strerror(-r));
836 if (isempty(id) && arg_link_journal == LINK_AUTO)
839 /* Verify validity */
840 r = sd_id128_from_string(id, &machine_id);
842 log_error("Failed to parse machine ID from %s: %s", p, strerror(-r));
846 r = sd_id128_get_machine(&this_id);
848 log_error("Failed to retrieve machine ID: %s", strerror(-r));
/* Identical IDs: warning in LINK_AUTO mode, error otherwise. */
852 if (sd_id128_equal(machine_id, this_id)) {
853 log_full(arg_link_journal == LINK_AUTO ? LOG_WARNING : LOG_ERR,
854 "Host and machine ids are equal (%s): refusing to link journals", id);
855 if (arg_link_journal == LINK_AUTO)
861 if (arg_link_journal == LINK_NO)
/* From here on p is the host side and q the container side. */
865 p = strappend("/var/log/journal/", id);
866 q = strjoin(directory, "/var/log/journal/", id, NULL);
/* Neither side may already be a mount point; this is only reported as an
 * error when a mode other than LINK_AUTO was requested. */
870 if (path_is_mount_point(p, false) > 0) {
871 if (arg_link_journal != LINK_AUTO) {
872 log_error("%s: already a mount point, refusing to use for journal", p);
879 if (path_is_mount_point(q, false) > 0) {
880 if (arg_link_journal != LINK_AUTO) {
881 log_error("%s: already a mount point, refusing to use for journal", q);
/* Examine what currently exists at p; d receives the link target if p is
 * a symlink. */
888 r = readlink_and_make_absolute(p, &d);
890 if ((arg_link_journal == LINK_GUEST ||
891 arg_link_journal == LINK_AUTO) &&
894 r = mkdir_p(q, 0755);
896 log_warning("failed to create directory %s: %m", q);
901 log_error("Failed to remove symlink %s: %m", p);
/* -EINVAL from the readlink helper: p exists but is not a symlink. */
904 } else if (r == -EINVAL) {
906 if (arg_link_journal == LINK_GUEST &&
909 if (errno == ENOTDIR) {
910 log_error("%s already exists and is neither a symlink nor a directory", p);
913 log_error("Failed to remove %s: %m", p);
917 } else if (r != -ENOENT) {
918 log_error("readlink(%s) failed: %m", p);
/* Guest mode: the host path p becomes a symlink to the container's
 * directory q. */
922 if (arg_link_journal == LINK_GUEST) {
924 if (symlink(q, p) < 0) {
925 log_error("Failed to symlink %s to %s: %m", q, p);
929 r = mkdir_p(q, 0755);
931 log_warning("failed to create directory %s: %m", q);
/* Host mode: make sure the host directory p exists. */
935 if (arg_link_journal == LINK_HOST) {
936 r = mkdir_p(p, 0755);
938 log_error("Failed to create %s: %m", p);
942 } else if (access(p, F_OK) < 0)
/* dir_is_empty() returning 0 means q has content; that is an error. */
945 if (dir_is_empty(q) == 0) {
946 log_error("%s not empty.", q);
950 r = mkdir_p(q, 0755);
952 log_error("Failed to create %s: %m", q);
/* The host directory is bind mounted onto the container's directory. */
956 if (mount(p, q, "bind", MS_BIND, NULL) < 0) {
957 log_error("Failed to bind mount journal from host into guest: %m");
/* Create the directory dest/dev/kdbus and bind mount path onto it.
 *
 * dest: root directory of the container tree.
 * path: source of the bind mount (main() passes the kdbus namespace path
 *       it obtained from bus_kernel_create_namespace()). */
964 static int setup_kdbus(const char *dest, const char *path) {
970 p = strappenda(dest, "/dev/kdbus");
971 if (mkdir(p, 0755) < 0) {
972 log_error("Failed to create kdbus path: %m");
976 if (mount(path, p, "bind", MS_BIND, NULL) < 0) {
977 log_error("Failed to mount kdbus namespace path: %m");
/* Hand the complement of arg_retain (every capability bit that is not to
 * be retained) to capability_bounding_set_drop() and return its result. */
984 static int drop_capabilities(void) {
985 return capability_bounding_set_drop(~arg_retain, false);
/* Announce the container to the machine manager on the system bus.
 *
 * pid: process ID of the container's first process, as passed by main().
 *
 * A method of org.freedesktop.machine1.Manager is called on a fresh system
 * bus connection (the method name is not visible in this excerpt). The
 * visible arguments include arg_uuid and arg_directory; a "Slice" property
 * of type "s" is announced only when arg_slice is non-empty, via the
 * leading count expression !isempty(arg_slice). */
988 static int register_machine(pid_t pid) {
989 _cleanup_bus_error_free_ sd_bus_error error = SD_BUS_ERROR_NULL;
990 _cleanup_bus_unref_ sd_bus *bus = NULL;
993 r = sd_bus_open_system(&bus);
995 log_error("Failed to open system bus: %s", strerror(-r));
999 r = sd_bus_call_method(
1001 "org.freedesktop.machine1",
1002 "/org/freedesktop/machine1",
1003 "org.freedesktop.machine1.Manager",
1009 SD_BUS_MESSAGE_APPEND_ID128(arg_uuid),
1013 strempty(arg_directory),
1014 !isempty(arg_slice), "Slice", "s", arg_slice);
1016 log_error("Failed to register machine: %s", bus_error_message(&error, r));
/* Ask the machine manager to terminate the container's machine object.
 *
 * pid: process ID of the container's first process, as passed by main().
 *
 * Two bus calls are made: one on org.freedesktop.machine1.Manager whose
 * reply carries an object path, and one on
 * org.freedesktop.machine1.Machine (method names are not visible in this
 * excerpt). Failures of either call are logged at debug level only. */
1023 static int terminate_machine(pid_t pid) {
1024 _cleanup_bus_error_free_ sd_bus_error error = SD_BUS_ERROR_NULL;
1025 _cleanup_bus_message_unref_ sd_bus_message *reply = NULL;
1026 _cleanup_bus_unref_ sd_bus *bus = NULL;
/* Unlike register_machine(), which uses sd_bus_open_system(), this uses
 * sd_bus_default_system(). */
1030 r = sd_bus_default_system(&bus);
1032 log_error("Failed to open system bus: %s", strerror(-r));
1036 r = sd_bus_call_method(
1038 "org.freedesktop.machine1",
1039 "/org/freedesktop/machine1",
1040 "org.freedesktop.machine1.Manager",
1047 /* Note that the machine might already have been
1048 * cleaned up automatically, hence don't consider it a
1049 * failure if we cannot get the machine object. */
1050 log_debug("Failed to get machine: %s", bus_error_message(&error, r));
/* The reply holds a single object path ("o"). */
1054 r = sd_bus_message_read(reply, "o", &path);
1056 return bus_log_parse_error(r);
1058 r = sd_bus_call_method(
1060 "org.freedesktop.machine1",
1062 "org.freedesktop.machine1.Machine",
1068 log_debug("Failed to terminate machine: %s", bus_error_message(&error, r));
/* Probe for the kernel audit subsystem by opening a raw NETLINK_AUDIT
 * socket; the socket is closed again right away. The return statements
 * are not visible in this excerpt (presumably true when the socket could
 * be created -- confirm). */
1075 static bool audit_enabled(void) {
1078 fd = socket(AF_NETLINK, SOCK_RAW, NETLINK_AUDIT);
1080 close_nointr_nofail(fd);
/* Program entry point.
 *
 * Visible flow: parse the command line, determine the container directory
 * and machine name, run sanity checks (root, systemd host, OS tree),
 * allocate a pseudo terminal, then clone() a child into new namespaces.
 * The child sets up mounts and devices below arg_directory, switches root,
 * drops capabilities, adjusts credentials and the environment and finally
 * executes init, the given command or a shell. The parent registers the
 * machine, releases the child through an eventfd, relays the terminal and
 * translates the child's exit status into log messages. */
1086 int main(int argc, char *argv[]) {
1088 int r = EXIT_FAILURE, k;
1089 _cleanup_close_ int master = -1, kdbus_fd = -1, sync_fd = -1;
1091 const char *console = NULL;
1093 _cleanup_close_pipe_ int kmsg_socket_pair[2] = { -1, -1 };
1094 _cleanup_fdset_free_ FDSet *fds = NULL;
1095 _cleanup_free_ char *kdbus_namespace = NULL;
1098 log_parse_environment();
1101 k = parse_argv(argc, argv);
/* Make the directory absolute, or fall back to the current directory when
 * none was given. */
1109 if (arg_directory) {
1112 p = path_make_absolute_cwd(arg_directory);
1113 free(arg_directory);
1116 arg_directory = get_current_dir_name();
1118 if (!arg_directory) {
1119 log_error("Failed to determine path, please use -D.");
1123 path_kill_slashes(arg_directory);
/* Default machine name: last path component of the directory, cleaned up
 * by hostname_cleanup(). */
1126 arg_machine = strdup(basename(arg_directory));
1132 hostname_cleanup(arg_machine, false);
1133 if (isempty(arg_machine)) {
1134 log_error("Failed to determine machine name automatically, please use -M.");
1139 if (geteuid() != 0) {
1140 log_error("Need to be root.");
1144 if (sd_booted() <= 0) {
1145 log_error("Not running on a systemd system.");
1149 if (arg_boot && audit_enabled()) {
1150 log_warning("The kernel auditing subsystem is known to be incompatible with containers.\n"
1151 "Please make sure to turn off auditing with 'audit=0' on the kernel command\n"
1152 "line before using systemd-nspawn. Sleeping for 5s...\n");
1156 if (path_equal(arg_directory, "/")) {
1157 log_error("Spawning container on root directory not supported.");
1161 if (path_is_os_tree(arg_directory) <= 0) {
1162 log_error("Directory %s doesn't look like an OS root directory (/etc/os-release is missing). Refusing.", arg_directory);
/* File descriptors received through socket activation are collected in
 * fds; LISTEN_FDS/LISTEN_PID are set for the container further down. */
1167 n_fd_passed = sd_listen_fds(false);
1168 if (n_fd_passed > 0) {
1169 k = fdset_new_listen_fds(&fds, false);
1171 log_error("Failed to collect file descriptors: %s", strerror(-k));
1175 fdset_close_others(fds);
/* Pseudo terminal: master stays with the parent, console is the name of
 * the slave side used as the container's console. */
1178 master = posix_openpt(O_RDWR|O_NOCTTY|O_CLOEXEC|O_NDELAY);
1180 log_error("Failed to acquire pseudo tty: %m");
1184 console = ptsname(master);
1186 log_error("Failed to determine tty name: %m");
1190 log_info("Spawning container %s on %s. Press ^] three times within 1s to abort execution.", arg_machine, arg_directory);
1192 if (unlockpt(master) < 0) {
1193 log_error("Failed to unlock tty: %m");
/* NOTE(review): the result of bus_kernel_create_namespace() is stored in
 * kdbus_fd, but the message below formats strerror(-r); r still holds its
 * initial value at this point unless a line not visible here changes it.
 * Looks like kdbus_fd was intended -- confirm. */
1197 ns = strappenda("machine-", arg_machine);
1198 kdbus_fd = bus_kernel_create_namespace(ns, &kdbus_namespace);
1200 log_debug("Failed to create kdbus namespace: %s", strerror(-r));
1202 log_debug("Successfully created kdbus namespace as %s", kdbus_namespace);
/* Socket pair used by setup_kmsg() in the child to hand a file descriptor
 * to the parent side. */
1204 if (socketpair(AF_UNIX, SOCK_DGRAM|SOCK_NONBLOCK|SOCK_CLOEXEC, 0, kmsg_socket_pair) < 0) {
1205 log_error("Failed to create kmsg socket pair: %m");
/* eventfd on which the child blocks until the parent has registered the
 * machine (eventfd_read() in the child, eventfd_write() in the parent). */
1209 sync_fd = eventfd(0, EFD_CLOEXEC);
1211 log_error("Failed to create event fd: %m");
1215 sd_notify(0, "READY=1");
/* Block the signals that are handed to process_pty() as a mask later. */
1217 assert_se(sigemptyset(&mask) == 0);
1218 sigset_add_many(&mask, SIGCHLD, SIGWINCH, SIGTERM, SIGINT, -1);
1219 assert_se(sigprocmask(SIG_BLOCK, &mask, NULL) == 0);
/* Raw clone() with new IPC, mount, PID and UTS namespaces; a network
 * namespace is added only for --private-network. */
1224 pid = syscall(__NR_clone, SIGCHLD|CLONE_NEWIPC|CLONE_NEWNS|CLONE_NEWPID|CLONE_NEWUTS|(arg_private_network ? CLONE_NEWNET : 0), NULL);
1226 if (errno == EINVAL)
1227 log_error("clone() failed, do you have namespace support enabled in your kernel? (You need UTS, IPC, PID and NET namespacing built in): %m");
1229 log_error("clone() failed: %m");
/* ---- child side ---- */
1236 const char *home = NULL;
1237 uid_t uid = (uid_t) -1;
1238 gid_t gid = (gid_t) -1;
/* Environment for the payload; the NULL slots are filled in below, with
 * n_env indexing the next free slot. */
1240 const char *envp[] = {
1241 "PATH=" DEFAULT_PATH_SPLIT_USR,
1242 "container=systemd-nspawn", /* LXC sets container=lxc, so follow the scheme here */
1247 NULL, /* container_uuid */
1248 NULL, /* LISTEN_FDS */
1249 NULL, /* LISTEN_PID */
1255 envp[n_env] = strv_find_prefix(environ, "TERM=");
1259 close_nointr_nofail(master);
/* Close the standard descriptors so the console opened below can take
 * their place. */
1262 close_nointr(STDIN_FILENO);
1263 close_nointr(STDOUT_FILENO);
1264 close_nointr(STDERR_FILENO);
1266 close_nointr_nofail(kmsg_socket_pair[0]);
1267 kmsg_socket_pair[0] = -1;
1269 reset_all_signal_handlers();
1271 assert_se(sigemptyset(&mask) == 0);
1272 assert_se(sigprocmask(SIG_SETMASK, &mask, NULL) == 0);
/* The console has to end up as descriptor 0; it is then duplicated to
 * stdout and stderr. */
1274 k = open_terminal(console, O_RDWR);
1275 if (k != STDIN_FILENO) {
1277 close_nointr_nofail(k);
1281 log_error("Failed to open console: %s", strerror(-k));
1285 if (dup2(STDIN_FILENO, STDOUT_FILENO) != STDOUT_FILENO ||
1286 dup2(STDIN_FILENO, STDERR_FILENO) != STDERR_FILENO) {
1287 log_error("Failed to duplicate console: %m");
1292 log_error("setsid() failed: %m");
/* Have the kernel send SIGKILL to this process when its parent dies. */
1296 if (prctl(PR_SET_PDEATHSIG, SIGKILL) < 0) {
1297 log_error("PR_SET_PDEATHSIG failed: %m");
1301 /* Mark everything as slave, so that we still
1302 * receive mounts from the real root, but don't
1303 * propagate mounts to the real root. */
1304 if (mount(NULL, "/", NULL, MS_SLAVE|MS_REC, NULL) < 0) {
1305 log_error("MS_SLAVE|MS_REC failed: %m");
1309 /* Turn directory into bind mount */
1310 if (mount(arg_directory, arg_directory, "bind", MS_BIND|MS_REC, NULL) < 0) {
1311 log_error("Failed to make bind mount.");
1316 if (mount(arg_directory, arg_directory, "bind", MS_BIND|MS_REMOUNT|MS_RDONLY|MS_REC, NULL) < 0) {
1317 log_error("Failed to make read-only.");
/* Populate the container tree; every helper returns a negative value on
 * failure. The result of dev_setup() is not checked. */
1321 if (mount_all(arg_directory) < 0)
1324 if (copy_devnodes(arg_directory) < 0)
1327 if (setup_ptmx(arg_directory) < 0)
1330 dev_setup(arg_directory);
1332 if (setup_dev_console(arg_directory, console) < 0)
1335 if (setup_kmsg(arg_directory, kmsg_socket_pair[1]) < 0)
1338 close_nointr_nofail(kmsg_socket_pair[1]);
1339 kmsg_socket_pair[1] = -1;
1341 if (setup_boot_id(arg_directory) < 0)
1344 if (setup_timezone(arg_directory) < 0)
1347 if (setup_resolv_conf(arg_directory) < 0)
1350 if (setup_journal(arg_directory) < 0)
1353 if (mount_binds(arg_directory, arg_bind, 0) < 0)
1356 if (mount_binds(arg_directory, arg_bind_ro, MS_RDONLY) < 0)
1359 if (setup_kdbus(arg_directory, kdbus_namespace) < 0)
/* Switch the root: enter the directory, move its mount to "/", chroot
 * into it and reset the working directory. */
1362 if (chdir(arg_directory) < 0) {
1363 log_error("chdir(%s) failed: %m", arg_directory);
1367 if (mount(arg_directory, "/", NULL, MS_MOVE, NULL) < 0) {
1368 log_error("mount(MS_MOVE) failed: %m");
1372 if (chroot(".") < 0) {
1373 log_error("chroot() failed: %m");
1377 if (chdir("/") < 0) {
1378 log_error("chdir() failed: %m");
1386 if (drop_capabilities() < 0) {
1387 log_error("drop_capabilities() failed: %m");
1393 /* Note that this resolves user names
1394 * inside the container, and hence
1395 * accesses the NSS modules from the
1396 * container and not the host. This is
1399 if (get_user_creds((const char**)&arg_user, &uid, &gid, &home, NULL) < 0) {
1400 log_error("get_user_creds() failed: %m");
1404 if (mkdir_parents_label(home, 0775) < 0) {
1405 log_error("mkdir_parents_label() failed: %m");
1409 if (mkdir_safe_label(home, 0775, uid, gid) < 0) {
1410 log_error("mkdir_safe_label() failed: %m");
1414 if (initgroups((const char*)arg_user, gid) < 0) {
1415 log_error("initgroups() failed: %m");
1419 if (setresgid(gid, gid, gid) < 0) {
1420 log_error("setregid() failed: %m");
1424 if (setresuid(uid, uid, uid) < 0) {
1425 log_error("setreuid() failed: %m");
1429 /* Reset everything fully to 0, just in case */
1431 if (setgroups(0, NULL) < 0) {
1432 log_error("setgroups() failed: %m");
/* NOTE(review): the messages for the setresgid()/setresuid() calls in this
 * function name setregid()/setreuid(). */
1436 if (setresgid(0, 0, 0) < 0) {
1437 log_error("setregid() failed: %m");
1441 if (setresuid(0, 0, 0) < 0) {
1442 log_error("setreuid() failed: %m");
/* Fill the free envp slots; each asprintf() stores a freshly allocated
 * string and advances n_env. */
1447 if ((asprintf((char**)(envp + n_env++), "HOME=%s", home ? home: "/root") < 0) ||
1448 (asprintf((char**)(envp + n_env++), "USER=%s", arg_user ? arg_user : "root") < 0) ||
1449 (asprintf((char**)(envp + n_env++), "LOGNAME=%s", arg_user ? arg_user : "root") < 0)) {
1454 if (!sd_id128_equal(arg_uuid, SD_ID128_NULL)) {
1455 if (asprintf((char**)(envp + n_env++), "container_uuid=" SD_ID128_FORMAT_STR, SD_ID128_FORMAT_VAL(arg_uuid)) < 0) {
/* Passed-in descriptors must survive exec, so O_CLOEXEC is cleared; the
 * payload is told it runs as PID 1. */
1461 if (fdset_size(fds) > 0) {
1462 k = fdset_cloexec(fds, false);
1464 log_error("Failed to unset O_CLOEXEC for file descriptors.");
1468 if ((asprintf((char **)(envp + n_env++), "LISTEN_FDS=%u", n_fd_passed) < 0) ||
1469 (asprintf((char **)(envp + n_env++), "LISTEN_PID=1") < 0)) {
/* Wait until the parent has written to the eventfd. */
1477 eventfd_read(sync_fd, &x);
1478 close_nointr_nofail(sync_fd);
/* env_use is either the merged vector or envp itself. */
1481 if (!strv_isempty(arg_setenv)) {
1484 n = strv_env_merge(2, envp, arg_setenv);
1492 env_use = (char**) envp;
1498 /* Automatically search for the init system */
/* a[0] is the init binary, followed by the remaining command line
 * arguments; each execve() only returns on failure, so the candidates are
 * tried in order. */
1500 l = 1 + argc - optind;
1501 a = newa(char*, l + 1);
1502 memcpy(a + 1, argv + optind, l * sizeof(char*));
1504 a[0] = (char*) "/usr/lib/systemd/systemd";
1505 execve(a[0], a, env_use);
1507 a[0] = (char*) "/lib/systemd/systemd";
1508 execve(a[0], a, env_use);
1510 a[0] = (char*) "/sbin/init";
1511 execve(a[0], a, env_use);
1512 } else if (argc > optind)
1513 execvpe(argv[optind], argv + optind, env_use);
/* No command given: start a login shell (argv[0] is "-bash") in the home
 * directory. The result of chdir() is not checked. */
1515 chdir(home ? home : "/root");
1516 execle("/bin/bash", "-bash", NULL, env_use);
1519 log_error("execv() failed: %m");
1522 _exit(EXIT_FAILURE);
/* ---- parent side ---- */
1528 r = register_machine(pid);
/* Release the child, which is blocked in eventfd_read(). */
1532 eventfd_write(sync_fd, 1);
1533 close_nointr_nofail(sync_fd);
/* The third argument is the child's PID only when booting a full system,
 * and SIGRTMIN+3 is passed as the signal argument. */
1536 k = process_pty(master, &mask, arg_boot ? pid : 0, SIGRTMIN+3);
1544 /* Kill if it is not dead yet anyway */
1545 terminate_machine(pid);
1547 /* Redundant, but better safe than sorry */
1550 k = wait_for_terminate(pid, &status);
/* Translate the child's termination status: a normal exit propagates the
 * exit code into r; SIGINT and SIGHUP are reported as shutdown and reboot;
 * other signals and unknown states are errors. */
1558 if (status.si_code == CLD_EXITED) {
1559 r = status.si_status;
1560 if (status.si_status != 0) {
1561 log_error("Container %s failed with error code %i.", arg_machine, status.si_status);
1565 log_debug("Container %s exited successfully.", arg_machine);
1567 } else if (status.si_code == CLD_KILLED &&
1568 status.si_status == SIGINT) {
1569 log_info("Container %s has been shut down.", arg_machine);
1572 } else if (status.si_code == CLD_KILLED &&
1573 status.si_status == SIGHUP) {
1574 log_info("Container %s is being rebooted.", arg_machine);
1576 } else if (status.si_code == CLD_KILLED ||
1577 status.si_code == CLD_DUMPED) {
1579 log_error("Container %s terminated by signal %s.", arg_machine, signal_to_string(status.si_status));
1583 log_error("Container %s failed due to unknown reason.", arg_machine);
1593 free(arg_directory);