1 /*-*- Mode: C; c-basic-offset: 8; indent-tabs-mode: nil -*-*/
4 This file is part of systemd.
6 Copyright 2010 Lennart Poettering
8 systemd is free software; you can redistribute it and/or modify it
9 under the terms of the GNU Lesser General Public License as published by
10 the Free Software Foundation; either version 2.1 of the License, or
11 (at your option) any later version.
13 systemd is distributed in the hope that it will be useful, but
14 WITHOUT ANY WARRANTY; without even the implied warranty of
15 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
16 Lesser General Public License for more details.
18 You should have received a copy of the GNU Lesser General Public License
19 along with systemd; If not, see <http://www.gnu.org/licenses/>.
25 #include <sys/types.h>
26 #include <sys/syscall.h>
27 #include <sys/mount.h>
33 #include <sys/prctl.h>
34 #include <sys/capability.h>
37 #include <sys/signalfd.h>
41 #include <sys/socket.h>
42 #include <linux/netlink.h>
44 #include "sd-daemon.h"
53 #include "cgroup-util.h"
55 #include "path-util.h"
56 #include "loopback-setup.h"
57 #include "dev-setup.h"
62 #include "bus-error.h"
64 #include "bus-kernel.h"
/* Journal linking mode for the container. The enumerators themselves are not
 * visible in this excerpt; LINK_AUTO, LINK_NO, LINK_GUEST and LINK_HOST are
 * the values this file assigns to arg_link_journal. */
70 typedef enum LinkJournal {
/* Command line state. These globals are written by parse_argv() and read by
 * main() and the setup helpers further down. */
77 static char *arg_directory = NULL;
78 static char *arg_user = NULL;
79 static sd_id128_t arg_uuid = {};
80 static char *arg_machine = NULL;
81 static const char *arg_slice = NULL;
82 static bool arg_private_network = false;
83 static bool arg_read_only = false;
84 static bool arg_boot = false;
85 static LinkJournal arg_link_journal = LINK_AUTO;
/* Bit mask of capabilities, one bit per CAP_* number. drop_capabilities()
 * passes the complement of this mask to capability_bounding_set_drop().
 * --capability sets additional bits and --drop-capability clears bits. */
86 static uint64_t arg_retain =
88 (1ULL << CAP_DAC_OVERRIDE) |
89 (1ULL << CAP_DAC_READ_SEARCH) |
90 (1ULL << CAP_FOWNER) |
91 (1ULL << CAP_FSETID) |
92 (1ULL << CAP_IPC_OWNER) |
95 (1ULL << CAP_LINUX_IMMUTABLE) |
96 (1ULL << CAP_NET_BIND_SERVICE) |
97 (1ULL << CAP_NET_BROADCAST) |
98 (1ULL << CAP_NET_RAW) |
99 (1ULL << CAP_SETGID) |
100 (1ULL << CAP_SETFCAP) |
101 (1ULL << CAP_SETPCAP) |
102 (1ULL << CAP_SETUID) |
103 (1ULL << CAP_SYS_ADMIN) |
104 (1ULL << CAP_SYS_CHROOT) |
105 (1ULL << CAP_SYS_NICE) |
106 (1ULL << CAP_SYS_PTRACE) |
107 (1ULL << CAP_SYS_TTY_CONFIG) |
108 (1ULL << CAP_SYS_RESOURCE) |
109 (1ULL << CAP_SYS_BOOT) |
110 (1ULL << CAP_AUDIT_WRITE) |
111 (1ULL << CAP_AUDIT_CONTROL);
/* String vectors of bind mounts. parse_argv() appends entries two at a time
 * (source, then destination) and mount_binds() walks them as pairs. */
112 static char **arg_bind = NULL;
113 static char **arg_bind_ro = NULL;
/* Prints the usage text to stdout, with program_invocation_short_name
 * substituted for the leading %s.
 *
 * NOTE(review): the -j entry below says it is equivalent to
 * --link-journal=host, but the only assignment in parse_argv() that is not
 * under the ARG_LINK_JOURNAL label sets LINK_GUEST. If that assignment is the
 * -j handler (its case label is not visible here), this help line is wrong;
 * confirm and align the two. */
115 static int help(void) {
117 printf("%s [OPTIONS...] [PATH] [ARGUMENTS...]\n\n"
118 "Spawn a minimal namespace container for debugging, testing and building.\n\n"
119 " -h --help Show this help\n"
120 " --version Print version string\n"
121 " -D --directory=NAME Root directory for the container\n"
122 " -b --boot Boot up full system (i.e. invoke init)\n"
123 " -u --user=USER Run the command under specified user or uid\n"
124 " --uuid=UUID Set a specific machine UUID for the container\n"
125 " -M --machine=NAME Set the machine name for the container\n"
126 " -S --slice=SLICE Place the container in the specified slice\n"
127 " --private-network Disable network in container\n"
128 " --read-only Mount the root directory read-only\n"
129 " --capability=CAP In addition to the default, retain specified\n"
131 " --drop-capability=CAP Drop the specified capability from the default set\n"
132 " --link-journal=MODE Link up guest journal, one of no, auto, guest, host\n"
133 " -j Equivalent to --link-journal=host\n"
134 " --bind=PATH[:PATH] Bind mount a file or directory from the host into\n"
136 " --bind-ro=PATH[:PATH] Similar, but creates a read-only bind mount\n",
137 program_invocation_short_name);
/* Parses the command line into the arg_* globals.
 *
 * Long options come from the options[] table below; the short option string
 * handed to getopt_long() is "+hD:u:bM:jS:". Most case labels, error returns
 * and closing braces of this function are not visible in this excerpt, so
 * the notes below describe only what the visible statements do. */
142 static int parse_argv(int argc, char *argv[]) {
156 static const struct option options[] = {
157 { "help", no_argument, NULL, 'h' },
158 { "version", no_argument, NULL, ARG_VERSION },
159 { "directory", required_argument, NULL, 'D' },
160 { "user", required_argument, NULL, 'u' },
161 { "private-network", no_argument, NULL, ARG_PRIVATE_NETWORK },
162 { "boot", no_argument, NULL, 'b' },
163 { "uuid", required_argument, NULL, ARG_UUID },
164 { "read-only", no_argument, NULL, ARG_READ_ONLY },
165 { "capability", required_argument, NULL, ARG_CAPABILITY },
166 { "drop-capability", required_argument, NULL, ARG_DROP_CAPABILITY },
167 { "link-journal", required_argument, NULL, ARG_LINK_JOURNAL },
168 { "bind", required_argument, NULL, ARG_BIND },
169 { "bind-ro", required_argument, NULL, ARG_BIND_RO },
170 { "machine", required_argument, NULL, 'M' },
171 { "slice", required_argument, NULL, 'S' },
180 while ((c = getopt_long(argc, argv, "+hD:u:bM:jS:", options, NULL)) >= 0) {
188 puts(PACKAGE_STRING);
189 puts(SYSTEMD_FEATURES);
/* The root directory is stored in canonicalized form; a NULL result is
 * reported as an invalid root directory. */
194 arg_directory = canonicalize_file_name(optarg);
195 if (!arg_directory) {
196 log_error("Invalid root directory: %m");
204 arg_user = strdup(optarg);
210 case ARG_PRIVATE_NETWORK:
211 arg_private_network = true;
219 r = sd_id128_from_string(optarg, &arg_uuid);
221 log_error("Invalid UUID: %s", optarg);
/* NOTE(review): arg_slice is declared const char * but receives memory
 * from strdup() here, so the pointer type hides the ownership. */
227 arg_slice = strdup(optarg);
/* The machine name is validated with hostname_is_valid() before it is
 * copied. */
234 if (!hostname_is_valid(optarg)) {
235 log_error("Invalid machine name: %s", optarg);
240 arg_machine = strdup(optarg);
247 arg_read_only = true;
/* Capability options take a comma separated list. Each word is copied,
 * resolved with cap_from_name(), and then either set in arg_retain
 * (ARG_CAPABILITY) or cleared from it (any other value of c reaching
 * this code). */
251 case ARG_DROP_CAPABILITY: {
255 FOREACH_WORD_SEPARATOR(word, length, optarg, ",", state) {
259 t = strndup(word, length);
263 if (cap_from_name(t, &cap) < 0) {
264 log_error("Failed to parse capability %s.", t);
271 if (c == ARG_CAPABILITY)
272 arg_retain |= 1ULL << (uint64_t) cap;
274 arg_retain &= ~(1ULL << (uint64_t) cap);
/* NOTE(review): presumably the -j handler (label not visible). It selects
 * LINK_GUEST, while help() documents -j as --link-journal=host. */
281 arg_link_journal = LINK_GUEST;
284 case ARG_LINK_JOURNAL:
285 if (streq(optarg, "auto"))
286 arg_link_journal = LINK_AUTO;
287 else if (streq(optarg, "no"))
288 arg_link_journal = LINK_NO;
289 else if (streq(optarg, "guest"))
290 arg_link_journal = LINK_GUEST;
291 else if (streq(optarg, "host"))
292 arg_link_journal = LINK_HOST;
294 log_error("Failed to parse link journal mode %s", optarg);
/* Bind mount options: x points at arg_bind or arg_bind_ro depending on
 * the option. The argument is split at the first ':' into a and b (how b
 * is filled is not visible here), both must be absolute paths, and they
 * are appended to the vector as a pair. */
302 _cleanup_free_ char *a = NULL, *b = NULL;
306 x = c == ARG_BIND ? &arg_bind : &arg_bind_ro;
308 e = strchr(optarg, ':');
310 a = strndup(optarg, e - optarg);
320 if (!path_is_absolute(a) || !path_is_absolute(b)) {
321 log_error("Invalid bind mount specification: %s", optarg);
325 r = strv_extend(x, a);
329 r = strv_extend(x, b);
340 assert_not_reached("Unhandled option");
/* Mounts the API file systems listed in mount_table[] below the container
 * root given by dest.
 *
 * Each table row is passed to mount() in the visible call; the last column
 * is the fatal flag that decides whether a failed mount() is reported.
 * Rows whose first column (what) is NULL are the read-only remount steps
 * for the bind mount set up by the row before them. The struct definition
 * and the return statements are not visible in this excerpt. */
347 static int mount_all(const char *dest) {
349 typedef struct MountPoint {
358 static const MountPoint mount_table[] = {
359 { "proc", "/proc", "proc", NULL, MS_NOSUID|MS_NOEXEC|MS_NODEV, true },
360 { "/proc/sys", "/proc/sys", NULL, NULL, MS_BIND, true }, /* Bind mount first */
361 { NULL, "/proc/sys", NULL, NULL, MS_BIND|MS_RDONLY|MS_REMOUNT, true }, /* Then, make it r/o */
362 { "sysfs", "/sys", "sysfs", NULL, MS_RDONLY|MS_NOSUID|MS_NOEXEC|MS_NODEV, true },
363 { "tmpfs", "/dev", "tmpfs", "mode=755", MS_NOSUID|MS_STRICTATIME, true },
364 { "devpts", "/dev/pts", "devpts","newinstance,ptmxmode=0666,mode=620,gid=" STRINGIFY(TTY_GID), MS_NOSUID|MS_NOEXEC, true },
365 { "tmpfs", "/dev/shm", "tmpfs", "mode=1777", MS_NOSUID|MS_NODEV|MS_STRICTATIME, true },
366 { "tmpfs", "/run", "tmpfs", "mode=755", MS_NOSUID|MS_NODEV|MS_STRICTATIME, true },
368 { "/sys/fs/selinux", "/sys/fs/selinux", NULL, NULL, MS_BIND, false }, /* Bind mount first */
369 { NULL, "/sys/fs/selinux", NULL, NULL, MS_BIND|MS_RDONLY|MS_REMOUNT, false }, /* Then, make it r/o */
376 for (k = 0; k < ELEMENTSOF(mount_table); k++) {
377 _cleanup_free_ char *where = NULL;
/* Target path is dest + "/" + the where column of the row. */
380 where = strjoin(dest, "/", mount_table[k].where, NULL);
384 t = path_is_mount_point(where, true);
386 log_error("Failed to detect whether %s is a mount point: %s", where, strerror(-t));
394 /* Skip this entry if it is not a remount. */
/* That is: the target is already a mount point (t > 0) and the row has a
 * source, so it is not one of the what == NULL remount rows. */
395 if (mount_table[k].what && t > 0)
/* The result of mkdir_p() is not checked on this line. */
398 mkdir_p(where, 0755);
/* A failing mount() is only treated as an error when the row is marked
 * fatal; non-fatal rows fail silently. */
400 if (mount(mount_table[k].what,
403 mount_table[k].flags,
404 mount_table[k].options) < 0 &&
405 mount_table[k].fatal) {
407 log_error("mount(%s) failed: %m", where);
/* Bind mounts every (source, destination) pair of the string vector l into
 * the container.
 *
 * dest is prepended to each destination path. flags are extra mount flags;
 * when non-zero they are applied with a second MS_REMOUNT|MS_BIND call after
 * the initial bind mount (main() passes 0 or MS_RDONLY). Some branches and
 * the return statements are not visible in this excerpt. */
417 static int mount_binds(const char *dest, char **l, unsigned long flags) {
420 STRV_FOREACH_PAIR(x, y, l) {
422 struct stat source_st, dest_st;
425 if (stat(*x, &source_st) < 0) {
426 log_error("failed to stat %s: %m", *x);
430 where = strappenda(dest, *y);
431 r = stat(where, &dest_st);
/* When the destination already exists its file type must match the
 * source file type. */
433 if ((source_st.st_mode & S_IFMT) != (dest_st.st_mode & S_IFMT)) {
434 log_error("The file types of %s and %s do not match. Refusing bind mount",
/* Destination missing: create its parent directories first. */
438 } else if (errno == ENOENT) {
439 r = mkdir_parents_label(where, 0755);
441 log_error("Failed to bind mount %s: %s", *x, strerror(-r));
445 log_error("Failed to bind mount %s: %s", *x, strerror(errno));
448 /* Create the mount point, but be conservative -- refuse to create block
449 * and char devices. */
/* The results of mkdir_label() and mknod() are not checked on the
 * visible lines. */
450 if (S_ISDIR(source_st.st_mode))
451 mkdir_label(where, 0755);
452 else if (S_ISFIFO(source_st.st_mode))
454 else if (S_ISSOCK(source_st.st_mode))
455 mknod(where, 0644 | S_IFSOCK, 0);
456 else if (S_ISREG(source_st.st_mode))
459 log_error("Refusing to create mountpoint for file: %s", *x);
463 if (mount(*x, where, "bind", MS_BIND, NULL) < 0) {
464 log_error("mount(%s) failed: %m", where);
/* Second pass applies the caller supplied flags to the new bind mount. */
468 if (flags && mount(NULL, where, NULL, MS_REMOUNT|MS_BIND|flags, NULL) < 0) {
469 log_error("mount(%s) failed: %m", where);
/* Points /etc/localtime inside the container (rooted at dest) at the same
 * zoneinfo entry as the /etc/localtime symlink of the host.
 *
 * The host link must resolve into ../usr/share/zoneinfo/ or
 * /usr/share/zoneinfo/; z is the zone name after that prefix. Conditions
 * that only produce log_warning() leave the container untouched. The return
 * statements and the removal of an existing link, if any, are not visible in
 * this excerpt. */
477 static int setup_timezone(const char *dest) {
478 _cleanup_free_ char *where = NULL, *p = NULL, *q = NULL, *check = NULL, *what = NULL;
484 /* Fix the timezone, if possible */
485 r = readlink_malloc("/etc/localtime", &p);
487 log_warning("/etc/localtime is not a symlink, not updating container timezone.");
491 z = path_startswith(p, "../usr/share/zoneinfo/");
493 z = path_startswith(p, "/usr/share/zoneinfo/");
495 log_warning("/etc/localtime does not point into /usr/share/zoneinfo/, not updating container timezone.");
499 where = strappend(dest, "/etc/localtime");
/* Read the current link of the container, if any, and extract its zone
 * name into y the same way as for the host. */
503 r = readlink_malloc(where, &q);
505 y = path_startswith(q, "../usr/share/zoneinfo/");
507 y = path_startswith(q, "/usr/share/zoneinfo/");
510 /* Already pointing to the right place? Then do nothing .. */
511 if (y && streq(y, z))
/* The zone file has to exist inside the container tree. */
515 check = strjoin(dest, "/usr/share/zoneinfo/", z, NULL);
519 if (access(check, F_OK) < 0) {
520 log_warning("Timezone %s does not exist in container, not updating container timezone.", z);
/* The new link target is written in the relative ../usr/share/zoneinfo/
 * form. */
524 what = strappend("../usr/share/zoneinfo/", z);
529 if (symlink(what, where) < 0) {
530 log_error("Failed to correct timezone of container: %m");
/* Copies /etc/resolv.conf of the host to dest + "/etc/resolv.conf".
 *
 * The arg_private_network check precedes the copy; its body is not visible
 * here, presumably an early return. The copy is best effort: the result of
 * copy_file() is deliberately ignored. */
537 static int setup_resolv_conf(const char *dest) {
538 char _cleanup_free_ *where = NULL;
542 if (arg_private_network)
545 /* Fix resolv.conf, if possible */
546 where = strappend(dest, "/etc/resolv.conf");
550 /* We don't really care for the results of this really. If it
551 * fails, it fails, but meh... */
552 copy_file("/etc/resolv.conf", where, O_TRUNC|O_NOFOLLOW);
/* Gives the container its own boot ID.
 *
 * A random 128 bit ID is formatted as a UUID string, written to the file
 * dest + "/dev/proc-sys-kernel-random-boot-id", and that file is bind
 * mounted over dest + "/proc/sys/kernel/random/boot_id". A second mount call
 * tries to make the bind mount read-only; failure of that step only logs a
 * warning. Return statements are not visible in this excerpt. */
557 static int setup_boot_id(const char *dest) {
558 _cleanup_free_ char *from = NULL, *to = NULL;
565 /* Generate a new randomized boot ID, so that each boot-up of
566 * the container gets a new one */
568 from = strappend(dest, "/dev/proc-sys-kernel-random-boot-id");
569 to = strappend(dest, "/proc/sys/kernel/random/boot_id");
573 r = sd_id128_randomize(&rnd);
575 log_error("Failed to generate random boot id: %s", strerror(-r));
/* Format the 16 bytes in the 8-4-4-4-12 hex digit UUID layout. */
579 snprintf(as_uuid, sizeof(as_uuid),
580 "%02x%02x%02x%02x-%02x%02x-%02x%02x-%02x%02x-%02x%02x%02x%02x%02x%02x",
581 SD_ID128_FORMAT_VAL(rnd));
582 char_array_0(as_uuid);
584 r = write_string_file(from, as_uuid);
586 log_error("Failed to write boot id: %s", strerror(-r));
/* NOTE(review): the remount call below is tested for any non-zero result
 * rather than < 0 like the other mount() calls in this file. */
590 if (mount(from, to, "bind", MS_BIND, NULL) < 0) {
591 log_error("Failed to bind mount boot id: %m");
593 } else if (mount(from, to, "bind", MS_BIND|MS_REMOUNT|MS_RDONLY, NULL))
594 log_warning("Failed to make boot id read-only: %m");
/* Recreates a fixed list of host device nodes under dest + "/dev".
 *
 * For every name d in the devnodes string list, /dev/d is examined with
 * stat() and, if it is a character or block device, created in the container
 * with mknod() using the same mode and device number. A source that fails
 * stat() with ENOENT is not reported as an error. The list contents and the
 * return statements are not visible in this excerpt. */
600 static int copy_devnodes(const char *dest) {
602 static const char devnodes[] =
612 _cleanup_umask_ mode_t u;
618 NULSTR_FOREACH(d, devnodes) {
620 _cleanup_free_ char *from = NULL, *to = NULL;
/* NOTE(review): the asprintf() results are not checked on these two
 * lines; confirm that the lines not shown here validate from and to. */
622 asprintf(&from, "/dev/%s", d);
623 asprintf(&to, "%s/dev/%s", dest, d);
634 if (stat(from, &st) < 0) {
636 if (errno != ENOENT) {
637 log_error("Failed to stat %s: %m", from);
642 } else if (!S_ISCHR(st.st_mode) && !S_ISBLK(st.st_mode)) {
644 log_error("%s is not a char or block device, cannot copy", from);
/* NOTE(review): the node is created at to, but the failure message below
 * prints dest; the message most likely should name to. */
648 } else if (mknod(to, st.st_mode, st.st_rdev) < 0) {
650 log_error("mknod(%s) failed: %m", dest);
/* Creates dest + "/dev/ptmx" as a symlink to the relative target pts/ptmx.
 * A symlink() failure is logged; return statements are not visible in this
 * excerpt. */
659 static int setup_ptmx(const char *dest) {
660 _cleanup_free_ char *p = NULL;
662 p = strappend(dest, "/dev/ptmx");
666 if (symlink("pts/ptmx", p) < 0) {
667 log_error("Failed to create /dev/ptmx symlink: %m");
/* Makes the tty named by console available as dest + "/dev/console".
 *
 * console must be a character device. Its mode and ownership are set to
 * 0600, uid 0, gid 0 with chmod_and_chown(). A device node with the same
 * device number and mode 0600 is then created at the target path and the
 * tty is bind mounted over it. Return statements are not visible in this
 * excerpt.
 *
 * NOTE(review): the "is not a char device" message names /dev/console even
 * though the path that was checked is the console argument. */
674 static int setup_dev_console(const char *dest, const char *console) {
676 _cleanup_free_ char *to = NULL;
678 _cleanup_umask_ mode_t u;
685 if (stat(console, &st) < 0) {
686 log_error("Failed to stat %s: %m", console);
689 } else if (!S_ISCHR(st.st_mode)) {
690 log_error("/dev/console is not a char device");
694 r = chmod_and_chown(console, 0600, 0, 0);
696 log_error("Failed to correct access mode for TTY: %s", strerror(-r));
700 if (asprintf(&to, "%s/dev/console", dest) < 0)
703 /* We need to bind mount the right tty to /dev/console since
704 * ptys can only exist on pts file systems. To have something
705 * to bind mount things on we create a device node first, that
706 * has the right major/minor (note that the major minor
707 * doesn't actually matter here, since we mount it over
710 if (mknod(to, (st.st_mode & ~07777) | 0600, st.st_rdev) < 0) {
711 log_error("mknod() for /dev/console failed: %m");
715 if (mount(console, to, "bind", MS_BIND, NULL) < 0) {
716 log_error("Bind mount for /dev/console failed: %m");
// Provides a FIFO backed /proc/kmsg inside the container rooted at dest.
//
// A FIFO is created at dest + "/dev/kmsg", bind mounted onto
// dest + "/proc/kmsg", opened, and the open descriptor is sent as an
// SCM_RIGHTS control message over kmsg_socket, which must be a valid
// descriptor (asserted). The local copy of the descriptor is closed right
// after sendmsg(). Return statements and the final removal of the FIFO
// path are not visible in this excerpt.
723 static int setup_kmsg(const char *dest, int kmsg_socket) {
724 _cleanup_free_ char *from = NULL, *to = NULL;
726 _cleanup_umask_ mode_t u;
728 struct cmsghdr cmsghdr;
729 uint8_t buf[CMSG_SPACE(sizeof(int))];
732 .msg_control = &control,
733 .msg_controllen = sizeof(control),
735 struct cmsghdr *cmsg;
738 assert(kmsg_socket >= 0);
742 /* We create the kmsg FIFO as /dev/kmsg, but immediately
743 * delete it after bind mounting it to /proc/kmsg. While FIFOs
744 * on the reading side behave very similar to /proc/kmsg,
745 * their writing side behaves differently from /dev/kmsg in
746 * that writing blocks when nothing is reading. In order to
747 * avoid any problems with containers deadlocking due to this
748 * we simply make /dev/kmsg unavailable to the container. */
749 if (asprintf(&from, "%s/dev/kmsg", dest) < 0 ||
750 asprintf(&to, "%s/proc/kmsg", dest) < 0)
753 if (mkfifo(from, 0600) < 0) {
754 log_error("mkfifo() for /dev/kmsg failed: %m");
758 r = chmod_and_chown(from, 0600, 0, 0);
760 log_error("Failed to correct access mode for /dev/kmsg: %s", strerror(-r));
764 if (mount(from, to, "bind", MS_BIND, NULL) < 0) {
765 log_error("Bind mount for /proc/kmsg failed: %m");
/* Open read/write and non-blocking so the open itself does not wait for
 * a peer on the FIFO. */
769 fd = open(from, O_RDWR|O_NDELAY|O_CLOEXEC);
771 log_error("Failed to open fifo: %m");
/* Build a single SCM_RIGHTS control message that carries fd. */
775 cmsg = CMSG_FIRSTHDR(&mh);
776 cmsg->cmsg_level = SOL_SOCKET;
777 cmsg->cmsg_type = SCM_RIGHTS;
778 cmsg->cmsg_len = CMSG_LEN(sizeof(int));
779 memcpy(CMSG_DATA(cmsg), &fd, sizeof(int));
781 mh.msg_controllen = cmsg->cmsg_len;
783 /* Store away the fd in the socket, so that it stays open as
784 * long as we run the child */
785 k = sendmsg(kmsg_socket, &mh, MSG_DONTWAIT|MSG_NOSIGNAL);
786 close_nointr_nofail(fd);
789 log_error("Failed to send FIFO fd: %m");
793 /* And now make the FIFO unavailable as /dev/kmsg... */
/* Sets the host name to arg_machine via sethostname(). arg_machine must be
 * non-NULL when this runs because strlen() is applied to it. The handling
 * of a failure is not visible in this excerpt. */
798 static int setup_hostname(void) {
800 if (sethostname(arg_machine, strlen(arg_machine)) < 0)
/* Connects the journal directory of the container with the one of the host
 * according to arg_link_journal.
 *
 * The machine ID is read from directory + "/etc/machine-id" and validated.
 * It must differ from the machine ID of the host. Two paths are then built:
 *   p = "/var/log/journal/" + id              (host side)
 *   q = directory + "/var/log/journal/" + id  (container side)
 * In LINK_GUEST mode p becomes a symlink to q; the final visible step bind
 * mounts p onto q. In LINK_AUTO mode several failure conditions are
 * tolerated instead of being reported as errors. Many branches and all
 * return statements are not visible in this excerpt, so the exact flow
 * between the visible statements should be confirmed against the full file.
 *
 * NOTE(review): several messages below use %m after calls whose result is
 * kept in r (mkdir_p(), readlink_and_make_absolute()). %m prints errno, so
 * confirm that errno is meaningful there or print strerror(-r) instead. */
806 static int setup_journal(const char *directory) {
807 sd_id128_t machine_id, this_id;
808 _cleanup_free_ char *p = NULL, *b = NULL, *q = NULL, *d = NULL;
812 p = strappend(directory, "/etc/machine-id");
816 r = read_one_line_file(p, &b);
817 if (r == -ENOENT && arg_link_journal == LINK_AUTO)
820 log_error("Failed to read machine ID from %s: %s", p, strerror(-r));
825 if (isempty(id) && arg_link_journal == LINK_AUTO)
828 /* Verify validity */
829 r = sd_id128_from_string(id, &machine_id);
831 log_error("Failed to parse machine ID from %s: %s", p, strerror(-r));
835 r = sd_id128_get_machine(&this_id);
837 log_error("Failed to retrieve machine ID: %s", strerror(-r));
/* Identical IDs are logged as a warning in LINK_AUTO mode and as an error
 * otherwise. */
841 if (sd_id128_equal(machine_id, this_id)) {
842 log_full(arg_link_journal == LINK_AUTO ? LOG_WARNING : LOG_ERR,
843 "Host and machine ids are equal (%s): refusing to link journals", id);
844 if (arg_link_journal == LINK_AUTO)
850 if (arg_link_journal == LINK_NO)
/* From here on p is reused for the host side journal path. */
854 p = strappend("/var/log/journal/", id);
855 q = strjoin(directory, "/var/log/journal/", id, NULL);
/* Neither side may already be a mount point; this is only reported as an
 * error outside LINK_AUTO mode. */
859 if (path_is_mount_point(p, false) > 0) {
860 if (arg_link_journal != LINK_AUTO) {
861 log_error("%s: already a mount point, refusing to use for journal", p);
868 if (path_is_mount_point(q, false) > 0) {
869 if (arg_link_journal != LINK_AUTO) {
870 log_error("%s: already a mount point, refusing to use for journal", q);
/* Inspect what currently exists at p: a symlink (d receives its target),
 * something that is not a symlink (-EINVAL), or nothing (-ENOENT). */
877 r = readlink_and_make_absolute(p, &d);
879 if ((arg_link_journal == LINK_GUEST ||
880 arg_link_journal == LINK_AUTO) &&
883 r = mkdir_p(q, 0755);
885 log_warning("failed to create directory %s: %m", q);
890 log_error("Failed to remove symlink %s: %m", p);
893 } else if (r == -EINVAL) {
895 if (arg_link_journal == LINK_GUEST &&
898 if (errno == ENOTDIR) {
899 log_error("%s already exists and is neither a symlink nor a directory", p);
902 log_error("Failed to remove %s: %m", p);
906 } else if (r != -ENOENT) {
907 log_error("readlink(%s) failed: %m", p);
/* LINK_GUEST: the host path becomes a symlink to the container path. */
911 if (arg_link_journal == LINK_GUEST) {
913 if (symlink(q, p) < 0) {
914 log_error("Failed to symlink %s to %s: %m", q, p);
918 r = mkdir_p(q, 0755);
920 log_warning("failed to create directory %s: %m", q);
/* LINK_HOST: make sure the host directory exists. In the other branch the
 * existence of p is only probed with access(). */
924 if (arg_link_journal == LINK_HOST) {
925 r = mkdir_p(p, 0755);
927 log_error("Failed to create %s: %m", p);
931 } else if (access(p, F_OK) < 0)
/* dir_is_empty() returning 0 is reported as the directory not being
 * empty. */
934 if (dir_is_empty(q) == 0) {
935 log_error("%s not empty.", q);
939 r = mkdir_p(q, 0755);
941 log_error("Failed to create %s: %m", q);
945 if (mount(p, q, "bind", MS_BIND, NULL) < 0) {
946 log_error("Failed to bind mount journal from host into guest: %m");
/* Creates the directory dest + "/dev/kdbus" and bind mounts path onto it.
 * main() passes the kdbus namespace path as path. Any handling of a NULL
 * path and the return statements are not visible in this excerpt. */
953 static int setup_kdbus(const char *dest, const char *path) {
959 p = strappenda(dest, "/dev/kdbus");
960 if (mkdir(p, 0755) < 0) {
961 log_error("Failed to create kdbus path: %m");
965 if (mount(path, p, "bind", MS_BIND, NULL) < 0) {
966 log_error("Failed to mount kdbus namespace path: %m");
/* Drops from the capability bounding set every capability whose bit is not
 * set in arg_retain, and returns the result of
 * capability_bounding_set_drop() unchanged. */
973 static int drop_capabilities(void) {
974 return capability_bounding_set_drop(~arg_retain, false);
/* Registers the container with the org.freedesktop.machine1 Manager object
 * over the system bus.
 *
 * The visible call arguments include arg_uuid, arg_directory (empty string
 * when NULL) and a "Slice" string property; the flag in front of "Slice" is
 * true only when arg_slice is non-empty. The method name, the remaining
 * arguments and the return statements are not visible in this excerpt. */
977 static int register_machine(void) {
978 _cleanup_bus_error_free_ sd_bus_error error = SD_BUS_ERROR_NULL;
979 _cleanup_bus_unref_ sd_bus *bus = NULL;
982 r = sd_bus_open_system(&bus);
984 log_error("Failed to open system bus: %s", strerror(-r));
988 r = sd_bus_call_method(
990 "org.freedesktop.machine1",
991 "/org/freedesktop/machine1",
992 "org.freedesktop.machine1.Manager",
998 SD_BUS_MESSAGE_APPEND_ID128(arg_uuid),
1002 strempty(arg_directory),
1003 !isempty(arg_slice), "Slice", "s", arg_slice);
1005 log_error("Failed to register machine: %s", bus_error_message(&error, r));
/* Asks machined to terminate the machine that belongs to pid.
 *
 * Two bus calls are made on the default system bus: the first on the
 * Manager interface yields an object path (read with signature "o"), the
 * second goes to the Machine interface. The method names and the object
 * path argument of the second call are not visible in this excerpt.
 * Failures of either call are only logged at debug level because the
 * machine may already be gone. */
1012 static int terminate_machine(pid_t pid) {
1013 _cleanup_bus_error_free_ sd_bus_error error = SD_BUS_ERROR_NULL;
1014 _cleanup_bus_message_unref_ sd_bus_message *reply = NULL;
1015 _cleanup_bus_unref_ sd_bus *bus = NULL;
1019 r = sd_bus_default_system(&bus);
1021 log_error("Failed to open system bus: %s", strerror(-r));
1025 r = sd_bus_call_method(
1027 "org.freedesktop.machine1",
1028 "/org/freedesktop/machine1",
1029 "org.freedesktop.machine1.Manager",
1036 /* Note that the machine might already have been
1037 * cleaned up automatically, hence don't consider it a
1038 * failure if we cannot get the machine object. */
1039 log_debug("Failed to get machine: %s", bus_error_message(&error, r));
1043 r = sd_bus_message_read(reply, "o", &path);
1045 return bus_log_parse_error(r);
1047 r = sd_bus_call_method(
1049 "org.freedesktop.machine1",
1051 "org.freedesktop.machine1.Machine",
1057 log_debug("Failed to terminate machine: %s", bus_error_message(&error, r));
/* Probes for kernel audit support by opening a NETLINK_AUDIT raw netlink
 * socket, which is closed again right away. The mapping from the socket()
 * result to the returned bool is not visible in this excerpt; presumably
 * true when the socket could be opened. */
1064 static bool audit_enabled(void) {
1067 fd = socket(AF_NETLINK, SOCK_RAW, NETLINK_AUDIT);
1069 close_nointr_nofail(fd);
/* Entry point: validates the environment, creates the container process in
 * new namespaces, sets up its file system view and executes the payload,
 * while the original process relays the pty and collects the exit status.
 *
 * Many lines (branch conditions, goto/return statements, closing braces)
 * are not visible in this excerpt; the comments below describe the visible
 * statements only. r starts out as EXIT_FAILURE and is later assigned the
 * container exit status in the CLD_EXITED branch. */
1075 int main(int argc, char *argv[]) {
1077 int r = EXIT_FAILURE, k;
1078 _cleanup_close_ int master = -1, kdbus_fd = -1;
1080 const char *console = NULL;
1082 _cleanup_close_pipe_ int kmsg_socket_pair[2] = { -1, -1 };
1083 _cleanup_fdset_free_ FDSet *fds = NULL;
1084 _cleanup_free_ char *kdbus_namespace = NULL;
1087 log_parse_environment();
1090 k = parse_argv(argc, argv);
/* Make the container directory absolute; without -D the current working
 * directory is used. */
1098 if (arg_directory) {
1101 p = path_make_absolute_cwd(arg_directory);
1102 free(arg_directory);
1105 arg_directory = get_current_dir_name();
1107 if (!arg_directory) {
1108 log_error("Failed to determine path, please use -D.");
1112 path_kill_slashes(arg_directory);
/* Derive a machine name from the last path component of the directory
 * and clean it up (the guarding condition is not visible here). */
1115 arg_machine = strdup(basename(arg_directory));
1121 hostname_cleanup(arg_machine, false);
1122 if (isempty(arg_machine)) {
1123 log_error("Failed to determine machine name automatically, please use -M.");
/* Preconditions: effective uid 0, systemd as init on the host, directory
 * is not "/" and looks like an OS tree. */
1128 if (geteuid() != 0) {
1129 log_error("Need to be root.");
1133 if (sd_booted() <= 0) {
1134 log_error("Not running on a systemd system.");
1138 if (arg_boot && audit_enabled()) {
1139 log_warning("The kernel auditing subsystem is known to be incompatible with containers.\n"
1140 "Please make sure to turn off auditing with 'audit=0' on the kernel command\n"
1141 "line before using systemd-nspawn. Sleeping for 5s...\n");
1145 if (path_equal(arg_directory, "/")) {
1146 log_error("Spawning container on root directory not supported.");
1150 if (path_is_os_tree(arg_directory) <= 0) {
1151 log_error("Directory %s doesn't look like an OS root directory (/etc/os-release is missing). Refusing.", arg_directory);
/* Collect file descriptors passed in via the sd_listen_fds() protocol so
 * that they survive fdset_close_others() and can be handed on. */
1156 n_fd_passed = sd_listen_fds(false);
1157 if (n_fd_passed > 0) {
1158 k = fdset_new_listen_fds(&fds, false);
1160 log_error("Failed to collect file descriptors: %s", strerror(-k));
1164 fdset_close_others(fds);
/* Allocate the pseudo terminal that becomes the container console. */
1167 master = posix_openpt(O_RDWR|O_NOCTTY|O_CLOEXEC|O_NDELAY);
1169 log_error("Failed to acquire pseudo tty: %m");
1173 console = ptsname(master);
1175 log_error("Failed to determine tty name: %m");
1179 log_info("Spawning container %s on %s. Press ^] three times within 1s to abort execution.", arg_machine, arg_directory);
1181 if (unlockpt(master) < 0) {
1182 log_error("Failed to unlock tty: %m");
/* NOTE(review): the result of bus_kernel_create_namespace() is stored in
 * kdbus_fd, but the failure message formats strerror(-r). At this point r
 * has not been assigned from that call, so the logged error text is
 * unrelated to the failure; it most likely should use kdbus_fd. */
1186 ns = strappenda("machine-", arg_machine);
1187 kdbus_fd = bus_kernel_create_namespace(ns, &kdbus_namespace);
1189 log_debug("Failed to create kdbus namespace: %s", strerror(-r));
1191 log_debug("Successfully created kdbus namespace as %s", kdbus_namespace);
/* Socket pair used by setup_kmsg() in the child to park the kmsg FIFO
 * descriptor. */
1193 if (socketpair(AF_UNIX, SOCK_DGRAM|SOCK_NONBLOCK|SOCK_CLOEXEC, 0, kmsg_socket_pair) < 0) {
1194 log_error("Failed to create kmsg socket pair.");
1198 sd_notify(0, "READY=1");
/* Block the signals that are later handed to process_pty() via mask. */
1200 assert_se(sigemptyset(&mask) == 0);
1201 sigset_add_many(&mask, SIGCHLD, SIGWINCH, SIGTERM, SIGINT, -1);
1202 assert_se(sigprocmask(SIG_BLOCK, &mask, NULL) == 0);
/* Raw clone() with new IPC, mount, PID and UTS namespaces, plus a new
 * network namespace when --private-network was given. */
1207 pid = syscall(__NR_clone, SIGCHLD|CLONE_NEWIPC|CLONE_NEWNS|CLONE_NEWPID|CLONE_NEWUTS|(arg_private_network ? CLONE_NEWNET : 0), NULL);
1209 if (errno == EINVAL)
1210 log_error("clone() failed, do you have namespace support enabled in your kernel? (You need UTS, IPC, PID and NET namespacing built in): %m");
1212 log_error("clone() failed: %m");
/* The statements from here up to _exit(EXIT_FAILURE) appear to be the
 * child side of the clone (the branch condition is not visible). envp is
 * the environment for the payload; the trailing NULL slots are filled in
 * further down using n_env as the write index. */
1219 const char *home = NULL;
1220 uid_t uid = (uid_t) -1;
1221 gid_t gid = (gid_t) -1;
1223 const char *envp[] = {
1224 "PATH=/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin",
1225 "container=systemd-nspawn", /* LXC sets container=lxc, so follow the scheme here */
1230 NULL, /* container_uuid */
1231 NULL, /* LISTEN_FDS */
1232 NULL, /* LISTEN_PID */
1236 envp[n_env] = strv_find_prefix(environ, "TERM=");
/* Replace stdin/stdout/stderr with the slave side of the pty. */
1240 close_nointr_nofail(master);
1243 close_nointr(STDIN_FILENO);
1244 close_nointr(STDOUT_FILENO);
1245 close_nointr(STDERR_FILENO);
1247 close_nointr_nofail(kmsg_socket_pair[0]);
1248 kmsg_socket_pair[0] = -1;
1250 reset_all_signal_handlers();
1252 assert_se(sigemptyset(&mask) == 0);
1253 assert_se(sigprocmask(SIG_SETMASK, &mask, NULL) == 0);
1255 k = open_terminal(console, O_RDWR);
1256 if (k != STDIN_FILENO) {
1258 close_nointr_nofail(k);
1262 log_error("Failed to open console: %s", strerror(-k));
1266 if (dup2(STDIN_FILENO, STDOUT_FILENO) != STDOUT_FILENO ||
1267 dup2(STDIN_FILENO, STDERR_FILENO) != STDERR_FILENO) {
1268 log_error("Failed to duplicate console: %m");
1273 log_error("setsid() failed: %m");
/* Have the kernel send SIGKILL to this process when its parent dies. */
1277 if (prctl(PR_SET_PDEATHSIG, SIGKILL) < 0) {
1278 log_error("PR_SET_PDEATHSIG failed: %m");
1282 r = register_machine();
1286 /* Mark everything as slave, so that we still
1287 * receive mounts from the real root, but don't
1288 * propagate mounts to the real root. */
1289 if (mount(NULL, "/", NULL, MS_SLAVE|MS_REC, NULL) < 0) {
1290 log_error("MS_SLAVE|MS_REC failed: %m");
1294 /* Turn directory into bind mount */
1295 if (mount(arg_directory, arg_directory, "bind", MS_BIND|MS_REC, NULL) < 0) {
1296 log_error("Failed to make bind mount.");
1301 if (mount(arg_directory, arg_directory, "bind", MS_BIND|MS_REMOUNT|MS_RDONLY|MS_REC, NULL) < 0) {
1302 log_error("Failed to make read-only.");
/* Populate the container tree. Each helper reports failure with a
 * negative return value. */
1306 if (mount_all(arg_directory) < 0)
1309 if (copy_devnodes(arg_directory) < 0)
1312 if (setup_ptmx(arg_directory) < 0)
1315 dev_setup(arg_directory);
1317 if (setup_dev_console(arg_directory, console) < 0)
1320 if (setup_kmsg(arg_directory, kmsg_socket_pair[1]) < 0)
1323 close_nointr_nofail(kmsg_socket_pair[1]);
1324 kmsg_socket_pair[1] = -1;
1326 if (setup_boot_id(arg_directory) < 0)
1329 if (setup_timezone(arg_directory) < 0)
1332 if (setup_resolv_conf(arg_directory) < 0)
1335 if (setup_journal(arg_directory) < 0)
1338 if (mount_binds(arg_directory, arg_bind, 0) < 0)
1341 if (mount_binds(arg_directory, arg_bind_ro, MS_RDONLY) < 0)
1344 if (setup_kdbus(arg_directory, kdbus_namespace) < 0)
/* Switch the root: move the prepared directory onto "/", chroot into it
 * and reset the working directory. */
1347 if (chdir(arg_directory) < 0) {
1348 log_error("chdir(%s) failed: %m", arg_directory);
1352 if (mount(arg_directory, "/", NULL, MS_MOVE, NULL) < 0) {
1353 log_error("mount(MS_MOVE) failed: %m");
1357 if (chroot(".") < 0) {
1358 log_error("chroot() failed: %m");
1362 if (chdir("/") < 0) {
1363 log_error("chdir() failed: %m");
1371 if (drop_capabilities() < 0) {
1372 log_error("drop_capabilities() failed: %m");
1378 /* Note that this resolves user names
1379 * inside the container, and hence
1380 * accesses the NSS modules from the
1381 * container and not the host. This is
1384 if (get_user_creds((const char**)&arg_user, &uid, &gid, &home, NULL) < 0) {
1385 log_error("get_user_creds() failed: %m");
1389 if (mkdir_parents_label(home, 0775) < 0) {
1390 log_error("mkdir_parents_label() failed: %m");
1394 if (mkdir_safe_label(home, 0775, uid, gid) < 0) {
1395 log_error("mkdir_safe_label() failed: %m");
1399 if (initgroups((const char*)arg_user, gid) < 0) {
1400 log_error("initgroups() failed: %m");
1404 if (setresgid(gid, gid, gid) < 0) {
1405 log_error("setregid() failed: %m");
1409 if (setresuid(uid, uid, uid) < 0) {
1410 log_error("setreuid() failed: %m");
1414 /* Reset everything fully to 0, just in case */
1416 if (setgroups(0, NULL) < 0) {
1417 log_error("setgroups() failed: %m");
/* NOTE(review): here and in the user switching code above, the calls are
 * setresgid()/setresuid() but the messages name setregid()/setreuid(). */
1421 if (setresgid(0, 0, 0) < 0) {
1422 log_error("setregid() failed: %m");
1426 if (setresuid(0, 0, 0) < 0) {
1427 log_error("setreuid() failed: %m");
/* Fill the reserved envp slots; every asprintf() writes the next slot and
 * advances n_env. */
1432 if ((asprintf((char**)(envp + n_env++), "HOME=%s", home ? home: "/root") < 0) ||
1433 (asprintf((char**)(envp + n_env++), "USER=%s", arg_user ? arg_user : "root") < 0) ||
1434 (asprintf((char**)(envp + n_env++), "LOGNAME=%s", arg_user ? arg_user : "root") < 0)) {
1439 if (!sd_id128_equal(arg_uuid, SD_ID128_NULL)) {
1440 if (asprintf((char**)(envp + n_env++), "container_uuid=" SD_ID128_FORMAT_STR, SD_ID128_FORMAT_VAL(arg_uuid)) < 0) {
/* Passed-in descriptors must survive exec, so O_CLOEXEC is cleared and
 * LISTEN_FDS / LISTEN_PID=1 are exported. */
1446 if (fdset_size(fds) > 0) {
1447 k = fdset_cloexec(fds, false);
1449 log_error("Failed to unset O_CLOEXEC for file descriptors.");
1453 if ((asprintf((char **)(envp + n_env++), "LISTEN_FDS=%u", n_fd_passed) < 0) ||
1454 (asprintf((char **)(envp + n_env++), "LISTEN_PID=1") < 0)) {
1466 /* Automatically search for the init system */
/* Build an argv with one free slot in front: l counts the remaining
 * command line arguments plus the terminating NULL of argv, all of which
 * are copied behind a[0]. The three candidate init paths are then tried
 * in order; execve() only returns on failure. */
1468 l = 1 + argc - optind;
1469 a = newa(char*, l + 1);
1470 memcpy(a + 1, argv + optind, l * sizeof(char*));
1472 a[0] = (char*) "/usr/lib/systemd/systemd";
1473 execve(a[0], a, (char**) envp);
1475 a[0] = (char*) "/lib/systemd/systemd";
1476 execve(a[0], a, (char**) envp);
1478 a[0] = (char*) "/sbin/init";
1479 execve(a[0], a, (char**) envp);
1480 } else if (argc > optind)
1481 execvpe(argv[optind], argv + optind, (char**) envp);
/* No command given: start a login shell from the home directory. The
 * result of chdir() is not checked. */
1483 chdir(home ? home : "/root");
1484 execle("/bin/bash", "-bash", NULL, (char**) envp);
1487 log_error("execv() failed: %m");
1490 _exit(EXIT_FAILURE);
/* Parent side: relay the pty until the container is done. In boot mode
 * the child pid and SIGRTMIN+3 are handed to process_pty(). */
1496 k = process_pty(master, &mask, arg_boot ? pid : 0, SIGRTMIN+3);
1504 /* Kill if it is not dead yet anyway */
1505 terminate_machine(pid);
1507 /* Redundant, but better safe than sorry */
1510 k = wait_for_terminate(pid, &status);
/* Translate the wait status into log output. A normal exit stores the
 * exit code in r; SIGINT is logged as a shutdown and SIGHUP as a reboot
 * request. */
1518 if (status.si_code == CLD_EXITED) {
1519 r = status.si_status;
1520 if (status.si_status != 0) {
1521 log_error("Container %s failed with error code %i.", arg_machine, status.si_status);
1525 log_debug("Container %s exited successfully.", arg_machine);
1527 } else if (status.si_code == CLD_KILLED &&
1528 status.si_status == SIGINT) {
1529 log_info("Container %s has been shut down.", arg_machine);
1532 } else if (status.si_code == CLD_KILLED &&
1533 status.si_status == SIGHUP) {
1534 log_info("Container %s is being rebooted.", arg_machine);
1536 } else if (status.si_code == CLD_KILLED ||
1537 status.si_code == CLD_DUMPED) {
1539 log_error("Container %s terminated by signal %s.", arg_machine, signal_to_string(status.si_status));
1543 log_error("Container %s failed due to unknown reason.", arg_machine);
1553 free(arg_directory);