1 /*-*- Mode: C; c-basic-offset: 8; indent-tabs-mode: nil -*-*/
4 This file is part of systemd.
6 Copyright 2010 Lennart Poettering
8 systemd is free software; you can redistribute it and/or modify it
9 under the terms of the GNU Lesser General Public License as published by
10 the Free Software Foundation; either version 2.1 of the License, or
11 (at your option) any later version.
13 systemd is distributed in the hope that it will be useful, but
14 WITHOUT ANY WARRANTY; without even the implied warranty of
15 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
16 Lesser General Public License for more details.
18 You should have received a copy of the GNU Lesser General Public License
19 along with systemd; If not, see <http://www.gnu.org/licenses/>.
25 #include <sys/types.h>
26 #include <sys/syscall.h>
27 #include <sys/mount.h>
33 #include <sys/prctl.h>
34 #include <sys/capability.h>
37 #include <sys/signalfd.h>
41 #include <sys/socket.h>
42 #include <linux/netlink.h>
43 #include <sys/eventfd.h>
45 #include <selinux/selinux.h>
48 #include "sd-daemon.h"
57 #include "cgroup-util.h"
59 #include "path-util.h"
60 #include "loopback-setup.h"
61 #include "dev-setup.h"
66 #include "bus-error.h"
68 #include "bus-kernel.h"
72 typedef enum LinkJournal { /* journal linking mode; the enumerators are outside this excerpt */
79 static char *arg_directory = NULL; /* container root directory (--directory) */
80 static char *arg_user = NULL; /* user to run the payload as (--user) */
81 static sd_id128_t arg_uuid = {}; /* machine UUID (--uuid); compared against SD_ID128_NULL in main() */
82 static char *arg_machine = NULL; /* machine name (--machine); main() can also derive it from the directory's basename */
83 static char *arg_selinux_context = NULL; /* SELinux context handed to setexeccon() in main() */
84 static char *arg_selinux_apifs_context = NULL; /* SELinux context appended to tmpfs/devpts mount options in mount_all() */
85 static const char *arg_slice = NULL; /* slice passed to machined in register_machine() */
86 static bool arg_private_network = false; /* --private-network: adds CLONE_NEWNET in main() */
87 static bool arg_read_only = false; /* --read-only */
88 static bool arg_boot = false; /* --boot */
89 static LinkJournal arg_link_journal = LINK_AUTO; /* --link-journal= mode, consumed by setup_journal() */
/* Bit mask of capabilities to retain. parse_argv() sets bits for --capability=
 * and clears them for --drop-capability=; drop_capabilities() passes the
 * complement on. The final term of the initializer is outside this excerpt. */
90 static uint64_t arg_retain =
92 (1ULL << CAP_DAC_OVERRIDE) |
93 (1ULL << CAP_DAC_READ_SEARCH) |
94 (1ULL << CAP_FOWNER) |
95 (1ULL << CAP_FSETID) |
96 (1ULL << CAP_IPC_OWNER) |
99 (1ULL << CAP_LINUX_IMMUTABLE) |
100 (1ULL << CAP_NET_BIND_SERVICE) |
101 (1ULL << CAP_NET_BROADCAST) |
102 (1ULL << CAP_NET_RAW) |
103 (1ULL << CAP_SETGID) |
104 (1ULL << CAP_SETFCAP) |
105 (1ULL << CAP_SETPCAP) |
106 (1ULL << CAP_SETUID) |
107 (1ULL << CAP_SYS_ADMIN) |
108 (1ULL << CAP_SYS_CHROOT) |
109 (1ULL << CAP_SYS_NICE) |
110 (1ULL << CAP_SYS_PTRACE) |
111 (1ULL << CAP_SYS_TTY_CONFIG) |
112 (1ULL << CAP_SYS_RESOURCE) |
113 (1ULL << CAP_SYS_BOOT) |
114 (1ULL << CAP_AUDIT_WRITE) |
115 (1ULL << CAP_AUDIT_CONTROL) |
117 static char **arg_bind = NULL; /* flat list of (source, destination) pairs from --bind= */
118 static char **arg_bind_ro = NULL; /* same, from --bind-ro= (mounted with MS_RDONLY in main()) */
119 static char **arg_setenv = NULL; /* NAME=VALUE assignments from --setenv= */
120 static bool arg_quiet = false; /* --quiet */
121 static bool arg_share_system = false; /* --share-system: skips CLONE_NEWIPC|CLONE_NEWPID|CLONE_NEWUTS in main() */
/* Print the usage summary to stdout, prefixed with the invoked program name.
 * The -j line documents LINK_GUEST, which is what parse_argv() assigns for
 * that switch. */
123 static int help(void) {
125 printf("%s [OPTIONS...] [PATH] [ARGUMENTS...]\n\n"
126 "Spawn a minimal namespace container for debugging, testing and building.\n\n"
127 " -h --help Show this help\n"
128 " --version Print version string\n"
129 " -D --directory=NAME Root directory for the container\n"
130 " -b --boot Boot up full system (i.e. invoke init)\n"
131 " -u --user=USER Run the command under specified user or uid\n"
132 " --uuid=UUID Set a specific machine UUID for the container\n"
133 " -M --machine=NAME Set the machine name for the container\n"
134 " -S --slice=SLICE Place the container in the specified slice\n"
135 " -Z --selinux-context=SECLABEL\n"
136 " Set the SELinux security context to be used by\n"
137 " processes in the container\n"
138 " -L --selinux-apifs-context=SECLABEL\n"
139 " Set the SELinux security context to be used by\n"
140 " API/tmpfs file systems in the container\n"
141 " --private-network Disable network in container\n"
142 " --share-system Share system namespaces with host\n"
143 " --read-only Mount the root directory read-only\n"
144 " --capability=CAP In addition to the default, retain specified\n"
146 " --drop-capability=CAP Drop the specified capability from the default set\n"
147 " --link-journal=MODE Link up guest journal, one of no, auto, guest, host\n"
148 " -j Equivalent to --link-journal=guest\n"
149 " --bind=PATH[:PATH] Bind mount a file or directory from the host into\n"
151 " --bind-ro=PATH[:PATH] Similar, but creates a read-only bind mount\n"
152 " --setenv=NAME=VALUE Pass an environment variable to PID 1\n"
153 " -q --quiet Do not show status information\n",
154 program_invocation_short_name);
/* Parse the command line into the file-scope arg_* variables.
 * Only part of the function is visible in this excerpt: most case labels,
 * the long-only option codes (ARG_*) and all return paths are elided. */
159 static int parse_argv(int argc, char *argv[]) {
175 static const struct option options[] = {
176 { "help", no_argument, NULL, 'h' },
177 { "version", no_argument, NULL, ARG_VERSION },
178 { "directory", required_argument, NULL, 'D' },
179 { "user", required_argument, NULL, 'u' },
180 { "private-network", no_argument, NULL, ARG_PRIVATE_NETWORK },
181 { "boot", no_argument, NULL, 'b' },
182 { "uuid", required_argument, NULL, ARG_UUID },
183 { "read-only", no_argument, NULL, ARG_READ_ONLY },
184 { "capability", required_argument, NULL, ARG_CAPABILITY },
185 { "drop-capability", required_argument, NULL, ARG_DROP_CAPABILITY },
186 { "link-journal", required_argument, NULL, ARG_LINK_JOURNAL },
187 { "bind", required_argument, NULL, ARG_BIND },
188 { "bind-ro", required_argument, NULL, ARG_BIND_RO },
189 { "machine", required_argument, NULL, 'M' },
190 { "slice", required_argument, NULL, 'S' },
191 { "setenv", required_argument, NULL, ARG_SETENV },
192 { "selinux-context", required_argument, NULL, 'Z' },
193 { "selinux-apifs-context", required_argument, NULL, 'L' },
194 { "quiet", no_argument, NULL, 'q' },
195 { "share-system", no_argument, NULL, ARG_SHARE_SYSTEM },
/* The leading '+' in the option string makes getopt_long() stop at the first
 * non-option argument, so the container command's own switches are left alone. */
204 while ((c = getopt_long(argc, argv, "+hD:u:bL:M:jS:Z:q", options, NULL)) >= 0) {
212 puts(PACKAGE_STRING);
213 puts(SYSTEMD_FEATURES);
/* The root directory is stored in canonicalized form. */
218 arg_directory = canonicalize_file_name(optarg);
219 if (!arg_directory) {
220 log_error("Invalid root directory: %m");
228 arg_user = strdup(optarg);
234 case ARG_PRIVATE_NETWORK:
235 arg_private_network = true;
243 r = sd_id128_from_string(optarg, &arg_uuid);
245 log_error("Invalid UUID: %s", optarg);
251 arg_slice = strdup(optarg);
/* The machine name must pass hostname_is_valid() before it is copied. */
258 if (!hostname_is_valid(optarg)) {
259 log_error("Invalid machine name: %s", optarg);
264 arg_machine = strdup(optarg);
/* The SELinux contexts point into argv (optarg is stored without copying). */
271 arg_selinux_context = optarg;
275 arg_selinux_apifs_context = optarg;
279 arg_read_only = true;
/* Comma-separated capability names: ARG_CAPABILITY sets the matching bit in
 * arg_retain, any other option code reaching this code clears it. */
283 case ARG_DROP_CAPABILITY: {
287 FOREACH_WORD_SEPARATOR(word, length, optarg, ",", state) {
291 t = strndup(word, length);
295 if (cap_from_name(t, &cap) < 0) {
296 log_error("Failed to parse capability %s.", t);
303 if (c == ARG_CAPABILITY)
304 arg_retain |= 1ULL << (uint64_t) cap;
306 arg_retain &= ~(1ULL << (uint64_t) cap);
/* NOTE(review): presumably the -j case; its label is not visible here. */
313 arg_link_journal = LINK_GUEST;
316 case ARG_LINK_JOURNAL:
317 if (streq(optarg, "auto"))
318 arg_link_journal = LINK_AUTO;
319 else if (streq(optarg, "no"))
320 arg_link_journal = LINK_NO;
321 else if (streq(optarg, "guest"))
322 arg_link_journal = LINK_GUEST;
323 else if (streq(optarg, "host"))
324 arg_link_journal = LINK_HOST;
326 log_error("Failed to parse link journal mode %s", optarg);
/* Bind mount specification: the argument is split at ':' into a and b, both
 * must be absolute, and they are appended as a pair to arg_bind or
 * arg_bind_ro. How b is filled in is not visible in this excerpt. */
334 _cleanup_free_ char *a = NULL, *b = NULL;
338 x = c == ARG_BIND ? &arg_bind : &arg_bind_ro;
340 e = strchr(optarg, ':');
342 a = strndup(optarg, e - optarg);
352 if (!path_is_absolute(a) || !path_is_absolute(b)) {
353 log_error("Invalid bind mount specification: %s", optarg);
357 r = strv_extend(x, a);
361 r = strv_extend(x, b);
/* Environment assignments are validated, then merged via strv_env_set(); the
 * previous arg_setenv vector is freed afterwards. */
371 if (!env_assignment_is_valid(optarg)) {
372 log_error("Environment variable assignment '%s' is not valid.", optarg);
376 n = strv_env_set(arg_setenv, optarg);
380 strv_free(arg_setenv);
389 case ARG_SHARE_SYSTEM:
390 arg_share_system = true;
397 assert_not_reached("Unhandled option");
/* Mount the file systems listed in mount_table below the container root dest.
 * Each row is: source, target, fs type, options, mount flags, fatal flag. A
 * failing mount() is only logged as an error here when the row's fatal flag
 * is set. Parts of the function (struct fields, loop tail, return) are elided. */
404 static int mount_all(const char *dest) {
406 typedef struct MountPoint {
415 static const MountPoint mount_table[] = {
416 { "proc", "/proc", "proc", NULL, MS_NOSUID|MS_NOEXEC|MS_NODEV, true },
417 { "/proc/sys", "/proc/sys", NULL, NULL, MS_BIND, true }, /* Bind mount first */
418 { NULL, "/proc/sys", NULL, NULL, MS_BIND|MS_RDONLY|MS_REMOUNT, true }, /* Then, make it r/o */
419 { "sysfs", "/sys", "sysfs", NULL, MS_RDONLY|MS_NOSUID|MS_NOEXEC|MS_NODEV, true },
420 { "tmpfs", "/dev", "tmpfs", "mode=755", MS_NOSUID|MS_STRICTATIME, true },
421 { "devpts", "/dev/pts", "devpts","newinstance,ptmxmode=0666,mode=620,gid=" STRINGIFY(TTY_GID), MS_NOSUID|MS_NOEXEC, true },
422 { "tmpfs", "/dev/shm", "tmpfs", "mode=1777", MS_NOSUID|MS_NODEV|MS_STRICTATIME, true },
423 { "tmpfs", "/run", "tmpfs", "mode=755", MS_NOSUID|MS_NODEV|MS_STRICTATIME, true },
425 { "/sys/fs/selinux", "/sys/fs/selinux", NULL, NULL, MS_BIND, false }, /* Bind mount first */
426 { NULL, "/sys/fs/selinux", NULL, NULL, MS_BIND|MS_RDONLY|MS_REMOUNT, false }, /* Then, make it r/o */
433 for (k = 0; k < ELEMENTSOF(mount_table); k++) {
434 _cleanup_free_ char *where = NULL;
436 _cleanup_free_ char *options = NULL;
/* Target path inside the container tree. */
441 where = strjoin(dest, "/", mount_table[k].where, NULL);
445 t = path_is_mount_point(where, true);
447 log_error("Failed to detect whether %s is a mount point: %s", where, strerror(-t));
455 /* Skip this entry if it is not a remount. */
456 if (mount_table[k].what && t > 0)
459 mkdir_p(where, 0755);
/* With an API-fs SELinux context configured, tmpfs and devpts mounts get a
 * context="..." option appended; otherwise the table's options are used. */
462 if (arg_selinux_apifs_context &&
463 (streq_ptr(mount_table[k].what, "tmpfs") || streq_ptr(mount_table[k].what, "devpts"))) {
464 options = strjoin(mount_table[k].options, ",context=\"", arg_selinux_apifs_context, "\"", NULL);
471 o = mount_table[k].options;
474 if (mount(mount_table[k].what,
477 mount_table[k].flags,
479 mount_table[k].fatal) {
481 log_error("mount(%s) failed: %m", where);
/* Bind mount every (source, destination) pair of l into the tree at dest.
 * When flags is non-zero, each mount is additionally remounted with
 * MS_REMOUNT|MS_BIND|flags (main() passes MS_RDONLY for --bind-ro=).
 * Branch conditions and return statements are partly elided in this excerpt. */
491 static int mount_binds(const char *dest, char **l, unsigned long flags) {
494 STRV_FOREACH_PAIR(x, y, l) {
496 struct stat source_st, dest_st;
499 if (stat(*x, &source_st) < 0) {
500 log_error("failed to stat %s: %m", *x);
/* Destination path inside the container tree. */
504 where = strappenda(dest, *y);
505 r = stat(where, &dest_st);
/* An existing destination must have the same file type as the source. */
507 if ((source_st.st_mode & S_IFMT) != (dest_st.st_mode & S_IFMT)) {
508 log_error("The file types of %s and %s do not match. Refusing bind mount",
/* A missing destination gets its parent directories created first. */
512 } else if (errno == ENOENT) {
513 r = mkdir_parents_label(where, 0755);
515 log_error("Failed to bind mount %s: %s", *x, strerror(-r));
519 log_error("Failed to bind mount %s: %s", *x, strerror(errno));
522 /* Create the mount point, but be conservative -- refuse to create block
523 * and char devices. */
524 if (S_ISDIR(source_st.st_mode))
525 mkdir_label(where, 0755);
526 else if (S_ISFIFO(source_st.st_mode))
528 else if (S_ISSOCK(source_st.st_mode))
529 mknod(where, 0644 | S_IFSOCK, 0);
530 else if (S_ISREG(source_st.st_mode))
533 log_error("Refusing to create mountpoint for file: %s", *x);
537 if (mount(*x, where, "bind", MS_BIND, NULL) < 0) {
538 log_error("mount(%s) failed: %m", where);
542 if (flags && mount(NULL, where, NULL, MS_REMOUNT|MS_BIND|flags, NULL) < 0) {
543 log_error("mount(%s) failed: %m", where);
/* Point dest/etc/localtime at the same zone as the host's /etc/localtime
 * symlink. The zone name is the part of the link target after
 * "usr/share/zoneinfo/"; nothing is changed when the container link already
 * names that zone. Guards and return statements are elided in this excerpt. */
551 static int setup_timezone(const char *dest) {
552 _cleanup_free_ char *where = NULL, *p = NULL, *q = NULL, *check = NULL, *what = NULL;
558 /* Fix the timezone, if possible */
559 r = readlink_malloc("/etc/localtime", &p);
561 log_warning("/etc/localtime is not a symlink, not updating container timezone.");
/* Accept both the relative and the absolute form of the link target. */
565 z = path_startswith(p, "../usr/share/zoneinfo/");
567 z = path_startswith(p, "/usr/share/zoneinfo/");
569 log_warning("/etc/localtime does not point into /usr/share/zoneinfo/, not updating container timezone.");
573 where = strappend(dest, "/etc/localtime");
/* Zone currently configured in the container, if its localtime is a symlink. */
577 r = readlink_malloc(where, &q);
579 y = path_startswith(q, "../usr/share/zoneinfo/");
581 y = path_startswith(q, "/usr/share/zoneinfo/");
584 /* Already pointing to the right place? Then do nothing .. */
585 if (y && streq(y, z))
/* The zone file has to exist inside the container tree. */
589 check = strjoin(dest, "/usr/share/zoneinfo/", z, NULL);
593 if (access(check, F_OK) < 0) {
594 log_warning("Timezone %s does not exist in container, not updating container timezone.", z);
/* The new link uses the relative form of the target. */
598 what = strappend("../usr/share/zoneinfo/", z);
603 if (symlink(what, where) < 0) {
604 log_error("Failed to correct timezone of container: %m");
/* Copy the host's /etc/resolv.conf to dest/etc/resolv.conf. The copy is best
 * effort: the result of copy_file() is deliberately ignored. The check on
 * arg_private_network precedes the copy; what it returns is elided here. */
611 static int setup_resolv_conf(const char *dest) {
612 _cleanup_free_ char *where = NULL;
616 if (arg_private_network)
619 /* Fix resolv.conf, if possible */
620 where = strappend(dest, "/etc/resolv.conf");
624 /* We don't really care for the results of this really. If it
625 * fails, it fails, but meh... */
626 copy_file("/etc/resolv.conf", where, O_TRUNC|O_NOFOLLOW);
/* Give the container its own boot ID: a random 128-bit ID is formatted as a
 * UUID string, written to dest/dev/proc-sys-kernel-random-boot-id, and that
 * file is bind mounted over dest/proc/sys/kernel/random/boot_id. Failing to
 * make the bind mount read-only is only a warning. */
631 static int setup_boot_id(const char *dest) {
632 _cleanup_free_ char *from = NULL, *to = NULL;
639 /* Generate a new randomized boot ID, so that each boot-up of
640 * the container gets a new one */
642 from = strappend(dest, "/dev/proc-sys-kernel-random-boot-id");
643 to = strappend(dest, "/proc/sys/kernel/random/boot_id");
647 r = sd_id128_randomize(&rnd);
649 log_error("Failed to generate random boot id: %s", strerror(-r));
653 snprintf(as_uuid, sizeof(as_uuid),
654 "%02x%02x%02x%02x-%02x%02x-%02x%02x-%02x%02x-%02x%02x%02x%02x%02x%02x",
655 SD_ID128_FORMAT_VAL(rnd));
656 char_array_0(as_uuid);
658 r = write_string_file(from, as_uuid);
660 log_error("Failed to write boot id: %s", strerror(-r));
664 if (mount(from, to, "bind", MS_BIND, NULL) < 0) {
665 log_error("Failed to bind mount boot id: %m");
/* mount() returns -1 on failure; test it the same way as every other call. */
667 } else if (mount(from, to, "bind", MS_BIND|MS_REMOUNT|MS_RDONLY, NULL) < 0)
668 log_warning("Failed to make boot id read-only: %m");
/* Recreate the host device nodes named in devnodes (list elided in this
 * excerpt) under dest/dev, using the host node's mode and device number.
 * A host node that does not exist (ENOENT) does not trigger the stat error
 * message; a node that is neither a char nor a block device is reported. */
674 static int copy_devnodes(const char *dest) {
676 static const char devnodes[] =
686 _cleanup_umask_ mode_t u;
692 NULSTR_FOREACH(d, devnodes) {
693 _cleanup_free_ char *from = NULL, *to = NULL;
696 from = strappend("/dev/", d);
697 to = strjoin(dest, "/dev/", d, NULL);
701 if (stat(from, &st) < 0) {
703 if (errno != ENOENT) {
704 log_error("Failed to stat %s: %m", from);
708 } else if (!S_ISCHR(st.st_mode) && !S_ISBLK(st.st_mode)) {
710 log_error("%s is not a char or block device, cannot copy", from);
713 } else if (mknod(to, st.st_mode, st.st_rdev) < 0) {
/* Report the node that could not be created, not the container root. */
715 log_error("mknod(%s) failed: %m", to);
/* Create dest/dev/ptmx as a symlink to "pts/ptmx" (relative to dest/dev). */
723 static int setup_ptmx(const char *dest) {
724 _cleanup_free_ char *p = NULL;
726 p = strappend(dest, "/dev/ptmx");
730 if (symlink("pts/ptmx", p) < 0) {
731 log_error("Failed to create /dev/ptmx symlink: %m");
/* Provide dest/dev/console: the tty named by console must be a character
 * device; its access mode/ownership are set to 0600 root:root, a device node
 * with the tty's device number is created at dest/dev/console, and the tty
 * is bind mounted over that node. Return statements are elided here. */
738 static int setup_dev_console(const char *dest, const char *console) {
740 _cleanup_free_ char *to = NULL;
742 _cleanup_umask_ mode_t u;
749 if (stat(console, &st) < 0) {
750 log_error("Failed to stat %s: %m", console);
753 } else if (!S_ISCHR(st.st_mode)) {
754 log_error("/dev/console is not a char device");
758 r = chmod_and_chown(console, 0600, 0, 0);
760 log_error("Failed to correct access mode for TTY: %s", strerror(-r));
764 if (asprintf(&to, "%s/dev/console", dest) < 0)
767 /* We need to bind mount the right tty to /dev/console since
768 * ptys can only exist on pts file systems. To have something
769 * to bind mount things on we create a device node first, that
770 * has the right major/minor (note that the major minor
771 * doesn't actually matter here, since we mount it over
774 if (mknod(to, (st.st_mode & ~07777) | 0600, st.st_rdev) < 0) {
775 log_error("mknod() for /dev/console failed: %m");
779 if (mount(console, to, "bind", MS_BIND, NULL) < 0) {
780 log_error("Bind mount for /dev/console failed: %m");
/* Set up /proc/kmsg for the container: a FIFO is created at dest/dev/kmsg,
 * bind mounted onto dest/proc/kmsg, opened, and the open descriptor is sent
 * over kmsg_socket as an SCM_RIGHTS control message. The local descriptor is
 * closed right after sending. The final removal of the FIFO path and the
 * return statements are elided in this excerpt. */
787 static int setup_kmsg(const char *dest, int kmsg_socket) {
788 _cleanup_free_ char *from = NULL, *to = NULL;
790 _cleanup_umask_ mode_t u;
792 struct cmsghdr cmsghdr;
793 uint8_t buf[CMSG_SPACE(sizeof(int))];
796 .msg_control = &control,
797 .msg_controllen = sizeof(control),
799 struct cmsghdr *cmsg;
802 assert(kmsg_socket >= 0);
806 /* We create the kmsg FIFO as /dev/kmsg, but immediately
807 * delete it after bind mounting it to /proc/kmsg. While FIFOs
808 * on the reading side behave very similar to /proc/kmsg,
809 * their writing side behaves differently from /dev/kmsg in
810 * that writing blocks when nothing is reading. In order to
811 * avoid any problems with containers deadlocking due to this
812 * we simply make /dev/kmsg unavailable to the container. */
813 if (asprintf(&from, "%s/dev/kmsg", dest) < 0 ||
814 asprintf(&to, "%s/proc/kmsg", dest) < 0)
817 if (mkfifo(from, 0600) < 0) {
818 log_error("mkfifo() for /dev/kmsg failed: %m");
822 r = chmod_and_chown(from, 0600, 0, 0);
824 log_error("Failed to correct access mode for /dev/kmsg: %s", strerror(-r));
828 if (mount(from, to, "bind", MS_BIND, NULL) < 0) {
829 log_error("Bind mount for /proc/kmsg failed: %m");
833 fd = open(from, O_RDWR|O_NDELAY|O_CLOEXEC);
835 log_error("Failed to open fifo: %m");
/* Build a single SCM_RIGHTS control message carrying fd. */
839 cmsg = CMSG_FIRSTHDR(&mh);
840 cmsg->cmsg_level = SOL_SOCKET;
841 cmsg->cmsg_type = SCM_RIGHTS;
842 cmsg->cmsg_len = CMSG_LEN(sizeof(int));
843 memcpy(CMSG_DATA(cmsg), &fd, sizeof(int));
845 mh.msg_controllen = cmsg->cmsg_len;
847 /* Store away the fd in the socket, so that it stays open as
848 * long as we run the child */
849 k = sendmsg(kmsg_socket, &mh, MSG_DONTWAIT|MSG_NOSIGNAL);
850 close_nointr_nofail(fd);
853 log_error("Failed to send FIFO fd: %m");
857 /* And now make the FIFO unavailable as /dev/kmsg... */
/* Set the hostname to arg_machine via sethostname(). The error and success
 * return statements are not visible in this excerpt. */
862 static int setup_hostname(void) {
864 if (sethostname(arg_machine, strlen(arg_machine)) < 0)
/* Link the container's journal directory with the host's according to
 * arg_link_journal. The machine ID is read from directory/etc/machine-id and
 * must differ from the host's. p is then /var/log/journal/<id> on the host
 * and q the same path inside the container tree. For LINK_GUEST the host
 * path is made a symlink to q; the function ends by bind mounting p onto q.
 * Helpers that report failure through their return value (mkdir_p(),
 * readlink_and_make_absolute()) are logged with strerror(-r) rather than
 * errno. Several guards and all return statements are elided in this excerpt. */
870 static int setup_journal(const char *directory) {
871 sd_id128_t machine_id, this_id;
872 _cleanup_free_ char *p = NULL, *b = NULL, *q = NULL, *d = NULL;
876 p = strappend(directory, "/etc/machine-id");
880 r = read_one_line_file(p, &b);
881 if (r == -ENOENT && arg_link_journal == LINK_AUTO)
884 log_error("Failed to read machine ID from %s: %s", p, strerror(-r));
889 if (isempty(id) && arg_link_journal == LINK_AUTO)
892 /* Verify validity */
893 r = sd_id128_from_string(id, &machine_id);
895 log_error("Failed to parse machine ID from %s: %s", p, strerror(-r));
899 r = sd_id128_get_machine(&this_id);
901 log_error("Failed to retrieve machine ID: %s", strerror(-r));
/* Identical host and container IDs: a warning in auto mode, an error otherwise. */
905 if (sd_id128_equal(machine_id, this_id)) {
906 log_full(arg_link_journal == LINK_AUTO ? LOG_WARNING : LOG_ERR,
907 "Host and machine ids are equal (%s): refusing to link journals", id);
908 if (arg_link_journal == LINK_AUTO)
914 if (arg_link_journal == LINK_NO)
918 p = strappend("/var/log/journal/", id);
919 q = strjoin(directory, "/var/log/journal/", id, NULL);
/* Neither side may already be a mount point; only logged outside auto mode. */
923 if (path_is_mount_point(p, false) > 0) {
924 if (arg_link_journal != LINK_AUTO) {
925 log_error("%s: already a mount point, refusing to use for journal", p);
932 if (path_is_mount_point(q, false) > 0) {
933 if (arg_link_journal != LINK_AUTO) {
934 log_error("%s: already a mount point, refusing to use for journal", q);
/* Inspect what currently exists at the host path. */
941 r = readlink_and_make_absolute(p, &d);
943 if ((arg_link_journal == LINK_GUEST ||
944 arg_link_journal == LINK_AUTO) &&
947 r = mkdir_p(q, 0755);
949 log_warning("failed to create directory %s: %s", q, strerror(-r));
954 log_error("Failed to remove symlink %s: %m", p);
957 } else if (r == -EINVAL) {
959 if (arg_link_journal == LINK_GUEST &&
962 if (errno == ENOTDIR) {
963 log_error("%s already exists and is neither a symlink nor a directory", p);
966 log_error("Failed to remove %s: %m", p);
970 } else if (r != -ENOENT) {
971 log_error("readlink(%s) failed: %s", p, strerror(-r));
975 if (arg_link_journal == LINK_GUEST) {
977 if (symlink(q, p) < 0) {
978 log_error("Failed to symlink %s to %s: %m", q, p);
982 r = mkdir_p(q, 0755);
984 log_warning("failed to create directory %s: %s", q, strerror(-r));
988 if (arg_link_journal == LINK_HOST) {
989 r = mkdir_p(p, 0755);
991 log_error("Failed to create %s: %s", p, strerror(-r));
995 } else if (access(p, F_OK) < 0)
998 if (dir_is_empty(q) == 0) {
999 log_error("%s not empty.", q);
1003 r = mkdir_p(q, 0755);
1005 log_error("Failed to create %s: %s", q, strerror(-r));
1009 if (mount(p, q, "bind", MS_BIND, NULL) < 0) {
1010 log_error("Failed to bind mount journal from host into guest: %m");
/* Create the directory dest/dev/kdbus and bind mount path onto it.
 * NOTE(review): main() passes kdbus_domain, which may still be NULL; any
 * guard for that case is not visible in this excerpt. */
1017 static int setup_kdbus(const char *dest, const char *path) {
1023 p = strappenda(dest, "/dev/kdbus");
1024 if (mkdir(p, 0755) < 0) {
1025 log_error("Failed to create kdbus path: %m");
1029 if (mount(path, p, "bind", MS_BIND, NULL) < 0) {
1030 log_error("Failed to mount kdbus domain path: %m");
/* Hand the complement of arg_retain to capability_bounding_set_drop() and
 * return its result. The meaning of the second argument (false) is defined
 * by that helper, which is not part of this file. */
1037 static int drop_capabilities(void) {
1038 return capability_bounding_set_drop(~arg_retain, false);
/* Announce the container to org.freedesktop.machine1.Manager on the system
 * bus. The visible arguments include arg_uuid, arg_directory and, when
 * arg_slice is non-empty, a "Slice" string property. The method name and the
 * remaining arguments are elided in this excerpt. */
1041 static int register_machine(pid_t pid) {
1042 _cleanup_bus_error_free_ sd_bus_error error = SD_BUS_ERROR_NULL;
1043 _cleanup_bus_unref_ sd_bus *bus = NULL;
1046 r = sd_bus_default_system(&bus);
1048 log_error("Failed to open system bus: %s", strerror(-r));
1052 r = sd_bus_call_method(
1054 "org.freedesktop.machine1",
1055 "/org/freedesktop/machine1",
1056 "org.freedesktop.machine1.Manager",
1062 SD_BUS_MESSAGE_APPEND_ID128(arg_uuid),
1066 strempty(arg_directory),
1067 !isempty(arg_slice), "Slice", "s", arg_slice);
1069 log_error("Failed to register machine: %s", bus_error_message(&error, r));
/* Ask machined to terminate the machine: first a call on the Manager
 * interface yields the machine's object path, then a call on the
 * org.freedesktop.machine1.Machine interface is issued. Failures of either
 * call are logged at debug level only. Method names, arguments and return
 * statements are elided in this excerpt. */
1076 static int terminate_machine(pid_t pid) {
1077 _cleanup_bus_error_free_ sd_bus_error error = SD_BUS_ERROR_NULL;
1078 _cleanup_bus_message_unref_ sd_bus_message *reply = NULL;
1079 _cleanup_bus_unref_ sd_bus *bus = NULL;
1083 r = sd_bus_default_system(&bus);
1085 log_error("Failed to open system bus: %s", strerror(-r));
1089 r = sd_bus_call_method(
1091 "org.freedesktop.machine1",
1092 "/org/freedesktop/machine1",
1093 "org.freedesktop.machine1.Manager",
1100 /* Note that the machine might already have been
1101 * cleaned up automatically, hence don't consider it a
1102 * failure if we cannot get the machine object. */
1103 log_debug("Failed to get machine: %s", bus_error_message(&error, r));
/* The reply carries the machine's object path ("o"). */
1107 r = sd_bus_message_read(reply, "o", &path);
1109 return bus_log_parse_error(r);
1111 r = sd_bus_call_method(
1113 "org.freedesktop.machine1",
1115 "org.freedesktop.machine1.Machine",
1121 log_debug("Failed to terminate machine: %s", bus_error_message(&error, r));
/* Probe for the kernel audit subsystem by opening a NETLINK_AUDIT socket and
 * closing it again. NOTE(review): presumably returns true when the socket
 * could be created; the return statements are not visible in this excerpt. */
1128 static bool audit_enabled(void) {
1131 fd = socket(AF_NETLINK, SOCK_RAW, NETLINK_AUDIT);
1133 close_nointr_nofail(fd);
/* Entry point. Visible outline: parse options, determine directory and
 * machine name, run sanity checks (root, systemd host, not "/"), allocate a
 * pty, create the kdbus domain and the kmsg socket pair, then clone() a child
 * in new namespaces. The child sets up the container tree (mounts, device
 * nodes, console, kmsg, boot id, timezone, resolv.conf, journal, bind mounts,
 * kdbus), moves into it, adjusts credentials and environment, waits on
 * sync_fd and executes the payload. The parent registers the machine,
 * releases the child through sync_fd, forwards the pty and evaluates the
 * child's exit status. Many guards, labels and jumps are elided here. */
1139 int main(int argc, char *argv[]) {
1141 int r = EXIT_FAILURE, k;
1142 _cleanup_close_ int master = -1, kdbus_fd = -1, sync_fd = -1;
1144 const char *console = NULL;
1146 _cleanup_close_pipe_ int kmsg_socket_pair[2] = { -1, -1 };
1147 _cleanup_fdset_free_ FDSet *fds = NULL;
1148 _cleanup_free_ char *kdbus_domain = NULL;
1151 log_parse_environment();
1154 k = parse_argv(argc, argv);
1162 if (arg_directory) {
1165 p = path_make_absolute_cwd(arg_directory);
1166 free(arg_directory);
1169 arg_directory = get_current_dir_name();
1171 if (!arg_directory) {
1172 log_error("Failed to determine path, please use -D.");
1176 path_kill_slashes(arg_directory);
1179 arg_machine = strdup(basename(arg_directory));
1185 hostname_cleanup(arg_machine, false);
1186 if (isempty(arg_machine)) {
1187 log_error("Failed to determine machine name automatically, please use -M.");
1192 if (geteuid() != 0) {
1193 log_error("Need to be root.");
1197 if (sd_booted() <= 0) {
1198 log_error("Not running on a systemd system.");
1202 if (arg_boot && audit_enabled()) {
1203 log_warning("The kernel auditing subsystem is known to be incompatible with containers.\n"
1204 "Please make sure to turn off auditing with 'audit=0' on the kernel command\n"
1205 "line before using systemd-nspawn. Sleeping for 5s...\n");
1209 if (path_equal(arg_directory, "/")) {
1210 log_error("Spawning container on root directory not supported.");
1214 if (arg_boot && path_is_os_tree(arg_directory) <= 0) {
1215 log_error("Directory %s doesn't look like an OS root directory (/etc/os-release is missing). Refusing.", arg_directory);
/* Collect socket-activation descriptors so they can be passed on. */
1220 n_fd_passed = sd_listen_fds(false);
1221 if (n_fd_passed > 0) {
1222 k = fdset_new_listen_fds(&fds, false);
1224 log_error("Failed to collect file descriptors: %s", strerror(-k));
1228 fdset_close_others(fds);
1231 master = posix_openpt(O_RDWR|O_NOCTTY|O_CLOEXEC|O_NDELAY);
1233 log_error("Failed to acquire pseudo tty: %m");
1237 console = ptsname(master);
1239 log_error("Failed to determine tty name: %m");
1244 log_info("Spawning container %s on %s. Press ^] three times within 1s to abort execution.", arg_machine, arg_directory);
1246 if (unlockpt(master) < 0) {
1247 log_error("Failed to unlock tty: %m");
1251 ns = strappenda("machine-", arg_machine);
1252 kdbus_fd = bus_kernel_create_domain(ns, &kdbus_domain);
/* The call's result is in kdbus_fd, so that is the error code to report. */
1254 log_debug("Failed to create kdbus domain: %s", strerror(-kdbus_fd));
1256 log_debug("Successfully created kdbus domain as %s", kdbus_domain);
1258 if (socketpair(AF_UNIX, SOCK_DGRAM|SOCK_NONBLOCK|SOCK_CLOEXEC, 0, kmsg_socket_pair) < 0) {
1259 log_error("Failed to create kmsg socket pair: %m");
1263 sd_notify(0, "READY=1");
1265 assert_se(sigemptyset(&mask) == 0);
1266 sigset_add_many(&mask, SIGCHLD, SIGWINCH, SIGTERM, SIGINT, -1);
1267 assert_se(sigprocmask(SIG_BLOCK, &mask, NULL) == 0);
/* eventfd used by the parent to release the child (eventfd_write/eventfd_read below). */
1272 sync_fd = eventfd(0, EFD_CLOEXEC);
1274 log_error("Failed to create event fd: %m");
1278 pid = syscall(__NR_clone,
1279 SIGCHLD|CLONE_NEWNS|
1280 (arg_share_system ? 0 : CLONE_NEWIPC|CLONE_NEWPID|CLONE_NEWUTS)|
1281 (arg_private_network ? CLONE_NEWNET : 0), NULL);
1283 if (errno == EINVAL)
1284 log_error("clone() failed, do you have namespace support enabled in your kernel? (You need UTS, IPC, PID and NET namespacing built in): %m");
1286 log_error("clone() failed: %m");
1293 const char *home = NULL;
1294 uid_t uid = (uid_t) -1;
1295 gid_t gid = (gid_t) -1;
1297 const char *envp[] = {
1298 "PATH=" DEFAULT_PATH_SPLIT_USR,
1299 "container=systemd-nspawn", /* LXC sets container=lxc, so follow the scheme here */
1304 NULL, /* container_uuid */
1305 NULL, /* LISTEN_FDS */
1306 NULL, /* LISTEN_PID */
1312 envp[n_env] = strv_find_prefix(environ, "TERM=");
1316 close_nointr_nofail(master);
1319 close_nointr(STDIN_FILENO);
1320 close_nointr(STDOUT_FILENO);
1321 close_nointr(STDERR_FILENO);
1323 close_nointr_nofail(kmsg_socket_pair[0]);
1324 kmsg_socket_pair[0] = -1;
1326 reset_all_signal_handlers();
1328 assert_se(sigemptyset(&mask) == 0);
1329 assert_se(sigprocmask(SIG_SETMASK, &mask, NULL) == 0);
1331 k = open_terminal(console, O_RDWR);
1332 if (k != STDIN_FILENO) {
1334 close_nointr_nofail(k);
1338 log_error("Failed to open console: %s", strerror(-k));
1342 if (dup2(STDIN_FILENO, STDOUT_FILENO) != STDOUT_FILENO ||
1343 dup2(STDIN_FILENO, STDERR_FILENO) != STDERR_FILENO) {
1344 log_error("Failed to duplicate console: %m");
1349 log_error("setsid() failed: %m");
1353 if (prctl(PR_SET_PDEATHSIG, SIGKILL) < 0) {
1354 log_error("PR_SET_PDEATHSIG failed: %m");
1358 /* Mark everything as slave, so that we still
1359 * receive mounts from the real root, but don't
1360 * propagate mounts to the real root. */
1361 if (mount(NULL, "/", NULL, MS_SLAVE|MS_REC, NULL) < 0) {
1362 log_error("MS_SLAVE|MS_REC failed: %m");
1366 /* Turn directory into bind mount */
1367 if (mount(arg_directory, arg_directory, "bind", MS_BIND|MS_REC, NULL) < 0) {
1368 log_error("Failed to make bind mount.");
1373 if (mount(arg_directory, arg_directory, "bind", MS_BIND|MS_REMOUNT|MS_RDONLY|MS_REC, NULL) < 0) {
1374 log_error("Failed to make read-only.");
1378 if (mount_all(arg_directory) < 0)
1381 if (copy_devnodes(arg_directory) < 0)
1384 if (setup_ptmx(arg_directory) < 0)
1387 dev_setup(arg_directory);
1389 if (setup_dev_console(arg_directory, console) < 0)
1392 if (setup_kmsg(arg_directory, kmsg_socket_pair[1]) < 0)
1395 close_nointr_nofail(kmsg_socket_pair[1]);
1396 kmsg_socket_pair[1] = -1;
1398 if (setup_boot_id(arg_directory) < 0)
1401 if (setup_timezone(arg_directory) < 0)
1404 if (setup_resolv_conf(arg_directory) < 0)
1407 if (setup_journal(arg_directory) < 0)
1410 if (mount_binds(arg_directory, arg_bind, 0) < 0)
1413 if (mount_binds(arg_directory, arg_bind_ro, MS_RDONLY) < 0)
1416 if (setup_kdbus(arg_directory, kdbus_domain) < 0)
1419 if (chdir(arg_directory) < 0) {
1420 log_error("chdir(%s) failed: %m", arg_directory);
1424 if (mount(arg_directory, "/", NULL, MS_MOVE, NULL) < 0) {
1425 log_error("mount(MS_MOVE) failed: %m");
1429 if (chroot(".") < 0) {
1430 log_error("chroot() failed: %m");
1434 if (chdir("/") < 0) {
1435 log_error("chdir() failed: %m");
1443 if (drop_capabilities() < 0) {
1444 log_error("drop_capabilities() failed: %m");
1450 /* Note that this resolves user names
1451 * inside the container, and hence
1452 * accesses the NSS modules from the
1453 * container and not the host. This is
1456 if (get_user_creds((const char**)&arg_user, &uid, &gid, &home, NULL) < 0) {
1457 log_error("get_user_creds() failed: %m");
1461 if (mkdir_parents_label(home, 0775) < 0) {
1462 log_error("mkdir_parents_label() failed: %m");
1466 if (mkdir_safe_label(home, 0775, uid, gid) < 0) {
1467 log_error("mkdir_safe_label() failed: %m");
1471 if (initgroups((const char*)arg_user, gid) < 0) {
1472 log_error("initgroups() failed: %m");
1476 if (setresgid(gid, gid, gid) < 0) {
1477 log_error("setresgid() failed: %m");
1481 if (setresuid(uid, uid, uid) < 0) {
1482 log_error("setresuid() failed: %m");
1486 /* Reset everything fully to 0, just in case */
1488 if (setgroups(0, NULL) < 0) {
1489 log_error("setgroups() failed: %m");
1493 if (setresgid(0, 0, 0) < 0) {
1494 log_error("setresgid() failed: %m");
1498 if (setresuid(0, 0, 0) < 0) {
1499 log_error("setresuid() failed: %m");
/* Fill the free envp slots; n_env indexes the next unused one. */
1504 if ((asprintf((char**)(envp + n_env++), "HOME=%s", home ? home: "/root") < 0) ||
1505 (asprintf((char**)(envp + n_env++), "USER=%s", arg_user ? arg_user : "root") < 0) ||
1506 (asprintf((char**)(envp + n_env++), "LOGNAME=%s", arg_user ? arg_user : "root") < 0)) {
1511 if (!sd_id128_equal(arg_uuid, SD_ID128_NULL)) {
1512 if (asprintf((char**)(envp + n_env++), "container_uuid=" SD_ID128_FORMAT_STR, SD_ID128_FORMAT_VAL(arg_uuid)) < 0) {
1518 if (fdset_size(fds) > 0) {
1519 k = fdset_cloexec(fds, false);
1521 log_error("Failed to unset O_CLOEXEC for file descriptors.");
1525 if ((asprintf((char **)(envp + n_env++), "LISTEN_FDS=%u", n_fd_passed) < 0) ||
1526 (asprintf((char **)(envp + n_env++), "LISTEN_PID=1") < 0)) {
/* Block until the parent signals through sync_fd. */
1534 eventfd_read(sync_fd, &x);
1535 close_nointr_nofail(sync_fd);
1538 if (!strv_isempty(arg_setenv)) {
1541 n = strv_env_merge(2, envp, arg_setenv);
1549 env_use = (char**) envp;
1552 if (arg_selinux_context)
1553 if (setexeccon(arg_selinux_context) < 0)
1554 log_error("setexeccon(\"%s\") failed: %m", arg_selinux_context);
1560 /* Automatically search for the init system */
1562 l = 1 + argc - optind;
1563 a = newa(char*, l + 1);
1564 memcpy(a + 1, argv + optind, l * sizeof(char*));
/* Each execve() only returns on failure, so the candidates are tried in order. */
1566 a[0] = (char*) "/usr/lib/systemd/systemd";
1567 execve(a[0], a, env_use);
1569 a[0] = (char*) "/lib/systemd/systemd";
1570 execve(a[0], a, env_use);
1572 a[0] = (char*) "/sbin/init";
1573 execve(a[0], a, env_use);
1574 } else if (argc > optind)
1575 execvpe(argv[optind], argv + optind, env_use);
1577 chdir(home ? home : "/root");
1578 execle("/bin/bash", "-bash", NULL, env_use);
1581 log_error("execv() failed: %m");
1584 _exit(EXIT_FAILURE);
1590 r = register_machine(pid);
/* Release the child waiting in eventfd_read(). */
1594 eventfd_write(sync_fd, 1);
1595 close_nointr_nofail(sync_fd);
1598 k = process_pty(master, &mask, arg_boot ? pid : 0, SIGRTMIN+3);
1607 /* Kill if it is not dead yet anyway */
1608 terminate_machine(pid);
1610 /* Redundant, but better safe than sorry */
1613 k = wait_for_terminate(pid, &status);
1621 if (status.si_code == CLD_EXITED) {
1622 r = status.si_status;
1623 if (status.si_status != 0) {
1624 log_error("Container %s failed with error code %i.", arg_machine, status.si_status);
1629 log_debug("Container %s exited successfully.", arg_machine);
1631 } else if (status.si_code == CLD_KILLED &&
1632 status.si_status == SIGINT) {
1635 log_info("Container %s has been shut down.", arg_machine);
1638 } else if (status.si_code == CLD_KILLED &&
1639 status.si_status == SIGHUP) {
1642 log_info("Container %s is being rebooted.", arg_machine);
1644 } else if (status.si_code == CLD_KILLED ||
1645 status.si_code == CLD_DUMPED) {
1647 log_error("Container %s terminated by signal %s.", arg_machine, signal_to_string(status.si_status));
1651 log_error("Container %s failed due to unknown reason.", arg_machine);
1661 free(arg_directory);