1 /*-*- Mode: C; c-basic-offset: 8; indent-tabs-mode: nil -*-*/
4 This file is part of systemd.
6 Copyright 2010 Lennart Poettering
8 systemd is free software; you can redistribute it and/or modify it
9 under the terms of the GNU Lesser General Public License as published by
10 the Free Software Foundation; either version 2.1 of the License, or
11 (at your option) any later version.
13 systemd is distributed in the hope that it will be useful, but
14 WITHOUT ANY WARRANTY; without even the implied warranty of
15 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
16 Lesser General Public License for more details.
18 You should have received a copy of the GNU Lesser General Public License
19 along with systemd; If not, see <http://www.gnu.org/licenses/>.
25 #include <sys/types.h>
26 #include <sys/syscall.h>
27 #include <sys/mount.h>
33 #include <sys/prctl.h>
34 #include <sys/capability.h>
37 #include <sys/signalfd.h>
41 #include <sys/socket.h>
42 #include <linux/netlink.h>
43 #include <sys/eventfd.h>
45 #include <selinux/selinux.h>
48 #include "sd-daemon.h"
57 #include "cgroup-util.h"
59 #include "path-util.h"
60 #include "loopback-setup.h"
61 #include "dev-setup.h"
66 #include "bus-error.h"
68 #include "bus-kernel.h"
72 typedef enum LinkJournal {
/* Settings describing the container to spawn. parse_argv() assigns most of
 * them from the command line; main() and the setup helpers read them. */
79 static char *arg_directory = NULL;
80 static char *arg_user = NULL;
81 static sd_id128_t arg_uuid = {};
82 static char *arg_machine = NULL;
83 static char *arg_selinux_context = NULL;
84 static char *arg_selinux_apifs_context = NULL;
85 static const char *arg_slice = NULL;
86 static bool arg_private_network = false;
87 static bool arg_read_only = false;
88 static bool arg_boot = false;
89 static LinkJournal arg_link_journal = LINK_AUTO;
/* Capability bit mask, one bit per CAP_* number. --capability= sets bits,
 * --drop-capability= clears them, and drop_capabilities() hands the
 * complement of the mask to capability_bounding_set_drop(). */
90 static uint64_t arg_retain =
92 (1ULL << CAP_DAC_OVERRIDE) |
93 (1ULL << CAP_DAC_READ_SEARCH) |
94 (1ULL << CAP_FOWNER) |
95 (1ULL << CAP_FSETID) |
96 (1ULL << CAP_IPC_OWNER) |
99 (1ULL << CAP_LINUX_IMMUTABLE) |
100 (1ULL << CAP_NET_BIND_SERVICE) |
101 (1ULL << CAP_NET_BROADCAST) |
102 (1ULL << CAP_NET_RAW) |
103 (1ULL << CAP_SETGID) |
104 (1ULL << CAP_SETFCAP) |
105 (1ULL << CAP_SETPCAP) |
106 (1ULL << CAP_SETUID) |
107 (1ULL << CAP_SYS_ADMIN) |
108 (1ULL << CAP_SYS_CHROOT) |
109 (1ULL << CAP_SYS_NICE) |
110 (1ULL << CAP_SYS_PTRACE) |
111 (1ULL << CAP_SYS_TTY_CONFIG) |
112 (1ULL << CAP_SYS_RESOURCE) |
113 (1ULL << CAP_SYS_BOOT) |
114 (1ULL << CAP_AUDIT_WRITE) |
115 (1ULL << CAP_AUDIT_CONTROL) |
/* Flat lists of path pairs collected from --bind= and --bind-ro=. */
117 static char **arg_bind = NULL;
118 static char **arg_bind_ro = NULL;
119 static char **arg_setenv = NULL;
120 static bool arg_quiet = false;
121 static bool arg_share_system = false;
122 static bool arg_register = true;
123 static bool arg_keep_unit = false;
/* Print the usage summary for the program to stdout, using
 * program_invocation_short_name as the command name.
 *
 * The -j line must describe what parse_argv() actually does for -j: the
 * visible option handling selects LINK_GUEST, so -j is documented as
 * --link-journal=guest. */
125 static int help(void) {
127 printf("%s [OPTIONS...] [PATH] [ARGUMENTS...]\n\n"
128 "Spawn a minimal namespace container for debugging, testing and building.\n\n"
129 " -h --help Show this help\n"
130 " --version Print version string\n"
131 " -D --directory=NAME Root directory for the container\n"
132 " -b --boot Boot up full system (i.e. invoke init)\n"
133 " -u --user=USER Run the command under specified user or uid\n"
134 " --uuid=UUID Set a specific machine UUID for the container\n"
135 " -M --machine=NAME Set the machine name for the container\n"
136 " -S --slice=SLICE Place the container in the specified slice\n"
137 " -Z --selinux-context=SECLABEL\n"
138 " Set the SELinux security context to be used by\n"
139 " processes in the container\n"
140 " -L --selinux-apifs-context=SECLABEL\n"
141 " Set the SELinux security context to be used by\n"
142 " API/tmpfs file systems in the container\n"
143 " --private-network Disable network in container\n"
144 " --share-system Share system namespaces with host\n"
145 " --read-only Mount the root directory read-only\n"
146 " --capability=CAP In addition to the default, retain specified\n"
148 " --drop-capability=CAP Drop the specified capability from the default set\n"
149 " --link-journal=MODE Link up guest journal, one of no, auto, guest, host\n"
150 " -j Equivalent to --link-journal=guest\n"
151 " --bind=PATH[:PATH] Bind mount a file or directory from the host into\n"
153 " --bind-ro=PATH[:PATH] Similar, but creates a read-only bind mount\n"
154 " --setenv=NAME=VALUE Pass an environment variable to PID 1\n"
155 " --register=BOOLEAN Register container as machine\n"
156 " --keep-unit Do not register a scope for the machine, reuse\n"
157 " the service unit nspawn is running in\n"
158 " -q --quiet Do not show status information\n",
159 program_invocation_short_name);
/* Parse the command line with getopt_long() and store the results in the
 * arg_* globals. Invalid values are reported through log_error(). The case
 * labels and return statements are largely missing from this listing, so the
 * exact return values cannot be stated here. */
164 static int parse_argv(int argc, char *argv[]) {
182 static const struct option options[] = {
183 { "help", no_argument, NULL, 'h' },
184 { "version", no_argument, NULL, ARG_VERSION },
185 { "directory", required_argument, NULL, 'D' },
186 { "user", required_argument, NULL, 'u' },
187 { "private-network", no_argument, NULL, ARG_PRIVATE_NETWORK },
188 { "boot", no_argument, NULL, 'b' },
189 { "uuid", required_argument, NULL, ARG_UUID },
190 { "read-only", no_argument, NULL, ARG_READ_ONLY },
191 { "capability", required_argument, NULL, ARG_CAPABILITY },
192 { "drop-capability", required_argument, NULL, ARG_DROP_CAPABILITY },
193 { "link-journal", required_argument, NULL, ARG_LINK_JOURNAL },
194 { "bind", required_argument, NULL, ARG_BIND },
195 { "bind-ro", required_argument, NULL, ARG_BIND_RO },
196 { "machine", required_argument, NULL, 'M' },
197 { "slice", required_argument, NULL, 'S' },
198 { "setenv", required_argument, NULL, ARG_SETENV },
199 { "selinux-context", required_argument, NULL, 'Z' },
200 { "selinux-apifs-context", required_argument, NULL, 'L' },
201 { "quiet", no_argument, NULL, 'q' },
202 { "share-system", no_argument, NULL, ARG_SHARE_SYSTEM },
203 { "register", required_argument, NULL, ARG_REGISTER },
204 { "keep-unit", no_argument, NULL, ARG_KEEP_UNIT },
213 while ((c = getopt_long(argc, argv, "+hD:u:bL:M:jS:Z:q", options, NULL)) >= 0) {
221 puts(PACKAGE_STRING);
222 puts(SYSTEMD_FEATURES);
/* The root directory is stored in canonicalized form. */
227 arg_directory = canonicalize_file_name(optarg);
228 if (!arg_directory) {
229 log_error("Invalid root directory: %m");
237 arg_user = strdup(optarg);
243 case ARG_PRIVATE_NETWORK:
244 arg_private_network = true;
252 r = sd_id128_from_string(optarg, &arg_uuid);
254 log_error("Invalid UUID: %s", optarg);
260 arg_slice = strdup(optarg);
267 if (isempty(optarg)) {
272 if (!hostname_is_valid(optarg)) {
273 log_error("Invalid machine name: %s", optarg);
278 arg_machine = strdup(optarg);
/* The SELinux contexts point into argv (optarg); they are not copied. */
286 arg_selinux_context = optarg;
290 arg_selinux_apifs_context = optarg;
294 arg_read_only = true;
/* The capability list is comma separated. The comparisons of c against
 * ARG_CAPABILITY below decide whether a bit is added to or removed from
 * arg_retain. */
298 case ARG_DROP_CAPABILITY: {
302 FOREACH_WORD_SEPARATOR(word, length, optarg, ",", state) {
303 _cleanup_free_ char *t;
306 t = strndup(word, length);
310 if (streq(t, "all")) {
311 if (c == ARG_CAPABILITY)
312 arg_retain = (uint64_t) -1;
316 if (cap_from_name(t, &cap) < 0) {
317 log_error("Failed to parse capability %s.", t);
321 if (c == ARG_CAPABILITY)
322 arg_retain |= 1ULL << (uint64_t) cap;
324 arg_retain &= ~(1ULL << (uint64_t) cap);
332 arg_link_journal = LINK_GUEST;
335 case ARG_LINK_JOURNAL:
336 if (streq(optarg, "auto"))
337 arg_link_journal = LINK_AUTO;
338 else if (streq(optarg, "no"))
339 arg_link_journal = LINK_NO;
340 else if (streq(optarg, "guest"))
341 arg_link_journal = LINK_GUEST;
342 else if (streq(optarg, "host"))
343 arg_link_journal = LINK_HOST;
345 log_error("Failed to parse link journal mode %s", optarg);
/* Bind specifications are stored as two consecutive entries (a, then b)
 * in arg_bind or arg_bind_ro; both paths must be absolute. */
353 _cleanup_free_ char *a = NULL, *b = NULL;
357 x = c == ARG_BIND ? &arg_bind : &arg_bind_ro;
359 e = strchr(optarg, ':');
361 a = strndup(optarg, e - optarg);
371 if (!path_is_absolute(a) || !path_is_absolute(b)) {
372 log_error("Invalid bind mount specification: %s", optarg);
376 r = strv_extend(x, a);
380 r = strv_extend(x, b);
390 if (!env_assignment_is_valid(optarg)) {
391 log_error("Environment variable assignment '%s' is not valid.", optarg);
395 n = strv_env_set(arg_setenv, optarg);
399 strv_free(arg_setenv);
408 case ARG_SHARE_SYSTEM:
409 arg_share_system = true;
413 r = parse_boolean(optarg);
415 log_error("Failed to parse --register= argument: %s", optarg);
423 arg_keep_unit = true;
430 assert_not_reached("Unhandled option");
/* Sharing the host's system namespaces turns machine registration off. */
434 if (arg_share_system)
435 arg_register = false;
437 if (arg_boot && arg_share_system) {
438 log_error("--boot and --share-system may not be combined.");
442 if (arg_keep_unit && cg_pid_get_owner_uid(0, NULL) >= 0) {
443 log_error("--keep-unit may not be used when invoked from a user session.");
/* Mount the file systems listed in mount_table below the container root
 * dest. Each entry's last field is read as .fatal, and the mount() failure
 * condition includes it; how non-fatal failures are treated is not visible
 * in this listing. */
450 static int mount_all(const char *dest) {
452 typedef struct MountPoint {
461 static const MountPoint mount_table[] = {
462 { "proc", "/proc", "proc", NULL, MS_NOSUID|MS_NOEXEC|MS_NODEV, true },
463 { "/proc/sys", "/proc/sys", NULL, NULL, MS_BIND, true }, /* Bind mount first */
464 { NULL, "/proc/sys", NULL, NULL, MS_BIND|MS_RDONLY|MS_REMOUNT, true }, /* Then, make it r/o */
465 { "sysfs", "/sys", "sysfs", NULL, MS_RDONLY|MS_NOSUID|MS_NOEXEC|MS_NODEV, true },
466 { "tmpfs", "/dev", "tmpfs", "mode=755", MS_NOSUID|MS_STRICTATIME, true },
467 { "devpts", "/dev/pts", "devpts","newinstance,ptmxmode=0666,mode=620,gid=" STRINGIFY(TTY_GID), MS_NOSUID|MS_NOEXEC, true },
468 { "tmpfs", "/dev/shm", "tmpfs", "mode=1777", MS_NOSUID|MS_NODEV|MS_STRICTATIME, true },
469 { "tmpfs", "/run", "tmpfs", "mode=755", MS_NOSUID|MS_NODEV|MS_STRICTATIME, true },
471 { "/sys/fs/selinux", "/sys/fs/selinux", NULL, NULL, MS_BIND, false }, /* Bind mount first */
472 { NULL, "/sys/fs/selinux", NULL, NULL, MS_BIND|MS_RDONLY|MS_REMOUNT, false }, /* Then, make it r/o */
479 for (k = 0; k < ELEMENTSOF(mount_table); k++) {
480 _cleanup_free_ char *where = NULL;
482 _cleanup_free_ char *options = NULL;
/* Target path is the table's mount point relative to the container root. */
487 where = strjoin(dest, "/", mount_table[k].where, NULL);
491 t = path_is_mount_point(where, true);
493 log_error("Failed to detect whether %s is a mount point: %s", where, strerror(-t));
501 /* Skip this entry if it is not a remount. */
502 if (mount_table[k].what && t > 0)
505 mkdir_p(where, 0755);
/* When an API file system SELinux context was requested, tmpfs and devpts
 * mounts get an extra context="..." option appended. */
508 if (arg_selinux_apifs_context &&
509 (streq_ptr(mount_table[k].what, "tmpfs") || streq_ptr(mount_table[k].what, "devpts"))) {
510 options = strjoin(mount_table[k].options, ",context=\"", arg_selinux_apifs_context, "\"", NULL);
517 o = mount_table[k].options;
520 if (mount(mount_table[k].what,
523 mount_table[k].flags,
525 mount_table[k].fatal) {
527 log_error("mount(%s) failed: %m", where);
/* Bind mount host paths into the container.
 *
 * l is walked in pairs (x, y): *x is the host path, and dest followed by *y
 * is the mount target. When flags is non-zero the new bind mount is
 * remounted with MS_REMOUNT|MS_BIND|flags (main() passes MS_RDONLY for the
 * --bind-ro list). */
537 static int mount_binds(const char *dest, char **l, unsigned long flags) {
540 STRV_FOREACH_PAIR(x, y, l) {
542 struct stat source_st, dest_st;
545 if (stat(*x, &source_st) < 0) {
546 log_error("failed to stat %s: %m", *x);
550 where = strappenda(dest, *y);
551 r = stat(where, &dest_st);
/* An existing target must have the same file type as the source. */
553 if ((source_st.st_mode & S_IFMT) != (dest_st.st_mode & S_IFMT)) {
554 log_error("The file types of %s and %s do not match. Refusing bind mount",
558 } else if (errno == ENOENT) {
559 r = mkdir_parents_label(where, 0755);
561 log_error("Failed to bind mount %s: %s", *x, strerror(-r));
565 log_error("Failed to bind mount %s: %s", *x, strerror(errno));
568 /* Create the mount point, but be conservative -- refuse to create block
569 * and char devices. */
570 if (S_ISDIR(source_st.st_mode))
571 mkdir_label(where, 0755);
572 else if (S_ISFIFO(source_st.st_mode))
574 else if (S_ISSOCK(source_st.st_mode))
575 mknod(where, 0644 | S_IFSOCK, 0);
576 else if (S_ISREG(source_st.st_mode))
579 log_error("Refusing to create mountpoint for file: %s", *x);
583 if (mount(*x, where, "bind", MS_BIND, NULL) < 0) {
584 log_error("mount(%s) failed: %m", where);
588 if (flags && mount(NULL, where, NULL, MS_REMOUNT|MS_BIND|flags, NULL) < 0) {
589 log_error("mount(%s) failed: %m", where);
/* Make the container's /etc/localtime name the same zone as the host's.
 *
 * The host's /etc/localtime symlink must point below /usr/share/zoneinfo/
 * (relative or absolute form). The zone name z is then looked up inside the
 * container, and dest/etc/localtime is created as a relative symlink to it.
 * Most failure paths only warn; their return values are not visible in this
 * listing. */
597 static int setup_timezone(const char *dest) {
598 _cleanup_free_ char *where = NULL, *p = NULL, *q = NULL, *check = NULL, *what = NULL;
604 /* Fix the timezone, if possible */
605 r = readlink_malloc("/etc/localtime", &p);
607 log_warning("/etc/localtime is not a symlink, not updating container timezone.");
611 z = path_startswith(p, "../usr/share/zoneinfo/");
613 z = path_startswith(p, "/usr/share/zoneinfo/");
615 log_warning("/etc/localtime does not point into /usr/share/zoneinfo/, not updating container timezone.");
619 where = strappend(dest, "/etc/localtime");
/* q/y describe the zone the container currently points to, if any. */
623 r = readlink_malloc(where, &q);
625 y = path_startswith(q, "../usr/share/zoneinfo/");
627 y = path_startswith(q, "/usr/share/zoneinfo/");
630 /* Already pointing to the right place? Then do nothing .. */
631 if (y && streq(y, z))
635 check = strjoin(dest, "/usr/share/zoneinfo/", z, NULL);
639 if (access(check, F_OK) < 0) {
640 log_warning("Timezone %s does not exist in container, not updating container timezone.", z);
644 what = strappend("../usr/share/zoneinfo/", z);
649 if (symlink(what, where) < 0) {
650 log_error("Failed to correct timezone of container: %m");
/* Copy the host's /etc/resolv.conf to dest/etc/resolv.conf on a best-effort
 * basis; the result of copy_file() is not checked. The branch taken when
 * arg_private_network is set is not part of this listing. */
657 static int setup_resolv_conf(const char *dest) {
658 char _cleanup_free_ *where = NULL;
662 if (arg_private_network)
665 /* Fix resolv.conf, if possible */
666 where = strappend(dest, "/etc/resolv.conf");
670 /* We don't really care for the results of this really. If it
671 * fails, it fails, but meh... */
672 copy_file("/etc/resolv.conf", where, O_TRUNC|O_NOFOLLOW);
/* Give the container a boot ID of its own.
 *
 * A random 128-bit ID is formatted as a UUID string, written to
 * dest/dev/proc-sys-kernel-random-boot-id, and that file is bind mounted over
 * dest/proc/sys/kernel/random/boot_id. Failing to make the mount read-only
 * afterwards only produces a warning. The branch taken when arg_share_system
 * is set is not part of this listing. */
677 static int setup_boot_id(const char *dest) {
678 _cleanup_free_ char *from = NULL, *to = NULL;
685 if (arg_share_system)
688 /* Generate a new randomized boot ID, so that each boot-up of
689 * the container gets a new one */
691 from = strappend(dest, "/dev/proc-sys-kernel-random-boot-id");
692 to = strappend(dest, "/proc/sys/kernel/random/boot_id");
696 r = sd_id128_randomize(&rnd);
698 log_error("Failed to generate random boot id: %s", strerror(-r));
702 snprintf(as_uuid, sizeof(as_uuid),
703 "%02x%02x%02x%02x-%02x%02x-%02x%02x-%02x%02x-%02x%02x%02x%02x%02x%02x",
704 SD_ID128_FORMAT_VAL(rnd));
705 char_array_0(as_uuid);
707 r = write_string_file(from, as_uuid);
709 log_error("Failed to write boot id: %s", strerror(-r));
713 if (mount(from, to, "bind", MS_BIND, NULL) < 0) {
714 log_error("Failed to bind mount boot id: %m");
716 } else if (mount(from, to, "bind", MS_BIND|MS_REMOUNT|MS_RDONLY, NULL))
717 log_warning("Failed to make boot id read-only: %m");
/* Recreate a fixed set of host device nodes inside the container.
 *
 * For every name in the devnodes list, /dev/<name> is stat()ed on the host
 * and a node with the same mode and device number is created at
 * dest/dev/<name>. A host node that does not exist (ENOENT) does not trigger
 * the "Failed to stat" error; anything that is neither a character nor a
 * block device is rejected. */
723 static int copy_devnodes(const char *dest) {
725 static const char devnodes[] =
735 _cleanup_umask_ mode_t u;
741 NULSTR_FOREACH(d, devnodes) {
742 _cleanup_free_ char *from = NULL, *to = NULL;
745 from = strappend("/dev/", d);
746 to = strjoin(dest, "/dev/", d, NULL);
750 if (stat(from, &st) < 0) {
752 if (errno != ENOENT) {
753 log_error("Failed to stat %s: %m", from);
757 } else if (!S_ISCHR(st.st_mode) && !S_ISBLK(st.st_mode)) {
759 log_error("%s is not a char or block device, cannot copy", from);
762 } else if (mknod(to, st.st_mode, st.st_rdev) < 0) {
/* Report the node path that mknod() was asked to create, not the
 * container root. */
764 log_error("mknod(%s) failed: %m", to);
/* Create dest/dev/ptmx as a relative symlink to pts/ptmx, i.e. to the ptmx
 * node of the devpts instance that mount_all() mounts at /dev/pts. */
772 static int setup_ptmx(const char *dest) {
773 _cleanup_free_ char *p = NULL;
775 p = strappend(dest, "/dev/ptmx");
779 if (symlink("pts/ptmx", p) < 0) {
780 log_error("Failed to create /dev/ptmx symlink: %m");
/* Expose the tty named by console as dest/dev/console.
 *
 * console must be a character device. Its mode and ownership are set to
 * 0600 and 0:0, a device node is created at dest/dev/console with the same
 * device number, and console is then bind mounted over that node. */
787 static int setup_dev_console(const char *dest, const char *console) {
789 _cleanup_free_ char *to = NULL;
791 _cleanup_umask_ mode_t u;
798 if (stat(console, &st) < 0) {
799 log_error("Failed to stat %s: %m", console);
802 } else if (!S_ISCHR(st.st_mode)) {
803 log_error("/dev/console is not a char device");
807 r = chmod_and_chown(console, 0600, 0, 0);
809 log_error("Failed to correct access mode for TTY: %s", strerror(-r));
813 if (asprintf(&to, "%s/dev/console", dest) < 0)
816 /* We need to bind mount the right tty to /dev/console since
817 * ptys can only exist on pts file systems. To have something
818 * to bind mount things on we create a device node first, that
819 * has the right major/minor (note that the major minor
820 * doesn't actually matter here, since we mount it over
823 if (mknod(to, (st.st_mode & ~07777) | 0600, st.st_rdev) < 0) {
824 log_error("mknod() for /dev/console failed: %m");
828 if (mount(console, to, "bind", MS_BIND, NULL) < 0) {
829 log_error("Bind mount for /dev/console failed: %m");
/* Provide /proc/kmsg inside the container through a FIFO.
 *
 * A FIFO is created at dest/dev/kmsg and bind mounted onto dest/proc/kmsg.
 * It is then opened and the descriptor is sent over kmsg_socket as an
 * SCM_RIGHTS control message, so that it stays open on the receiving side.
 * kmsg_socket must be a valid descriptor. */
836 static int setup_kmsg(const char *dest, int kmsg_socket) {
837 _cleanup_free_ char *from = NULL, *to = NULL;
839 _cleanup_umask_ mode_t u;
841 struct cmsghdr cmsghdr;
842 uint8_t buf[CMSG_SPACE(sizeof(int))];
845 .msg_control = &control,
846 .msg_controllen = sizeof(control),
848 struct cmsghdr *cmsg;
851 assert(kmsg_socket >= 0);
855 /* We create the kmsg FIFO as /dev/kmsg, but immediately
856 * delete it after bind mounting it to /proc/kmsg. While FIFOs
857 * on the reading side behave very similar to /proc/kmsg,
858 * their writing side behaves differently from /dev/kmsg in
859 * that writing blocks when nothing is reading. In order to
860 * avoid any problems with containers deadlocking due to this
861 * we simply make /dev/kmsg unavailable to the container. */
862 if (asprintf(&from, "%s/dev/kmsg", dest) < 0 ||
863 asprintf(&to, "%s/proc/kmsg", dest) < 0)
866 if (mkfifo(from, 0600) < 0) {
867 log_error("mkfifo() for /dev/kmsg failed: %m");
871 r = chmod_and_chown(from, 0600, 0, 0);
873 log_error("Failed to correct access mode for /dev/kmsg: %s", strerror(-r));
877 if (mount(from, to, "bind", MS_BIND, NULL) < 0) {
878 log_error("Bind mount for /proc/kmsg failed: %m");
882 fd = open(from, O_RDWR|O_NDELAY|O_CLOEXEC);
884 log_error("Failed to open fifo: %m");
/* Build a single control message carrying fd. */
888 cmsg = CMSG_FIRSTHDR(&mh);
889 cmsg->cmsg_level = SOL_SOCKET;
890 cmsg->cmsg_type = SCM_RIGHTS;
891 cmsg->cmsg_len = CMSG_LEN(sizeof(int));
892 memcpy(CMSG_DATA(cmsg), &fd, sizeof(int));
894 mh.msg_controllen = cmsg->cmsg_len;
896 /* Store away the fd in the socket, so that it stays open as
897 * long as we run the child */
898 k = sendmsg(kmsg_socket, &mh, MSG_DONTWAIT|MSG_NOSIGNAL);
899 close_nointr_nofail(fd);
902 log_error("Failed to send FIFO fd: %m");
906 /* And now make the FIFO unavailable as /dev/kmsg... */
/* Set the host name to arg_machine with sethostname(). The branch taken
 * when arg_share_system is set is not part of this listing. */
911 static int setup_hostname(void) {
913 if (arg_share_system)
916 if (sethostname(arg_machine, strlen(arg_machine)) < 0)
/* Connect the container's journal directory with the host's, as selected by
 * arg_link_journal.
 *
 * The machine ID is read from directory/etc/machine-id and must differ from
 * the host's own machine ID. After the ID checks, p is
 * /var/log/journal/<id> on the host and q is the same path below directory.
 * In the visible code LINK_GUEST creates a symlink at p pointing to q, and a
 * bind mount of p onto q follows the LINK_HOST handling. Several branches
 * and all return statements are missing from this listing. */
922 static int setup_journal(const char *directory) {
923 sd_id128_t machine_id, this_id;
924 _cleanup_free_ char *p = NULL, *b = NULL, *q = NULL, *d = NULL;
928 p = strappend(directory, "/etc/machine-id");
932 r = read_one_line_file(p, &b);
933 if (r == -ENOENT && arg_link_journal == LINK_AUTO)
936 log_error("Failed to read machine ID from %s: %s", p, strerror(-r));
941 if (isempty(id) && arg_link_journal == LINK_AUTO)
944 /* Verify validity */
945 r = sd_id128_from_string(id, &machine_id);
947 log_error("Failed to parse machine ID from %s: %s", p, strerror(-r));
951 r = sd_id128_get_machine(&this_id);
953 log_error("Failed to retrieve machine ID: %s", strerror(-r));
/* Equal IDs are a warning in auto mode and an error otherwise. */
957 if (sd_id128_equal(machine_id, this_id)) {
958 log_full(arg_link_journal == LINK_AUTO ? LOG_WARNING : LOG_ERR,
959 "Host and machine ids are equal (%s): refusing to link journals", id);
960 if (arg_link_journal == LINK_AUTO)
966 if (arg_link_journal == LINK_NO)
970 p = strappend("/var/log/journal/", id);
971 q = strjoin(directory, "/var/log/journal/", id, NULL);
/* Neither side may already be a mount point; this is only logged as an
 * error when a mode other than auto was requested. */
975 if (path_is_mount_point(p, false) > 0) {
976 if (arg_link_journal != LINK_AUTO) {
977 log_error("%s: already a mount point, refusing to use for journal", p);
984 if (path_is_mount_point(q, false) > 0) {
985 if (arg_link_journal != LINK_AUTO) {
986 log_error("%s: already a mount point, refusing to use for journal", q);
/* Inspect what currently exists at the host path p. */
993 r = readlink_and_make_absolute(p, &d);
995 if ((arg_link_journal == LINK_GUEST ||
996 arg_link_journal == LINK_AUTO) &&
999 r = mkdir_p(q, 0755);
1001 log_warning("failed to create directory %s: %m", q);
1005 if (unlink(p) < 0) {
1006 log_error("Failed to remove symlink %s: %m", p);
1009 } else if (r == -EINVAL) {
1011 if (arg_link_journal == LINK_GUEST &&
1014 if (errno == ENOTDIR) {
1015 log_error("%s already exists and is neither a symlink nor a directory", p);
1018 log_error("Failed to remove %s: %m", p);
1022 } else if (r != -ENOENT) {
1023 log_error("readlink(%s) failed: %m", p);
1027 if (arg_link_journal == LINK_GUEST) {
1029 if (symlink(q, p) < 0) {
1030 log_error("Failed to symlink %s to %s: %m", q, p);
1034 r = mkdir_p(q, 0755);
1036 log_warning("failed to create directory %s: %m", q);
1040 if (arg_link_journal == LINK_HOST) {
1041 r = mkdir_p(p, 0755);
1043 log_error("Failed to create %s: %m", p);
1047 } else if (access(p, F_OK) < 0)
1050 if (dir_is_empty(q) == 0) {
1051 log_error("%s not empty.", q);
1055 r = mkdir_p(q, 0755);
1057 log_error("Failed to create %s: %m", q);
1061 if (mount(p, q, "bind", MS_BIND, NULL) < 0) {
1062 log_error("Failed to bind mount journal from host into guest: %m");
/* Create the directory dest/dev/kdbus and bind mount path (the kdbus domain
 * directory chosen in main()) onto it. */
1069 static int setup_kdbus(const char *dest, const char *path) {
1075 p = strappenda(dest, "/dev/kdbus");
1076 if (mkdir(p, 0755) < 0) {
1077 log_error("Failed to create kdbus path: %m");
1081 if (mount(path, p, "bind", MS_BIND, NULL) < 0) {
1082 log_error("Failed to mount kdbus domain path: %m");
/* Pass the complement of arg_retain to capability_bounding_set_drop() and
 * return its result; presumably this removes every capability that is not
 * to be retained from the bounding set (helper semantics not shown here). */
1089 static int drop_capabilities(void) {
1090 return capability_bounding_set_drop(~arg_retain, false);
/* Register the container with org.freedesktop.machine1.Manager over the
 * system bus. Two different calls exist, selected by arg_keep_unit; only the
 * second one passes a "Slice" property when arg_slice is non-empty. The
 * method names and most call arguments (including how pid is used) are
 * missing from this listing. */
1093 static int register_machine(pid_t pid) {
1094 _cleanup_bus_error_free_ sd_bus_error error = SD_BUS_ERROR_NULL;
1095 _cleanup_bus_unref_ sd_bus *bus = NULL;
1101 r = sd_bus_default_system(&bus);
1103 log_error("Failed to open system bus: %s", strerror(-r));
1107 if (arg_keep_unit) {
1108 r = sd_bus_call_method(
1110 "org.freedesktop.machine1",
1111 "/org/freedesktop/machine1",
1112 "org.freedesktop.machine1.Manager",
1118 SD_BUS_MESSAGE_APPEND_ID128(arg_uuid),
1122 strempty(arg_directory));
1124 r = sd_bus_call_method(
1126 "org.freedesktop.machine1",
1127 "/org/freedesktop/machine1",
1128 "org.freedesktop.machine1.Manager",
1134 SD_BUS_MESSAGE_APPEND_ID128(arg_uuid),
1138 strempty(arg_directory),
1139 !isempty(arg_slice), "Slice", "s", arg_slice);
1143 log_error("Failed to register machine: %s", bus_error_message(&error, r));
/* Ask machine1 to terminate the machine, best effort.
 *
 * A first call to org.freedesktop.machine1.Manager yields an object path,
 * which is then used for a call on the org.freedesktop.machine1.Machine
 * interface. Failures of either call are logged at debug level only. The
 * method names and arguments are missing from this listing. */
1150 static int terminate_machine(pid_t pid) {
1151 _cleanup_bus_error_free_ sd_bus_error error = SD_BUS_ERROR_NULL;
1152 _cleanup_bus_message_unref_ sd_bus_message *reply = NULL;
1153 _cleanup_bus_unref_ sd_bus *bus = NULL;
1160 r = sd_bus_default_system(&bus);
1162 log_error("Failed to open system bus: %s", strerror(-r));
1166 r = sd_bus_call_method(
1168 "org.freedesktop.machine1",
1169 "/org/freedesktop/machine1",
1170 "org.freedesktop.machine1.Manager",
1177 /* Note that the machine might already have been
1178 * cleaned up automatically, hence don't consider it a
1179 * failure if we cannot get the machine object. */
1180 log_debug("Failed to get machine: %s", bus_error_message(&error, r));
1184 r = sd_bus_message_read(reply, "o", &path);
1186 return bus_log_parse_error(r);
1188 r = sd_bus_call_method(
1190 "org.freedesktop.machine1",
1192 "org.freedesktop.machine1.Machine",
1198 log_debug("Failed to terminate machine: %s", bus_error_message(&error, r));
/* Reset the audit login UID of the calling process.
 *
 * /proc/self/loginuid is read; unless it already holds 4294967295, that
 * value is written to it. A failed write is reported with a long explanatory
 * message. The branch taken when arg_share_system is set and the return
 * statements are not part of this listing. */
1205 static int reset_audit_loginuid(void) {
1206 _cleanup_free_ char *p = NULL;
1209 if (arg_share_system)
1212 r = read_one_line_file("/proc/self/loginuid", &p);
1216 log_error("Failed to read /proc/self/loginuid: %s", strerror(-r));
1220 /* Already reset? */
1221 if (streq(p, "4294967295"))
1224 r = write_string_file("/proc/self/loginuid", "4294967295");
1226 log_error("Failed to reset audit login UID. This probably means that your kernel is too\n"
1227 "old and you have audit enabled. Note that the auditing subsystem is known to\n"
1228 "be incompatible with containers on old kernels. Please make sure to upgrade\n"
1229 "your kernel or to off auditing with 'audit=0' on the kernel command line before\n"
1230 "using systemd-nspawn. Sleeping for 5s... (%s)\n", strerror(-r));
/* Program entry point.
 *
 * Parses the command line, validates the environment (root, systemd host,
 * usable container directory), allocates a pseudo tty and clone()s a child
 * into new namespaces. The child builds the container file system view,
 * drops privileges as requested and exec()s the payload. The parent
 * registers the machine, relays the pty and evaluates the child's exit
 * status. Many lines, including the labels, return statements and closing
 * braces, are missing from this listing. */
1238 int main(int argc, char *argv[]) {
1240 int r = EXIT_FAILURE, k;
1241 _cleanup_close_ int master = -1, kdbus_fd = -1, sync_fd = -1;
1243 const char *console = NULL;
1245 _cleanup_close_pipe_ int kmsg_socket_pair[2] = { -1, -1 };
1246 _cleanup_fdset_free_ FDSet *fds = NULL;
1247 _cleanup_free_ char *kdbus_domain = NULL;
1249 log_parse_environment();
1252 k = parse_argv(argc, argv);
/* Normalize the container directory; without -D the current directory is
 * used. */
1260 if (arg_directory) {
1263 p = path_make_absolute_cwd(arg_directory);
1264 free(arg_directory);
1267 arg_directory = get_current_dir_name();
1269 if (!arg_directory) {
1270 log_error("Failed to determine path, please use -D.");
1274 path_kill_slashes(arg_directory);
/* Derive a machine name from the directory's base name. */
1277 arg_machine = strdup(basename(arg_directory));
1283 hostname_cleanup(arg_machine, false);
1284 if (isempty(arg_machine)) {
1285 log_error("Failed to determine machine name automatically, please use -M.");
1290 if (geteuid() != 0) {
1291 log_error("Need to be root.");
1295 if (sd_booted() <= 0) {
1296 log_error("Not running on a systemd system.");
1300 if (path_equal(arg_directory, "/")) {
1301 log_error("Spawning container on root directory not supported.");
1305 if (arg_boot && path_is_os_tree(arg_directory) <= 0) {
1306 log_error("Directory %s doesn't look like an OS root directory (/etc/os-release is missing). Refusing.", arg_directory);
/* Collect socket-activation descriptors so they can be handed on to the
 * container payload. */
1311 n_fd_passed = sd_listen_fds(false);
1312 if (n_fd_passed > 0) {
1313 k = fdset_new_listen_fds(&fds, false);
1315 log_error("Failed to collect file descriptors: %s", strerror(-k));
1319 fdset_close_others(fds);
1322 master = posix_openpt(O_RDWR|O_NOCTTY|O_CLOEXEC|O_NDELAY);
1324 log_error("Failed to acquire pseudo tty: %m");
1328 console = ptsname(master);
1330 log_error("Failed to determine tty name: %m");
1335 log_info("Spawning container %s on %s. Press ^] three times within 1s to abort execution.", arg_machine, arg_directory);
1337 if (unlockpt(master) < 0) {
1338 log_error("Failed to unlock tty: %m");
1343 if (access("/dev/kdbus/control", F_OK) >= 0) {
1345 if (arg_share_system) {
1346 kdbus_domain = strdup("/dev/kdbus");
1347 if (!kdbus_domain) {
1354 ns = strappenda("machine-", arg_machine);
1355 kdbus_fd = bus_kernel_create_domain(ns, &kdbus_domain);
/* The failure code of the domain creation is in kdbus_fd; r still holds its
 * initial EXIT_FAILURE value at this point. */
1357 log_debug("Failed to create kdbus domain: %s", strerror(-kdbus_fd));
1359 log_debug("Successfully created kdbus domain as %s", kdbus_domain);
1363 if (socketpair(AF_UNIX, SOCK_DGRAM|SOCK_NONBLOCK|SOCK_CLOEXEC, 0, kmsg_socket_pair) < 0) {
1364 log_error("Failed to create kmsg socket pair: %m");
1368 sd_notify(0, "READY=1");
1370 assert_se(sigemptyset(&mask) == 0);
1371 sigset_add_many(&mask, SIGCHLD, SIGWINCH, SIGTERM, SIGINT, -1);
1372 assert_se(sigprocmask(SIG_BLOCK, &mask, NULL) == 0);
/* The eventfd lets the parent release the child once registration is done:
 * the child reads from it, the parent writes to it. */
1377 sync_fd = eventfd(0, EFD_CLOEXEC);
1379 log_error("Failed to create event fd: %m");
1383 pid = syscall(__NR_clone,
1384 SIGCHLD|CLONE_NEWNS|
1385 (arg_share_system ? 0 : CLONE_NEWIPC|CLONE_NEWPID|CLONE_NEWUTS)|
1386 (arg_private_network ? CLONE_NEWNET : 0), NULL);
1388 if (errno == EINVAL)
1389 log_error("clone() failed, do you have namespace support enabled in your kernel? (You need UTS, IPC, PID and NET namespacing built in): %m");
1391 log_error("clone() failed: %m");
1398 const char *home = NULL;
1399 uid_t uid = (uid_t) -1;
1400 gid_t gid = (gid_t) -1;
1402 const char *envp[] = {
1403 "PATH=" DEFAULT_PATH_SPLIT_USR,
1404 "container=systemd-nspawn", /* LXC sets container=lxc, so follow the scheme here */
1409 NULL, /* container_uuid */
1410 NULL, /* LISTEN_FDS */
1411 NULL, /* LISTEN_PID */
1417 envp[n_env] = strv_find_prefix(environ, "TERM=");
1421 close_nointr_nofail(master);
1424 close_nointr(STDIN_FILENO);
1425 close_nointr(STDOUT_FILENO);
1426 close_nointr(STDERR_FILENO);
1428 close_nointr_nofail(kmsg_socket_pair[0]);
1429 kmsg_socket_pair[0] = -1;
1431 reset_all_signal_handlers();
1433 assert_se(sigemptyset(&mask) == 0);
1434 assert_se(sigprocmask(SIG_SETMASK, &mask, NULL) == 0);
1436 k = open_terminal(console, O_RDWR);
1437 if (k != STDIN_FILENO) {
1439 close_nointr_nofail(k);
1443 log_error("Failed to open console: %s", strerror(-k));
1447 if (dup2(STDIN_FILENO, STDOUT_FILENO) != STDOUT_FILENO ||
1448 dup2(STDIN_FILENO, STDERR_FILENO) != STDERR_FILENO) {
1449 log_error("Failed to duplicate console: %m");
1454 log_error("setsid() failed: %m");
1458 if (reset_audit_loginuid() < 0)
1461 if (prctl(PR_SET_PDEATHSIG, SIGKILL) < 0) {
1462 log_error("PR_SET_PDEATHSIG failed: %m");
1466 /* Mark everything as slave, so that we still
1467 * receive mounts from the real root, but don't
1468 * propagate mounts to the real root. */
1469 if (mount(NULL, "/", NULL, MS_SLAVE|MS_REC, NULL) < 0) {
1470 log_error("MS_SLAVE|MS_REC failed: %m");
1474 /* Turn directory into bind mount */
1475 if (mount(arg_directory, arg_directory, "bind", MS_BIND|MS_REC, NULL) < 0) {
1476 log_error("Failed to make bind mount.");
1481 if (mount(arg_directory, arg_directory, "bind", MS_BIND|MS_REMOUNT|MS_RDONLY|MS_REC, NULL) < 0) {
1482 log_error("Failed to make read-only.");
1486 if (mount_all(arg_directory) < 0)
1489 if (copy_devnodes(arg_directory) < 0)
1492 if (setup_ptmx(arg_directory) < 0)
1495 dev_setup(arg_directory);
1497 if (setup_dev_console(arg_directory, console) < 0)
1500 if (setup_kmsg(arg_directory, kmsg_socket_pair[1]) < 0)
1503 close_nointr_nofail(kmsg_socket_pair[1]);
1504 kmsg_socket_pair[1] = -1;
1506 if (setup_boot_id(arg_directory) < 0)
1509 if (setup_timezone(arg_directory) < 0)
1512 if (setup_resolv_conf(arg_directory) < 0)
1515 if (setup_journal(arg_directory) < 0)
1518 if (mount_binds(arg_directory, arg_bind, 0) < 0)
1521 if (mount_binds(arg_directory, arg_bind_ro, MS_RDONLY) < 0)
1524 if (setup_kdbus(arg_directory, kdbus_domain) < 0)
1527 if (chdir(arg_directory) < 0) {
1528 log_error("chdir(%s) failed: %m", arg_directory);
1532 if (mount(arg_directory, "/", NULL, MS_MOVE, NULL) < 0) {
1533 log_error("mount(MS_MOVE) failed: %m");
1537 if (chroot(".") < 0) {
1538 log_error("chroot() failed: %m");
1542 if (chdir("/") < 0) {
1543 log_error("chdir() failed: %m");
1549 if (arg_private_network)
1552 if (drop_capabilities() < 0) {
1553 log_error("drop_capabilities() failed: %m");
1559 /* Note that this resolves user names
1560 * inside the container, and hence
1561 * accesses the NSS modules from the
1562 * container and not the host. This is
1565 if (get_user_creds((const char**)&arg_user, &uid, &gid, &home, NULL) < 0) {
1566 log_error("get_user_creds() failed: %m");
1570 if (mkdir_parents_label(home, 0775) < 0) {
1571 log_error("mkdir_parents_label() failed: %m");
1575 if (mkdir_safe_label(home, 0775, uid, gid) < 0) {
1576 log_error("mkdir_safe_label() failed: %m");
1580 if (initgroups((const char*)arg_user, gid) < 0) {
1581 log_error("initgroups() failed: %m");
1585 if (setresgid(gid, gid, gid) < 0) {
1586 log_error("setresgid() failed: %m");
1590 if (setresuid(uid, uid, uid) < 0) {
1591 log_error("setresuid() failed: %m");
1595 /* Reset everything fully to 0, just in case */
1597 if (setgroups(0, NULL) < 0) {
1598 log_error("setgroups() failed: %m");
1602 if (setresgid(0, 0, 0) < 0) {
1603 log_error("setresgid() failed: %m");
1607 if (setresuid(0, 0, 0) < 0) {
1608 log_error("setresuid() failed: %m");
1613 if ((asprintf((char**)(envp + n_env++), "HOME=%s", home ? home: "/root") < 0) ||
1614 (asprintf((char**)(envp + n_env++), "USER=%s", arg_user ? arg_user : "root") < 0) ||
1615 (asprintf((char**)(envp + n_env++), "LOGNAME=%s", arg_user ? arg_user : "root") < 0)) {
1620 if (!sd_id128_equal(arg_uuid, SD_ID128_NULL)) {
1621 if (asprintf((char**)(envp + n_env++), "container_uuid=" SD_ID128_FORMAT_STR, SD_ID128_FORMAT_VAL(arg_uuid)) < 0) {
1627 if (fdset_size(fds) > 0) {
1628 k = fdset_cloexec(fds, false);
1630 log_error("Failed to unset O_CLOEXEC for file descriptors.");
1634 if ((asprintf((char **)(envp + n_env++), "LISTEN_FDS=%u", n_fd_passed) < 0) ||
1635 (asprintf((char **)(envp + n_env++), "LISTEN_PID=1") < 0)) {
1643 eventfd_read(sync_fd, &x);
1644 close_nointr_nofail(sync_fd);
1647 if (!strv_isempty(arg_setenv)) {
1650 n = strv_env_merge(2, envp, arg_setenv);
1658 env_use = (char**) envp;
1661 if (arg_selinux_context)
1662 if (setexeccon(arg_selinux_context) < 0)
1663 log_error("setexeccon(\"%s\") failed: %m", arg_selinux_context);
1669 /* Automatically search for the init system */
1671 l = 1 + argc - optind;
1672 a = newa(char*, l + 1);
1673 memcpy(a + 1, argv + optind, l * sizeof(char*));
1675 a[0] = (char*) "/usr/lib/systemd/systemd";
1676 execve(a[0], a, env_use);
1678 a[0] = (char*) "/lib/systemd/systemd";
1679 execve(a[0], a, env_use);
1681 a[0] = (char*) "/sbin/init";
1682 execve(a[0], a, env_use);
1683 } else if (argc > optind)
1684 execvpe(argv[optind], argv + optind, env_use);
1686 chdir(home ? home : "/root");
1687 execle("/bin/bash", "-bash", NULL, env_use);
1690 log_error("execv() failed: %m");
1693 _exit(EXIT_FAILURE);
1699 r = register_machine(pid);
1703 eventfd_write(sync_fd, 1);
1704 close_nointr_nofail(sync_fd);
1707 k = process_pty(master, &mask, arg_boot ? pid : 0, SIGRTMIN+3);
1716 /* Kill if it is not dead yet anyway */
1717 terminate_machine(pid);
1719 /* Redundant, but better safe than sorry */
1722 k = wait_for_terminate(pid, &status);
1730 if (status.si_code == CLD_EXITED) {
1731 r = status.si_status;
1732 if (status.si_status != 0) {
1733 log_error("Container %s failed with error code %i.", arg_machine, status.si_status);
1738 log_debug("Container %s exited successfully.", arg_machine);
1740 } else if (status.si_code == CLD_KILLED &&
1741 status.si_status == SIGINT) {
1744 log_info("Container %s has been shut down.", arg_machine);
1747 } else if (status.si_code == CLD_KILLED &&
1748 status.si_status == SIGHUP) {
1751 log_info("Container %s is being rebooted.", arg_machine);
1753 } else if (status.si_code == CLD_KILLED ||
1754 status.si_code == CLD_DUMPED) {
1756 log_error("Container %s terminated by signal %s.", arg_machine, signal_to_string(status.si_status));
1760 log_error("Container %s failed due to unknown reason.", arg_machine);
1770 free(arg_directory);