1 /*-*- Mode: C; c-basic-offset: 8; indent-tabs-mode: nil -*-*/
4 This file is part of systemd.
6 Copyright 2010 Lennart Poettering
8 systemd is free software; you can redistribute it and/or modify it
9 under the terms of the GNU Lesser General Public License as published by
10 the Free Software Foundation; either version 2.1 of the License, or
11 (at your option) any later version.
13 systemd is distributed in the hope that it will be useful, but
14 WITHOUT ANY WARRANTY; without even the implied warranty of
15 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
16 Lesser General Public License for more details.
18 You should have received a copy of the GNU Lesser General Public License
19 along with systemd; If not, see <http://www.gnu.org/licenses/>.
25 #include <sys/types.h>
26 #include <sys/syscall.h>
27 #include <sys/mount.h>
33 #include <sys/prctl.h>
34 #include <sys/capability.h>
37 #include <sys/signalfd.h>
41 #include <sys/socket.h>
42 #include <linux/netlink.h>
43 #include <linux/rtnetlink.h>
44 #include <sys/eventfd.h>
48 #include <selinux/selinux.h>
51 #include "sd-daemon.h"
61 #include "cgroup-util.h"
63 #include "path-util.h"
64 #include "loopback-setup.h"
65 #include "dev-setup.h"
70 #include "bus-error.h"
72 #include "bus-kernel.h"
75 #include "rtnl-util.h"
/* Journal linking mode; selected with --link-journal= and kept in
 * arg_link_journal below. */
77 typedef enum LinkJournal {
/* Command line settings. The initializers are the defaults; parse_argv()
 * overwrites them according to the options given. */
84 static char *arg_directory = NULL;
85 static char *arg_user = NULL;
86 static sd_id128_t arg_uuid = {};
87 static char *arg_machine = NULL;
88 static char *arg_selinux_context = NULL;
89 static char *arg_selinux_apifs_context = NULL;
90 static const char *arg_slice = NULL;
91 static bool arg_private_network = false;
92 static bool arg_read_only = false;
93 static bool arg_boot = false;
94 static LinkJournal arg_link_journal = LINK_AUTO;
/* Bit mask of capabilities, one bit per CAP_xxx value. --capability= ORs
 * bits in, --drop-capability= clears them, and drop_capabilities() passes
 * the complement of this mask to capability_bounding_set_drop(). */
95 static uint64_t arg_retain =
97 (1ULL << CAP_DAC_OVERRIDE) |
98 (1ULL << CAP_DAC_READ_SEARCH) |
99 (1ULL << CAP_FOWNER) |
100 (1ULL << CAP_FSETID) |
101 (1ULL << CAP_IPC_OWNER) |
103 (1ULL << CAP_LEASE) |
104 (1ULL << CAP_LINUX_IMMUTABLE) |
105 (1ULL << CAP_NET_BIND_SERVICE) |
106 (1ULL << CAP_NET_BROADCAST) |
107 (1ULL << CAP_NET_RAW) |
108 (1ULL << CAP_SETGID) |
109 (1ULL << CAP_SETFCAP) |
110 (1ULL << CAP_SETPCAP) |
111 (1ULL << CAP_SETUID) |
112 (1ULL << CAP_SYS_ADMIN) |
113 (1ULL << CAP_SYS_CHROOT) |
114 (1ULL << CAP_SYS_NICE) |
115 (1ULL << CAP_SYS_PTRACE) |
116 (1ULL << CAP_SYS_TTY_CONFIG) |
117 (1ULL << CAP_SYS_RESOURCE) |
118 (1ULL << CAP_SYS_BOOT) |
119 (1ULL << CAP_AUDIT_WRITE) |
120 (1ULL << CAP_AUDIT_CONTROL) |
/* arg_bind and arg_bind_ro are flat lists of (source, destination) pairs:
 * parse_argv() appends both paths of every --bind=/--bind-ro= option and
 * mount_binds() walks them pairwise. */
122 static char **arg_bind = NULL;
123 static char **arg_bind_ro = NULL;
124 static char **arg_setenv = NULL;
125 static bool arg_quiet = false;
126 static bool arg_share_system = false;
127 static bool arg_register = true;
128 static bool arg_keep_unit = false;
129 static char **arg_network_interfaces = NULL;
/* Print the command line summary to stdout.
 *
 * program_invocation_short_name is substituted for the leading %s.
 *
 * The -j entry is described as --link-journal=guest: parse_argv() assigns
 * LINK_GUEST for the short journal switch, not LINK_HOST, so the text now
 * agrees with what the option actually does. */
131 static int help(void) {
133 printf("%s [OPTIONS...] [PATH] [ARGUMENTS...]\n\n"
134 "Spawn a minimal namespace container for debugging, testing and building.\n\n"
135 " -h --help Show this help\n"
136 " --version Print version string\n"
137 " -D --directory=NAME Root directory for the container\n"
138 " -b --boot Boot up full system (i.e. invoke init)\n"
139 " -u --user=USER Run the command under specified user or uid\n"
140 " --uuid=UUID Set a specific machine UUID for the container\n"
141 " -M --machine=NAME Set the machine name for the container\n"
142 " -S --slice=SLICE Place the container in the specified slice\n"
143 " -Z --selinux-context=SECLABEL\n"
144 " Set the SELinux security context to be used by\n"
145 " processes in the container\n"
146 " -L --selinux-apifs-context=SECLABEL\n"
147 " Set the SELinux security context to be used by\n"
148 " API/tmpfs file systems in the container\n"
149 " --private-network Disable network in container\n"
150 " --network-interface=INTERFACE\n"
151 " Assign an existing network interface to the container\n"
152 " --share-system Share system namespaces with host\n"
153 " --read-only Mount the root directory read-only\n"
154 " --capability=CAP In addition to the default, retain specified\n"
156 " --drop-capability=CAP Drop the specified capability from the default set\n"
157 " --link-journal=MODE Link up guest journal, one of no, auto, guest, host\n"
158 " -j Equivalent to --link-journal=guest\n"
159 " --bind=PATH[:PATH] Bind mount a file or directory from the host into\n"
161 " --bind-ro=PATH[:PATH] Similar, but creates a read-only bind mount\n"
162 " --setenv=NAME=VALUE Pass an environment variable to PID 1\n"
163 " --register=BOOLEAN Register container as machine\n"
164 " --keep-unit Do not register a scope for the machine, reuse\n"
165 " the service unit nspawn is running in\n"
166 " -q --quiet Do not show status information\n",
167 program_invocation_short_name);
/* Parse the command line with getopt_long() and store the result in the
 * file-scope arg_* variables.
 *
 * Invalid option arguments are reported with log_error(). After the option
 * loop a few combinations are checked: --share-system turns registration
 * off, and --boot together with --share-system is rejected. */
172 static int parse_argv(int argc, char *argv[]) {
188 ARG_NETWORK_INTERFACE
191 static const struct option options[] = {
192 { "help", no_argument, NULL, 'h' },
193 { "version", no_argument, NULL, ARG_VERSION },
194 { "directory", required_argument, NULL, 'D' },
195 { "user", required_argument, NULL, 'u' },
196 { "private-network", no_argument, NULL, ARG_PRIVATE_NETWORK },
197 { "boot", no_argument, NULL, 'b' },
198 { "uuid", required_argument, NULL, ARG_UUID },
199 { "read-only", no_argument, NULL, ARG_READ_ONLY },
200 { "capability", required_argument, NULL, ARG_CAPABILITY },
201 { "drop-capability", required_argument, NULL, ARG_DROP_CAPABILITY },
202 { "link-journal", required_argument, NULL, ARG_LINK_JOURNAL },
203 { "bind", required_argument, NULL, ARG_BIND },
204 { "bind-ro", required_argument, NULL, ARG_BIND_RO },
205 { "machine", required_argument, NULL, 'M' },
206 { "slice", required_argument, NULL, 'S' },
207 { "setenv", required_argument, NULL, ARG_SETENV },
208 { "selinux-context", required_argument, NULL, 'Z' },
209 { "selinux-apifs-context", required_argument, NULL, 'L' },
210 { "quiet", no_argument, NULL, 'q' },
211 { "share-system", no_argument, NULL, ARG_SHARE_SYSTEM },
212 { "register", required_argument, NULL, ARG_REGISTER },
213 { "keep-unit", no_argument, NULL, ARG_KEEP_UNIT },
214 { "network-interface", required_argument, NULL, ARG_NETWORK_INTERFACE },
/* The leading '+' in the option string makes getopt_long() stop at the
 * first non-option argument, so options that belong to the command run
 * inside the container are not consumed here. */
223 while ((c = getopt_long(argc, argv, "+hD:u:bL:M:jS:Z:q", options, NULL)) >= 0) {
231 puts(PACKAGE_STRING);
232 puts(SYSTEMD_FEATURES);
237 arg_directory = canonicalize_file_name(optarg);
238 if (!arg_directory) {
239 log_error("Invalid root directory: %m");
247 arg_user = strdup(optarg);
253 case ARG_NETWORK_INTERFACE:
/* optarg itself is pushed (no copy is made here). */
254 if (strv_push(&arg_network_interfaces, optarg) < 0)
259 case ARG_PRIVATE_NETWORK:
260 arg_private_network = true;
268 r = sd_id128_from_string(optarg, &arg_uuid);
270 log_error("Invalid UUID: %s", optarg);
276 arg_slice = strdup(optarg);
283 if (isempty(optarg)) {
288 if (!hostname_is_valid(optarg)) {
289 log_error("Invalid machine name: %s", optarg);
294 arg_machine = strdup(optarg);
302 arg_selinux_context = optarg;
306 arg_selinux_apifs_context = optarg;
310 arg_read_only = true;
/* --capability= and --drop-capability= share the code below; the value of
 * c tells them apart. The argument is a comma separated list of names. */
314 case ARG_DROP_CAPABILITY: {
318 FOREACH_WORD_SEPARATOR(word, length, optarg, ",", state) {
319 _cleanup_free_ char *t;
322 t = strndup(word, length);
326 if (streq(t, "all")) {
/* "all" with --capability= sets every bit of the retain mask. */
327 if (c == ARG_CAPABILITY)
328 arg_retain = (uint64_t) -1;
332 if (cap_from_name(t, &cap) < 0) {
333 log_error("Failed to parse capability %s.", t);
337 if (c == ARG_CAPABILITY)
338 arg_retain |= 1ULL << (uint64_t) cap;
340 arg_retain &= ~(1ULL << (uint64_t) cap);
/* NOTE(review): this assignment takes no argument and precedes the
 * --link-journal= case; presumably it is the -j handler -- confirm. */
348 arg_link_journal = LINK_GUEST;
351 case ARG_LINK_JOURNAL:
352 if (streq(optarg, "auto"))
353 arg_link_journal = LINK_AUTO;
354 else if (streq(optarg, "no"))
355 arg_link_journal = LINK_NO;
356 else if (streq(optarg, "guest"))
357 arg_link_journal = LINK_GUEST;
358 else if (streq(optarg, "host"))
359 arg_link_journal = LINK_HOST;
361 log_error("Failed to parse link journal mode %s", optarg);
/* --bind= and --bind-ro= share the code below; x selects the list that is
 * extended. Both paths must be absolute, and they are appended as a
 * (source, destination) pair. */
369 _cleanup_free_ char *a = NULL, *b = NULL;
373 x = c == ARG_BIND ? &arg_bind : &arg_bind_ro;
375 e = strchr(optarg, ':');
377 a = strndup(optarg, e - optarg);
387 if (!path_is_absolute(a) || !path_is_absolute(b)) {
388 log_error("Invalid bind mount specification: %s", optarg);
392 r = strv_extend(x, a);
396 r = strv_extend(x, b);
406 if (!env_assignment_is_valid(optarg)) {
407 log_error("Environment variable assignment '%s' is not valid.", optarg);
/* strv_env_set() yields a new list n; the old arg_setenv is freed. */
411 n = strv_env_set(arg_setenv, optarg);
415 strv_free(arg_setenv);
424 case ARG_SHARE_SYSTEM:
425 arg_share_system = true;
429 r = parse_boolean(optarg);
431 log_error("Failed to parse --register= argument: %s", optarg);
439 arg_keep_unit = true;
446 assert_not_reached("Unhandled option");
/* Sharing the system namespaces switches machine registration off. */
450 if (arg_share_system)
451 arg_register = false;
453 if (arg_boot && arg_share_system) {
454 log_error("--boot and --share-system may not be combined.");
/* A successful cg_pid_get_owner_uid() lookup is treated as "running in a
 * user session", which is incompatible with --keep-unit. */
458 if (arg_keep_unit && cg_pid_get_owner_uid(0, NULL) >= 0) {
459 log_error("--keep-unit may not be used when invoked from a user session.");
/* Mount the entries of mount_table below the container root dest.
 *
 * Each table row is used as { what, where, <third column>, options, flags,
 * fatal }; the third column is presumably the file system type handed to
 * mount() -- the struct definition is not visible here, TODO confirm.
 * Rows with what == NULL are remounts of the preceding row. */
466 static int mount_all(const char *dest) {
468 typedef struct MountPoint {
477 static const MountPoint mount_table[] = {
478 { "proc", "/proc", "proc", NULL, MS_NOSUID|MS_NOEXEC|MS_NODEV, true },
479 { "/proc/sys", "/proc/sys", NULL, NULL, MS_BIND, true }, /* Bind mount first */
480 { NULL, "/proc/sys", NULL, NULL, MS_BIND|MS_RDONLY|MS_REMOUNT, true }, /* Then, make it r/o */
481 { "sysfs", "/sys", "sysfs", NULL, MS_RDONLY|MS_NOSUID|MS_NOEXEC|MS_NODEV, true },
482 { "tmpfs", "/dev", "tmpfs", "mode=755", MS_NOSUID|MS_STRICTATIME, true },
483 { "devpts", "/dev/pts", "devpts","newinstance,ptmxmode=0666,mode=620,gid=" STRINGIFY(TTY_GID), MS_NOSUID|MS_NOEXEC, true },
484 { "tmpfs", "/dev/shm", "tmpfs", "mode=1777", MS_NOSUID|MS_NODEV|MS_STRICTATIME, true },
485 { "tmpfs", "/run", "tmpfs", "mode=755", MS_NOSUID|MS_NODEV|MS_STRICTATIME, true },
487 { "/sys/fs/selinux", "/sys/fs/selinux", NULL, NULL, MS_BIND, false }, /* Bind mount first */
488 { NULL, "/sys/fs/selinux", NULL, NULL, MS_BIND|MS_RDONLY|MS_REMOUNT, false }, /* Then, make it r/o */
495 for (k = 0; k < ELEMENTSOF(mount_table); k++) {
496 _cleanup_free_ char *where = NULL;
498 _cleanup_free_ char *options = NULL;
/* Target path of this row inside the container tree. */
503 where = strjoin(dest, "/", mount_table[k].where, NULL);
507 t = path_is_mount_point(where, true);
509 log_error("Failed to detect whether %s is a mount point: %s", where, strerror(-t));
517 /* Skip this entry if it is not a remount. */
518 if (mount_table[k].what && t > 0)
521 mkdir_p(where, 0755);
/* With --selinux-apifs-context= set, tmpfs and devpts rows get a
 * context="..." mount option appended to their option string. */
524 if (arg_selinux_apifs_context &&
525 (streq_ptr(mount_table[k].what, "tmpfs") || streq_ptr(mount_table[k].what, "devpts"))) {
526 options = strjoin(mount_table[k].options, ",context=\"", arg_selinux_apifs_context, "\"", NULL);
533 o = mount_table[k].options;
/* The row's fatal flag takes part in the failure condition below. */
536 if (mount(mount_table[k].what,
539 mount_table[k].flags,
541 mount_table[k].fatal) {
543 log_error("mount(%s) failed: %m", where);
/* Bind mount every (source, destination) pair of l into the container.
 *
 * dest:  container root; the destination of each pair is appended to it.
 * l:     flat list of pairs, as built for --bind= and --bind-ro=.
 * flags: if non-zero, applied in a second MS_REMOUNT|MS_BIND mount call
 *        (main() passes MS_RDONLY for the read-only list). */
553 static int mount_binds(const char *dest, char **l, unsigned long flags) {
556 STRV_FOREACH_PAIR(x, y, l) {
558 struct stat source_st, dest_st;
561 if (stat(*x, &source_st) < 0) {
562 log_error("failed to stat %s: %m", *x);
566 where = strappenda(dest, *y);
567 r = stat(where, &dest_st);
/* An existing destination must have the same file type as the source. */
569 if ((source_st.st_mode & S_IFMT) != (dest_st.st_mode & S_IFMT)) {
570 log_error("The file types of %s and %s do not match. Refusing bind mount",
/* The destination does not exist yet: create its parent directories. */
574 } else if (errno == ENOENT) {
575 r = mkdir_parents_label(where, 0755);
577 log_error("Failed to bind mount %s: %s", *x, strerror(-r));
581 log_error("Failed to bind mount %s: %s", *x, strerror(errno));
584 /* Create the mount point, but be conservative -- refuse to create block
585 * and char devices. */
586 if (S_ISDIR(source_st.st_mode))
587 mkdir_label(where, 0755);
588 else if (S_ISFIFO(source_st.st_mode))
590 else if (S_ISSOCK(source_st.st_mode))
591 mknod(where, 0644 | S_IFSOCK, 0);
592 else if (S_ISREG(source_st.st_mode))
595 log_error("Refusing to create mountpoint for file: %s", *x);
599 if (mount(*x, where, "bind", MS_BIND, NULL) < 0) {
600 log_error("mount(%s) failed: %m", where);
/* Extra flags are only applied by remounting the new bind mount. */
604 if (flags && mount(NULL, where, NULL, MS_REMOUNT|MS_BIND|flags, NULL) < 0) {
605 log_error("mount(%s) failed: %m", where);
/* Point dest/etc/localtime at the same zone as the host's /etc/localtime.
 *
 * The host link must lead into /usr/share/zoneinfo/ (absolute, or relative
 * as ../usr/share/zoneinfo/); the part after that prefix is the zone name z.
 * Nothing is changed if the container link already names the same zone, or
 * if the zone file does not exist below dest. */
613 static int setup_timezone(const char *dest) {
614 _cleanup_free_ char *where = NULL, *p = NULL, *q = NULL, *check = NULL, *what = NULL;
620 /* Fix the timezone, if possible */
621 r = readlink_malloc("/etc/localtime", &p);
623 log_warning("/etc/localtime is not a symlink, not updating container timezone.");
627 z = path_startswith(p, "../usr/share/zoneinfo/");
629 z = path_startswith(p, "/usr/share/zoneinfo/");
631 log_warning("/etc/localtime does not point into /usr/share/zoneinfo/, not updating container timezone.");
635 where = strappend(dest, "/etc/localtime");
/* y is the zone currently configured inside the container, if any. */
639 r = readlink_malloc(where, &q);
641 y = path_startswith(q, "../usr/share/zoneinfo/");
643 y = path_startswith(q, "/usr/share/zoneinfo/");
646 /* Already pointing to the right place? Then do nothing .. */
647 if (y && streq(y, z))
651 check = strjoin(dest, "/usr/share/zoneinfo/", z, NULL);
655 if (access(check, F_OK) < 0) {
656 log_warning("Timezone %s does not exist in container, not updating container timezone.", z);
/* The new link is written in the relative form. */
660 what = strappend("../usr/share/zoneinfo/", z);
665 if (symlink(what, where) < 0) {
666 log_error("Failed to correct timezone of container: %m");
/* Copy the host's /etc/resolv.conf to dest/etc/resolv.conf.
 *
 * arg_private_network is checked first; presumably the copy is skipped in
 * that case (the statement under the check is not visible here). The result
 * of the copy is deliberately ignored. */
673 static int setup_resolv_conf(const char *dest) {
674 char _cleanup_free_ *where = NULL;
678 if (arg_private_network)
681 /* Fix resolv.conf, if possible */
682 where = strappend(dest, "/etc/resolv.conf");
686 /* We don't really care for the results of this really. If it
687 * fails, it fails, but meh... */
688 copy_file("/etc/resolv.conf", where, O_TRUNC|O_NOFOLLOW);
/* Give the container its own boot ID.
 *
 * A random 128 bit ID is formatted as a UUID string, written to
 * dest/dev/proc-sys-kernel-random-boot-id and that file is bind mounted over
 * dest/proc/sys/kernel/random/boot_id. Failing to make the bind mount
 * read-only afterwards only produces a warning. */
693 static int setup_boot_id(const char *dest) {
694 _cleanup_free_ char *from = NULL, *to = NULL;
701 if (arg_share_system)
704 /* Generate a new randomized boot ID, so that each boot-up of
705 * the container gets a new one */
707 from = strappend(dest, "/dev/proc-sys-kernel-random-boot-id");
708 to = strappend(dest, "/proc/sys/kernel/random/boot_id");
712 r = sd_id128_randomize(&rnd);
714 log_error("Failed to generate random boot id: %s", strerror(-r));
/* Format the 16 bytes in the 8-4-4-4-12 UUID notation. */
718 snprintf(as_uuid, sizeof(as_uuid),
719 "%02x%02x%02x%02x-%02x%02x-%02x%02x-%02x%02x-%02x%02x%02x%02x%02x%02x",
720 SD_ID128_FORMAT_VAL(rnd));
721 char_array_0(as_uuid);
723 r = write_string_file(from, as_uuid);
725 log_error("Failed to write boot id: %s", strerror(-r));
729 if (mount(from, to, "bind", MS_BIND, NULL) < 0) {
730 log_error("Failed to bind mount boot id: %m");
732 } else if (mount(from, to, "bind", MS_BIND|MS_REMOUNT|MS_RDONLY, NULL))
733 log_warning("Failed to make boot id read-only: %m");
/* Recreate a fixed set of host device nodes below dest/dev.
 *
 * For every name in the devnodes list, /dev/<name> on the host is stat()ed
 * and a node with the same mode and device number is created at
 * dest/dev/<name>. A host node that does not exist (ENOENT) is not logged;
 * other stat() failures, entries that are neither char nor block devices,
 * and mknod() failures are logged.
 *
 * The mknod() failure message names the node that could not be created
 * (to) rather than the container root. */
739 static int copy_devnodes(const char *dest) {
741 static const char devnodes[] =
751 _cleanup_umask_ mode_t u;
757 NULSTR_FOREACH(d, devnodes) {
758 _cleanup_free_ char *from = NULL, *to = NULL;
761 from = strappend("/dev/", d);
762 to = strjoin(dest, "/dev/", d, NULL);
766 if (stat(from, &st) < 0) {
768 if (errno != ENOENT) {
769 log_error("Failed to stat %s: %m", from);
773 } else if (!S_ISCHR(st.st_mode) && !S_ISBLK(st.st_mode)) {
775 log_error("%s is not a char or block device, cannot copy", from);
778 } else if (mknod(to, st.st_mode, st.st_rdev) < 0) {
780 log_error("mknod(%s) failed: %m", to);
/* Create dest/dev/ptmx as a symlink to pts/ptmx (a relative target, so it
 * resolves to the ptmx node inside the container's own /dev/pts). */
788 static int setup_ptmx(const char *dest) {
789 _cleanup_free_ char *p = NULL;
791 p = strappend(dest, "/dev/ptmx");
795 if (symlink("pts/ptmx", p) < 0) {
796 log_error("Failed to create /dev/ptmx symlink: %m");
/* Make the tty named by console available as dest/dev/console.
 *
 * console must be a character device. Its access mode and ownership are set
 * to 0600, uid 0, gid 0, then a device node is created at dest/dev/console
 * and console is bind mounted over it. */
803 static int setup_dev_console(const char *dest, const char *console) {
805 _cleanup_free_ char *to = NULL;
807 _cleanup_umask_ mode_t u;
814 if (stat(console, &st) < 0) {
815 log_error("Failed to stat %s: %m", console);
818 } else if (!S_ISCHR(st.st_mode)) {
819 log_error("/dev/console is not a char device");
823 r = chmod_and_chown(console, 0600, 0, 0);
825 log_error("Failed to correct access mode for TTY: %s", strerror(-r));
829 if (asprintf(&to, "%s/dev/console", dest) < 0)
832 /* We need to bind mount the right tty to /dev/console since
833 * ptys can only exist on pts file systems. To have something
834 * to bind mount things on we create a device node first, that
835 * has the right major/minor (note that the major minor
836 * doesn't actually matter here, since we mount it over
839 if (mknod(to, (st.st_mode & ~07777) | 0600, st.st_rdev) < 0) {
840 log_error("mknod() for /dev/console failed: %m");
844 if (mount(console, to, "bind", MS_BIND, NULL) < 0) {
845 log_error("Bind mount for /dev/console failed: %m");
852 static int setup_kmsg(const char *dest, int kmsg_socket) {
853 _cleanup_free_ char *from = NULL, *to = NULL;
855 _cleanup_umask_ mode_t u;
857 struct cmsghdr cmsghdr;
858 uint8_t buf[CMSG_SPACE(sizeof(int))];
861 .msg_control = &control,
862 .msg_controllen = sizeof(control),
864 struct cmsghdr *cmsg;
867 assert(kmsg_socket >= 0);
871 /* We create the kmsg FIFO as /dev/kmsg, but immediately
872 * delete it after bind mounting it to /proc/kmsg. While FIFOs
873 * on the reading side behave very similar to /proc/kmsg,
874 * their writing side behaves differently from /dev/kmsg in
875 * that writing blocks when nothing is reading. In order to
876 * avoid any problems with containers deadlocking due to this
877 * we simply make /dev/kmsg unavailable to the container. */
/* Summary of the steps below: create the FIFO at dest/dev/kmsg, bind
 * mount it over dest/proc/kmsg, open it, and pass the open descriptor
 * through kmsg_socket as SCM_RIGHTS ancillary data. */
878 if (asprintf(&from, "%s/dev/kmsg", dest) < 0 ||
879 asprintf(&to, "%s/proc/kmsg", dest) < 0)
882 if (mkfifo(from, 0600) < 0) {
883 log_error("mkfifo() for /dev/kmsg failed: %m");
887 r = chmod_and_chown(from, 0600, 0, 0);
889 log_error("Failed to correct access mode for /dev/kmsg: %s", strerror(-r));
893 if (mount(from, to, "bind", MS_BIND, NULL) < 0) {
894 log_error("Bind mount for /proc/kmsg failed: %m");
898 fd = open(from, O_RDWR|O_NDELAY|O_CLOEXEC);
900 log_error("Failed to open fifo: %m");
/* Build a single control message that carries fd. */
904 cmsg = CMSG_FIRSTHDR(&mh);
905 cmsg->cmsg_level = SOL_SOCKET;
906 cmsg->cmsg_type = SCM_RIGHTS;
907 cmsg->cmsg_len = CMSG_LEN(sizeof(int));
908 memcpy(CMSG_DATA(cmsg), &fd, sizeof(int));
910 mh.msg_controllen = cmsg->cmsg_len;
912 /* Store away the fd in the socket, so that it stays open as
913 * long as we run the child */
914 k = sendmsg(kmsg_socket, &mh, MSG_DONTWAIT|MSG_NOSIGNAL);
/* The local copy of the descriptor is closed right after sending. */
915 close_nointr_nofail(fd);
918 log_error("Failed to send FIFO fd: %m");
922 /* And now make the FIFO unavailable as /dev/kmsg... */
/* Set the host name to arg_machine via sethostname().
 *
 * arg_share_system is checked first; presumably nothing is changed in that
 * case (the statement under the check is not visible here). */
927 static int setup_hostname(void) {
929 if (arg_share_system)
932 if (sethostname(arg_machine, strlen(arg_machine)) < 0)
/* Connect the container's journal directory with the host's, according to
 * arg_link_journal.
 *
 * The machine ID is read from directory/etc/machine-id and must differ from
 * the host's own. After that p is /var/log/journal/<id> on the host and q is
 * the same path below directory. For LINK_GUEST, p is made a symlink to q;
 * the final step bind mounts p onto q. Which paths reach that final step is
 * not fully visible here (several statements are missing) -- TODO confirm
 * before relying on this summary. */
938 static int setup_journal(const char *directory) {
939 sd_id128_t machine_id, this_id;
940 _cleanup_free_ char *p = NULL, *b = NULL, *q = NULL, *d = NULL;
944 p = strappend(directory, "/etc/machine-id");
948 r = read_one_line_file(p, &b);
949 if (r == -ENOENT && arg_link_journal == LINK_AUTO)
952 log_error("Failed to read machine ID from %s: %s", p, strerror(-r));
957 if (isempty(id) && arg_link_journal == LINK_AUTO)
960 /* Verify validity */
961 r = sd_id128_from_string(id, &machine_id);
963 log_error("Failed to parse machine ID from %s: %s", p, strerror(-r));
967 r = sd_id128_get_machine(&this_id);
969 log_error("Failed to retrieve machine ID: %s", strerror(-r));
/* Identical IDs: logged as a warning in auto mode, as an error otherwise. */
973 if (sd_id128_equal(machine_id, this_id)) {
974 log_full(arg_link_journal == LINK_AUTO ? LOG_WARNING : LOG_ERR,
975 "Host and machine ids are equal (%s): refusing to link journals", id);
976 if (arg_link_journal == LINK_AUTO)
982 if (arg_link_journal == LINK_NO)
/* From here on p is reused for the host side journal path. */
986 p = strappend("/var/log/journal/", id);
987 q = strjoin(directory, "/var/log/journal/", id, NULL);
/* Existing mount points on either side are only an error when a mode
 * other than auto was requested. */
991 if (path_is_mount_point(p, false) > 0) {
992 if (arg_link_journal != LINK_AUTO) {
993 log_error("%s: already a mount point, refusing to use for journal", p);
1000 if (path_is_mount_point(q, false) > 0) {
1001 if (arg_link_journal != LINK_AUTO) {
1002 log_error("%s: already a mount point, refusing to use for journal", q);
/* Inspect what currently exists at the host path p; d receives the link
 * target if p is a symlink. */
1009 r = readlink_and_make_absolute(p, &d);
1011 if ((arg_link_journal == LINK_GUEST ||
1012 arg_link_journal == LINK_AUTO) &&
1015 r = mkdir_p(q, 0755);
1017 log_warning("failed to create directory %s: %m", q);
1021 if (unlink(p) < 0) {
1022 log_error("Failed to remove symlink %s: %m", p);
1025 } else if (r == -EINVAL) {
1027 if (arg_link_journal == LINK_GUEST &&
1030 if (errno == ENOTDIR) {
1031 log_error("%s already exists and is neither a symlink nor a directory", p);
1034 log_error("Failed to remove %s: %m", p);
1038 } else if (r != -ENOENT) {
1039 log_error("readlink(%s) failed: %m", p);
1043 if (arg_link_journal == LINK_GUEST) {
/* Guest mode: the host path becomes a symlink to the container directory. */
1045 if (symlink(q, p) < 0) {
1046 log_error("Failed to symlink %s to %s: %m", q, p);
1050 r = mkdir_p(q, 0755);
1052 log_warning("failed to create directory %s: %m", q);
1056 if (arg_link_journal == LINK_HOST) {
1057 r = mkdir_p(p, 0755);
1059 log_error("Failed to create %s: %m", p);
1063 } else if (access(p, F_OK) < 0)
/* dir_is_empty() returning 0 means q already has content. */
1066 if (dir_is_empty(q) == 0) {
1067 log_error("%s not empty.", q);
1071 r = mkdir_p(q, 0755);
1073 log_error("Failed to create %s: %m", q);
1077 if (mount(p, q, "bind", MS_BIND, NULL) < 0) {
1078 log_error("Failed to bind mount journal from host into guest: %m");
/* Create the directory dest/dev/kdbus and bind mount path onto it.
 * main() passes the kdbus domain path as path. */
1085 static int setup_kdbus(const char *dest, const char *path) {
1091 p = strappenda(dest, "/dev/kdbus");
1092 if (mkdir(p, 0755) < 0) {
1093 log_error("Failed to create kdbus path: %m");
1097 if (mount(path, p, "bind", MS_BIND, NULL) < 0) {
1098 log_error("Failed to mount kdbus domain path: %m");
/* Pass the complement of arg_retain to capability_bounding_set_drop();
 * presumably this removes every capability not listed in arg_retain from
 * the bounding set -- the helper is defined elsewhere, TODO confirm. */
1105 static int drop_capabilities(void) {
1106 return capability_bounding_set_drop(~arg_retain, false);
/* Announce the container to org.freedesktop.machine1 over the system bus.
 *
 * One of two method calls on the Manager interface is made, chosen by
 * arg_keep_unit (the method names are not visible here). Both pass
 * arg_uuid and arg_directory; the second one additionally passes a "Slice"
 * string property when arg_slice is non-empty. */
1109 static int register_machine(pid_t pid) {
1110 _cleanup_bus_error_free_ sd_bus_error error = SD_BUS_ERROR_NULL;
1111 _cleanup_bus_unref_ sd_bus *bus = NULL;
1117 r = sd_bus_default_system(&bus);
1119 log_error("Failed to open system bus: %s", strerror(-r));
1123 if (arg_keep_unit) {
1124 r = sd_bus_call_method(
1126 "org.freedesktop.machine1",
1127 "/org/freedesktop/machine1",
1128 "org.freedesktop.machine1.Manager",
1134 SD_BUS_MESSAGE_APPEND_ID128(arg_uuid),
1138 strempty(arg_directory));
1140 r = sd_bus_call_method(
1142 "org.freedesktop.machine1",
1143 "/org/freedesktop/machine1",
1144 "org.freedesktop.machine1.Manager",
1150 SD_BUS_MESSAGE_APPEND_ID128(arg_uuid),
1154 strempty(arg_directory),
/* !isempty(arg_slice) evaluates to 0 or 1 and is passed as an argument
 * ahead of the "Slice" entry; presumably it is the property count. */
1155 !isempty(arg_slice), "Slice", "s", arg_slice);
1159 log_error("Failed to register machine: %s", bus_error_message(&error, r));
/* Ask org.freedesktop.machine1 to terminate the machine.
 *
 * A first call on the Manager interface yields the machine's object path
 * (read from the reply as "o"); a second call is then made on the Machine
 * interface. Failures of either call are only logged at debug level. */
1166 static int terminate_machine(pid_t pid) {
1167 _cleanup_bus_error_free_ sd_bus_error error = SD_BUS_ERROR_NULL;
1168 _cleanup_bus_message_unref_ sd_bus_message *reply = NULL;
1169 _cleanup_bus_unref_ sd_bus *bus = NULL;
1176 r = sd_bus_default_system(&bus);
1178 log_error("Failed to open system bus: %s", strerror(-r));
1182 r = sd_bus_call_method(
1184 "org.freedesktop.machine1",
1185 "/org/freedesktop/machine1",
1186 "org.freedesktop.machine1.Manager",
1193 /* Note that the machine might already have been
1194 * cleaned up automatically, hence don't consider it a
1195 * failure if we cannot get the machine object. */
1196 log_debug("Failed to get machine: %s", bus_error_message(&error, r));
1200 r = sd_bus_message_read(reply, "o", &path);
1202 return bus_log_parse_error(r);
1204 r = sd_bus_call_method(
1206 "org.freedesktop.machine1",
1208 "org.freedesktop.machine1.Machine",
1214 log_debug("Failed to terminate machine: %s", bus_error_message(&error, r));
/* Reset the audit login UID of the calling process.
 *
 * /proc/self/loginuid is read; unless it already contains 4294967295 (the
 * largest 32 bit unsigned value), that value is written to it. A failed
 * write produces the long explanation below. */
1221 static int reset_audit_loginuid(void) {
1222 _cleanup_free_ char *p = NULL;
1225 if (arg_share_system)
1228 r = read_one_line_file("/proc/self/loginuid", &p);
1232 log_error("Failed to read /proc/self/loginuid: %s", strerror(-r));
1236 /* Already reset? */
1237 if (streq(p, "4294967295"))
1240 r = write_string_file("/proc/self/loginuid", "4294967295");
1242 log_error("Failed to reset audit login UID. This probably means that your kernel is too\n"
1243 "old and you have audit enabled. Note that the auditing subsystem is known to\n"
1244 "be incompatible with containers on old kernels. Please make sure to upgrade\n"
1245 "your kernel or to off auditing with 'audit=0' on the kernel command line before\n"
1246 "using systemd-nspawn. Sleeping for 5s... (%s)\n", strerror(-r));
/* Move the interfaces named in arg_network_interfaces to the network
 * namespace of process pid.
 *
 * For every name the interface index is looked up, and an RTM_NEWLINK
 * message carrying IFLA_NET_NS_PID = pid is sent over a NETLINK_ROUTE
 * connection. arg_private_network and an empty list are checked first. */
1254 static int move_network_interfaces(pid_t pid) {
1255 _cleanup_sd_rtnl_unref_ sd_rtnl *rtnl = NULL;
1259 if (!arg_private_network)
1262 if (strv_isempty(arg_network_interfaces))
1265 r = sd_rtnl_open(NETLINK_ROUTE, &rtnl);
1267 log_error("Failed to connect to netlink: %s", strerror(-r));
1271 STRV_FOREACH(i, arg_network_interfaces) {
1272 _cleanup_sd_rtnl_message_unref_ sd_rtnl_message *m = NULL;
1275 ifi = if_nametoindex(*i);
1277 log_error("Failed to resolve interface %s: %m", *i);
1281 r = sd_rtnl_message_link_new(RTM_NEWLINK, ifi, &m);
1283 log_error("Failed to allocate netlink message: %s", strerror(-r));
1287 r = sd_rtnl_message_append_u32(m, IFLA_NET_NS_PID, pid);
1289 log_error("Failed to append namespace PID to netlink message: %s", strerror(-r));
1293 r = sd_rtnl_call(rtnl, m, 0, NULL);
1295 log_error("Failed to move interface to namespace: %s", strerror(-r));
/* Program entry point.
 *
 * Parses the command line, works out the container directory and machine
 * name, allocates a pty for the container console, and clones a child into
 * new namespaces. The child builds the container file system view, drops
 * privileges as requested and executes the payload; the parent registers the
 * machine, moves network interfaces, forwards the pty and finally reports
 * how the container ended.
 *
 * Message fixes in this function: the kdbus domain failure message formats
 * kdbus_fd (where bus_kernel_create_domain() stores its result) rather than
 * r, and the setresgid()/setresuid() failure messages name the functions
 * that are actually called. */
1303 int main(int argc, char *argv[]) {
1305 int r = EXIT_FAILURE, k;
1306 _cleanup_close_ int master = -1, kdbus_fd = -1, sync_fd = -1;
1308 const char *console = NULL;
1310 _cleanup_close_pipe_ int kmsg_socket_pair[2] = { -1, -1 };
1311 _cleanup_fdset_free_ FDSet *fds = NULL;
1312 _cleanup_free_ char *kdbus_domain = NULL;
1314 log_parse_environment();
1317 k = parse_argv(argc, argv);
/* Without -D the current working directory is used as container root. */
1325 if (arg_directory) {
1328 p = path_make_absolute_cwd(arg_directory);
1329 free(arg_directory);
1332 arg_directory = get_current_dir_name();
1334 if (!arg_directory) {
1335 log_error("Failed to determine path, please use -D.");
1339 path_kill_slashes(arg_directory);
/* Default machine name: last component of the container directory. */
1342 arg_machine = strdup(basename(arg_directory));
1348 hostname_cleanup(arg_machine, false);
1349 if (isempty(arg_machine)) {
1350 log_error("Failed to determine machine name automatically, please use -M.");
1355 if (geteuid() != 0) {
1356 log_error("Need to be root.");
1360 if (sd_booted() <= 0) {
1361 log_error("Not running on a systemd system.");
1365 if (path_equal(arg_directory, "/")) {
1366 log_error("Spawning container on root directory not supported.");
1370 if (arg_boot && path_is_os_tree(arg_directory) <= 0) {
1371 log_error("Directory %s doesn't look like an OS root directory (/etc/os-release is missing). Refusing.", arg_directory);
/* File descriptors received through socket activation are collected so
 * that they can be handed on to the payload. */
1376 n_fd_passed = sd_listen_fds(false);
1377 if (n_fd_passed > 0) {
1378 k = fdset_new_listen_fds(&fds, false);
1380 log_error("Failed to collect file descriptors: %s", strerror(-k));
1384 fdset_close_others(fds);
1387 master = posix_openpt(O_RDWR|O_NOCTTY|O_CLOEXEC|O_NDELAY);
1389 log_error("Failed to acquire pseudo tty: %m");
1393 console = ptsname(master);
1395 log_error("Failed to determine tty name: %m");
1400 log_info("Spawning container %s on %s. Press ^] three times within 1s to abort execution.", arg_machine, arg_directory);
1402 if (unlockpt(master) < 0) {
1403 log_error("Failed to unlock tty: %m");
1408 if (access("/dev/kdbus/control", F_OK) >= 0) {
1410 if (arg_share_system) {
1411 kdbus_domain = strdup("/dev/kdbus");
1412 if (!kdbus_domain) {
1419 ns = strappenda("machine-", arg_machine);
1420 kdbus_fd = bus_kernel_create_domain(ns, &kdbus_domain);
/* The result of the call above lives in kdbus_fd, so that is the value
 * to format here. */
1422 log_debug("Failed to create kdbus domain: %s", strerror(-kdbus_fd));
1424 log_debug("Successfully created kdbus domain as %s", kdbus_domain);
1428 if (socketpair(AF_UNIX, SOCK_DGRAM|SOCK_NONBLOCK|SOCK_CLOEXEC, 0, kmsg_socket_pair) < 0) {
1429 log_error("Failed to create kmsg socket pair: %m");
1433 sd_notify(0, "READY=1");
1435 assert_se(sigemptyset(&mask) == 0);
1436 sigset_add_many(&mask, SIGCHLD, SIGWINCH, SIGTERM, SIGINT, -1);
1437 assert_se(sigprocmask(SIG_BLOCK, &mask, NULL) == 0);
/* sync_fd lets the child wait until the parent has finished registering
 * the machine and moving network interfaces. */
1442 sync_fd = eventfd(0, EFD_CLOEXEC);
1444 log_error("Failed to create event fd: %m");
/* A new mount namespace is always requested; IPC, PID and UTS namespaces
 * unless --share-system, a network namespace only with --private-network. */
1448 pid = syscall(__NR_clone,
1449 SIGCHLD|CLONE_NEWNS|
1450 (arg_share_system ? 0 : CLONE_NEWIPC|CLONE_NEWPID|CLONE_NEWUTS)|
1451 (arg_private_network ? CLONE_NEWNET : 0), NULL);
1453 if (errno == EINVAL)
1454 log_error("clone() failed, do you have namespace support enabled in your kernel? (You need UTS, IPC, PID and NET namespacing built in): %m");
1456 log_error("clone() failed: %m");
1463 const char *home = NULL;
1464 uid_t uid = (uid_t) -1;
1465 gid_t gid = (gid_t) -1;
1467 const char *envp[] = {
1468 "PATH=" DEFAULT_PATH_SPLIT_USR,
1469 "container=systemd-nspawn", /* LXC sets container=lxc, so follow the scheme here */
1474 NULL, /* container_uuid */
1475 NULL, /* LISTEN_FDS */
1476 NULL, /* LISTEN_PID */
1482 envp[n_env] = strv_find_prefix(environ, "TERM=");
1486 close_nointr_nofail(master);
1489 close_nointr(STDIN_FILENO);
1490 close_nointr(STDOUT_FILENO);
1491 close_nointr(STDERR_FILENO);
1493 close_nointr_nofail(kmsg_socket_pair[0]);
1494 kmsg_socket_pair[0] = -1;
1496 reset_all_signal_handlers();
1498 assert_se(sigemptyset(&mask) == 0);
1499 assert_se(sigprocmask(SIG_SETMASK, &mask, NULL) == 0);
/* The pty slave becomes stdin, and is then duplicated onto stdout and
 * stderr. */
1501 k = open_terminal(console, O_RDWR);
1502 if (k != STDIN_FILENO) {
1504 close_nointr_nofail(k);
1508 log_error("Failed to open console: %s", strerror(-k));
1512 if (dup2(STDIN_FILENO, STDOUT_FILENO) != STDOUT_FILENO ||
1513 dup2(STDIN_FILENO, STDERR_FILENO) != STDERR_FILENO) {
1514 log_error("Failed to duplicate console: %m");
1519 log_error("setsid() failed: %m");
1523 if (reset_audit_loginuid() < 0)
1526 if (prctl(PR_SET_PDEATHSIG, SIGKILL) < 0) {
1527 log_error("PR_SET_PDEATHSIG failed: %m");
1531 /* Mark everything as slave, so that we still
1532 * receive mounts from the real root, but don't
1533 * propagate mounts to the real root. */
1534 if (mount(NULL, "/", NULL, MS_SLAVE|MS_REC, NULL) < 0) {
1535 log_error("MS_SLAVE|MS_REC failed: %m");
1539 /* Turn directory into bind mount */
1540 if (mount(arg_directory, arg_directory, "bind", MS_BIND|MS_REC, NULL) < 0) {
1541 log_error("Failed to make bind mount.");
1546 if (mount(arg_directory, arg_directory, "bind", MS_BIND|MS_REMOUNT|MS_RDONLY|MS_REC, NULL) < 0) {
1547 log_error("Failed to make read-only.");
1551 if (mount_all(arg_directory) < 0)
1554 if (copy_devnodes(arg_directory) < 0)
1557 if (setup_ptmx(arg_directory) < 0)
1560 dev_setup(arg_directory);
1562 if (setup_dev_console(arg_directory, console) < 0)
1565 if (setup_kmsg(arg_directory, kmsg_socket_pair[1]) < 0)
1568 close_nointr_nofail(kmsg_socket_pair[1]);
1569 kmsg_socket_pair[1] = -1;
1571 if (setup_boot_id(arg_directory) < 0)
1574 if (setup_timezone(arg_directory) < 0)
1577 if (setup_resolv_conf(arg_directory) < 0)
1580 if (setup_journal(arg_directory) < 0)
1583 if (mount_binds(arg_directory, arg_bind, 0) < 0)
1586 if (mount_binds(arg_directory, arg_bind_ro, MS_RDONLY) < 0)
1589 if (setup_kdbus(arg_directory, kdbus_domain) < 0)
/* Switch the root: move the container mount onto "/", then chroot into
 * it and reset the working directory. */
1592 if (chdir(arg_directory) < 0) {
1593 log_error("chdir(%s) failed: %m", arg_directory);
1597 if (mount(arg_directory, "/", NULL, MS_MOVE, NULL) < 0) {
1598 log_error("mount(MS_MOVE) failed: %m");
1602 if (chroot(".") < 0) {
1603 log_error("chroot() failed: %m");
1607 if (chdir("/") < 0) {
1608 log_error("chdir() failed: %m");
1614 if (arg_private_network)
1617 if (drop_capabilities() < 0) {
1618 log_error("drop_capabilities() failed: %m");
1624 /* Note that this resolves user names
1625 * inside the container, and hence
1626 * accesses the NSS modules from the
1627 * container and not the host. This is
1630 if (get_user_creds((const char**)&arg_user, &uid, &gid, &home, NULL) < 0) {
1631 log_error("get_user_creds() failed: %m");
1635 if (mkdir_parents_label(home, 0775) < 0) {
1636 log_error("mkdir_parents_label() failed: %m");
1640 if (mkdir_safe_label(home, 0775, uid, gid) < 0) {
1641 log_error("mkdir_safe_label() failed: %m");
1645 if (initgroups((const char*)arg_user, gid) < 0) {
1646 log_error("initgroups() failed: %m");
1650 if (setresgid(gid, gid, gid) < 0) {
1651 log_error("setresgid() failed: %m");
1655 if (setresuid(uid, uid, uid) < 0) {
1656 log_error("setresuid() failed: %m");
1660 /* Reset everything fully to 0, just in case */
1662 if (setgroups(0, NULL) < 0) {
1663 log_error("setgroups() failed: %m");
1667 if (setresgid(0, 0, 0) < 0) {
1668 log_error("setresgid() failed: %m");
1672 if (setresuid(0, 0, 0) < 0) {
1673 log_error("setresuid() failed: %m");
/* Fill the free slots of envp; n_env counts the entries in use. */
1678 if ((asprintf((char**)(envp + n_env++), "HOME=%s", home ? home: "/root") < 0) ||
1679 (asprintf((char**)(envp + n_env++), "USER=%s", arg_user ? arg_user : "root") < 0) ||
1680 (asprintf((char**)(envp + n_env++), "LOGNAME=%s", arg_user ? arg_user : "root") < 0)) {
1685 if (!sd_id128_equal(arg_uuid, SD_ID128_NULL)) {
1686 if (asprintf((char**)(envp + n_env++), "container_uuid=" SD_ID128_FORMAT_STR, SD_ID128_FORMAT_VAL(arg_uuid)) < 0) {
/* Passed-in descriptors must survive the exec, and the payload is told
 * about them through LISTEN_FDS/LISTEN_PID. */
1692 if (fdset_size(fds) > 0) {
1693 k = fdset_cloexec(fds, false);
1695 log_error("Failed to unset O_CLOEXEC for file descriptors.");
1699 if ((asprintf((char **)(envp + n_env++), "LISTEN_FDS=%u", n_fd_passed) < 0) ||
1700 (asprintf((char **)(envp + n_env++), "LISTEN_PID=1") < 0)) {
/* Block until the parent writes to sync_fd. */
1708 eventfd_read(sync_fd, &x);
1709 close_nointr_nofail(sync_fd);
1712 if (!strv_isempty(arg_setenv)) {
1715 n = strv_env_merge(2, envp, arg_setenv);
1723 env_use = (char**) envp;
1726 if (arg_selinux_context)
1727 if (setexeccon(arg_selinux_context) < 0)
1728 log_error("setexeccon(\"%s\") failed: %m", arg_selinux_context);
1734 /* Automatically search for the init system */
/* Each execve() only returns on failure, so the candidates are tried in
 * order. */
1736 l = 1 + argc - optind;
1737 a = newa(char*, l + 1);
1738 memcpy(a + 1, argv + optind, l * sizeof(char*));
1740 a[0] = (char*) "/usr/lib/systemd/systemd";
1741 execve(a[0], a, env_use);
1743 a[0] = (char*) "/lib/systemd/systemd";
1744 execve(a[0], a, env_use);
1746 a[0] = (char*) "/sbin/init";
1747 execve(a[0], a, env_use);
1748 } else if (argc > optind)
1749 execvpe(argv[optind], argv + optind, env_use);
1751 chdir(home ? home : "/root");
1752 execle("/bin/bash", "-bash", NULL, env_use);
1755 log_error("execv() failed: %m");
1758 _exit(EXIT_FAILURE);
/* Parent side: finish the host-side setup, then release the child. */
1764 r = register_machine(pid);
1768 r = move_network_interfaces(pid);
1772 eventfd_write(sync_fd, 1);
1773 close_nointr_nofail(sync_fd);
1776 k = process_pty(master, &mask, arg_boot ? pid : 0, SIGRTMIN+3);
1785 /* Kill if it is not dead yet anyway */
1786 terminate_machine(pid);
1788 /* Redundant, but better safe than sorry */
1791 k = wait_for_terminate(pid, &status);
/* Translate the way the child ended into a log message; for a normal
 * exit r takes the child's exit status. */
1799 if (status.si_code == CLD_EXITED) {
1800 r = status.si_status;
1801 if (status.si_status != 0) {
1802 log_error("Container %s failed with error code %i.", arg_machine, status.si_status);
1807 log_debug("Container %s exited successfully.", arg_machine);
1809 } else if (status.si_code == CLD_KILLED &&
1810 status.si_status == SIGINT) {
1813 log_info("Container %s has been shut down.", arg_machine);
1816 } else if (status.si_code == CLD_KILLED &&
1817 status.si_status == SIGHUP) {
1820 log_info("Container %s is being rebooted.", arg_machine);
1822 } else if (status.si_code == CLD_KILLED ||
1823 status.si_code == CLD_DUMPED) {
1825 log_error("Container %s terminated by signal %s.", arg_machine, signal_to_string(status.si_status));
1829 log_error("Container %s failed due to unknown reason.", arg_machine);
1839 free(arg_directory);
/* Only the array is freed: its entries are the optarg pointers pushed
 * by parse_argv(). */
1842 free(arg_network_interfaces);