1 /*-*- Mode: C; c-basic-offset: 8; indent-tabs-mode: nil -*-*/
4 This file is part of systemd.
6 Copyright 2010 Lennart Poettering
8 systemd is free software; you can redistribute it and/or modify it
9 under the terms of the GNU Lesser General Public License as published by
10 the Free Software Foundation; either version 2.1 of the License, or
11 (at your option) any later version.
13 systemd is distributed in the hope that it will be useful, but
14 WITHOUT ANY WARRANTY; without even the implied warranty of
15 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
16 Lesser General Public License for more details.
18 You should have received a copy of the GNU Lesser General Public License
19 along with systemd; If not, see <http://www.gnu.org/licenses/>.
25 #include <sys/types.h>
26 #include <sys/syscall.h>
27 #include <sys/mount.h>
33 #include <sys/prctl.h>
34 #include <sys/capability.h>
37 #include <sys/signalfd.h>
41 #include <sys/socket.h>
42 #include <linux/netlink.h>
43 #include <linux/rtnetlink.h>
44 #include <sys/eventfd.h>
48 #include <selinux/selinux.h>
51 #include "sd-daemon.h"
61 #include "cgroup-util.h"
63 #include "path-util.h"
64 #include "loopback-setup.h"
65 #include "dev-setup.h"
70 #include "bus-error.h"
72 #include "bus-kernel.h"
75 #include "rtnl-util.h"
/* Journal linking mode chosen with --link-journal= or -j. The enumerators are
 * on lines that are not part of this excerpt; parse_argv() assigns LINK_NO,
 * LINK_AUTO, LINK_GUEST and LINK_HOST. */
77 typedef enum LinkJournal {
/* Settings filled in by parse_argv() and read by the setup helpers and main().
 * The initialisers are the values used when the option is not given. */
84 static char *arg_directory = NULL;
85 static char *arg_user = NULL;
86 static sd_id128_t arg_uuid = {};
87 static char *arg_machine = NULL;
88 static char *arg_selinux_context = NULL;
89 static char *arg_selinux_apifs_context = NULL;
90 static const char *arg_slice = NULL;
91 static bool arg_private_network = false;
92 static bool arg_read_only = false;
93 static bool arg_boot = false;
94 static LinkJournal arg_link_journal = LINK_AUTO;
/* Bit mask (1ULL << CAP_*) of the capabilities kept for the container.
 * parse_argv() merges --capability=/--drop-capability= into it, and
 * drop_capabilities() passes its complement to
 * capability_bounding_set_drop(). The first and last lines of the
 * initialiser are not part of this excerpt. */
95 static uint64_t arg_retain =
97 (1ULL << CAP_DAC_OVERRIDE) |
98 (1ULL << CAP_DAC_READ_SEARCH) |
99 (1ULL << CAP_FOWNER) |
100 (1ULL << CAP_FSETID) |
101 (1ULL << CAP_IPC_OWNER) |
103 (1ULL << CAP_LEASE) |
104 (1ULL << CAP_LINUX_IMMUTABLE) |
105 (1ULL << CAP_NET_BIND_SERVICE) |
106 (1ULL << CAP_NET_BROADCAST) |
107 (1ULL << CAP_NET_RAW) |
108 (1ULL << CAP_SETGID) |
109 (1ULL << CAP_SETFCAP) |
110 (1ULL << CAP_SETPCAP) |
111 (1ULL << CAP_SETUID) |
112 (1ULL << CAP_SYS_ADMIN) |
113 (1ULL << CAP_SYS_CHROOT) |
114 (1ULL << CAP_SYS_NICE) |
115 (1ULL << CAP_SYS_PTRACE) |
116 (1ULL << CAP_SYS_TTY_CONFIG) |
117 (1ULL << CAP_SYS_RESOURCE) |
118 (1ULL << CAP_SYS_BOOT) |
119 (1ULL << CAP_AUDIT_WRITE) |
120 (1ULL << CAP_AUDIT_CONTROL) |
/* arg_bind and arg_bind_ro hold (source, destination) path pairs; see
 * mount_binds(). */
122 static char **arg_bind = NULL;
123 static char **arg_bind_ro = NULL;
124 static char **arg_setenv = NULL;
125 static bool arg_quiet = false;
126 static bool arg_share_system = false;
/* Registration is on by default; parse_argv() turns it off when
 * --share-system is used. */
127 static bool arg_register = true;
128 static bool arg_keep_unit = false;
129 static char **arg_network_interfaces = NULL;
/* Prints the usage summary to stdout, prefixed with the name the program was
 * invoked as. The text documents the options accepted by parse_argv(). */
131 static int help(void) {
133 printf("%s [OPTIONS...] [PATH] [ARGUMENTS...]\n\n"
134 "Spawn a minimal namespace container for debugging, testing and building.\n\n"
135 " -h --help Show this help\n"
136 " --version Print version string\n"
137 " -D --directory=NAME Root directory for the container\n"
138 " -b --boot Boot up full system (i.e. invoke init)\n"
139 " -u --user=USER Run the command under specified user or uid\n"
140 " --uuid=UUID Set a specific machine UUID for the container\n"
141 " -M --machine=NAME Set the machine name for the container\n"
142 " -S --slice=SLICE Place the container in the specified slice\n"
143 " -Z --selinux-context=SECLABEL\n"
144 " Set the SELinux security context to be used by\n"
145 " processes in the container\n"
146 " -L --selinux-apifs-context=SECLABEL\n"
147 " Set the SELinux security context to be used by\n"
148 " API/tmpfs file systems in the container\n"
149 " --private-network Disable network in container\n"
150 " --network-interface=INTERFACE\n"
151 " Assign an existing network interface to the container\n"
152 " --share-system Share system namespaces with host\n"
153 " --read-only Mount the root directory read-only\n"
154 " --capability=CAP In addition to the default, retain specified\n"
156 " --drop-capability=CAP Drop the specified capability from the default set\n"
157 " --link-journal=MODE Link up guest journal, one of no, auto, guest, host\n"
/* The option parser sets LINK_GUEST for -j, so the short option is documented
 * as the "guest" mode rather than "host". */
158 " -j Equivalent to --link-journal=guest\n"
159 " --bind=PATH[:PATH] Bind mount a file or directory from the host into\n"
161 " --bind-ro=PATH[:PATH] Similar, but creates a read-only bind mount\n"
162 " --setenv=NAME=VALUE Pass an environment variable to PID 1\n"
163 " --register=BOOLEAN Register container as machine\n"
164 " --keep-unit Do not register a scope for the machine, reuse\n"
165 " the service unit nspawn is running in\n"
166 " -q --quiet Do not show status information\n",
167 program_invocation_short_name);
/* Parses the command line and stores the result in the arg_* globals.
 *
 * argc, argv: the arguments as passed to main().
 *
 * Capability options are first collected in the local plus/minus masks and
 * folded into arg_retain once all options have been read. Several option
 * combinations are rejected after the loop. The return statements are on lines
 * outside this excerpt. */
172 static int parse_argv(int argc, char *argv[]) {
188 ARG_NETWORK_INTERFACE
/* Long options; entries without a short letter map to the ARG_* values. */
191 static const struct option options[] = {
192 { "help", no_argument, NULL, 'h' },
193 { "version", no_argument, NULL, ARG_VERSION },
194 { "directory", required_argument, NULL, 'D' },
195 { "user", required_argument, NULL, 'u' },
196 { "private-network", no_argument, NULL, ARG_PRIVATE_NETWORK },
197 { "boot", no_argument, NULL, 'b' },
198 { "uuid", required_argument, NULL, ARG_UUID },
199 { "read-only", no_argument, NULL, ARG_READ_ONLY },
200 { "capability", required_argument, NULL, ARG_CAPABILITY },
201 { "drop-capability", required_argument, NULL, ARG_DROP_CAPABILITY },
202 { "link-journal", required_argument, NULL, ARG_LINK_JOURNAL },
203 { "bind", required_argument, NULL, ARG_BIND },
204 { "bind-ro", required_argument, NULL, ARG_BIND_RO },
205 { "machine", required_argument, NULL, 'M' },
206 { "slice", required_argument, NULL, 'S' },
207 { "setenv", required_argument, NULL, ARG_SETENV },
208 { "selinux-context", required_argument, NULL, 'Z' },
209 { "selinux-apifs-context", required_argument, NULL, 'L' },
210 { "quiet", no_argument, NULL, 'q' },
211 { "share-system", no_argument, NULL, ARG_SHARE_SYSTEM },
212 { "register", required_argument, NULL, ARG_REGISTER },
213 { "keep-unit", no_argument, NULL, ARG_KEEP_UNIT },
214 { "network-interface", required_argument, NULL, ARG_NETWORK_INTERFACE },
/* Capabilities to add to / remove from the default arg_retain set. */
219 uint64_t plus = 0, minus = 0;
/* The leading '+' in the option string makes getopt_long() stop at the first
 * non-option argument, so options after PATH are left for the payload. */
224 while ((c = getopt_long(argc, argv, "+hD:u:bL:M:jS:Z:q", options, NULL)) >= 0) {
232 puts(PACKAGE_STRING);
233 puts(SYSTEMD_FEATURES);
/* The root directory is canonicalised right away. */
238 arg_directory = canonicalize_file_name(optarg);
239 if (!arg_directory) {
240 log_error("Invalid root directory: %m");
248 arg_user = strdup(optarg);
254 case ARG_NETWORK_INTERFACE:
/* NOTE(review): optarg is handed over as-is; whether strv_push() copies it is
 * not visible here - confirm before freeing the elements anywhere. */
255 if (strv_push(&arg_network_interfaces, optarg) < 0)
260 case ARG_PRIVATE_NETWORK:
261 arg_private_network = true;
269 r = sd_id128_from_string(optarg, &arg_uuid);
271 log_error("Invalid UUID: %s", optarg);
277 arg_slice = strdup(optarg);
284 if (isempty(optarg)) {
289 if (!hostname_is_valid(optarg)) {
290 log_error("Invalid machine name: %s", optarg);
295 arg_machine = strdup(optarg);
/* The SELinux contexts point into argv; they are not duplicated. */
303 arg_selinux_context = optarg;
307 arg_selinux_apifs_context = optarg;
311 arg_read_only = true;
315 case ARG_DROP_CAPABILITY: {
/* The argument is a comma-separated list of capability names; "all" selects
 * every bit. The same code serves --capability= (lines outside this excerpt
 * presumably hold that case label), told apart via c. */
319 FOREACH_WORD_SEPARATOR(word, length, optarg, ",", state) {
320 _cleanup_free_ char *t;
323 t = strndup(word, length);
327 if (streq(t, "all")) {
328 if (c == ARG_CAPABILITY)
329 plus = (uint64_t) -1;
331 minus = (uint64_t) -1;
333 if (cap_from_name(t, &cap) < 0) {
334 log_error("Failed to parse capability %s.", t);
338 if (c == ARG_CAPABILITY)
339 plus |= 1ULL << (uint64_t) cap;
341 minus |= 1ULL << (uint64_t) cap;
/* NOTE(review): the case label for this assignment is not visible; it is
 * presumably the handler for the short option -j. */
349 arg_link_journal = LINK_GUEST;
352 case ARG_LINK_JOURNAL:
353 if (streq(optarg, "auto"))
354 arg_link_journal = LINK_AUTO;
355 else if (streq(optarg, "no"))
356 arg_link_journal = LINK_NO;
357 else if (streq(optarg, "guest"))
358 arg_link_journal = LINK_GUEST;
359 else if (streq(optarg, "host"))
360 arg_link_journal = LINK_HOST;
362 log_error("Failed to parse link journal mode %s", optarg);
/* --bind= and --bind-ro= share one handler: a is the part before ':', b the
 * destination. Both must be absolute and are appended as a pair to the
 * matching list. */
370 _cleanup_free_ char *a = NULL, *b = NULL;
374 x = c == ARG_BIND ? &arg_bind : &arg_bind_ro;
376 e = strchr(optarg, ':');
378 a = strndup(optarg, e - optarg);
388 if (!path_is_absolute(a) || !path_is_absolute(b)) {
389 log_error("Invalid bind mount specification: %s", optarg);
393 r = strv_extend(x, a);
397 r = strv_extend(x, b);
407 if (!env_assignment_is_valid(optarg)) {
408 log_error("Environment variable assignment '%s' is not valid.", optarg);
/* strv_env_set() yields a new list in n; the previous arg_setenv is released
 * afterwards. */
412 n = strv_env_set(arg_setenv, optarg);
416 strv_free(arg_setenv);
425 case ARG_SHARE_SYSTEM:
426 arg_share_system = true;
430 r = parse_boolean(optarg);
432 log_error("Failed to parse --register= argument: %s", optarg);
440 arg_keep_unit = true;
447 assert_not_reached("Unhandled option");
/* Sharing the system namespaces switches machine registration off. */
451 if (arg_share_system)
452 arg_register = false;
454 if (arg_boot && arg_share_system) {
455 log_error("--boot and --share-system may not be combined.");
/* A successful owner-UID lookup for our own PID is taken to mean that we were
 * started from a user session. */
459 if (arg_keep_unit && cg_pid_get_owner_uid(0, NULL) >= 0) {
460 log_error("--keep-unit may not be used when invoked from a user session.");
/* Final set: defaults plus --capability=, plus CAP_NET_ADMIN when the network
 * is private, minus everything named by --drop-capability=. */
464 arg_retain = (arg_retain | plus | (arg_private_network ? 1ULL << CAP_NET_ADMIN : 0)) & ~minus;
/* Mounts the file systems listed in mount_table (/proc, /sys, /dev, /dev/pts,
 * /dev/shm, /run and the SELinux file system) below the container root.
 *
 * dest: container root; each table entry's "where" is appended to it.
 *
 * main() treats a negative return value as fatal. */
469 static int mount_all(const char *dest) {
471 typedef struct MountPoint {
/* Entries are handled in order. The fields read below are what, where,
 * options, flags and fatal. An entry whose first field is NULL remounts the
 * mount point created by the entry before it. */
480 static const MountPoint mount_table[] = {
481 { "proc", "/proc", "proc", NULL, MS_NOSUID|MS_NOEXEC|MS_NODEV, true },
482 { "/proc/sys", "/proc/sys", NULL, NULL, MS_BIND, true }, /* Bind mount first */
483 { NULL, "/proc/sys", NULL, NULL, MS_BIND|MS_RDONLY|MS_REMOUNT, true }, /* Then, make it r/o */
484 { "sysfs", "/sys", "sysfs", NULL, MS_RDONLY|MS_NOSUID|MS_NOEXEC|MS_NODEV, true },
485 { "tmpfs", "/dev", "tmpfs", "mode=755", MS_NOSUID|MS_STRICTATIME, true },
486 { "devpts", "/dev/pts", "devpts","newinstance,ptmxmode=0666,mode=620,gid=" STRINGIFY(TTY_GID), MS_NOSUID|MS_NOEXEC, true },
487 { "tmpfs", "/dev/shm", "tmpfs", "mode=1777", MS_NOSUID|MS_NODEV|MS_STRICTATIME, true },
488 { "tmpfs", "/run", "tmpfs", "mode=755", MS_NOSUID|MS_NODEV|MS_STRICTATIME, true },
490 { "/sys/fs/selinux", "/sys/fs/selinux", NULL, NULL, MS_BIND, false }, /* Bind mount first */
491 { NULL, "/sys/fs/selinux", NULL, NULL, MS_BIND|MS_RDONLY|MS_REMOUNT, false }, /* Then, make it r/o */
498 for (k = 0; k < ELEMENTSOF(mount_table); k++) {
499 _cleanup_free_ char *where = NULL;
501 _cleanup_free_ char *options = NULL;
/* Absolute path of the mount point inside the container tree. */
506 where = strjoin(dest, "/", mount_table[k].where, NULL);
510 t = path_is_mount_point(where, true);
512 log_error("Failed to detect whether %s is a mount point: %s", where, strerror(-t));
520 /* Skip this entry if it is not a remount. */
521 if (mount_table[k].what && t > 0)
/* The result of mkdir_p() is not checked on this line. */
524 mkdir_p(where, 0755);
/* With --selinux-apifs-context= the context is appended to the mount options
 * of tmpfs and devpts entries; other entries use the table options as-is. */
527 if (arg_selinux_apifs_context &&
528 (streq_ptr(mount_table[k].what, "tmpfs") || streq_ptr(mount_table[k].what, "devpts"))) {
529 options = strjoin(mount_table[k].options, ",context=\"", arg_selinux_apifs_context, "\"", NULL);
536 o = mount_table[k].options;
/* The "fatal" flag takes part in the error condition, so presumably only
 * failures of fatal entries are reported - part of the condition is on lines
 * outside this excerpt. */
539 if (mount(mount_table[k].what,
542 mount_table[k].flags,
544 mount_table[k].fatal) {
546 log_error("mount(%s) failed: %m", where);
/* Bind mounts host paths into the container tree.
 *
 * dest:  container root, prefixed to every destination path.
 * l:     string list walked in pairs: source path, destination path.
 * flags: extra mount flags; when non-zero each mount is remounted with
 *        MS_REMOUNT|MS_BIND|flags after the initial bind mount.
 *
 * main() treats a negative return value as fatal. */
556 static int mount_binds(const char *dest, char **l, unsigned long flags) {
559 STRV_FOREACH_PAIR(x, y, l) {
561 struct stat source_st, dest_st;
564 if (stat(*x, &source_st) < 0) {
565 log_error("failed to stat %s: %m", *x);
/* where = destination path inside the container tree. */
569 where = strappenda(dest, *y);
570 r = stat(where, &dest_st);
/* An existing destination must have the same file type as the source. */
572 if ((source_st.st_mode & S_IFMT) != (dest_st.st_mode & S_IFMT)) {
573 log_error("The file types of %s and %s do not match. Refusing bind mount",
/* A missing destination is created: first its parent directories ... */
577 } else if (errno == ENOENT) {
578 r = mkdir_parents_label(where, 0755);
580 log_error("Failed to bind mount %s: %s", *x, strerror(-r));
584 log_error("Failed to bind mount %s: %s", *x, strerror(errno));
587 /* Create the mount point, but be conservative -- refuse to create block
588 * and char devices. */
589 if (S_ISDIR(source_st.st_mode))
590 mkdir_label(where, 0755);
591 else if (S_ISFIFO(source_st.st_mode))
593 else if (S_ISSOCK(source_st.st_mode))
594 mknod(where, 0644 | S_IFSOCK, 0);
595 else if (S_ISREG(source_st.st_mode))
598 log_error("Refusing to create mountpoint for file: %s", *x);
602 if (mount(*x, where, "bind", MS_BIND, NULL) < 0) {
603 log_error("mount(%s) failed: %m", where);
/* Second pass applies the caller's flags (e.g. MS_RDONLY for --bind-ro=). */
607 if (flags && mount(NULL, where, NULL, MS_REMOUNT|MS_BIND|flags, NULL) < 0) {
608 log_error("mount(%s) failed: %m", where);
/* Points the container's /etc/localtime at the same time zone the host's
 * /etc/localtime symlink refers to, when that zone exists in the container.
 *
 * dest: container root.
 *
 * Conditions that prevent the update are logged as warnings; only a failing
 * symlink() is logged as an error. */
616 static int setup_timezone(const char *dest) {
617 _cleanup_free_ char *where = NULL, *p = NULL, *q = NULL, *check = NULL, *what = NULL;
623 /* Fix the timezone, if possible */
624 r = readlink_malloc("/etc/localtime", &p);
626 log_warning("/etc/localtime is not a symlink, not updating container timezone.");
/* z = zone name relative to the zoneinfo directory; both the relative and the
 * absolute link form are accepted. */
630 z = path_startswith(p, "../usr/share/zoneinfo/");
632 z = path_startswith(p, "/usr/share/zoneinfo/");
634 log_warning("/etc/localtime does not point into /usr/share/zoneinfo/, not updating container timezone.");
638 where = strappend(dest, "/etc/localtime");
/* y = zone the container currently points at, extracted the same way. */
642 r = readlink_malloc(where, &q);
644 y = path_startswith(q, "../usr/share/zoneinfo/");
646 y = path_startswith(q, "/usr/share/zoneinfo/");
649 /* Already pointing to the right place? Then do nothing .. */
650 if (y && streq(y, z))
/* The zone file has to exist inside the container tree. */
654 check = strjoin(dest, "/usr/share/zoneinfo/", z, NULL);
658 if (access(check, F_OK) < 0) {
659 log_warning("Timezone %s does not exist in container, not updating container timezone.", z);
/* The new link uses the relative form. */
663 what = strappend("../usr/share/zoneinfo/", z);
668 if (symlink(what, where) < 0) {
669 log_error("Failed to correct timezone of container: %m");
/* Copies the host's /etc/resolv.conf into the container tree. The
 * arg_private_network check below presumably skips the copy when the network
 * is private - its body is on a line outside this excerpt.
 *
 * dest: container root. */
676 static int setup_resolv_conf(const char *dest) {
677 char _cleanup_free_ *where = NULL;
681 if (arg_private_network)
684 /* Fix resolv.conf, if possible */
685 where = strappend(dest, "/etc/resolv.conf");
689 /* We don't really care for the results of this really. If it
690 * fails, it fails, but meh... */
691 copy_file("/etc/resolv.conf", where, O_TRUNC|O_NOFOLLOW);
/* Gives the container its own boot ID: a random ID is written to a file below
 * the container's /dev and that file is bind mounted over
 * /proc/sys/kernel/random/boot_id.
 *
 * dest: container root. */
696 static int setup_boot_id(const char *dest) {
697 _cleanup_free_ char *from = NULL, *to = NULL;
704 if (arg_share_system)
707 /* Generate a new randomized boot ID, so that each boot-up of
708 * the container gets a new one */
/* from = file holding the ID, to = mount target. */
710 from = strappend(dest, "/dev/proc-sys-kernel-random-boot-id");
711 to = strappend(dest, "/proc/sys/kernel/random/boot_id");
715 r = sd_id128_randomize(&rnd);
717 log_error("Failed to generate random boot id: %s", strerror(-r));
/* Format the 128-bit ID in the dashed 8-4-4-4-12 hexadecimal layout. */
721 snprintf(as_uuid, sizeof(as_uuid),
722 "%02x%02x%02x%02x-%02x%02x-%02x%02x-%02x%02x-%02x%02x%02x%02x%02x%02x",
723 SD_ID128_FORMAT_VAL(rnd));
724 char_array_0(as_uuid);
726 r = write_string_file(from, as_uuid);
728 log_error("Failed to write boot id: %s", strerror(-r));
/* A failing bind mount is an error; failing to make it read-only afterwards
 * only produces a warning. */
732 if (mount(from, to, "bind", MS_BIND, NULL) < 0) {
733 log_error("Failed to bind mount boot id: %m");
735 } else if (mount(from, to, "bind", MS_BIND|MS_REMOUNT|MS_RDONLY, NULL))
736 log_warning("Failed to make boot id read-only: %m");
/* Creates device nodes below the container's /dev that match the host nodes
 * named in devnodes[] (the list itself is on lines outside this excerpt).
 *
 * dest: container root.
 *
 * A host node that does not exist (ENOENT) is not an error. Any other stat()
 * failure, a node that is neither a character nor a block device, and a
 * failing mknod() are logged as errors. */
742 static int copy_devnodes(const char *dest) {
744 static const char devnodes[] =
754 _cleanup_umask_ mode_t u;
760 NULSTR_FOREACH(d, devnodes) {
761 _cleanup_free_ char *from = NULL, *to = NULL;
/* from = node on the host, to = node to create inside the container tree. */
764 from = strappend("/dev/", d);
765 to = strjoin(dest, "/dev/", d, NULL);
769 if (stat(from, &st) < 0) {
771 if (errno != ENOENT) {
772 log_error("Failed to stat %s: %m", from);
776 } else if (!S_ISCHR(st.st_mode) && !S_ISBLK(st.st_mode)) {
778 log_error("%s is not a char or block device, cannot copy", from);
/* The copy keeps the mode and the device number of the host node. */
781 } else if (mknod(to, st.st_mode, st.st_rdev) < 0) {
/* Report the node that could not be created, not the container root. */
783 log_error("mknod(%s) failed: %m", to);
/* Creates /dev/ptmx in the container tree as a symlink to pts/ptmx.
 *
 * dest: container root. */
791 static int setup_ptmx(const char *dest) {
792 _cleanup_free_ char *p = NULL;
794 p = strappend(dest, "/dev/ptmx");
798 if (symlink("pts/ptmx", p) < 0) {
799 log_error("Failed to create /dev/ptmx symlink: %m");
/* Makes the given terminal available as /dev/console inside the container
 * tree: the terminal is checked to be a character device, its access mode and
 * ownership are reset to 0600 root:root, a device node is created at
 * <dest>/dev/console and the terminal is bind mounted over that node.
 *
 * dest:    container root.
 * console: path of the terminal device to expose. */
806 static int setup_dev_console(const char *dest, const char *console) {
808 _cleanup_free_ char *to = NULL;
810 _cleanup_umask_ mode_t u;
817 if (stat(console, &st) < 0) {
818 log_error("Failed to stat %s: %m", console);
821 } else if (!S_ISCHR(st.st_mode)) {
822 log_error("/dev/console is not a char device");
826 r = chmod_and_chown(console, 0600, 0, 0);
828 log_error("Failed to correct access mode for TTY: %s", strerror(-r));
832 if (asprintf(&to, "%s/dev/console", dest) < 0)
835 /* We need to bind mount the right tty to /dev/console since
836 * ptys can only exist on pts file systems. To have something
837 * to bind mount things on we create a device node first, that
838 * has the right major/minor (note that the major minor
839 * doesn't actually matter here, since we mount it over
842 if (mknod(to, (st.st_mode & ~07777) | 0600, st.st_rdev) < 0) {
843 log_error("mknod() for /dev/console failed: %m");
847 if (mount(console, to, "bind", MS_BIND, NULL) < 0) {
848 log_error("Bind mount for /dev/console failed: %m");
/* Provides a /proc/kmsg replacement for the container: a FIFO is created as
 * <dest>/dev/kmsg, bind mounted over <dest>/proc/kmsg, opened, and the open
 * file descriptor is sent over kmsg_socket as SCM_RIGHTS ancillary data.
 *
 * dest:        container root.
 * kmsg_socket: socket the FIFO descriptor is sent on; must be >= 0. */
855 static int setup_kmsg(const char *dest, int kmsg_socket) {
856 _cleanup_free_ char *from = NULL, *to = NULL;
858 _cleanup_umask_ mode_t u;
/* Control buffer sized for one file descriptor. */
860 struct cmsghdr cmsghdr;
861 uint8_t buf[CMSG_SPACE(sizeof(int))];
864 .msg_control = &control,
865 .msg_controllen = sizeof(control),
867 struct cmsghdr *cmsg;
870 assert(kmsg_socket >= 0);
874 /* We create the kmsg FIFO as /dev/kmsg, but immediately
875 * delete it after bind mounting it to /proc/kmsg. While FIFOs
876 * on the reading side behave very similar to /proc/kmsg,
877 * their writing side behaves differently from /dev/kmsg in
878 * that writing blocks when nothing is reading. In order to
879 * avoid any problems with containers deadlocking due to this
880 * we simply make /dev/kmsg unavailable to the container. */
881 if (asprintf(&from, "%s/dev/kmsg", dest) < 0 ||
882 asprintf(&to, "%s/proc/kmsg", dest) < 0)
885 if (mkfifo(from, 0600) < 0) {
886 log_error("mkfifo() for /dev/kmsg failed: %m");
890 r = chmod_and_chown(from, 0600, 0, 0);
892 log_error("Failed to correct access mode for /dev/kmsg: %s", strerror(-r));
896 if (mount(from, to, "bind", MS_BIND, NULL) < 0) {
897 log_error("Bind mount for /proc/kmsg failed: %m");
/* Opened read-write and non-blocking. */
901 fd = open(from, O_RDWR|O_NDELAY|O_CLOEXEC);
903 log_error("Failed to open fifo: %m");
/* Build the SCM_RIGHTS control message carrying fd. */
907 cmsg = CMSG_FIRSTHDR(&mh);
908 cmsg->cmsg_level = SOL_SOCKET;
909 cmsg->cmsg_type = SCM_RIGHTS;
910 cmsg->cmsg_len = CMSG_LEN(sizeof(int));
911 memcpy(CMSG_DATA(cmsg), &fd, sizeof(int));
913 mh.msg_controllen = cmsg->cmsg_len;
915 /* Store away the fd in the socket, so that it stays open as
916 * long as we run the child */
917 k = sendmsg(kmsg_socket, &mh, MSG_DONTWAIT|MSG_NOSIGNAL);
/* The local descriptor is closed right after the send attempt. */
918 close_nointr_nofail(fd);
921 log_error("Failed to send FIFO fd: %m");
925 /* And now make the FIFO unavailable as /dev/kmsg... */
/* Sets the host name to arg_machine via sethostname(). The arg_share_system
 * check presumably skips this when namespaces are shared with the host - its
 * body is on a line outside this excerpt. */
930 static int setup_hostname(void) {
932 if (arg_share_system)
935 if (sethostname(arg_machine, strlen(arg_machine)) < 0)
/* Links the journal directory of the container with the one on the host,
 * according to arg_link_journal.
 *
 * directory: container root.
 *
 * The per-machine journal path is /var/log/journal/<machine-id>, once on the
 * host (p) and once inside the container tree (q). In LINK_GUEST mode the host
 * path becomes a symlink to the container path. Otherwise the host path is
 * bind mounted onto the container path; in LINK_HOST mode the host directory
 * is created first. In LINK_AUTO mode several problems are tolerated that are
 * errors in the explicit modes. */
941 static int setup_journal(const char *directory) {
942 sd_id128_t machine_id, this_id;
943 _cleanup_free_ char *p = NULL, *b = NULL, *q = NULL, *d = NULL;
/* Read the container's machine ID from its /etc/machine-id. */
947 p = strappend(directory, "/etc/machine-id");
951 r = read_one_line_file(p, &b);
952 if (r == -ENOENT && arg_link_journal == LINK_AUTO)
955 log_error("Failed to read machine ID from %s: %s", p, strerror(-r));
960 if (isempty(id) && arg_link_journal == LINK_AUTO)
963 /* Verify validity */
964 r = sd_id128_from_string(id, &machine_id);
966 log_error("Failed to parse machine ID from %s: %s", p, strerror(-r));
970 r = sd_id128_get_machine(&this_id);
972 log_error("Failed to retrieve machine ID: %s", strerror(-r));
/* Identical IDs on host and container: journals are never linked. Logged as
 * a warning in auto mode, as an error otherwise. */
976 if (sd_id128_equal(machine_id, this_id)) {
977 log_full(arg_link_journal == LINK_AUTO ? LOG_WARNING : LOG_ERR,
978 "Host and machine ids are equal (%s): refusing to link journals", id);
979 if (arg_link_journal == LINK_AUTO)
985 if (arg_link_journal == LINK_NO)
/* p = journal directory on the host, q = the same path inside the container.
 * NOTE(review): p is re-assigned here while it is declared _cleanup_free_;
 * whether the earlier string is released first is on lines not shown. */
989 p = strappend("/var/log/journal/", id);
990 q = strjoin(directory, "/var/log/journal/", id, NULL);
/* Paths that are already mount points are left alone; outside auto mode this
 * is reported as an error. */
994 if (path_is_mount_point(p, false) > 0) {
995 if (arg_link_journal != LINK_AUTO) {
996 log_error("%s: already a mount point, refusing to use for journal", p);
1003 if (path_is_mount_point(q, false) > 0) {
1004 if (arg_link_journal != LINK_AUTO) {
1005 log_error("%s: already a mount point, refusing to use for journal", q);
/* Inspect what currently lives at the host path: a symlink (d receives its
 * target), something that is not a symlink (-EINVAL), or nothing (-ENOENT). */
1012 r = readlink_and_make_absolute(p, &d);
1014 if ((arg_link_journal == LINK_GUEST ||
1015 arg_link_journal == LINK_AUTO) &&
1018 r = mkdir_p(q, 0755);
1020 log_warning("failed to create directory %s: %m", q);
1024 if (unlink(p) < 0) {
1025 log_error("Failed to remove symlink %s: %m", p);
1028 } else if (r == -EINVAL) {
1030 if (arg_link_journal == LINK_GUEST &&
1033 if (errno == ENOTDIR) {
1034 log_error("%s already exists and is neither a symlink nor a directory", p);
1037 log_error("Failed to remove %s: %m", p);
1041 } else if (r != -ENOENT) {
1042 log_error("readlink(%s) failed: %m", p);
/* Guest mode: the host path becomes a symlink into the container tree. */
1046 if (arg_link_journal == LINK_GUEST) {
1048 if (symlink(q, p) < 0) {
1049 log_error("Failed to symlink %s to %s: %m", q, p);
1053 r = mkdir_p(q, 0755);
1055 log_warning("failed to create directory %s: %m", q);
/* Host mode creates the host directory; the other remaining mode only
 * proceeds when the host directory is accessible. */
1059 if (arg_link_journal == LINK_HOST) {
1060 r = mkdir_p(p, 0755);
1062 log_error("Failed to create %s: %m", p);
1066 } else if (access(p, F_OK) < 0)
/* dir_is_empty() returning 0 is reported as "not empty". */
1069 if (dir_is_empty(q) == 0) {
1070 log_error("%s not empty.", q);
1074 r = mkdir_p(q, 0755);
1076 log_error("Failed to create %s: %m", q);
/* Finally expose the host directory inside the container. */
1080 if (mount(p, q, "bind", MS_BIND, NULL) < 0) {
1081 log_error("Failed to bind mount journal from host into guest: %m");
/* Creates <dest>/dev/kdbus and bind mounts the given kdbus domain path onto
 * it.
 *
 * dest: container root.
 * path: host path of the kdbus domain; main() passes kdbus_domain. */
1088 static int setup_kdbus(const char *dest, const char *path) {
1094 p = strappenda(dest, "/dev/kdbus");
1095 if (mkdir(p, 0755) < 0) {
1096 log_error("Failed to create kdbus path: %m");
1100 if (mount(path, p, "bind", MS_BIND, NULL) < 0) {
1101 log_error("Failed to mount kdbus domain path: %m");
/* Passes the complement of arg_retain, i.e. every capability that is not to be
 * retained, to capability_bounding_set_drop() and returns its result. */
1108 static int drop_capabilities(void) {
1109 return capability_bounding_set_drop(~arg_retain, false);
/* Registers the container with org.freedesktop.machine1 over the system bus.
 *
 * pid: PID of the container's leader process (how it is placed in the message
 *      is on lines outside this excerpt).
 *
 * Two different Manager calls are made depending on --keep-unit; the method
 * names and signatures are not visible here. Both pass arg_uuid and
 * arg_directory. */
1112 static int register_machine(pid_t pid) {
1113 _cleanup_bus_error_free_ sd_bus_error error = SD_BUS_ERROR_NULL;
1114 _cleanup_bus_unref_ sd_bus *bus = NULL;
1120 r = sd_bus_default_system(&bus);
1122 log_error("Failed to open system bus: %s", strerror(-r));
1126 if (arg_keep_unit) {
1127 r = sd_bus_call_method(
1129 "org.freedesktop.machine1",
1130 "/org/freedesktop/machine1",
1131 "org.freedesktop.machine1.Manager",
1137 SD_BUS_MESSAGE_APPEND_ID128(arg_uuid),
1141 strempty(arg_directory));
1143 r = sd_bus_call_method(
1145 "org.freedesktop.machine1",
1146 "/org/freedesktop/machine1",
1147 "org.freedesktop.machine1.Manager",
1153 SD_BUS_MESSAGE_APPEND_ID128(arg_uuid),
1157 strempty(arg_directory),
/* The leading boolean presumably sizes an array of properties, so that
 * "Slice" is only sent when a slice was requested - confirm against the
 * signature string, which is not shown. */
1158 !isempty(arg_slice), "Slice", "s", arg_slice);
1162 log_error("Failed to register machine: %s", bus_error_message(&error, r));
/* Asks org.freedesktop.machine1 to terminate the machine belonging to pid:
 * a first Manager call yields the machine's object path, a second call is
 * made on the Machine interface. Failures of either call are only logged at
 * debug level.
 *
 * pid: PID used to look the machine up (argument lines not shown). */
1169 static int terminate_machine(pid_t pid) {
1170 _cleanup_bus_error_free_ sd_bus_error error = SD_BUS_ERROR_NULL;
1171 _cleanup_bus_message_unref_ sd_bus_message *reply = NULL;
1172 _cleanup_bus_unref_ sd_bus *bus = NULL;
1179 r = sd_bus_default_system(&bus);
1181 log_error("Failed to open system bus: %s", strerror(-r));
1185 r = sd_bus_call_method(
1187 "org.freedesktop.machine1",
1188 "/org/freedesktop/machine1",
1189 "org.freedesktop.machine1.Manager",
1196 /* Note that the machine might already have been
1197 * cleaned up automatically, hence don't consider it a
1198 * failure if we cannot get the machine object. */
1199 log_debug("Failed to get machine: %s", bus_error_message(&error, r));
/* The reply carries the object path of the machine. */
1203 r = sd_bus_message_read(reply, "o", &path);
1205 return bus_log_parse_error(r);
1207 r = sd_bus_call_method(
1209 "org.freedesktop.machine1",
1211 "org.freedesktop.machine1.Machine",
1217 log_debug("Failed to terminate machine: %s", bus_error_message(&error, r));
/* Resets the audit login UID of the calling process by writing 4294967295 to
 * /proc/self/loginuid, unless it already has that value. A failing write is
 * reported with a long explanation for the user; the message announces a five
 * second sleep (the sleep itself is on lines outside this excerpt). */
1224 static int reset_audit_loginuid(void) {
1225 _cleanup_free_ char *p = NULL;
1228 if (arg_share_system)
1231 r = read_one_line_file("/proc/self/loginuid", &p);
1235 log_error("Failed to read /proc/self/loginuid: %s", strerror(-r));
1239 /* Already reset? */
1240 if (streq(p, "4294967295"))
1243 r = write_string_file("/proc/self/loginuid", "4294967295");
/* The advice used to read "or to off auditing"; the missing verb is added. */
1245 log_error("Failed to reset audit login UID. This probably means that your kernel is too\n"
1246 "old and you have audit enabled. Note that the auditing subsystem is known to\n"
1247 "be incompatible with containers on old kernels. Please make sure to upgrade\n"
1248 "your kernel or to turn off auditing with 'audit=0' on the kernel command line before\n"
1249 "using systemd-nspawn. Sleeping for 5s... (%s)\n", strerror(-r));
/* Moves the interfaces named with --network-interface= into the network
 * namespace of the given process. For each interface an RTM_NEWLINK message
 * carrying IFLA_NET_NS_PID = pid is sent over a NETLINK_ROUTE connection.
 *
 * pid: process whose network namespace receives the interfaces.
 *
 * The two checks at the top presumably return early when the network is not
 * private or no interface was requested (bodies not shown). */
1257 static int move_network_interfaces(pid_t pid) {
1258 _cleanup_rtnl_unref_ sd_rtnl *rtnl = NULL;
1262 if (!arg_private_network)
1265 if (strv_isempty(arg_network_interfaces))
1268 r = sd_rtnl_open(NETLINK_ROUTE, &rtnl);
1270 log_error("Failed to connect to netlink: %s", strerror(-r));
1274 STRV_FOREACH(i, arg_network_interfaces) {
1275 _cleanup_rtnl_message_unref_ sd_rtnl_message *m = NULL;
/* Interface name -> index. */
1278 ifi = if_nametoindex(*i);
1280 log_error("Failed to resolve interface %s: %m", *i);
1284 r = sd_rtnl_message_new_link(RTM_NEWLINK, ifi, &m);
1286 log_error("Failed to allocate netlink message: %s", strerror(-r));
1290 r = sd_rtnl_message_append_u32(m, IFLA_NET_NS_PID, pid);
1292 log_error("Failed to append namespace PID to netlink message: %s", strerror(-r));
1296 r = sd_rtnl_call(rtnl, m, 0, NULL);
1298 log_error("Failed to move interface to namespace: %s", strerror(-r));
/* Program entry point.
 *
 * Outline, as far as visible in this excerpt: parse the command line, work out
 * the container directory and machine name, check preconditions, allocate a
 * pseudo terminal, optionally create a kdbus domain, then clone() a child into
 * new namespaces. The child builds the container file system view, switches
 * root, drops capabilities and credentials, assembles the environment and
 * executes init, the given command or a shell. The parent registers the
 * machine, moves network interfaces, releases the child through an eventfd,
 * relays the terminal and finally maps the child's exit status to a log
 * message and its own result. */
1306 int main(int argc, char *argv[]) {
1308 int r = EXIT_FAILURE, k;
1309 _cleanup_close_ int master = -1, kdbus_fd = -1, sync_fd = -1;
1311 const char *console = NULL;
1313 _cleanup_close_pipe_ int kmsg_socket_pair[2] = { -1, -1 };
1314 _cleanup_fdset_free_ FDSet *fds = NULL;
1315 _cleanup_free_ char *kdbus_domain = NULL;
1317 log_parse_environment();
1320 k = parse_argv(argc, argv);
/* Container directory: the -D argument made absolute, or else the current
 * working directory. */
1328 if (arg_directory) {
1331 p = path_make_absolute_cwd(arg_directory);
1332 free(arg_directory);
1335 arg_directory = get_current_dir_name();
1337 if (!arg_directory) {
1338 log_error("Failed to determine path, please use -D.");
1342 path_kill_slashes(arg_directory);
/* Machine name derived from the last path component of the directory
 * (presumably only when -M was not given; the guard is not shown). */
1345 arg_machine = strdup(basename(arg_directory));
1351 hostname_cleanup(arg_machine, false);
1352 if (isempty(arg_machine)) {
1353 log_error("Failed to determine machine name automatically, please use -M.");
/* Preconditions: root privileges, a systemd-booted host, a directory other
 * than "/", and for --boot something that looks like an OS tree. */
1358 if (geteuid() != 0) {
1359 log_error("Need to be root.");
1363 if (sd_booted() <= 0) {
1364 log_error("Not running on a systemd system.");
1368 if (path_equal(arg_directory, "/")) {
1369 log_error("Spawning container on root directory not supported.");
1373 if (arg_boot && path_is_os_tree(arg_directory) <= 0) {
1374 log_error("Directory %s doesn't look like an OS root directory (/etc/os-release is missing). Refusing.", arg_directory);
/* File descriptors passed in via sd_listen_fds() are collected so they can be
 * handed on to the payload. */
1379 n_fd_passed = sd_listen_fds(false);
1380 if (n_fd_passed > 0) {
1381 k = fdset_new_listen_fds(&fds, false);
1383 log_error("Failed to collect file descriptors: %s", strerror(-k));
1387 fdset_close_others(fds);
/* Pseudo terminal: master stays with us, its slave name becomes the
 * container's console. */
1390 master = posix_openpt(O_RDWR|O_NOCTTY|O_CLOEXEC|O_NDELAY);
1392 log_error("Failed to acquire pseudo tty: %m");
1396 console = ptsname(master);
1398 log_error("Failed to determine tty name: %m");
1403 log_info("Spawning container %s on %s. Press ^] three times within 1s to abort execution.", arg_machine, arg_directory);
1405 if (unlockpt(master) < 0) {
1406 log_error("Failed to unlock tty: %m");
/* kdbus is only set up when its control node exists on the host. */
1411 if (access("/dev/kdbus/control", F_OK) >= 0) {
1413 if (arg_share_system) {
1414 kdbus_domain = strdup("/dev/kdbus");
1415 if (!kdbus_domain) {
1422 ns = strappenda("machine-", arg_machine);
1423 kdbus_fd = bus_kernel_create_domain(ns, &kdbus_domain);
/* The result of bus_kernel_create_domain() is in kdbus_fd, so that is the
 * value to turn into an error string; on the visible lines r has not been
 * assigned since its EXIT_FAILURE initialiser.
 * NOTE(review): the condition guarding this message is on a line outside this
 * excerpt - confirm that it tests kdbus_fd as well. */
1425 log_debug("Failed to create kdbus domain: %s", strerror(-kdbus_fd));
1427 log_debug("Successfully created kdbus domain as %s", kdbus_domain);
/* Socket pair over which the child hands back the kmsg FIFO descriptor. */
1431 if (socketpair(AF_UNIX, SOCK_DGRAM|SOCK_NONBLOCK|SOCK_CLOEXEC, 0, kmsg_socket_pair) < 0) {
1432 log_error("Failed to create kmsg socket pair: %m");
1436 sd_notify(0, "READY=1");
/* These signals are blocked here; the mask is later passed to process_pty(). */
1438 assert_se(sigemptyset(&mask) == 0);
1439 sigset_add_many(&mask, SIGCHLD, SIGWINCH, SIGTERM, SIGINT, -1);
1440 assert_se(sigprocmask(SIG_BLOCK, &mask, NULL) == 0);
/* eventfd used to hold the child back until the parent has finished its part
 * (the child reads it, the parent writes it). */
1445 sync_fd = eventfd(0, EFD_CLOEXEC);
1447 log_error("Failed to create event fd: %m");
/* New mount namespace always; IPC, PID and UTS namespaces unless
 * --share-system; network namespace with --private-network. */
1451 pid = syscall(__NR_clone,
1452 SIGCHLD|CLONE_NEWNS|
1453 (arg_share_system ? 0 : CLONE_NEWIPC|CLONE_NEWPID|CLONE_NEWUTS)|
1454 (arg_private_network ? CLONE_NEWNET : 0), NULL);
1456 if (errno == EINVAL)
1457 log_error("clone() failed, do you have namespace support enabled in your kernel? (You need UTS, IPC, PID and NET namespacing built in): %m");
1459 log_error("clone() failed: %m");
/* --- What follows up to _exit() is presumably the child branch (the pid
 * test is on a line outside this excerpt). --- */
1466 const char *home = NULL;
1467 uid_t uid = (uid_t) -1;
1468 gid_t gid = (gid_t) -1;
/* Environment template; the NULL slots are filled in further down. */
1470 const char *envp[] = {
1471 "PATH=" DEFAULT_PATH_SPLIT_USR,
1472 "container=systemd-nspawn", /* LXC sets container=lxc, so follow the scheme here */
1477 NULL, /* container_uuid */
1478 NULL, /* LISTEN_FDS */
1479 NULL, /* LISTEN_PID */
1485 envp[n_env] = strv_find_prefix(environ, "TERM=");
1489 close_nointr_nofail(master);
/* Replace stdin/stdout/stderr with the container console. */
1492 close_nointr(STDIN_FILENO);
1493 close_nointr(STDOUT_FILENO);
1494 close_nointr(STDERR_FILENO);
1496 close_nointr_nofail(kmsg_socket_pair[0]);
1497 kmsg_socket_pair[0] = -1;
1499 reset_all_signal_handlers();
1501 assert_se(sigemptyset(&mask) == 0);
1502 assert_se(sigprocmask(SIG_SETMASK, &mask, NULL) == 0);
1504 k = open_terminal(console, O_RDWR);
1505 if (k != STDIN_FILENO) {
1507 close_nointr_nofail(k);
1511 log_error("Failed to open console: %s", strerror(-k));
1515 if (dup2(STDIN_FILENO, STDOUT_FILENO) != STDOUT_FILENO ||
1516 dup2(STDIN_FILENO, STDERR_FILENO) != STDERR_FILENO) {
1517 log_error("Failed to duplicate console: %m");
1522 log_error("setsid() failed: %m");
1526 if (reset_audit_loginuid() < 0)
/* Request SIGKILL for the child when the parent dies. */
1529 if (prctl(PR_SET_PDEATHSIG, SIGKILL) < 0) {
1530 log_error("PR_SET_PDEATHSIG failed: %m");
1534 /* Mark everything as slave, so that we still
1535 * receive mounts from the real root, but don't
1536 * propagate mounts to the real root. */
1537 if (mount(NULL, "/", NULL, MS_SLAVE|MS_REC, NULL) < 0) {
1538 log_error("MS_SLAVE|MS_REC failed: %m");
1542 /* Turn directory into bind mount */
1543 if (mount(arg_directory, arg_directory, "bind", MS_BIND|MS_REC, NULL) < 0) {
1544 log_error("Failed to make bind mount.");
1549 if (mount(arg_directory, arg_directory, "bind", MS_BIND|MS_REMOUNT|MS_RDONLY|MS_REC, NULL) < 0) {
1550 log_error("Failed to make read-only.");
/* Populate the container tree; every helper is fatal on a negative return. */
1554 if (mount_all(arg_directory) < 0)
1557 if (copy_devnodes(arg_directory) < 0)
1560 if (setup_ptmx(arg_directory) < 0)
1563 dev_setup(arg_directory);
1565 if (setup_dev_console(arg_directory, console) < 0)
1568 if (setup_kmsg(arg_directory, kmsg_socket_pair[1]) < 0)
1571 close_nointr_nofail(kmsg_socket_pair[1]);
1572 kmsg_socket_pair[1] = -1;
1574 if (setup_boot_id(arg_directory) < 0)
1577 if (setup_timezone(arg_directory) < 0)
1580 if (setup_resolv_conf(arg_directory) < 0)
1583 if (setup_journal(arg_directory) < 0)
1586 if (mount_binds(arg_directory, arg_bind, 0) < 0)
1589 if (mount_binds(arg_directory, arg_bind_ro, MS_RDONLY) < 0)
1592 if (setup_kdbus(arg_directory, kdbus_domain) < 0)
/* Switch root: move the container mount onto "/", then chroot into it. */
1595 if (chdir(arg_directory) < 0) {
1596 log_error("chdir(%s) failed: %m", arg_directory);
1600 if (mount(arg_directory, "/", NULL, MS_MOVE, NULL) < 0) {
1601 log_error("mount(MS_MOVE) failed: %m");
1605 if (chroot(".") < 0) {
1606 log_error("chroot() failed: %m");
1610 if (chdir("/") < 0) {
1611 log_error("chdir() failed: %m");
1617 if (arg_private_network)
1620 if (drop_capabilities() < 0) {
1621 log_error("drop_capabilities() failed: %m");
1627 /* Note that this resolves user names
1628 * inside the container, and hence
1629 * accesses the NSS modules from the
1630 * container and not the host. This is
1633 if (get_user_creds((const char**)&arg_user, &uid, &gid, &home, NULL) < 0) {
1634 log_error("get_user_creds() failed: %m");
1638 if (mkdir_parents_label(home, 0775) < 0) {
1639 log_error("mkdir_parents_label() failed: %m");
1643 if (mkdir_safe_label(home, 0775, uid, gid) < 0) {
1644 log_error("mkdir_safe_label() failed: %m");
1648 if (initgroups((const char*)arg_user, gid) < 0) {
1649 log_error("initgroups() failed: %m");
1653 if (setresgid(gid, gid, gid) < 0) {
1654 log_error("setresgid() failed: %m");
1658 if (setresuid(uid, uid, uid) < 0) {
1659 log_error("setresuid() failed: %m");
1663 /* Reset everything fully to 0, just in case */
1665 if (setgroups(0, NULL) < 0) {
1666 log_error("setgroups() failed: %m");
/* The messages below name the calls that are actually made (setresgid() and
 * setresuid()). */
1670 if (setresgid(0, 0, 0) < 0) {
1671 log_error("setresgid() failed: %m");
1675 if (setresuid(0, 0, 0) < 0) {
1676 log_error("setresuid() failed: %m");
/* Fill the free environment slots: HOME, USER, LOGNAME, then the optional
 * container_uuid and LISTEN_* entries. */
1681 if ((asprintf((char**)(envp + n_env++), "HOME=%s", home ? home: "/root") < 0) ||
1682 (asprintf((char**)(envp + n_env++), "USER=%s", arg_user ? arg_user : "root") < 0) ||
1683 (asprintf((char**)(envp + n_env++), "LOGNAME=%s", arg_user ? arg_user : "root") < 0)) {
1688 if (!sd_id128_equal(arg_uuid, SD_ID128_NULL)) {
1689 if (asprintf((char**)(envp + n_env++), "container_uuid=" SD_ID128_FORMAT_STR, SD_ID128_FORMAT_VAL(arg_uuid)) < 0) {
/* Passed-in descriptors must survive exec, and the payload is told that they
 * are meant for PID 1. */
1695 if (fdset_size(fds) > 0) {
1696 k = fdset_cloexec(fds, false);
1698 log_error("Failed to unset O_CLOEXEC for file descriptors.");
1702 if ((asprintf((char **)(envp + n_env++), "LISTEN_FDS=%u", n_fd_passed) < 0) ||
1703 (asprintf((char **)(envp + n_env++), "LISTEN_PID=1") < 0)) {
/* Block until the parent signals through the eventfd. */
1711 eventfd_read(sync_fd, &x);
1712 close_nointr_nofail(sync_fd);
/* --setenv= assignments are merged on top of the built-in environment. */
1715 if (!strv_isempty(arg_setenv)) {
1718 n = strv_env_merge(2, envp, arg_setenv);
1726 env_use = (char**) envp;
1729 if (arg_selinux_context)
1730 if (setexeccon(arg_selinux_context) < 0)
1731 log_error("setexeccon(\"%s\") failed: %m", arg_selinux_context);
1737 /* Automatically search for the init system */
/* a[] = candidate init path followed by the remaining command-line arguments
 * and their terminating NULL; candidates are tried in order. */
1739 l = 1 + argc - optind;
1740 a = newa(char*, l + 1);
1741 memcpy(a + 1, argv + optind, l * sizeof(char*));
1743 a[0] = (char*) "/usr/lib/systemd/systemd";
1744 execve(a[0], a, env_use);
1746 a[0] = (char*) "/lib/systemd/systemd";
1747 execve(a[0], a, env_use);
1749 a[0] = (char*) "/sbin/init";
1750 execve(a[0], a, env_use);
/* Without --boot: run the given command, or a login shell when none is given. */
1751 } else if (argc > optind)
1752 execvpe(argv[optind], argv + optind, env_use);
1754 chdir(home ? home : "/root");
1755 execle("/bin/bash", "-bash", NULL, env_use);
/* Only reached when every exec attempt failed. */
1758 log_error("execv() failed: %m");
1761 _exit(EXIT_FAILURE);
/* --- Parent side. --- */
1767 r = register_machine(pid);
1771 r = move_network_interfaces(pid);
/* Release the child, which is waiting in eventfd_read(). */
1775 eventfd_write(sync_fd, 1);
1776 close_nointr_nofail(sync_fd);
/* Relay between our terminal and the pty until the container is done. */
1779 k = process_pty(master, &mask, arg_boot ? pid : 0, SIGRTMIN+3);
1788 /* Kill if it is not dead yet anyway */
1789 terminate_machine(pid);
1791 /* Redundant, but better safe than sorry */
1794 k = wait_for_terminate(pid, &status);
/* Map the way the child ended to a message: plain exit (its status becomes
 * r), SIGINT = shut down, SIGHUP = reboot, any other signal or dump = error. */
1802 if (status.si_code == CLD_EXITED) {
1803 r = status.si_status;
1804 if (status.si_status != 0) {
1805 log_error("Container %s failed with error code %i.", arg_machine, status.si_status);
1810 log_debug("Container %s exited successfully.", arg_machine);
1812 } else if (status.si_code == CLD_KILLED &&
1813 status.si_status == SIGINT) {
1816 log_info("Container %s has been shut down.", arg_machine);
1819 } else if (status.si_code == CLD_KILLED &&
1820 status.si_status == SIGHUP) {
1823 log_info("Container %s is being rebooted.", arg_machine);
1825 } else if (status.si_code == CLD_KILLED ||
1826 status.si_code == CLD_DUMPED) {
1828 log_error("Container %s terminated by signal %s.", arg_machine, signal_to_string(status.si_status));
1832 log_error("Container %s failed due to unknown reason.", arg_machine);
/* Only the array of interface names is released here, not its elements. */
1842 free(arg_directory);
1845 free(arg_network_interfaces);