1 /*-*- Mode: C; c-basic-offset: 8; indent-tabs-mode: nil -*-*/
4 This file is part of systemd.
6 Copyright 2010 Lennart Poettering
8 systemd is free software; you can redistribute it and/or modify it
9 under the terms of the GNU Lesser General Public License as published by
10 the Free Software Foundation; either version 2.1 of the License, or
11 (at your option) any later version.
13 systemd is distributed in the hope that it will be useful, but
14 WITHOUT ANY WARRANTY; without even the implied warranty of
15 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
16 Lesser General Public License for more details.
18 You should have received a copy of the GNU Lesser General Public License
19 along with systemd; If not, see <http://www.gnu.org/licenses/>.
25 #include <sys/types.h>
26 #include <sys/syscall.h>
27 #include <sys/mount.h>
33 #include <sys/prctl.h>
34 #include <sys/capability.h>
37 #include <sys/signalfd.h>
41 #include <sys/socket.h>
42 #include <linux/netlink.h>
43 #include <linux/rtnetlink.h>
44 #include <sys/eventfd.h>
48 #include <selinux/selinux.h>
51 #include "sd-daemon.h"
61 #include "cgroup-util.h"
63 #include "path-util.h"
64 #include "loopback-setup.h"
65 #include "dev-setup.h"
70 #include "bus-error.h"
72 #include "bus-kernel.h"
75 #include "rtnl-util.h"
76 #include "udev-util.h"
/* Command-line configuration. Most of these are assigned in parse_argv();
 * arg_directory and arg_machine are additionally adjusted in main(). */
78 typedef enum LinkJournal {
85 static char *arg_directory = NULL;
86 static char *arg_user = NULL;
87 static sd_id128_t arg_uuid = {};
88 static char *arg_machine = NULL;
89 static char *arg_selinux_context = NULL;
90 static char *arg_selinux_apifs_context = NULL;
91 static const char *arg_slice = NULL;
92 static bool arg_private_network = false;
93 static bool arg_read_only = false;
94 static bool arg_boot = false;
95 static LinkJournal arg_link_journal = LINK_AUTO;
/* Bitmask of capabilities the container keeps, one bit per CAP_* number.
 * parse_argv() ORs in --capability= bits and masks out --drop-capability=
 * bits; drop_capabilities() passes the complement to the bounding-set drop. */
96 static uint64_t arg_retain =
98 (1ULL << CAP_DAC_OVERRIDE) |
99 (1ULL << CAP_DAC_READ_SEARCH) |
100 (1ULL << CAP_FOWNER) |
101 (1ULL << CAP_FSETID) |
102 (1ULL << CAP_IPC_OWNER) |
104 (1ULL << CAP_LEASE) |
105 (1ULL << CAP_LINUX_IMMUTABLE) |
106 (1ULL << CAP_NET_BIND_SERVICE) |
107 (1ULL << CAP_NET_BROADCAST) |
108 (1ULL << CAP_NET_RAW) |
109 (1ULL << CAP_SETGID) |
110 (1ULL << CAP_SETFCAP) |
111 (1ULL << CAP_SETPCAP) |
112 (1ULL << CAP_SETUID) |
113 (1ULL << CAP_SYS_ADMIN) |
114 (1ULL << CAP_SYS_CHROOT) |
115 (1ULL << CAP_SYS_NICE) |
116 (1ULL << CAP_SYS_PTRACE) |
117 (1ULL << CAP_SYS_TTY_CONFIG) |
118 (1ULL << CAP_SYS_RESOURCE) |
119 (1ULL << CAP_SYS_BOOT) |
120 (1ULL << CAP_AUDIT_WRITE) |
121 (1ULL << CAP_AUDIT_CONTROL) |
/* arg_bind and arg_bind_ro hold flat (source, destination) pairs: parse_argv()
 * appends both halves in order and mount_binds() walks them pairwise. */
123 static char **arg_bind = NULL;
124 static char **arg_bind_ro = NULL;
125 static char **arg_setenv = NULL;
126 static bool arg_quiet = false;
127 static bool arg_share_system = false;
128 static bool arg_register = true;
129 static bool arg_keep_unit = false;
130 static char **arg_network_interfaces = NULL;
/* Print the usage summary to stdout. The program name shown in the first line
 * comes from program_invocation_short_name. The option list here should stay
 * in sync with the options[] table in parse_argv(). */
132 static int help(void) {
134 printf("%s [OPTIONS...] [PATH] [ARGUMENTS...]\n\n"
135 "Spawn a minimal namespace container for debugging, testing and building.\n\n"
136 " -h --help Show this help\n"
137 " --version Print version string\n"
138 " -D --directory=NAME Root directory for the container\n"
139 " -b --boot Boot up full system (i.e. invoke init)\n"
140 " -u --user=USER Run the command under specified user or uid\n"
141 " --uuid=UUID Set a specific machine UUID for the container\n"
142 " -M --machine=NAME Set the machine name for the container\n"
143 " -S --slice=SLICE Place the container in the specified slice\n"
144 " -Z --selinux-context=SECLABEL\n"
145 " Set the SELinux security context to be used by\n"
146 " processes in the container\n"
147 " -L --selinux-apifs-context=SECLABEL\n"
148 " Set the SELinux security context to be used by\n"
149 " API/tmpfs file systems in the container\n"
150 " --private-network Disable network in container\n"
151 " --network-interface=INTERFACE\n"
152 " Assign an existing network interface to the container\n"
153 " --share-system Share system namespaces with host\n"
154 " --read-only Mount the root directory read-only\n"
155 " --capability=CAP In addition to the default, retain specified\n"
157 " --drop-capability=CAP Drop the specified capability from the default set\n"
158 " --link-journal=MODE Link up guest journal, one of no, auto, guest, host\n"
159 " -j Equivalent to --link-journal=host\n"
160 " --bind=PATH[:PATH] Bind mount a file or directory from the host into\n"
162 " --bind-ro=PATH[:PATH] Similar, but creates a read-only bind mount\n"
163 " --setenv=NAME=VALUE Pass an environment variable to PID 1\n"
164 " --register=BOOLEAN Register container as machine\n"
165 " --keep-unit Do not register a scope for the machine, reuse\n"
166 " the service unit nspawn is running in\n"
167 " -q --quiet Do not show status information\n",
168 program_invocation_short_name);
/* Parse the command line into the file-scope arg_* settings.
 *
 * Capability additions and removals are accumulated in the local masks
 * `plus` and `minus` and folded into arg_retain once all options are read.
 * After the option loop, option combinations are checked (--boot with
 * --share-system, --keep-unit from a user session) and logged as errors. */
173 static int parse_argv(int argc, char *argv[]) {
189 ARG_NETWORK_INTERFACE
192 static const struct option options[] = {
193 { "help", no_argument, NULL, 'h' },
194 { "version", no_argument, NULL, ARG_VERSION },
195 { "directory", required_argument, NULL, 'D' },
196 { "user", required_argument, NULL, 'u' },
197 { "private-network", no_argument, NULL, ARG_PRIVATE_NETWORK },
198 { "boot", no_argument, NULL, 'b' },
199 { "uuid", required_argument, NULL, ARG_UUID },
200 { "read-only", no_argument, NULL, ARG_READ_ONLY },
201 { "capability", required_argument, NULL, ARG_CAPABILITY },
202 { "drop-capability", required_argument, NULL, ARG_DROP_CAPABILITY },
203 { "link-journal", required_argument, NULL, ARG_LINK_JOURNAL },
204 { "bind", required_argument, NULL, ARG_BIND },
205 { "bind-ro", required_argument, NULL, ARG_BIND_RO },
206 { "machine", required_argument, NULL, 'M' },
207 { "slice", required_argument, NULL, 'S' },
208 { "setenv", required_argument, NULL, ARG_SETENV },
209 { "selinux-context", required_argument, NULL, 'Z' },
210 { "selinux-apifs-context", required_argument, NULL, 'L' },
211 { "quiet", no_argument, NULL, 'q' },
212 { "share-system", no_argument, NULL, ARG_SHARE_SYSTEM },
213 { "register", required_argument, NULL, ARG_REGISTER },
214 { "keep-unit", no_argument, NULL, ARG_KEEP_UNIT },
215 { "network-interface", required_argument, NULL, ARG_NETWORK_INTERFACE },
220 uint64_t plus = 0, minus = 0;
/* The leading '+' in the optstring makes getopt stop at the first non-option
 * argument, so everything after PATH is left for the container's command. */
225 while ((c = getopt_long(argc, argv, "+hD:u:bL:M:jS:Z:q", options, NULL)) >= 0) {
233 puts(PACKAGE_STRING);
234 puts(SYSTEMD_FEATURES);
239 arg_directory = canonicalize_file_name(optarg);
240 if (!arg_directory) {
241 log_error("Invalid root directory: %m");
249 arg_user = strdup(optarg);
/* NOTE(review): optarg is handed to strv_push() directly; presumably the
 * vector keeps the pointer into argv rather than a copy (main() frees only
 * the array itself) -- confirm against strv_push(). */
255 case ARG_NETWORK_INTERFACE:
256 if (strv_push(&arg_network_interfaces, optarg) < 0)
261 case ARG_PRIVATE_NETWORK:
262 arg_private_network = true;
270 r = sd_id128_from_string(optarg, &arg_uuid);
272 log_error("Invalid UUID: %s", optarg);
278 arg_slice = strdup(optarg);
285 if (isempty(optarg)) {
290 if (!hostname_is_valid(optarg)) {
291 log_error("Invalid machine name: %s", optarg);
296 arg_machine = strdup(optarg);
/* Unlike arg_user/arg_machine/arg_slice, the two SELinux contexts are stored
 * as pointers into argv, not duplicated. */
304 arg_selinux_context = optarg;
308 arg_selinux_apifs_context = optarg;
312 arg_read_only = true;
/* --capability= and --drop-capability= share one parser: the argument is a
 * comma-separated list, and `c` decides whether a name goes into `plus` or
 * `minus`. The word "all" sets every bit of the selected mask. */
316 case ARG_DROP_CAPABILITY: {
320 FOREACH_WORD_SEPARATOR(word, length, optarg, ",", state) {
/* NOTE(review): t is declared without an initializer although it carries
 * _cleanup_free_; it is assigned on the next visible line, but an early exit
 * in between would free an indeterminate pointer -- confirm there is none. */
321 _cleanup_free_ char *t;
324 t = strndup(word, length);
328 if (streq(t, "all")) {
329 if (c == ARG_CAPABILITY)
330 plus = (uint64_t) -1;
332 minus = (uint64_t) -1;
334 if (cap_from_name(t, &cap) < 0) {
335 log_error("Failed to parse capability %s.", t);
339 if (c == ARG_CAPABILITY)
340 plus |= 1ULL << (uint64_t) cap;
342 minus |= 1ULL << (uint64_t) cap;
/* NOTE(review): this assignment sits directly before the --link-journal case
 * and is presumably the -j handler; help() documents -j as equivalent to
 * --link-journal=host, yet LINK_GUEST is assigned -- confirm which is meant. */
350 arg_link_journal = LINK_GUEST;
353 case ARG_LINK_JOURNAL:
354 if (streq(optarg, "auto"))
355 arg_link_journal = LINK_AUTO;
356 else if (streq(optarg, "no"))
357 arg_link_journal = LINK_NO;
358 else if (streq(optarg, "guest"))
359 arg_link_journal = LINK_GUEST;
360 else if (streq(optarg, "host"))
361 arg_link_journal = LINK_HOST;
363 log_error("Failed to parse link journal mode %s", optarg);
/* --bind= / --bind-ro=: the argument is split at the first ':' into a source
 * (a) and destination (b); both must be absolute, and they are appended to
 * the selected vector as a consecutive pair. */
371 _cleanup_free_ char *a = NULL, *b = NULL;
375 x = c == ARG_BIND ? &arg_bind : &arg_bind_ro;
377 e = strchr(optarg, ':');
379 a = strndup(optarg, e - optarg);
389 if (!path_is_absolute(a) || !path_is_absolute(b)) {
390 log_error("Invalid bind mount specification: %s", optarg);
394 r = strv_extend(x, a);
398 r = strv_extend(x, b);
408 if (!env_assignment_is_valid(optarg)) {
409 log_error("Environment variable assignment '%s' is not valid.", optarg);
/* strv_env_set() yields a new vector in n; the previous arg_setenv is freed
 * below (presumably before n replaces it). */
413 n = strv_env_set(arg_setenv, optarg);
417 strv_free(arg_setenv);
426 case ARG_SHARE_SYSTEM:
427 arg_share_system = true;
431 r = parse_boolean(optarg);
433 log_error("Failed to parse --register= argument: %s", optarg);
441 arg_keep_unit = true;
448 assert_not_reached("Unhandled option");
/* Sharing the host's namespaces disables machine registration. */
452 if (arg_share_system)
453 arg_register = false;
455 if (arg_boot && arg_share_system) {
456 log_error("--boot and --share-system may not be combined.");
460 if (arg_keep_unit && cg_pid_get_owner_uid(0, NULL) >= 0) {
461 log_error("--keep-unit may not be used when invoked from a user session.");
/* Final retained set: defaults plus --capability=, plus CAP_NET_ADMIN only
 * when the container has its own network namespace; --drop-capability= is
 * applied last and therefore always wins. */
465 arg_retain = (arg_retain | plus | (arg_private_network ? 1ULL << CAP_NET_ADMIN : 0)) & ~minus;
/* Mount the API file systems from mount_table underneath `dest`.
 *
 * Each table row gives source, target path (relative to dest), file system
 * type, options, mount flags and whether a failure is fatal. Rows with a NULL
 * source are remounts of the preceding bind mount (used to make it
 * read-only). */
470 static int mount_all(const char *dest) {
472 typedef struct MountPoint {
481 static const MountPoint mount_table[] = {
482 { "proc", "/proc", "proc", NULL, MS_NOSUID|MS_NOEXEC|MS_NODEV, true },
483 { "/proc/sys", "/proc/sys", NULL, NULL, MS_BIND, true }, /* Bind mount first */
484 { NULL, "/proc/sys", NULL, NULL, MS_BIND|MS_RDONLY|MS_REMOUNT, true }, /* Then, make it r/o */
485 { "sysfs", "/sys", "sysfs", NULL, MS_RDONLY|MS_NOSUID|MS_NOEXEC|MS_NODEV, true },
486 { "tmpfs", "/dev", "tmpfs", "mode=755", MS_NOSUID|MS_STRICTATIME, true },
487 { "devpts", "/dev/pts", "devpts","newinstance,ptmxmode=0666,mode=620,gid=" STRINGIFY(TTY_GID), MS_NOSUID|MS_NOEXEC, true },
488 { "tmpfs", "/dev/shm", "tmpfs", "mode=1777", MS_NOSUID|MS_NODEV|MS_STRICTATIME, true },
489 { "tmpfs", "/run", "tmpfs", "mode=755", MS_NOSUID|MS_NODEV|MS_STRICTATIME, true },
491 { "/sys/fs/selinux", "/sys/fs/selinux", NULL, NULL, MS_BIND, false }, /* Bind mount first */
492 { NULL, "/sys/fs/selinux", NULL, NULL, MS_BIND|MS_RDONLY|MS_REMOUNT, false }, /* Then, make it r/o */
499 for (k = 0; k < ELEMENTSOF(mount_table); k++) {
500 _cleanup_free_ char *where = NULL;
502 _cleanup_free_ char *options = NULL;
507 where = strjoin(dest, "/", mount_table[k].where, NULL);
511 t = path_is_mount_point(where, true);
513 log_error("Failed to detect whether %s is a mount point: %s", where, strerror(-t));
521 /* Skip this entry if it is not a remount. */
/* i.e. the target is already a mount point (t > 0) and this row would mount
 * something new (what != NULL); remount rows still run. */
522 if (mount_table[k].what && t > 0)
525 mkdir_p(where, 0755);
/* With --selinux-apifs-context=, tmpfs and devpts mounts get an extra
 * context="..." option appended to the row's own options; all other rows use
 * the table options unchanged. */
528 if (arg_selinux_apifs_context &&
529 (streq_ptr(mount_table[k].what, "tmpfs") || streq_ptr(mount_table[k].what, "devpts"))) {
530 options = strjoin(mount_table[k].options, ",context=\"", arg_selinux_apifs_context, "\"", NULL);
537 o = mount_table[k].options;
/* A failed mount is only reported as an error for rows marked fatal. */
540 if (mount(mount_table[k].what,
543 mount_table[k].flags,
545 mount_table[k].fatal) {
547 log_error("mount(%s) failed: %m", where);
/* Bind mount host paths into the container tree.
 *
 * dest:  container root directory.
 * l:     flat list of (source, destination) pairs; the destination is
 *        appended to dest.
 * flags: extra mount flags applied in a second, remounting step when
 *        non-zero (main() passes MS_RDONLY for --bind-ro=, 0 for --bind=). */
557 static int mount_binds(const char *dest, char **l, unsigned long flags) {
560 STRV_FOREACH_PAIR(x, y, l) {
562 struct stat source_st, dest_st;
565 if (stat(*x, &source_st) < 0) {
566 log_error("failed to stat %s: %m", *x);
570 where = strappenda(dest, *y);
571 r = stat(where, &dest_st);
/* If the destination already exists it must have the same file type as the
 * source; if it does not exist (ENOENT) its parent directories are created
 * and a mount point of a matching type is made below. */
573 if ((source_st.st_mode & S_IFMT) != (dest_st.st_mode & S_IFMT)) {
574 log_error("The file types of %s and %s do not match. Refusing bind mount",
578 } else if (errno == ENOENT) {
579 r = mkdir_parents_label(where, 0755);
581 log_error("Failed to bind mount %s: %s", *x, strerror(-r));
585 log_error("Failed to bind mount %s: %s", *x, strerror(errno));
588 /* Create the mount point, but be conservative -- refuse to create block
589 * and char devices. */
590 if (S_ISDIR(source_st.st_mode))
591 mkdir_label(where, 0755);
592 else if (S_ISFIFO(source_st.st_mode))
594 else if (S_ISSOCK(source_st.st_mode))
595 mknod(where, 0644 | S_IFSOCK, 0);
596 else if (S_ISREG(source_st.st_mode))
599 log_error("Refusing to create mountpoint for file: %s", *x);
603 if (mount(*x, where, "bind", MS_BIND, NULL) < 0) {
604 log_error("mount(%s) failed: %m", where);
/* Extra flags cannot be set on the initial bind mount, so they are applied
 * with a separate MS_REMOUNT|MS_BIND call. */
608 if (flags && mount(NULL, where, NULL, MS_REMOUNT|MS_BIND|flags, NULL) < 0) {
609 log_error("mount(%s) failed: %m", where);
/* Point the container's /etc/localtime at the same zone as the host's.
 *
 * Only acts when the host's /etc/localtime is a symlink into
 * /usr/share/zoneinfo/ (relative or absolute form) and the zone file exists
 * inside the container. The "cannot update" situations are reported with
 * log_warning(); only a failing symlink() is logged as an error. */
617 static int setup_timezone(const char *dest) {
618 _cleanup_free_ char *where = NULL, *p = NULL, *q = NULL, *check = NULL, *what = NULL;
624 /* Fix the timezone, if possible */
625 r = readlink_malloc("/etc/localtime", &p);
627 log_warning("/etc/localtime is not a symlink, not updating container timezone.");
/* z: host zone name, i.e. the link target with the zoneinfo prefix removed. */
631 z = path_startswith(p, "../usr/share/zoneinfo/");
633 z = path_startswith(p, "/usr/share/zoneinfo/");
635 log_warning("/etc/localtime does not point into /usr/share/zoneinfo/, not updating container timezone.");
639 where = strappend(dest, "/etc/localtime");
/* y: zone name the container's existing symlink points to, if any. */
643 r = readlink_malloc(where, &q);
645 y = path_startswith(q, "../usr/share/zoneinfo/");
647 y = path_startswith(q, "/usr/share/zoneinfo/");
650 /* Already pointing to the right place? Then do nothing .. */
651 if (y && streq(y, z))
655 check = strjoin(dest, "/usr/share/zoneinfo/", z, NULL);
659 if (access(check, F_OK) < 0) {
660 log_warning("Timezone %s does not exist in container, not updating container timezone.", z);
/* The new link always uses the relative "../usr/share/zoneinfo/" form. */
664 what = strappend("../usr/share/zoneinfo/", z);
669 if (symlink(what, where) < 0) {
670 log_error("Failed to correct timezone of container: %m");
/* Copy the host's /etc/resolv.conf to the same path below `dest`, on a
 * best-effort basis (the copy result is deliberately ignored).
 * arg_private_network is checked first; presumably the copy is skipped for
 * containers with their own network namespace -- confirm. */
677 static int setup_resolv_conf(const char *dest) {
/* NOTE(review): the attribute is written between the type and the declarator
 * here, while the rest of the file writes `_cleanup_free_ char *`. */
678 char _cleanup_free_ *where = NULL;
682 if (arg_private_network)
685 /* Fix resolv.conf, if possible */
686 where = strappend(dest, "/etc/resolv.conf");
690 /* We don't really care for the results of this really. If it
691 * fails, it fails, but meh... */
692 copy_file("/etc/resolv.conf", where, O_TRUNC|O_NOFOLLOW);
/* Give the container its own random boot ID.
 *
 * A fresh 128-bit ID is formatted as a UUID string, written to a scratch
 * file under dest/dev and bind mounted over
 * dest/proc/sys/kernel/random/boot_id. Failing to make that mount read-only
 * only produces a warning. arg_share_system is checked first (presumably the
 * whole step is skipped then -- confirm). */
697 static int setup_boot_id(const char *dest) {
698 _cleanup_free_ char *from = NULL, *to = NULL;
705 if (arg_share_system)
708 /* Generate a new randomized boot ID, so that each boot-up of
709 * the container gets a new one */
711 from = strappend(dest, "/dev/proc-sys-kernel-random-boot-id");
712 to = strappend(dest, "/proc/sys/kernel/random/boot_id");
716 r = sd_id128_randomize(&rnd);
718 log_error("Failed to generate random boot id: %s", strerror(-r));
/* Format the 16 bytes in 8-4-4-4-12 UUID layout. */
722 snprintf(as_uuid, sizeof(as_uuid),
723 "%02x%02x%02x%02x-%02x%02x-%02x%02x-%02x%02x-%02x%02x%02x%02x%02x%02x",
724 SD_ID128_FORMAT_VAL(rnd));
725 char_array_0(as_uuid);
727 r = write_string_file(from, as_uuid);
729 log_error("Failed to write boot id: %s", strerror(-r));
/* Bind mount first, then remount the bind read-only. */
733 if (mount(from, to, "bind", MS_BIND, NULL) < 0) {
734 log_error("Failed to bind mount boot id: %m");
736 } else if (mount(from, to, "bind", MS_BIND|MS_REMOUNT|MS_RDONLY, NULL))
737 log_warning("Failed to make boot id read-only: %m");
/* Recreate a fixed set of host device nodes inside the container.
 *
 * For every name in the devnodes list, /dev/<name> on the host is stat()ed
 * and, if it is a character or block device, a node with the same mode and
 * device number is created at dest/dev/<name>. A host node that does not
 * exist (ENOENT) is not reported as an error. */
743 static int copy_devnodes(const char *dest) {
745 static const char devnodes[] =
755 _cleanup_umask_ mode_t u;
761 NULSTR_FOREACH(d, devnodes) {
762 _cleanup_free_ char *from = NULL, *to = NULL;
765 from = strappend("/dev/", d);
766 to = strjoin(dest, "/dev/", d, NULL);
770 if (stat(from, &st) < 0) {
772 if (errno != ENOENT) {
773 log_error("Failed to stat %s: %m", from);
777 } else if (!S_ISCHR(st.st_mode) && !S_ISBLK(st.st_mode)) {
779 log_error("%s is not a char or block device, cannot copy", from);
782 } else if (mknod(to, st.st_mode, st.st_rdev) < 0) {
/* NOTE(review): the message prints `dest` (the container root) although the
 * path passed to mknod() is `to` -- likely meant to log `to`. */
784 log_error("mknod(%s) failed: %m", dest);
/* Create dest/dev/ptmx as a relative symlink to pts/ptmx, so that the
 * container's ptmx resolves into its own /dev/pts instance. */
792 static int setup_ptmx(const char *dest) {
793 _cleanup_free_ char *p = NULL;
795 p = strappend(dest, "/dev/ptmx");
799 if (symlink("pts/ptmx", p) < 0) {
800 log_error("Failed to create /dev/ptmx symlink: %m");
/* Expose the pty named by `console` as dest/dev/console.
 *
 * The pty must be a character device. Its mode and ownership are reset via
 * chmod_and_chown(console, 0600, 0, 0), a device node with the same device
 * number and mode 0600 is created at dest/dev/console, and the pty is then
 * bind mounted on top of that node. */
807 static int setup_dev_console(const char *dest, const char *console) {
809 _cleanup_free_ char *to = NULL;
811 _cleanup_umask_ mode_t u;
818 if (stat(console, &st) < 0) {
819 log_error("Failed to stat %s: %m", console);
822 } else if (!S_ISCHR(st.st_mode)) {
823 log_error("/dev/console is not a char device");
827 r = chmod_and_chown(console, 0600, 0, 0);
829 log_error("Failed to correct access mode for TTY: %s", strerror(-r));
833 if (asprintf(&to, "%s/dev/console", dest) < 0)
836 /* We need to bind mount the right tty to /dev/console since
837 * ptys can only exist on pts file systems. To have something
838 * to bind mount things on we create a device node first, that
839 * has the right major/minor (note that the major minor
840 * doesn't actually matter here, since we mount it over
843 if (mknod(to, (st.st_mode & ~07777) | 0600, st.st_rdev) < 0) {
844 log_error("mknod() for /dev/console failed: %m");
848 if (mount(console, to, "bind", MS_BIND, NULL) < 0) {
849 log_error("Bind mount for /dev/console failed: %m");
/* Provide a fake /proc/kmsg for the container, backed by a FIFO.
 *
 * dest:        container root directory.
 * kmsg_socket: socket over which the opened FIFO descriptor is sent as
 *              SCM_RIGHTS ancillary data, so it stays open while the
 *              container runs (must be >= 0).
 *
 * The FIFO is created at dest/dev/kmsg, bind mounted to dest/proc/kmsg,
 * opened, and its descriptor passed through the socket. */
856 static int setup_kmsg(const char *dest, int kmsg_socket) {
857 _cleanup_free_ char *from = NULL, *to = NULL;
859 _cleanup_umask_ mode_t u;
861 struct cmsghdr cmsghdr;
862 uint8_t buf[CMSG_SPACE(sizeof(int))];
865 .msg_control = &control,
866 .msg_controllen = sizeof(control),
868 struct cmsghdr *cmsg;
871 assert(kmsg_socket >= 0);
875 /* We create the kmsg FIFO as /dev/kmsg, but immediately
876 * delete it after bind mounting it to /proc/kmsg. While FIFOs
877 * on the reading side behave very similar to /proc/kmsg,
878 * their writing side behaves differently from /dev/kmsg in
879 * that writing blocks when nothing is reading. In order to
880 * avoid any problems with containers deadlocking due to this
881 * we simply make /dev/kmsg unavailable to the container. */
882 if (asprintf(&from, "%s/dev/kmsg", dest) < 0 ||
883 asprintf(&to, "%s/proc/kmsg", dest) < 0)
886 if (mkfifo(from, 0600) < 0) {
887 log_error("mkfifo() for /dev/kmsg failed: %m");
891 r = chmod_and_chown(from, 0600, 0, 0);
893 log_error("Failed to correct access mode for /dev/kmsg: %s", strerror(-r));
897 if (mount(from, to, "bind", MS_BIND, NULL) < 0) {
898 log_error("Bind mount for /proc/kmsg failed: %m");
/* O_RDWR together with O_NDELAY: the open does not wait for a peer on the
 * other end of the FIFO. */
902 fd = open(from, O_RDWR|O_NDELAY|O_CLOEXEC);
904 log_error("Failed to open fifo: %m");
/* Build a single SCM_RIGHTS control message carrying fd. */
908 cmsg = CMSG_FIRSTHDR(&mh);
909 cmsg->cmsg_level = SOL_SOCKET;
910 cmsg->cmsg_type = SCM_RIGHTS;
911 cmsg->cmsg_len = CMSG_LEN(sizeof(int));
912 memcpy(CMSG_DATA(cmsg), &fd, sizeof(int));
914 mh.msg_controllen = cmsg->cmsg_len;
916 /* Store away the fd in the socket, so that it stays open as
917 * long as we run the child */
918 k = sendmsg(kmsg_socket, &mh, MSG_DONTWAIT|MSG_NOSIGNAL);
/* The local copy is closed right after sending, whether or not it worked. */
919 close_nointr_nofail(fd);
922 log_error("Failed to send FIFO fd: %m");
926 /* And now make the FIFO unavailable as /dev/kmsg... */
/* Set the hostname to arg_machine via sethostname().
 * arg_share_system is checked first; presumably the hostname is left alone
 * when the UTS namespace is shared with the host -- confirm. */
931 static int setup_hostname(void) {
933 if (arg_share_system)
936 if (sethostname(arg_machine, strlen(arg_machine)) < 0)
/* Connect the container's journal directory with the host's, according to
 * arg_link_journal.
 *
 * The container's machine ID is read from directory/etc/machine-id and must
 * differ from the host's. With that ID:
 *   p = /var/log/journal/<id> on the host,
 *   q = the same path below `directory`.
 * LINK_GUEST makes p a symlink to q; LINK_HOST creates p and bind mounts it
 * onto q; LINK_NO does nothing. In LINK_AUTO mode several problem cases are
 * special-cased instead of being treated as hard errors (see the checks on
 * arg_link_journal below). */
942 static int setup_journal(const char *directory) {
943 sd_id128_t machine_id, this_id;
944 _cleanup_free_ char *p = NULL, *b = NULL, *q = NULL, *d = NULL;
948 p = strappend(directory, "/etc/machine-id");
952 r = read_one_line_file(p, &b);
953 if (r == -ENOENT && arg_link_journal == LINK_AUTO)
956 log_error("Failed to read machine ID from %s: %s", p, strerror(-r));
961 if (isempty(id) && arg_link_journal == LINK_AUTO)
964 /* Verify validity */
965 r = sd_id128_from_string(id, &machine_id);
967 log_error("Failed to parse machine ID from %s: %s", p, strerror(-r));
971 r = sd_id128_get_machine(&this_id);
973 log_error("Failed to retrieve machine ID: %s", strerror(-r));
/* Equal IDs are logged as a warning in auto mode and as an error otherwise. */
977 if (sd_id128_equal(machine_id, this_id)) {
978 log_full(arg_link_journal == LINK_AUTO ? LOG_WARNING : LOG_ERR,
979 "Host and machine ids are equal (%s): refusing to link journals", id);
980 if (arg_link_journal == LINK_AUTO)
986 if (arg_link_journal == LINK_NO)
/* NOTE(review): p is reused for a new allocation here; confirm the
 * machine-id path it held before is released first. */
990 p = strappend("/var/log/journal/", id);
991 q = strjoin(directory, "/var/log/journal/", id, NULL);
/* Neither end may already be a mount point; outside auto mode that is
 * reported as an error. */
995 if (path_is_mount_point(p, false) > 0) {
996 if (arg_link_journal != LINK_AUTO) {
997 log_error("%s: already a mount point, refusing to use for journal", p);
1004 if (path_is_mount_point(q, false) > 0) {
1005 if (arg_link_journal != LINK_AUTO) {
1006 log_error("%s: already a mount point, refusing to use for journal", q);
/* Inspect what currently exists at p: a symlink (r >= 0), something that is
 * not a symlink (-EINVAL), or nothing (-ENOENT). */
1013 r = readlink_and_make_absolute(p, &d);
1015 if ((arg_link_journal == LINK_GUEST ||
1016 arg_link_journal == LINK_AUTO) &&
1019 r = mkdir_p(q, 0755);
/* NOTE(review): mkdir_p() reports its result in r, while %m formats errno --
 * confirm errno is reliably set here (same pattern further below). */
1021 log_warning("failed to create directory %s: %m", q);
1025 if (unlink(p) < 0) {
1026 log_error("Failed to remove symlink %s: %m", p);
1029 } else if (r == -EINVAL) {
1031 if (arg_link_journal == LINK_GUEST &&
1034 if (errno == ENOTDIR) {
1035 log_error("%s already exists and is neither a symlink nor a directory", p);
1038 log_error("Failed to remove %s: %m", p);
1042 } else if (r != -ENOENT) {
1043 log_error("readlink(%s) failed: %m", p);
/* Guest mode: host path becomes a symlink into the container tree. */
1047 if (arg_link_journal == LINK_GUEST) {
1049 if (symlink(q, p) < 0) {
1050 log_error("Failed to symlink %s to %s: %m", q, p);
1054 r = mkdir_p(q, 0755);
1056 log_warning("failed to create directory %s: %m", q);
/* Host mode creates p; otherwise p must already exist for the bind mount. */
1060 if (arg_link_journal == LINK_HOST) {
1061 r = mkdir_p(p, 0755);
1063 log_error("Failed to create %s: %m", p);
1067 } else if (access(p, F_OK) < 0)
/* dir_is_empty() == 0 means q exists and has content; that is reported. */
1070 if (dir_is_empty(q) == 0) {
1071 log_error("%s not empty.", q);
1075 r = mkdir_p(q, 0755);
1077 log_error("Failed to create %s: %m", q);
1081 if (mount(p, q, "bind", MS_BIND, NULL) < 0) {
1082 log_error("Failed to bind mount journal from host into guest: %m");
/* Make a kdbus domain visible in the container: create dest/dev/kdbus and
 * bind mount `path` (the domain directory on the host) onto it. */
1089 static int setup_kdbus(const char *dest, const char *path) {
1095 p = strappenda(dest, "/dev/kdbus");
1096 if (mkdir(p, 0755) < 0) {
1097 log_error("Failed to create kdbus path: %m");
1101 if (mount(path, p, "bind", MS_BIND, NULL) < 0) {
1102 log_error("Failed to mount kdbus domain path: %m");
/* Drop capabilities outside arg_retain: the complement of the retained mask
 * is passed to capability_bounding_set_drop(), whose result is returned. */
1109 static int drop_capabilities(void) {
1110 return capability_bounding_set_drop(~arg_retain, false);
/* Register the container with the machine manager over the system bus.
 *
 * pid: PID of the container's leader process.
 *
 * Calls a method on org.freedesktop.machine1.Manager, passing among other
 * things arg_uuid and arg_directory. Two call variants exist: one for
 * --keep-unit, and one that additionally passes a "Slice" property when
 * arg_slice is non-empty. */
1113 static int register_machine(pid_t pid) {
1114 _cleanup_bus_error_free_ sd_bus_error error = SD_BUS_ERROR_NULL;
1115 _cleanup_bus_unref_ sd_bus *bus = NULL;
1121 r = sd_bus_default_system(&bus);
1123 log_error("Failed to open system bus: %s", strerror(-r));
1127 if (arg_keep_unit) {
1128 r = sd_bus_call_method(
1130 "org.freedesktop.machine1",
1131 "/org/freedesktop/machine1",
1132 "org.freedesktop.machine1.Manager",
1138 SD_BUS_MESSAGE_APPEND_ID128(arg_uuid),
1142 strempty(arg_directory));
1144 r = sd_bus_call_method(
1146 "org.freedesktop.machine1",
1147 "/org/freedesktop/machine1",
1148 "org.freedesktop.machine1.Manager",
1154 SD_BUS_MESSAGE_APPEND_ID128(arg_uuid),
1158 strempty(arg_directory),
/* The first argument is the number of properties that follow: 1 when a slice
 * was requested, 0 otherwise. */
1159 !isempty(arg_slice), "Slice", "s", arg_slice);
1163 log_error("Failed to register machine: %s", bus_error_message(&error, r));
/* Ask the machine manager to terminate the machine belonging to `pid`.
 *
 * The machine's object path is looked up through
 * org.freedesktop.machine1.Manager and a method on its
 * org.freedesktop.machine1.Machine interface is then called. Failures of
 * either bus call are logged at debug level only, since the machine may
 * already be gone. */
1170 static int terminate_machine(pid_t pid) {
1171 _cleanup_bus_error_free_ sd_bus_error error = SD_BUS_ERROR_NULL;
1172 _cleanup_bus_message_unref_ sd_bus_message *reply = NULL;
1173 _cleanup_bus_unref_ sd_bus *bus = NULL;
1180 r = sd_bus_default_system(&bus);
1182 log_error("Failed to open system bus: %s", strerror(-r));
1186 r = sd_bus_call_method(
1188 "org.freedesktop.machine1",
1189 "/org/freedesktop/machine1",
1190 "org.freedesktop.machine1.Manager",
1197 /* Note that the machine might already have been
1198 * cleaned up automatically, hence don't consider it a
1199 * failure if we cannot get the machine object. */
1200 log_debug("Failed to get machine: %s", bus_error_message(&error, r));
/* The reply carries a single object path ("o"). */
1204 r = sd_bus_message_read(reply, "o", &path);
1206 return bus_log_parse_error(r);
1208 r = sd_bus_call_method(
1210 "org.freedesktop.machine1",
1212 "org.freedesktop.machine1.Machine",
1218 log_debug("Failed to terminate machine: %s", bus_error_message(&error, r));
/* Reset the audit login UID of this process.
 *
 * Reads /proc/self/loginuid and, unless it already holds "4294967295"
 * (2^32 - 1, the value treated as "reset" here), writes that value back.
 * A failing write is reported with a long explanatory message about old
 * kernels with auditing enabled. arg_share_system is checked first. */
1225 static int reset_audit_loginuid(void) {
1226 _cleanup_free_ char *p = NULL;
1229 if (arg_share_system)
1232 r = read_one_line_file("/proc/self/loginuid", &p);
1236 log_error("Failed to read /proc/self/loginuid: %s", strerror(-r));
1240 /* Already reset? */
1241 if (streq(p, "4294967295"))
1244 r = write_string_file("/proc/self/loginuid", "4294967295");
/* NOTE(review): the message below reads "or to off auditing"; it looks like
 * the word "turn" is missing. */
1246 log_error("Failed to reset audit login UID. This probably means that your kernel is too\n"
1247 "old and you have audit enabled. Note that the auditing subsystem is known to\n"
1248 "be incompatible with containers on old kernels. Please make sure to upgrade\n"
1249 "your kernel or to off auditing with 'audit=0' on the kernel command line before\n"
1250 "using systemd-nspawn. Sleeping for 5s... (%s)\n", strerror(-r));
/* Move the interfaces listed in arg_network_interfaces into the network
 * namespace of process `pid`.
 *
 * Only relevant with --private-network and a non-empty interface list (both
 * are checked up front). For each interface the index is resolved, udev is
 * consulted to make sure the device is initialized, and an RTM_NEWLINK
 * message carrying IFLA_NET_NS_PID = pid is sent over rtnetlink. */
1258 static int move_network_interfaces(pid_t pid) {
1259 _cleanup_rtnl_unref_ sd_rtnl *rtnl = NULL;
1260 _cleanup_udev_unref_ struct udev *udev = NULL;
1264 if (!arg_private_network)
1267 if (strv_isempty(arg_network_interfaces))
1270 r = sd_rtnl_open(0, &rtnl);
1272 log_error("Failed to connect to netlink: %s", strerror(-r));
1278 log_error("Failed to connect to udev.");
1282 STRV_FOREACH(i, arg_network_interfaces) {
1283 _cleanup_rtnl_message_unref_ sd_rtnl_message *m = NULL;
1284 _cleanup_udev_device_unref_ struct udev_device *d = NULL;
/* Room for the "n" prefix, the decimal interface index and the terminator. */
1285 char ifi_str[2 + DECIMAL_STR_MAX(int)];
1288 ifi = (int) if_nametoindex(*i);
1290 log_error("Failed to resolve interface %s: %m", *i);
/* udev device IDs for network interfaces have the form "n<ifindex>". */
1294 sprintf(ifi_str, "n%i", ifi);
1295 d = udev_device_new_from_device_id(udev, ifi_str);
1297 log_error("Failed to get udev device for interface %s: %m", *i);
1301 if (udev_device_get_is_initialized(d) <= 0) {
1302 log_error("Network interface %s is not initialized yet.", *i);
1306 r = sd_rtnl_message_new_link(RTM_NEWLINK, ifi, &m);
1308 log_error("Failed to allocate netlink message: %s", strerror(-r));
1312 r = sd_rtnl_message_append_u32(m, IFLA_NET_NS_PID, pid);
1314 log_error("Failed to append namespace PID to netlink message: %s", strerror(-r));
1318 r = sd_rtnl_call(rtnl, m, 0, NULL);
1320 log_error("Failed to move interface %s to namespace: %s", *i, strerror(-r));
/* Entry point: spawn a container in new namespaces and relay its console.
 *
 * Outline of the visible steps:
 *   1. Parse options; derive arg_directory and arg_machine if not given.
 *   2. Sanity checks (root, systemd host, directory is not "/", OS tree
 *      present for --boot).
 *   3. Collect passed-in listen fds, allocate a pty, optionally create a
 *      kdbus domain, create the kmsg socket pair and a sync eventfd.
 *   4. clone() into new namespaces. The child sets up mounts, devices,
 *      console, journal, bind mounts, switches root, drops capabilities,
 *      switches user, builds the environment, waits on the eventfd and
 *      finally exec()s init, the given command or a shell.
 *   5. The parent registers the machine, moves network interfaces, releases
 *      the child, forwards the pty and evaluates the child's exit status. */
1328 int main(int argc, char *argv[]) {
1330 int r = EXIT_FAILURE, k;
1331 _cleanup_close_ int master = -1, kdbus_fd = -1, sync_fd = -1;
1333 const char *console = NULL;
1335 _cleanup_close_pipe_ int kmsg_socket_pair[2] = { -1, -1 };
1336 _cleanup_fdset_free_ FDSet *fds = NULL;
1337 _cleanup_free_ char *kdbus_domain = NULL;
1339 log_parse_environment();
1342 k = parse_argv(argc, argv);
/* Make the container directory absolute, or fall back to the current
 * working directory when -D was not given. */
1350 if (arg_directory) {
1353 p = path_make_absolute_cwd(arg_directory);
1354 free(arg_directory);
1357 arg_directory = get_current_dir_name();
1359 if (!arg_directory) {
1360 log_error("Failed to determine path, please use -D.");
1364 path_kill_slashes(arg_directory);
/* Default machine name: last path component of the container directory,
 * cleaned up into a valid hostname. */
1367 arg_machine = strdup(basename(arg_directory));
1373 hostname_cleanup(arg_machine, false);
1374 if (isempty(arg_machine)) {
1375 log_error("Failed to determine machine name automatically, please use -M.");
1380 if (geteuid() != 0) {
1381 log_error("Need to be root.");
1385 if (sd_booted() <= 0) {
1386 log_error("Not running on a systemd system.");
1390 if (path_equal(arg_directory, "/")) {
1391 log_error("Spawning container on root directory not supported.");
1395 if (arg_boot && path_is_os_tree(arg_directory) <= 0) {
1396 log_error("Directory %s doesn't look like an OS root directory (/etc/os-release is missing). Refusing.", arg_directory);
/* File descriptors passed in via socket activation are collected so they can
 * be handed on to the container (see LISTEN_FDS/LISTEN_PID below). */
1401 n_fd_passed = sd_listen_fds(false);
1402 if (n_fd_passed > 0) {
1403 k = fdset_new_listen_fds(&fds, false);
1405 log_error("Failed to collect file descriptors: %s", strerror(-k));
1409 fdset_close_others(fds);
/* Allocate the pty whose slave side becomes the container's console. */
1412 master = posix_openpt(O_RDWR|O_NOCTTY|O_CLOEXEC|O_NDELAY);
1414 log_error("Failed to acquire pseudo tty: %m");
1418 console = ptsname(master);
1420 log_error("Failed to determine tty name: %m");
1425 log_info("Spawning container %s on %s. Press ^] three times within 1s to abort execution.", arg_machine, arg_directory);
1427 if (unlockpt(master) < 0) {
1428 log_error("Failed to unlock tty: %m");
/* kdbus is optional: with --share-system the host's /dev/kdbus is used as
 * is, otherwise a per-machine domain "machine-<name>" is created. */
1433 if (access("/dev/kdbus/control", F_OK) >= 0) {
1435 if (arg_share_system) {
1436 kdbus_domain = strdup("/dev/kdbus");
1437 if (!kdbus_domain) {
1444 ns = strappenda("machine-", arg_machine);
1445 kdbus_fd = bus_kernel_create_domain(ns, &kdbus_domain);
/* NOTE(review): the message formats strerror(-r), but no visible statement
 * assigns r since its EXIT_FAILURE initialisation; the failure code is
 * presumably in kdbus_fd -- confirm. */
1447 log_debug("Failed to create kdbus domain: %s", strerror(-r));
1449 log_debug("Successfully created kdbus domain as %s", kdbus_domain);
1453 if (socketpair(AF_UNIX, SOCK_DGRAM|SOCK_NONBLOCK|SOCK_CLOEXEC, 0, kmsg_socket_pair) < 0) {
1454 log_error("Failed to create kmsg socket pair: %m");
1458 sd_notify(0, "READY=1");
/* These signals are blocked here and the mask is later handed to
 * process_pty(). */
1460 assert_se(sigemptyset(&mask) == 0);
1461 sigset_add_many(&mask, SIGCHLD, SIGWINCH, SIGTERM, SIGINT, -1);
1462 assert_se(sigprocmask(SIG_BLOCK, &mask, NULL) == 0);
/* sync_fd holds the child back until the parent has finished registration
 * and interface moves: the child blocks in eventfd_read(), the parent
 * releases it with eventfd_write(). */
1467 sync_fd = eventfd(0, EFD_CLOEXEC);
1469 log_error("Failed to create event fd: %m");
/* Raw clone(): always a new mount namespace; new IPC/PID/UTS namespaces
 * unless --share-system; a new network namespace for --private-network. */
1473 pid = syscall(__NR_clone,
1474 SIGCHLD|CLONE_NEWNS|
1475 (arg_share_system ? 0 : CLONE_NEWIPC|CLONE_NEWPID|CLONE_NEWUTS)|
1476 (arg_private_network ? CLONE_NEWNET : 0), NULL);
1478 if (errno == EINVAL)
1479 log_error("clone() failed, do you have namespace support enabled in your kernel? (You need UTS, IPC, PID and NET namespacing built in): %m");
1481 log_error("clone() failed: %m");
/* From here on: code run in the container process (presumably the pid == 0
 * branch), up to the _exit() further below. */
1488 const char *home = NULL;
1489 uid_t uid = (uid_t) -1;
1490 gid_t gid = (gid_t) -1;
/* Environment for the container's first process; the NULL slots are filled
 * in below via n_env. */
1492 const char *envp[] = {
1493 "PATH=" DEFAULT_PATH_SPLIT_USR,
1494 "container=systemd-nspawn", /* LXC sets container=lxc, so follow the scheme here */
1499 NULL, /* container_uuid */
1500 NULL, /* LISTEN_FDS */
1501 NULL, /* LISTEN_PID */
1507 envp[n_env] = strv_find_prefix(environ, "TERM=");
1511 close_nointr_nofail(master);
1514 close_nointr(STDIN_FILENO);
1515 close_nointr(STDOUT_FILENO);
1516 close_nointr(STDERR_FILENO);
1518 close_nointr_nofail(kmsg_socket_pair[0]);
1519 kmsg_socket_pair[0] = -1;
1521 reset_all_signal_handlers();
1523 assert_se(sigemptyset(&mask) == 0);
1524 assert_se(sigprocmask(SIG_SETMASK, &mask, NULL) == 0);
/* stdin was closed above, so opening the console is expected to yield fd 0;
 * stdout and stderr are then duplicated from it. */
1526 k = open_terminal(console, O_RDWR);
1527 if (k != STDIN_FILENO) {
1529 close_nointr_nofail(k);
1533 log_error("Failed to open console: %s", strerror(-k));
1537 if (dup2(STDIN_FILENO, STDOUT_FILENO) != STDOUT_FILENO ||
1538 dup2(STDIN_FILENO, STDERR_FILENO) != STDERR_FILENO) {
1539 log_error("Failed to duplicate console: %m");
1544 log_error("setsid() failed: %m");
1548 if (reset_audit_loginuid() < 0)
/* Make sure the container dies with SIGKILL if this supervisor goes away. */
1551 if (prctl(PR_SET_PDEATHSIG, SIGKILL) < 0) {
1552 log_error("PR_SET_PDEATHSIG failed: %m");
1556 /* Mark everything as slave, so that we still
1557 * receive mounts from the real root, but don't
1558 * propagate mounts to the real root. */
1559 if (mount(NULL, "/", NULL, MS_SLAVE|MS_REC, NULL) < 0) {
1560 log_error("MS_SLAVE|MS_REC failed: %m");
1564 /* Turn directory into bind mount */
1565 if (mount(arg_directory, arg_directory, "bind", MS_BIND|MS_REC, NULL) < 0) {
1566 log_error("Failed to make bind mount.");
1571 if (mount(arg_directory, arg_directory, "bind", MS_BIND|MS_REMOUNT|MS_RDONLY|MS_REC, NULL) < 0) {
1572 log_error("Failed to make read-only.");
/* Populate the container tree; any helper failing aborts the setup. */
1576 if (mount_all(arg_directory) < 0)
1579 if (copy_devnodes(arg_directory) < 0)
1582 if (setup_ptmx(arg_directory) < 0)
1585 dev_setup(arg_directory);
1587 if (setup_dev_console(arg_directory, console) < 0)
1590 if (setup_kmsg(arg_directory, kmsg_socket_pair[1]) < 0)
1593 close_nointr_nofail(kmsg_socket_pair[1]);
1594 kmsg_socket_pair[1] = -1;
1596 if (setup_boot_id(arg_directory) < 0)
1599 if (setup_timezone(arg_directory) < 0)
1602 if (setup_resolv_conf(arg_directory) < 0)
1605 if (setup_journal(arg_directory) < 0)
1608 if (mount_binds(arg_directory, arg_bind, 0) < 0)
1611 if (mount_binds(arg_directory, arg_bind_ro, MS_RDONLY) < 0)
1614 if (setup_kdbus(arg_directory, kdbus_domain) < 0)
/* Switch root: move the container mount onto "/", then chroot into it. */
1617 if (chdir(arg_directory) < 0) {
1618 log_error("chdir(%s) failed: %m", arg_directory);
1622 if (mount(arg_directory, "/", NULL, MS_MOVE, NULL) < 0) {
1623 log_error("mount(MS_MOVE) failed: %m");
1627 if (chroot(".") < 0) {
1628 log_error("chroot() failed: %m");
1632 if (chdir("/") < 0) {
1633 log_error("chdir() failed: %m");
1639 if (arg_private_network)
1642 if (drop_capabilities() < 0) {
1643 log_error("drop_capabilities() failed: %m");
1649 /* Note that this resolves user names
1650 * inside the container, and hence
1651 * accesses the NSS modules from the
1652 * container and not the host. This is
1655 if (get_user_creds((const char**)&arg_user, &uid, &gid, &home, NULL) < 0) {
1656 log_error("get_user_creds() failed: %m");
1660 if (mkdir_parents_label(home, 0775) < 0) {
1661 log_error("mkdir_parents_label() failed: %m");
1665 if (mkdir_safe_label(home, 0775, uid, gid) < 0) {
1666 log_error("mkdir_safe_label() failed: %m");
1670 if (initgroups((const char*)arg_user, gid) < 0) {
1671 log_error("initgroups() failed: %m");
1675 if (setresgid(gid, gid, gid) < 0) {
1676 log_error("setregid() failed: %m");
1680 if (setresuid(uid, uid, uid) < 0) {
1681 log_error("setreuid() failed: %m");
1685 /* Reset everything fully to 0, just in case */
/* NOTE(review): here and in the user-switching code above, failures of
 * setresgid()/setresuid() are logged as "setregid()"/"setreuid()" -- the
 * messages name a different call than the one made. */
1687 if (setgroups(0, NULL) < 0) {
1688 log_error("setgroups() failed: %m");
1692 if (setresgid(0, 0, 0) < 0) {
1693 log_error("setregid() failed: %m");
1697 if (setresuid(0, 0, 0) < 0) {
1698 log_error("setreuid() failed: %m");
/* Fill the remaining envp slots; defaults are /root and root when no user
 * was requested. */
1703 if ((asprintf((char**)(envp + n_env++), "HOME=%s", home ? home: "/root") < 0) ||
1704 (asprintf((char**)(envp + n_env++), "USER=%s", arg_user ? arg_user : "root") < 0) ||
1705 (asprintf((char**)(envp + n_env++), "LOGNAME=%s", arg_user ? arg_user : "root") < 0)) {
1710 if (!sd_id128_equal(arg_uuid, SD_ID128_NULL)) {
1711 if (asprintf((char**)(envp + n_env++), "container_uuid=" SD_ID128_FORMAT_STR, SD_ID128_FORMAT_VAL(arg_uuid)) < 0) {
/* Passed-in fds must survive exec(), and LISTEN_PID is set to 1, the value
 * given to the process that is exec()ed below. */
1717 if (fdset_size(fds) > 0) {
1718 k = fdset_cloexec(fds, false);
1720 log_error("Failed to unset O_CLOEXEC for file descriptors.");
1724 if ((asprintf((char **)(envp + n_env++), "LISTEN_FDS=%u", n_fd_passed) < 0) ||
1725 (asprintf((char **)(envp + n_env++), "LISTEN_PID=1") < 0)) {
/* Block until the parent signals that registration is complete. */
1733 eventfd_read(sync_fd, &x);
1734 close_nointr_nofail(sync_fd);
/* --setenv= assignments are merged over the built-in environment. */
1737 if (!strv_isempty(arg_setenv)) {
1740 n = strv_env_merge(2, envp, arg_setenv);
1748 env_use = (char**) envp;
1751 if (arg_selinux_context)
1752 if (setexeccon(arg_selinux_context) < 0)
1753 log_error("setexeccon(\"%s\") failed: %m", arg_selinux_context);
1759 /* Automatically search for the init system */
/* Each execve() only returns on failure, so the three candidate init paths
 * are tried in order; remaining command-line arguments are passed along. */
1761 l = 1 + argc - optind;
1762 a = newa(char*, l + 1);
1763 memcpy(a + 1, argv + optind, l * sizeof(char*));
1765 a[0] = (char*) "/usr/lib/systemd/systemd";
1766 execve(a[0], a, env_use);
1768 a[0] = (char*) "/lib/systemd/systemd";
1769 execve(a[0], a, env_use);
1771 a[0] = (char*) "/sbin/init";
1772 execve(a[0], a, env_use);
1773 } else if (argc > optind)
1774 execvpe(argv[optind], argv + optind, env_use);
/* No command given: start a login shell ("-bash") in the home directory. */
1776 chdir(home ? home : "/root");
1777 execle("/bin/bash", "-bash", NULL, env_use);
1780 log_error("execv() failed: %m");
1783 _exit(EXIT_FAILURE);
/* From here on: supervisor side. */
1789 r = register_machine(pid);
1793 r = move_network_interfaces(pid);
/* Release the child waiting in eventfd_read(). */
1797 eventfd_write(sync_fd, 1);
1798 close_nointr_nofail(sync_fd);
1801 k = process_pty(master, &mask, arg_boot ? pid : 0, SIGRTMIN+3);
1810 /* Kill if it is not dead yet anyway */
1811 terminate_machine(pid);
1813 /* Redundant, but better safe than sorry */
1816 k = wait_for_terminate(pid, &status);
/* Interpret how the container's leader ended: a normal exit yields its exit
 * code; being killed by SIGINT is reported as a shutdown and by SIGHUP as a
 * reboot request; any other signal or a core dump is an error. */
1824 if (status.si_code == CLD_EXITED) {
1825 r = status.si_status;
1826 if (status.si_status != 0) {
1827 log_error("Container %s failed with error code %i.", arg_machine, status.si_status);
1832 log_debug("Container %s exited successfully.", arg_machine);
1834 } else if (status.si_code == CLD_KILLED &&
1835 status.si_status == SIGINT) {
1838 log_info("Container %s has been shut down.", arg_machine);
1841 } else if (status.si_code == CLD_KILLED &&
1842 status.si_status == SIGHUP) {
1845 log_info("Container %s is being rebooted.", arg_machine);
1847 } else if (status.si_code == CLD_KILLED ||
1848 status.si_code == CLD_DUMPED) {
1850 log_error("Container %s terminated by signal %s.", arg_machine, signal_to_string(status.si_status));
1854 log_error("Container %s failed due to unknown reason.", arg_machine);
1864 free(arg_directory);
/* Only the array is freed; its elements were pushed from optarg in
 * parse_argv(). */
1867 free(arg_network_interfaces);