1 /*-*- Mode: C; c-basic-offset: 8; indent-tabs-mode: nil -*-*/
4 This file is part of systemd.
6 Copyright 2010 Lennart Poettering
8 systemd is free software; you can redistribute it and/or modify it
9 under the terms of the GNU Lesser General Public License as published by
10 the Free Software Foundation; either version 2.1 of the License, or
11 (at your option) any later version.
13 systemd is distributed in the hope that it will be useful, but
14 WITHOUT ANY WARRANTY; without even the implied warranty of
15 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
16 Lesser General Public License for more details.
18 You should have received a copy of the GNU Lesser General Public License
19 along with systemd; If not, see <http://www.gnu.org/licenses/>.
25 #include <sys/types.h>
26 #include <sys/syscall.h>
27 #include <sys/mount.h>
33 #include <sys/prctl.h>
34 #include <sys/capability.h>
37 #include <sys/signalfd.h>
41 #include <sys/socket.h>
42 #include <linux/netlink.h>
43 #include <sys/eventfd.h>
45 #include <linux/veth.h>
46 #include <sys/personality.h>
49 #include <selinux/selinux.h>
56 #include "sd-daemon.h"
66 #include "cgroup-util.h"
68 #include "path-util.h"
69 #include "loopback-setup.h"
70 #include "dev-setup.h"
75 #include "bus-error.h"
77 #include "bus-kernel.h"
80 #include "rtnl-util.h"
81 #include "udev-util.h"
84 #include "seccomp-util.h"
/* Journal link mode. parse_argv() assigns LINK_AUTO, LINK_NO, LINK_GUEST or
 * LINK_HOST to arg_link_journal. */
87 typedef enum LinkJournal {
/* Command line settings. The initializers are the defaults that stay in
 * effect unless parse_argv() overrides them. */
94 static char *arg_directory = NULL;
95 static char *arg_user = NULL;
96 static sd_id128_t arg_uuid = {};
97 static char *arg_machine = NULL;
98 static const char *arg_selinux_context = NULL;
99 static const char *arg_selinux_apifs_context = NULL;
100 static const char *arg_slice = NULL;
101 static bool arg_private_network = false;
102 static bool arg_read_only = false;
103 static bool arg_boot = false;
104 static LinkJournal arg_link_journal = LINK_AUTO;
/* Bit mask (one bit per CAP_* number) of the capabilities retained by
 * default. parse_argv() ORs in the --capability= bits and masks out the
 * --drop-capability= bits; drop_capabilities() passes the complement on. */
105 static uint64_t arg_retain =
106 (1ULL << CAP_CHOWN) |
107 (1ULL << CAP_DAC_OVERRIDE) |
108 (1ULL << CAP_DAC_READ_SEARCH) |
109 (1ULL << CAP_FOWNER) |
110 (1ULL << CAP_FSETID) |
111 (1ULL << CAP_IPC_OWNER) |
113 (1ULL << CAP_LEASE) |
114 (1ULL << CAP_LINUX_IMMUTABLE) |
115 (1ULL << CAP_NET_BIND_SERVICE) |
116 (1ULL << CAP_NET_BROADCAST) |
117 (1ULL << CAP_NET_RAW) |
118 (1ULL << CAP_SETGID) |
119 (1ULL << CAP_SETFCAP) |
120 (1ULL << CAP_SETPCAP) |
121 (1ULL << CAP_SETUID) |
122 (1ULL << CAP_SYS_ADMIN) |
123 (1ULL << CAP_SYS_CHROOT) |
124 (1ULL << CAP_SYS_NICE) |
125 (1ULL << CAP_SYS_PTRACE) |
126 (1ULL << CAP_SYS_TTY_CONFIG) |
127 (1ULL << CAP_SYS_RESOURCE) |
128 (1ULL << CAP_SYS_BOOT) |
129 (1ULL << CAP_AUDIT_WRITE) |
130 (1ULL << CAP_AUDIT_CONTROL) |
/* arg_bind and arg_bind_ro hold flat lists of (source, destination) pairs;
 * parse_argv() appends both paths per option and mount_binds() walks the
 * list pairwise. */
132 static char **arg_bind = NULL;
133 static char **arg_bind_ro = NULL;
134 static char **arg_setenv = NULL;
135 static bool arg_quiet = false;
136 static bool arg_share_system = false;
137 static bool arg_register = true;
138 static bool arg_keep_unit = false;
139 static char **arg_network_interfaces = NULL;
140 static char **arg_network_macvlan = NULL;
141 static bool arg_network_veth = false;
142 static const char *arg_network_bridge = NULL;
/* 0xffffffffLU is also the value parse_argv() compares against to detect an
 * unknown or unsupported --personality= argument. */
143 static unsigned long arg_personality = 0xffffffffLU;
/* Prints the usage synopsis and the option overview to stdout. The program
 * name shown is program_invocation_short_name.
 *
 * NOTE(review): the "-j" line below says it is equivalent to
 * --link-journal=host, while parse_argv() contains an assignment of
 * LINK_GUEST directly ahead of the --link-journal= case (its case label is
 * not visible in this excerpt) -- confirm which of the two is intended. */
145 static int help(void) {
147 printf("%s [OPTIONS...] [PATH] [ARGUMENTS...]\n\n"
148 "Spawn a minimal namespace container for debugging, testing and building.\n\n"
149 " -h --help Show this help\n"
150 " --version Print version string\n"
151 " -q --quiet Do not show status information\n"
152 " -D --directory=NAME Root directory for the container\n"
153 " -b --boot Boot up full system (i.e. invoke init)\n"
154 " -u --user=USER Run the command under specified user or uid\n"
155 " -M --machine=NAME Set the machine name for the container\n"
156 " --uuid=UUID Set a specific machine UUID for the container\n"
157 " -S --slice=SLICE Place the container in the specified slice\n"
158 " --private-network Disable network in container\n"
159 " --network-interface=INTERFACE\n"
160 " Assign an existing network interface to the\n"
162 " --network-macvlan=INTERFACE\n"
163 " Create a macvlan network interface based on an\n"
164 " existing network interface to the container\n"
165 " --network-veth Add a virtual ethernet connection between host\n"
167 " --network-bridge=INTERFACE\n"
168 " Add a virtual ethernet connection between host\n"
169 " and container and add it to an existing bridge on\n"
171 " -Z --selinux-context=SECLABEL\n"
172 " Set the SELinux security context to be used by\n"
173 " processes in the container\n"
174 " -L --selinux-apifs-context=SECLABEL\n"
175 " Set the SELinux security context to be used by\n"
176 " API/tmpfs file systems in the container\n"
177 " --capability=CAP In addition to the default, retain specified\n"
179 " --drop-capability=CAP Drop the specified capability from the default set\n"
180 " --link-journal=MODE Link up guest journal, one of no, auto, guest, host\n"
181 " -j Equivalent to --link-journal=host\n"
182 " --read-only Mount the root directory read-only\n"
183 " --bind=PATH[:PATH] Bind mount a file or directory from the host into\n"
185 " --bind-ro=PATH[:PATH] Similar, but creates a read-only bind mount\n"
186 " --setenv=NAME=VALUE Pass an environment variable to PID 1\n"
187 " --share-system Share system namespaces with host\n"
188 " --register=BOOLEAN Register container as machine\n"
189 " --keep-unit Do not register a scope for the machine, reuse\n"
190 " the service unit nspawn is running in\n",
191 program_invocation_short_name);
/* Parses the command line with getopt_long() and stores the results in the
 * file-scope arg_* variables.
 *
 * Short options accepted: -h -D -u -b -L -M -j -S -Z -q; the leading '+' in
 * the option string is passed through to getopt_long() unchanged.
 * Capabilities named by --capability= / --drop-capability= are collected in
 * the local masks plus/minus and merged into arg_retain after the loop. */
196 static int parse_argv(int argc, char *argv[]) {
212 ARG_NETWORK_INTERFACE,
219 static const struct option options[] = {
220 { "help", no_argument, NULL, 'h' },
221 { "version", no_argument, NULL, ARG_VERSION },
222 { "directory", required_argument, NULL, 'D' },
223 { "user", required_argument, NULL, 'u' },
224 { "private-network", no_argument, NULL, ARG_PRIVATE_NETWORK },
225 { "boot", no_argument, NULL, 'b' },
226 { "uuid", required_argument, NULL, ARG_UUID },
227 { "read-only", no_argument, NULL, ARG_READ_ONLY },
228 { "capability", required_argument, NULL, ARG_CAPABILITY },
229 { "drop-capability", required_argument, NULL, ARG_DROP_CAPABILITY },
230 { "link-journal", required_argument, NULL, ARG_LINK_JOURNAL },
231 { "bind", required_argument, NULL, ARG_BIND },
232 { "bind-ro", required_argument, NULL, ARG_BIND_RO },
233 { "machine", required_argument, NULL, 'M' },
234 { "slice", required_argument, NULL, 'S' },
235 { "setenv", required_argument, NULL, ARG_SETENV },
236 { "selinux-context", required_argument, NULL, 'Z' },
237 { "selinux-apifs-context", required_argument, NULL, 'L' },
238 { "quiet", no_argument, NULL, 'q' },
239 { "share-system", no_argument, NULL, ARG_SHARE_SYSTEM },
240 { "register", required_argument, NULL, ARG_REGISTER },
241 { "keep-unit", no_argument, NULL, ARG_KEEP_UNIT },
242 { "network-interface", required_argument, NULL, ARG_NETWORK_INTERFACE },
243 { "network-macvlan", required_argument, NULL, ARG_NETWORK_MACVLAN },
244 { "network-veth", no_argument, NULL, ARG_NETWORK_VETH },
245 { "network-bridge", required_argument, NULL, ARG_NETWORK_BRIDGE },
246 { "personality", required_argument, NULL, ARG_PERSONALITY },
/* Capability bits to add to (plus) and remove from (minus) arg_retain. */
251 uint64_t plus = 0, minus = 0;
256 while ((c = getopt_long(argc, argv, "+hD:u:bL:M:jS:Z:q", options, NULL)) >= 0) {
264 puts(PACKAGE_STRING);
265 puts(SYSTEMD_FEATURES);
270 arg_directory = canonicalize_file_name(optarg);
271 if (!arg_directory) {
272 log_error("Invalid root directory: %m");
280 arg_user = strdup(optarg);
286 case ARG_NETWORK_BRIDGE:
287 arg_network_bridge = optarg;
291 case ARG_NETWORK_VETH:
292 arg_network_veth = true;
293 arg_private_network = true;
296 case ARG_NETWORK_INTERFACE:
297 if (strv_extend(&arg_network_interfaces, optarg) < 0)
300 arg_private_network = true;
303 case ARG_NETWORK_MACVLAN:
304 if (strv_extend(&arg_network_macvlan, optarg) < 0)
309 case ARG_PRIVATE_NETWORK:
310 arg_private_network = true;
318 r = sd_id128_from_string(optarg, &arg_uuid);
320 log_error("Invalid UUID: %s", optarg);
330 if (isempty(optarg)) {
335 if (!hostname_is_valid(optarg)) {
336 log_error("Invalid machine name: %s", optarg);
341 arg_machine = strdup(optarg);
349 arg_selinux_context = optarg;
353 arg_selinux_apifs_context = optarg;
357 arg_read_only = true;
361 case ARG_DROP_CAPABILITY: {
/* The argument is a comma separated list of capability names; the word
 * "all" sets every bit of the respective mask. */
365 FOREACH_WORD_SEPARATOR(word, length, optarg, ",", state) {
366 _cleanup_free_ char *t;
369 t = strndup(word, length);
373 if (streq(t, "all")) {
374 if (c == ARG_CAPABILITY)
375 plus = (uint64_t) -1;
377 minus = (uint64_t) -1;
379 if (cap_from_name(t, &cap) < 0) {
380 log_error("Failed to parse capability %s.", t);
384 if (c == ARG_CAPABILITY)
385 plus |= 1ULL << (uint64_t) cap;
387 minus |= 1ULL << (uint64_t) cap;
395 arg_link_journal = LINK_GUEST;
398 case ARG_LINK_JOURNAL:
399 if (streq(optarg, "auto"))
400 arg_link_journal = LINK_AUTO;
401 else if (streq(optarg, "no"))
402 arg_link_journal = LINK_NO;
403 else if (streq(optarg, "guest"))
404 arg_link_journal = LINK_GUEST;
405 else if (streq(optarg, "host"))
406 arg_link_journal = LINK_HOST;
408 log_error("Failed to parse link journal mode %s", optarg);
416 _cleanup_free_ char *a = NULL, *b = NULL;
/* --bind= and --bind-ro= share this code; x selects the list to extend. */
420 x = c == ARG_BIND ? &arg_bind : &arg_bind_ro;
422 e = strchr(optarg, ':');
424 a = strndup(optarg, e - optarg);
434 if (!path_is_absolute(a) || !path_is_absolute(b)) {
435 log_error("Invalid bind mount specification: %s", optarg);
439 r = strv_extend(x, a);
443 r = strv_extend(x, b);
453 if (!env_assignment_is_valid(optarg)) {
454 log_error("Environment variable assignment '%s' is not valid.", optarg);
458 n = strv_env_set(arg_setenv, optarg);
462 strv_free(arg_setenv);
471 case ARG_SHARE_SYSTEM:
472 arg_share_system = true;
476 r = parse_boolean(optarg);
478 log_error("Failed to parse --register= argument: %s", optarg);
486 arg_keep_unit = true;
489 case ARG_PERSONALITY:
491 arg_personality = personality_from_string(optarg);
492 if (arg_personality == 0xffffffffLU) {
493 log_error("Unknown or unsupported personality '%s'.", optarg);
503 assert_not_reached("Unhandled option");
/* Sharing the system namespaces turns machine registration off. */
507 if (arg_share_system)
508 arg_register = false;
510 if (arg_boot && arg_share_system) {
511 log_error("--boot and --share-system may not be combined.");
515 if (arg_keep_unit && cg_pid_get_owner_uid(0, NULL) >= 0) {
516 log_error("--keep-unit may not be used when invoked from a user session.");
/* Final capability set: defaults plus requested additions, plus CAP_NET_ADMIN
 * when a private network is used, minus everything explicitly dropped. */
520 arg_retain = (arg_retain | plus | (arg_private_network ? 1ULL << CAP_NET_ADMIN : 0)) & ~minus;
/* Mounts the file systems listed in mount_table below the directory dest.
 *
 * The target of each entry is dest + "/" + the entry's where field. Entries
 * that have a source (what) and whose target already is a mount point are
 * skipped; entries with a NULL source are the remount steps that make the
 * preceding bind mount read-only. When arg_selinux_apifs_context is set, the
 * tmpfs and devpts entries get a context="..." mount option appended.
 * The mount() condition also involves the entry's fatal flag; the operator
 * joining the two is not visible in this excerpt. */
525 static int mount_all(const char *dest) {
527 typedef struct MountPoint {
536 static const MountPoint mount_table[] = {
537 { "proc", "/proc", "proc", NULL, MS_NOSUID|MS_NOEXEC|MS_NODEV, true },
538 { "/proc/sys", "/proc/sys", NULL, NULL, MS_BIND, true }, /* Bind mount first */
539 { NULL, "/proc/sys", NULL, NULL, MS_BIND|MS_RDONLY|MS_REMOUNT, true }, /* Then, make it r/o */
540 { "sysfs", "/sys", "sysfs", NULL, MS_RDONLY|MS_NOSUID|MS_NOEXEC|MS_NODEV, true },
541 { "tmpfs", "/dev", "tmpfs", "mode=755", MS_NOSUID|MS_STRICTATIME, true },
542 { "devpts", "/dev/pts", "devpts","newinstance,ptmxmode=0666,mode=620,gid=" STRINGIFY(TTY_GID), MS_NOSUID|MS_NOEXEC, true },
543 { "tmpfs", "/dev/shm", "tmpfs", "mode=1777", MS_NOSUID|MS_NODEV|MS_STRICTATIME, true },
544 { "tmpfs", "/run", "tmpfs", "mode=755", MS_NOSUID|MS_NODEV|MS_STRICTATIME, true },
546 { "/sys/fs/selinux", "/sys/fs/selinux", NULL, NULL, MS_BIND, false }, /* Bind mount first */
547 { NULL, "/sys/fs/selinux", NULL, NULL, MS_BIND|MS_RDONLY|MS_REMOUNT, false }, /* Then, make it r/o */
554 for (k = 0; k < ELEMENTSOF(mount_table); k++) {
555 _cleanup_free_ char *where = NULL;
557 _cleanup_free_ char *options = NULL;
562 where = strjoin(dest, "/", mount_table[k].where, NULL);
566 t = path_is_mount_point(where, true);
568 log_error("Failed to detect whether %s is a mount point: %s", where, strerror(-t));
576 /* Skip this entry if it is not a remount. */
577 if (mount_table[k].what && t > 0)
/* The result of mkdir_p() is not checked here. */
580 mkdir_p(where, 0755);
583 if (arg_selinux_apifs_context &&
584 (streq_ptr(mount_table[k].what, "tmpfs") || streq_ptr(mount_table[k].what, "devpts"))) {
585 options = strjoin(mount_table[k].options, ",context=\"", arg_selinux_apifs_context, "\"", NULL);
592 o = mount_table[k].options;
595 if (mount(mount_table[k].what,
598 mount_table[k].flags,
600 mount_table[k].fatal) {
602 log_error("mount(%s) failed: %m", where);
/* Bind mounts every (source, destination) pair of the list l into the
 * container tree rooted at dest.
 *
 * dest:  root directory of the container; the destination of each pair is
 *        appended to it.
 * l:     flat string list walked pairwise (source, destination).
 * flags: if non-zero, a second mount() call with MS_REMOUNT|MS_BIND|flags is
 *        issued on the new mount.
 *
 * If the destination exists, its file type must match the source's. If it
 * does not exist (ENOENT), the parent directories and a mount point matching
 * the source's file type are created first. */
612 static int mount_binds(const char *dest, char **l, unsigned long flags) {
615 STRV_FOREACH_PAIR(x, y, l) {
617 struct stat source_st, dest_st;
620 if (stat(*x, &source_st) < 0) {
621 log_error("failed to stat %s: %m", *x);
625 where = strappenda(dest, *y);
626 r = stat(where, &dest_st);
628 if ((source_st.st_mode & S_IFMT) != (dest_st.st_mode & S_IFMT)) {
629 log_error("The file types of %s and %s do not match. Refusing bind mount",
633 } else if (errno == ENOENT) {
634 r = mkdir_parents_label(where, 0755);
636 log_error("Failed to bind mount %s: %s", *x, strerror(-r));
640 log_error("Failed to bind mount %s: %s", *x, strerror(errno));
643 /* Create the mount point, but be conservative -- refuse to create block
644 * and char devices. */
645 if (S_ISDIR(source_st.st_mode))
646 mkdir_label(where, 0755);
647 else if (S_ISFIFO(source_st.st_mode))
649 else if (S_ISSOCK(source_st.st_mode))
650 mknod(where, 0644 | S_IFSOCK, 0);
651 else if (S_ISREG(source_st.st_mode))
654 log_error("Refusing to create mountpoint for file: %s", *x);
658 if (mount(*x, where, "bind", MS_BIND, NULL) < 0) {
659 log_error("mount(%s) failed: %m", where);
663 if (flags && mount(NULL, where, NULL, MS_REMOUNT|MS_BIND|flags, NULL) < 0) {
664 log_error("mount(%s) failed: %m", where);
/* Makes <dest>/etc/localtime a symlink to the same zone the host's
 * /etc/localtime points to.
 *
 * Only acts when the host's /etc/localtime is a symlink into
 * /usr/share/zoneinfo/ (relative or absolute form) and the zone file exists
 * below <dest>/usr/share/zoneinfo/. If the container's link already names the
 * same zone, the "do nothing" branch is taken. The new link uses the relative
 * form "../usr/share/zoneinfo/<zone>". */
672 static int setup_timezone(const char *dest) {
673 _cleanup_free_ char *where = NULL, *p = NULL, *q = NULL, *check = NULL, *what = NULL;
679 /* Fix the timezone, if possible */
680 r = readlink_malloc("/etc/localtime", &p);
682 log_warning("/etc/localtime is not a symlink, not updating container timezone.");
/* z: zone name of the host, relative to the zoneinfo directory. */
686 z = path_startswith(p, "../usr/share/zoneinfo/");
688 z = path_startswith(p, "/usr/share/zoneinfo/");
690 log_warning("/etc/localtime does not point into /usr/share/zoneinfo/, not updating container timezone.");
694 where = strappend(dest, "/etc/localtime");
/* y: zone name the container currently links to, if any. */
698 r = readlink_malloc(where, &q);
700 y = path_startswith(q, "../usr/share/zoneinfo/");
702 y = path_startswith(q, "/usr/share/zoneinfo/");
705 /* Already pointing to the right place? Then do nothing .. */
706 if (y && streq(y, z))
710 check = strjoin(dest, "/usr/share/zoneinfo/", z, NULL);
714 if (access(check, F_OK) < 0) {
715 log_warning("Timezone %s does not exist in container, not updating container timezone.", z);
719 what = strappend("../usr/share/zoneinfo/", z);
724 if (symlink(what, where) < 0) {
725 log_error("Failed to correct timezone of container: %m");
/* Copies the host's /etc/resolv.conf to <dest>/etc/resolv.conf.
 *
 * arg_private_network is checked first (the statement it guards is not
 * visible in this excerpt). The copy is best effort: the return value of
 * copy_file() is deliberately ignored. */
732 static int setup_resolv_conf(const char *dest) {
733 char _cleanup_free_ *where = NULL;
737 if (arg_private_network)
740 /* Fix resolv.conf, if possible */
741 where = strappend(dest, "/etc/resolv.conf");
745 /* We don't really care for the results of this really. If it
746 * fails, it fails, but meh... */
747 copy_file("/etc/resolv.conf", where, O_TRUNC|O_NOFOLLOW);
/* Gives the container its own boot ID.
 *
 * A random 128 bit ID is generated, formatted as a UUID string and written to
 * <dest>/dev/proc-sys-kernel-random-boot-id; that file is then bind mounted
 * over <dest>/proc/sys/kernel/random/boot_id. A failing read-only remount is
 * reported as a warning only. arg_share_system is checked first (the
 * statement it guards is not visible in this excerpt). */
752 static int setup_boot_id(const char *dest) {
753 _cleanup_free_ char *from = NULL, *to = NULL;
760 if (arg_share_system)
763 /* Generate a new randomized boot ID, so that each boot-up of
764 * the container gets a new one */
766 from = strappend(dest, "/dev/proc-sys-kernel-random-boot-id");
767 to = strappend(dest, "/proc/sys/kernel/random/boot_id");
771 r = sd_id128_randomize(&rnd);
773 log_error("Failed to generate random boot id: %s", strerror(-r));
/* 8-4-4-4-12 hex digit UUID notation. */
777 snprintf(as_uuid, sizeof(as_uuid),
778 "%02x%02x%02x%02x-%02x%02x-%02x%02x-%02x%02x-%02x%02x%02x%02x%02x%02x",
779 SD_ID128_FORMAT_VAL(rnd));
780 char_array_0(as_uuid);
782 r = write_string_file(from, as_uuid);
784 log_error("Failed to write boot id: %s", strerror(-r));
788 if (mount(from, to, "bind", MS_BIND, NULL) < 0) {
789 log_error("Failed to bind mount boot id: %m");
791 } else if (mount(from, to, "bind", MS_BIND|MS_REMOUNT|MS_RDONLY, NULL))
792 log_warning("Failed to make boot id read-only: %m");
/* Re-creates the device nodes named in devnodes below <dest>/dev/.
 *
 * For each name the host node /dev/<name> is stat()ed and a node with the
 * same mode and device number is created at <dest>/dev/<name>. A host node
 * that does not exist (ENOENT) is not reported as a stat failure; a host
 * path that is neither a char nor a block device is rejected.
 *
 * The mknod() failure message names the path handed to mknod() (to), not the
 * container root. */
798 static int copy_devnodes(const char *dest) {
800 static const char devnodes[] =
810 _cleanup_umask_ mode_t u;
816 NULSTR_FOREACH(d, devnodes) {
817 _cleanup_free_ char *from = NULL, *to = NULL;
820 from = strappend("/dev/", d);
821 to = strjoin(dest, "/dev/", d, NULL);
825 if (stat(from, &st) < 0) {
827 if (errno != ENOENT) {
828 log_error("Failed to stat %s: %m", from);
832 } else if (!S_ISCHR(st.st_mode) && !S_ISBLK(st.st_mode)) {
834 log_error("%s is not a char or block device, cannot copy", from);
837 } else if (mknod(to, st.st_mode, st.st_rdev) < 0) {
839 log_error("mknod(%s) failed: %m", to);
/* Creates <dest>/dev/ptmx as a relative symlink to "pts/ptmx". */
847 static int setup_ptmx(const char *dest) {
848 _cleanup_free_ char *p = NULL;
850 p = strappend(dest, "/dev/ptmx");
854 if (symlink("pts/ptmx", p) < 0) {
855 log_error("Failed to create /dev/ptmx symlink: %m");
/* Makes the TTY given by console available as <dest>/dev/console.
 *
 * console must be a character device. Its mode and ownership are set to
 * 0600, 0:0; then a device node with the same file type and device number
 * and mode 0600 is created at <dest>/dev/console and console is bind mounted
 * on top of it. */
862 static int setup_dev_console(const char *dest, const char *console) {
864 _cleanup_free_ char *to = NULL;
866 _cleanup_umask_ mode_t u;
873 if (stat(console, &st) < 0) {
874 log_error("Failed to stat %s: %m", console);
877 } else if (!S_ISCHR(st.st_mode)) {
878 log_error("/dev/console is not a char device");
882 r = chmod_and_chown(console, 0600, 0, 0);
884 log_error("Failed to correct access mode for TTY: %s", strerror(-r));
888 if (asprintf(&to, "%s/dev/console", dest) < 0)
891 /* We need to bind mount the right tty to /dev/console since
892 * ptys can only exist on pts file systems. To have something
893 * to bind mount things on we create a device node first, that
894 * has the right major/minor (note that the major minor
895 * doesn't actually matter here, since we mount it over
898 if (mknod(to, (st.st_mode & ~07777) | 0600, st.st_rdev) < 0) {
899 log_error("mknod() for /dev/console failed: %m");
903 if (mount(console, to, "bind", MS_BIND, NULL) < 0) {
904 log_error("Bind mount for /dev/console failed: %m");
/* Provides a FIFO as <dest>/proc/kmsg and hands its file descriptor to the
 * peer of kmsg_socket.
 *
 * dest:        root directory of the container.
 * kmsg_socket: socket (asserted to be >= 0) over which the FIFO's fd is sent
 *              as an SCM_RIGHTS control message.
 *
 * The FIFO is created as <dest>/dev/kmsg with mode 0600, owner 0:0, bind
 * mounted to <dest>/proc/kmsg and opened O_RDWR|O_NDELAY|O_CLOEXEC. The local
 * fd is closed right after sendmsg(). */
911 static int setup_kmsg(const char *dest, int kmsg_socket) {
912 _cleanup_free_ char *from = NULL, *to = NULL;
914 _cleanup_umask_ mode_t u;
916 struct cmsghdr cmsghdr;
917 uint8_t buf[CMSG_SPACE(sizeof(int))];
920 .msg_control = &control,
921 .msg_controllen = sizeof(control),
923 struct cmsghdr *cmsg;
926 assert(kmsg_socket >= 0);
930 /* We create the kmsg FIFO as /dev/kmsg, but immediately
931 * delete it after bind mounting it to /proc/kmsg. While FIFOs
932 * on the reading side behave very similar to /proc/kmsg,
933 * their writing side behaves differently from /dev/kmsg in
934 * that writing blocks when nothing is reading. In order to
935 * avoid any problems with containers deadlocking due to this
936 * we simply make /dev/kmsg unavailable to the container. */
937 if (asprintf(&from, "%s/dev/kmsg", dest) < 0 ||
938 asprintf(&to, "%s/proc/kmsg", dest) < 0)
941 if (mkfifo(from, 0600) < 0) {
942 log_error("mkfifo() for /dev/kmsg failed: %m");
946 r = chmod_and_chown(from, 0600, 0, 0);
948 log_error("Failed to correct access mode for /dev/kmsg: %s", strerror(-r));
952 if (mount(from, to, "bind", MS_BIND, NULL) < 0) {
953 log_error("Bind mount for /proc/kmsg failed: %m");
957 fd = open(from, O_RDWR|O_NDELAY|O_CLOEXEC);
959 log_error("Failed to open fifo: %m");
/* Build a single SCM_RIGHTS control message carrying fd. */
963 cmsg = CMSG_FIRSTHDR(&mh);
964 cmsg->cmsg_level = SOL_SOCKET;
965 cmsg->cmsg_type = SCM_RIGHTS;
966 cmsg->cmsg_len = CMSG_LEN(sizeof(int));
967 memcpy(CMSG_DATA(cmsg), &fd, sizeof(int));
969 mh.msg_controllen = cmsg->cmsg_len;
971 /* Store away the fd in the socket, so that it stays open as
972 * long as we run the child */
973 k = sendmsg(kmsg_socket, &mh, MSG_DONTWAIT|MSG_NOSIGNAL);
974 close_nointr_nofail(fd);
977 log_error("Failed to send FIFO fd: %m");
981 /* And now make the FIFO unavailable as /dev/kmsg... */
/* Sets the host name of the calling process' UTS namespace to arg_machine via
 * sethostname(). arg_share_system is checked first (the statement it guards
 * is not visible in this excerpt). */
986 static int setup_hostname(void) {
988 if (arg_share_system)
991 if (sethostname(arg_machine, strlen(arg_machine)) < 0)
/* Connects the journal directory of the container with the host's, as
 * selected by arg_link_journal.
 *
 * directory: root directory of the container.
 *
 * The container's machine ID is read from <directory>/etc/machine-id and must
 * differ from the host's. The two paths involved are
 *   p = /var/log/journal/<machine id>               (host side)
 *   q = <directory>/var/log/journal/<machine id>    (container side)
 * LINK_GUEST symlinks p to q; LINK_HOST creates p, and p is bind mounted onto
 * q. With LINK_AUTO several error conditions are downgraded; the exact
 * outcome of those branches is not visible in this excerpt. */
997 static int setup_journal(const char *directory) {
998 sd_id128_t machine_id, this_id;
999 _cleanup_free_ char *p = NULL, *b = NULL, *q = NULL, *d = NULL;
1003 p = strappend(directory, "/etc/machine-id");
1007 r = read_one_line_file(p, &b);
1008 if (r == -ENOENT && arg_link_journal == LINK_AUTO)
1011 log_error("Failed to read machine ID from %s: %s", p, strerror(-r));
1016 if (isempty(id) && arg_link_journal == LINK_AUTO)
1019 /* Verify validity */
1020 r = sd_id128_from_string(id, &machine_id);
1022 log_error("Failed to parse machine ID from %s: %s", p, strerror(-r));
1026 r = sd_id128_get_machine(&this_id);
1028 log_error("Failed to retrieve machine ID: %s", strerror(-r));
1032 if (sd_id128_equal(machine_id, this_id)) {
1033 log_full(arg_link_journal == LINK_AUTO ? LOG_WARNING : LOG_ERR,
1034 "Host and machine ids are equal (%s): refusing to link journals", id);
1035 if (arg_link_journal == LINK_AUTO)
1041 if (arg_link_journal == LINK_NO)
/* From here on p is the host side path and q the container side path. */
1045 p = strappend("/var/log/journal/", id);
1046 q = strjoin(directory, "/var/log/journal/", id, NULL);
1050 if (path_is_mount_point(p, false) > 0) {
1051 if (arg_link_journal != LINK_AUTO) {
1052 log_error("%s: already a mount point, refusing to use for journal", p);
1059 if (path_is_mount_point(q, false) > 0) {
1060 if (arg_link_journal != LINK_AUTO) {
1061 log_error("%s: already a mount point, refusing to use for journal", q);
/* Inspect what currently exists at p: symlink, something else, or nothing. */
1068 r = readlink_and_make_absolute(p, &d);
1070 if ((arg_link_journal == LINK_GUEST ||
1071 arg_link_journal == LINK_AUTO) &&
1074 r = mkdir_p(q, 0755);
1076 log_warning("failed to create directory %s: %m", q);
1080 if (unlink(p) < 0) {
1081 log_error("Failed to remove symlink %s: %m", p);
1084 } else if (r == -EINVAL) {
1086 if (arg_link_journal == LINK_GUEST &&
1089 if (errno == ENOTDIR) {
1090 log_error("%s already exists and is neither a symlink nor a directory", p);
1093 log_error("Failed to remove %s: %m", p);
1097 } else if (r != -ENOENT) {
1098 log_error("readlink(%s) failed: %m", p);
1102 if (arg_link_journal == LINK_GUEST) {
1104 if (symlink(q, p) < 0) {
1105 log_error("Failed to symlink %s to %s: %m", q, p);
1109 r = mkdir_p(q, 0755);
1111 log_warning("failed to create directory %s: %m", q);
1115 if (arg_link_journal == LINK_HOST) {
1116 r = mkdir_p(p, 0755);
1118 log_error("Failed to create %s: %m", p);
1122 } else if (access(p, F_OK) < 0)
1125 if (dir_is_empty(q) == 0) {
1126 log_error("%s not empty.", q);
1130 r = mkdir_p(q, 0755);
1132 log_error("Failed to create %s: %m", q);
1136 if (mount(p, q, "bind", MS_BIND, NULL) < 0) {
1137 log_error("Failed to bind mount journal from host into guest: %m");
/* Creates the directory <dest>/dev/kdbus (mode 0755) and bind mounts path
 * onto it.
 *
 * dest: root directory of the container.
 * path: source of the bind mount; the log message calls it the kdbus domain
 *       path. */
1144 static int setup_kdbus(const char *dest, const char *path) {
1150 p = strappenda(dest, "/dev/kdbus");
1151 if (mkdir(p, 0755) < 0) {
1152 log_error("Failed to create kdbus path: %m");
1156 if (mount(path, p, "bind", MS_BIND, NULL) < 0) {
1157 log_error("Failed to mount kdbus domain path: %m");
/* Hands the complement of arg_retain to capability_bounding_set_drop(), i.e.
 * every capability bit that is not set in arg_retain, and returns that
 * function's result. */
1164 static int drop_capabilities(void) {
1165 return capability_bounding_set_drop(~arg_retain, false);
/* Registers the container with org.freedesktop.machine1 on the system bus.
 *
 * pid: process ID of the container (its use is on lines not visible in this
 *      excerpt).
 *
 * With arg_keep_unit a single sd_bus_call_method() call is issued. Otherwise
 * the message is assembled by hand so that an array of (name, value) unit
 * properties can be appended: Slice (only if arg_slice is non-empty),
 * DevicePolicy=strict and a DeviceAllow list of 8 entries. Both variants pass
 * arg_uuid and arg_directory (empty string if unset). */
1168 static int register_machine(pid_t pid) {
1169 _cleanup_bus_error_free_ sd_bus_error error = SD_BUS_ERROR_NULL;
1170 _cleanup_bus_unref_ sd_bus *bus = NULL;
1176 r = sd_bus_default_system(&bus);
1178 log_error("Failed to open system bus: %s", strerror(-r));
1182 if (arg_keep_unit) {
1183 r = sd_bus_call_method(
1185 "org.freedesktop.machine1",
1186 "/org/freedesktop/machine1",
1187 "org.freedesktop.machine1.Manager",
1193 SD_BUS_MESSAGE_APPEND_ID128(arg_uuid),
1197 strempty(arg_directory));
1199 _cleanup_bus_message_unref_ sd_bus_message *m = NULL;
1201 r = sd_bus_message_new_method_call(
1204 "org.freedesktop.machine1",
1205 "/org/freedesktop/machine1",
1206 "org.freedesktop.machine1.Manager",
1209 log_error("Failed to create message: %s", strerror(-r));
1213 r = sd_bus_message_append(
1217 SD_BUS_MESSAGE_APPEND_ID128(arg_uuid),
1221 strempty(arg_directory));
1223 log_error("Failed to append message arguments: %s", strerror(-r));
1227 r = sd_bus_message_open_container(m, 'a', "(sv)");
1229 log_error("Failed to open container: %s", strerror(-r));
1233 if (!isempty(arg_slice)) {
1234 r = sd_bus_message_append(m, "(sv)", "Slice", "s", arg_slice);
1236 log_error("Failed to append slice: %s", strerror(-r));
1241 r = sd_bus_message_append(m, "(sv)", "DevicePolicy", "s", "strict");
1243 log_error("Failed to add device policy: %s", strerror(-r));
1247 r = sd_bus_message_append(m, "(sv)", "DeviceAllow", "a(ss)", 8,
1248 /* Allow the container to
1249 * access and create the API
1250 * device nodes, so that
1251 * PrivateDevices= in the
1252 * container can work
1257 "/dev/random", "rwm",
1258 "/dev/urandom", "rwm",
1260 /* Allow the container
1261 * access to ptys. However,
1263 * container to ever create
1264 * these device nodes. */
1265 "/dev/pts/ptmx", "rw",
1268 log_error("Failed to add device whitelist: %s", strerror(-r));
1272 r = sd_bus_message_close_container(m);
1274 log_error("Failed to close container: %s", strerror(-r));
1278 r = sd_bus_call(bus, m, 0, &error, NULL);
1282 log_error("Failed to register machine: %s", bus_error_message(&error, r));
/* Asks org.freedesktop.machine1 to terminate the machine.
 *
 * pid: process ID of the container (its use is on lines not visible in this
 *      excerpt).
 *
 * Two bus calls are made: one on the Manager interface whose reply carries
 * the machine's object path, and one on the Machine interface. A failure of
 * either call is only logged at debug level. */
1289 static int terminate_machine(pid_t pid) {
1290 _cleanup_bus_error_free_ sd_bus_error error = SD_BUS_ERROR_NULL;
1291 _cleanup_bus_message_unref_ sd_bus_message *reply = NULL;
1292 _cleanup_bus_unref_ sd_bus *bus = NULL;
1299 r = sd_bus_default_system(&bus);
1301 log_error("Failed to open system bus: %s", strerror(-r));
1305 r = sd_bus_call_method(
1307 "org.freedesktop.machine1",
1308 "/org/freedesktop/machine1",
1309 "org.freedesktop.machine1.Manager",
1316 /* Note that the machine might already have been
1317 * cleaned up automatically, hence don't consider it a
1318 * failure if we cannot get the machine object. */
1319 log_debug("Failed to get machine: %s", bus_error_message(&error, r));
/* The reply holds a single object path ("o"). */
1323 r = sd_bus_message_read(reply, "o", &path);
1325 return bus_log_parse_error(r);
1327 r = sd_bus_call_method(
1329 "org.freedesktop.machine1",
1331 "org.freedesktop.machine1.Machine",
1337 log_debug("Failed to terminate machine: %s", bus_error_message(&error, r));
/* Resets the audit login UID of the calling process.
 *
 * /proc/self/loginuid is read; unless it already contains "4294967295", that
 * value is written to it. arg_share_system is checked first (the statement
 * it guards is not visible in this excerpt). If the write fails, a
 * multi-line hint about audit-enabled kernels is logged; the hint announces
 * a 5s sleep. */
1344 static int reset_audit_loginuid(void) {
1345 _cleanup_free_ char *p = NULL;
1348 if (arg_share_system)
1351 r = read_one_line_file("/proc/self/loginuid", &p);
1355 log_error("Failed to read /proc/self/loginuid: %s", strerror(-r));
1359 /* Already reset? */
1360 if (streq(p, "4294967295"))
1363 r = write_string_file("/proc/self/loginuid", "4294967295");
1365 log_error("Failed to reset audit login UID. This probably means that your kernel is too\n"
1366 "old and you have audit enabled. Note that the auditing subsystem is known to\n"
1367 "be incompatible with containers on old kernels. Please make sure to upgrade\n"
1368 "your kernel or to turn off auditing with 'audit=0' on the kernel command line before\n"
1369 "using systemd-nspawn. Sleeping for 5s... (%s)\n", strerror(-r));
/* Creates a veth pair for the container via rtnetlink.
 *
 * pid:        process ID whose network namespace receives the peer end
 *             (passed as IFLA_NET_NS_PID).
 * iface_name: buffer of IFNAMSIZ bytes that receives the name of the host
 *             side interface: "vb-" (when arg_network_bridge is set) or
 *             "ve-" followed by arg_machine, truncated to fit and always
 *             NUL terminated.
 *
 * The peer end is named "host0". arg_private_network and arg_network_veth
 * are checked first (the statements they guard are not visible in this
 * excerpt). */
1377 static int setup_veth(pid_t pid, char iface_name[IFNAMSIZ]) {
1378 _cleanup_rtnl_message_unref_ sd_rtnl_message *m = NULL;
1379 _cleanup_rtnl_unref_ sd_rtnl *rtnl = NULL;
1382 if (!arg_private_network)
1385 if (!arg_network_veth)
1388 /* Use two different interface name prefixes depending whether
1389 * we are in bridge mode or not. */
1390 if (arg_network_bridge)
1391 memcpy(iface_name, "vb-", 3);
1393 memcpy(iface_name, "ve-", 3);
1395 strncpy(iface_name+3, arg_machine, IFNAMSIZ - 4);
/* strncpy() leaves the destination unterminated when the source is
 * truncated, so the last byte of the buffer is reserved for the NUL. */
iface_name[IFNAMSIZ - 1] = 0;
1397 r = sd_rtnl_open(&rtnl, 0);
1399 log_error("Failed to connect to netlink: %s", strerror(-r));
1403 r = sd_rtnl_message_new_link(rtnl, &m, RTM_NEWLINK, 0);
1405 log_error("Failed to allocate netlink message: %s", strerror(-r));
1409 r = sd_rtnl_message_append_string(m, IFLA_IFNAME, iface_name);
1411 log_error("Failed to add netlink interface name: %s", strerror(-r));
/* Nested attributes: IFLA_LINKINFO { kind "veth", IFLA_INFO_DATA {
 * VETH_INFO_PEER { name, namespace PID } } }. */
1415 r = sd_rtnl_message_open_container(m, IFLA_LINKINFO);
1417 log_error("Failed to open netlink container: %s", strerror(-r));
1421 r = sd_rtnl_message_append_string(m, IFLA_INFO_KIND, "veth");
1423 log_error("Failed to append netlink kind: %s", strerror(-r));
1427 r = sd_rtnl_message_open_container(m, IFLA_INFO_DATA);
1429 log_error("Failed to open netlink container: %s", strerror(-r));
1433 r = sd_rtnl_message_open_container(m, VETH_INFO_PEER);
1435 log_error("Failed to open netlink container: %s", strerror(-r));
1439 r = sd_rtnl_message_append_string(m, IFLA_IFNAME, "host0");
1441 log_error("Failed to add netlink interface name: %s", strerror(-r));
1445 r = sd_rtnl_message_append_u32(m, IFLA_NET_NS_PID, pid);
1447 log_error("Failed to add netlink namespace field: %s", strerror(-r));
1451 r = sd_rtnl_message_close_container(m);
1453 log_error("Failed to close netlink container: %s", strerror(-r));
1457 r = sd_rtnl_message_close_container(m);
1459 log_error("Failed to close netlink container: %s", strerror(-r));
1463 r = sd_rtnl_message_close_container(m);
1465 log_error("Failed to close netlink container: %s", strerror(-r));
1469 r = sd_rtnl_call(rtnl, m, 0, NULL);
1471 log_error("Failed to add new veth interfaces: %s", strerror(-r));
/* Attaches the interface veth_name to the bridge arg_network_bridge.
 *
 * The bridge name is resolved to an interface index with if_nametoindex()
 * and sent as IFLA_MASTER in an RTM_SETLINK message that identifies the link
 * by IFLA_IFNAME = veth_name. arg_private_network, arg_network_veth and
 * arg_network_bridge are checked first (the statements they guard are not
 * visible in this excerpt). */
1478 static int setup_bridge(const char veth_name[]) {
1479 _cleanup_rtnl_message_unref_ sd_rtnl_message *m = NULL;
1480 _cleanup_rtnl_unref_ sd_rtnl *rtnl = NULL;
1483 if (!arg_private_network)
1486 if (!arg_network_veth)
1489 if (!arg_network_bridge)
1492 bridge = (int) if_nametoindex(arg_network_bridge);
1494 log_error("Failed to resolve interface %s: %m", arg_network_bridge);
1498 r = sd_rtnl_open(&rtnl, 0);
1500 log_error("Failed to connect to netlink: %s", strerror(-r));
1504 r = sd_rtnl_message_new_link(rtnl, &m, RTM_SETLINK, 0);
1506 log_error("Failed to allocate netlink message: %s", strerror(-r));
1510 r = sd_rtnl_message_append_string(m, IFLA_IFNAME, veth_name);
1512 log_error("Failed to add netlink interface name field: %s", strerror(-r));
1516 r = sd_rtnl_message_append_u32(m, IFLA_MASTER, bridge);
1518 log_error("Failed to add netlink master field: %s", strerror(-r));
1522 r = sd_rtnl_call(rtnl, m, 0, NULL);
1524 log_error("Failed to add veth interface to bridge: %s", strerror(-r));
/* Resolves the network interface name to its index and verifies through udev
 * that the device is initialized.
 *
 * udev: udev context used for the device lookup.
 * name: interface name.
 *
 * The udev device is looked up by the ID string "n<ifindex>". The return
 * statement is not visible in this excerpt; callers store the result in a
 * variable named ifi -- presumably the interface index on success. */
1531 static int parse_interface(struct udev *udev, const char *name) {
1532 _cleanup_udev_device_unref_ struct udev_device *d = NULL;
1533 char ifi_str[2 + DECIMAL_STR_MAX(int)];
1536 ifi = (int) if_nametoindex(name);
1538 log_error("Failed to resolve interface %s: %m", name);
1542 sprintf(ifi_str, "n%i", ifi);
1543 d = udev_device_new_from_device_id(udev, ifi_str);
1545 log_error("Failed to get udev device for interface %s: %m", name);
1549 if (udev_device_get_is_initialized(d) <= 0) {
1550 log_error("Network interface %s is not initialized yet.", name);
/* Moves every interface listed in arg_network_interfaces into the network
 * namespace of process pid.
 *
 * For each name the interface is looked up with parse_interface() and an
 * RTM_NEWLINK message for that link carrying IFLA_NET_NS_PID = pid is sent.
 * arg_private_network and an empty interface list are checked first (the
 * statements they guard are not visible in this excerpt). */
1557 static int move_network_interfaces(pid_t pid) {
1558 _cleanup_udev_unref_ struct udev *udev = NULL;
1559 _cleanup_rtnl_unref_ sd_rtnl *rtnl = NULL;
1563 if (!arg_private_network)
1566 if (strv_isempty(arg_network_interfaces))
1569 r = sd_rtnl_open(&rtnl, 0);
1571 log_error("Failed to connect to netlink: %s", strerror(-r));
1577 log_error("Failed to connect to udev.");
1581 STRV_FOREACH(i, arg_network_interfaces) {
1582 _cleanup_rtnl_message_unref_ sd_rtnl_message *m = NULL;
1585 ifi = parse_interface(udev, *i);
1589 r = sd_rtnl_message_new_link(rtnl, &m, RTM_NEWLINK, ifi);
1591 log_error("Failed to allocate netlink message: %s", strerror(-r));
1595 r = sd_rtnl_message_append_u32(m, IFLA_NET_NS_PID, pid);
1597 log_error("Failed to append namespace PID to netlink message: %s", strerror(-r));
1601 r = sd_rtnl_call(rtnl, m, 0, NULL);
1603 log_error("Failed to move interface %s to namespace: %s", *i, strerror(-r));
/* Creates one macvlan interface per entry of arg_network_macvlan inside the
 * network namespace of process pid.
 *
 * For each listed host interface a new link is requested with
 *   IFLA_LINK       = index of the host interface,
 *   IFLA_IFNAME     = "mv-" + host interface name, shortened to IFNAMSIZ-1,
 *   IFLA_NET_NS_PID = pid,
 *   kind "macvlan", mode MACVLAN_MODE_BRIDGE.
 * arg_private_network and an empty list are checked first (the statements
 * they guard are not visible in this excerpt). */
1611 static int setup_macvlan(pid_t pid) {
1612 _cleanup_udev_unref_ struct udev *udev = NULL;
1613 _cleanup_rtnl_unref_ sd_rtnl *rtnl = NULL;
1617 if (!arg_private_network)
1620 if (strv_isempty(arg_network_macvlan))
1623 r = sd_rtnl_open(&rtnl, 0);
1625 log_error("Failed to connect to netlink: %s", strerror(-r));
1631 log_error("Failed to connect to udev.");
1635 STRV_FOREACH(i, arg_network_macvlan) {
1636 _cleanup_rtnl_message_unref_ sd_rtnl_message *m = NULL;
1637 _cleanup_free_ char *n = NULL;
1640 ifi = parse_interface(udev, *i);
1644 r = sd_rtnl_message_new_link(rtnl, &m, RTM_NEWLINK, 0);
1646 log_error("Failed to allocate netlink message: %s", strerror(-r));
1650 r = sd_rtnl_message_append_u32(m, IFLA_LINK, ifi);
1652 log_error("Failed to add netlink interface index: %s", strerror(-r));
1656 n = strappend("mv-", *i);
1660 strshorten(n, IFNAMSIZ-1);
1662 r = sd_rtnl_message_append_string(m, IFLA_IFNAME, n);
1664 log_error("Failed to add netlink interface name: %s", strerror(-r));
1668 r = sd_rtnl_message_append_u32(m, IFLA_NET_NS_PID, pid);
1670 log_error("Failed to add netlink namespace field: %s", strerror(-r));
1674 r = sd_rtnl_message_open_container(m, IFLA_LINKINFO);
1676 log_error("Failed to open netlink container: %s", strerror(-r));
1680 r = sd_rtnl_message_append_string(m, IFLA_INFO_KIND, "macvlan");
1682 log_error("Failed to append netlink kind: %s", strerror(-r));
1686 r = sd_rtnl_message_open_container(m, IFLA_INFO_DATA);
1688 log_error("Failed to open netlink container: %s", strerror(-r));
1692 r = sd_rtnl_message_append_u32(m, IFLA_MACVLAN_MODE, MACVLAN_MODE_BRIDGE);
1694 log_error("Failed to append macvlan mode: %s", strerror(-r));
1698 r = sd_rtnl_message_close_container(m);
1700 log_error("Failed to close netlink container: %s", strerror(-r));
1704 r = sd_rtnl_message_close_container(m);
1706 log_error("Failed to close netlink container: %s", strerror(-r));
1710 r = sd_rtnl_call(rtnl, m, 0, NULL);
1712 log_error("Failed to add new macvlan interfaces: %s", strerror(-r));
/* Installs a seccomp filter in the calling process that makes requests for
 * audit netlink sockets fail with EAFNOSUPPORT, while allowing everything else
 * (the filter is created with SCMP_ACT_ALLOW as default action).
 *
 * Steps visible here: create the filter, add the secondary architectures,
 * add one rule matching first argument == AF_NETLINK and third argument ==
 * NETLINK_AUDIT, set the SCMP_FLTATR_CTL_NNP attribute to 0 (the matching
 * error message speaks of unsetting NO_NEW_PRIVS), load the filter, and
 * release the filter context.
 *
 * NOTE(review): the syscall the rule is attached to is on a line missing from
 * this excerpt; the original comment below indicates socket() - confirm.
 * The return statements are likewise not visible; main() treats a negative
 * result as failure. */
1720 static int audit_still_doesnt_work_in_containers(void) {
1723 scmp_filter_ctx seccomp;
1727 Audit is broken in containers, much of the userspace audit
1728 hookup will fail if running inside a container. We don't
1729 care and just turn off creation of audit sockets.
1731 This will make socket(AF_NETLINK, *, NETLINK_AUDIT) fail
1732 with EAFNOSUPPORT which audit userspace uses as indication
1733 that audit is disabled in the kernel.
1736 seccomp = seccomp_init(SCMP_ACT_ALLOW);
1740 r = seccomp_add_secondary_archs(seccomp);
1742 log_error("Failed to add secondary archs to seccomp filter: %s", strerror(-r));
1746 r = seccomp_rule_add(
1748 SCMP_ACT_ERRNO(EAFNOSUPPORT),
1751 SCMP_A0(SCMP_CMP_EQ, AF_NETLINK),
1752 SCMP_A2(SCMP_CMP_EQ, NETLINK_AUDIT));
1754 log_error("Failed to add audit seccomp rule: %s", strerror(-r));
1758 r = seccomp_attr_set(seccomp, SCMP_FLTATR_CTL_NNP, 0);
1760 log_error("Failed to unset NO_NEW_PRIVS: %s", strerror(-r));
1764 r = seccomp_load(seccomp);
1766 log_error("Failed to install seccomp audit filter: %s", strerror(-r));
1769 seccomp_release(seccomp);
/* Entry point of the container launcher.
 *
 * Overall flow, as far as visible in this excerpt:
 *   1. Parse arguments and normalize arg_directory and arg_machine.
 *   2. Sanity checks (root, systemd host, directory looks like an OS tree).
 *   3. Collect passed-in file descriptors, allocate a pseudo terminal,
 *      optionally create a kdbus domain, create the kmsg socket pair.
 *   4. Block signals, create an eventfd used to hold back the child, and
 *      clone() a child into new namespaces.
 *   5. Child: attach to the pty, set up mounts and device nodes inside
 *      arg_directory, chroot into it, drop privileges, build the environment,
 *      wait on the eventfd, then exec the init system, the given command, or
 *      a shell.
 *   6. Parent: register the machine, move or create network interfaces for
 *      the child, release the child through the eventfd, forward the pty, and
 *      finally evaluate how the child terminated.
 *
 * NOTE(review): many short lines (conditions, goto or return statements,
 * braces, local declarations) are missing from this excerpt, so the exact
 * branch structure and the final return value cannot be confirmed here. */
1777 int main(int argc, char *argv[]) {
1779 _cleanup_close_ int master = -1, kdbus_fd = -1, sync_fd = -1;
1780 _cleanup_close_pipe_ int kmsg_socket_pair[2] = { -1, -1 };
1781 _cleanup_free_ char *kdbus_domain = NULL;
1782 _cleanup_fdset_free_ FDSet *fds = NULL;
1783 const char *console = NULL;
1784 int r = EXIT_FAILURE, k;
1788 char veth_name[IFNAMSIZ];
1790 log_parse_environment();
1793 k = parse_argv(argc, argv);
/* Make the container directory absolute; the current working directory is
 * used as well (presumably when no directory was given - the else branch is
 * not visible here). */
1801 if (arg_directory) {
1804 p = path_make_absolute_cwd(arg_directory);
1805 free(arg_directory);
1808 arg_directory = get_current_dir_name();
1810 if (!arg_directory) {
1811 log_error("Failed to determine path, please use -D.");
1815 path_kill_slashes(arg_directory);
/* Derive the machine name from the last path component of the directory
 * (presumably only when none was supplied - the guard is not visible). */
1818 arg_machine = strdup(basename(arg_directory));
1824 hostname_cleanup(arg_machine, false);
1825 if (isempty(arg_machine)) {
1826 log_error("Failed to determine machine name automatically, please use -M.");
/* Preconditions checked before any resources are set up. */
1831 if (geteuid() != 0) {
1832 log_error("Need to be root.");
1836 if (sd_booted() <= 0) {
1837 log_error("Not running on a systemd system.");
1841 if (path_equal(arg_directory, "/")) {
1842 log_error("Spawning container on root directory not supported.");
1847 if (path_is_os_tree(arg_directory) <= 0) {
1848 log_error("Directory %s doesn't look like an OS root directory (/etc/os-release is missing). Refusing.", arg_directory);
/* Probe for the program to run: an absolute command from the command line if
 * one was given, otherwise /usr/bin/ inside the container tree. */
1854 p = strappenda(arg_directory,
1855 argc > optind && path_is_absolute(argv[optind]) ? argv[optind] : "/usr/bin/");
1856 if (access(p, F_OK) < 0) {
1857 log_error("Directory %s lacks the binary to execute or doesn't look like a binary tree. Refusing.", arg_directory);
/* File descriptors reported by sd_listen_fds() are gathered in fds so they
 * can be handed to the container payload later (see LISTEN_FDS below). */
1864 n_fd_passed = sd_listen_fds(false);
1865 if (n_fd_passed > 0) {
1866 k = fdset_new_listen_fds(&fds, false);
1868 log_error("Failed to collect file descriptors: %s", strerror(-k));
1872 fdset_close_others(fds);
/* Pseudo terminal: master stays in this process, console is the slave path
 * that the child opens as its stdin, stdout and stderr. */
1875 master = posix_openpt(O_RDWR|O_NOCTTY|O_CLOEXEC|O_NDELAY);
1877 log_error("Failed to acquire pseudo tty: %m");
1881 console = ptsname(master);
1883 log_error("Failed to determine tty name: %m");
1888 log_info("Spawning container %s on %s. Press ^] three times within 1s to abort execution.", arg_machine, arg_directory);
1890 if (unlockpt(master) < 0) {
1891 log_error("Failed to unlock tty: %m");
/* kdbus is only set up when its control node exists on the host. */
1895 if (access("/dev/kdbus/control", F_OK) >= 0) {
1897 if (arg_share_system) {
1898 kdbus_domain = strdup("/dev/kdbus");
1899 if (!kdbus_domain) {
1906 ns = strappenda("machine-", arg_machine);
1907 kdbus_fd = bus_kernel_create_domain(ns, &kdbus_domain);
/* NOTE(review): the call above stores its result in kdbus_fd, yet the
 * message below formats strerror(-r); r is not assigned by that call in the
 * visible code - confirm whether -kdbus_fd was intended. */
1909 log_debug("Failed to create kdbus domain: %s", strerror(-r));
1911 log_debug("Successfully created kdbus domain as %s", kdbus_domain);
/* Socket pair handed to setup_kmsg() in the child; end [0] is closed there. */
1915 if (socketpair(AF_UNIX, SOCK_DGRAM|SOCK_NONBLOCK|SOCK_CLOEXEC, 0, kmsg_socket_pair) < 0) {
1916 log_error("Failed to create kmsg socket pair: %m");
1920 sd_notify(0, "READY=1");
/* Block the signals that are later passed to process_pty() via mask. */
1922 assert_se(sigemptyset(&mask) == 0);
1923 sigset_add_many(&mask, SIGCHLD, SIGWINCH, SIGTERM, SIGINT, -1);
1924 assert_se(sigprocmask(SIG_BLOCK, &mask, NULL) == 0);
/* The child blocks in eventfd_read() on this fd until the parent has
 * finished its setup and calls eventfd_write(). */
1929 sync_fd = eventfd(0, EFD_CLOEXEC);
1931 log_error("Failed to create event fd: %m");
/* Raw clone(): always a new mount namespace; IPC, PID and UTS namespaces
 * unless the system is shared; a network namespace if requested. */
1935 pid = syscall(__NR_clone,
1936 SIGCHLD|CLONE_NEWNS|
1937 (arg_share_system ? 0 : CLONE_NEWIPC|CLONE_NEWPID|CLONE_NEWUTS)|
1938 (arg_private_network ? CLONE_NEWNET : 0), NULL);
1940 if (errno == EINVAL)
1941 log_error("clone() failed, do you have namespace support enabled in your kernel? (You need UTS, IPC, PID and NET namespacing built in): %m");
1943 log_error("clone() failed: %m");
/* Child side (presumably the pid == 0 branch; the condition is not visible).
 * It ends in exec or _exit(). */
1950 const char *home = NULL;
1951 uid_t uid = (uid_t) -1;
1952 gid_t gid = (gid_t) -1;
1954 const char *envp[] = {
1955 "PATH=" DEFAULT_PATH_SPLIT_USR,
1956 "container=systemd-nspawn", /* LXC sets container=lxc, so follow the scheme here */
1961 NULL, /* container_uuid */
1962 NULL, /* LISTEN_FDS */
1963 NULL, /* LISTEN_PID */
/* Pass the TERM setting of the invoking environment through, if any. */
1969 envp[n_env] = strv_find_prefix(environ, "TERM=");
1973 close_nointr_nofail(master);
/* Drop the inherited standard descriptors; they are replaced by the pty. */
1976 close_nointr(STDIN_FILENO);
1977 close_nointr(STDOUT_FILENO);
1978 close_nointr(STDERR_FILENO);
1980 close_nointr_nofail(kmsg_socket_pair[0]);
1981 kmsg_socket_pair[0] = -1;
/* Undo the signal setup inherited from the parent. */
1983 reset_all_signal_handlers();
1985 assert_se(sigemptyset(&mask) == 0);
1986 assert_se(sigprocmask(SIG_SETMASK, &mask, NULL) == 0);
/* With fds 0-2 closed, the console is expected to land on STDIN_FILENO. */
1988 k = open_terminal(console, O_RDWR);
1989 if (k != STDIN_FILENO) {
1991 close_nointr_nofail(k);
1995 log_error("Failed to open console: %s", strerror(-k));
1999 if (dup2(STDIN_FILENO, STDOUT_FILENO) != STDOUT_FILENO ||
2000 dup2(STDIN_FILENO, STDERR_FILENO) != STDERR_FILENO) {
2001 log_error("Failed to duplicate console: %m");
2006 log_error("setsid() failed: %m");
2010 if (reset_audit_loginuid() < 0)
/* Have the kernel send SIGKILL to this child when its parent dies. */
2013 if (prctl(PR_SET_PDEATHSIG, SIGKILL) < 0) {
2014 log_error("PR_SET_PDEATHSIG failed: %m");
2018 /* Mark everything as slave, so that we still
2019 * receive mounts from the real root, but don't
2020 * propagate mounts to the real root. */
2021 if (mount(NULL, "/", NULL, MS_SLAVE|MS_REC, NULL) < 0) {
2022 log_error("MS_SLAVE|MS_REC failed: %m");
2026 /* Turn directory into bind mount */
/* NOTE(review): the two mount failure messages below do not include %m,
 * unlike the neighbouring ones, so the errno text is not logged. */
2027 if (mount(arg_directory, arg_directory, "bind", MS_BIND|MS_REC, NULL) < 0) {
2028 log_error("Failed to make bind mount.");
2033 if (mount(arg_directory, arg_directory, "bind", MS_BIND|MS_REMOUNT|MS_RDONLY|MS_REC, NULL) < 0) {
2034 log_error("Failed to make read-only.");
/* Populate the container tree; each helper signals failure with a negative
 * result. */
2038 if (mount_all(arg_directory) < 0)
2041 if (copy_devnodes(arg_directory) < 0)
2044 if (setup_ptmx(arg_directory) < 0)
2047 dev_setup(arg_directory);
2049 if (audit_still_doesnt_work_in_containers() < 0)
2052 if (setup_dev_console(arg_directory, console) < 0)
2055 if (setup_kmsg(arg_directory, kmsg_socket_pair[1]) < 0)
2058 close_nointr_nofail(kmsg_socket_pair[1]);
2059 kmsg_socket_pair[1] = -1;
2061 if (setup_boot_id(arg_directory) < 0)
2064 if (setup_timezone(arg_directory) < 0)
2067 if (setup_resolv_conf(arg_directory) < 0)
2070 if (setup_journal(arg_directory) < 0)
2073 if (mount_binds(arg_directory, arg_bind, 0) < 0)
2076 if (mount_binds(arg_directory, arg_bind_ro, MS_RDONLY) < 0)
2079 if (setup_kdbus(arg_directory, kdbus_domain) < 0)
/* Switch the root: move the prepared tree onto /, then chroot into it. */
2082 if (chdir(arg_directory) < 0) {
2083 log_error("chdir(%s) failed: %m", arg_directory);
2087 if (mount(arg_directory, "/", NULL, MS_MOVE, NULL) < 0) {
2088 log_error("mount(MS_MOVE) failed: %m");
2092 if (chroot(".") < 0) {
2093 log_error("chroot() failed: %m");
2097 if (chdir("/") < 0) {
2098 log_error("chdir() failed: %m");
2104 if (arg_private_network)
2107 if (drop_capabilities() < 0) {
2108 log_error("drop_capabilities() failed: %m");
2114 /* Note that this resolves user names
2115 * inside the container, and hence
2116 * accesses the NSS modules from the
2117 * container and not the host. This is
2120 if (get_user_creds((const char**)&arg_user, &uid, &gid, &home, NULL) < 0) {
2121 log_error("get_user_creds() failed: %m");
2125 if (mkdir_parents_label(home, 0775) < 0) {
2126 log_error("mkdir_parents_label() failed: %m");
2130 if (mkdir_safe_label(home, 0775, uid, gid) < 0) {
2131 log_error("mkdir_safe_label() failed: %m");
2135 if (initgroups((const char*)arg_user, gid) < 0) {
2136 log_error("initgroups() failed: %m");
2140 if (setresgid(gid, gid, gid) < 0) {
2141 log_error("setregid() failed: %m");
2145 if (setresuid(uid, uid, uid) < 0) {
2146 log_error("setreuid() failed: %m");
2150 /* Reset everything fully to 0, just in case */
2152 if (setgroups(0, NULL) < 0) {
2153 log_error("setgroups() failed: %m");
/* NOTE(review): the messages below name setregid and setreuid while the
 * calls are setresgid and setresuid; the same mismatch appears in the user
 * switching code above. */
2157 if (setresgid(0, 0, 0) < 0) {
2158 log_error("setregid() failed: %m");
2162 if (setresuid(0, 0, 0) < 0) {
2163 log_error("setreuid() failed: %m");
/* Fill the free slots of envp; n_env is advanced once per entry. */
2168 if ((asprintf((char**)(envp + n_env++), "HOME=%s", home ? home: "/root") < 0) ||
2169 (asprintf((char**)(envp + n_env++), "USER=%s", arg_user ? arg_user : "root") < 0) ||
2170 (asprintf((char**)(envp + n_env++), "LOGNAME=%s", arg_user ? arg_user : "root") < 0)) {
2175 if (!sd_id128_equal(arg_uuid, SD_ID128_NULL)) {
2176 if (asprintf((char**)(envp + n_env++), "container_uuid=" SD_ID128_FORMAT_STR, SD_ID128_FORMAT_VAL(arg_uuid)) < 0) {
/* Passed-in descriptors must survive exec, and the payload is told about
 * them through LISTEN_FDS and LISTEN_PID=1. */
2182 if (fdset_size(fds) > 0) {
2183 k = fdset_cloexec(fds, false);
2185 log_error("Failed to unset O_CLOEXEC for file descriptors.");
2189 if ((asprintf((char **)(envp + n_env++), "LISTEN_FDS=%u", n_fd_passed) < 0) ||
2190 (asprintf((char **)(envp + n_env++), "LISTEN_PID=1") < 0)) {
/* 0xffffffff is treated as the value meaning no personality was requested. */
2198 if (arg_personality != 0xffffffffLU) {
2199 if (personality(arg_personality) < 0) {
2200 log_error("personality() failed: %m");
/* Wait here until the parent signals through the eventfd. */
2205 eventfd_read(sync_fd, &x);
2206 close_nointr_nofail(sync_fd);
/* Merge user supplied variables over the built-in environment, if any. */
2209 if (!strv_isempty(arg_setenv)) {
2212 n = strv_env_merge(2, envp, arg_setenv);
2220 env_use = (char**) envp;
/* A failure to set the SELinux exec context is only logged in the visible
 * code. */
2223 if (arg_selinux_context)
2224 if (setexeccon((security_context_t) arg_selinux_context) < 0)
2225 log_error("setexeccon(\"%s\") failed: %m", arg_selinux_context);
2231 /* Automatically search for the init system */
/* a gets one slot for the init path plus the remaining command line
 * arguments; l counts argv entries from optind including the terminating
 * NULL entry of argv. */
2233 l = 1 + argc - optind;
2234 a = newa(char*, l + 1);
2235 memcpy(a + 1, argv + optind, l * sizeof(char*));
/* Each execve() only returns on failure, so the candidates are tried in
 * order. */
2237 a[0] = (char*) "/usr/lib/systemd/systemd";
2238 execve(a[0], a, env_use);
2240 a[0] = (char*) "/lib/systemd/systemd";
2241 execve(a[0], a, env_use);
2243 a[0] = (char*) "/sbin/init";
2244 execve(a[0], a, env_use);
2245 } else if (argc > optind)
2246 execvpe(argv[optind], argv + optind, env_use);
/* Fallback when no command was given: a login shell started from the home
 * directory (argv[0] carries a leading dash). */
2248 chdir(home ? home : "/root");
2249 execle("/bin/bash", "-bash", NULL, env_use);
2250 execle("/bin/sh", "-sh", NULL, env_use);
2253 log_error("execv() failed: %m");
2256 _exit(EXIT_FAILURE);
/* Parent side: finish the setup that needs the PID of the child. */
2262 r = register_machine(pid);
2266 r = move_network_interfaces(pid);
2270 r = setup_veth(pid, veth_name);
2274 r = setup_bridge(veth_name);
2278 r = setup_macvlan(pid);
/* Release the child, which is blocked in eventfd_read(). */
2282 eventfd_write(sync_fd, 1);
2283 close_nointr_nofail(sync_fd);
/* Forward the pty; when booting, the PID of the child and SIGRTMIN+3 are
 * passed along. */
2286 k = process_pty(master, &mask, arg_boot ? pid : 0, SIGRTMIN+3);
2295 /* Kill if it is not dead yet anyway */
2296 terminate_machine(pid);
2298 /* Redundant, but better safe than sorry */
2301 k = wait_for_terminate(pid, &status);
/* Classify how the child ended: normal exit, SIGINT (logged as shut down),
 * SIGHUP (logged as reboot), any other signal or core dump, or unknown. */
2309 if (status.si_code == CLD_EXITED) {
2310 r = status.si_status;
2311 if (status.si_status != 0) {
2312 log_error("Container %s failed with error code %i.", arg_machine, status.si_status);
2317 log_debug("Container %s exited successfully.", arg_machine);
2319 } else if (status.si_code == CLD_KILLED &&
2320 status.si_status == SIGINT) {
2323 log_info("Container %s has been shut down.", arg_machine);
2326 } else if (status.si_code == CLD_KILLED &&
2327 status.si_status == SIGHUP) {
2330 log_info("Container %s is being rebooted.", arg_machine);
2332 } else if (status.si_code == CLD_KILLED ||
2333 status.si_code == CLD_DUMPED) {
2335 log_error("Container %s terminated by signal %s.", arg_machine, signal_to_string(status.si_status));
2339 log_error("Container %s failed due to unknown reason.", arg_machine);
/* Free the heap allocated command line settings. */
2349 free(arg_directory);
2352 strv_free(arg_setenv);
2353 strv_free(arg_network_interfaces);
2354 strv_free(arg_network_macvlan);
2355 strv_free(arg_bind);
2356 strv_free(arg_bind_ro);