#include <sched.h>
#include <unistd.h>
#include <sys/types.h>
-#include <sys/syscall.h>
#include <sys/mount.h>
-#include <sys/wait.h>
#include <stdlib.h>
#include <string.h>
#include <stdio.h>
#include <errno.h>
#include <sys/prctl.h>
#include <getopt.h>
-#include <termios.h>
-#include <sys/signalfd.h>
#include <grp.h>
#include <linux/fs.h>
-#include <sys/un.h>
#include <sys/socket.h>
#include <linux/netlink.h>
#include <net/if.h>
#include <linux/veth.h>
#include <sys/personality.h>
#include <linux/loop.h>
+#include <sys/file.h>
#ifdef HAVE_SELINUX
#include <selinux/selinux.h>
#include "util.h"
#include "mkdir.h"
#include "macro.h"
-#include "audit.h"
#include "missing.h"
#include "cgroup-util.h"
#include "strv.h"
#include "bus-util.h"
#include "bus-error.h"
#include "ptyfwd.h"
-#include "bus-kernel.h"
#include "env-util.h"
-#include "def.h"
#include "rtnl-util.h"
#include "udev-util.h"
#include "blkid-util.h"
static bool arg_keep_unit = false;
static char **arg_network_interfaces = NULL;
static char **arg_network_macvlan = NULL;
+static char **arg_network_ipvlan = NULL;
static bool arg_network_veth = false;
static const char *arg_network_bridge = NULL;
static unsigned long arg_personality = 0xffffffffLU;
static char *arg_image = NULL;
static Volatile arg_volatile = VOLATILE_NO;
static ExposePort *arg_expose_ports = NULL;
+static char **arg_property = NULL;
+static uid_t arg_uid_shift = UID_INVALID, arg_uid_range = 0x10000U;
+static bool arg_userns = false;
+static int arg_kill_signal = 0;
static void help(void) {
printf("%s [OPTIONS...] [PATH] [ARGUMENTS...]\n\n"
" -M --machine=NAME Set the machine name for the container\n"
" --uuid=UUID Set a specific machine UUID for the container\n"
" -S --slice=SLICE Place the container in the specified slice\n"
+ " --property=NAME=VALUE Set scope unit property\n"
" --private-network Disable network in container\n"
" --network-interface=INTERFACE\n"
" Assign an existing network interface to the\n"
" --network-macvlan=INTERFACE\n"
" Create a macvlan network interface based on an\n"
" existing network interface to the container\n"
+ " --network-ipvlan=INTERFACE\n"
+ " Create a ipvlan network interface based on an\n"
+ " existing network interface to the container\n"
" -n --network-veth Add a virtual ethernet connection between host\n"
" and container\n"
" --network-bridge=INTERFACE\n"
" Add a virtual ethernet connection between host\n"
" and container and add it to an existing bridge on\n"
" the host\n"
+ " --private-users[=UIDBASE[:NUIDS]]\n"
+ " Run within user namespace\n"
" -p --port=[PROTOCOL:]HOSTPORT[:CONTAINERPORT]\n"
" Expose a container IP port on the host\n"
" -Z --selinux-context=SECLABEL\n"
" --capability=CAP In addition to the default, retain specified\n"
" capability\n"
" --drop-capability=CAP Drop the specified capability from the default set\n"
+ " --kill-signal=SIGNAL Select signal to use for shutting down PID 1\n"
" --link-journal=MODE Link up guest journal, one of no, auto, guest, host,\n"
" try-guest, try-host\n"
" -j Equivalent to --link-journal=try-guest\n"
ARG_KEEP_UNIT,
ARG_NETWORK_INTERFACE,
ARG_NETWORK_MACVLAN,
+ ARG_NETWORK_IPVLAN,
ARG_NETWORK_BRIDGE,
ARG_PERSONALITY,
ARG_VOLATILE,
ARG_TEMPLATE,
+ ARG_PROPERTY,
+ ARG_PRIVATE_USERS,
+ ARG_KILL_SIGNAL,
};
static const struct option options[] = {
{ "keep-unit", no_argument, NULL, ARG_KEEP_UNIT },
{ "network-interface", required_argument, NULL, ARG_NETWORK_INTERFACE },
{ "network-macvlan", required_argument, NULL, ARG_NETWORK_MACVLAN },
+ { "network-ipvlan", required_argument, NULL, ARG_NETWORK_IPVLAN },
{ "network-veth", no_argument, NULL, 'n' },
{ "network-bridge", required_argument, NULL, ARG_NETWORK_BRIDGE },
{ "personality", required_argument, NULL, ARG_PERSONALITY },
{ "image", required_argument, NULL, 'i' },
{ "volatile", optional_argument, NULL, ARG_VOLATILE },
{ "port", required_argument, NULL, 'p' },
+ { "property", required_argument, NULL, ARG_PROPERTY },
+ { "private-users", optional_argument, NULL, ARG_PRIVATE_USERS },
+ { "kill-signal", required_argument, NULL, ARG_KILL_SIGNAL },
{}
};
if (strv_extend(&arg_network_macvlan, optarg) < 0)
return log_oom();
+ arg_private_network = true;
+ break;
+
+ case ARG_NETWORK_IPVLAN:
+ if (strv_extend(&arg_network_ipvlan, optarg) < 0)
+ return log_oom();
+
/* fall through */
case ARG_PRIVATE_NETWORK:
break;
}
+ case ARG_PROPERTY:
+ if (strv_extend(&arg_property, optarg) < 0)
+ return log_oom();
+
+ break;
+
+ case ARG_PRIVATE_USERS:
+ if (optarg) {
+ _cleanup_free_ char *buffer = NULL;
+ const char *range, *shift;
+
+ range = strchr(optarg, ':');
+ if (range) {
+ buffer = strndup(optarg, range - optarg);
+ if (!buffer)
+ return log_oom();
+ shift = buffer;
+
+ range++;
+ if (safe_atou32(range, &arg_uid_range) < 0 || arg_uid_range <= 0) {
+ log_error("Failed to parse UID range: %s", range);
+ return -EINVAL;
+ }
+ } else
+ shift = optarg;
+
+ if (parse_uid(shift, &arg_uid_shift) < 0) {
+ log_error("Failed to parse UID: %s", optarg);
+ return -EINVAL;
+ }
+ }
+
+ arg_userns = true;
+ break;
+
+ case ARG_KILL_SIGNAL:
+ arg_kill_signal = signal_from_string_try_harder(optarg);
+ if (arg_kill_signal < 0) {
+ log_error("Cannot parse signal: %s", optarg);
+ return -EINVAL;
+ }
+
+ break;
+
case '?':
return -EINVAL;
arg_retain = (arg_retain | plus | (arg_private_network ? 1ULL << CAP_NET_ADMIN : 0)) & ~minus;
+ if (arg_boot && arg_kill_signal <= 0)
+ arg_kill_signal = SIGRTMIN+3;
+
return 1;
}
{ "devpts", "/dev/pts", "devpts","newinstance,ptmxmode=0666,mode=620,gid=" STRINGIFY(TTY_GID), MS_NOSUID|MS_NOEXEC, true },
{ "tmpfs", "/dev/shm", "tmpfs", "mode=1777", MS_NOSUID|MS_NODEV|MS_STRICTATIME, true },
{ "tmpfs", "/run", "tmpfs", "mode=755", MS_NOSUID|MS_NODEV|MS_STRICTATIME, true },
+ { "tmpfs", "/tmp", "tmpfs", "mode=1777", MS_STRICTATIME, true },
#ifdef HAVE_SELINUX
{ "/sys/fs/selinux", "/sys/fs/selinux", NULL, NULL, MS_BIND, false }, /* Bind mount first */
{ NULL, "/sys/fs/selinux", NULL, NULL, MS_BIND|MS_RDONLY|MS_REMOUNT, false }, /* Then, make it r/o */
int r = 0;
for (k = 0; k < ELEMENTSOF(mount_table); k++) {
- _cleanup_free_ char *where = NULL;
-#ifdef HAVE_SELINUX
- _cleanup_free_ char *options = NULL;
-#endif
+ _cleanup_free_ char *where = NULL, *options = NULL;
const char *o;
int t;
#endif
o = mount_table[k].options;
+ if (arg_userns && arg_uid_shift != UID_INVALID && streq_ptr(mount_table[k].type, "tmpfs")) {
+ char *uid_options = NULL;
+
+ if (o)
+ asprintf(&uid_options, "%s,uid=" UID_FMT ",gid=" UID_FMT, o, arg_uid_shift, arg_uid_shift);
+ else
+ asprintf(&uid_options, "uid=" UID_FMT ",gid=" UID_FMT, arg_uid_shift, arg_uid_shift);
+ if (!uid_options)
+ return log_oom();
+
+ free(options);
+ o = options = uid_options;
+ }
if (mount(mount_table[k].what,
where,
r = stat(where, &dest_st);
if (r == 0) {
- if ((source_st.st_mode & S_IFMT) != (dest_st.st_mode & S_IFMT)) {
- log_error("The file types of %s and %s do not match. Refusing bind mount", *x, where);
+ if (S_ISDIR(source_st.st_mode) && !S_ISDIR(dest_st.st_mode)) {
+ log_error("Cannot bind mount directory %s on file %s.", *x, where);
+ return -EINVAL;
+ }
+ if (!S_ISDIR(source_st.st_mode) && S_ISDIR(dest_st.st_mode)) {
+ log_error("Cannot bind mount file %s on directory %s.", *x, where);
return -EINVAL;
}
} else if (errno == ENOENT) {
return -errno;
}
- /* Create the mount point, but be conservative -- refuse to create block
- * and char devices. */
+ /* Create the mount point. Any non-directory file can be
+ * mounted on any non-directory file (regular, fifo, socket,
+ * char, block).
+ */
if (S_ISDIR(source_st.st_mode)) {
r = mkdir_label(where, 0755);
if (r < 0 && errno != EEXIST)
return log_error_errno(r, "Failed to create mount point %s: %m", where);
- } else if (S_ISFIFO(source_st.st_mode)) {
- r = mkfifo(where, 0644);
- if (r < 0 && errno != EEXIST)
- return log_error_errno(errno, "Failed to create mount point %s: %m", where);
- } else if (S_ISSOCK(source_st.st_mode)) {
- r = mknod(where, 0644 | S_IFSOCK, 0);
- if (r < 0 && errno != EEXIST)
- return log_error_errno(errno, "Failed to create mount point %s: %m", where);
- } else if (S_ISREG(source_st.st_mode)) {
+ } else {
r = touch(where);
if (r < 0)
return log_error_errno(r, "Failed to create mount point %s: %m", where);
- } else {
- log_error("Refusing to create mountpoint for file: %s", *x);
- return -ENOTSUP;
}
if (mount(*x, where, "bind", MS_BIND, NULL) < 0)
char *to;
int r;
- to = strappenda(dest, "/sys/fs/cgroup/", hierarchy);
+ to = strjoina(dest, "/sys/fs/cgroup/", hierarchy);
r = path_is_mount_point(to, false);
if (r < 0)
mkdir_p(to, 0755);
- if (mount("cgroup", to, "cgroup", MS_NOSUID|MS_NOEXEC|MS_NODEV|(read_only ? MS_RDONLY : 0), controller) < 0)
+ /* The superblock mount options of the mount point need to be
+ * identical to the hosts', and hence writable... */
+ if (mount("cgroup", to, "cgroup", MS_NOSUID|MS_NOEXEC|MS_NODEV, controller) < 0)
return log_error_errno(errno, "Failed to mount to %s: %m", to);
+ /* ... hence let's only make the bind mount read-only, not the
+ * superblock. */
+ if (read_only) {
+ if (mount(NULL, to, NULL, MS_BIND|MS_REMOUNT|MS_NOSUID|MS_NOEXEC|MS_NODEV|MS_RDONLY, NULL) < 0)
+ return log_error_errno(errno, "Failed to remount %s read-only: %m", to);
+ }
return 1;
}
if (r < 0)
return log_error_errno(r, "Failed to determine our own cgroup path: %m");
- cgroup_root = strappenda(dest, "/sys/fs/cgroup");
+ cgroup_root = strjoina(dest, "/sys/fs/cgroup");
if (mount("tmpfs", cgroup_root, "tmpfs", MS_NOSUID|MS_NOEXEC|MS_NODEV|MS_STRICTATIME, "mode=755") < 0)
return log_error_errno(errno, "Failed to mount tmpfs to /sys/fs/cgroup: %m");
return r;
if (symlink(combined, target) < 0)
- return log_error_errno(errno, "Failed to create symlink for combined hiearchy: %m");
+ return log_error_errno(errno, "Failed to create symlink for combined hierarchy: %m");
}
}
- r = mount_cgroup_hierarchy(dest, "name=systemd", "systemd", false);
+ r = mount_cgroup_hierarchy(dest, "name=systemd,xattr", "systemd", false);
if (r < 0)
return r;
/* Make our own cgroup a (writable) bind mount */
- systemd_own = strappenda(dest, "/sys/fs/cgroup/systemd", own_cgroup_path);
+ systemd_own = strjoina(dest, "/sys/fs/cgroup/systemd", own_cgroup_path);
if (mount(systemd_own, systemd_own, NULL, MS_BIND, NULL) < 0)
return log_error_errno(errno, "Failed to turn %s into a bind mount: %m", own_cgroup_path);
/* And then remount the systemd cgroup root read-only */
- systemd_root = strappenda(dest, "/sys/fs/cgroup/systemd");
+ systemd_root = strjoina(dest, "/sys/fs/cgroup/systemd");
if (mount(NULL, systemd_root, NULL, MS_BIND|MS_REMOUNT|MS_NOSUID|MS_NOEXEC|MS_NODEV|MS_RDONLY, NULL) < 0)
return log_error_errno(errno, "Failed to mount cgroup root read-only: %m");
if (r < 0)
return log_error_errno(r, "Failed to remount %s read-only: %m", directory);
- p = strappenda(directory, "/var");
+ p = strjoina(directory, "/var");
r = mkdir(p, 0755);
if (r < 0 && errno != EEXIST)
return log_error_errno(errno, "Failed to create %s: %m", directory);
tmpfs_mounted = true;
- f = strappenda(directory, "/usr");
- t = strappenda(template, "/usr");
+ f = strjoina(directory, "/usr");
+ t = strjoina(template, "/usr");
r = mkdir(t, 0755);
if (r < 0 && errno != EEXIST) {
if (mknod(to, st.st_mode, st.st_rdev) < 0)
return log_error_errno(errno, "mknod(%s) failed: %m", to);
+
+ if (arg_userns && arg_uid_shift != UID_INVALID)
+ if (lchown(to, arg_uid_shift, arg_uid_shift) < 0)
+ return log_error_errno(errno, "chown() of device node %s failed: %m", to);
}
}
if (symlink("pts/ptmx", p) < 0)
return log_error_errno(errno, "Failed to create /dev/ptmx symlink: %m");
+ if (arg_userns && arg_uid_shift != UID_INVALID)
+ if (lchown(p, arg_uid_shift, arg_uid_shift) < 0)
+ return log_error_errno(errno, "lchown() of symlink %s failed: %m", p);
+
return 0;
}
* /dev/console. (Note that the major minor doesn't actually
* matter here, since we mount it over anyway). */
- to = strappenda(dest, "/dev/console");
+ to = strjoina(dest, "/dev/console");
if (mknod(to, (st.st_mode & ~07777) | 0600, st.st_rdev) < 0)
return log_error_errno(errno, "mknod() for /dev/console failed: %m");
local_ifindex > 0 ? 1 : 0, local_ifindex);
} else {
_cleanup_bus_message_unref_ sd_bus_message *m = NULL;
+ char **i;
r = sd_bus_message_new_method_call(
bus,
"org.freedesktop.machine1.Manager",
"CreateMachineWithNetwork");
if (r < 0)
- return log_error_errno(r, "Failed to create message: %m");
+ return bus_log_create_error(r);
r = sd_bus_message_append(
m,
strempty(arg_directory),
local_ifindex > 0 ? 1 : 0, local_ifindex);
if (r < 0)
- return log_error_errno(r, "Failed to append message arguments: %m");
+ return bus_log_create_error(r);
r = sd_bus_message_open_container(m, 'a', "(sv)");
if (r < 0)
- return log_error_errno(r, "Failed to open container: %m");
+ return bus_log_create_error(r);
if (!isempty(arg_slice)) {
r = sd_bus_message_append(m, "(sv)", "Slice", "s", arg_slice);
if (r < 0)
- return log_error_errno(r, "Failed to append slice: %m");
+ return bus_log_create_error(r);
}
r = sd_bus_message_append(m, "(sv)", "DevicePolicy", "s", "strict");
if (r < 0)
- return log_error_errno(r, "Failed to add device policy: %m");
+ return bus_log_create_error(r);
r = sd_bus_message_append(m, "(sv)", "DeviceAllow", "a(ss)", 9,
/* Allow the container to
if (r < 0)
return log_error_errno(r, "Failed to add device whitelist: %m");
+ STRV_FOREACH(i, arg_property) {
+ r = sd_bus_message_open_container(m, 'r', "sv");
+ if (r < 0)
+ return bus_log_create_error(r);
+
+ r = bus_append_unit_property_assignment(m, *i);
+ if (r < 0)
+ return r;
+
+ r = sd_bus_message_close_container(m);
+ if (r < 0)
+ return bus_log_create_error(r);
+ }
+
r = sd_bus_message_close_container(m);
if (r < 0)
- return log_error_errno(r, "Failed to close container: %m");
+ return bus_log_create_error(r);
r = sd_bus_call(bus, m, 0, &error, NULL);
}
return 0;
}
+static int setup_ipvlan(pid_t pid) {
+ _cleanup_udev_unref_ struct udev *udev = NULL;
+ _cleanup_rtnl_unref_ sd_rtnl *rtnl = NULL;
+ char **i;
+ int r;
+
+ if (!arg_private_network)
+ return 0;
+
+ if (strv_isempty(arg_network_ipvlan))
+ return 0;
+
+ r = sd_rtnl_open(&rtnl, 0);
+ if (r < 0)
+ return log_error_errno(r, "Failed to connect to netlink: %m");
+
+ udev = udev_new();
+ if (!udev) {
+ log_error("Failed to connect to udev.");
+ return -ENOMEM;
+ }
+
+ STRV_FOREACH(i, arg_network_ipvlan) {
+ _cleanup_rtnl_message_unref_ sd_rtnl_message *m = NULL;
+ _cleanup_free_ char *n = NULL;
+ int ifi;
+
+ ifi = parse_interface(udev, *i);
+ if (ifi < 0)
+ return ifi;
+
+ r = sd_rtnl_message_new_link(rtnl, &m, RTM_NEWLINK, 0);
+ if (r < 0)
+ return log_error_errno(r, "Failed to allocate netlink message: %m");
+
+ r = sd_rtnl_message_append_u32(m, IFLA_LINK, ifi);
+ if (r < 0)
+ return log_error_errno(r, "Failed to add netlink interface index: %m");
+
+ n = strappend("iv-", *i);
+ if (!n)
+ return log_oom();
+
+ strshorten(n, IFNAMSIZ-1);
+
+ r = sd_rtnl_message_append_string(m, IFLA_IFNAME, n);
+ if (r < 0)
+ return log_error_errno(r, "Failed to add netlink interface name: %m");
+
+ r = sd_rtnl_message_append_u32(m, IFLA_NET_NS_PID, pid);
+ if (r < 0)
+ return log_error_errno(r, "Failed to add netlink namespace field: %m");
+
+ r = sd_rtnl_message_open_container(m, IFLA_LINKINFO);
+ if (r < 0)
+ return log_error_errno(r, "Failed to open netlink container: %m");
+
+ r = sd_rtnl_message_open_container_union(m, IFLA_INFO_DATA, "ipvlan");
+ if (r < 0)
+ return log_error_errno(r, "Failed to open netlink container: %m");
+
+ r = sd_rtnl_message_append_u16(m, IFLA_IPVLAN_MODE, IPVLAN_MODE_L2);
+ if (r < 0)
+ return log_error_errno(r, "Failed to add ipvlan mode: %m");
+
+ r = sd_rtnl_message_close_container(m);
+ if (r < 0)
+ return log_error_errno(r, "Failed to close netlink container: %m");
+
+ r = sd_rtnl_message_close_container(m);
+ if (r < 0)
+ return log_error_errno(r, "Failed to close netlink container: %m");
+
+ r = sd_rtnl_call(rtnl, m, 0, NULL);
+ if (r < 0)
+ return log_error_errno(r, "Failed to add new ipvlan interfaces: %m");
+ }
+
+ return 0;
+}
+
static int setup_seccomp(void) {
#ifdef HAVE_SECCOMP
static const int blacklist[] = {
SCMP_SYS(kexec_load),
SCMP_SYS(open_by_handle_at),
- SCMP_SYS(init_module),
- SCMP_SYS(finit_module),
- SCMP_SYS(delete_module),
SCMP_SYS(iopl),
SCMP_SYS(ioperm),
SCMP_SYS(swapon),
SCMP_SYS(swapoff),
};
+ static const int kmod_blacklist[] = {
+ SCMP_SYS(init_module),
+ SCMP_SYS(finit_module),
+ SCMP_SYS(delete_module),
+ };
+
scmp_filter_ctx seccomp;
unsigned i;
int r;
}
}
+ /* If the CAP_SYS_MODULE capability is not requested then
+ * we'll block the kmod syscalls too */
+ if (!(arg_retain & (1ULL << CAP_SYS_MODULE))) {
+ for (i = 0; i < ELEMENTSOF(kmod_blacklist); i++) {
+ r = seccomp_rule_add(seccomp, SCMP_ACT_ERRNO(EPERM), kmod_blacklist[i], 0);
+ if (r == -EFAULT)
+ continue; /* unknown syscall */
+ if (r < 0) {
+ log_error_errno(r, "Failed to block syscall: %m");
+ goto finish;
+ }
+ }
+ }
+
/*
Audit is broken in containers, much of the userspace audit
hookup will fail if running inside a container. We don't
(void) mkdir_p("/run/systemd/nspawn/", 0755);
(void) mkdir_p("/run/systemd/nspawn/propagate", 0600);
- p = strappenda("/run/systemd/nspawn/propagate/", arg_machine);
+ p = strjoina("/run/systemd/nspawn/propagate/", arg_machine);
(void) mkdir_p(p, 0600);
- q = strappenda(root, "/run/systemd/nspawn/incoming");
+ q = strjoina(root, "/run/systemd/nspawn/incoming");
mkdir_parents(q, 0755);
mkdir_p(q, 0600);
return r;
}
+#define PARTITION_TABLE_BLURB \
+ "Note that the disk image needs to either contain only a single MBR partition of\n" \
+ "type 0x83 that is marked bootable, or a single GPT partition of type " \
+ "0FC63DAF-8483-4772-8E79-3D69D8477DE4 or follow\n" \
+ " http://www.freedesktop.org/wiki/Specifications/DiscoverablePartitionsSpec/\n" \
+ "to be bootable with systemd-nspawn."
+
static int dissect_image(
int fd,
char **root_device, bool *root_device_rw,
#ifdef GPT_ROOT_SECONDARY
int secondary_root_nr = -1;
#endif
-
- _cleanup_free_ char *home = NULL, *root = NULL, *secondary_root = NULL, *srv = NULL;
+ _cleanup_free_ char *home = NULL, *root = NULL, *secondary_root = NULL, *srv = NULL, *generic = NULL;
_cleanup_udev_enumerate_unref_ struct udev_enumerate *e = NULL;
_cleanup_udev_device_unref_ struct udev_device *d = NULL;
_cleanup_blkid_free_probe_ blkid_probe b = NULL;
_cleanup_udev_unref_ struct udev *udev = NULL;
struct udev_list_entry *first, *item;
- bool home_rw = true, root_rw = true, secondary_root_rw = true, srv_rw = true;
+ bool home_rw = true, root_rw = true, secondary_root_rw = true, srv_rw = true, generic_rw = true;
+ bool is_gpt, is_mbr, multiple_generic = false;
const char *pttype = NULL;
blkid_partlist pl;
struct stat st;
+ unsigned i;
int r;
assert(fd >= 0);
errno = 0;
r = blkid_do_safeprobe(b);
if (r == -2 || r == 1) {
- log_error("Failed to identify any partition table on %s.\n"
- "Note that the disk image needs to follow http://www.freedesktop.org/wiki/Specifications/DiscoverablePartitionsSpec/ to be supported by systemd-nspawn.", arg_image);
+ log_error("Failed to identify any partition table on\n"
+ " %s\n"
+ PARTITION_TABLE_BLURB, arg_image);
return -EINVAL;
} else if (r != 0) {
if (errno == 0)
}
blkid_probe_lookup_value(b, "PTTYPE", &pttype, NULL);
- if (!streq_ptr(pttype, "gpt")) {
- log_error("Image %s does not carry a GUID Partition Table.\n"
- "Note that the disk image needs to follow http://www.freedesktop.org/wiki/Specifications/DiscoverablePartitionsSpec/ to be supported by systemd-nspawn.", arg_image);
+
+ is_gpt = streq_ptr(pttype, "gpt");
+ is_mbr = streq_ptr(pttype, "dos");
+
+ if (!is_gpt && !is_mbr) {
+ log_error("No GPT or MBR partition table discovered on\n"
+ " %s\n"
+ PARTITION_TABLE_BLURB, arg_image);
return -EINVAL;
}
if (!d)
return log_oom();
- e = udev_enumerate_new(udev);
- if (!e)
- return log_oom();
+ for (i = 0;; i++) {
+ int n, m;
- r = udev_enumerate_add_match_parent(e, d);
- if (r < 0)
- return log_oom();
+ if (i >= 10) {
+ log_error("Kernel partitions never appeared.");
+ return -ENXIO;
+ }
- r = udev_enumerate_scan_devices(e);
- if (r < 0)
- return log_error_errno(r, "Failed to scan for partition devices of %s: %m", arg_image);
+ e = udev_enumerate_new(udev);
+ if (!e)
+ return log_oom();
+
+ r = udev_enumerate_add_match_parent(e, d);
+ if (r < 0)
+ return log_oom();
+
+ r = udev_enumerate_scan_devices(e);
+ if (r < 0)
+ return log_error_errno(r, "Failed to scan for partition devices of %s: %m", arg_image);
+
+ /* Count the partitions enumerated by the kernel */
+ n = 0;
+ first = udev_enumerate_get_list_entry(e);
+ udev_list_entry_foreach(item, first)
+ n++;
+
+ /* Count the partitions enumerated by blkid */
+ m = blkid_partlist_numof_partitions(pl);
+ if (n == m + 1)
+ break;
+ if (n > m + 1) {
+ log_error("blkid and kernel partition list do not match.");
+ return -EIO;
+ }
+ if (n < m + 1) {
+ unsigned j;
+
+ /* The kernel has probed fewer partitions than
+ * blkid? Maybe the kernel prober is still
+ * running or it got EBUSY because udev
+ * already opened the device. Let's reprobe
+ * the device, which is a synchronous call
+ * that waits until probing is complete. */
+
+ for (j = 0; j < 20; j++) {
+
+ r = ioctl(fd, BLKRRPART, 0);
+ if (r < 0)
+ r = -errno;
+ if (r >= 0 || r != -EBUSY)
+ break;
+
+ /* If something else has the device
+ * open, such as an udev rule, the
+ * ioctl will return EBUSY. Since
+ * there's no way to wait until it
+ * isn't busy anymore, let's just wait
+ * a bit, and try again.
+ *
+ * This is really something they
+ * should fix in the kernel! */
+
+ usleep(50 * USEC_PER_MSEC);
+ }
+
+ if (r < 0)
+ return log_error_errno(r, "Failed to reread partition table: %m");
+ }
+
+ e = udev_enumerate_unref(e);
+ }
first = udev_enumerate_get_list_entry(e);
udev_list_entry_foreach(item, first) {
_cleanup_udev_device_unref_ struct udev_device *q;
- const char *stype, *node;
+ const char *node;
unsigned long long flags;
- sd_id128_t type_id;
blkid_partition pp;
dev_t qn;
int nr;
continue;
flags = blkid_partition_get_flags(pp);
- if (flags & GPT_FLAG_NO_AUTO)
- continue;
nr = blkid_partition_get_partno(pp);
if (nr < 0)
continue;
- stype = blkid_partition_get_type_string(pp);
- if (!stype)
- continue;
+ if (is_gpt) {
+ sd_id128_t type_id;
+ const char *stype;
- if (sd_id128_from_string(stype, &type_id) < 0)
- continue;
+ if (flags & GPT_FLAG_NO_AUTO)
+ continue;
- if (sd_id128_equal(type_id, GPT_HOME)) {
+ stype = blkid_partition_get_type_string(pp);
+ if (!stype)
+ continue;
- if (home && nr >= home_nr)
+ if (sd_id128_from_string(stype, &type_id) < 0)
continue;
- home_nr = nr;
- home_rw = !(flags & GPT_FLAG_READ_ONLY);
+ if (sd_id128_equal(type_id, GPT_HOME)) {
- free(home);
- home = strdup(node);
- if (!home)
- return log_oom();
- } else if (sd_id128_equal(type_id, GPT_SRV)) {
+ if (home && nr >= home_nr)
+ continue;
- if (srv && nr >= srv_nr)
- continue;
+ home_nr = nr;
+ home_rw = !(flags & GPT_FLAG_READ_ONLY);
- srv_nr = nr;
- srv_rw = !(flags & GPT_FLAG_READ_ONLY);
+ r = free_and_strdup(&home, node);
+ if (r < 0)
+ return log_oom();
- free(srv);
- srv = strdup(node);
- if (!srv)
- return log_oom();
- }
+ } else if (sd_id128_equal(type_id, GPT_SRV)) {
+
+ if (srv && nr >= srv_nr)
+ continue;
+
+ srv_nr = nr;
+ srv_rw = !(flags & GPT_FLAG_READ_ONLY);
+
+ r = free_and_strdup(&srv, node);
+ if (r < 0)
+ return log_oom();
+ }
#ifdef GPT_ROOT_NATIVE
- else if (sd_id128_equal(type_id, GPT_ROOT_NATIVE)) {
+ else if (sd_id128_equal(type_id, GPT_ROOT_NATIVE)) {
- if (root && nr >= root_nr)
- continue;
+ if (root && nr >= root_nr)
+ continue;
- root_nr = nr;
- root_rw = !(flags & GPT_FLAG_READ_ONLY);
+ root_nr = nr;
+ root_rw = !(flags & GPT_FLAG_READ_ONLY);
- free(root);
- root = strdup(node);
- if (!root)
- return log_oom();
- }
+ r = free_and_strdup(&root, node);
+ if (r < 0)
+ return log_oom();
+ }
#endif
#ifdef GPT_ROOT_SECONDARY
- else if (sd_id128_equal(type_id, GPT_ROOT_SECONDARY)) {
+ else if (sd_id128_equal(type_id, GPT_ROOT_SECONDARY)) {
+
+ if (secondary_root && nr >= secondary_root_nr)
+ continue;
+
+ secondary_root_nr = nr;
+ secondary_root_rw = !(flags & GPT_FLAG_READ_ONLY);
+
+ r = free_and_strdup(&secondary_root, node);
+ if (r < 0)
+ return log_oom();
+ }
+#endif
+ else if (sd_id128_equal(type_id, GPT_LINUX_GENERIC)) {
+
+ if (generic)
+ multiple_generic = true;
+ else {
+ generic_rw = !(flags & GPT_FLAG_READ_ONLY);
- if (secondary_root && nr >= secondary_root_nr)
+ r = free_and_strdup(&generic, node);
+ if (r < 0)
+ return log_oom();
+ }
+ }
+
+ } else if (is_mbr) {
+ int type;
+
+ if (flags != 0x80) /* Bootable flag */
continue;
- secondary_root_nr = nr;
- secondary_root_rw = !(flags & GPT_FLAG_READ_ONLY);
+ type = blkid_partition_get_type(pp);
+ if (type != 0x83) /* Linux partition */
+ continue;
+ if (generic)
+ multiple_generic = true;
+ else {
+ generic_rw = true;
- free(secondary_root);
- secondary_root = strdup(node);
- if (!secondary_root)
- return log_oom();
+ r = free_and_strdup(&root, node);
+ if (r < 0)
+ return log_oom();
+ }
}
-#endif
- }
-
- if (!root && !secondary_root) {
- log_error("Failed to identify root partition in disk image %s.\n"
- "Note that the disk image needs to follow http://www.freedesktop.org/wiki/Specifications/DiscoverablePartitionsSpec/ to be supported by systemd-nspawn.", arg_image);
- return -EINVAL;
}
if (root) {
*root_device_rw = secondary_root_rw;
*secondary = true;
+ } else if (generic) {
+
+ /* There were no partitions with precise meanings
+ * around, but we found generic partitions. In this
+ * case, if there's only one, we can go ahead and boot
+ * it, otherwise we bail out, because we really cannot
+ * make any sense of it. */
+
+ if (multiple_generic) {
+ log_error("Identified multiple bootable Linux partitions on\n"
+ " %s\n"
+ PARTITION_TABLE_BLURB, arg_image);
+ return -EINVAL;
+ }
+
+ *root_device = generic;
+ generic = NULL;
+
+ *root_device_rw = generic_rw;
+ *secondary = false;
+ } else {
+ log_error("Failed to identify root partition in disk image\n"
+ " %s\n"
+ PARTITION_TABLE_BLURB, arg_image);
+ return -EINVAL;
}
if (home) {
rw = false;
if (directory)
- p = strappenda(where, directory);
+ p = strjoina(where, directory);
else
p = where;
if (image_fd && *image_fd >= 0) {
r = ioctl(*image_fd, LOOP_CLR_FD);
if (r < 0)
- log_warning_errno(errno, "Failed to close loop image: %m");
+ log_debug_errno(errno, "Failed to close loop image: %m");
*image_fd = safe_close(*image_fd);
}
r = ioctl(control, LOOP_CTL_REMOVE, nr);
if (r < 0)
- log_warning_errno(errno, "Failed to remove loop %d: %m", nr);
+ log_debug_errno(errno, "Failed to remove loop %d: %m", nr);
}
static int spawn_getent(const char *database, const char *key, pid_t *rpid) {
pid = PTR_TO_UINT32(userdata);
if (pid > 0) {
- if (kill(pid, SIGRTMIN+3) >= 0) {
+ if (kill(pid, arg_kill_signal) >= 0) {
log_info("Trying to halt container. Send SIGTERM again to trigger immediate termination.");
sd_event_source_set_userdata(s, NULL);
return 0;
return -ENOENT;
}
- if (i->type == IMAGE_GPT)
+ if (i->type == IMAGE_RAW)
r = set_sanitized_path(&arg_image, i->path);
else
r = set_sanitized_path(&arg_directory, i->path);
return 0;
}
+static int determine_uid_shift(void) {
+ int r;
+
+ if (!arg_userns)
+ return 0;
+
+ if (arg_uid_shift == UID_INVALID) {
+ struct stat st;
+
+ r = stat(arg_directory, &st);
+ if (r < 0)
+ return log_error_errno(errno, "Failed to determine UID base of %s: %m", arg_directory);
+
+ arg_uid_shift = st.st_uid & UINT32_C(0xffff0000);
+
+ if (arg_uid_shift != (st.st_gid & UINT32_C(0xffff0000))) {
+ log_error("UID and GID base of %s don't match.", arg_directory);
+ return -EINVAL;
+ }
+
+ arg_uid_range = UINT32_C(0x10000);
+ }
+
+ if (arg_uid_shift > (uid_t) -1 - arg_uid_range) {
+ log_error("UID base too high for UID range.");
+ return -EINVAL;
+ }
+
+ log_info("Using user namespaces with base " UID_FMT " and range " UID_FMT ".", arg_uid_shift, arg_uid_range);
+ return 0;
+}
+
int main(int argc, char *argv[]) {
_cleanup_free_ char *device_path = NULL, *root_device = NULL, *home_device = NULL, *srv_device = NULL, *console = NULL;
int ret = EXIT_SUCCESS;
union in_addr_union exposed = {};
_cleanup_release_lock_file_ LockFile tree_global_lock = LOCK_FILE_INIT, tree_local_lock = LOCK_FILE_INIT;
+ bool interactive;
log_parse_environment();
log_open();
}
if (arg_ephemeral) {
- _cleanup_release_lock_file_ LockFile original_lock = LOCK_FILE_INIT;
char *np;
/* If the specified path is a mount point we
if (!arg_quiet)
log_info("Directory %s already exists, not populating from template %s.", arg_directory, arg_template);
} else if (r < 0) {
- log_error_errno(r, "Couldn't create snapshort %s from %s: %m", arg_directory, arg_template);
+ log_error_errno(r, "Couldn't create snapshot %s from %s: %m", arg_directory, arg_template);
goto finish;
} else {
if (!arg_quiet)
} else {
const char *p;
- p = strappenda(arg_directory,
+ p = strjoina(arg_directory,
argc > optind && path_is_absolute(argv[optind]) ? argv[optind] : "/usr/bin/");
if (access(p, F_OK) < 0) {
log_error("Directory %s lacks the binary to execute or doesn't look like a binary tree. Refusing.", arg_directory);
goto finish;
}
+ r = determine_uid_shift();
+ if (r < 0)
+ goto finish;
+
+ interactive = isatty(STDIN_FILENO) > 0 && isatty(STDOUT_FILENO) > 0;
+
master = posix_openpt(O_RDWR|O_NOCTTY|O_CLOEXEC|O_NDELAY);
if (master < 0) {
r = log_error_errno(errno, "Failed to acquire pseudo tty: %m");
goto finish;
}
- if (!arg_quiet)
- log_info("Spawning container %s on %s.\nPress ^] three times within 1s to kill container.",
- arg_machine, arg_image ?: arg_directory);
-
if (unlockpt(master) < 0) {
r = log_error_errno(errno, "Failed to unlock tty: %m");
goto finish;
}
+ if (!arg_quiet)
+ log_info("Spawning container %s on %s.\nPress ^] three times within 1s to kill container.",
+ arg_machine, arg_image ?: arg_directory);
+
assert_se(sigemptyset(&mask) == 0);
sigset_add_many(&mask, SIGCHLD, SIGWINCH, SIGTERM, SIGINT, -1);
assert_se(sigprocmask(SIG_BLOCK, &mask, NULL) == 0);
master = safe_close(master);
- close_nointr(STDIN_FILENO);
- close_nointr(STDOUT_FILENO);
- close_nointr(STDERR_FILENO);
-
kmsg_socket_pair[0] = safe_close(kmsg_socket_pair[0]);
rtnl_socket_pair[0] = safe_close(rtnl_socket_pair[0]);
reset_all_signal_handlers();
reset_signal_mask();
- r = open_terminal(console, O_RDWR);
- if (r != STDIN_FILENO) {
- if (r >= 0) {
- safe_close(r);
- r = -EINVAL;
- }
+ if (interactive) {
+ close_nointr(STDIN_FILENO);
+ close_nointr(STDOUT_FILENO);
+ close_nointr(STDERR_FILENO);
- log_error_errno(r, "Failed to open console: %m");
- _exit(EXIT_FAILURE);
- }
+ r = open_terminal(console, O_RDWR);
+ if (r != STDIN_FILENO) {
+ if (r >= 0) {
+ safe_close(r);
+ r = -EINVAL;
+ }
- if (dup2(STDIN_FILENO, STDOUT_FILENO) != STDOUT_FILENO ||
- dup2(STDIN_FILENO, STDERR_FILENO) != STDERR_FILENO) {
- log_error_errno(errno, "Failed to duplicate console: %m");
- _exit(EXIT_FAILURE);
+ log_error_errno(r, "Failed to open console: %m");
+ _exit(EXIT_FAILURE);
+ }
+
+ if (dup2(STDIN_FILENO, STDOUT_FILENO) != STDOUT_FILENO ||
+ dup2(STDIN_FILENO, STDERR_FILENO) != STDERR_FILENO) {
+ log_error_errno(errno, "Failed to duplicate console: %m");
+ _exit(EXIT_FAILURE);
+ }
}
if (setsid() < 0) {
_exit(EXIT_FAILURE);
}
+ if (arg_private_network)
+ loopback_setup();
+
/* Mark everything as slave, so that we still
* receive mounts from the real root, but don't
* propagate mounts to the real root. */
/* Tell the parent that we are ready, and that
* it can cgroupify us to that we lack access
* to certain devices and resources. */
- (void) barrier_place(&barrier);
+ (void) barrier_place(&barrier); /* #1 */
if (setup_boot_id(arg_directory) < 0)
_exit(EXIT_FAILURE);
/* Wait until we are cgroup-ified, so that we
* can mount the right cgroup path writable */
- (void) barrier_sync_next(&barrier);
+ (void) barrier_place_and_sync(&barrier); /* #2 */
if (mount_cgroup(arg_directory) < 0)
_exit(EXIT_FAILURE);
_exit(EXIT_FAILURE);
}
- umask(0022);
+ if (arg_userns) {
+ if (unshare(CLONE_NEWUSER) < 0) {
+ log_error_errno(errno, "unshare(CLONE_NEWUSER) failed: %m");
+ _exit(EXIT_FAILURE);
+ }
- if (arg_private_network)
- loopback_setup();
+ /* Tell the parent, that it now can
+ * write the UID map. */
+ (void) barrier_place(&barrier); /* #3 */
+
+ /* Wait until the parent wrote the UID
+ * map */
+ (void) barrier_place_and_sync(&barrier); /* #4 */
+ }
+
+ umask(0022);
if (drop_capabilities() < 0) {
log_error_errno(errno, "drop_capabilities() failed: %m");
_exit(EXIT_FAILURE);
}
+ setup_hostname();
+
+ if (arg_personality != 0xffffffffLU) {
+ if (personality(arg_personality) < 0) {
+ log_error_errno(errno, "personality() failed: %m");
+ _exit(EXIT_FAILURE);
+ }
+ } else if (secondary) {
+ if (personality(PER_LINUX32) < 0) {
+ log_error_errno(errno, "personality() failed: %m");
+ _exit(EXIT_FAILURE);
+ }
+ }
+
+#ifdef HAVE_SELINUX
+ if (arg_selinux_context)
+ if (setexeccon((security_context_t) arg_selinux_context) < 0) {
+ log_error_errno(errno, "setexeccon(\"%s\") failed: %m", arg_selinux_context);
+ _exit(EXIT_FAILURE);
+ }
+#endif
+
r = change_uid_gid(&home);
if (r < 0)
_exit(EXIT_FAILURE);
}
}
- setup_hostname();
-
- if (arg_personality != 0xffffffffLU) {
- if (personality(arg_personality) < 0) {
- log_error_errno(errno, "personality() failed: %m");
- _exit(EXIT_FAILURE);
- }
- } else if (secondary) {
- if (personality(PER_LINUX32) < 0) {
- log_error_errno(errno, "personality() failed: %m");
- _exit(EXIT_FAILURE);
- }
- }
-
-#ifdef HAVE_SELINUX
- if (arg_selinux_context)
- if (setexeccon((security_context_t) arg_selinux_context) < 0) {
- log_error_errno(errno, "setexeccon(\"%s\") failed: %m", arg_selinux_context);
- _exit(EXIT_FAILURE);
- }
-#endif
-
if (!strv_isempty(arg_setenv)) {
char **n;
} else
env_use = (char**) envp;
- /* Wait until the parent is ready with the setup, too... */
- if (!barrier_place_and_sync(&barrier))
- _exit(EXIT_FAILURE);
+ /* Let the parent know that we are ready and
+ * wait until the parent is ready with the
+ * setup, too... */
+ (void) barrier_place_and_sync(&barrier); /* #5 */
if (arg_boot) {
char **a;
kmsg_socket_pair[1] = safe_close(kmsg_socket_pair[1]);
rtnl_socket_pair[1] = safe_close(rtnl_socket_pair[1]);
+ (void) barrier_place(&barrier); /* #1 */
+
/* Wait for the most basic Child-setup to be done,
* before we add hardware to it, and place it in a
* cgroup. */
- if (barrier_sync_next(&barrier)) {
+ if (barrier_sync(&barrier)) { /* #1 */
int ifi = 0;
r = move_network_interfaces(pid);
if (r < 0)
goto finish;
+ r = setup_ipvlan(pid);
+ if (r < 0)
+ goto finish;
+
r = register_machine(pid, ifi);
if (r < 0)
goto finish;
+ /* Notify the child that the parent is ready with all
+ * its setup, and that the child can now hand over
+ * control to the code to run inside the container. */
+ (void) barrier_place(&barrier); /* #2 */
+
+ if (arg_userns) {
+ char uid_map[strlen("/proc//uid_map") + DECIMAL_STR_MAX(uid_t) + 1], line[DECIMAL_STR_MAX(uid_t)*3+3+1];
+
+ (void) barrier_place_and_sync(&barrier); /* #3 */
+
+ xsprintf(uid_map, "/proc/" PID_FMT "/uid_map", pid);
+ xsprintf(line, UID_FMT " " UID_FMT " " UID_FMT "\n", 0, arg_uid_shift, arg_uid_range);
+ r = write_string_file(uid_map, line);
+ if (r < 0) {
+ log_error_errno(r, "Failed to write UID map: %m");
+ goto finish;
+ }
+
+ /* We always assign the same UID and GID ranges */
+ xsprintf(uid_map, "/proc/" PID_FMT "/gid_map", pid);
+ r = write_string_file(uid_map, line);
+ if (r < 0) {
+ log_error_errno(r, "Failed to write GID map: %m");
+ goto finish;
+ }
+
+ (void) barrier_place(&barrier); /* #4 */
+ }
+
/* Block SIGCHLD here, before notifying child.
* process_pty() will handle it with the other signals. */
r = sigprocmask(SIG_BLOCK, &mask_chld, NULL);
if (r < 0)
goto finish;
- /* Notify the child that the parent is ready with all
- * its setup, and that the child can now hand over
- * control to the code to run inside the container. */
- (void) barrier_place(&barrier);
-
- /* And wait that the child is completely ready now. */
- if (barrier_place_and_sync(&barrier)) {
+ /* Let the child know that we are ready and wait that the child is completely ready now. */
+ if (barrier_place_and_sync(&barrier)) { /* #5 */
_cleanup_event_unref_ sd_event *event = NULL;
_cleanup_(pty_forward_freep) PTYForward *forward = NULL;
_cleanup_rtnl_unref_ sd_rtnl *rtnl = NULL;
char last_char = 0;
- sd_notify(false,
- "READY=1\n"
- "STATUS=Container running.");
+ sd_notifyf(false,
+ "READY=1\n"
+ "STATUS=Container running.\n"
+ "X_NSPAWN_LEADER_PID=" PID_FMT, pid);
r = sd_event_new(&event);
if (r < 0) {
goto finish;
}
- if (arg_boot) {
+ if (arg_kill_signal > 0) {
/* Try to kill the init system on SIGINT or SIGTERM */
sd_event_add_signal(event, NULL, SIGINT, on_orderly_shutdown, UINT32_TO_PTR(pid));
sd_event_add_signal(event, NULL, SIGTERM, on_orderly_shutdown, UINT32_TO_PTR(pid));
rtnl_socket_pair[0] = safe_close(rtnl_socket_pair[0]);
- r = pty_forward_new(event, master, true, &forward);
+ r = pty_forward_new(event, master, true, !interactive, &forward);
if (r < 0) {
log_error_errno(r, "Failed to create PTY forwarder: %m");
goto finish;
if (arg_machine) {
const char *p;
- p = strappenda("/run/systemd/nspawn/propagate/", arg_machine);
+ p = strjoina("/run/systemd/nspawn/propagate/", arg_machine);
(void) rm_rf(p, false, true, false);
}
strv_free(arg_setenv);
strv_free(arg_network_interfaces);
strv_free(arg_network_macvlan);
+ strv_free(arg_network_ipvlan);
strv_free(arg_bind);
strv_free(arg_bind_ro);
strv_free(arg_tmpfs);