chiark / gitweb /
nspawn: make kill signal to use for PID 1 configurable
[elogind.git] / src / nspawn / nspawn.c
index 3fce3ad77f0c72cd4081dbe74c9e2faf705ebd08..8ce5fbeb629f6b25db1bd7cd0c297bf21c0e5067 100644 (file)
 #include <sched.h>
 #include <unistd.h>
 #include <sys/types.h>
-#include <sys/syscall.h>
 #include <sys/mount.h>
-#include <sys/wait.h>
 #include <stdlib.h>
 #include <string.h>
 #include <stdio.h>
 #include <errno.h>
 #include <sys/prctl.h>
 #include <getopt.h>
-#include <termios.h>
-#include <sys/signalfd.h>
 #include <grp.h>
 #include <linux/fs.h>
-#include <sys/un.h>
 #include <sys/socket.h>
 #include <linux/netlink.h>
 #include <net/if.h>
 #include <linux/veth.h>
 #include <sys/personality.h>
 #include <linux/loop.h>
+#include <sys/file.h>
 
 #ifdef HAVE_SELINUX
 #include <selinux/selinux.h>
@@ -64,7 +60,6 @@
 #include "util.h"
 #include "mkdir.h"
 #include "macro.h"
-#include "audit.h"
 #include "missing.h"
 #include "cgroup-util.h"
 #include "strv.h"
@@ -77,9 +72,7 @@
 #include "bus-util.h"
 #include "bus-error.h"
 #include "ptyfwd.h"
-#include "bus-kernel.h"
 #include "env-util.h"
-#include "def.h"
 #include "rtnl-util.h"
 #include "udev-util.h"
 #include "blkid-util.h"
@@ -178,12 +171,17 @@ static bool arg_register = true;
 static bool arg_keep_unit = false;
 static char **arg_network_interfaces = NULL;
 static char **arg_network_macvlan = NULL;
+static char **arg_network_ipvlan = NULL; /* one entry per --network-ipvlan=INTERFACE option */
 static bool arg_network_veth = false;
 static const char *arg_network_bridge = NULL;
 static unsigned long arg_personality = 0xffffffffLU;
 static char *arg_image = NULL;
 static Volatile arg_volatile = VOLATILE_NO;
 static ExposePort *arg_expose_ports = NULL;
+static char **arg_property = NULL; /* one entry per --property=NAME=VALUE option */
+static uid_t arg_uid_shift = UID_INVALID, arg_uid_range = 0x10000U; /* parsed from --private-users=UIDBASE[:NUIDS]; keep these defaults if no value is given */
+static bool arg_userns = false; /* set to true by --private-users, with or without a value */
+static int arg_kill_signal = 0; /* set by --kill-signal=; if still <= 0 and arg_boot is set, parse_argv() picks SIGRTMIN+3 */
 
 static void help(void) {
         printf("%s [OPTIONS...] [PATH] [ARGUMENTS...]\n\n"
@@ -202,6 +200,7 @@ static void help(void) {
                "  -M --machine=NAME         Set the machine name for the container\n"
                "     --uuid=UUID            Set a specific machine UUID for the container\n"
                "  -S --slice=SLICE          Place the container in the specified slice\n"
+               "     --property=NAME=VALUE  Set scope unit property\n"
                "     --private-network      Disable network in container\n"
                "     --network-interface=INTERFACE\n"
                "                            Assign an existing network interface to the\n"
@@ -209,12 +208,17 @@ static void help(void) {
                "     --network-macvlan=INTERFACE\n"
                "                            Create a macvlan network interface based on an\n"
                "                            existing network interface to the container\n"
+               "     --network-ipvlan=INTERFACE\n"
+               "                            Create a ipvlan network interface based on an\n"
+               "                            existing network interface to the container\n"
                "  -n --network-veth         Add a virtual ethernet connection between host\n"
                "                            and container\n"
                "     --network-bridge=INTERFACE\n"
                "                            Add a virtual ethernet connection between host\n"
                "                            and container and add it to an existing bridge on\n"
                "                            the host\n"
+               "     --private-users[=UIDBASE[:NUIDS]]\n"
+               "                            Run within user namespace\n"
                "  -p --port=[PROTOCOL:]HOSTPORT[:CONTAINERPORT]\n"
                "                            Expose a container IP port on the host\n"
                "  -Z --selinux-context=SECLABEL\n"
@@ -226,6 +230,7 @@ static void help(void) {
                "     --capability=CAP       In addition to the default, retain specified\n"
                "                            capability\n"
                "     --drop-capability=CAP  Drop the specified capability from the default set\n"
+               "     --kill-signal=SIGNAL   Select signal to use for shutting down PID 1\n"
                "     --link-journal=MODE    Link up guest journal, one of no, auto, guest, host,\n"
                "                            try-guest, try-host\n"
                "  -j                        Equivalent to --link-journal=try-guest\n"
@@ -283,10 +288,14 @@ static int parse_argv(int argc, char *argv[]) {
                 ARG_KEEP_UNIT,
                 ARG_NETWORK_INTERFACE,
                 ARG_NETWORK_MACVLAN,
+                ARG_NETWORK_IPVLAN,
                 ARG_NETWORK_BRIDGE,
                 ARG_PERSONALITY,
                 ARG_VOLATILE,
                 ARG_TEMPLATE,
+                ARG_PROPERTY,
+                ARG_PRIVATE_USERS,
+                ARG_KILL_SIGNAL,
         };
 
         static const struct option options[] = {
@@ -317,12 +326,16 @@ static int parse_argv(int argc, char *argv[]) {
                 { "keep-unit",             no_argument,       NULL, ARG_KEEP_UNIT         },
                 { "network-interface",     required_argument, NULL, ARG_NETWORK_INTERFACE },
                 { "network-macvlan",       required_argument, NULL, ARG_NETWORK_MACVLAN   },
+                { "network-ipvlan",        required_argument, NULL, ARG_NETWORK_IPVLAN    },
                 { "network-veth",          no_argument,       NULL, 'n'                   },
                 { "network-bridge",        required_argument, NULL, ARG_NETWORK_BRIDGE    },
                 { "personality",           required_argument, NULL, ARG_PERSONALITY       },
                 { "image",                 required_argument, NULL, 'i'                   },
                 { "volatile",              optional_argument, NULL, ARG_VOLATILE          },
                 { "port",                  required_argument, NULL, 'p'                   },
+                { "property",              required_argument, NULL, ARG_PROPERTY          },
+                { "private-users",         optional_argument, NULL, ARG_PRIVATE_USERS     },
+                { "kill-signal",           required_argument, NULL, ARG_KILL_SIGNAL       },
                 {}
         };
 
@@ -399,6 +412,13 @@ static int parse_argv(int argc, char *argv[]) {
                         if (strv_extend(&arg_network_macvlan, optarg) < 0)
                                 return log_oom();
 
+                        arg_private_network = true;
+                        break;
+
+                case ARG_NETWORK_IPVLAN:
+                        if (strv_extend(&arg_network_ipvlan, optarg) < 0)
+                                return log_oom();
+
                         /* fall through */
 
                 case ARG_PRIVATE_NETWORK:
@@ -716,6 +736,50 @@ static int parse_argv(int argc, char *argv[]) {
                         break;
                 }
 
+                case ARG_PROPERTY:
+                        if (strv_extend(&arg_property, optarg) < 0)
+                                return log_oom();
+
+                        break;
+
+                case ARG_PRIVATE_USERS:
+                        if (optarg) {
+                                _cleanup_free_ char *buffer = NULL;
+                                const char *range, *shift;
+
+                                range = strchr(optarg, ':');
+                                if (range) {
+                                        buffer = strndup(optarg, range - optarg);
+                                        if (!buffer)
+                                                return log_oom();
+                                        shift = buffer;
+
+                                        range++;
+                                        if (safe_atou32(range, &arg_uid_range) < 0 || arg_uid_range <= 0) {
+                                                log_error("Failed to parse UID range: %s", range);
+                                                return -EINVAL;
+                                        }
+                                } else
+                                        shift = optarg;
+
+                                if (parse_uid(shift, &arg_uid_shift) < 0) {
+                                        log_error("Failed to parse UID: %s", optarg);
+                                        return -EINVAL;
+                                }
+                        }
+
+                        arg_userns = true;
+                        break;
+
+                case ARG_KILL_SIGNAL:
+                        arg_kill_signal = signal_from_string_try_harder(optarg);
+                        if (arg_kill_signal < 0) {
+                                log_error("Cannot parse signal: %s", optarg);
+                                return -EINVAL;
+                        }
+
+                        break;
+
                 case '?':
                         return -EINVAL;
 
@@ -778,6 +842,9 @@ static int parse_argv(int argc, char *argv[]) {
 
         arg_retain = (arg_retain | plus | (arg_private_network ? 1ULL << CAP_NET_ADMIN : 0)) & ~minus;
 
+        if (arg_boot && arg_kill_signal <= 0)
+                arg_kill_signal = SIGRTMIN+3;
+
         return 1;
 }
 
@@ -801,6 +868,7 @@ static int mount_all(const char *dest) {
                 { "devpts",    "/dev/pts",  "devpts","newinstance,ptmxmode=0666,mode=620,gid=" STRINGIFY(TTY_GID), MS_NOSUID|MS_NOEXEC, true },
                 { "tmpfs",     "/dev/shm",  "tmpfs", "mode=1777", MS_NOSUID|MS_NODEV|MS_STRICTATIME,      true  },
                 { "tmpfs",     "/run",      "tmpfs", "mode=755",  MS_NOSUID|MS_NODEV|MS_STRICTATIME,      true  },
+                { "tmpfs",     "/tmp",      "tmpfs", "mode=1777", MS_STRICTATIME,                         true  },
 #ifdef HAVE_SELINUX
                 { "/sys/fs/selinux", "/sys/fs/selinux", NULL, NULL, MS_BIND,                              false },  /* Bind mount first */
                 { NULL,              "/sys/fs/selinux", NULL, NULL, MS_BIND|MS_RDONLY|MS_REMOUNT,         false },  /* Then, make it r/o */
@@ -811,10 +879,7 @@ static int mount_all(const char *dest) {
         int r = 0;
 
         for (k = 0; k < ELEMENTSOF(mount_table); k++) {
-                _cleanup_free_ char *where = NULL;
-#ifdef HAVE_SELINUX
-                _cleanup_free_ char *options = NULL;
-#endif
+                _cleanup_free_ char *where = NULL, *options = NULL;
                 const char *o;
                 int t;
 
@@ -861,6 +926,19 @@ static int mount_all(const char *dest) {
 #endif
                         o = mount_table[k].options;
 
+                if (arg_userns && arg_uid_shift != UID_INVALID && streq_ptr(mount_table[k].type, "tmpfs")) {
+                        char *uid_options = NULL;
+                        int len;
+
+                        /* With a known UID shift, append uid=/gid= set to
+                         * arg_uid_shift to the options of every tmpfs entry,
+                         * keeping any options selected above. */
+                        if (o)
+                                len = asprintf(&uid_options, "%s,uid=" UID_FMT ",gid=" UID_FMT, o, arg_uid_shift, arg_uid_shift);
+                        else
+                                len = asprintf(&uid_options, "uid=" UID_FMT ",gid=" UID_FMT, arg_uid_shift, arg_uid_shift);
+
+                        /* On failure asprintf() returns -1 and leaves the
+                         * pointer undefined, so test the return value rather
+                         * than the pointer. */
+                        if (len < 0)
+                                return log_oom();
+
+                        /* Hand the new string to the _cleanup_free_ variable
+                         * so it is released at the end of this iteration. */
+                        free(options);
+                        o = options = uid_options;
+                }
 
                 if (mount(mount_table[k].what,
                           where,
@@ -898,8 +976,12 @@ static int mount_binds(const char *dest, char **l, bool ro) {
 
                 r = stat(where, &dest_st);
                 if (r == 0) {
-                        if ((source_st.st_mode & S_IFMT) != (dest_st.st_mode & S_IFMT)) {
-                                log_error("The file types of %s and %s do not match. Refusing bind mount", *x, where);
+                        if (S_ISDIR(source_st.st_mode) && !S_ISDIR(dest_st.st_mode)) {
+                                log_error("Cannot bind mount directory %s on file %s.", *x, where);
+                                return -EINVAL;
+                        }
+                        if (!S_ISDIR(source_st.st_mode) && S_ISDIR(dest_st.st_mode)) {
+                                log_error("Cannot bind mount file %s on directory %s.", *x, where);
                                 return -EINVAL;
                         }
                 } else if (errno == ENOENT) {
@@ -911,27 +993,18 @@ static int mount_binds(const char *dest, char **l, bool ro) {
                         return -errno;
                 }
 
-                /* Create the mount point, but be conservative -- refuse to create block
-                 * and char devices. */
+                /* Create the mount point. Any non-directory file can be
+                 * mounted on any non-directory file (regular, fifo, socket,
+                 * char, block).
+                 */
                 if (S_ISDIR(source_st.st_mode)) {
                         r = mkdir_label(where, 0755);
                         if (r < 0 && errno != EEXIST)
                                 return log_error_errno(r, "Failed to create mount point %s: %m", where);
-                } else if (S_ISFIFO(source_st.st_mode)) {
-                        r = mkfifo(where, 0644);
-                        if (r < 0 && errno != EEXIST)
-                                return log_error_errno(errno, "Failed to create mount point %s: %m", where);
-                } else if (S_ISSOCK(source_st.st_mode)) {
-                        r = mknod(where, 0644 | S_IFSOCK, 0);
-                        if (r < 0 && errno != EEXIST)
-                                return log_error_errno(errno, "Failed to create mount point %s: %m", where);
-                } else if (S_ISREG(source_st.st_mode)) {
+                } else {
                         r = touch(where);
                         if (r < 0)
                                 return log_error_errno(r, "Failed to create mount point %s: %m", where);
-                } else {
-                        log_error("Refusing to create mountpoint for file: %s", *x);
-                        return -ENOTSUP;
                 }
 
                 if (mount(*x, where, "bind", MS_BIND, NULL) < 0)
@@ -951,7 +1024,7 @@ static int mount_cgroup_hierarchy(const char *dest, const char *controller, cons
         char *to;
         int r;
 
-        to = strappenda(dest, "/sys/fs/cgroup/", hierarchy);
+        to = strjoina(dest, "/sys/fs/cgroup/", hierarchy);
 
         r = path_is_mount_point(to, false);
         if (r < 0)
@@ -961,9 +1034,17 @@ static int mount_cgroup_hierarchy(const char *dest, const char *controller, cons
 
         mkdir_p(to, 0755);
 
-        if (mount("cgroup", to, "cgroup", MS_NOSUID|MS_NOEXEC|MS_NODEV|(read_only ? MS_RDONLY : 0), controller) < 0)
+        /* The superblock mount options of the mount point need to be
+         * identical to the hosts', and hence writable... */
+        if (mount("cgroup", to, "cgroup", MS_NOSUID|MS_NOEXEC|MS_NODEV, controller) < 0)
                 return log_error_errno(errno, "Failed to mount to %s: %m", to);
 
+        /* ... hence let's only make the bind mount read-only, not the
+         * superblock. */
+        if (read_only) {
+                if (mount(NULL, to, NULL, MS_BIND|MS_REMOUNT|MS_NOSUID|MS_NOEXEC|MS_NODEV|MS_RDONLY, NULL) < 0)
+                        return log_error_errno(errno, "Failed to remount %s read-only: %m", to);
+        }
         return 1;
 }
 
@@ -985,7 +1066,7 @@ static int mount_cgroup(const char *dest) {
         if (r < 0)
                 return log_error_errno(r, "Failed to determine our own cgroup path: %m");
 
-        cgroup_root = strappenda(dest, "/sys/fs/cgroup");
+        cgroup_root = strjoina(dest, "/sys/fs/cgroup");
         if (mount("tmpfs", cgroup_root, "tmpfs", MS_NOSUID|MS_NOEXEC|MS_NODEV|MS_STRICTATIME, "mode=755") < 0)
                 return log_error_errno(errno, "Failed to mount tmpfs to /sys/fs/cgroup: %m");
 
@@ -1029,21 +1110,21 @@ static int mount_cgroup(const char *dest) {
                                 return r;
 
                         if (symlink(combined, target) < 0)
-                                return log_error_errno(errno, "Failed to create symlink for combined hiearchy: %m");
+                                return log_error_errno(errno, "Failed to create symlink for combined hierarchy: %m");
                 }
         }
 
-        r = mount_cgroup_hierarchy(dest, "name=systemd", "systemd", false);
+        r = mount_cgroup_hierarchy(dest, "name=systemd,xattr", "systemd", false);
         if (r < 0)
                 return r;
 
         /* Make our own cgroup a (writable) bind mount */
-        systemd_own = strappenda(dest, "/sys/fs/cgroup/systemd", own_cgroup_path);
+        systemd_own = strjoina(dest, "/sys/fs/cgroup/systemd", own_cgroup_path);
         if (mount(systemd_own, systemd_own,  NULL, MS_BIND, NULL) < 0)
                 return log_error_errno(errno, "Failed to turn %s into a bind mount: %m", own_cgroup_path);
 
         /* And then remount the systemd cgroup root read-only */
-        systemd_root = strappenda(dest, "/sys/fs/cgroup/systemd");
+        systemd_root = strjoina(dest, "/sys/fs/cgroup/systemd");
         if (mount(NULL, systemd_root, NULL, MS_BIND|MS_REMOUNT|MS_NOSUID|MS_NOEXEC|MS_NODEV|MS_RDONLY, NULL) < 0)
                 return log_error_errno(errno, "Failed to mount cgroup root read-only: %m");
 
@@ -1196,7 +1277,7 @@ static int setup_volatile_state(const char *directory) {
         if (r < 0)
                 return log_error_errno(r, "Failed to remount %s read-only: %m", directory);
 
-        p = strappenda(directory, "/var");
+        p = strjoina(directory, "/var");
         r = mkdir(p, 0755);
         if (r < 0 && errno != EEXIST)
                 return log_error_errno(errno, "Failed to create %s: %m", directory);
@@ -1232,8 +1313,8 @@ static int setup_volatile(const char *directory) {
 
         tmpfs_mounted = true;
 
-        f = strappenda(directory, "/usr");
-        t = strappenda(template, "/usr");
+        f = strjoina(directory, "/usr");
+        t = strjoina(template, "/usr");
 
         r = mkdir(t, 0755);
         if (r < 0 && errno != EEXIST) {
@@ -1370,6 +1451,10 @@ static int copy_devnodes(const char *dest) {
 
                         if (mknod(to, st.st_mode, st.st_rdev) < 0)
                                 return log_error_errno(errno, "mknod(%s) failed: %m", to);
+
+                        if (arg_userns && arg_uid_shift != UID_INVALID)
+                                if (lchown(to, arg_uid_shift, arg_uid_shift) < 0)
+                                        return log_error_errno(errno, "chown() of device node %s failed: %m", to);
                 }
         }
 
@@ -1386,6 +1471,10 @@ static int setup_ptmx(const char *dest) {
         if (symlink("pts/ptmx", p) < 0)
                 return log_error_errno(errno, "Failed to create /dev/ptmx symlink: %m");
 
+        if (arg_userns && arg_uid_shift != UID_INVALID)
+                if (lchown(p, arg_uid_shift, arg_uid_shift) < 0)
+                        return log_error_errno(errno, "lchown() of symlink %s failed: %m", p);
+
         return 0;
 }
 
@@ -1415,7 +1504,7 @@ static int setup_dev_console(const char *dest, const char *console) {
          * /dev/console. (Note that the major minor doesn't actually
          * matter here, since we mount it over anyway). */
 
-        to = strappenda(dest, "/dev/console");
+        to = strjoina(dest, "/dev/console");
         if (mknod(to, (st.st_mode & ~07777) | 0600, st.st_rdev) < 0)
                 return log_error_errno(errno, "mknod() for /dev/console failed: %m");
 
@@ -1655,7 +1744,7 @@ static int watch_rtnl(sd_event *event, int recv_fd, union in_addr_union *exposed
         cmsg = CMSG_FIRSTHDR(&mh);
         assert(cmsg->cmsg_level == SOL_SOCKET);
         assert(cmsg->cmsg_type == SCM_RIGHTS);
-        assert(cmsg->cmsg_len = CMSG_LEN(sizeof(int)));
+        assert(cmsg->cmsg_len == CMSG_LEN(sizeof(int)));
         memcpy(&fd, CMSG_DATA(cmsg), sizeof(int));
 
         r = sd_rtnl_open_fd(&rtnl, fd, 1, RTNLGRP_IPV4_IFADDR);
@@ -1878,6 +1967,7 @@ static int register_machine(pid_t pid, int local_ifindex) {
                                 local_ifindex > 0 ? 1 : 0, local_ifindex);
         } else {
                 _cleanup_bus_message_unref_ sd_bus_message *m = NULL;
+                char **i;
 
                 r = sd_bus_message_new_method_call(
                                 bus,
@@ -1887,7 +1977,7 @@ static int register_machine(pid_t pid, int local_ifindex) {
                                 "org.freedesktop.machine1.Manager",
                                 "CreateMachineWithNetwork");
                 if (r < 0)
-                        return log_error_errno(r, "Failed to create message: %m");
+                        return bus_log_create_error(r);
 
                 r = sd_bus_message_append(
                                 m,
@@ -1900,21 +1990,21 @@ static int register_machine(pid_t pid, int local_ifindex) {
                                 strempty(arg_directory),
                                 local_ifindex > 0 ? 1 : 0, local_ifindex);
                 if (r < 0)
-                        return log_error_errno(r, "Failed to append message arguments: %m");
+                        return bus_log_create_error(r);
 
                 r = sd_bus_message_open_container(m, 'a', "(sv)");
                 if (r < 0)
-                        return log_error_errno(r, "Failed to open container: %m");
+                        return bus_log_create_error(r);
 
                 if (!isempty(arg_slice)) {
                         r = sd_bus_message_append(m, "(sv)", "Slice", "s", arg_slice);
                         if (r < 0)
-                                return log_error_errno(r, "Failed to append slice: %m");
+                                return bus_log_create_error(r);
                 }
 
                 r = sd_bus_message_append(m, "(sv)", "DevicePolicy", "s", "strict");
                 if (r < 0)
-                        return log_error_errno(r, "Failed to add device policy: %m");
+                        return bus_log_create_error(r);
 
                 r = sd_bus_message_append(m, "(sv)", "DeviceAllow", "a(ss)", 9,
                                           /* Allow the container to
@@ -1940,9 +2030,23 @@ static int register_machine(pid_t pid, int local_ifindex) {
                 if (r < 0)
                         return log_error_errno(r, "Failed to add device whitelist: %m");
 
+                STRV_FOREACH(i, arg_property) {
+                        r = sd_bus_message_open_container(m, 'r', "sv");
+                        if (r < 0)
+                                return bus_log_create_error(r);
+
+                        r = bus_append_unit_property_assignment(m, *i);
+                        if (r < 0)
+                                return r;
+
+                        r = sd_bus_message_close_container(m);
+                        if (r < 0)
+                                return bus_log_create_error(r);
+                }
+
                 r = sd_bus_message_close_container(m);
                 if (r < 0)
-                        return log_error_errno(r, "Failed to close container: %m");
+                        return bus_log_create_error(r);
 
                 r = sd_bus_call(bus, m, 0, &error, NULL);
         }
@@ -2379,21 +2483,105 @@ static int setup_macvlan(pid_t pid) {
         return 0;
 }
 
+static int setup_ipvlan(pid_t pid) { /* Request one new ipvlan link per entry of
+         * arg_network_ipvlan; every request carries IFLA_NET_NS_PID = pid.
+         *
+         * Does nothing and returns 0 unless arg_private_network is set and
+         * arg_network_ipvlan is non-empty. Returns 0 after all requests
+         * succeeded; returns early with the error value of the first step
+         * that fails.
+         */
+        _cleanup_udev_unref_ struct udev *udev = NULL;
+        _cleanup_rtnl_unref_ sd_rtnl *rtnl = NULL;
+        char **i;
+        int r;
+
+        if (!arg_private_network)
+                return 0;
+
+        if (strv_isempty(arg_network_ipvlan))
+                return 0;
+
+        r = sd_rtnl_open(&rtnl, 0);
+        if (r < 0)
+                return log_error_errno(r, "Failed to connect to netlink: %m");
+
+        udev = udev_new();
+        if (!udev) {
+                log_error("Failed to connect to udev.");
+                return -ENOMEM;
+        }
+
+        STRV_FOREACH(i, arg_network_ipvlan) {
+                _cleanup_rtnl_message_unref_ sd_rtnl_message *m = NULL;
+                _cleanup_free_ char *n = NULL;
+                int ifi;
+
+                ifi = parse_interface(udev, *i); /* presumably resolves the name to an interface index; it is sent as IFLA_LINK below */
+                if (ifi < 0)
+                        return ifi;
+
+                r = sd_rtnl_message_new_link(rtnl, &m, RTM_NEWLINK, 0);
+                if (r < 0)
+                        return log_error_errno(r, "Failed to allocate netlink message: %m");
+
+                r = sd_rtnl_message_append_u32(m, IFLA_LINK, ifi);
+                if (r < 0)
+                        return log_error_errno(r, "Failed to add netlink interface index: %m");
+
+                n = strappend("iv-", *i); /* name of the new link: "iv-" followed by the configured interface name */
+                if (!n)
+                        return log_oom();
+
+                strshorten(n, IFNAMSIZ-1); /* presumably truncates the name to at most IFNAMSIZ-1 characters -- TODO confirm */
+
+                r = sd_rtnl_message_append_string(m, IFLA_IFNAME, n);
+                if (r < 0)
+                        return log_error_errno(r, "Failed to add netlink interface name: %m");
+
+                r = sd_rtnl_message_append_u32(m, IFLA_NET_NS_PID, pid);
+                if (r < 0)
+                        return log_error_errno(r, "Failed to add netlink namespace field: %m");
+
+                r = sd_rtnl_message_open_container(m, IFLA_LINKINFO);
+                if (r < 0)
+                        return log_error_errno(r, "Failed to open netlink container: %m");
+
+                r = sd_rtnl_message_open_container_union(m, IFLA_INFO_DATA, "ipvlan"); /* opened inside IFLA_LINKINFO; the two close_container calls below close both */
+                if (r < 0)
+                        return log_error_errno(r, "Failed to open netlink container: %m");
+
+                r = sd_rtnl_message_append_u16(m, IFLA_IPVLAN_MODE, IPVLAN_MODE_L2); /* the mode is hard-coded to L2 */
+                if (r < 0)
+                        return log_error_errno(r, "Failed to add ipvlan mode: %m");
+
+                r = sd_rtnl_message_close_container(m);
+                if (r < 0)
+                        return log_error_errno(r, "Failed to close netlink container: %m");
+
+                r = sd_rtnl_message_close_container(m);
+                if (r < 0)
+                        return log_error_errno(r, "Failed to close netlink container: %m");
+
+                r = sd_rtnl_call(rtnl, m, 0, NULL);
+                if (r < 0)
+                        return log_error_errno(r, "Failed to add new ipvlan interfaces: %m");
+        }
+
+        return 0;
+}
+
 static int setup_seccomp(void) {
 
 #ifdef HAVE_SECCOMP
         static const int blacklist[] = {
                 SCMP_SYS(kexec_load),
                 SCMP_SYS(open_by_handle_at),
-                SCMP_SYS(init_module),
-                SCMP_SYS(finit_module),
-                SCMP_SYS(delete_module),
                 SCMP_SYS(iopl),
                 SCMP_SYS(ioperm),
                 SCMP_SYS(swapon),
                 SCMP_SYS(swapoff),
         };
 
+        static const int kmod_blacklist[] = {
+                SCMP_SYS(init_module),
+                SCMP_SYS(finit_module),
+                SCMP_SYS(delete_module),
+        };
+
         scmp_filter_ctx seccomp;
         unsigned i;
         int r;
@@ -2418,6 +2606,20 @@ static int setup_seccomp(void) {
                 }
         }
 
+        /* If the CAP_SYS_MODULE capability is not requested then
+         * we'll block the kmod syscalls too */
+        if (!(arg_retain & (1ULL << CAP_SYS_MODULE))) {
+                for (i = 0; i < ELEMENTSOF(kmod_blacklist); i++) {
+                        r = seccomp_rule_add(seccomp, SCMP_ACT_ERRNO(EPERM), kmod_blacklist[i], 0);
+                        if (r == -EFAULT)
+                                continue; /* unknown syscall */
+                        if (r < 0) {
+                                log_error_errno(r, "Failed to block syscall: %m");
+                                goto finish;
+                        }
+                }
+        }
+
         /*
            Audit is broken in containers, much of the userspace audit
            hookup will fail if running inside a container. We don't
@@ -2464,10 +2666,10 @@ static int setup_propagate(const char *root) {
 
         (void) mkdir_p("/run/systemd/nspawn/", 0755);
         (void) mkdir_p("/run/systemd/nspawn/propagate", 0600);
-        p = strappenda("/run/systemd/nspawn/propagate/", arg_machine);
+        p = strjoina("/run/systemd/nspawn/propagate/", arg_machine);
         (void) mkdir_p(p, 0600);
 
-        q = strappenda(root, "/run/systemd/nspawn/incoming");
+        q = strjoina(root, "/run/systemd/nspawn/incoming");
         mkdir_parents(q, 0755);
         mkdir_p(q, 0600);
 
@@ -2557,6 +2759,13 @@ static int setup_image(char **device_path, int *loop_nr) {
         return r;
 }
 
+#define PARTITION_TABLE_BLURB /* hint text appended to the partition table error messages logged by dissect_image() */ \
+        "Note that the disk image needs to either contain only a single MBR partition of\n" \
+        "type 0x83 that is marked bootable, or a single GPT partition of type " \
+        "0FC63DAF-8483-4772-8E79-3D69D8477DE4 or follow\n" \
+        "    http://www.freedesktop.org/wiki/Specifications/DiscoverablePartitionsSpec/\n" \
+        "to be bootable with systemd-nspawn."
+
 static int dissect_image(
                 int fd,
                 char **root_device, bool *root_device_rw,
@@ -2572,17 +2781,18 @@ static int dissect_image(
 #ifdef GPT_ROOT_SECONDARY
         int secondary_root_nr = -1;
 #endif
-
-        _cleanup_free_ char *home = NULL, *root = NULL, *secondary_root = NULL, *srv = NULL;
+        _cleanup_free_ char *home = NULL, *root = NULL, *secondary_root = NULL, *srv = NULL, *generic = NULL;
         _cleanup_udev_enumerate_unref_ struct udev_enumerate *e = NULL;
         _cleanup_udev_device_unref_ struct udev_device *d = NULL;
         _cleanup_blkid_free_probe_ blkid_probe b = NULL;
         _cleanup_udev_unref_ struct udev *udev = NULL;
         struct udev_list_entry *first, *item;
-        bool home_rw = true, root_rw = true, secondary_root_rw = true, srv_rw = true;
+        bool home_rw = true, root_rw = true, secondary_root_rw = true, srv_rw = true, generic_rw = true;
+        bool is_gpt, is_mbr, multiple_generic = false;
         const char *pttype = NULL;
         blkid_partlist pl;
         struct stat st;
+        unsigned i;
         int r;
 
         assert(fd >= 0);
@@ -2612,8 +2822,9 @@ static int dissect_image(
         errno = 0;
         r = blkid_do_safeprobe(b);
         if (r == -2 || r == 1) {
-                log_error("Failed to identify any partition table on %s.\n"
-                          "Note that the disk image needs to follow http://www.freedesktop.org/wiki/Specifications/DiscoverablePartitionsSpec/ to be supported by systemd-nspawn.", arg_image);
+                log_error("Failed to identify any partition table on\n"
+                          "    %s\n"
+                          PARTITION_TABLE_BLURB, arg_image);
                 return -EINVAL;
         } else if (r != 0) {
                 if (errno == 0)
@@ -2623,9 +2834,14 @@ static int dissect_image(
         }
 
         blkid_probe_lookup_value(b, "PTTYPE", &pttype, NULL);
-        if (!streq_ptr(pttype, "gpt")) {
-                log_error("Image %s does not carry a GUID Partition Table.\n"
-                          "Note that the disk image needs to follow http://www.freedesktop.org/wiki/Specifications/DiscoverablePartitionsSpec/ to be supported by systemd-nspawn.", arg_image);
+
+        is_gpt = streq_ptr(pttype, "gpt");
+        is_mbr = streq_ptr(pttype, "dos");
+
+        if (!is_gpt && !is_mbr) {
+                log_error("No GPT or MBR partition table discovered on\n"
+                          "    %s\n"
+                          PARTITION_TABLE_BLURB, arg_image);
                 return -EINVAL;
         }
 
@@ -2650,24 +2866,83 @@ static int dissect_image(
         if (!d)
                 return log_oom();
 
-        e = udev_enumerate_new(udev);
-        if (!e)
-                return log_oom();
+        for (i = 0;; i++) {
+                int n, m;
 
-        r = udev_enumerate_add_match_parent(e, d);
-        if (r < 0)
-                return log_oom();
+                if (i >= 10) {
+                        log_error("Kernel partitions never appeared.");
+                        return -ENXIO;
+                }
 
-        r = udev_enumerate_scan_devices(e);
-        if (r < 0)
-                return log_error_errno(r, "Failed to scan for partition devices of %s: %m", arg_image);
+                e = udev_enumerate_new(udev);
+                if (!e)
+                        return log_oom();
+
+                r = udev_enumerate_add_match_parent(e, d);
+                if (r < 0)
+                        return log_oom();
+
+                r = udev_enumerate_scan_devices(e);
+                if (r < 0)
+                        return log_error_errno(r, "Failed to scan for partition devices of %s: %m", arg_image);
+
+                /* Count the partitions enumerated by the kernel */
+                n = 0;
+                first = udev_enumerate_get_list_entry(e);
+                udev_list_entry_foreach(item, first)
+                        n++;
+
+                /* Count the partitions enumerated by blkid */
+                m = blkid_partlist_numof_partitions(pl);
+                if (n == m + 1)
+                        break;
+                if (n > m + 1) {
+                        log_error("blkid and kernel partition list do not match.");
+                        return -EIO;
+                }
+                if (n < m + 1) {
+                        unsigned j;
+
+                        /* The kernel has probed fewer partitions than
+                         * blkid? Maybe the kernel prober is still
+                         * running or it got EBUSY because udev
+                         * already opened the device. Let's reprobe
+                         * the device, which is a synchronous call
+                         * that waits until probing is complete. */
+
+                        for (j = 0; j < 20; j++) {
+
+                                r = ioctl(fd, BLKRRPART, 0);
+                                if (r < 0)
+                                        r = -errno;
+                                if (r >= 0 || r != -EBUSY)
+                                        break;
+
+                                /* If something else has the device
+                                 * open, such as an udev rule, the
+                                 * ioctl will return EBUSY. Since
+                                 * there's no way to wait until it
+                                 * isn't busy anymore, let's just wait
+                                 * a bit, and try again.
+                                 *
+                                 * This is really something they
+                                 * should fix in the kernel! */
+
+                                usleep(50 * USEC_PER_MSEC);
+                        }
+
+                        if (r < 0)
+                                return log_error_errno(r, "Failed to reread partition table: %m");
+                }
+
+                e = udev_enumerate_unref(e);
+        }
 
         first = udev_enumerate_get_list_entry(e);
         udev_list_entry_foreach(item, first) {
                 _cleanup_udev_device_unref_ struct udev_device *q;
-                const char *stype, *node;
+                const char *node;
                 unsigned long long flags;
-                sd_id128_t type_id;
                 blkid_partition pp;
                 dev_t qn;
                 int nr;
@@ -2698,82 +2973,110 @@ static int dissect_image(
                         continue;
 
                 flags = blkid_partition_get_flags(pp);
-                if (flags & GPT_FLAG_NO_AUTO)
-                        continue;
 
                 nr = blkid_partition_get_partno(pp);
                 if (nr < 0)
                         continue;
 
-                stype = blkid_partition_get_type_string(pp);
-                if (!stype)
-                        continue;
+                if (is_gpt) {
+                        sd_id128_t type_id;
+                        const char *stype;
 
-                if (sd_id128_from_string(stype, &type_id) < 0)
-                        continue;
+                        if (flags & GPT_FLAG_NO_AUTO)
+                                continue;
 
-                if (sd_id128_equal(type_id, GPT_HOME)) {
+                        stype = blkid_partition_get_type_string(pp);
+                        if (!stype)
+                                continue;
 
-                        if (home && nr >= home_nr)
+                        if (sd_id128_from_string(stype, &type_id) < 0)
                                 continue;
 
-                        home_nr = nr;
-                        home_rw = !(flags & GPT_FLAG_READ_ONLY);
+                        if (sd_id128_equal(type_id, GPT_HOME)) {
 
-                        free(home);
-                        home = strdup(node);
-                        if (!home)
-                                return log_oom();
-                } else if (sd_id128_equal(type_id, GPT_SRV)) {
+                                if (home && nr >= home_nr)
+                                        continue;
 
-                        if (srv && nr >= srv_nr)
-                                continue;
+                                home_nr = nr;
+                                home_rw = !(flags & GPT_FLAG_READ_ONLY);
+
+                                r = free_and_strdup(&home, node);
+                                if (r < 0)
+                                        return log_oom();
 
-                        srv_nr = nr;
-                        srv_rw = !(flags & GPT_FLAG_READ_ONLY);
+                        } else if (sd_id128_equal(type_id, GPT_SRV)) {
 
-                        free(srv);
-                        srv = strdup(node);
-                        if (!srv)
-                                return log_oom();
-                }
+                                if (srv && nr >= srv_nr)
+                                        continue;
+
+                                srv_nr = nr;
+                                srv_rw = !(flags & GPT_FLAG_READ_ONLY);
+
+                                r = free_and_strdup(&srv, node);
+                                if (r < 0)
+                                        return log_oom();
+                        }
 #ifdef GPT_ROOT_NATIVE
-                else if (sd_id128_equal(type_id, GPT_ROOT_NATIVE)) {
+                        else if (sd_id128_equal(type_id, GPT_ROOT_NATIVE)) {
 
-                        if (root && nr >= root_nr)
-                                continue;
+                                if (root && nr >= root_nr)
+                                        continue;
 
-                        root_nr = nr;
-                        root_rw = !(flags & GPT_FLAG_READ_ONLY);
+                                root_nr = nr;
+                                root_rw = !(flags & GPT_FLAG_READ_ONLY);
 
-                        free(root);
-                        root = strdup(node);
-                        if (!root)
-                                return log_oom();
-                }
+                                r = free_and_strdup(&root, node);
+                                if (r < 0)
+                                        return log_oom();
+                        }
 #endif
 #ifdef GPT_ROOT_SECONDARY
-                else if (sd_id128_equal(type_id, GPT_ROOT_SECONDARY)) {
+                        else if (sd_id128_equal(type_id, GPT_ROOT_SECONDARY)) {
+
+                                if (secondary_root && nr >= secondary_root_nr)
+                                        continue;
+
+                                secondary_root_nr = nr;
+                                secondary_root_rw = !(flags & GPT_FLAG_READ_ONLY);
+
+                                r = free_and_strdup(&secondary_root, node);
+                                if (r < 0)
+                                        return log_oom();
+                        }
+#endif
+                        else if (sd_id128_equal(type_id, GPT_LINUX_GENERIC)) {
+
+                                if (generic)
+                                        multiple_generic = true;
+                                else {
+                                        generic_rw = !(flags & GPT_FLAG_READ_ONLY);
+
+                                        r = free_and_strdup(&generic, node);
+                                        if (r < 0)
+                                                return log_oom();
+                                }
+                        }
 
-                        if (secondary_root && nr >= secondary_root_nr)
+                } else if (is_mbr) {
+                        int type;
+
+                        if (flags != 0x80) /* Bootable flag */
                                 continue;
 
-                        secondary_root_nr = nr;
-                        secondary_root_rw = !(flags & GPT_FLAG_READ_ONLY);
+                        type = blkid_partition_get_type(pp);
+                        if (type != 0x83) /* Linux partition */
+                                continue;
 
+                        if (generic)
+                                multiple_generic = true;
+                        else {
+                                generic_rw = true;
 
-                        free(secondary_root);
-                        secondary_root = strdup(node);
-                        if (!secondary_root)
-                                return log_oom();
+                                r = free_and_strdup(&root, node);
+                                if (r < 0)
+                                        return log_oom();
+                        }
                 }
-#endif
-        }
-
-        if (!root && !secondary_root) {
-                log_error("Failed to identify root partition in disk image %s.\n"
-                          "Note that the disk image needs to follow http://www.freedesktop.org/wiki/Specifications/DiscoverablePartitionsSpec/ to be supported by systemd-nspawn.", arg_image);
-                return -EINVAL;
         }
 
         if (root) {
@@ -2788,6 +3091,31 @@ static int dissect_image(
 
                 *root_device_rw = secondary_root_rw;
                 *secondary = true;
+        } else if (generic) {
+
+                /* There were no partitions with precise meanings
+                 * around, but we found generic partitions. In this
+                 * case, if there's only one, we can go ahead and boot
+                 * it, otherwise we bail out, because we really cannot
+                 * make any sense of it. */
+
+                if (multiple_generic) {
+                        log_error("Identified multiple bootable Linux partitions on\n"
+                                  "    %s\n"
+                                  PARTITION_TABLE_BLURB, arg_image);
+                        return -EINVAL;
+                }
+
+                *root_device = generic;
+                generic = NULL;
+
+                *root_device_rw = generic_rw;
+                *secondary = false;
+        } else {
+                log_error("Failed to identify root partition in disk image\n"
+                          "    %s\n"
+                          PARTITION_TABLE_BLURB, arg_image);
+                return -EINVAL;
         }
 
         if (home) {
@@ -2824,7 +3152,7 @@ static int mount_device(const char *what, const char *where, const char *directo
                 rw = false;
 
         if (directory)
-                p = strappenda(where, directory);
+                p = strjoina(where, directory);
         else
                 p = where;
 
@@ -2915,7 +3243,7 @@ static void loop_remove(int nr, int *image_fd) {
         if (image_fd && *image_fd >= 0) {
                 r = ioctl(*image_fd, LOOP_CLR_FD);
                 if (r < 0)
-                        log_warning_errno(errno, "Failed to close loop image: %m");
+                        log_debug_errno(errno, "Failed to close loop image: %m");
                 *image_fd = safe_close(*image_fd);
         }
 
@@ -2927,7 +3255,7 @@ static void loop_remove(int nr, int *image_fd) {
 
         r = ioctl(control, LOOP_CTL_REMOVE, nr);
         if (r < 0)
-                log_warning_errno(errno, "Failed to remove loop %d: %m", nr);
+                log_debug_errno(errno, "Failed to remove loop %d: %m", nr);
 }
 
 static int spawn_getent(const char *database, const char *key, pid_t *rpid) {
@@ -3247,7 +3575,7 @@ static int on_orderly_shutdown(sd_event_source *s, const struct signalfd_siginfo
 
         pid = PTR_TO_UINT32(userdata);
         if (pid > 0) {
-                if (kill(pid, SIGRTMIN+3) >= 0) {
+                if (kill(pid, arg_kill_signal) >= 0) {
                         log_info("Trying to halt container. Send SIGTERM again to trigger immediate termination.");
                         sd_event_source_set_userdata(s, NULL);
                         return 0;
@@ -3273,7 +3601,7 @@ static int determine_names(void) {
                                 return -ENOENT;
                         }
 
-                        if (i->type == IMAGE_GPT)
+                        if (i->type == IMAGE_RAW)
                                 r = set_sanitized_path(&arg_image, i->path);
                         else
                                 r = set_sanitized_path(&arg_directory, i->path);
@@ -3324,6 +3652,38 @@ static int determine_names(void) {
         return 0;
 }
 
+static int determine_uid_shift(void) {
+        int r;
+        /* Settle arg_uid_shift/arg_uid_range for user namespacing; nothing to do (returns 0) unless arg_userns is set. */
+        if (!arg_userns)
+                return 0;
+        /* arg_uid_shift still holds UID_INVALID, its initial value: derive the base from the ownership of arg_directory. */
+        if (arg_uid_shift == UID_INVALID) {
+                struct stat st;
+
+                r = stat(arg_directory, &st);
+                if (r < 0)
+                        return log_error_errno(errno, "Failed to determine UID base of %s: %m", arg_directory);
+                /* Keep only the upper 16 bits of the owning UID as the base. */
+                arg_uid_shift = st.st_uid & UINT32_C(0xffff0000);
+                /* The owning GID must have the same upper 16 bits, otherwise fail with -EINVAL. */
+                if (arg_uid_shift != (st.st_gid & UINT32_C(0xffff0000))) {
+                        log_error("UID and GID base of %s don't match.", arg_directory);
+                        return -EINVAL;
+                }
+                /* A base derived this way is always paired with a range of 0x10000 IDs. */
+                arg_uid_range = UINT32_C(0x10000);
+        }
+        /* Refuse a base so high that base + range would go past (uid_t) -1. */
+        if (arg_uid_shift > (uid_t) -1 - arg_uid_range) {
+                log_error("UID base too high for UID range.");
+                return -EINVAL;
+        }
+
+        log_info("Using user namespaces with base " UID_FMT " and range " UID_FMT ".", arg_uid_shift, arg_uid_range);
+        return 0;
+}
+
 int main(int argc, char *argv[]) {
 
         _cleanup_free_ char *device_path = NULL, *root_device = NULL, *home_device = NULL, *srv_device = NULL, *console = NULL;
@@ -3337,6 +3697,8 @@ int main(int argc, char *argv[]) {
         pid_t pid = 0;
         int ret = EXIT_SUCCESS;
         union in_addr_union exposed = {};
+        _cleanup_release_lock_file_ LockFile tree_global_lock = LOCK_FILE_INIT, tree_local_lock = LOCK_FILE_INIT;
+        bool interactive;
 
         log_parse_environment();
         log_open();
@@ -3382,20 +3744,7 @@ int main(int argc, char *argv[]) {
                         goto finish;
                 }
 
-                if (arg_template) {
-                        r = btrfs_subvol_snapshot(arg_template, arg_directory, arg_read_only, true);
-                        if (r == -EEXIST) {
-                                if (!arg_quiet)
-                                        log_info("Directory %s already exists, not populating from template %s.", arg_directory, arg_template);
-                        } else if (r < 0) {
-                                log_error_errno(r, "Couldn't create snapshort %s from %s: %m", arg_directory, arg_template);
-                                goto finish;
-                        } else {
-                                if (!arg_quiet)
-                                        log_info("Populated %s from template %s.", arg_directory, arg_template);
-                        }
-
-                } else if (arg_ephemeral) {
+                if (arg_ephemeral) {
                         char *np;
 
                         /* If the specified path is a mount point we
@@ -3418,6 +3767,12 @@ int main(int argc, char *argv[]) {
                                 goto finish;
                         }
 
+                        r = image_path_lock(np, (arg_read_only ? LOCK_SH : LOCK_EX) | LOCK_NB, &tree_global_lock, &tree_local_lock);
+                        if (r < 0) {
+                                log_error_errno(r, "Failed to lock %s: %m", np);
+                                goto finish;
+                        }
+
                         r = btrfs_subvol_snapshot(arg_directory, np, arg_read_only, true);
                         if (r < 0) {
                                 free(np);
@@ -3429,6 +3784,31 @@ int main(int argc, char *argv[]) {
                         arg_directory = np;
 
                         remove_subvol = true;
+
+                } else {
+                        r = image_path_lock(arg_directory, (arg_read_only ? LOCK_SH : LOCK_EX) | LOCK_NB, &tree_global_lock, &tree_local_lock);
+                        if (r == -EBUSY) {
+                                log_error_errno(r, "Directory tree %s is currently busy.", arg_directory);
+                                goto finish;
+                        }
+                        if (r < 0) {
+                                log_error_errno(r, "Failed to lock %s: %m", arg_directory);
+                                return r;
+                        }
+
+                        if (arg_template) {
+                                r = btrfs_subvol_snapshot(arg_template, arg_directory, arg_read_only, true);
+                                if (r == -EEXIST) {
+                                        if (!arg_quiet)
+                                                log_info("Directory %s already exists, not populating from template %s.", arg_directory, arg_template);
+                                } else if (r < 0) {
+                                        log_error_errno(r, "Couldn't create snapshot %s from %s: %m", arg_directory, arg_template);
+                                        goto finish;
+                                } else {
+                                        if (!arg_quiet)
+                                                log_info("Populated %s from template %s.", arg_directory, arg_template);
+                                }
+                        }
                 }
 
                 if (arg_boot) {
@@ -3440,7 +3820,7 @@ int main(int argc, char *argv[]) {
                 } else {
                         const char *p;
 
-                        p = strappenda(arg_directory,
+                        p = strjoina(arg_directory,
                                        argc > optind && path_is_absolute(argv[optind]) ? argv[optind] : "/usr/bin/");
                         if (access(p, F_OK) < 0) {
                                 log_error("Directory %s lacks the binary to execute or doesn't look like a binary tree. Refusing.", arg_directory);
@@ -3455,6 +3835,16 @@ int main(int argc, char *argv[]) {
                 assert(arg_image);
                 assert(!arg_template);
 
+                r = image_path_lock(arg_image, (arg_read_only ? LOCK_SH : LOCK_EX) | LOCK_NB, &tree_global_lock, &tree_local_lock);
+                if (r == -EBUSY) {
+                        r = log_error_errno(r, "Disk image %s is currently busy.", arg_image);
+                        goto finish;
+                }
+                if (r < 0) {
+                        r = log_error_errno(r, "Failed to create image lock: %m");
+                        goto finish;
+                }
+
                 if (!mkdtemp(template)) {
                         log_error_errno(errno, "Failed to create temporary directory: %m");
                         r = -errno;
@@ -3482,6 +3872,12 @@ int main(int argc, char *argv[]) {
                         goto finish;
         }
 
+        r = determine_uid_shift();
+        if (r < 0)
+                goto finish;
+
+        interactive = isatty(STDIN_FILENO) > 0 && isatty(STDOUT_FILENO) > 0;
+
         master = posix_openpt(O_RDWR|O_NOCTTY|O_CLOEXEC|O_NDELAY);
         if (master < 0) {
                 r = log_error_errno(errno, "Failed to acquire pseudo tty: %m");
@@ -3494,15 +3890,15 @@ int main(int argc, char *argv[]) {
                 goto finish;
         }
 
-        if (!arg_quiet)
-                log_info("Spawning container %s on %s.\nPress ^] three times within 1s to kill container.",
-                         arg_machine, arg_image ?: arg_directory);
-
         if (unlockpt(master) < 0) {
                 r = log_error_errno(errno, "Failed to unlock tty: %m");
                 goto finish;
         }
 
+        if (!arg_quiet)
+                log_info("Spawning container %s on %s.\nPress ^] three times within 1s to kill container.",
+                         arg_machine, arg_image ?: arg_directory);
+
         assert_se(sigemptyset(&mask) == 0);
         sigset_add_many(&mask, SIGCHLD, SIGWINCH, SIGTERM, SIGINT, -1);
         assert_se(sigprocmask(SIG_BLOCK, &mask, NULL) == 0);
@@ -3588,31 +3984,33 @@ int main(int argc, char *argv[]) {
 
                         master = safe_close(master);
 
-                        close_nointr(STDIN_FILENO);
-                        close_nointr(STDOUT_FILENO);
-                        close_nointr(STDERR_FILENO);
-
                         kmsg_socket_pair[0] = safe_close(kmsg_socket_pair[0]);
                         rtnl_socket_pair[0] = safe_close(rtnl_socket_pair[0]);
 
                         reset_all_signal_handlers();
                         reset_signal_mask();
 
-                        r = open_terminal(console, O_RDWR);
-                        if (r != STDIN_FILENO) {
-                                if (r >= 0) {
-                                        safe_close(r);
-                                        r = -EINVAL;
-                                }
+                        if (interactive) {
+                                close_nointr(STDIN_FILENO);
+                                close_nointr(STDOUT_FILENO);
+                                close_nointr(STDERR_FILENO);
 
-                                log_error_errno(r, "Failed to open console: %m");
-                                _exit(EXIT_FAILURE);
-                        }
+                                r = open_terminal(console, O_RDWR);
+                                if (r != STDIN_FILENO) {
+                                        if (r >= 0) {
+                                                safe_close(r);
+                                                r = -EINVAL;
+                                        }
 
-                        if (dup2(STDIN_FILENO, STDOUT_FILENO) != STDOUT_FILENO ||
-                            dup2(STDIN_FILENO, STDERR_FILENO) != STDERR_FILENO) {
-                                log_error_errno(errno, "Failed to duplicate console: %m");
-                                _exit(EXIT_FAILURE);
+                                        log_error_errno(r, "Failed to open console: %m");
+                                        _exit(EXIT_FAILURE);
+                                }
+
+                                if (dup2(STDIN_FILENO, STDOUT_FILENO) != STDOUT_FILENO ||
+                                    dup2(STDIN_FILENO, STDERR_FILENO) != STDERR_FILENO) {
+                                        log_error_errno(errno, "Failed to duplicate console: %m");
+                                        _exit(EXIT_FAILURE);
+                                }
                         }
 
                         if (setsid() < 0) {
@@ -3628,6 +4026,9 @@ int main(int argc, char *argv[]) {
                                 _exit(EXIT_FAILURE);
                         }
 
+                        if (arg_private_network)
+                                loopback_setup();
+
                         /* Mark everything as slave, so that we still
                          * receive mounts from the real root, but don't
                          * propagate mounts to the real root. */
@@ -3698,7 +4099,7 @@ int main(int argc, char *argv[]) {
                         /* Tell the parent that we are ready, and that
                          * it can cgroupify us to that we lack access
                          * to certain devices and resources. */
-                        (void) barrier_place(&barrier);
+                        (void) barrier_place(&barrier); /* #1 */
 
                         if (setup_boot_id(arg_directory) < 0)
                                 _exit(EXIT_FAILURE);
@@ -3723,7 +4124,7 @@ int main(int argc, char *argv[]) {
 
                         /* Wait until we are cgroup-ified, so that we
                          * can mount the right cgroup path writable */
-                        (void) barrier_sync_next(&barrier);
+                        (void) barrier_place_and_sync(&barrier); /* #2 */
 
                         if (mount_cgroup(arg_directory) < 0)
                                 _exit(EXIT_FAILURE);
@@ -3748,16 +4149,50 @@ int main(int argc, char *argv[]) {
                                 _exit(EXIT_FAILURE);
                         }
 
-                        umask(0022);
+                        if (arg_userns) {
+                                if (unshare(CLONE_NEWUSER) < 0) {
+                                        log_error_errno(errno, "unshare(CLONE_NEWUSER) failed: %m");
+                                        _exit(EXIT_FAILURE);
+                                }
 
-                        if (arg_private_network)
-                                loopback_setup();
+                                /* Tell the parent, that it now can
+                                 * write the UID map. */
+                                (void) barrier_place(&barrier); /* #3 */
+
+                                /* Wait until the parent wrote the UID
+                                 * map */
+                                (void) barrier_place_and_sync(&barrier); /* #4 */
+                        }
+
+                        umask(0022);
 
                         if (drop_capabilities() < 0) {
                                 log_error_errno(errno, "drop_capabilities() failed: %m");
                                 _exit(EXIT_FAILURE);
                         }
 
+                        setup_hostname();
+
+                        if (arg_personality != 0xffffffffLU) {
+                                if (personality(arg_personality) < 0) {
+                                        log_error_errno(errno, "personality() failed: %m");
+                                        _exit(EXIT_FAILURE);
+                                }
+                        } else if (secondary) {
+                                if (personality(PER_LINUX32) < 0) {
+                                        log_error_errno(errno, "personality() failed: %m");
+                                        _exit(EXIT_FAILURE);
+                                }
+                        }
+
+#ifdef HAVE_SELINUX
+                        if (arg_selinux_context)
+                                if (setexeccon((security_context_t) arg_selinux_context) < 0) {
+                                        log_error_errno(errno, "setexeccon(\"%s\") failed: %m", arg_selinux_context);
+                                        _exit(EXIT_FAILURE);
+                                }
+#endif
+
                         r = change_uid_gid(&home);
                         if (r < 0)
                                 _exit(EXIT_FAILURE);
@@ -3792,28 +4227,6 @@ int main(int argc, char *argv[]) {
                                 }
                         }
 
-                        setup_hostname();
-
-                        if (arg_personality != 0xffffffffLU) {
-                                if (personality(arg_personality) < 0) {
-                                        log_error_errno(errno, "personality() failed: %m");
-                                        _exit(EXIT_FAILURE);
-                                }
-                        } else if (secondary) {
-                                if (personality(PER_LINUX32) < 0) {
-                                        log_error_errno(errno, "personality() failed: %m");
-                                        _exit(EXIT_FAILURE);
-                                }
-                        }
-
-#ifdef HAVE_SELINUX
-                        if (arg_selinux_context)
-                                if (setexeccon((security_context_t) arg_selinux_context) < 0) {
-                                        log_error_errno(errno, "setexeccon(\"%s\") failed: %m", arg_selinux_context);
-                                        _exit(EXIT_FAILURE);
-                                }
-#endif
-
                         if (!strv_isempty(arg_setenv)) {
                                 char **n;
 
@@ -3827,9 +4240,10 @@ int main(int argc, char *argv[]) {
                         } else
                                 env_use = (char**) envp;
 
-                        /* Wait until the parent is ready with the setup, too... */
-                        if (!barrier_place_and_sync(&barrier))
-                                _exit(EXIT_FAILURE);
+                        /* Let the parent know that we are ready and
+                         * wait until the parent is ready with the
+                         * setup, too... */
+                        (void) barrier_place_and_sync(&barrier); /* #5 */
 
                         if (arg_boot) {
                                 char **a;
@@ -3868,10 +4282,12 @@ int main(int argc, char *argv[]) {
                 kmsg_socket_pair[1] = safe_close(kmsg_socket_pair[1]);
                 rtnl_socket_pair[1] = safe_close(rtnl_socket_pair[1]);
 
+                (void) barrier_place(&barrier); /* #1 */
+
                 /* Wait for the most basic Child-setup to be done,
                  * before we add hardware to it, and place it in a
                  * cgroup. */
-                if (barrier_sync_next(&barrier)) {
+                if (barrier_sync(&barrier)) { /* #1 */
                         int ifi = 0;
 
                         r = move_network_interfaces(pid);
@@ -3890,10 +4306,43 @@ int main(int argc, char *argv[]) {
                         if (r < 0)
                                 goto finish;
 
+                        r = setup_ipvlan(pid);
+                        if (r < 0)
+                                goto finish;
+
                         r = register_machine(pid, ifi);
                         if (r < 0)
                                 goto finish;
 
+                        /* Notify the child that the parent is ready with all
+                         * its setup, and that the child can now hand over
+                         * control to the code to run inside the container. */
+                        (void) barrier_place(&barrier); /* #2 */
+
+                        if (arg_userns) {
+                                char uid_map[strlen("/proc//uid_map") + DECIMAL_STR_MAX(uid_t) + 1], line[DECIMAL_STR_MAX(uid_t)*3+3+1];
+
+                                (void) barrier_place_and_sync(&barrier); /* #3 */
+
+                                xsprintf(uid_map, "/proc/" PID_FMT "/uid_map", pid);
+                                xsprintf(line, UID_FMT " " UID_FMT " " UID_FMT "\n", 0, arg_uid_shift, arg_uid_range);
+                                r = write_string_file(uid_map, line);
+                                if (r < 0) {
+                                        log_error_errno(r, "Failed to write UID map: %m");
+                                        goto finish;
+                                }
+
+                                /* We always assign the same UID and GID ranges */
+                                xsprintf(uid_map, "/proc/" PID_FMT "/gid_map", pid);
+                                r = write_string_file(uid_map, line);
+                                if (r < 0) {
+                                        log_error_errno(r, "Failed to write GID map: %m");
+                                        goto finish;
+                                }
+
+                                (void) barrier_place(&barrier); /* #4 */
+                        }
+
                         /* Block SIGCHLD here, before notifying child.
                          * process_pty() will handle it with the other signals. */
                         r = sigprocmask(SIG_BLOCK, &mask_chld, NULL);
@@ -3905,21 +4354,17 @@ int main(int argc, char *argv[]) {
                         if (r < 0)
                                 goto finish;
 
-                        /* Notify the child that the parent is ready with all
-                         * its setup, and that the child can now hand over
-                         * control to the code to run inside the container. */
-                        (void) barrier_place(&barrier);
-
-                        /* And wait that the child is completely ready now. */
-                        if (barrier_place_and_sync(&barrier)) {
+                        /* Let the child know that we are ready, and wait until the child is completely ready as well. */
+                        if (barrier_place_and_sync(&barrier)) { /* #5 */
                                 _cleanup_event_unref_ sd_event *event = NULL;
                                 _cleanup_(pty_forward_freep) PTYForward *forward = NULL;
                                 _cleanup_rtnl_unref_ sd_rtnl *rtnl = NULL;
                                 char last_char = 0;
 
-                                sd_notify(false,
-                                          "READY=1\n"
-                                          "STATUS=Container running.");
+                                sd_notifyf(false,
+                                           "READY=1\n"
+                                           "STATUS=Container running.\n"
+                                           "X_NSPAWN_LEADER_PID=" PID_FMT, pid);
 
                                 r = sd_event_new(&event);
                                 if (r < 0) {
@@ -3927,7 +4372,7 @@ int main(int argc, char *argv[]) {
                                         goto finish;
                                 }
 
-                                if (arg_boot) {
+                                if (arg_kill_signal > 0) {
                                         /* Try to kill the init system on SIGINT or SIGTERM */
                                         sd_event_add_signal(event, NULL, SIGINT, on_orderly_shutdown, UINT32_TO_PTR(pid));
                                         sd_event_add_signal(event, NULL, SIGTERM, on_orderly_shutdown, UINT32_TO_PTR(pid));
@@ -3950,7 +4395,7 @@ int main(int argc, char *argv[]) {
 
                                 rtnl_socket_pair[0] = safe_close(rtnl_socket_pair[0]);
 
-                                r = pty_forward_new(event, master, true, &forward);
+                                r = pty_forward_new(event, master, true, !interactive, &forward);
                                 if (r < 0) {
                                         log_error_errno(r, "Failed to create PTY forwarder: %m");
                                         goto finish;
@@ -4034,7 +4479,7 @@ finish:
         if (arg_machine) {
                 const char *p;
 
-                p = strappenda("/run/systemd/nspawn/propagate", arg_machine);
+                p = strjoina("/run/systemd/nspawn/propagate/", arg_machine);
                 (void) rm_rf(p, false, true, false);
         }
 
@@ -4046,6 +4491,7 @@ finish:
         strv_free(arg_setenv);
         strv_free(arg_network_interfaces);
         strv_free(arg_network_macvlan);
+        strv_free(arg_network_ipvlan);
         strv_free(arg_bind);
         strv_free(arg_bind_ro);
         strv_free(arg_tmpfs);