chiark / gitweb /
nspawn: make kill signal to use for PID 1 configurable
[elogind.git] / src / nspawn / nspawn.c
index 232629d20ad80c9e4b95f894edf730bf5017ebff..8ce5fbeb629f6b25db1bd7cd0c297bf21c0e5067 100644 (file)
 #include <sched.h>
 #include <unistd.h>
 #include <sys/types.h>
-#include <sys/syscall.h>
 #include <sys/mount.h>
-#include <sys/wait.h>
 #include <stdlib.h>
 #include <string.h>
 #include <stdio.h>
 #include <errno.h>
 #include <sys/prctl.h>
 #include <getopt.h>
-#include <termios.h>
-#include <sys/signalfd.h>
 #include <grp.h>
 #include <linux/fs.h>
-#include <sys/un.h>
 #include <sys/socket.h>
 #include <linux/netlink.h>
 #include <net/if.h>
 #include <linux/veth.h>
 #include <sys/personality.h>
 #include <linux/loop.h>
-#include <poll.h>
 #include <sys/file.h>
 
 #ifdef HAVE_SELINUX
@@ -66,7 +60,6 @@
 #include "util.h"
 #include "mkdir.h"
 #include "macro.h"
-#include "audit.h"
 #include "missing.h"
 #include "cgroup-util.h"
 #include "strv.h"
@@ -79,9 +72,7 @@
 #include "bus-util.h"
 #include "bus-error.h"
 #include "ptyfwd.h"
-#include "bus-kernel.h"
 #include "env-util.h"
-#include "def.h"
 #include "rtnl-util.h"
 #include "udev-util.h"
 #include "blkid-util.h"
@@ -188,6 +179,9 @@ static char *arg_image = NULL;
 static Volatile arg_volatile = VOLATILE_NO;
 static ExposePort *arg_expose_ports = NULL;
 static char **arg_property = NULL;
+static uid_t arg_uid_shift = UID_INVALID, arg_uid_range = 0x10000U;
+static bool arg_userns = false;
+static int arg_kill_signal = 0;
 
 static void help(void) {
         printf("%s [OPTIONS...] [PATH] [ARGUMENTS...]\n\n"
@@ -223,6 +217,8 @@ static void help(void) {
                "                            Add a virtual ethernet connection between host\n"
                "                            and container and add it to an existing bridge on\n"
                "                            the host\n"
+               "     --private-users[=UIDBASE[:NUIDS]]\n"
+               "                            Run within user namespace\n"
                "  -p --port=[PROTOCOL:]HOSTPORT[:CONTAINERPORT]\n"
                "                            Expose a container IP port on the host\n"
                "  -Z --selinux-context=SECLABEL\n"
@@ -234,6 +230,7 @@ static void help(void) {
                "     --capability=CAP       In addition to the default, retain specified\n"
                "                            capability\n"
                "     --drop-capability=CAP  Drop the specified capability from the default set\n"
+               "     --kill-signal=SIGNAL   Select signal to use for shutting down PID 1\n"
                "     --link-journal=MODE    Link up guest journal, one of no, auto, guest, host,\n"
                "                            try-guest, try-host\n"
                "  -j                        Equivalent to --link-journal=try-guest\n"
@@ -297,6 +294,8 @@ static int parse_argv(int argc, char *argv[]) {
                 ARG_VOLATILE,
                 ARG_TEMPLATE,
                 ARG_PROPERTY,
+                ARG_PRIVATE_USERS,
+                ARG_KILL_SIGNAL,
         };
 
         static const struct option options[] = {
@@ -335,6 +334,8 @@ static int parse_argv(int argc, char *argv[]) {
                 { "volatile",              optional_argument, NULL, ARG_VOLATILE          },
                 { "port",                  required_argument, NULL, 'p'                   },
                 { "property",              required_argument, NULL, ARG_PROPERTY          },
+                { "private-users",         optional_argument, NULL, ARG_PRIVATE_USERS     },
+                { "kill-signal",           required_argument, NULL, ARG_KILL_SIGNAL       },
                 {}
         };
 
@@ -741,6 +742,44 @@ static int parse_argv(int argc, char *argv[]) {
 
                         break;
 
+                case ARG_PRIVATE_USERS:
+                        if (optarg) {
+                                _cleanup_free_ char *buffer = NULL;
+                                const char *range, *shift;
+
+                                range = strchr(optarg, ':');
+                                if (range) {
+                                        buffer = strndup(optarg, range - optarg);
+                                        if (!buffer)
+                                                return log_oom();
+                                        shift = buffer;
+
+                                        range++;
+                                        if (safe_atou32(range, &arg_uid_range) < 0 || arg_uid_range <= 0) {
+                                                log_error("Failed to parse UID range: %s", range);
+                                                return -EINVAL;
+                                        }
+                                } else
+                                        shift = optarg;
+
+                                if (parse_uid(shift, &arg_uid_shift) < 0) {
+                                        log_error("Failed to parse UID: %s", optarg);
+                                        return -EINVAL;
+                                }
+                        }
+
+                        arg_userns = true;
+                        break;
+
+                case ARG_KILL_SIGNAL:
+                        arg_kill_signal = signal_from_string_try_harder(optarg);
+                        if (arg_kill_signal < 0) {
+                                log_error("Cannot parse signal: %s", optarg);
+                                return -EINVAL;
+                        }
+
+                        break;
+
                 case '?':
                         return -EINVAL;
 
@@ -803,6 +842,9 @@ static int parse_argv(int argc, char *argv[]) {
 
         arg_retain = (arg_retain | plus | (arg_private_network ? 1ULL << CAP_NET_ADMIN : 0)) & ~minus;
 
+        if (arg_boot && arg_kill_signal <= 0)
+                arg_kill_signal = SIGRTMIN+3;
+
         return 1;
 }
 
@@ -837,10 +879,7 @@ static int mount_all(const char *dest) {
         int r = 0;
 
         for (k = 0; k < ELEMENTSOF(mount_table); k++) {
-                _cleanup_free_ char *where = NULL;
-#ifdef HAVE_SELINUX
-                _cleanup_free_ char *options = NULL;
-#endif
+                _cleanup_free_ char *where = NULL, *options = NULL;
                 const char *o;
                 int t;
 
@@ -887,6 +926,19 @@ static int mount_all(const char *dest) {
 #endif
                         o = mount_table[k].options;
 
+                if (arg_userns && arg_uid_shift != UID_INVALID && streq_ptr(mount_table[k].type, "tmpfs")) {
+                        char *uid_options = NULL;
+
+                        if (o)
+                                asprintf(&uid_options, "%s,uid=" UID_FMT ",gid=" UID_FMT, o, arg_uid_shift, arg_uid_shift);
+                        else
+                                asprintf(&uid_options, "uid=" UID_FMT ",gid=" UID_FMT, arg_uid_shift, arg_uid_shift);
+                        if (!uid_options)
+                                return log_oom();
+
+                        free(options);
+                        o = options = uid_options;
+                }
 
                 if (mount(mount_table[k].what,
                           where,
@@ -1399,6 +1451,10 @@ static int copy_devnodes(const char *dest) {
 
                         if (mknod(to, st.st_mode, st.st_rdev) < 0)
                                 return log_error_errno(errno, "mknod(%s) failed: %m", to);
+
+                        if (arg_userns && arg_uid_shift != UID_INVALID)
+                                if (lchown(to, arg_uid_shift, arg_uid_shift) < 0)
+                                        return log_error_errno(errno, "chown() of device node %s failed: %m", to);
                 }
         }
 
@@ -1415,6 +1471,10 @@ static int setup_ptmx(const char *dest) {
         if (symlink("pts/ptmx", p) < 0)
                 return log_error_errno(errno, "Failed to create /dev/ptmx symlink: %m");
 
+        if (arg_userns && arg_uid_shift != UID_INVALID)
+                if (lchown(p, arg_uid_shift, arg_uid_shift) < 0)
+                        return log_error_errno(errno, "lchown() of symlink %s failed: %m", p);
+
         return 0;
 }
 
@@ -2701,7 +2761,7 @@ static int setup_image(char **device_path, int *loop_nr) {
 
 #define PARTITION_TABLE_BLURB \
         "Note that the disk image needs to either contain only a single MBR partition of\n" \
-        "type 0x83 that is marked bootable, or a sinlge GPT partition of type" \
+        "type 0x83 that is marked bootable, or a single GPT partition of type " \
         "0FC63DAF-8483-4772-8E79-3D69D8477DE4 or follow\n" \
         "    http://www.freedesktop.org/wiki/Specifications/DiscoverablePartitionsSpec/\n" \
         "to be bootable with systemd-nspawn."
@@ -3515,7 +3575,7 @@ static int on_orderly_shutdown(sd_event_source *s, const struct signalfd_siginfo
 
         pid = PTR_TO_UINT32(userdata);
         if (pid > 0) {
-                if (kill(pid, SIGRTMIN+3) >= 0) {
+                if (kill(pid, arg_kill_signal) >= 0) {
                         log_info("Trying to halt container. Send SIGTERM again to trigger immediate termination.");
                         sd_event_source_set_userdata(s, NULL);
                         return 0;
@@ -3592,6 +3652,38 @@ static int determine_names(void) {
         return 0;
 }
 
+static int determine_uid_shift(void) {
+        int r;
+
+        if (!arg_userns)
+                return 0;
+
+        if (arg_uid_shift == UID_INVALID) {
+                struct stat st;
+
+                r = stat(arg_directory, &st);
+                if (r < 0)
+                        return log_error_errno(errno, "Failed to determine UID base of %s: %m", arg_directory);
+
+                arg_uid_shift = st.st_uid & UINT32_C(0xffff0000);
+
+                if (arg_uid_shift != (st.st_gid & UINT32_C(0xffff0000))) {
+                        log_error("UID and GID base of %s don't match.", arg_directory);
+                        return -EINVAL;
+                }
+
+                arg_uid_range = UINT32_C(0x10000);
+        }
+
+        if (arg_uid_shift > (uid_t) -1 - arg_uid_range) {
+                log_error("UID base too high for UID range.");
+                return -EINVAL;
+        }
+
+        log_info("Using user namespaces with base " UID_FMT " and range " UID_FMT ".", arg_uid_shift, arg_uid_range);
+        return 0;
+}
+
 int main(int argc, char *argv[]) {
 
         _cleanup_free_ char *device_path = NULL, *root_device = NULL, *home_device = NULL, *srv_device = NULL, *console = NULL;
@@ -3606,6 +3698,7 @@ int main(int argc, char *argv[]) {
         int ret = EXIT_SUCCESS;
         union in_addr_union exposed = {};
         _cleanup_release_lock_file_ LockFile tree_global_lock = LOCK_FILE_INIT, tree_local_lock = LOCK_FILE_INIT;
+        bool interactive;
 
         log_parse_environment();
         log_open();
@@ -3779,6 +3872,12 @@ int main(int argc, char *argv[]) {
                         goto finish;
         }
 
+        r = determine_uid_shift();
+        if (r < 0)
+                goto finish;
+
+        interactive = isatty(STDIN_FILENO) > 0 && isatty(STDOUT_FILENO) > 0;
+
         master = posix_openpt(O_RDWR|O_NOCTTY|O_CLOEXEC|O_NDELAY);
         if (master < 0) {
                 r = log_error_errno(errno, "Failed to acquire pseudo tty: %m");
@@ -3791,15 +3890,15 @@ int main(int argc, char *argv[]) {
                 goto finish;
         }
 
-        if (!arg_quiet)
-                log_info("Spawning container %s on %s.\nPress ^] three times within 1s to kill container.",
-                         arg_machine, arg_image ?: arg_directory);
-
         if (unlockpt(master) < 0) {
                 r = log_error_errno(errno, "Failed to unlock tty: %m");
                 goto finish;
         }
 
+        if (!arg_quiet)
+                log_info("Spawning container %s on %s.\nPress ^] three times within 1s to kill container.",
+                         arg_machine, arg_image ?: arg_directory);
+
         assert_se(sigemptyset(&mask) == 0);
         sigset_add_many(&mask, SIGCHLD, SIGWINCH, SIGTERM, SIGINT, -1);
         assert_se(sigprocmask(SIG_BLOCK, &mask, NULL) == 0);
@@ -3885,31 +3984,33 @@ int main(int argc, char *argv[]) {
 
                         master = safe_close(master);
 
-                        close_nointr(STDIN_FILENO);
-                        close_nointr(STDOUT_FILENO);
-                        close_nointr(STDERR_FILENO);
-
                         kmsg_socket_pair[0] = safe_close(kmsg_socket_pair[0]);
                         rtnl_socket_pair[0] = safe_close(rtnl_socket_pair[0]);
 
                         reset_all_signal_handlers();
                         reset_signal_mask();
 
-                        r = open_terminal(console, O_RDWR);
-                        if (r != STDIN_FILENO) {
-                                if (r >= 0) {
-                                        safe_close(r);
-                                        r = -EINVAL;
-                                }
+                        if (interactive) {
+                                close_nointr(STDIN_FILENO);
+                                close_nointr(STDOUT_FILENO);
+                                close_nointr(STDERR_FILENO);
 
-                                log_error_errno(r, "Failed to open console: %m");
-                                _exit(EXIT_FAILURE);
-                        }
+                                r = open_terminal(console, O_RDWR);
+                                if (r != STDIN_FILENO) {
+                                        if (r >= 0) {
+                                                safe_close(r);
+                                                r = -EINVAL;
+                                        }
 
-                        if (dup2(STDIN_FILENO, STDOUT_FILENO) != STDOUT_FILENO ||
-                            dup2(STDIN_FILENO, STDERR_FILENO) != STDERR_FILENO) {
-                                log_error_errno(errno, "Failed to duplicate console: %m");
-                                _exit(EXIT_FAILURE);
+                                        log_error_errno(r, "Failed to open console: %m");
+                                        _exit(EXIT_FAILURE);
+                                }
+
+                                if (dup2(STDIN_FILENO, STDOUT_FILENO) != STDOUT_FILENO ||
+                                    dup2(STDIN_FILENO, STDERR_FILENO) != STDERR_FILENO) {
+                                        log_error_errno(errno, "Failed to duplicate console: %m");
+                                        _exit(EXIT_FAILURE);
+                                }
                         }
 
                         if (setsid() < 0) {
@@ -3925,6 +4026,9 @@ int main(int argc, char *argv[]) {
                                 _exit(EXIT_FAILURE);
                         }
 
+                        if (arg_private_network)
+                                loopback_setup();
+
                         /* Mark everything as slave, so that we still
                          * receive mounts from the real root, but don't
                          * propagate mounts to the real root. */
@@ -3995,7 +4099,7 @@ int main(int argc, char *argv[]) {
                         /* Tell the parent that we are ready, and that
                          * it can cgroupify us to that we lack access
                          * to certain devices and resources. */
-                        (void) barrier_place(&barrier);
+                        (void) barrier_place(&barrier); /* #1 */
 
                         if (setup_boot_id(arg_directory) < 0)
                                 _exit(EXIT_FAILURE);
@@ -4020,7 +4124,7 @@ int main(int argc, char *argv[]) {
 
                         /* Wait until we are cgroup-ified, so that we
                          * can mount the right cgroup path writable */
-                        (void) barrier_sync_next(&barrier);
+                        (void) barrier_place_and_sync(&barrier); /* #2 */
 
                         if (mount_cgroup(arg_directory) < 0)
                                 _exit(EXIT_FAILURE);
@@ -4045,16 +4149,50 @@ int main(int argc, char *argv[]) {
                                 _exit(EXIT_FAILURE);
                         }
 
-                        umask(0022);
+                        if (arg_userns) {
+                                if (unshare(CLONE_NEWUSER) < 0) {
+                                        log_error_errno(errno, "unshare(CLONE_NEWUSER) failed: %m");
+                                        _exit(EXIT_FAILURE);
+                                }
 
-                        if (arg_private_network)
-                                loopback_setup();
+                                /* Tell the parent, that it now can
+                                 * write the UID map. */
+                                (void) barrier_place(&barrier); /* #3 */
+
+                                /* Wait until the parent wrote the UID
+                                 * map */
+                                (void) barrier_place_and_sync(&barrier); /* #4 */
+                        }
+
+                        umask(0022);
 
                         if (drop_capabilities() < 0) {
                                 log_error_errno(errno, "drop_capabilities() failed: %m");
                                 _exit(EXIT_FAILURE);
                         }
 
+                        setup_hostname();
+
+                        if (arg_personality != 0xffffffffLU) {
+                                if (personality(arg_personality) < 0) {
+                                        log_error_errno(errno, "personality() failed: %m");
+                                        _exit(EXIT_FAILURE);
+                                }
+                        } else if (secondary) {
+                                if (personality(PER_LINUX32) < 0) {
+                                        log_error_errno(errno, "personality() failed: %m");
+                                        _exit(EXIT_FAILURE);
+                                }
+                        }
+
+#ifdef HAVE_SELINUX
+                        if (arg_selinux_context)
+                                if (setexeccon((security_context_t) arg_selinux_context) < 0) {
+                                        log_error_errno(errno, "setexeccon(\"%s\") failed: %m", arg_selinux_context);
+                                        _exit(EXIT_FAILURE);
+                                }
+#endif
+
                         r = change_uid_gid(&home);
                         if (r < 0)
                                 _exit(EXIT_FAILURE);
@@ -4089,28 +4227,6 @@ int main(int argc, char *argv[]) {
                                 }
                         }
 
-                        setup_hostname();
-
-                        if (arg_personality != 0xffffffffLU) {
-                                if (personality(arg_personality) < 0) {
-                                        log_error_errno(errno, "personality() failed: %m");
-                                        _exit(EXIT_FAILURE);
-                                }
-                        } else if (secondary) {
-                                if (personality(PER_LINUX32) < 0) {
-                                        log_error_errno(errno, "personality() failed: %m");
-                                        _exit(EXIT_FAILURE);
-                                }
-                        }
-
-#ifdef HAVE_SELINUX
-                        if (arg_selinux_context)
-                                if (setexeccon((security_context_t) arg_selinux_context) < 0) {
-                                        log_error_errno(errno, "setexeccon(\"%s\") failed: %m", arg_selinux_context);
-                                        _exit(EXIT_FAILURE);
-                                }
-#endif
-
                         if (!strv_isempty(arg_setenv)) {
                                 char **n;
 
@@ -4124,9 +4240,10 @@ int main(int argc, char *argv[]) {
                         } else
                                 env_use = (char**) envp;
 
-                        /* Wait until the parent is ready with the setup, too... */
-                        if (!barrier_place_and_sync(&barrier))
-                                _exit(EXIT_FAILURE);
+                        /* Let the parent know that we are ready and
+                         * wait until the parent is ready with the
+                         * setup, too... */
+                        (void) barrier_place_and_sync(&barrier); /* #5 */
 
                         if (arg_boot) {
                                 char **a;
@@ -4165,10 +4282,12 @@ int main(int argc, char *argv[]) {
                 kmsg_socket_pair[1] = safe_close(kmsg_socket_pair[1]);
                 rtnl_socket_pair[1] = safe_close(rtnl_socket_pair[1]);
 
+                (void) barrier_place(&barrier); /* #1 */
+
                 /* Wait for the most basic Child-setup to be done,
                  * before we add hardware to it, and place it in a
                  * cgroup. */
-                if (barrier_sync_next(&barrier)) {
+                if (barrier_sync(&barrier)) { /* #1 */
                         int ifi = 0;
 
                         r = move_network_interfaces(pid);
@@ -4195,6 +4314,35 @@ int main(int argc, char *argv[]) {
                         if (r < 0)
                                 goto finish;
 
+                        /* Notify the child that the parent is ready with all
+                         * its setup, and that the child can now hand over
+                         * control to the code to run inside the container. */
+                        (void) barrier_place(&barrier); /* #2 */
+
+                        if (arg_userns) {
+                                char uid_map[strlen("/proc//uid_map") + DECIMAL_STR_MAX(uid_t) + 1], line[DECIMAL_STR_MAX(uid_t)*3+3+1];
+
+                                (void) barrier_place_and_sync(&barrier); /* #3 */
+
+                                xsprintf(uid_map, "/proc/" PID_FMT "/uid_map", pid);
+                                xsprintf(line, UID_FMT " " UID_FMT " " UID_FMT "\n", 0, arg_uid_shift, arg_uid_range);
+                                r = write_string_file(uid_map, line);
+                                if (r < 0) {
+                                        log_error_errno(r, "Failed to write UID map: %m");
+                                        goto finish;
+                                }
+
+                                /* We always assign the same UID and GID ranges */
+                                xsprintf(uid_map, "/proc/" PID_FMT "/gid_map", pid);
+                                r = write_string_file(uid_map, line);
+                                if (r < 0) {
+                                        log_error_errno(r, "Failed to write GID map: %m");
+                                        goto finish;
+                                }
+
+                                (void) barrier_place(&barrier); /* #4 */
+                        }
+
                         /* Block SIGCHLD here, before notifying child.
                          * process_pty() will handle it with the other signals. */
                         r = sigprocmask(SIG_BLOCK, &mask_chld, NULL);
@@ -4206,13 +4354,8 @@ int main(int argc, char *argv[]) {
                         if (r < 0)
                                 goto finish;
 
-                        /* Notify the child that the parent is ready with all
-                         * its setup, and that the child can now hand over
-                         * control to the code to run inside the container. */
-                        (void) barrier_place(&barrier);
-
-                        /* And wait that the child is completely ready now. */
-                        if (barrier_place_and_sync(&barrier)) {
+                        /* Let the child know that we are ready and wait that the child is completely ready now. */
+                        if (barrier_place_and_sync(&barrier)) { /* #5 */
                                 _cleanup_event_unref_ sd_event *event = NULL;
                                 _cleanup_(pty_forward_freep) PTYForward *forward = NULL;
                                 _cleanup_rtnl_unref_ sd_rtnl *rtnl = NULL;
@@ -4229,7 +4372,7 @@ int main(int argc, char *argv[]) {
                                         goto finish;
                                 }
 
-                                if (arg_boot) {
+                                if (arg_kill_signal > 0) {
                                         /* Try to kill the init system on SIGINT or SIGTERM */
                                         sd_event_add_signal(event, NULL, SIGINT, on_orderly_shutdown, UINT32_TO_PTR(pid));
                                         sd_event_add_signal(event, NULL, SIGTERM, on_orderly_shutdown, UINT32_TO_PTR(pid));
@@ -4252,7 +4395,7 @@ int main(int argc, char *argv[]) {
 
                                 rtnl_socket_pair[0] = safe_close(rtnl_socket_pair[0]);
 
-                                r = pty_forward_new(event, master, true, &forward);
+                                r = pty_forward_new(event, master, true, !interactive, &forward);
                                 if (r < 0) {
                                         log_error_errno(r, "Failed to create PTY forwarder: %m");
                                         goto finish;