chiark / gitweb /
Remove src/notify
[elogind.git] / src / nspawn / nspawn.c
index a9b9a3e062f7a6d0abd78660ab5c3e22bd3996b6..7e56cf2056c72c03ea0344e7129282d6ebfdfbae 100644 (file)
 #include <sched.h>
 #include <unistd.h>
 #include <sys/types.h>
-#include <sys/syscall.h>
 #include <sys/mount.h>
-#include <sys/wait.h>
 #include <stdlib.h>
 #include <string.h>
 #include <stdio.h>
 #include <errno.h>
 #include <sys/prctl.h>
 #include <getopt.h>
-#include <termios.h>
-#include <sys/signalfd.h>
 #include <grp.h>
 #include <linux/fs.h>
-#include <sys/un.h>
 #include <sys/socket.h>
 #include <linux/netlink.h>
 #include <net/if.h>
 #include <linux/veth.h>
 #include <sys/personality.h>
 #include <linux/loop.h>
-#include <poll.h>
 #include <sys/file.h>
 
 #ifdef HAVE_SELINUX
@@ -66,7 +60,6 @@
 #include "util.h"
 #include "mkdir.h"
 #include "macro.h"
-#include "audit.h"
 #include "missing.h"
 #include "cgroup-util.h"
 #include "strv.h"
@@ -79,9 +72,7 @@
 #include "bus-util.h"
 #include "bus-error.h"
 #include "ptyfwd.h"
-#include "bus-kernel.h"
 #include "env-util.h"
-#include "def.h"
 #include "rtnl-util.h"
 #include "udev-util.h"
 #include "blkid-util.h"
@@ -190,6 +181,7 @@ static ExposePort *arg_expose_ports = NULL;
 static char **arg_property = NULL;
 static uid_t arg_uid_shift = UID_INVALID, arg_uid_range = 0x10000U;
 static bool arg_userns = false;
+static int arg_kill_signal = 0;
 
 static void help(void) {
         printf("%s [OPTIONS...] [PATH] [ARGUMENTS...]\n\n"
@@ -238,6 +230,7 @@ static void help(void) {
                "     --capability=CAP       In addition to the default, retain specified\n"
                "                            capability\n"
                "     --drop-capability=CAP  Drop the specified capability from the default set\n"
+               "     --kill-signal=SIGNAL   Select signal to use for shutting down PID 1\n"
                "     --link-journal=MODE    Link up guest journal, one of no, auto, guest, host,\n"
                "                            try-guest, try-host\n"
                "  -j                        Equivalent to --link-journal=try-guest\n"
@@ -302,6 +295,7 @@ static int parse_argv(int argc, char *argv[]) {
                 ARG_TEMPLATE,
                 ARG_PROPERTY,
                 ARG_PRIVATE_USERS,
+                ARG_KILL_SIGNAL,
         };
 
         static const struct option options[] = {
@@ -341,6 +335,7 @@ static int parse_argv(int argc, char *argv[]) {
                 { "port",                  required_argument, NULL, 'p'                   },
                 { "property",              required_argument, NULL, ARG_PROPERTY          },
                 { "private-users",         optional_argument, NULL, ARG_PRIVATE_USERS     },
+                { "kill-signal",           required_argument, NULL, ARG_KILL_SIGNAL       },
                 {}
         };
 
@@ -776,6 +771,15 @@ static int parse_argv(int argc, char *argv[]) {
                         arg_userns = true;
                         break;
 
+                case ARG_KILL_SIGNAL:
+                        arg_kill_signal = signal_from_string_try_harder(optarg);
+                        if (arg_kill_signal < 0) {
+                                log_error("Cannot parse signal: %s", optarg);
+                                return -EINVAL;
+                        }
+
+                        break;
+
                 case '?':
                         return -EINVAL;
 
@@ -838,6 +842,9 @@ static int parse_argv(int argc, char *argv[]) {
 
         arg_retain = (arg_retain | plus | (arg_private_network ? 1ULL << CAP_NET_ADMIN : 0)) & ~minus;
 
+        if (arg_boot && arg_kill_signal <= 0)
+                arg_kill_signal = SIGRTMIN+3;
+
         return 1;
 }
 
@@ -1000,7 +1007,7 @@ static int mount_binds(const char *dest, char **l, bool ro) {
                                 return log_error_errno(r, "Failed to create mount point %s: %m", where);
                 }
 
-                if (mount(*x, where, "bind", MS_BIND, NULL) < 0)
+                if (mount(*x, where, NULL, MS_BIND, NULL) < 0)
                         return log_error_errno(errno, "mount(%s) failed: %m", where);
 
                 if (ro) {
@@ -1316,7 +1323,7 @@ static int setup_volatile(const char *directory) {
                 goto fail;
         }
 
-        if (mount(f, t, "bind", MS_BIND|MS_REC, NULL) < 0) {
+        if (mount(f, t, NULL, MS_BIND|MS_REC, NULL) < 0) {
                 log_error_errno(errno, "Failed to create /usr bind mount: %m");
                 r = -errno;
                 goto fail;
@@ -1387,10 +1394,10 @@ static int setup_boot_id(const char *dest) {
         if (r < 0)
                 return log_error_errno(r, "Failed to write boot id: %m");
 
-        if (mount(from, to, "bind", MS_BIND, NULL) < 0) {
+        if (mount(from, to, NULL, MS_BIND, NULL) < 0) {
                 log_error_errno(errno, "Failed to bind mount boot id: %m");
                 r = -errno;
-        } else if (mount(from, to, "bind", MS_BIND|MS_REMOUNT|MS_RDONLY, NULL))
+        } else if (mount(from, to, NULL, MS_BIND|MS_REMOUNT|MS_RDONLY, NULL))
                 log_warning_errno(errno, "Failed to make boot id read-only: %m");
 
         unlink(from);
@@ -1442,8 +1449,18 @@ static int copy_devnodes(const char *dest) {
                                 return -r;
                         }
 
-                        if (mknod(to, st.st_mode, st.st_rdev) < 0)
-                                return log_error_errno(errno, "mknod(%s) failed: %m", to);
+                        if (mknod(to, st.st_mode, st.st_rdev) < 0) {
+                                if (errno != EPERM)
+                                        return log_error_errno(errno, "mknod(%s) failed: %m", to);
+
+                                /* Some systems abusively restrict mknod but
+                                 * allow bind mounts. */
+                                r = touch(to);
+                                if (r < 0)
+                                        return log_error_errno(r, "touch (%s) failed: %m", to);
+                                if (mount(from, to, NULL, MS_BIND, NULL) < 0)
+                                        return log_error_errno(errno, "Both mknod and bind mount (%s) failed: %m", to);
+                        }
 
                         if (arg_userns && arg_uid_shift != UID_INVALID)
                                 if (lchown(to, arg_uid_shift, arg_uid_shift) < 0)
@@ -1474,7 +1491,6 @@ static int setup_ptmx(const char *dest) {
 static int setup_dev_console(const char *dest, const char *console) {
         _cleanup_umask_ mode_t u;
         const char *to;
-        struct stat st;
         int r;
 
         assert(dest);
@@ -1482,26 +1498,20 @@ static int setup_dev_console(const char *dest, const char *console) {
 
         u = umask(0000);
 
-        if (stat("/dev/null", &st) < 0)
-                return log_error_errno(errno, "Failed to stat /dev/null: %m");
-
         r = chmod_and_chown(console, 0600, 0, 0);
         if (r < 0)
                 return log_error_errno(r, "Failed to correct access mode for TTY: %m");
 
         /* We need to bind mount the right tty to /dev/console since
          * ptys can only exist on pts file systems. To have something
-         * to bind mount things on we create a device node first, and
-         * use /dev/null for that since we the cgroups device policy
-         * allows us to create that freely, while we cannot create
-         * /dev/console. (Note that the major minor doesn't actually
-         * matter here, since we mount it over anyway). */
+         * to bind mount things on we create an empty regular file. */
 
         to = strjoina(dest, "/dev/console");
-        if (mknod(to, (st.st_mode & ~07777) | 0600, st.st_rdev) < 0)
-                return log_error_errno(errno, "mknod() for /dev/console failed: %m");
+        r = touch(to);
+        if (r < 0)
+                return log_error_errno(r, "touch() for /dev/console failed: %m");
 
-        if (mount(console, to, "bind", MS_BIND, NULL) < 0)
+        if (mount(console, to, NULL, MS_BIND, NULL) < 0)
                 return log_error_errno(errno, "Bind mount for /dev/console failed: %m");
 
         return 0;
@@ -1544,7 +1554,7 @@ static int setup_kmsg(const char *dest, int kmsg_socket) {
         if (r < 0)
                 return log_error_errno(r, "Failed to correct access mode for /dev/kmsg: %m");
 
-        if (mount(from, to, "bind", MS_BIND, NULL) < 0)
+        if (mount(from, to, NULL, MS_BIND, NULL) < 0)
                 return log_error_errno(errno, "Bind mount for /proc/kmsg failed: %m");
 
         fd = open(from, O_RDWR|O_NDELAY|O_CLOEXEC);
@@ -1919,7 +1929,7 @@ static int setup_journal(const char *directory) {
                 return r;
         }
 
-        if (mount(p, q, "bind", MS_BIND, NULL) < 0)
+        if (mount(p, q, NULL, MS_BIND, NULL) < 0)
                 return log_error_errno(errno, "Failed to bind mount journal from host into guest: %m");
 
         return 0;
@@ -2560,19 +2570,19 @@ static int setup_ipvlan(pid_t pid) {
 static int setup_seccomp(void) {
 
 #ifdef HAVE_SECCOMP
-        static const int blacklist[] = {
-                SCMP_SYS(kexec_load),
-                SCMP_SYS(open_by_handle_at),
-                SCMP_SYS(iopl),
-                SCMP_SYS(ioperm),
-                SCMP_SYS(swapon),
-                SCMP_SYS(swapoff),
-        };
-
-        static const int kmod_blacklist[] = {
-                SCMP_SYS(init_module),
-                SCMP_SYS(finit_module),
-                SCMP_SYS(delete_module),
+        static const struct {
+                uint64_t capability;
+                int syscall_num;
+        } blacklist[] = {
+                { CAP_SYS_RAWIO,  SCMP_SYS(iopl)},
+                { CAP_SYS_RAWIO,  SCMP_SYS(ioperm)},
+                { CAP_SYS_BOOT,   SCMP_SYS(kexec_load)},
+                { CAP_SYS_ADMIN,  SCMP_SYS(swapon)},
+                { CAP_SYS_ADMIN,  SCMP_SYS(swapoff)},
+                { CAP_SYS_ADMIN,  SCMP_SYS(open_by_handle_at)},
+                { CAP_SYS_MODULE, SCMP_SYS(init_module)},
+                { CAP_SYS_MODULE, SCMP_SYS(finit_module)},
+                { CAP_SYS_MODULE, SCMP_SYS(delete_module)},
         };
 
         scmp_filter_ctx seccomp;
@@ -2590,7 +2600,10 @@ static int setup_seccomp(void) {
         }
 
         for (i = 0; i < ELEMENTSOF(blacklist); i++) {
-                r = seccomp_rule_add(seccomp, SCMP_ACT_ERRNO(EPERM), blacklist[i], 0);
+                if (arg_retain & (1ULL << blacklist[i].capability))
+                        continue;
+
+                r = seccomp_rule_add(seccomp, SCMP_ACT_ERRNO(EPERM), blacklist[i].syscall_num, 0);
                 if (r == -EFAULT)
                         continue; /* unknown syscall */
                 if (r < 0) {
@@ -2599,19 +2612,6 @@ static int setup_seccomp(void) {
                 }
         }
 
-        /* If the CAP_SYS_MODULE capability is not requested then
-         * we'll block the kmod syscalls too */
-        if (!(arg_retain & (1ULL << CAP_SYS_MODULE))) {
-                for (i = 0; i < ELEMENTSOF(kmod_blacklist); i++) {
-                        r = seccomp_rule_add(seccomp, SCMP_ACT_ERRNO(EPERM), kmod_blacklist[i], 0);
-                        if (r == -EFAULT)
-                                continue; /* unknown syscall */
-                        if (r < 0) {
-                                log_error_errno(r, "Failed to block syscall: %m");
-                                goto finish;
-                        }
-                }
-        }
 
         /*
            Audit is broken in containers, much of the userspace audit
@@ -2826,7 +2826,7 @@ static int dissect_image(
                 return -errno;
         }
 
-        blkid_probe_lookup_value(b, "PTTYPE", &pttype, NULL);
+        (void) blkid_probe_lookup_value(b, "PTTYPE", &pttype, NULL);
 
         is_gpt = streq_ptr(pttype, "gpt");
         is_mbr = streq_ptr(pttype, "dos");
@@ -3128,7 +3128,7 @@ static int dissect_image(
         return 0;
 #else
         log_error("--image= is not supported, compiled without blkid support.");
-        return -ENOTSUP;
+        return -EOPNOTSUPP;
 #endif
 }
 
@@ -3183,7 +3183,7 @@ static int mount_device(const char *what, const char *where, const char *directo
 
         if (streq(fstype, "crypto_LUKS")) {
                 log_error("nspawn currently does not support LUKS disk images.");
-                return -ENOTSUP;
+                return -EOPNOTSUPP;
         }
 
         if (mount(what, p, fstype, MS_NODEV|(rw ? 0 : MS_RDONLY), NULL) < 0)
@@ -3192,7 +3192,7 @@ static int mount_device(const char *what, const char *where, const char *directo
         return 0;
 #else
         log_error("--image= is not supported, compiled without blkid support.");
-        return -ENOTSUP;
+        return -EOPNOTSUPP;
 #endif
 }
 
@@ -3568,7 +3568,7 @@ static int on_orderly_shutdown(sd_event_source *s, const struct signalfd_siginfo
 
         pid = PTR_TO_UINT32(userdata);
         if (pid > 0) {
-                if (kill(pid, SIGRTMIN+3) >= 0) {
+                if (kill(pid, arg_kill_signal) >= 0) {
                         log_info("Trying to halt container. Send SIGTERM again to trigger immediate termination.");
                         sd_event_source_set_userdata(s, NULL);
                         return 0;
@@ -3710,12 +3710,6 @@ int main(int argc, char *argv[]) {
                 goto finish;
         }
 
-        if (sd_booted() <= 0) {
-                log_error("Not running on a systemd system.");
-                r = -EINVAL;
-                goto finish;
-        }
-
         log_close();
         n_fd_passed = sd_listen_fds(false);
         if (n_fd_passed > 0) {
@@ -3738,7 +3732,7 @@ int main(int argc, char *argv[]) {
                 }
 
                 if (arg_ephemeral) {
-                        char *np;
+                        _cleanup_free_ char *np = NULL;
 
                         /* If the specified path is a mount point we
                          * generate the new snapshot immediately
@@ -3768,13 +3762,13 @@ int main(int argc, char *argv[]) {
 
                         r = btrfs_subvol_snapshot(arg_directory, np, arg_read_only, true);
                         if (r < 0) {
-                                free(np);
                                 log_error_errno(r, "Failed to create snapshot %s from %s: %m", np, arg_directory);
                                 goto finish;
                         }
 
                         free(arg_directory);
                         arg_directory = np;
+                        np = NULL;
 
                         remove_subvol = true;
 
@@ -4037,7 +4031,7 @@ int main(int argc, char *argv[]) {
                                 _exit(EXIT_FAILURE);
 
                         /* Turn directory into bind mount */
-                        if (mount(arg_directory, arg_directory, "bind", MS_BIND|MS_REC, NULL) < 0) {
+                        if (mount(arg_directory, arg_directory, NULL, MS_BIND|MS_REC, NULL) < 0) {
                                 log_error_errno(errno, "Failed to make bind mount: %m");
                                 _exit(EXIT_FAILURE);
                         }
@@ -4365,7 +4359,7 @@ int main(int argc, char *argv[]) {
                                         goto finish;
                                 }
 
-                                if (arg_boot) {
+                                if (arg_kill_signal > 0) {
                                         /* Try to kill the init system on SIGINT or SIGTERM */
                                         sd_event_add_signal(event, NULL, SIGINT, on_orderly_shutdown, UINT32_TO_PTR(pid));
                                         sd_event_add_signal(event, NULL, SIGTERM, on_orderly_shutdown, UINT32_TO_PTR(pid));