chiark / gitweb /
execute: support syscall filtering using seccomp filters
authorLennart Poettering <lennart@poettering.net>
Tue, 17 Jul 2012 02:17:53 +0000 (04:17 +0200)
committerLennart Poettering <lennart@poettering.net>
Tue, 17 Jul 2012 02:17:53 +0000 (04:17 +0200)
19 files changed:
Makefile.am
TODO
man/systemd.exec.xml
src/core/.gitignore
src/core/dbus-execute.c
src/core/dbus-execute.h
src/core/execute.c
src/core/execute.h
src/core/load-fragment-gperf.gperf.m4
src/core/load-fragment.c
src/core/load-fragment.h
src/core/syscall-list.c [new file with mode: 0644]
src/core/syscall-list.h [new file with mode: 0644]
src/shared/dbus-common.c
src/shared/exit-status.c
src/shared/exit-status.h
src/shared/linux/seccomp-bpf.h [new file with mode: 0644]
src/shared/linux/seccomp.h [new file with mode: 0644]
src/shared/missing.h

index cf911a0..b16c01a 100644 (file)
@@ -961,11 +961,15 @@ libsystemd_core_la_SOURCES = \
        src/core/switch-root.h \
        src/core/switch-root.c \
        src/core/killall.h \
-       src/core/killall.c
+       src/core/killall.c \
+       src/core/syscall-list.c \
+       src/core/syscall-list.h
 
 nodist_libsystemd_core_la_SOURCES = \
        src/core/load-fragment-gperf.c \
-       src/core/load-fragment-gperf-nulstr.c
+       src/core/load-fragment-gperf-nulstr.c \
+       src/core/syscall-from-name.h \
+       src/core/syscall-to-name.h
 
 libsystemd_core_la_CFLAGS = \
        $(AM_CFLAGS) \
@@ -998,7 +1002,23 @@ EXTRA_DIST += \
 CLEANFILES += \
        src/core/load-fragment-gperf.gperf \
        src/core/load-fragment-gperf.c \
-       src/core/load-fragment-gperf-nulstr.c
+       src/core/load-fragment-gperf-nulstr.c \
+       src/core/syscall-list.txt \
+       src/core/syscall-from-name.gperf \
+       src/core/syscall-from-name.h \
+       src/core/syscall-to-name.h
+
+src/core/syscall-list.txt: Makefile
+       $(AM_V_GEN)cpp -dM -include sys/syscall.h < /dev/null | $(AWK) '/^#define[ \t]+__NR_[^ ]+[ \t]+[0-9]/ { sub(/__NR_/, "", $$2); print $$2; }' > $@ || rm $@
+
+src/core/syscall-from-name.gperf: src/core/syscall-list.txt Makefile
+       $(AM_V_GEN)$(AWK) 'BEGIN{ print "struct syscall_name { const char* name; int id; };"; print "%null-strings"; print "%%";} { printf "%s, __NR_%s\n", $$1, $$1 }' < $< > $@
+
+src/core/syscall-from-name.h: src/core/syscall-from-name.gperf Makefile
+       $(AM_V_GEN)$(GPERF) -L ANSI-C -t --ignore-case -N lookup_syscall -H hash_syscall_name -p -C < $< > $@
+
+src/core/syscall-to-name.h: src/core/syscall-list.txt Makefile
+       $(AM_V_GEN)$(AWK) 'BEGIN{ print "const char* const syscall_names[] = { "} { printf "[__NR_%s] = \"%s\",\n", $$1, $$1 } END{print "};"}' < $< > $@
 
 # ------------------------------------------------------------------------------
 systemd_SOURCES = \
diff --git a/TODO b/TODO
index 0a923ce..c0381b8 100644 (file)
--- a/TODO
+++ b/TODO
@@ -33,12 +33,13 @@ Bugfixes:
   Jul 09 18:22:37 mop [21866]: Process 21865 (systemd) dumped core.
 
 Features:
+
+* use cpp -dM for key mapping too?
+
 * change mount access mode of 0700 or so for debugfs?
 
 * logind: wakelock/opportunistic suspend support
 
-* seccomp filters for services
-
 * switch-root: sockets need relabelling
 
 * segfault in journalctl during /var migration
@@ -60,8 +61,6 @@ Features:
 * load-fragment: when loading a unit file via a chain of symlinks
   verify that it isn't masked via any of the names traversed.
 
-* journald: _BOOT_ID triggers too many collisions.
-
 * journald: we currently rotate only after MaxUse+MaxFilesize has been reached.
 
 * nspawn: bind mount /var/log/journal from the host
@@ -236,8 +235,6 @@ Features:
 
 * write RPM spec macros for presets
 
-* journal: extend hash tables as we go
-
 * journal: API for looking for retrieving "all values of this field"
 
 * journal: deal nicely with byte-by-byte copied files, especially regards header
index c04db12..6e55d8d 100644 (file)
                                 shell pipelines.</para></listitem>
                         </varlistentry>
 
+                        <varlistentry>
+                                <term><varname>NoNewPrivileges=</varname></term>
+
+                                <listitem><para>Takes a boolean
+                                argument. If true ensures that the
+                                service process and all its children
+                                can never gain new privileges. This
+                                option is more powerful than the respective
+                                secure bits flags (see above), as it
+                                also prohibits UID changes of any
+                                kind. This is the simplest, most
+                                effective way to ensure that a process
+                                and its children can never elevate
+                                privileges again.</para></listitem>
+                        </varlistentry>
+
+                        <varlistentry>
+                                <term><varname>SystemCallFilter=</varname></term>
+
+                                <listitem><para>Takes a space
+                                separated list of system call
+                                names. If this setting is used all
+                                system calls executed by the unit
+                                process except for the listed ones
+                                will result in immediate process
+                                termination with the SIGSYS signal
+                                (whitelisting). If the first character
+                                of the list is <literal>~</literal>
+                                the effect is inverted: only the
+                                listed system calls will result in
+                                immediate process termination
+                                (blacklisting). If this option is used
+                                <varname>NoNewPrivileges=yes</varname>
+                                is implied. This feature makes use of
+                                the Secure Computing Mode 2 interfaces
+                                of the kernel ('seccomp filtering')
+                                and is useful for enforcing a minimal
+                                sandboxing environment. Note that the
+                                <function>execve</function>,
+                                <function>rt_sigreturn</function>,
+                                <function>sigreturn</function>,
+                                <function>exit_group</function>,
+                                <function>exit</function> system calls
+                                are implicitly whitelisted and don't
+                                need to be listed
+                                explicitly.</para></listitem>
+                        </varlistentry>
+
                 </variablelist>
         </refsect1>
 
index f293bbd..a763f72 100644 (file)
@@ -1,2 +1,6 @@
+/syscall-from-name.gperf
+/syscall-from-name.h
+/syscall-list.txt
+/syscall-to-name.h
 /macros.systemd
 /systemd.pc
index 9322cdf..a00ad50 100644 (file)
@@ -28,6 +28,7 @@
 #include "ioprio.h"
 #include "strv.h"
 #include "dbus-common.h"
+#include "syscall-list.h"
 
 DEFINE_BUS_PROPERTY_APPEND_ENUM(bus_execute_append_kill_mode, kill_mode, KillMode);
 
@@ -348,6 +349,32 @@ int bus_execute_append_command(DBusMessageIter *i, const char *property, void *d
         return 0;
 }
 
+int bus_execute_append_syscall_filter(DBusMessageIter *i, const char *property, void *data) {
+        ExecContext *c = data;
+        dbus_bool_t b;
+        DBusMessageIter sub;
+
+        assert(i);
+        assert(property);
+        assert(c);
+
+        if (!dbus_message_iter_open_container(i, DBUS_TYPE_ARRAY, "u", &sub))
+                return -ENOMEM;
+
+        if (c->syscall_filter)
+                b = dbus_message_iter_append_fixed_array(&sub, DBUS_TYPE_UINT32, &c->syscall_filter, (syscall_max() + 31) >> 4);
+        else
+                b = dbus_message_iter_append_fixed_array(&sub, DBUS_TYPE_UINT32, &c->syscall_filter, 0);
+
+        if (!b)
+                return -ENOMEM;
+
+        if (!dbus_message_iter_close_container(i, &sub))
+                return -ENOMEM;
+
+        return 0;
+}
+
 const BusProperty bus_exec_context_properties[] = {
         { "Environment",              bus_property_append_strv,             "as", offsetof(ExecContext, environment),            true },
         { "EnvironmentFiles",         bus_execute_append_env_files,      "a(sb)", offsetof(ExecContext, environment_files),      true },
@@ -409,6 +436,8 @@ const BusProperty bus_exec_context_properties[] = {
         { "UtmpIdentifier",           bus_property_append_string,            "s", offsetof(ExecContext, utmp_id),                true },
         { "ControlGroupModify",       bus_property_append_bool,              "b", offsetof(ExecContext, control_group_modify)         },
         { "ControlGroupPersistent",   bus_property_append_tristate_false,    "b", offsetof(ExecContext, control_group_persistent)     },
-        { "IgnoreSIGPIPE",            bus_property_append_bool,              "b", offsetof(ExecContext, ignore_sigpipe          )     },
+        { "IgnoreSIGPIPE",            bus_property_append_bool,              "b", offsetof(ExecContext, ignore_sigpipe)               },
+        { "NoNewPrivileges",          bus_property_append_bool,              "b", offsetof(ExecContext, no_new_privileges)            },
+        { "SystemCallFilter",         bus_execute_append_syscall_filter,    "au", 0                                                   },
         { NULL, }
 };
index b8bbe1c..dc267e6 100644 (file)
@@ -96,7 +96,9 @@
         "  <property name=\"ControlGroupModify\" type=\"b\" access=\"read\"/>\n" \
         "  <property name=\"ControlGroupPersistent\" type=\"b\" access=\"read\"/>\n" \
         "  <property name=\"PrivateNetwork\" type=\"b\" access=\"read\"/>\n" \
-        "  <property name=\"IgnoreSIGPIPE\" type=\"b\" access=\"read\"/>\n"
+        "  <property name=\"IgnoreSIGPIPE\" type=\"b\" access=\"read\"/>\n" \
+        "  <property name=\"NoNewPrivileges\" type=\"b\" access=\"read\"/>\n" \
+        "  <property name=\"SystemCallFilter\" type=\"au\" access=\"read\"/>\n"
 
 #define BUS_EXEC_COMMAND_INTERFACE(name)                             \
         "  <property name=\"" name "\" type=\"a(sasbttuii)\" access=\"read\"/>\n"
@@ -121,5 +123,6 @@ int bus_execute_append_rlimits(DBusMessageIter *i, const char *property, void *d
 int bus_execute_append_command(DBusMessageIter *u, const char *property, void *data);
 int bus_execute_append_kill_mode(DBusMessageIter *i, const char *property, void *data);
 int bus_execute_append_env_files(DBusMessageIter *i, const char *property, void *data);
+int bus_execute_append_syscall_filter(DBusMessageIter *i, const char *property, void *data);
 
 #endif
index daba1a3..7a72aa4 100644 (file)
@@ -38,6 +38,7 @@
 #include <linux/fs.h>
 #include <linux/oom.h>
 #include <sys/poll.h>
+#include <linux/seccomp-bpf.h>
 
 #ifdef HAVE_PAM
 #include <security/pam_appl.h>
@@ -60,6 +61,7 @@
 #include "def.h"
 #include "loopback-setup.h"
 #include "path-util.h"
+#include "syscall-list.h"
 
 #define IDLE_TIMEOUT_USEC (5*USEC_PER_SEC)
 
@@ -924,6 +926,59 @@ static void rename_process_from_path(const char *path) {
         rename_process(process_name);
 }
 
+static int apply_seccomp(uint32_t *syscall_filter) {
+        static const struct sock_filter header[] = {
+                VALIDATE_ARCHITECTURE,
+                EXAMINE_SYSCALL
+        };
+        static const struct sock_filter footer[] = {
+                _KILL_PROCESS
+        };
+
+        int i;
+        unsigned n;
+        struct sock_filter *f;
+        struct sock_fprog prog;
+
+        assert(syscall_filter);
+
+        /* First: count the syscalls to check for */
+        for (i = 0, n = 0; i < syscall_max(); i++)
+                if (syscall_filter[i >> 4] & (1 << (i & 31)))
+                        n++;
+
+        /* Second: build the filter program from a header the syscall
+         * matches and the footer */
+        f = alloca(sizeof(struct sock_filter) * (ELEMENTSOF(header) + 2*n + ELEMENTSOF(footer)));
+        memcpy(f, header, sizeof(header));
+
+        for (i = 0, n = 0; i < syscall_max(); i++)
+                if (syscall_filter[i >> 4] & (1 << (i & 31))) {
+                        struct sock_filter item[] = {
+                                BPF_JUMP(BPF_JMP+BPF_JEQ+BPF_K, i, 0, 1),
+                                BPF_STMT(BPF_RET+BPF_K, SECCOMP_RET_ALLOW)
+                        };
+
+                        assert_cc(ELEMENTSOF(item) == 2);
+
+                        f[ELEMENTSOF(header) + 2*n]  = item[0];
+                        f[ELEMENTSOF(header) + 2*n+1] = item[1];
+
+                        n++;
+                }
+
+        memcpy(f + (ELEMENTSOF(header) + 2*n), footer, sizeof(footer));
+
+        /* Third: install the filter */
+        zero(prog);
+        prog.len = ELEMENTSOF(header) + ELEMENTSOF(footer) + 2*n;
+        prog.filter = f;
+        if (prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &prog) < 0)
+                return -errno;
+
+        return 0;
+}
+
 int exec_spawn(ExecCommand *command,
                char **argv,
                const ExecContext *context,
@@ -1355,6 +1410,21 @@ int exec_spawn(ExecCommand *command,
                                         r = EXIT_CAPABILITIES;
                                         goto fail_child;
                                 }
+
+                        if (context->no_new_privileges)
+                                if (prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0) < 0) {
+                                        err = -errno;
+                                        r = EXIT_NO_NEW_PRIVILEGES;
+                                        goto fail_child;
+                                }
+
+                        if (context->syscall_filter) {
+                                err = apply_seccomp(context->syscall_filter);
+                                if (err < 0) {
+                                        r = EXIT_SECCOMP;
+                                        goto fail_child;
+                                }
+                        }
                 }
 
                 if (!(our_env = new0(char*, 7))) {
index 2083c29..187165c 100644 (file)
@@ -164,6 +164,8 @@ struct ExecContext {
         bool private_tmp;
         bool private_network;
 
+        bool no_new_privileges;
+
         bool control_group_modify;
         int control_group_persistent;
 
@@ -174,6 +176,8 @@ struct ExecContext {
          * don't enter a trigger loop. */
         bool same_pgrp;
 
+        uint32_t *syscall_filter;
+
         bool oom_score_adjust_set:1;
         bool nice_set:1;
         bool ioprio_set:1;
index 192c2b2..140cb9c 100644 (file)
@@ -48,6 +48,8 @@ $1.Capabilities,                 config_parse_exec_capabilities,     0,
 $1.SecureBits,                   config_parse_exec_secure_bits,      0,                             offsetof($1, exec_context)
 $1.CapabilityBoundingSet,        config_parse_bounding_set,          0,                             offsetof($1, exec_context.capability_bounding_set_drop)
 $1.TimerSlackNSec,               config_parse_nsec,                  0,                             offsetof($1, exec_context.timer_slack_nsec)
+$1.NoNewPrivileges               config_parse_bool,                  0,                             offsetof($1, exec_context.no_new_privileges)
+$1.SystemCallFilter,             config_parse_syscall_filter,        0,                             offsetof($1, exec_context)
 $1.LimitCPU,                     config_parse_limit,                 RLIMIT_CPU,                    offsetof($1, exec_context.rlimit)
 $1.LimitFSIZE,                   config_parse_limit,                 RLIMIT_FSIZE,                  offsetof($1, exec_context.rlimit)
 $1.LimitDATA,                    config_parse_limit,                 RLIMIT_DATA,                   offsetof($1, exec_context.rlimit)
index 748ab55..7fcd63a 100644 (file)
@@ -45,6 +45,7 @@
 #include "bus-errors.h"
 #include "utf8.h"
 #include "path-util.h"
+#include "syscall-list.h"
 
 #ifndef HAVE_SYSV_COMPAT
 int config_parse_warn_compat(
@@ -879,7 +880,7 @@ int config_parse_bounding_set(
 
                 if (r < 0) {
                         log_error("[%s:%u] Failed to parse capability bounding set, ignoring: %s", filename, line, rvalue);
-                        return 0;
+                        continue;
                 }
 
                 sum |= ((uint64_t) 1ULL) << (uint64_t) cap;
@@ -2001,6 +2002,88 @@ int config_parse_documentation(
         return r;
 }
 
+static void syscall_set(uint32_t *p, int nr) {
+        p[nr >> 4] |= 1 << (nr & 31);
+}
+
+static void syscall_unset(uint32_t *p, int nr) {
+        p[nr >> 4] &= ~(1 << (nr & 31));
+}
+
+int config_parse_syscall_filter(
+                const char *filename,
+                unsigned line,
+                const char *section,
+                const char *lvalue,
+                int ltype,
+                const char *rvalue,
+                void *data,
+                void *userdata) {
+
+        ExecContext *c = data;
+        Unit *u = userdata;
+        bool invert;
+        char *w;
+        size_t l;
+        char *state;
+
+        assert(filename);
+        assert(lvalue);
+        assert(rvalue);
+        assert(u);
+
+        if (rvalue[0] == '~') {
+                invert = true;
+                rvalue++;
+        }
+
+        if (!c->syscall_filter) {
+                size_t n;
+
+                n = (syscall_max() + 31) >> 4;
+                c->syscall_filter = new(uint32_t, n);
+                if (!c->syscall_filter)
+                        return -ENOMEM;
+
+                memset(c->syscall_filter, invert ? 0xFF : 0, n * sizeof(uint32_t));
+
+                /* Add these by default */
+                syscall_set(c->syscall_filter, __NR_execve);
+                syscall_set(c->syscall_filter, __NR_rt_sigreturn);
+#ifdef __NR_sigreturn
+                syscall_set(c->syscall_filter, __NR_sigreturn);
+#endif
+                syscall_set(c->syscall_filter, __NR_exit_group);
+                syscall_set(c->syscall_filter, __NR_exit);
+        }
+
+        FOREACH_WORD_QUOTED(w, l, rvalue, state) {
+                int id;
+                char *t;
+
+                t = strndup(w, l);
+                if (!t)
+                        return -ENOMEM;
+
+                id = syscall_from_name(t);
+                free(t);
+
+                if (id < 0)  {
+                        log_error("[%s:%u] Failed to parse syscall, ignoring: %s", filename, line, rvalue);
+                        continue;
+                }
+
+                if (invert)
+                        syscall_unset(c->syscall_filter, id);
+                else
+                        syscall_set(c->syscall_filter, id);
+        }
+
+        c->no_new_privileges = true;
+
+        return 0;
+}
+
 #define FOLLOW_MAX 8
 
 static int open_follow(char **filename, FILE **_f, Set *names, char **_final) {
index 501ea4a..543e329 100644 (file)
@@ -82,6 +82,7 @@ int config_parse_unit_device_allow(const char *filename, unsigned line, const ch
 int config_parse_unit_blkio_weight(const char *filename, unsigned line, const char *section, const char *lvalue, int ltype, const char *rvalue, void *data, void *userdata);
 int config_parse_unit_blkio_bandwidth(const char *filename, unsigned line, const char *section, const char *lvalue, int ltype, const char *rvalue, void *data, void *userdata);
 int config_parse_unit_requires_mounts_for(const char *filename, unsigned line, const char *section, const char *lvalue, int ltype, const char *rvalue, void *data, void *userdata);
+int config_parse_syscall_filter(const char *filename, unsigned line, const char *section, const char *lvalue, int ltype, const char *rvalue, void *data, void *userdata);
 
 /* gperf prototypes */
 const struct ConfigPerfItem* load_fragment_gperf_lookup(const char *key, unsigned length);
diff --git a/src/core/syscall-list.c b/src/core/syscall-list.c
new file mode 100644 (file)
index 0000000..05fad3e
--- /dev/null
@@ -0,0 +1,55 @@
+/*-*- Mode: C; c-basic-offset: 8; indent-tabs-mode: nil -*-*/
+
+/***
+  This file is part of systemd.
+
+  Copyright 2012 Lennart Poettering
+
+  systemd is free software; you can redistribute it and/or modify it
+  under the terms of the GNU Lesser General Public License as published by
+  the Free Software Foundation; either version 2.1 of the License, or
+  (at your option) any later version.
+
+  systemd is distributed in the hope that it will be useful, but
+  WITHOUT ANY WARRANTY; without even the implied warranty of
+  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+  Lesser General Public License for more details.
+
+  You should have received a copy of the GNU Lesser General Public License
+  along with systemd; If not, see <http://www.gnu.org/licenses/>.
+***/
+
+#include <sys/syscall.h>
+#include <string.h>
+
+#include "util.h"
+
+#include "syscall-list.h"
+
+const struct syscall_name *lookup_syscall(register const char *str, register unsigned int len);
+
+#include "syscall-to-name.h"
+#include "syscall-from-name.h"
+
+const char *syscall_to_name(int id) {
+        if (id < 0 || id >= (int) ELEMENTSOF(syscall_names))
+                return NULL;
+
+        return syscall_names[id];
+}
+
+int syscall_from_name(const char *name) {
+        const struct syscall_name *sc;
+
+        assert(name);
+
+        sc = lookup_syscall(name, strlen(name));
+        if (!sc)
+                return -1;
+
+        return sc->id;
+}
+
+int syscall_max(void) {
+        return ELEMENTSOF(syscall_names);
+}
diff --git a/src/core/syscall-list.h b/src/core/syscall-list.h
new file mode 100644 (file)
index 0000000..0fc6859
--- /dev/null
@@ -0,0 +1,30 @@
+/*-*- Mode: C; c-basic-offset: 8; indent-tabs-mode: nil -*-*/
+
+#ifndef foosyscalllisthfoo
+#define foosyscalllisthfoo
+
+/***
+  This file is part of systemd.
+
+  Copyright 2012 Lennart Poettering
+
+  systemd is free software; you can redistribute it and/or modify it
+  under the terms of the GNU Lesser General Public License as published by
+  the Free Software Foundation; either version 2.1 of the License, or
+  (at your option) any later version.
+
+  systemd is distributed in the hope that it will be useful, but
+  WITHOUT ANY WARRANTY; without even the implied warranty of
+  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+  Lesser General Public License for more details.
+
+  You should have received a copy of the GNU Lesser General Public License
+  along with systemd; If not, see <http://www.gnu.org/licenses/>.
+***/
+
+const char *syscall_to_name(int id);
+int syscall_from_name(const char *name);
+
+int syscall_max(void);
+
+#endif
index 3f5ce97..7d57680 100644 (file)
@@ -1083,6 +1083,29 @@ int generic_print_property(const char *name, DBusMessageIter *iter, bool all) {
                         }
 
                         return 1;
+
+                } else if (dbus_message_iter_get_element_type(iter) == DBUS_TYPE_UINT32) {
+                        DBusMessageIter sub;
+
+                        dbus_message_iter_recurse(iter, &sub);
+                        if (all ||
+                            dbus_message_iter_get_arg_type(&sub) != DBUS_TYPE_INVALID) {
+                                printf("%s=", name);
+
+                                while (dbus_message_iter_get_arg_type(&sub) != DBUS_TYPE_INVALID) {
+                                        uint32_t u;
+
+                                        assert(dbus_message_iter_get_arg_type(&sub) == DBUS_TYPE_UINT32);
+                                        dbus_message_iter_get_basic(&sub, &u);
+                                        printf("%08x", u);
+
+                                        dbus_message_iter_next(&sub);
+                                }
+
+                                puts("");
+                        }
+
+                        return 1;
                 }
 
                 break;
index b07a66a..0dc82b2 100644 (file)
@@ -122,6 +122,12 @@ const char* exit_status_to_string(ExitStatus status, ExitStatusLevel level) {
 
                 case EXIT_NAMESPACE:
                         return "NAMESPACE";
+
+                case EXIT_NO_NEW_PRIVILEGES:
+                        return "NO_NEW_PRIVILEGES";
+
+                case EXIT_SECCOMP:
+                        return "SECCOMP";
                 }
         }
 
index 349e24f..813f1ce 100644 (file)
@@ -66,8 +66,9 @@ typedef enum ExitStatus {
         EXIT_TCPWRAP,
         EXIT_PAM,
         EXIT_NETWORK,
-        EXIT_NAMESPACE
-
+        EXIT_NAMESPACE,
+        EXIT_NO_NEW_PRIVILEGES,
+        EXIT_SECCOMP
 } ExitStatus;
 
 typedef enum ExitStatusLevel {
diff --git a/src/shared/linux/seccomp-bpf.h b/src/shared/linux/seccomp-bpf.h
new file mode 100644 (file)
index 0000000..1e3d136
--- /dev/null
@@ -0,0 +1,76 @@
+/*
+ * seccomp example for x86 (32-bit and 64-bit) with BPF macros
+ *
+ * Copyright (c) 2012 The Chromium OS Authors <chromium-os-dev@chromium.org>
+ * Authors:
+ *  Will Drewry <wad@chromium.org>
+ *  Kees Cook <keescook@chromium.org>
+ *
+ * The code may be used by anyone for any purpose, and can serve as a
+ * starting point for developing applications using mode 2 seccomp.
+ */
+#ifndef _SECCOMP_BPF_H_
+#define _SECCOMP_BPF_H_
+
+#include <stdio.h>
+#include <stddef.h>
+#include <stdlib.h>
+#include <errno.h>
+#include <signal.h>
+#include <string.h>
+#include <unistd.h>
+
+#include <sys/prctl.h>
+
+#include <linux/unistd.h>
+#include <linux/audit.h>
+#include <linux/filter.h>
+#include <linux/seccomp.h>
+
+#ifndef SECCOMP_MODE_FILTER
+# define SECCOMP_MODE_FILTER   2 /* uses user-supplied filter. */
+# define SECCOMP_RET_KILL      0x00000000U /* kill the task immediately */
+# define SECCOMP_RET_TRAP      0x00030000U /* disallow and force a SIGSYS */
+# define SECCOMP_RET_ALLOW     0x7fff0000U /* allow */
+struct seccomp_data {
+    int nr;
+    __u32 arch;
+    __u64 instruction_pointer;
+    __u64 args[6];
+};
+#endif
+#ifndef SYS_SECCOMP
+# define SYS_SECCOMP 1
+#endif
+
+#define syscall_nr (offsetof(struct seccomp_data, nr))
+#define arch_nr (offsetof(struct seccomp_data, arch))
+
+#if defined(__i386__)
+# define REG_SYSCALL   REG_EAX
+# define ARCH_NR       AUDIT_ARCH_I386
+#elif defined(__x86_64__)
+# define REG_SYSCALL   REG_RAX
+# define ARCH_NR       AUDIT_ARCH_X86_64
+#else
+# warning "Platform does not support seccomp filter yet"
+# define REG_SYSCALL   0
+# define ARCH_NR       0
+#endif
+
+#define VALIDATE_ARCHITECTURE \
+       BPF_STMT(BPF_LD+BPF_W+BPF_ABS, arch_nr), \
+       BPF_JUMP(BPF_JMP+BPF_JEQ+BPF_K, ARCH_NR, 1, 0), \
+       BPF_STMT(BPF_RET+BPF_K, SECCOMP_RET_KILL)
+
+#define EXAMINE_SYSCALL \
+       BPF_STMT(BPF_LD+BPF_W+BPF_ABS, syscall_nr)
+
+#define ALLOW_SYSCALL(name) \
+       BPF_JUMP(BPF_JMP+BPF_JEQ+BPF_K, __NR_##name, 0, 1), \
+       BPF_STMT(BPF_RET+BPF_K, SECCOMP_RET_ALLOW)
+
+#define _KILL_PROCESS \
+       BPF_STMT(BPF_RET+BPF_K, SECCOMP_RET_KILL)
+
+#endif /* _SECCOMP_BPF_H_ */
diff --git a/src/shared/linux/seccomp.h b/src/shared/linux/seccomp.h
new file mode 100644 (file)
index 0000000..9c03683
--- /dev/null
@@ -0,0 +1,47 @@
+#ifndef _LINUX_SECCOMP_H
+#define _LINUX_SECCOMP_H
+
+
+#include <linux/types.h>
+
+
+/* Valid values for seccomp.mode and prctl(PR_SET_SECCOMP, <mode>) */
+#define SECCOMP_MODE_DISABLED  0 /* seccomp is not in use. */
+#define SECCOMP_MODE_STRICT    1 /* uses hard-coded filter. */
+#define SECCOMP_MODE_FILTER    2 /* uses user-supplied filter. */
+
+/*
+ * All BPF programs must return a 32-bit value.
+ * The bottom 16-bits are for optional return data.
+ * The upper 16-bits are ordered from least permissive values to most.
+ *
+ * The ordering ensures that a min_t() over composed return values always
+ * selects the least permissive choice.
+ */
+#define SECCOMP_RET_KILL       0x00000000U /* kill the task immediately */
+#define SECCOMP_RET_TRAP       0x00030000U /* disallow and force a SIGSYS */
+#define SECCOMP_RET_ERRNO      0x00050000U /* returns an errno */
+#define SECCOMP_RET_TRACE      0x7ff00000U /* pass to a tracer or disallow */
+#define SECCOMP_RET_ALLOW      0x7fff0000U /* allow */
+
+/* Masks for the return value sections. */
+#define SECCOMP_RET_ACTION     0x7fff0000U
+#define SECCOMP_RET_DATA       0x0000ffffU
+
+/**
+ * struct seccomp_data - the format the BPF program executes over.
+ * @nr: the system call number
+ * @arch: indicates system call convention as an AUDIT_ARCH_* value
+ *        as defined in <linux/audit.h>.
+ * @instruction_pointer: at the time of the system call.
+ * @args: up to 6 system call arguments always stored as 64-bit values
+ *        regardless of the architecture.
+ */
+struct seccomp_data {
+       int nr;
+       __u32 arch;
+       __u64 instruction_pointer;
+       __u64 args[6];
+};
+
+#endif /* _LINUX_SECCOMP_H */
index 0cf7949..d918c4e 100644 (file)
@@ -188,4 +188,8 @@ static inline pid_t gettid(void) {
 #define MS_STRICTATIME (1<<24)
 #endif
 
+#ifndef PR_SET_NO_NEW_PRIVS
+#define PR_SET_NO_NEW_PRIVS 38
+#endif
+
 #endif