From: Lennart Poettering Date: Tue, 17 Jul 2012 02:17:53 +0000 (+0200) Subject: execute: support syscall filtering using seccomp filters X-Git-Tag: v187~53 X-Git-Url: https://www.chiark.greenend.org.uk/ucgi/~ianmdlvl/git?p=elogind.git;a=commitdiff_plain;h=8351ceaea9480d9c2979aa2ff0f4982cfdfef58d execute: support syscall filtering using seccomp filters --- diff --git a/Makefile.am b/Makefile.am index cf911a0fe..b16c01ae6 100644 --- a/Makefile.am +++ b/Makefile.am @@ -961,11 +961,15 @@ libsystemd_core_la_SOURCES = \ src/core/switch-root.h \ src/core/switch-root.c \ src/core/killall.h \ - src/core/killall.c + src/core/killall.c \ + src/core/syscall-list.c \ + src/core/syscall-list.h nodist_libsystemd_core_la_SOURCES = \ src/core/load-fragment-gperf.c \ - src/core/load-fragment-gperf-nulstr.c + src/core/load-fragment-gperf-nulstr.c \ + src/core/syscall-from-name.h \ + src/core/syscall-to-name.h libsystemd_core_la_CFLAGS = \ $(AM_CFLAGS) \ @@ -998,7 +1002,23 @@ EXTRA_DIST += \ CLEANFILES += \ src/core/load-fragment-gperf.gperf \ src/core/load-fragment-gperf.c \ - src/core/load-fragment-gperf-nulstr.c + src/core/load-fragment-gperf-nulstr.c \ + src/core/syscall-list.txt \ + src/core/syscall-from-name.gperf \ + src/core/syscall-from-name.h \ + src/core/syscall-to-name.h + +src/core/syscall-list.txt: Makefile + $(AM_V_GEN)cpp -dM -include sys/syscall.h < /dev/null | $(AWK) '/^#define[ \t]+__NR_[^ ]+[ \t]+[0-9]/ { sub(/__NR_/, "", $$2); print $$2; }' > $@ || rm $@ + +src/core/syscall-from-name.gperf: src/core/syscall-list.txt Makefile + $(AM_V_GEN)$(AWK) 'BEGIN{ print "struct syscall_name { const char* name; int id; };"; print "%null-strings"; print "%%";} { printf "%s, __NR_%s\n", $$1, $$1 }' < $< > $@ + +src/core/syscall-from-name.h: src/core/syscall-from-name.gperf Makefile + $(AM_V_GEN)$(GPERF) -L ANSI-C -t --ignore-case -N lookup_syscall -H hash_syscall_name -p -C < $< > $@ + +src/core/syscall-to-name.h: src/core/syscall-list.txt Makefile + $(AM_V_GEN)$(AWK) 'BEGIN{ print "const char* const syscall_names[] = { "} { printf "[__NR_%s] = \"%s\",\n", $$1, $$1 } END{print "};"}' < $< > $@ # ------------------------------------------------------------------------------ systemd_SOURCES = \ diff --git a/TODO b/TODO index 0a923ce30..c0381b88a 100644 --- a/TODO +++ b/TODO @@ -33,12 +33,13 @@ Bugfixes: Jul 09 18:22:37 mop [21866]: Process 21865 (systemd) dumped core. Features: + +* use cpp -dM for key mapping too? + * change mount access mode of 0700 or so for debugfs? * logind: wakelock/opportunistic suspend support -* seccomp filters for services - * switch-root: sockets need relabelling * segfault in journalctl during /var migration @@ -60,8 +61,6 @@ Features: * load-fragment: when loading a unit file via a chain of symlinks verify that it isn't masked via any of the names traversed. -* journald: _BOOT_ID triggers too many collisions. - * journald: we currently rotate only after MaxUse+MaxFilesize has been reached. * nspawn: bind mount /var/log/journal from the host @@ -236,8 +235,6 @@ Features: * write RPM spec macros for presets -* journal: extend hash tables as we go - * journal: API for looking for retrieving "all values of this field" * journal: deal nicely with byte-by-byte copied files, especially regards header diff --git a/man/systemd.exec.xml b/man/systemd.exec.xml index c04db12e3..6e55d8dfc 100644 --- a/man/systemd.exec.xml +++ b/man/systemd.exec.xml @@ -1091,6 +1091,54 @@ shell pipelines. + + NoNewPrivileges= + + Takes a boolean + argument. If true ensures that the + service process and all its children + can never gain new privileges. This + option is more powerful than the respective + secure bits flags (see above), as it + also prohibits UID changes of any + kind. This is the simplest, most + effective way to ensure that a process + and its children can never elevate + privileges again. + + + + SystemCallFilter= + + Takes a space + separated list of system call + names. If this setting is used all + system calls executed by the unit + process except for the listed ones + will result in immediate process + termination with the SIGSYS signal + (whitelisting). If the first character + of the list is ~ + the effect is inverted: only the + listed system calls will result in + immediate process termination + (blacklisting). If this option is used + NoNewPrivileges=yes + is implied. This feature makes use of + the Secure Computing Mode 2 interfaces + of the kernel ('seccomp filtering') + and is useful for enforcing a minimal + sandboxing environment. Note that the + execve, + rt_sigreturn, + sigreturn, + exit_group, + exit system calls + are implicitly whitelisted and don't + need to be listed + explicitly. + + diff --git a/src/core/.gitignore b/src/core/.gitignore index f293bbdc9..a763f7250 100644 --- a/src/core/.gitignore +++ b/src/core/.gitignore @@ -1,2 +1,6 @@ +/syscall-from-name.gperf +/syscall-from-name.h +/syscall-list.txt +/syscall-to-name.h /macros.systemd /systemd.pc diff --git a/src/core/dbus-execute.c b/src/core/dbus-execute.c index 9322cdfd8..a00ad5079 100644 --- a/src/core/dbus-execute.c +++ b/src/core/dbus-execute.c @@ -28,6 +28,7 @@ #include "ioprio.h" #include "strv.h" #include "dbus-common.h" +#include "syscall-list.h" DEFINE_BUS_PROPERTY_APPEND_ENUM(bus_execute_append_kill_mode, kill_mode, KillMode); @@ -348,6 +349,32 @@ int bus_execute_append_command(DBusMessageIter *i, const char *property, void *d return 0; } +int bus_execute_append_syscall_filter(DBusMessageIter *i, const char *property, void *data) { + ExecContext *c = data; + dbus_bool_t b; + DBusMessageIter sub; + + assert(i); + assert(property); + assert(c); + + if (!dbus_message_iter_open_container(i, DBUS_TYPE_ARRAY, "u", &sub)) + return -ENOMEM; + + if (c->syscall_filter) + b = dbus_message_iter_append_fixed_array(&sub, DBUS_TYPE_UINT32, &c->syscall_filter, (syscall_max() + 31) >> 4); + else + b = dbus_message_iter_append_fixed_array(&sub, DBUS_TYPE_UINT32, &c->syscall_filter, 0); + + if (!b) + return -ENOMEM; + + if (!dbus_message_iter_close_container(i, &sub)) + return -ENOMEM; + + return 0; +} + const BusProperty bus_exec_context_properties[] = { { "Environment", bus_property_append_strv, "as", offsetof(ExecContext, environment), true }, { "EnvironmentFiles", bus_execute_append_env_files, "a(sb)", offsetof(ExecContext, environment_files), true }, @@ -409,6 +436,8 @@ const BusProperty bus_exec_context_properties[] = { { "UtmpIdentifier", bus_property_append_string, "s", offsetof(ExecContext, utmp_id), true }, { "ControlGroupModify", bus_property_append_bool, "b", offsetof(ExecContext, control_group_modify) }, { "ControlGroupPersistent", bus_property_append_tristate_false, "b", offsetof(ExecContext, control_group_persistent) }, - { "IgnoreSIGPIPE", bus_property_append_bool, "b", offsetof(ExecContext, ignore_sigpipe ) }, + { "IgnoreSIGPIPE", bus_property_append_bool, "b", offsetof(ExecContext, ignore_sigpipe) }, + { "NoNewPrivileges", bus_property_append_bool, "b", offsetof(ExecContext, no_new_privileges) }, + { "SystemCallFilter", bus_execute_append_syscall_filter, "au", 0 }, { NULL, } }; diff --git a/src/core/dbus-execute.h b/src/core/dbus-execute.h index b8bbe1c9f..dc267e6cc 100644 --- a/src/core/dbus-execute.h +++ b/src/core/dbus-execute.h @@ -96,7 +96,9 @@ " \n" \ " \n" \ " \n" \ - " \n" + " \n" \ + " \n" \ + " \n" #define BUS_EXEC_COMMAND_INTERFACE(name) \ " \n" @@ -121,5 +123,6 @@ int bus_execute_append_rlimits(DBusMessageIter *i, const char *property, void *d int bus_execute_append_command(DBusMessageIter *u, const char *property, void *data); int bus_execute_append_kill_mode(DBusMessageIter *i, const char *property, void *data); int bus_execute_append_env_files(DBusMessageIter *i, const char *property, void *data); +int bus_execute_append_syscall_filter(DBusMessageIter *i, const char *property, void *data); #endif diff --git a/src/core/execute.c b/src/core/execute.c index daba1a384..7a72aa486 100644 --- a/src/core/execute.c +++ b/src/core/execute.c @@ -38,6 +38,7 @@ #include #include #include +#include #ifdef HAVE_PAM #include @@ -60,6 +61,7 @@ #include "def.h" #include "loopback-setup.h" #include "path-util.h" +#include "syscall-list.h" #define IDLE_TIMEOUT_USEC (5*USEC_PER_SEC) @@ -924,6 +926,59 @@ static void rename_process_from_path(const char *path) { rename_process(process_name); } +static int apply_seccomp(uint32_t *syscall_filter) { + static const struct sock_filter header[] = { + VALIDATE_ARCHITECTURE, + EXAMINE_SYSCALL + }; + static const struct sock_filter footer[] = { + _KILL_PROCESS + }; + + int i; + unsigned n; + struct sock_filter *f; + struct sock_fprog prog; + + assert(syscall_filter); + + /* First: count the syscalls to check for */ + for (i = 0, n = 0; i < syscall_max(); i++) + if (syscall_filter[i >> 4] & (1 << (i & 31))) + n++; + + /* Second: build the filter program from a header the syscall + * matches and the footer */ + f = alloca(sizeof(struct sock_filter) * (ELEMENTSOF(header) + 2*n + ELEMENTSOF(footer))); + memcpy(f, header, sizeof(header)); + + for (i = 0, n = 0; i < syscall_max(); i++) + if (syscall_filter[i >> 4] & (1 << (i & 31))) { + struct sock_filter item[] = { + BPF_JUMP(BPF_JMP+BPF_JEQ+BPF_K, i, 0, 1), + BPF_STMT(BPF_RET+BPF_K, SECCOMP_RET_ALLOW) + }; + + assert_cc(ELEMENTSOF(item) == 2); + + f[ELEMENTSOF(header) + 2*n] = item[0]; + f[ELEMENTSOF(header) + 2*n+1] = item[1]; + + n++; + } + + memcpy(f + (ELEMENTSOF(header) + 2*n), footer, sizeof(footer)); + + /* Third: install the filter */ + zero(prog); + prog.len = ELEMENTSOF(header) + ELEMENTSOF(footer) + 2*n; + prog.filter = f; + if (prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &prog) < 0) + return -errno; + + return 0; +} + int exec_spawn(ExecCommand *command, char **argv, const ExecContext *context, @@ -1355,6 +1410,21 @@ int exec_spawn(ExecCommand *command, r = EXIT_CAPABILITIES; goto fail_child; } + + if (context->no_new_privileges) + if (prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0) < 0) { + err = -errno; + r = EXIT_NO_NEW_PRIVILEGES; + goto fail_child; + } + + if (context->syscall_filter) { + err = apply_seccomp(context->syscall_filter); + if (err < 0) { + r = EXIT_SECCOMP; + goto fail_child; + } + } } if (!(our_env = new0(char*, 7))) { diff --git a/src/core/execute.h b/src/core/execute.h index 2083c2971..187165cdc 100644 --- a/src/core/execute.h +++ b/src/core/execute.h @@ -164,6 +164,8 @@ struct ExecContext { bool private_tmp; bool private_network; + bool no_new_privileges; + bool control_group_modify; int control_group_persistent; @@ -174,6 +176,8 @@ struct ExecContext { * don't enter a trigger loop. */ bool same_pgrp; + uint32_t *syscall_filter; + bool oom_score_adjust_set:1; bool nice_set:1; bool ioprio_set:1; diff --git a/src/core/load-fragment-gperf.gperf.m4 b/src/core/load-fragment-gperf.gperf.m4 index 192c2b278..140cb9c0a 100644 --- a/src/core/load-fragment-gperf.gperf.m4 +++ b/src/core/load-fragment-gperf.gperf.m4 @@ -48,6 +48,8 @@ $1.Capabilities, config_parse_exec_capabilities, 0, $1.SecureBits, config_parse_exec_secure_bits, 0, offsetof($1, exec_context) $1.CapabilityBoundingSet, config_parse_bounding_set, 0, offsetof($1, exec_context.capability_bounding_set_drop) $1.TimerSlackNSec, config_parse_nsec, 0, offsetof($1, exec_context.timer_slack_nsec) +$1.NoNewPrivileges config_parse_bool, 0, offsetof($1, exec_context.no_new_privileges) +$1.SystemCallFilter, config_parse_syscall_filter, 0, offsetof($1, exec_context) $1.LimitCPU, config_parse_limit, RLIMIT_CPU, offsetof($1, exec_context.rlimit) $1.LimitFSIZE, config_parse_limit, RLIMIT_FSIZE, offsetof($1, exec_context.rlimit) $1.LimitDATA, config_parse_limit, RLIMIT_DATA, offsetof($1, exec_context.rlimit) diff --git a/src/core/load-fragment.c b/src/core/load-fragment.c index 748ab55d5..7fcd63a17 100644 --- a/src/core/load-fragment.c +++ b/src/core/load-fragment.c @@ -45,6 +45,7 @@ #include "bus-errors.h" #include "utf8.h" #include "path-util.h" +#include "syscall-list.h" #ifndef HAVE_SYSV_COMPAT int config_parse_warn_compat( @@ -879,7 +880,7 @@ int config_parse_bounding_set( if (r < 0) { log_error("[%s:%u] Failed to parse capability bounding set, ignoring: %s", filename, line, rvalue); - return 0; + continue; } sum |= ((uint64_t) 1ULL) << (uint64_t) cap; @@ -2001,6 +2002,88 @@ int config_parse_documentation( return r; } +static void syscall_set(uint32_t *p, int nr) { + p[nr >> 4] |= 1 << (nr & 31); +} + +static void syscall_unset(uint32_t *p, int nr) { + p[nr >> 4] &= ~(1 << (nr & 31)); +} + +int config_parse_syscall_filter( + const char *filename, + unsigned line, + const char *section, + const char *lvalue, + int ltype, + const char *rvalue, + void *data, + void *userdata) { + + ExecContext *c = data; + Unit *u = userdata; + bool invert; + char *w; + size_t l; + char *state; + + assert(filename); + assert(lvalue); + assert(rvalue); + assert(u); + + if (rvalue[0] == '~') { + invert = true; + rvalue++; + } + + if (!c->syscall_filter) { + size_t n; + + n = (syscall_max() + 31) >> 4; + c->syscall_filter = new(uint32_t, n); + if (!c->syscall_filter) + return -ENOMEM; + + memset(c->syscall_filter, invert ? 0xFF : 0, n * sizeof(uint32_t)); + + /* Add these by default */ + syscall_set(c->syscall_filter, __NR_execve); + syscall_set(c->syscall_filter, __NR_rt_sigreturn); +#ifdef __NR_sigreturn + syscall_set(c->syscall_filter, __NR_sigreturn); +#endif + syscall_set(c->syscall_filter, __NR_exit_group); + syscall_set(c->syscall_filter, __NR_exit); + } + + FOREACH_WORD_QUOTED(w, l, rvalue, state) { + int id; + char *t; + + t = strndup(w, l); + if (!t) + return -ENOMEM; + + id = syscall_from_name(t); + free(t); + + if (id < 0) { + log_error("[%s:%u] Failed to parse syscall, ignoring: %s", filename, line, rvalue); + continue; + } + + if (invert) + syscall_unset(c->syscall_filter, id); + else + syscall_set(c->syscall_filter, id); + } + + c->no_new_privileges = true; + + return 0; +} + #define FOLLOW_MAX 8 static int open_follow(char **filename, FILE **_f, Set *names, char **_final) { diff --git a/src/core/load-fragment.h b/src/core/load-fragment.h index 501ea4ad4..543e32968 100644 --- a/src/core/load-fragment.h +++ b/src/core/load-fragment.h @@ -82,6 +82,7 @@ int config_parse_unit_device_allow(const char *filename, unsigned line, const ch int config_parse_unit_blkio_weight(const char *filename, unsigned line, const char *section, const char *lvalue, int ltype, const char *rvalue, void *data, void *userdata); int config_parse_unit_blkio_bandwidth(const char *filename, unsigned line, const char *section, const char *lvalue, int ltype, const char *rvalue, void *data, void *userdata); int config_parse_unit_requires_mounts_for(const char *filename, unsigned line, const char *section, const char *lvalue, int ltype, const char *rvalue, void *data, void *userdata); +int config_parse_syscall_filter(const char *filename, unsigned line, const char *section, const char *lvalue, int ltype, const char *rvalue, void *data, void *userdata); /* gperf prototypes */ const struct ConfigPerfItem* load_fragment_gperf_lookup(const char *key, unsigned length); diff --git a/src/core/syscall-list.c b/src/core/syscall-list.c new file mode 100644 index 000000000..05fad3e15 --- /dev/null +++ b/src/core/syscall-list.c @@ -0,0 +1,55 @@ +/*-*- Mode: C; c-basic-offset: 8; indent-tabs-mode: nil -*-*/ + +/*** + This file is part of systemd. + + Copyright 2012 Lennart Poettering + + systemd is free software; you can redistribute it and/or modify it + under the terms of the GNU Lesser General Public License as published by + the Free Software Foundation; either version 2.1 of the License, or + (at your option) any later version. + + systemd is distributed in the hope that it will be useful, but + WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public License + along with systemd; If not, see . +***/ + +#include +#include + +#include "util.h" + +#include "syscall-list.h" + +const struct syscall_name *lookup_syscall(register const char *str, register unsigned int len); + +#include "syscall-to-name.h" +#include "syscall-from-name.h" + +const char *syscall_to_name(int id) { + if (id < 0 || id >= (int) ELEMENTSOF(syscall_names)) + return NULL; + + return syscall_names[id]; +} + +int syscall_from_name(const char *name) { + const struct syscall_name *sc; + + assert(name); + + sc = lookup_syscall(name, strlen(name)); + if (!sc) + return -1; + + return sc->id; +} + +int syscall_max(void) { + return ELEMENTSOF(syscall_names); +} diff --git a/src/core/syscall-list.h b/src/core/syscall-list.h new file mode 100644 index 000000000..0fc685960 --- /dev/null +++ b/src/core/syscall-list.h @@ -0,0 +1,30 @@ +/*-*- Mode: C; c-basic-offset: 8; indent-tabs-mode: nil -*-*/ + +#ifndef foosyscalllisthfoo +#define foosyscalllisthfoo + +/*** + This file is part of systemd. + + Copyright 2012 Lennart Poettering + + systemd is free software; you can redistribute it and/or modify it + under the terms of the GNU Lesser General Public License as published by + the Free Software Foundation; either version 2.1 of the License, or + (at your option) any later version. + + systemd is distributed in the hope that it will be useful, but + WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public License + along with systemd; If not, see . +***/ + +const char *syscall_to_name(int id); +int syscall_from_name(const char *name); + +int syscall_max(void); + +#endif diff --git a/src/shared/dbus-common.c b/src/shared/dbus-common.c index 3f5ce9768..7d57680cf 100644 --- a/src/shared/dbus-common.c +++ b/src/shared/dbus-common.c @@ -1082,6 +1082,29 @@ int generic_print_property(const char *name, DBusMessageIter *iter, bool all) { puts(""); } + return 1; + + } else if (dbus_message_iter_get_element_type(iter) == DBUS_TYPE_UINT32) { + DBusMessageIter sub; + + dbus_message_iter_recurse(iter, &sub); + if (all || + dbus_message_iter_get_arg_type(&sub) != DBUS_TYPE_INVALID) { + printf("%s=", name); + + while (dbus_message_iter_get_arg_type(&sub) != DBUS_TYPE_INVALID) { + uint32_t u; + + assert(dbus_message_iter_get_arg_type(&sub) == DBUS_TYPE_UINT32); + dbus_message_iter_get_basic(&sub, &u); + printf("%08x", u); + + dbus_message_iter_next(&sub); + } + + puts(""); + } + return 1; } diff --git a/src/shared/exit-status.c b/src/shared/exit-status.c index b07a66a3e..0dc82b2e1 100644 --- a/src/shared/exit-status.c +++ b/src/shared/exit-status.c @@ -122,6 +122,12 @@ const char* exit_status_to_string(ExitStatus status, ExitStatusLevel level) { case EXIT_NAMESPACE: return "NAMESPACE"; + + case EXIT_NO_NEW_PRIVILEGES: + return "NO_NEW_PRIVILEGES"; + + case EXIT_SECCOMP: + return "SECCOMP"; } } diff --git a/src/shared/exit-status.h b/src/shared/exit-status.h index 349e24fbf..813f1ce1b 100644 --- a/src/shared/exit-status.h +++ b/src/shared/exit-status.h @@ -66,8 +66,9 @@ typedef enum ExitStatus { EXIT_TCPWRAP, EXIT_PAM, EXIT_NETWORK, - EXIT_NAMESPACE - + EXIT_NAMESPACE, + EXIT_NO_NEW_PRIVILEGES, + EXIT_SECCOMP } ExitStatus; typedef enum ExitStatusLevel { diff --git a/src/shared/linux/seccomp-bpf.h b/src/shared/linux/seccomp-bpf.h new file mode 100644 index 000000000..1e3d13673 --- /dev/null +++ b/src/shared/linux/seccomp-bpf.h @@ -0,0 +1,76 @@ +/* + * seccomp example for x86 (32-bit and 64-bit) with BPF macros + * + * Copyright (c) 2012 The Chromium OS Authors + * Authors: + * Will Drewry + * Kees Cook + * + * The code may be used by anyone for any purpose, and can serve as a + * starting point for developing applications using mode 2 seccomp. + */ +#ifndef _SECCOMP_BPF_H_ +#define _SECCOMP_BPF_H_ + +#include +#include +#include +#include +#include +#include +#include + +#include + +#include +#include +#include +#include + +#ifndef SECCOMP_MODE_FILTER +# define SECCOMP_MODE_FILTER 2 /* uses user-supplied filter. */ +# define SECCOMP_RET_KILL 0x00000000U /* kill the task immediately */ +# define SECCOMP_RET_TRAP 0x00030000U /* disallow and force a SIGSYS */ +# define SECCOMP_RET_ALLOW 0x7fff0000U /* allow */ +struct seccomp_data { + int nr; + __u32 arch; + __u64 instruction_pointer; + __u64 args[6]; +}; +#endif +#ifndef SYS_SECCOMP +# define SYS_SECCOMP 1 +#endif + +#define syscall_nr (offsetof(struct seccomp_data, nr)) +#define arch_nr (offsetof(struct seccomp_data, arch)) + +#if defined(__i386__) +# define REG_SYSCALL REG_EAX +# define ARCH_NR AUDIT_ARCH_I386 +#elif defined(__x86_64__) +# define REG_SYSCALL REG_RAX +# define ARCH_NR AUDIT_ARCH_X86_64 +#else +# warning "Platform does not support seccomp filter yet" +# define REG_SYSCALL 0 +# define ARCH_NR 0 +#endif + +#define VALIDATE_ARCHITECTURE \ + BPF_STMT(BPF_LD+BPF_W+BPF_ABS, arch_nr), \ + BPF_JUMP(BPF_JMP+BPF_JEQ+BPF_K, ARCH_NR, 1, 0), \ + BPF_STMT(BPF_RET+BPF_K, SECCOMP_RET_KILL) + +#define EXAMINE_SYSCALL \ + BPF_STMT(BPF_LD+BPF_W+BPF_ABS, syscall_nr) + +#define ALLOW_SYSCALL(name) \ + BPF_JUMP(BPF_JMP+BPF_JEQ+BPF_K, __NR_##name, 0, 1), \ + BPF_STMT(BPF_RET+BPF_K, SECCOMP_RET_ALLOW) + +#define _KILL_PROCESS \ + BPF_STMT(BPF_RET+BPF_K, SECCOMP_RET_KILL) + +#endif /* _SECCOMP_BPF_H_ */ diff --git a/src/shared/linux/seccomp.h b/src/shared/linux/seccomp.h new file mode 100644 index 000000000..9c03683fa --- /dev/null +++ b/src/shared/linux/seccomp.h @@ -0,0 +1,47 @@ +#ifndef _LINUX_SECCOMP_H +#define _LINUX_SECCOMP_H + + +#include + + +/* Valid values for seccomp.mode and prctl(PR_SET_SECCOMP, ) */ +#define SECCOMP_MODE_DISABLED 0 /* seccomp is not in use. */ +#define SECCOMP_MODE_STRICT 1 /* uses hard-coded filter. */ +#define SECCOMP_MODE_FILTER 2 /* uses user-supplied filter. */ + +/* + * All BPF programs must return a 32-bit value. + * The bottom 16-bits are for optional return data. + * The upper 16-bits are ordered from least permissive values to most. + * + * The ordering ensures that a min_t() over composed return values always + * selects the least permissive choice. + */ +#define SECCOMP_RET_KILL 0x00000000U /* kill the task immediately */ +#define SECCOMP_RET_TRAP 0x00030000U /* disallow and force a SIGSYS */ +#define SECCOMP_RET_ERRNO 0x00050000U /* returns an errno */ +#define SECCOMP_RET_TRACE 0x7ff00000U /* pass to a tracer or disallow */ +#define SECCOMP_RET_ALLOW 0x7fff0000U /* allow */ + +/* Masks for the return value sections. */ +#define SECCOMP_RET_ACTION 0x7fff0000U +#define SECCOMP_RET_DATA 0x0000ffffU + +/** + * struct seccomp_data - the format the BPF program executes over. + * @nr: the system call number + * @arch: indicates system call convention as an AUDIT_ARCH_* value + * as defined in . + * @instruction_pointer: at the time of the system call. + * @args: up to 6 system call arguments always stored as 64-bit values + * regardless of the architecture. + */ +struct seccomp_data { + int nr; + __u32 arch; + __u64 instruction_pointer; + __u64 args[6]; +}; + +#endif /* _LINUX_SECCOMP_H */ diff --git a/src/shared/missing.h b/src/shared/missing.h index 0cf7949d2..d918c4e9a 100644 --- a/src/shared/missing.h +++ b/src/shared/missing.h @@ -188,4 +188,8 @@ static inline pid_t gettid(void) { #define MS_STRICTATIME (1<<24) #endif +#ifndef PR_SET_NO_NEW_PRIVS +#define PR_SET_NO_NEW_PRIVS 38 +#endif + #endif