From a931ad47a8623163a29d898224d8a8c1177ffdaf Mon Sep 17 00:00:00 2001 From: Lennart Poettering Date: Wed, 5 Nov 2014 17:57:23 +0100 Subject: [PATCH 1/1] core: introduce new Delegate=yes/no property controlling creation of cgroup subhierarchies For privileged units this resource control property ensures that the processes have all controllers systemd manages enabled. For unprivileged services (those with User= set) this ensures that access rights to the service cgroup are granted to the user in question, to create further subgroups. Note that this only applies to the name=systemd hierarchy though, as access to other controllers is not safe for unprivileged processes. Delegate=yes should be set for container scopes where a systemd instance inside the container shall manage the hierarchies below its own cgroup and have access to all controllers. Delegate=yes should also be set for user@.service, so that systemd --user can run, controlling its own cgroup tree. This commit changes machined, systemd-nspawn@.service and user@.service to set this boolean, in order to ensure that container management will just work, and the user systemd instance can run fine. --- man/systemd.resource-control.xml | 14 +++++++++ src/core/cgroup.c | 22 ++++++++++++-- src/core/cgroup.h | 2 ++ src/core/dbus-cgroup.c | 42 +++++++++++++++++++++++++++ src/core/execute.c | 22 ++++++++++++-- src/core/execute.h | 2 ++ src/core/load-fragment-gperf.gperf.m4 | 3 +- src/core/mount.c | 1 + src/core/service.c | 1 + src/core/socket.c | 1 + src/core/swap.c | 1 + src/machine/machined-dbus.c | 4 +++ src/shared/cgroup-util.h | 3 +- units/systemd-nspawn@.service.in | 1 + units/user@.service.in | 1 + 15 files changed, 112 insertions(+), 8 deletions(-) diff --git a/man/systemd.resource-control.xml b/man/systemd.resource-control.xml index 968b328dd..218946d4e 100644 --- a/man/systemd.resource-control.xml +++ b/man/systemd.resource-control.xml @@ -394,6 +394,20 @@ along with systemd; If not, see . 
+ + Delegate= + + + Turns on delegation of further resource control + partitioning to processes of the unit. For unprivileged + services (i.e. those using the User= + setting) this allows processes to create a subhierarchy + beneath its control group path. For privileged services and + scopes this ensures the processes will have all control + group controllers enabled. + + + diff --git a/src/core/cgroup.c b/src/core/cgroup.c index e604c3cbc..0951a0996 100644 --- a/src/core/cgroup.c +++ b/src/core/cgroup.c @@ -105,7 +105,8 @@ void cgroup_context_dump(CGroupContext *c, FILE* f, const char *prefix) { "%sBlockIOWeight=%lu\n" "%sStartupBlockIOWeight=%lu\n" "%sMemoryLimit=%" PRIu64 "\n" - "%sDevicePolicy=%s\n", + "%sDevicePolicy=%s\n" + "%sDelegate=%s\n", prefix, yes_no(c->cpu_accounting), prefix, yes_no(c->blockio_accounting), prefix, yes_no(c->memory_accounting), @@ -115,7 +116,8 @@ void cgroup_context_dump(CGroupContext *c, FILE* f, const char *prefix) { prefix, c->blockio_weight, prefix, c->startup_blockio_weight, prefix, c->memory_limit, - prefix, cgroup_device_policy_to_string(c->device_policy)); + prefix, cgroup_device_policy_to_string(c->device_policy), + prefix, yes_no(c->delegate)); LIST_FOREACH(device_allow, a, c->device_allow) fprintf(f, @@ -461,7 +463,8 @@ CGroupControllerMask cgroup_context_get_mask(CGroupContext *c) { c->memory_limit != (uint64_t) -1) mask |= CGROUP_MEMORY; - if (c->device_allow || c->device_policy != CGROUP_AUTO) + if (c->device_allow || + c->device_policy != CGROUP_AUTO) mask |= CGROUP_DEVICE; return mask; @@ -474,6 +477,19 @@ CGroupControllerMask unit_get_cgroup_mask(Unit *u) { if (!c) return 0; + /* If delegation is turned on, then turn on all cgroups, + * unless the process we fork into it is known to drop + * privileges anyway, and shouldn't get access to the + * controllers anyway. 
*/ + + if (c->delegate) { + ExecContext *e; + + e = unit_get_exec_context(u); + if (!e || exec_context_maintains_privileges(e)) + return _CGROUP_CONTROLLER_MASK_ALL; + } + return cgroup_context_get_mask(c); } diff --git a/src/core/cgroup.h b/src/core/cgroup.h index d299872b1..3c43885bf 100644 --- a/src/core/cgroup.h +++ b/src/core/cgroup.h @@ -83,6 +83,8 @@ struct CGroupContext { CGroupDevicePolicy device_policy; LIST_HEAD(CGroupDeviceAllow, device_allow); + + bool delegate; }; #include "unit.h" diff --git a/src/core/dbus-cgroup.c b/src/core/dbus-cgroup.c index 900566c29..db998345e 100644 --- a/src/core/dbus-cgroup.c +++ b/src/core/dbus-cgroup.c @@ -153,6 +153,7 @@ static int property_get_ulong_as_u64( const sd_bus_vtable bus_cgroup_vtable[] = { SD_BUS_VTABLE_START(0), + SD_BUS_PROPERTY("Delegate", "b", bus_property_get_bool, offsetof(CGroupContext, delegate), 0), SD_BUS_PROPERTY("CPUAccounting", "b", bus_property_get_bool, offsetof(CGroupContext, cpu_accounting), 0), SD_BUS_PROPERTY("CPUShares", "t", property_get_ulong_as_u64, offsetof(CGroupContext, cpu_shares), 0), SD_BUS_PROPERTY("StartupCPUShares", "t", property_get_ulong_as_u64, offsetof(CGroupContext, startup_cpu_shares), 0), @@ -170,6 +171,39 @@ const sd_bus_vtable bus_cgroup_vtable[] = { SD_BUS_VTABLE_END }; +static int bus_cgroup_set_transient_property( + Unit *u, + CGroupContext *c, + const char *name, + sd_bus_message *message, + UnitSetPropertiesMode mode, + sd_bus_error *error) { + + int r; + + assert(u); + assert(c); + assert(name); + assert(message); + + if (streq(name, "Delegate")) { + int b; + + r = sd_bus_message_read(message, "b", &b); + if (r < 0) + return r; + + if (mode != UNIT_CHECK) { + c->delegate = b; + unit_write_drop_in_private(u, mode, name, b ? 
"Delegate=yes" : "Delegate=no"); + } + + return 1; + } + + return 0; +} + int bus_cgroup_set_property( Unit *u, CGroupContext *c, @@ -632,6 +666,14 @@ int bus_cgroup_set_property( } return 1; + + } + + if (u->transient && u->load_state == UNIT_STUB) { + r = bus_cgroup_set_transient_property(u, c, name, message, mode, error); + if (r != 0) + return r; + } return 0; diff --git a/src/core/execute.c b/src/core/execute.c index c41aec222..5cfd4a1f9 100644 --- a/src/core/execute.c +++ b/src/core/execute.c @@ -1444,8 +1444,10 @@ static int exec_child(ExecCommand *command, } #endif -#ifdef HAVE_PAM - if (params->cgroup_path && context->user && context->pam_name) { + /* If delegation is enabled we'll pass ownership of the cgroup + * (but only in systemd's own controller hierarchy!) to the + * user of the new process. */ + if (params->cgroup_path && context->user && params->cgroup_delegate) { err = cg_set_task_access(SYSTEMD_CGROUP_CONTROLLER, params->cgroup_path, 0644, uid, gid); if (err < 0) { *error = EXIT_CGROUP; @@ -1459,7 +1461,6 @@ static int exec_child(ExecCommand *command, return err; } } -#endif if (!strv_isempty(context->runtime_directory) && params->runtime_prefix) { char **rt; @@ -2402,6 +2403,21 @@ void exec_context_dump(ExecContext *c, FILE* f, const char *prefix) { prefix, c->apparmor_profile_ignore ? "-" : "", c->apparmor_profile); } +bool exec_context_maintains_privileges(ExecContext *c) { + assert(c); + + /* Returns true if the process forked off would run run under + * an unchanged UID or as root. 
*/ + + if (!c->user) + return true; + + if (streq(c->user, "root") || streq(c->user, "0")) + return true; + + return false; +} + void exec_status_start(ExecStatus *s, pid_t pid) { assert(s); diff --git a/src/core/execute.h b/src/core/execute.h index c45dde53a..b16a24d0c 100644 --- a/src/core/execute.h +++ b/src/core/execute.h @@ -207,6 +207,7 @@ struct ExecParameters { bool selinux_context_net; CGroupControllerMask cgroup_supported; const char *cgroup_path; + bool cgroup_delegate; const char *runtime_prefix; const char *unit_id; usec_t watchdog_usec; @@ -244,6 +245,7 @@ int exec_context_destroy_runtime_directory(ExecContext *c, const char *runtime_r int exec_context_load_environment(const ExecContext *c, const char *unit_id, char ***l); bool exec_context_may_touch_console(ExecContext *c); +bool exec_context_maintains_privileges(ExecContext *c); void exec_status_start(ExecStatus *s, pid_t pid); void exec_status_exit(ExecStatus *s, ExecContext *context, pid_t pid, int code, int status); diff --git a/src/core/load-fragment-gperf.gperf.m4 b/src/core/load-fragment-gperf.gperf.m4 index ca0139479..5158a9f15 100644 --- a/src/core/load-fragment-gperf.gperf.m4 +++ b/src/core/load-fragment-gperf.gperf.m4 @@ -119,7 +119,8 @@ $1.BlockIOWeight, config_parse_blockio_weight, 0, $1.StartupBlockIOWeight, config_parse_blockio_weight, 0, offsetof($1, cgroup_context.startup_blockio_weight) $1.BlockIODeviceWeight, config_parse_blockio_device_weight, 0, offsetof($1, cgroup_context) $1.BlockIOReadBandwidth, config_parse_blockio_bandwidth, 0, offsetof($1, cgroup_context) -$1.BlockIOWriteBandwidth, config_parse_blockio_bandwidth, 0, offsetof($1, cgroup_context)' +$1.BlockIOWriteBandwidth, config_parse_blockio_bandwidth, 0, offsetof($1, cgroup_context) +$1.Delegate, config_parse_bool, 0, offsetof($1, cgroup_context.delegate)' )m4_dnl Unit.Description, config_parse_unit_string_printf, 0, offsetof(Unit, description) Unit.Documentation, config_parse_documentation, 0, offsetof(Unit, 
documentation) diff --git a/src/core/mount.c b/src/core/mount.c index 01243c381..8b787f66b 100644 --- a/src/core/mount.c +++ b/src/core/mount.c @@ -715,6 +715,7 @@ static int mount_spawn(Mount *m, ExecCommand *c, pid_t *_pid) { exec_params.confirm_spawn = UNIT(m)->manager->confirm_spawn; exec_params.cgroup_supported = UNIT(m)->manager->cgroup_supported; exec_params.cgroup_path = UNIT(m)->cgroup_path; + exec_params.cgroup_delegate = m->cgroup_context.delegate; exec_params.runtime_prefix = manager_get_runtime_prefix(UNIT(m)->manager); exec_params.unit_id = UNIT(m)->id; diff --git a/src/core/service.c b/src/core/service.c index f27e63eb9..6a27e8f67 100644 --- a/src/core/service.c +++ b/src/core/service.c @@ -1000,6 +1000,7 @@ static int service_spawn( exec_params.confirm_spawn = UNIT(s)->manager->confirm_spawn; exec_params.cgroup_supported = UNIT(s)->manager->cgroup_supported; exec_params.cgroup_path = path; + exec_params.cgroup_delegate = s->cgroup_context.delegate; exec_params.runtime_prefix = manager_get_runtime_prefix(UNIT(s)->manager); exec_params.unit_id = UNIT(s)->id; exec_params.watchdog_usec = s->watchdog_usec; diff --git a/src/core/socket.c b/src/core/socket.c index 6ba8338d8..39652ef56 100644 --- a/src/core/socket.c +++ b/src/core/socket.c @@ -1414,6 +1414,7 @@ static int socket_spawn(Socket *s, ExecCommand *c, pid_t *_pid) { exec_params.confirm_spawn = UNIT(s)->manager->confirm_spawn; exec_params.cgroup_supported = UNIT(s)->manager->cgroup_supported; exec_params.cgroup_path = UNIT(s)->cgroup_path; + exec_params.cgroup_delegate = s->cgroup_context.delegate; exec_params.runtime_prefix = manager_get_runtime_prefix(UNIT(s)->manager); exec_params.unit_id = UNIT(s)->id; diff --git a/src/core/swap.c b/src/core/swap.c index 1add722bf..0a1cc8093 100644 --- a/src/core/swap.c +++ b/src/core/swap.c @@ -627,6 +627,7 @@ static int swap_spawn(Swap *s, ExecCommand *c, pid_t *_pid) { exec_params.confirm_spawn = UNIT(s)->manager->confirm_spawn; exec_params.cgroup_supported 
= UNIT(s)->manager->cgroup_supported; exec_params.cgroup_path = UNIT(s)->cgroup_path; + exec_params.cgroup_delegate = s->cgroup_context.delegate; exec_params.runtime_prefix = manager_get_runtime_prefix(UNIT(s)->manager); exec_params.unit_id = UNIT(s)->id; diff --git a/src/machine/machined-dbus.c b/src/machine/machined-dbus.c index 3c7d4be8d..7f8c631ef 100644 --- a/src/machine/machined-dbus.c +++ b/src/machine/machined-dbus.c @@ -622,6 +622,10 @@ int manager_start_scope( if (r < 0) return r; + r = sd_bus_message_append(m, "(sv)", "Delegate", "b", 1); + if (r < 0) + return r; + if (more_properties) { r = sd_bus_message_copy(m, more_properties, true); if (r < 0) diff --git a/src/shared/cgroup-util.h b/src/shared/cgroup-util.h index aca4e44c4..a65f515b4 100644 --- a/src/shared/cgroup-util.h +++ b/src/shared/cgroup-util.h @@ -34,7 +34,8 @@ typedef enum CGroupControllerMask { CGROUP_CPUACCT = 2, CGROUP_BLKIO = 4, CGROUP_MEMORY = 8, - CGROUP_DEVICE = 16 + CGROUP_DEVICE = 16, + _CGROUP_CONTROLLER_MASK_ALL = 31 } CGroupControllerMask; /* diff --git a/units/systemd-nspawn@.service.in b/units/systemd-nspawn@.service.in index 574d0deaf..dec2ce7df 100644 --- a/units/systemd-nspawn@.service.in +++ b/units/systemd-nspawn@.service.in @@ -15,6 +15,7 @@ KillMode=mixed Type=notify RestartForceExitStatus=133 SuccessExitStatus=133 +Delegate=yes [Install] WantedBy=multi-user.target diff --git a/units/user@.service.in b/units/user@.service.in index 8091ce1a0..1e21d51aa 100644 --- a/units/user@.service.in +++ b/units/user@.service.in @@ -16,3 +16,4 @@ Type=notify ExecStart=-@rootlibexecdir@/systemd --user Slice=user-%i.slice KillMode=mixed +Delegate=yes -- 2.30.2