chiark / gitweb /
core: introduce new Delegate=yes/no property controlling creation of cgroup subhierar...
author Lennart Poettering <lennart@poettering.net>
Wed, 5 Nov 2014 16:57:23 +0000 (17:57 +0100)
committer Lennart Poettering <lennart@poettering.net>
Wed, 5 Nov 2014 17:49:14 +0000 (18:49 +0100)
For privileged units this resource control property ensures that the
processes have all controllers systemd manages enabled.

For unprivileged services (those with User= set) this ensures that
access rights to the service cgroup are granted to the user in question,
to create further subgroups. Note that this only applies to the
name=systemd hierarchy though, as access to other controllers is not
safe for unprivileged processes.

Delegate=yes should be set for container scopes where a systemd instance
inside the container shall manage the hierarchies below its own cgroup
and have access to all controllers.

Delegate=yes should also be set for user@.service, so that systemd
--user can run, controlling its own cgroup tree.

This commit changes machined, systemd-nspawn@.service and user@.service
to set this boolean, in order to ensure that container management will
just work, and the user systemd instance can run fine.

15 files changed:
man/systemd.resource-control.xml
src/core/cgroup.c
src/core/cgroup.h
src/core/dbus-cgroup.c
src/core/execute.c
src/core/execute.h
src/core/load-fragment-gperf.gperf.m4
src/core/mount.c
src/core/service.c
src/core/socket.c
src/core/swap.c
src/machine/machined-dbus.c
src/shared/cgroup-util.h
units/systemd-nspawn@.service.in
units/user@.service.in

index 968b328..218946d 100644 (file)
@@ -394,6 +394,20 @@ along with systemd; If not, see <http://www.gnu.org/licenses/>.
         </listitem>
       </varlistentry>
 
+      <varlistentry>
+        <term><varname>Delegate=</varname></term>
+
+        <listitem>
+          <para>Turns on delegation of further resource control
+          partitioning to processes of the unit. For unprivileged
+          services (i.e. those using the <varname>User=</varname>
+          setting) this allows processes to create a subhierarchy
+          beneath its control group path. For privileged services and
+          scopes this ensures the processes will have all control
+          group controllers enabled.</para>
+        </listitem>
+      </varlistentry>
+
     </variablelist>
   </refsect1>
 
index e604c3c..0951a09 100644 (file)
@@ -105,7 +105,8 @@ void cgroup_context_dump(CGroupContext *c, FILE* f, const char *prefix) {
                 "%sBlockIOWeight=%lu\n"
                 "%sStartupBlockIOWeight=%lu\n"
                 "%sMemoryLimit=%" PRIu64 "\n"
-                "%sDevicePolicy=%s\n",
+                "%sDevicePolicy=%s\n"
+                "%sDelegate=%s\n",
                 prefix, yes_no(c->cpu_accounting),
                 prefix, yes_no(c->blockio_accounting),
                 prefix, yes_no(c->memory_accounting),
@@ -115,7 +116,8 @@ void cgroup_context_dump(CGroupContext *c, FILE* f, const char *prefix) {
                 prefix, c->blockio_weight,
                 prefix, c->startup_blockio_weight,
                 prefix, c->memory_limit,
-                prefix, cgroup_device_policy_to_string(c->device_policy));
+                prefix, cgroup_device_policy_to_string(c->device_policy),
+                prefix, yes_no(c->delegate));
 
         LIST_FOREACH(device_allow, a, c->device_allow)
                 fprintf(f,
@@ -461,7 +463,8 @@ CGroupControllerMask cgroup_context_get_mask(CGroupContext *c) {
             c->memory_limit != (uint64_t) -1)
                 mask |= CGROUP_MEMORY;
 
-        if (c->device_allow || c->device_policy != CGROUP_AUTO)
+        if (c->device_allow ||
+            c->device_policy != CGROUP_AUTO)
                 mask |= CGROUP_DEVICE;
 
         return mask;
@@ -474,6 +477,19 @@ CGroupControllerMask unit_get_cgroup_mask(Unit *u) {
         if (!c)
                 return 0;
 
+        /* If delegation is turned on, then turn on all cgroups,
+         * unless the process we fork into it is known to drop
+         * privileges anyway, and shouldn't get access to the
+         * controllers anyway. */
+
+        if (c->delegate) {
+                ExecContext *e;
+
+                e = unit_get_exec_context(u);
+                if (!e || exec_context_maintains_privileges(e))
+                        return _CGROUP_CONTROLLER_MASK_ALL;
+        }
+
         return cgroup_context_get_mask(c);
 }
 
index d299872..3c43885 100644 (file)
@@ -83,6 +83,8 @@ struct CGroupContext {
 
         CGroupDevicePolicy device_policy;
         LIST_HEAD(CGroupDeviceAllow, device_allow);
+
+        bool delegate;
 };
 
 #include "unit.h"
index 900566c..db99834 100644 (file)
@@ -153,6 +153,7 @@ static int property_get_ulong_as_u64(
 
 const sd_bus_vtable bus_cgroup_vtable[] = {
         SD_BUS_VTABLE_START(0),
+        SD_BUS_PROPERTY("Delegate", "b", bus_property_get_bool, offsetof(CGroupContext, delegate), 0),
         SD_BUS_PROPERTY("CPUAccounting", "b", bus_property_get_bool, offsetof(CGroupContext, cpu_accounting), 0),
         SD_BUS_PROPERTY("CPUShares", "t", property_get_ulong_as_u64, offsetof(CGroupContext, cpu_shares), 0),
         SD_BUS_PROPERTY("StartupCPUShares", "t", property_get_ulong_as_u64, offsetof(CGroupContext, startup_cpu_shares), 0),
@@ -170,6 +171,39 @@ const sd_bus_vtable bus_cgroup_vtable[] = {
         SD_BUS_VTABLE_END
 };
 
+static int bus_cgroup_set_transient_property(
+                Unit *u,
+                CGroupContext *c,
+                const char *name,
+                sd_bus_message *message,
+                UnitSetPropertiesMode mode,
+                sd_bus_error *error) {
+        /* Handles the "Delegate" property: returns 1 if consumed, 0 if name is not recognized here, negative if reading the message fails. */
+        int r;
+
+        assert(u);
+        assert(c);
+        assert(name);
+        assert(message);
+
+        if (streq(name, "Delegate")) {
+                int b;
+                /* The bus type "b" is read into an int, which is why b is not declared as bool. */
+                r = sd_bus_message_read(message, "b", &b);
+                if (r < 0)
+                        return r;
+                /* In UNIT_CHECK mode the value is parsed but neither stored nor written to a drop-in. */
+                if (mode != UNIT_CHECK) {
+                        c->delegate = b;
+                        unit_write_drop_in_private(u, mode, name, b ? "Delegate=yes" : "Delegate=no");
+                }
+
+                return 1;
+        }
+
+        return 0;
+}
+
 int bus_cgroup_set_property(
                 Unit *u,
                 CGroupContext *c,
@@ -632,6 +666,14 @@ int bus_cgroup_set_property(
                 }
 
                 return 1;
+
+        }
+
+        if (u->transient && u->load_state == UNIT_STUB) {
+                r = bus_cgroup_set_transient_property(u, c, name, message, mode, error);
+                if (r != 0)
+                        return r;
+
         }
 
         return 0;
index c41aec2..5cfd4a1 100644 (file)
@@ -1444,8 +1444,10 @@ static int exec_child(ExecCommand *command,
         }
 #endif
 
-#ifdef HAVE_PAM
-        if (params->cgroup_path && context->user && context->pam_name) {
+        /* If delegation is enabled we'll pass ownership of the cgroup
+         * (but only in systemd's own controller hierarchy!) to the
+         * user of the new process. */
+        if (params->cgroup_path && context->user && params->cgroup_delegate) {
                 err = cg_set_task_access(SYSTEMD_CGROUP_CONTROLLER, params->cgroup_path, 0644, uid, gid);
                 if (err < 0) {
                         *error = EXIT_CGROUP;
@@ -1459,7 +1461,6 @@ static int exec_child(ExecCommand *command,
                         return err;
                 }
         }
-#endif
 
         if (!strv_isempty(context->runtime_directory) && params->runtime_prefix) {
                 char **rt;
@@ -2402,6 +2403,21 @@ void exec_context_dump(ExecContext *c, FILE* f, const char *prefix) {
                         prefix, c->apparmor_profile_ignore ? "-" : "", c->apparmor_profile);
 }
 
+bool exec_context_maintains_privileges(ExecContext *c) {
+        assert(c);
+
+        /* Returns true if the process forked off would run under
+         * an unchanged UID or as root. */
+        /* No user configured in the context: treated as running under an unchanged UID. */
+        if (!c->user)
+                return true;
+        /* Root is recognized only by the literal name "root" or the numeric string "0". */
+        if (streq(c->user, "root") || streq(c->user, "0"))
+                return true;
+
+        return false;
+}
+
 void exec_status_start(ExecStatus *s, pid_t pid) {
         assert(s);
 
index c45dde5..b16a24d 100644 (file)
@@ -207,6 +207,7 @@ struct ExecParameters {
         bool selinux_context_net;
         CGroupControllerMask cgroup_supported;
         const char *cgroup_path;
+        bool cgroup_delegate;
         const char *runtime_prefix;
         const char *unit_id;
         usec_t watchdog_usec;
@@ -244,6 +245,7 @@ int exec_context_destroy_runtime_directory(ExecContext *c, const char *runtime_r
 int exec_context_load_environment(const ExecContext *c, const char *unit_id, char ***l);
 
 bool exec_context_may_touch_console(ExecContext *c);
+bool exec_context_maintains_privileges(ExecContext *c);
 
 void exec_status_start(ExecStatus *s, pid_t pid);
 void exec_status_exit(ExecStatus *s, ExecContext *context, pid_t pid, int code, int status);
index ca01394..5158a9f 100644 (file)
@@ -119,7 +119,8 @@ $1.BlockIOWeight,                config_parse_blockio_weight,        0,
 $1.StartupBlockIOWeight,         config_parse_blockio_weight,        0,                             offsetof($1, cgroup_context.startup_blockio_weight)
 $1.BlockIODeviceWeight,          config_parse_blockio_device_weight, 0,                             offsetof($1, cgroup_context)
 $1.BlockIOReadBandwidth,         config_parse_blockio_bandwidth,     0,                             offsetof($1, cgroup_context)
-$1.BlockIOWriteBandwidth,        config_parse_blockio_bandwidth,     0,                             offsetof($1, cgroup_context)'
+$1.BlockIOWriteBandwidth,        config_parse_blockio_bandwidth,     0,                             offsetof($1, cgroup_context)
+$1.Delegate,                     config_parse_bool,                  0,                             offsetof($1, cgroup_context.delegate)'
 )m4_dnl
 Unit.Description,                config_parse_unit_string_printf,    0,                             offsetof(Unit, description)
 Unit.Documentation,              config_parse_documentation,         0,                             offsetof(Unit, documentation)
index 01243c3..8b787f6 100644 (file)
@@ -715,6 +715,7 @@ static int mount_spawn(Mount *m, ExecCommand *c, pid_t *_pid) {
         exec_params.confirm_spawn = UNIT(m)->manager->confirm_spawn;
         exec_params.cgroup_supported = UNIT(m)->manager->cgroup_supported;
         exec_params.cgroup_path = UNIT(m)->cgroup_path;
+        exec_params.cgroup_delegate = m->cgroup_context.delegate;
         exec_params.runtime_prefix = manager_get_runtime_prefix(UNIT(m)->manager);
         exec_params.unit_id = UNIT(m)->id;
 
index f27e63e..6a27e8f 100644 (file)
@@ -1000,6 +1000,7 @@ static int service_spawn(
         exec_params.confirm_spawn = UNIT(s)->manager->confirm_spawn;
         exec_params.cgroup_supported = UNIT(s)->manager->cgroup_supported;
         exec_params.cgroup_path = path;
+        exec_params.cgroup_delegate = s->cgroup_context.delegate;
         exec_params.runtime_prefix = manager_get_runtime_prefix(UNIT(s)->manager);
         exec_params.unit_id = UNIT(s)->id;
         exec_params.watchdog_usec = s->watchdog_usec;
index 6ba8338..39652ef 100644 (file)
@@ -1414,6 +1414,7 @@ static int socket_spawn(Socket *s, ExecCommand *c, pid_t *_pid) {
         exec_params.confirm_spawn = UNIT(s)->manager->confirm_spawn;
         exec_params.cgroup_supported = UNIT(s)->manager->cgroup_supported;
         exec_params.cgroup_path = UNIT(s)->cgroup_path;
+        exec_params.cgroup_delegate = s->cgroup_context.delegate;
         exec_params.runtime_prefix = manager_get_runtime_prefix(UNIT(s)->manager);
         exec_params.unit_id = UNIT(s)->id;
 
index 1add722..0a1cc80 100644 (file)
@@ -627,6 +627,7 @@ static int swap_spawn(Swap *s, ExecCommand *c, pid_t *_pid) {
         exec_params.confirm_spawn = UNIT(s)->manager->confirm_spawn;
         exec_params.cgroup_supported = UNIT(s)->manager->cgroup_supported;
         exec_params.cgroup_path = UNIT(s)->cgroup_path;
+        exec_params.cgroup_delegate = s->cgroup_context.delegate;
         exec_params.runtime_prefix = manager_get_runtime_prefix(UNIT(s)->manager);
         exec_params.unit_id = UNIT(s)->id;
 
index 3c7d4be..7f8c631 100644 (file)
@@ -622,6 +622,10 @@ int manager_start_scope(
         if (r < 0)
                 return r;
 
+        r = sd_bus_message_append(m, "(sv)", "Delegate", "b", 1);
+        if (r < 0)
+                return r;
+
         if (more_properties) {
                 r = sd_bus_message_copy(m, more_properties, true);
                 if (r < 0)
index aca4e44..a65f515 100644 (file)
@@ -34,7 +34,8 @@ typedef enum CGroupControllerMask {
         CGROUP_CPUACCT = 2,
         CGROUP_BLKIO = 4,
         CGROUP_MEMORY = 8,
-        CGROUP_DEVICE = 16
+        CGROUP_DEVICE = 16,
+        _CGROUP_CONTROLLER_MASK_ALL = 31
 } CGroupControllerMask;
 
 /*
index 574d0de..dec2ce7 100644 (file)
@@ -15,6 +15,7 @@ KillMode=mixed
 Type=notify
 RestartForceExitStatus=133
 SuccessExitStatus=133
+Delegate=yes
 
 [Install]
 WantedBy=multi-user.target
index 8091ce1..1e21d51 100644 (file)
@@ -16,3 +16,4 @@ Type=notify
 ExecStart=-@rootlibexecdir@/systemd --user
 Slice=user-%i.slice
 KillMode=mixed
+Delegate=yes