1 /*-*- Mode: C; c-basic-offset: 8; indent-tabs-mode: nil -*-*/
4 This file is part of systemd.
6 Copyright 2013 Lennart Poettering
8 systemd is free software; you can redistribute it and/or modify it
9 under the terms of the GNU Lesser General Public License as published by
10 the Free Software Foundation; either version 2.1 of the License, or
11 (at your option) any later version.
13 systemd is distributed in the hope that it will be useful, but
14 WITHOUT ANY WARRANTY; without even the implied warranty of
15 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
16 Lesser General Public License for more details.
18 You should have received a copy of the GNU Lesser General Public License
19 along with systemd; If not, see <http://www.gnu.org/licenses/>.
25 #include "alloc-util.h"
26 #include "cgroup-util.h"
31 #include "parse-util.h"
32 #include "path-util.h"
33 #include "process-util.h"
34 //#include "special.h"
35 #include "string-table.h"
36 #include "string-util.h"
38 #define CGROUP_CPU_QUOTA_PERIOD_USEC ((usec_t) 100 * USEC_PER_MSEC)
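/* Illustration (example values, not from this file): the CFS period is fixed to the
 * 100ms defined above, and the per-second quota is rescaled to that period before it
 * is written to cpu.cfs_quota_us further down. For instance, a per-second quota of
 * 500ms (50%) would be written as 500000 * 100000 / 1000000 = 50000 microseconds per
 * 100000 microsecond period. */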
40 /// UNNEEDED by elogind
42 void cgroup_context_init(CGroupContext *c) {
45 /* Initialize everything to the kernel defaults, assuming the
46 * structure is preinitialized to 0 */
48 c->cpu_shares = CGROUP_CPU_SHARES_INVALID;
49 c->startup_cpu_shares = CGROUP_CPU_SHARES_INVALID;
50 c->cpu_quota_per_sec_usec = USEC_INFINITY;
52 c->memory_limit = (uint64_t) -1;
54 c->blockio_weight = CGROUP_BLKIO_WEIGHT_INVALID;
55 c->startup_blockio_weight = CGROUP_BLKIO_WEIGHT_INVALID;
57 c->tasks_max = (uint64_t) -1;
59 c->netclass_type = CGROUP_NETCLASS_TYPE_NONE;
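/* Note: the "invalid"/"infinity" values assigned above act as "unset" markers; only
 * fields that deviate from them are turned into controller requirements in
 * cgroup_context_get_mask() and actually written out in cgroup_context_apply(). */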
62 void cgroup_context_free_device_allow(CGroupContext *c, CGroupDeviceAllow *a) {
66 LIST_REMOVE(device_allow, c->device_allow, a);
71 void cgroup_context_free_blockio_device_weight(CGroupContext *c, CGroupBlockIODeviceWeight *w) {
75 LIST_REMOVE(device_weights, c->blockio_device_weights, w);
80 void cgroup_context_free_blockio_device_bandwidth(CGroupContext *c, CGroupBlockIODeviceBandwidth *b) {
84 LIST_REMOVE(device_bandwidths, c->blockio_device_bandwidths, b);
89 void cgroup_context_done(CGroupContext *c) {
92 while (c->blockio_device_weights)
93 cgroup_context_free_blockio_device_weight(c, c->blockio_device_weights);
95 while (c->blockio_device_bandwidths)
96 cgroup_context_free_blockio_device_bandwidth(c, c->blockio_device_bandwidths);
98 while (c->device_allow)
99 cgroup_context_free_device_allow(c, c->device_allow);
102 void cgroup_context_dump(CGroupContext *c, FILE* f, const char *prefix) {
103 CGroupBlockIODeviceBandwidth *b;
104 CGroupBlockIODeviceWeight *w;
105 CGroupDeviceAllow *a;
106 char u[FORMAT_TIMESPAN_MAX];
111 prefix = strempty(prefix);
114 "%sCPUAccounting=%s\n"
115 "%sBlockIOAccounting=%s\n"
116 "%sMemoryAccounting=%s\n"
117 "%sTasksAccounting=%s\n"
118 "%sCPUShares=%" PRIu64 "\n"
119 "%sStartupCPUShares=%" PRIu64 "\n"
120 "%sCPUQuotaPerSecSec=%s\n"
121 "%sBlockIOWeight=%" PRIu64 "\n"
122 "%sStartupBlockIOWeight=%" PRIu64 "\n"
123 "%sMemoryLimit=%" PRIu64 "\n"
124 "%sTasksMax=%" PRIu64 "\n"
125 "%sDevicePolicy=%s\n"
127 prefix, yes_no(c->cpu_accounting),
128 prefix, yes_no(c->blockio_accounting),
129 prefix, yes_no(c->memory_accounting),
130 prefix, yes_no(c->tasks_accounting),
131 prefix, c->cpu_shares,
132 prefix, c->startup_cpu_shares,
133 prefix, format_timespan(u, sizeof(u), c->cpu_quota_per_sec_usec, 1),
134 prefix, c->blockio_weight,
135 prefix, c->startup_blockio_weight,
136 prefix, c->memory_limit,
137 prefix, c->tasks_max,
138 prefix, cgroup_device_policy_to_string(c->device_policy),
139 prefix, yes_no(c->delegate));
141 LIST_FOREACH(device_allow, a, c->device_allow)
143 "%sDeviceAllow=%s %s%s%s\n",
146 a->r ? "r" : "", a->w ? "w" : "", a->m ? "m" : "");
148 LIST_FOREACH(device_weights, w, c->blockio_device_weights)
150 "%sBlockIODeviceWeight=%s %" PRIu64,
155 LIST_FOREACH(device_bandwidths, b, c->blockio_device_bandwidths) {
156 char buf[FORMAT_BYTES_MAX];
161 b->read ? "BlockIOReadBandwidth" : "BlockIOWriteBandwidth",
163 format_bytes(buf, sizeof(buf), b->bandwidth));
167 static int lookup_blkio_device(const char *p, dev_t *dev) {
176 return log_warning_errno(errno, "Couldn't stat device %s: %m", p);
178 if (S_ISBLK(st.st_mode))
180 else if (major(st.st_dev) != 0) {
181 /* If this is not a device node then find the block
182 * device this file is stored on */
185 /* If this is a partition, try to get the originating block device */
187 block_get_whole_disk(*dev, dev);
189 log_warning("%s is not a block device and the file system's block device cannot be determined or is not local.", p);
196 static int whitelist_device(const char *path, const char *node, const char *acc) {
197 char buf[2+DECIMAL_STR_MAX(dev_t)*2+2+4];
204 if (stat(node, &st) < 0) {
205 log_warning_errno(errno, "Couldn't stat device %s: %m", node);
209 if (!S_ISCHR(st.st_mode) && !S_ISBLK(st.st_mode)) {
210 log_warning("%s is not a device.", node);
216 S_ISCHR(st.st_mode) ? 'c' : 'b',
217 major(st.st_rdev), minor(st.st_rdev),
220 r = cg_set_attribute("devices", path, "devices.allow", buf);
222 log_full_errno(IN_SET(r, -ENOENT, -EROFS, -EINVAL, -EACCES) ? LOG_DEBUG : LOG_WARNING, r,
223 "Failed to set devices.allow on %s: %m", path);
228 static int whitelist_major(const char *path, const char *name, char type, const char *acc) {
229 _cleanup_fclose_ FILE *f = NULL;
236 assert(type == 'b' || type == 'c');
238 f = fopen("/proc/devices", "re");
240 return log_warning_errno(errno, "Cannot open /proc/devices to resolve %s (%c): %m", name, type);
242 FOREACH_LINE(line, f, goto fail) {
243 char buf[2+DECIMAL_STR_MAX(unsigned)+3+4], *p, *w;
248 if (type == 'c' && streq(line, "Character devices:")) {
253 if (type == 'b' && streq(line, "Block devices:")) {
268 w = strpbrk(p, WHITESPACE);
273 r = safe_atou(p, &maj);
280 w += strspn(w, WHITESPACE);
282 if (fnmatch(name, w, 0) != 0)
291 r = cg_set_attribute("devices", path, "devices.allow", buf);
293 log_full_errno(IN_SET(r, -ENOENT, -EROFS, -EINVAL, -EACCES) ? LOG_DEBUG : LOG_WARNING, r,
294 "Failed to set devices.allow on %s: %m", path);
300 log_warning_errno(errno, "Failed to read /proc/devices: %m");
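/* Sketch of the matching done above (the exact write is an assumption, since the
 * sprintf() is not shown here): the "Character devices:"/"Block devices:" sections of
 * /proc/devices are scanned, each "MAJOR name" entry is matched against `name` with
 * fnmatch(), and for every matching major a wildcard entry such as
 *   c 136:* rw
 * (136 being the pts major) is then written to devices.allow. */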
304 void cgroup_context_apply(CGroupContext *c, CGroupMask mask, const char *path, uint32_t netclass, ManagerState state) {
314 /* Some cgroup attributes are not supported on the root cgroup,
315 * hence silently ignore */
316 is_root = isempty(path) || path_equal(path, "/");
318 /* Make sure we don't try to display messages with an empty path. */
321 /* We generally ignore errors caused by read-only mounted
322 * cgroup trees (assuming we are running in a container then),
323 * and missing cgroups, i.e. EROFS and ENOENT. */
325 if ((mask & CGROUP_MASK_CPU) && !is_root) {
326 char buf[MAX(DECIMAL_STR_MAX(uint64_t), DECIMAL_STR_MAX(usec_t)) + 1];
328 sprintf(buf, "%" PRIu64 "\n",
329 IN_SET(state, MANAGER_STARTING, MANAGER_INITIALIZING) && c->startup_cpu_shares != CGROUP_CPU_SHARES_INVALID ? c->startup_cpu_shares :
330 c->cpu_shares != CGROUP_CPU_SHARES_INVALID ? c->cpu_shares : CGROUP_CPU_SHARES_DEFAULT);
331 r = cg_set_attribute("cpu", path, "cpu.shares", buf);
333 log_full_errno(IN_SET(r, -ENOENT, -EROFS, -EACCES) ? LOG_DEBUG : LOG_WARNING, r,
334 "Failed to set cpu.shares on %s: %m", path);
336 sprintf(buf, USEC_FMT "\n", CGROUP_CPU_QUOTA_PERIOD_USEC);
337 r = cg_set_attribute("cpu", path, "cpu.cfs_period_us", buf);
339 log_full_errno(IN_SET(r, -ENOENT, -EROFS, -EACCES) ? LOG_DEBUG : LOG_WARNING, r,
340 "Failed to set cpu.cfs_period_us on %s: %m", path);
342 if (c->cpu_quota_per_sec_usec != USEC_INFINITY) {
343 sprintf(buf, USEC_FMT "\n", c->cpu_quota_per_sec_usec * CGROUP_CPU_QUOTA_PERIOD_USEC / USEC_PER_SEC);
344 r = cg_set_attribute("cpu", path, "cpu.cfs_quota_us", buf);
346 r = cg_set_attribute("cpu", path, "cpu.cfs_quota_us", "-1");
348 log_full_errno(IN_SET(r, -ENOENT, -EROFS, -EACCES) ? LOG_DEBUG : LOG_WARNING, r,
349 "Failed to set cpu.cfs_quota_us on %s: %m", path);
352 if (mask & CGROUP_MASK_BLKIO) {
353 char buf[MAX(DECIMAL_STR_MAX(uint64_t)+1,
354 DECIMAL_STR_MAX(dev_t)*2+2+DECIMAL_STR_MAX(uint64_t)+1)];
355 CGroupBlockIODeviceWeight *w;
356 CGroupBlockIODeviceBandwidth *b;
359 sprintf(buf, "%" PRIu64 "\n",
360 IN_SET(state, MANAGER_STARTING, MANAGER_INITIALIZING) && c->startup_blockio_weight != CGROUP_BLKIO_WEIGHT_INVALID ? c->startup_blockio_weight :
361 c->blockio_weight != CGROUP_BLKIO_WEIGHT_INVALID ? c->blockio_weight : CGROUP_BLKIO_WEIGHT_DEFAULT);
362 r = cg_set_attribute("blkio", path, "blkio.weight", buf);
364 log_full_errno(IN_SET(r, -ENOENT, -EROFS, -EACCES) ? LOG_DEBUG : LOG_WARNING, r,
365 "Failed to set blkio.weight on %s: %m", path);
367 /* FIXME: no way to reset this list */
368 LIST_FOREACH(device_weights, w, c->blockio_device_weights) {
371 r = lookup_blkio_device(w->path, &dev);
375 sprintf(buf, "%u:%u %" PRIu64 "\n", major(dev), minor(dev), w->weight);
376 r = cg_set_attribute("blkio", path, "blkio.weight_device", buf);
378 log_full_errno(IN_SET(r, -ENOENT, -EROFS, -EACCES) ? LOG_DEBUG : LOG_WARNING, r,
379 "Failed to set blkio.weight_device on %s: %m", path);
383 /* FIXME: no way to reset this list */
384 LIST_FOREACH(device_bandwidths, b, c->blockio_device_bandwidths) {
388 r = lookup_blkio_device(b->path, &dev);
392 a = b->read ? "blkio.throttle.read_bps_device" : "blkio.throttle.write_bps_device";
394 sprintf(buf, "%u:%u %" PRIu64 "\n", major(dev), minor(dev), b->bandwidth);
395 r = cg_set_attribute("blkio", path, a, buf);
397 log_full_errno(IN_SET(r, -ENOENT, -EROFS, -EACCES) ? LOG_DEBUG : LOG_WARNING, r,
398 "Failed to set %s on %s: %m", a, path);
402 if ((mask & CGROUP_MASK_MEMORY) && !is_root) {
403 if (c->memory_limit != (uint64_t) -1) {
404 char buf[DECIMAL_STR_MAX(uint64_t) + 1];
406 sprintf(buf, "%" PRIu64 "\n", c->memory_limit);
408 if (cg_unified() <= 0)
409 r = cg_set_attribute("memory", path, "memory.limit_in_bytes", buf);
411 r = cg_set_attribute("memory", path, "memory.max", buf);
414 if (cg_unified() <= 0)
415 r = cg_set_attribute("memory", path, "memory.limit_in_bytes", "-1");
417 r = cg_set_attribute("memory", path, "memory.max", "max");
421 log_full_errno(IN_SET(r, -ENOENT, -EROFS, -EACCES) ? LOG_DEBUG : LOG_WARNING, r,
422 "Failed to set memory.limit_in_bytes/memory.max on %s: %m", path);
425 if ((mask & CGROUP_MASK_DEVICES) && !is_root) {
426 CGroupDeviceAllow *a;
428 /* Changing the devices list of a populated cgroup
429 * might result in EINVAL, hence ignore EINVAL here. */
432 if (c->device_allow || c->device_policy != CGROUP_AUTO)
433 r = cg_set_attribute("devices", path, "devices.deny", "a");
435 r = cg_set_attribute("devices", path, "devices.allow", "a");
437 log_full_errno(IN_SET(r, -ENOENT, -EROFS, -EINVAL, -EACCES) ? LOG_DEBUG : LOG_WARNING, r,
438 "Failed to reset devices.list on %s: %m", path);
440 if (c->device_policy == CGROUP_CLOSED ||
441 (c->device_policy == CGROUP_AUTO && c->device_allow)) {
442 static const char auto_devices[] =
443 "/dev/null\0" "rwm\0"
444 "/dev/zero\0" "rwm\0"
445 "/dev/full\0" "rwm\0"
446 "/dev/random\0" "rwm\0"
447 "/dev/urandom\0" "rwm\0"
449 "/dev/pts/ptmx\0" "rw\0"; /* /dev/pts/ptmx may not be duplicated, but accessed */
453 NULSTR_FOREACH_PAIR(x, y, auto_devices)
454 whitelist_device(path, x, y);
456 whitelist_major(path, "pts", 'c', "rw");
457 whitelist_major(path, "kdbus", 'c', "rw");
458 whitelist_major(path, "kdbus/*", 'c', "rw");
461 LIST_FOREACH(device_allow, a, c->device_allow) {
477 if (startswith(a->path, "/dev/"))
478 whitelist_device(path, a->path, acc);
479 else if (startswith(a->path, "block-"))
480 whitelist_major(path, a->path + 6, 'b', acc);
481 else if (startswith(a->path, "char-"))
482 whitelist_major(path, a->path + 5, 'c', acc);
484 log_debug("Ignoring device %s while writing cgroup attribute.", a->path);
488 if ((mask & CGROUP_MASK_PIDS) && !is_root) {
490 if (c->tasks_max != (uint64_t) -1) {
491 char buf[DECIMAL_STR_MAX(uint64_t) + 2];
493 sprintf(buf, "%" PRIu64 "\n", c->tasks_max);
494 r = cg_set_attribute("pids", path, "pids.max", buf);
496 r = cg_set_attribute("pids", path, "pids.max", "max");
499 log_full_errno(IN_SET(r, -ENOENT, -EROFS, -EACCES) ? LOG_DEBUG : LOG_WARNING, r,
500 "Failed to set pids.max on %s: %m", path);
503 if (mask & CGROUP_MASK_NET_CLS) {
504 char buf[DECIMAL_STR_MAX(uint32_t)];
506 sprintf(buf, "%" PRIu32, netclass);
508 r = cg_set_attribute("net_cls", path, "net_cls.classid", buf);
510 log_full_errno(IN_SET(r, -ENOENT, -EROFS, -EACCES) ? LOG_DEBUG : LOG_WARNING, r,
511 "Failed to set net_cls.classid on %s: %m", path);
515 CGroupMask cgroup_context_get_mask(CGroupContext *c) {
518 /* Figure out which controllers we need */
520 if (c->cpu_accounting ||
521 c->cpu_shares != CGROUP_CPU_SHARES_INVALID ||
522 c->startup_cpu_shares != CGROUP_CPU_SHARES_INVALID ||
523 c->cpu_quota_per_sec_usec != USEC_INFINITY)
524 mask |= CGROUP_MASK_CPUACCT | CGROUP_MASK_CPU;
526 if (c->blockio_accounting ||
527 c->blockio_weight != CGROUP_BLKIO_WEIGHT_INVALID ||
528 c->startup_blockio_weight != CGROUP_BLKIO_WEIGHT_INVALID ||
529 c->blockio_device_weights ||
530 c->blockio_device_bandwidths)
531 mask |= CGROUP_MASK_BLKIO;
533 if (c->memory_accounting ||
534 c->memory_limit != (uint64_t) -1)
535 mask |= CGROUP_MASK_MEMORY;
537 if (c->device_allow ||
538 c->device_policy != CGROUP_AUTO)
539 mask |= CGROUP_MASK_DEVICES;
541 if (c->tasks_accounting ||
542 c->tasks_max != (uint64_t) -1)
543 mask |= CGROUP_MASK_PIDS;
545 if (c->netclass_type != CGROUP_NETCLASS_TYPE_NONE)
546 mask |= CGROUP_MASK_NET_CLS;
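/* In other words: a controller bit is requested only if accounting is enabled for it
 * or one of its settings differs from the "unset" defaults initialized in
 * cgroup_context_init(). */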
551 CGroupMask unit_get_own_mask(Unit *u) {
554 /* Returns the mask of controllers the unit needs for itself */
556 c = unit_get_cgroup_context(u);
560 /* If delegation is turned on, then turn on all cgroups,
561 * unless we are on the legacy hierarchy and the process we
562 * fork into it is known to drop privileges, and hence
563 * shouldn't get access to the controllers.
565 * Note that on the unified hierarchy it is safe to delegate
566 * controllers to unprivileged services. */
571 e = unit_get_exec_context(u);
573 exec_context_maintains_privileges(e) ||
575 return _CGROUP_MASK_ALL;
578 return cgroup_context_get_mask(c);
581 CGroupMask unit_get_members_mask(Unit *u) {
584 /* Returns the mask of controllers that all of the unit's children require, taken together. */
587 if (u->cgroup_members_mask_valid)
588 return u->cgroup_members_mask;
590 u->cgroup_members_mask = 0;
592 if (u->type == UNIT_SLICE) {
596 SET_FOREACH(member, u->dependencies[UNIT_BEFORE], i) {
601 if (UNIT_DEREF(member->slice) != u)
604 u->cgroup_members_mask |=
605 unit_get_own_mask(member) |
606 unit_get_members_mask(member);
610 u->cgroup_members_mask_valid = true;
611 return u->cgroup_members_mask;
614 CGroupMask unit_get_siblings_mask(Unit *u) {
617 /* Returns the mask of controllers all of the unit's siblings
618 * require, i.e. the members mask of the unit's parent slice
619 * if there is one. */
621 if (UNIT_ISSET(u->slice))
622 return unit_get_members_mask(UNIT_DEREF(u->slice));
624 return unit_get_own_mask(u) | unit_get_members_mask(u);
627 CGroupMask unit_get_subtree_mask(Unit *u) {
629 /* Returns the mask of this subtree, meaning of the group
630 * itself and its children. */
632 return unit_get_own_mask(u) | unit_get_members_mask(u);
635 CGroupMask unit_get_target_mask(Unit *u) {
638 /* This returns the cgroup mask of all controllers to enable
639 * for a specific cgroup, i.e. everything it needs itself,
640 * plus all that its children need, plus all that its siblings
641 * need. This is primarily useful on the legacy cgroup
642 * hierarchy, where we need to duplicate each cgroup in each
643 * hierarchy that shall be enabled for it. */
645 mask = unit_get_own_mask(u) | unit_get_members_mask(u) | unit_get_siblings_mask(u);
646 mask &= u->manager->cgroup_supported;
651 CGroupMask unit_get_enable_mask(Unit *u) {
654 /* This returns the cgroup mask of all controllers to enable
655 * for the children of a specific cgroup. This is primarily
656 * useful for the unified cgroup hierarchy, where each cgroup
657 * controls which controllers are enabled for its children. */
659 mask = unit_get_members_mask(u);
660 mask &= u->manager->cgroup_supported;
665 /* Recurse from a unit up through its containing slices, propagating
666 * mask bits upward. A unit is also a member of itself. */
667 void unit_update_cgroup_members_masks(Unit *u) {
673 /* Calculate subtree mask */
674 m = unit_get_subtree_mask(u);
676 /* See if anything changed from the previous invocation. If
677 * not, we're done. */
678 if (u->cgroup_subtree_mask_valid && m == u->cgroup_subtree_mask)
682 u->cgroup_subtree_mask_valid &&
683 ((m & ~u->cgroup_subtree_mask) != 0) &&
684 ((~m & u->cgroup_subtree_mask) == 0);
686 u->cgroup_subtree_mask = m;
687 u->cgroup_subtree_mask_valid = true;
689 if (UNIT_ISSET(u->slice)) {
690 Unit *s = UNIT_DEREF(u->slice);
693 /* There's more set now than before. We
694 * propagate the new mask to the parent's mask
695 * (not caring if it actually was valid or not). */
698 s->cgroup_members_mask |= m;
701 /* There's less set now than before (or we
702 * don't know), we need to recalculate
703 * everything, so let's invalidate the
704 * parent's members mask */
706 s->cgroup_members_mask_valid = false;
708 /* And now make sure that this change also hits our grandparents. */
710 unit_update_cgroup_members_masks(s);
714 static const char *migrate_callback(CGroupMask mask, void *userdata) {
721 if (u->cgroup_path &&
722 u->cgroup_realized &&
723 (u->cgroup_realized_mask & mask) == mask)
724 return u->cgroup_path;
726 u = UNIT_DEREF(u->slice);
732 char *unit_default_cgroup_path(Unit *u) {
733 _cleanup_free_ char *escaped = NULL, *slice = NULL;
738 if (unit_has_name(u, SPECIAL_ROOT_SLICE))
739 return strdup(u->manager->cgroup_root);
741 if (UNIT_ISSET(u->slice) && !unit_has_name(UNIT_DEREF(u->slice), SPECIAL_ROOT_SLICE)) {
742 r = cg_slice_to_path(UNIT_DEREF(u->slice)->id, &slice);
747 escaped = cg_escape(u->id);
752 return strjoin(u->manager->cgroup_root, "/", slice, "/", escaped, NULL);
754 return strjoin(u->manager->cgroup_root, "/", escaped, NULL);
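/* Illustrative result (example names, not from this file): for a unit
 * "session-1.scope" placed in slice "user-1000.slice", cg_slice_to_path() expands the
 * slice name to "user.slice/user-1000.slice", so the returned path would be roughly
 * "<cgroup_root>/user.slice/user-1000.slice/session-1.scope". */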
757 int unit_set_cgroup_path(Unit *u, const char *path) {
758 _cleanup_free_ char *p = NULL;
770 if (streq_ptr(u->cgroup_path, p))
774 r = hashmap_put(u->manager->cgroup_unit, p, u);
779 unit_release_cgroup(u);
787 int unit_watch_cgroup(Unit *u) {
788 _cleanup_free_ char *populated = NULL;
796 if (u->cgroup_inotify_wd >= 0)
799 /* Only applies to the unified hierarchy */
802 return log_unit_error_errno(u, r, "Failed to detect whether the unified hierarchy is used: %m");
806 /* Don't watch the root slice, it's pointless. */
807 if (unit_has_name(u, SPECIAL_ROOT_SLICE))
810 r = hashmap_ensure_allocated(&u->manager->cgroup_inotify_wd_unit, &trivial_hash_ops);
814 r = cg_get_path(SYSTEMD_CGROUP_CONTROLLER, u->cgroup_path, "cgroup.populated", &populated);
818 u->cgroup_inotify_wd = inotify_add_watch(u->manager->cgroup_inotify_fd, populated, IN_MODIFY);
819 if (u->cgroup_inotify_wd < 0) {
821 if (errno == ENOENT) /* If the directory is already
822 * gone we don't need to track
823 * it, so this is not an error */
826 return log_unit_error_errno(u, errno, "Failed to add inotify watch descriptor for control group %s: %m", u->cgroup_path);
829 r = hashmap_put(u->manager->cgroup_inotify_wd_unit, INT_TO_PTR(u->cgroup_inotify_wd), u);
831 return log_unit_error_errno(u, r, "Failed to add inotify watch descriptor to hash map: %m");
836 static int unit_create_cgroup(
838 CGroupMask target_mask,
839 CGroupMask enable_mask) {
846 c = unit_get_cgroup_context(u);
850 if (!u->cgroup_path) {
851 _cleanup_free_ char *path = NULL;
853 path = unit_default_cgroup_path(u);
857 r = unit_set_cgroup_path(u, path);
859 return log_unit_error_errno(u, r, "Control group %s exists already.", path);
861 return log_unit_error_errno(u, r, "Failed to set unit's control group path to %s: %m", path);
864 /* First, create our own group */
865 r = cg_create_everywhere(u->manager->cgroup_supported, target_mask, u->cgroup_path);
867 return log_unit_error_errno(u, r, "Failed to create cgroup %s: %m", u->cgroup_path);
869 /* Start watching it */
870 (void) unit_watch_cgroup(u);
872 /* Enable all controllers we need */
873 r = cg_enable_everywhere(u->manager->cgroup_supported, enable_mask, u->cgroup_path);
875 log_unit_warning_errno(u, r, "Failed to enable controllers on cgroup %s, ignoring: %m", u->cgroup_path);
877 /* Keep track that this is now realized */
878 u->cgroup_realized = true;
879 u->cgroup_realized_mask = target_mask;
881 if (u->type != UNIT_SLICE && !c->delegate) {
883 /* Then, possibly move things over, but not if
884 * subgroups may contain processes, which is the case
885 * for slice and delegation units. */
886 r = cg_migrate_everywhere(u->manager->cgroup_supported, u->cgroup_path, u->cgroup_path, migrate_callback, u);
888 log_unit_warning_errno(u, r, "Failed to migrate cgroup to %s, ignoring: %m", u->cgroup_path);
894 int unit_attach_pids_to_cgroup(Unit *u) {
898 r = unit_realize_cgroup(u);
902 r = cg_attach_many_everywhere(u->manager->cgroup_supported, u->cgroup_path, u->pids, migrate_callback, u);
909 static bool unit_has_mask_realized(Unit *u, CGroupMask target_mask) {
912 return u->cgroup_realized && u->cgroup_realized_mask == target_mask;
915 static int unit_find_free_netclass_cgroup(Unit *u, uint32_t *ret) {
924 i = start = m->cgroup_netclass_registry_last;
929 if (!hashmap_get(m->cgroup_netclass_registry, UINT_TO_PTR(i))) {
930 m->cgroup_netclass_registry_last = i;
936 i = CGROUP_NETCLASS_FIXED_MAX;
938 } while (i != start);
943 int unit_add_to_netclass_cgroup(Unit *u) {
952 cc = unit_get_cgroup_context(u);
956 switch (cc->netclass_type) {
957 case CGROUP_NETCLASS_TYPE_NONE:
960 case CGROUP_NETCLASS_TYPE_FIXED:
961 u->cgroup_netclass_id = cc->netclass_id;
964 case CGROUP_NETCLASS_TYPE_AUTO:
965 /* Allocate a new ID in case it was requested and not done yet */
966 if (u->cgroup_netclass_id == 0) {
967 r = unit_find_free_netclass_cgroup(u, &u->cgroup_netclass_id);
971 log_debug("Dynamically assigned netclass cgroup id %" PRIu32 " to %s", u->cgroup_netclass_id, u->id);
977 r = hashmap_ensure_allocated(&u->manager->cgroup_netclass_registry, &trivial_hash_ops);
981 key = UINT32_TO_PTR(u->cgroup_netclass_id);
982 first = hashmap_get(u->manager->cgroup_netclass_registry, key);
985 LIST_PREPEND(cgroup_netclass, first, u);
986 return hashmap_replace(u->manager->cgroup_netclass_registry, key, u);
989 return hashmap_put(u->manager->cgroup_netclass_registry, key, u);
992 int unit_remove_from_netclass_cgroup(Unit *u) {
999 key = UINT32_TO_PTR(u->cgroup_netclass_id);
1001 LIST_FIND_HEAD(cgroup_netclass, u, head);
1002 LIST_REMOVE(cgroup_netclass, head, u);
1005 return hashmap_replace(u->manager->cgroup_netclass_registry, key, head);
1007 hashmap_remove(u->manager->cgroup_netclass_registry, key);
1012 /* Check if necessary controllers and attributes for a unit are in place.
1014 * If so, do nothing.
1015 * If not, create paths, move processes over, and set attributes.
1017 * Returns 0 on success and < 0 on failure. */
1018 static int unit_realize_cgroup_now(Unit *u, ManagerState state) {
1019 CGroupMask target_mask, enable_mask;
1024 if (u->in_cgroup_queue) {
1025 LIST_REMOVE(cgroup_queue, u->manager->cgroup_queue, u);
1026 u->in_cgroup_queue = false;
1029 target_mask = unit_get_target_mask(u);
1030 if (unit_has_mask_realized(u, target_mask))
1033 /* First, realize parents */
1034 if (UNIT_ISSET(u->slice)) {
1035 r = unit_realize_cgroup_now(UNIT_DEREF(u->slice), state);
1040 /* And then do the real work */
1041 enable_mask = unit_get_enable_mask(u);
1042 r = unit_create_cgroup(u, target_mask, enable_mask);
1046 /* Finally, apply the necessary attributes. */
1047 cgroup_context_apply(unit_get_cgroup_context(u), target_mask, u->cgroup_path, u->cgroup_netclass_id, state);
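/* Summary of the realization sequence above: drop the unit from the queue, bail out
 * early if the needed controller mask is already realized, realize all parent slices
 * first, then create/enable this unit's own cgroup and finally apply the attribute
 * values for the computed target mask. */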
1052 static void unit_add_to_cgroup_queue(Unit *u) {
1054 if (u->in_cgroup_queue)
1057 LIST_PREPEND(cgroup_queue, u->manager->cgroup_queue, u);
1058 u->in_cgroup_queue = true;
1061 unsigned manager_dispatch_cgroup_queue(Manager *m) {
1067 state = manager_state(m);
1069 while ((i = m->cgroup_queue)) {
1070 assert(i->in_cgroup_queue);
1072 r = unit_realize_cgroup_now(i, state);
1074 log_warning_errno(r, "Failed to realize cgroups for queued unit %s, ignoring: %m", i->id);
1082 static void unit_queue_siblings(Unit *u) {
1085 /* This adds the siblings of the specified unit and the
1086 * siblings of all parent units to the cgroup queue. (But
1087 * neither the specified unit itself nor the parents.) */
1089 while ((slice = UNIT_DEREF(u->slice))) {
1093 SET_FOREACH(m, slice->dependencies[UNIT_BEFORE], i) {
1097 /* Skip units that have a dependency on the slice
1098 * but aren't actually in it. */
1099 if (UNIT_DEREF(m->slice) != slice)
1102 /* No point in doing cgroup application for units
1103 * without active processes. */
1104 if (UNIT_IS_INACTIVE_OR_FAILED(unit_active_state(m)))
1107 /* If the unit doesn't need any new controllers
1108 * and has current ones realized, it doesn't need any changes. */
1110 if (unit_has_mask_realized(m, unit_get_target_mask(m)))
1113 unit_add_to_cgroup_queue(m);
1120 int unit_realize_cgroup(Unit *u) {
1123 if (!UNIT_HAS_CGROUP_CONTEXT(u))
1126 /* So, here's the deal: when realizing the cgroups for this
1127 * unit, we need to first create all parents, but there's more
1128 * actually: for the weight-based controllers we also need to
1129 * make sure that all our siblings (i.e. units that are in the
1130 * same slice as we are) have cgroups, too. Otherwise, things
1131 * would become very uneven as each of their processes would
1132 * get as much resources as all our group together. This call
1133 * will synchronously create the parent cgroups, but will
1134 * defer work on the siblings to the next event loop iteration. */
1137 /* Add all sibling slices to the cgroup queue. */
1138 unit_queue_siblings(u);
1140 /* And realize this one now (and apply the values) */
1141 return unit_realize_cgroup_now(u, manager_state(u->manager));
1144 void unit_release_cgroup(Unit *u) {
1147 /* Forgets all cgroup details for this unit */
1149 if (u->cgroup_path) {
1150 (void) hashmap_remove(u->manager->cgroup_unit, u->cgroup_path);
1151 u->cgroup_path = mfree(u->cgroup_path);
1154 if (u->cgroup_inotify_wd >= 0) {
1155 if (inotify_rm_watch(u->manager->cgroup_inotify_fd, u->cgroup_inotify_wd) < 0)
1156 log_unit_debug_errno(u, errno, "Failed to remove cgroup inotify watch %i for %s, ignoring", u->cgroup_inotify_wd, u->id);
1158 (void) hashmap_remove(u->manager->cgroup_inotify_wd_unit, INT_TO_PTR(u->cgroup_inotify_wd));
1159 u->cgroup_inotify_wd = -1;
1163 void unit_prune_cgroup(Unit *u) {
1169 /* Removes the cgroup, if empty and possible, and stops watching it. */
1171 if (!u->cgroup_path)
1174 is_root_slice = unit_has_name(u, SPECIAL_ROOT_SLICE);
1176 r = cg_trim_everywhere(u->manager->cgroup_supported, u->cgroup_path, !is_root_slice);
1178 log_debug_errno(r, "Failed to destroy cgroup %s, ignoring: %m", u->cgroup_path);
1185 unit_release_cgroup(u);
1187 u->cgroup_realized = false;
1188 u->cgroup_realized_mask = 0;
1191 int unit_search_main_pid(Unit *u, pid_t *ret) {
1192 _cleanup_fclose_ FILE *f = NULL;
1193 pid_t pid = 0, npid, mypid;
1199 if (!u->cgroup_path)
1202 r = cg_enumerate_processes(SYSTEMD_CGROUP_CONTROLLER, u->cgroup_path, &f);
1207 while (cg_read_pid(f, &npid) > 0) {
1213 /* Ignore processes that aren't our kids */
1214 if (get_process_ppid(npid, &ppid) >= 0 && ppid != mypid)
1218 /* Dang, there's more than one daemonized PID
1219 in this group, so we don't know what process
1220 is the main process. */
1231 static int unit_watch_pids_in_path(Unit *u, const char *path) {
1232 _cleanup_closedir_ DIR *d = NULL;
1233 _cleanup_fclose_ FILE *f = NULL;
1239 r = cg_enumerate_processes(SYSTEMD_CGROUP_CONTROLLER, path, &f);
1245 while ((r = cg_read_pid(f, &pid)) > 0) {
1246 r = unit_watch_pid(u, pid);
1247 if (r < 0 && ret >= 0)
1251 if (r < 0 && ret >= 0)
1255 r = cg_enumerate_subgroups(SYSTEMD_CGROUP_CONTROLLER, path, &d);
1262 while ((r = cg_read_subgroup(d, &fn)) > 0) {
1263 _cleanup_free_ char *p = NULL;
1265 p = strjoin(path, "/", fn, NULL);
1271 r = unit_watch_pids_in_path(u, p);
1272 if (r < 0 && ret >= 0)
1276 if (r < 0 && ret >= 0)
1283 int unit_watch_all_pids(Unit *u) {
1286 /* Adds all PIDs from our cgroup to the set of PIDs we
1287 * watch. This is a fallback logic for cases where we do not
1288 * get reliable cgroup empty notifications: we try to use
1289 * SIGCHLD as replacement. */
1291 if (!u->cgroup_path)
1294 if (cg_unified() > 0) /* On unified we can use proper notifications */
1297 return unit_watch_pids_in_path(u, u->cgroup_path);
1300 int unit_notify_cgroup_empty(Unit *u) {
1305 if (!u->cgroup_path)
1308 r = cg_is_empty_recursive(SYSTEMD_CGROUP_CONTROLLER, u->cgroup_path);
1312 unit_add_to_gc_queue(u);
1314 if (UNIT_VTABLE(u)->notify_cgroup_empty)
1315 UNIT_VTABLE(u)->notify_cgroup_empty(u);
1320 static int on_cgroup_inotify_event(sd_event_source *s, int fd, uint32_t revents, void *userdata) {
1321 Manager *m = userdata;
1328 union inotify_event_buffer buffer;
1329 struct inotify_event *e;
1332 l = read(fd, &buffer, sizeof(buffer));
1334 if (errno == EINTR || errno == EAGAIN)
1337 return log_error_errno(errno, "Failed to read control group inotify events: %m");
1340 FOREACH_INOTIFY_EVENT(e, buffer, l) {
1344 /* Queue overflow has no watch descriptor */
1347 if (e->mask & IN_IGNORED)
1348 /* The watch was just removed */
1351 u = hashmap_get(m->cgroup_inotify_wd_unit, INT_TO_PTR(e->wd));
1352 if (!u) /* Note that inotify might deliver
1353 * events for a watch even after it
1354 * was removed, because it was queued
1355 * before the removal. Let's ignore
1356 * this here safely. */
1359 (void) unit_notify_cgroup_empty(u);
1365 int manager_setup_cgroup(Manager *m) {
1366 _cleanup_free_ char *path = NULL;
1373 /* 1. Determine hierarchy */
1374 m->cgroup_root = mfree(m->cgroup_root);
1375 r = cg_pid_get_path(SYSTEMD_CGROUP_CONTROLLER, 0, &m->cgroup_root);
1377 return log_error_errno(r, "Cannot determine cgroup we are running in: %m");
1379 /// elogind does not support systemd scopes and slices
1381 /* Chop off the init scope, if we are already located in it */
1382 e = endswith(m->cgroup_root, "/" SPECIAL_INIT_SCOPE);
1384 /* LEGACY: Also chop off the system slice if we are in
1385 * it. This is to support live upgrades from older systemd
1386 * versions where PID 1 was moved there. Also see
1387 * cg_get_root_path(). */
1388 if (!e && m->running_as == MANAGER_SYSTEM) {
1389 e = endswith(m->cgroup_root, "/" SPECIAL_SYSTEM_SLICE);
1391 e = endswith(m->cgroup_root, "/system"); /* even more legacy */
1397 /* And make sure to store away the root value without trailing
1398 * slash, even for the root dir, so that we can easily prepend it everywhere. */
1400 while ((e = endswith(m->cgroup_root, "/")))
1402 log_debug_elogind("Cgroup Controller \"%s\" -> root \"%s\"",
1403 SYSTEMD_CGROUP_CONTROLLER, m->cgroup_root);
1406 r = cg_get_path(SYSTEMD_CGROUP_CONTROLLER, m->cgroup_root, NULL, &path);
1408 return log_error_errno(r, "Cannot find cgroup mount point: %m");
1410 unified = cg_unified();
1412 return log_error_errno(r, "Couldn't determine if we are running in the unified hierarchy: %m");
1414 log_debug("Unified cgroup hierarchy is located at %s.", path);
1416 log_debug("Using cgroup controller " SYSTEMD_CGROUP_CONTROLLER ". File system hierarchy is at %s.", path);
1419 const char *scope_path;
1421 /* 3. Install agent */
1424 /* In the unified hierarchy we can get
1425 * cgroup empty notifications via inotify. */
1427 /// elogind does not support the unified hierarchy, yet.
1429 m->cgroup_inotify_event_source = sd_event_source_unref(m->cgroup_inotify_event_source);
1430 safe_close(m->cgroup_inotify_fd);
1432 m->cgroup_inotify_fd = inotify_init1(IN_NONBLOCK|IN_CLOEXEC);
1433 if (m->cgroup_inotify_fd < 0)
1434 return log_error_errno(errno, "Failed to create control group inotify object: %m");
1436 r = sd_event_add_io(m->event, &m->cgroup_inotify_event_source, m->cgroup_inotify_fd, EPOLLIN, on_cgroup_inotify_event, m);
1438 return log_error_errno(r, "Failed to watch control group inotify object: %m");
1440 r = sd_event_source_set_priority(m->cgroup_inotify_event_source, SD_EVENT_PRIORITY_IDLE - 5);
1442 return log_error_errno(r, "Failed to set priority of inotify event source: %m");
1444 (void) sd_event_source_set_description(m->cgroup_inotify_event_source, "cgroup-inotify");
1447 return log_error_errno(EOPNOTSUPP, "Unified cgroup hierarchy not supported: %m");
1449 } else if (m->running_as == MANAGER_SYSTEM) {
1451 /* On the legacy hierarchy we only get
1452 * notifications via cgroup agents. (Which
1453 * isn't really reliable, since it does not
1454 * generate events when control groups with
1455 * children run empty.) */
1457 r = cg_install_release_agent(SYSTEMD_CGROUP_CONTROLLER, ELOGIND_CGROUP_AGENT_PATH);
1459 log_warning_errno(r, "Failed to install release agent, ignoring: %m");
1461 log_debug("Installed release agent.");
1463 log_debug("Release agent already installed.");
1466 /// elogind is not meant to run in systemd init scope
1468 /* 4. Make sure we are in the special "init.scope" unit in the root slice. */
1469 scope_path = strjoina(m->cgroup_root, "/" SPECIAL_INIT_SCOPE);
1470 r = cg_create_and_attach(SYSTEMD_CGROUP_CONTROLLER, scope_path, 0);
1472 if (streq(SYSTEMD_CGROUP_CONTROLLER, "name=elogind"))
1473 // we are our own cgroup controller
1474 scope_path = strjoina("");
1475 else if (streq(m->cgroup_root, "/elogind"))
1476 // root already is our cgroup
1477 scope_path = strjoina(m->cgroup_root);
1479 // we have to create our own group
1480 scope_path = strjoina(m->cgroup_root, "/elogind");
1481 r = cg_create_and_attach(SYSTEMD_CGROUP_CONTROLLER, scope_path, 0);
1484 return log_error_errno(r, "Failed to create %s control group: %m", scope_path);
1485 log_debug_elogind("Created control group \"%s\"", scope_path);
1487 /* also, move all other userspace processes remaining
1488 * in the root cgroup into that scope. */
1489 if (!streq(m->cgroup_root, scope_path)) {
1490 r = cg_migrate(SYSTEMD_CGROUP_CONTROLLER, m->cgroup_root, SYSTEMD_CGROUP_CONTROLLER, scope_path, false);
1492 log_warning_errno(r, "Couldn't move remaining userspace processes, ignoring: %m");
1495 /* 5. And pin it, so that it cannot be unmounted */
1496 safe_close(m->pin_cgroupfs_fd);
1497 m->pin_cgroupfs_fd = open(path, O_RDONLY|O_CLOEXEC|O_DIRECTORY|O_NOCTTY|O_NONBLOCK);
1498 if (m->pin_cgroupfs_fd < 0)
1499 return log_error_errno(errno, "Failed to open pin file: %m");
1501 /* 6. Always enable hierarchical support if it exists... */
1503 (void) cg_set_attribute("memory", "/", "memory.use_hierarchy", "1");
1506 /* 7. Figure out which controllers are supported */
1507 r = cg_mask_supported(&m->cgroup_supported);
1509 return log_error_errno(r, "Failed to determine supported controllers: %m");
1511 for (c = 0; c < _CGROUP_CONTROLLER_MAX; c++)
1512 log_debug("Controller '%s' supported: %s", cgroup_controller_to_string(c), yes_no(m->cgroup_supported & CGROUP_CONTROLLER_TO_MASK(c)));
1517 void manager_shutdown_cgroup(Manager *m, bool delete) {
1520 /* We can't really delete the group, since we are in it. But we can at least trim it. */
1522 if (delete && m->cgroup_root)
1523 (void) cg_trim(SYSTEMD_CGROUP_CONTROLLER, m->cgroup_root, false);
1525 /// elogind does not support the unified hierarchy, yet.
1527 m->cgroup_inotify_wd_unit = hashmap_free(m->cgroup_inotify_wd_unit);
1529 m->cgroup_inotify_event_source = sd_event_source_unref(m->cgroup_inotify_event_source);
1530 m->cgroup_inotify_fd = safe_close(m->cgroup_inotify_fd);
1533 m->pin_cgroupfs_fd = safe_close(m->pin_cgroupfs_fd);
1535 m->cgroup_root = mfree(m->cgroup_root);
1538 /// UNNEEDED by elogind
1540 Unit* manager_get_unit_by_cgroup(Manager *m, const char *cgroup) {
1547 u = hashmap_get(m->cgroup_unit, cgroup);
1551 p = strdupa(cgroup);
1555 e = strrchr(p, '/');
1557 return hashmap_get(m->cgroup_unit, SPECIAL_ROOT_SLICE);
1561 u = hashmap_get(m->cgroup_unit, p);
1567 Unit *manager_get_unit_by_pid_cgroup(Manager *m, pid_t pid) {
1568 _cleanup_free_ char *cgroup = NULL;
1576 r = cg_pid_get_path(SYSTEMD_CGROUP_CONTROLLER, pid, &cgroup);
1580 return manager_get_unit_by_cgroup(m, cgroup);
1583 Unit *manager_get_unit_by_pid(Manager *m, pid_t pid) {
1592 return hashmap_get(m->units, SPECIAL_INIT_SCOPE);
1594 u = hashmap_get(m->watch_pids1, PID_TO_PTR(pid));
1598 u = hashmap_get(m->watch_pids2, PID_TO_PTR(pid));
1602 return manager_get_unit_by_pid_cgroup(m, pid);
1605 int manager_notify_cgroup_empty(Manager *m, const char *cgroup) {
1611 u = manager_get_unit_by_cgroup(m, cgroup);
1615 return unit_notify_cgroup_empty(u);
1618 int unit_get_memory_current(Unit *u, uint64_t *ret) {
1619 _cleanup_free_ char *v = NULL;
1625 if (!u->cgroup_path)
1628 if ((u->cgroup_realized_mask & CGROUP_MASK_MEMORY) == 0)
1631 if (cg_unified() <= 0)
1632 r = cg_get_attribute("memory", u->cgroup_path, "memory.usage_in_bytes", &v);
1634 r = cg_get_attribute("memory", u->cgroup_path, "memory.current", &v);
1640 return safe_atou64(v, ret);
1643 int unit_get_tasks_current(Unit *u, uint64_t *ret) {
1644 _cleanup_free_ char *v = NULL;
1650 if (!u->cgroup_path)
1653 if ((u->cgroup_realized_mask & CGROUP_MASK_PIDS) == 0)
1656 r = cg_get_attribute("pids", u->cgroup_path, "pids.current", &v);
1662 return safe_atou64(v, ret);
1665 static int unit_get_cpu_usage_raw(Unit *u, nsec_t *ret) {
1666 _cleanup_free_ char *v = NULL;
1673 if (!u->cgroup_path)
1676 if ((u->cgroup_realized_mask & CGROUP_MASK_CPUACCT) == 0)
1679 r = cg_get_attribute("cpuacct", u->cgroup_path, "cpuacct.usage", &v);
1685 r = safe_atou64(v, &ns);
1693 int unit_get_cpu_usage(Unit *u, nsec_t *ret) {
1697 r = unit_get_cpu_usage_raw(u, &ns);
1701 if (ns > u->cpuacct_usage_base)
1702 ns -= u->cpuacct_usage_base;
1710 int unit_reset_cpu_usage(Unit *u) {
1716 r = unit_get_cpu_usage_raw(u, &ns);
1718 u->cpuacct_usage_base = 0;
1722 u->cpuacct_usage_base = ns;
1726 bool unit_cgroup_delegate(Unit *u) {
1731 c = unit_get_cgroup_context(u);
1738 void unit_invalidate_cgroup(Unit *u, CGroupMask m) {
1741 if (!UNIT_HAS_CGROUP_CONTEXT(u))
1747 if ((u->cgroup_realized_mask & m) == 0)
1750 u->cgroup_realized_mask &= ~m;
1751 unit_add_to_cgroup_queue(u);
1754 void manager_invalidate_startup_units(Manager *m) {
1760 SET_FOREACH(u, m->startup_units, i)
1761 unit_invalidate_cgroup(u, CGROUP_MASK_CPU|CGROUP_MASK_BLKIO);
1764 static const char* const cgroup_device_policy_table[_CGROUP_DEVICE_POLICY_MAX] = {
1765 [CGROUP_AUTO] = "auto",
1766 [CGROUP_CLOSED] = "closed",
1767 [CGROUP_STRICT] = "strict",
1770 DEFINE_STRING_TABLE_LOOKUP(cgroup_device_policy, CGroupDevicePolicy);