/*-*- Mode: C; c-basic-offset: 8; indent-tabs-mode: nil -*-*/

/***
  This file is part of systemd.

  Copyright 2013 Lennart Poettering

  systemd is free software; you can redistribute it and/or modify it
  under the terms of the GNU Lesser General Public License as published by
  the Free Software Foundation; either version 2.1 of the License, or
  (at your option) any later version.

  systemd is distributed in the hope that it will be useful, but
  WITHOUT ANY WARRANTY; without even the implied warranty of
  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
  Lesser General Public License for more details.

  You should have received a copy of the GNU Lesser General Public License
  along with systemd; If not, see <http://www.gnu.org/licenses/>.
***/
#include "alloc-util.h"
#include "cgroup-util.h"
#include "parse-util.h"
#include "path-util.h"
#include "process-util.h"
//#include "special.h"
#include "string-table.h"
#include "string-util.h"

#define CGROUP_CPU_QUOTA_PERIOD_USEC ((usec_t) 100 * USEC_PER_MSEC)
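/* Illustration of how this period is used (example values, not taken
 * from any caller in this file): cgroup_context_apply() below writes
 * this period to cpu.cfs_period_us and scales CPUQuotaPerSecUsec down
 * to a single period for cpu.cfs_quota_us:
 *
 *     quota_us = cpu_quota_per_sec_usec * CGROUP_CPU_QUOTA_PERIOD_USEC / USEC_PER_SEC;
 *
 * e.g. a quota of 50% (cpu_quota_per_sec_usec == 500000) yields
 * 500000 * 100000 / 1000000 == 50000, i.e. 50ms of CPU time per
 * 100ms period. */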
#if 0 /// UNNEEDED by elogind
void cgroup_context_init(CGroupContext *c) {

        /* Initialize everything to the kernel defaults, assuming the
         * structure is preinitialized to 0 */

        c->cpu_shares = CGROUP_CPU_SHARES_INVALID;
        c->startup_cpu_shares = CGROUP_CPU_SHARES_INVALID;
        c->cpu_quota_per_sec_usec = USEC_INFINITY;

        c->memory_limit = (uint64_t) -1;

        c->blockio_weight = CGROUP_BLKIO_WEIGHT_INVALID;
        c->startup_blockio_weight = CGROUP_BLKIO_WEIGHT_INVALID;

        c->tasks_max = (uint64_t) -1;
}
void cgroup_context_free_device_allow(CGroupContext *c, CGroupDeviceAllow *a) {
        LIST_REMOVE(device_allow, c->device_allow, a);
        free(a->path);
        free(a);
}

void cgroup_context_free_blockio_device_weight(CGroupContext *c, CGroupBlockIODeviceWeight *w) {
        LIST_REMOVE(device_weights, c->blockio_device_weights, w);
        free(w->path);
        free(w);
}

void cgroup_context_free_blockio_device_bandwidth(CGroupContext *c, CGroupBlockIODeviceBandwidth *b) {
        LIST_REMOVE(device_bandwidths, c->blockio_device_bandwidths, b);
        free(b->path);
        free(b);
}

void cgroup_context_done(CGroupContext *c) {
        while (c->blockio_device_weights)
                cgroup_context_free_blockio_device_weight(c, c->blockio_device_weights);

        while (c->blockio_device_bandwidths)
                cgroup_context_free_blockio_device_bandwidth(c, c->blockio_device_bandwidths);

        while (c->device_allow)
                cgroup_context_free_device_allow(c, c->device_allow);
}
void cgroup_context_dump(CGroupContext *c, FILE* f, const char *prefix) {
        CGroupBlockIODeviceBandwidth *b;
        CGroupBlockIODeviceWeight *w;
        CGroupDeviceAllow *a;
        char u[FORMAT_TIMESPAN_MAX];

        prefix = strempty(prefix);

        fprintf(f,
                "%sCPUAccounting=%s\n"
                "%sBlockIOAccounting=%s\n"
                "%sMemoryAccounting=%s\n"
                "%sTasksAccounting=%s\n"
                "%sCPUShares=%" PRIu64 "\n"
                "%sStartupCPUShares=%" PRIu64 "\n"
                "%sCPUQuotaPerSecSec=%s\n"
                "%sBlockIOWeight=%" PRIu64 "\n"
                "%sStartupBlockIOWeight=%" PRIu64 "\n"
                "%sMemoryLimit=%" PRIu64 "\n"
                "%sTasksMax=%" PRIu64 "\n"
                "%sDevicePolicy=%s\n"
                "%sDelegate=%s\n",
                prefix, yes_no(c->cpu_accounting),
                prefix, yes_no(c->blockio_accounting),
                prefix, yes_no(c->memory_accounting),
                prefix, yes_no(c->tasks_accounting),
                prefix, c->cpu_shares,
                prefix, c->startup_cpu_shares,
                prefix, format_timespan(u, sizeof(u), c->cpu_quota_per_sec_usec, 1),
                prefix, c->blockio_weight,
                prefix, c->startup_blockio_weight,
                prefix, c->memory_limit,
                prefix, c->tasks_max,
                prefix, cgroup_device_policy_to_string(c->device_policy),
                prefix, yes_no(c->delegate));

        LIST_FOREACH(device_allow, a, c->device_allow)
                fprintf(f,
                        "%sDeviceAllow=%s %s%s%s\n",
                        prefix,
                        a->path,
                        a->r ? "r" : "", a->w ? "w" : "", a->m ? "m" : "");

        LIST_FOREACH(device_weights, w, c->blockio_device_weights)
                fprintf(f,
                        "%sBlockIODeviceWeight=%s %" PRIu64,
                        prefix,
                        w->path,
                        w->weight);

        LIST_FOREACH(device_bandwidths, b, c->blockio_device_bandwidths) {
                char buf[FORMAT_BYTES_MAX];

                fprintf(f,
                        "%s%s=%s %s\n",
                        prefix,
                        b->read ? "BlockIOReadBandwidth" : "BlockIOWriteBandwidth",
                        b->path,
                        format_bytes(buf, sizeof(buf), b->bandwidth));
        }
}
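/* For orientation, the dump above produces output roughly like this
 * (illustrative values only, not taken from a real unit):
 *
 *     CPUAccounting=yes
 *     CPUShares=1024
 *     CPUQuotaPerSecSec=500ms
 *     MemoryLimit=536870912
 *     DeviceAllow=/dev/null rwm
 *     BlockIOReadBandwidth=/dev/sda 5.0M
 */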
static int lookup_blkio_device(const char *p, dev_t *dev) {
        struct stat st;

        if (stat(p, &st) < 0)
                return log_warning_errno(errno, "Couldn't stat device %s: %m", p);

        if (S_ISBLK(st.st_mode))
                *dev = st.st_rdev;
        else if (major(st.st_dev) != 0) {
                /* If this is not a device node then find the block
                 * device this file is stored on */
                *dev = st.st_dev;

                /* If this is a partition, try to get the originating
                 * block device */
                block_get_whole_disk(*dev, dev);
        } else {
                log_warning("%s is not a block device and file system block device cannot be determined or is not local.", p);
                return -ENODEV;
        }

        return 0;
}
static int whitelist_device(const char *path, const char *node, const char *acc) {
        char buf[2+DECIMAL_STR_MAX(dev_t)*2+2+4];
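        /* Sizing note (added for clarity): the buffer must hold the
         * devices.allow payload "c MAJ:MIN rwm" in the worst case,
         * i.e. one type char plus a space (2), two decimal-printed
         * dev_t halves (DECIMAL_STR_MAX(dev_t)*2), the ':' separator
         * plus a space (2), and up to "rwm" plus NUL (4). */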
        struct stat st;
        int r;

        if (stat(node, &st) < 0) {
                log_warning("Couldn't stat device %s", node);
                return -errno;
        }

        if (!S_ISCHR(st.st_mode) && !S_ISBLK(st.st_mode)) {
                log_warning("%s is not a device.", node);
                return -ENODEV;
        }

        sprintf(buf,
                "%c %u:%u %s",
                S_ISCHR(st.st_mode) ? 'c' : 'b',
                major(st.st_rdev), minor(st.st_rdev),
                acc);

        r = cg_set_attribute("devices", path, "devices.allow", buf);
        if (r < 0)
                log_full_errno(IN_SET(r, -ENOENT, -EROFS, -EINVAL, -EACCES) ? LOG_DEBUG : LOG_WARNING, r,
                               "Failed to set devices.allow on %s: %m", path);

        return r;
}
static int whitelist_major(const char *path, const char *name, char type, const char *acc) {
        _cleanup_fclose_ FILE *f = NULL;
        char line[LINE_MAX];
        bool good = false;
        int r;

        assert(type == 'b' || type == 'c');

        f = fopen("/proc/devices", "re");
        if (!f)
                return log_warning_errno(errno, "Cannot open /proc/devices to resolve %s (%c): %m", name, type);

        FOREACH_LINE(line, f, goto fail) {
                char buf[2+DECIMAL_STR_MAX(unsigned)+3+4], *p, *w;
                unsigned maj;

                truncate_nl(line);

                /* Track whether we are inside the section matching
                 * the requested device type; entries outside it are
                 * skipped. */
                if (type == 'c' && streq(line, "Character devices:")) {
                        good = true;
                        continue;
                }
                if (type == 'b' && streq(line, "Block devices:")) {
                        good = true;
                        continue;
                }
                if (isempty(line)) {
                        good = false;
                        continue;
                }
                if (!good)
                        continue;

                /* Parse "MAJOR NAME", then match NAME against the pattern */
                p = strstrip(line);

                w = strpbrk(p, WHITESPACE);
                if (!w)
                        continue;
                *w = 0;

                r = safe_atou(p, &maj);
                if (r < 0 || maj <= 0)
                        continue;

                w++;
                w += strspn(w, WHITESPACE);

                if (fnmatch(name, w, 0) != 0)
                        continue;

                sprintf(buf, "%c %u:* %s", type, maj, acc);

                r = cg_set_attribute("devices", path, "devices.allow", buf);
                if (r < 0)
                        log_full_errno(IN_SET(r, -ENOENT, -EROFS, -EINVAL, -EACCES) ? LOG_DEBUG : LOG_WARNING, r,
                                       "Failed to set devices.allow on %s: %m", path);
        }

        return 0;

fail:
        log_warning_errno(errno, "Failed to read /proc/devices: %m");
        return -errno;
}
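/* For reference, /proc/devices looks like this (excerpt; contents
 * vary by system):
 *
 *     Character devices:
 *       1 mem
 *       5 /dev/tty
 *     136 pts
 *
 *     Block devices:
 *       8 sd
 *
 * so whitelist_major(path, "pts", 'c', "rw") would match major 136
 * here and write "c 136:* rw" to devices.allow. */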
void cgroup_context_apply(CGroupContext *c, CGroupMask mask, const char *path, ManagerState state) {
        bool is_root;
        int r;

        /* Some cgroup attributes are not supported on the root cgroup,
         * hence silently ignore */
        is_root = isempty(path) || path_equal(path, "/");
        if (is_root)
                /* Make sure we don't try to display messages with an empty path. */
                path = "/";

        /* We generally ignore errors caused by read-only mounted
         * cgroup trees (assuming we are running in a container then),
         * and missing cgroups, i.e. EROFS and ENOENT. */

        if ((mask & CGROUP_MASK_CPU) && !is_root) {
                char buf[MAX(DECIMAL_STR_MAX(uint64_t), DECIMAL_STR_MAX(usec_t)) + 1];

                sprintf(buf, "%" PRIu64 "\n",
                        IN_SET(state, MANAGER_STARTING, MANAGER_INITIALIZING) && c->startup_cpu_shares != CGROUP_CPU_SHARES_INVALID ? c->startup_cpu_shares :
                        c->cpu_shares != CGROUP_CPU_SHARES_INVALID ? c->cpu_shares : CGROUP_CPU_SHARES_DEFAULT);
                r = cg_set_attribute("cpu", path, "cpu.shares", buf);
                if (r < 0)
                        log_full_errno(IN_SET(r, -ENOENT, -EROFS, -EACCES) ? LOG_DEBUG : LOG_WARNING, r,
                                       "Failed to set cpu.shares on %s: %m", path);

                sprintf(buf, USEC_FMT "\n", CGROUP_CPU_QUOTA_PERIOD_USEC);
                r = cg_set_attribute("cpu", path, "cpu.cfs_period_us", buf);
                if (r < 0)
                        log_full_errno(IN_SET(r, -ENOENT, -EROFS, -EACCES) ? LOG_DEBUG : LOG_WARNING, r,
                                       "Failed to set cpu.cfs_period_us on %s: %m", path);

                if (c->cpu_quota_per_sec_usec != USEC_INFINITY) {
                        sprintf(buf, USEC_FMT "\n", c->cpu_quota_per_sec_usec * CGROUP_CPU_QUOTA_PERIOD_USEC / USEC_PER_SEC);
                        r = cg_set_attribute("cpu", path, "cpu.cfs_quota_us", buf);
                } else
                        r = cg_set_attribute("cpu", path, "cpu.cfs_quota_us", "-1");
                if (r < 0)
                        log_full_errno(IN_SET(r, -ENOENT, -EROFS, -EACCES) ? LOG_DEBUG : LOG_WARNING, r,
                                       "Failed to set cpu.cfs_quota_us on %s: %m", path);
        }
        if (mask & CGROUP_MASK_BLKIO) {
                char buf[MAX(DECIMAL_STR_MAX(uint64_t)+1,
                             DECIMAL_STR_MAX(dev_t)*2+2+DECIMAL_STR_MAX(uint64_t)+1)];
                CGroupBlockIODeviceWeight *w;
                CGroupBlockIODeviceBandwidth *b;

                sprintf(buf, "%" PRIu64 "\n",
                        IN_SET(state, MANAGER_STARTING, MANAGER_INITIALIZING) && c->startup_blockio_weight != CGROUP_BLKIO_WEIGHT_INVALID ? c->startup_blockio_weight :
                        c->blockio_weight != CGROUP_BLKIO_WEIGHT_INVALID ? c->blockio_weight : CGROUP_BLKIO_WEIGHT_DEFAULT);
                r = cg_set_attribute("blkio", path, "blkio.weight", buf);
                if (r < 0)
                        log_full_errno(IN_SET(r, -ENOENT, -EROFS, -EACCES) ? LOG_DEBUG : LOG_WARNING, r,
                                       "Failed to set blkio.weight on %s: %m", path);

                /* FIXME: no way to reset this list */
                LIST_FOREACH(device_weights, w, c->blockio_device_weights) {
                        dev_t dev;

                        r = lookup_blkio_device(w->path, &dev);
                        if (r < 0)
                                continue;

                        sprintf(buf, "%u:%u %" PRIu64 "\n", major(dev), minor(dev), w->weight);
                        r = cg_set_attribute("blkio", path, "blkio.weight_device", buf);
                        if (r < 0)
                                log_full_errno(IN_SET(r, -ENOENT, -EROFS, -EACCES) ? LOG_DEBUG : LOG_WARNING, r,
                                               "Failed to set blkio.weight_device on %s: %m", path);
                }

                /* FIXME: no way to reset this list */
                LIST_FOREACH(device_bandwidths, b, c->blockio_device_bandwidths) {
                        const char *a;
                        dev_t dev;

                        r = lookup_blkio_device(b->path, &dev);
                        if (r < 0)
                                continue;

                        a = b->read ? "blkio.throttle.read_bps_device" : "blkio.throttle.write_bps_device";

                        sprintf(buf, "%u:%u %" PRIu64 "\n", major(dev), minor(dev), b->bandwidth);
                        r = cg_set_attribute("blkio", path, a, buf);
                        if (r < 0)
                                log_full_errno(IN_SET(r, -ENOENT, -EROFS, -EACCES) ? LOG_DEBUG : LOG_WARNING, r,
                                               "Failed to set %s on %s: %m", a, path);
                }
        }
        if ((mask & CGROUP_MASK_MEMORY) && !is_root) {
                if (c->memory_limit != (uint64_t) -1) {
                        char buf[DECIMAL_STR_MAX(uint64_t) + 1];

                        sprintf(buf, "%" PRIu64 "\n", c->memory_limit);

                        if (cg_unified() <= 0)
                                r = cg_set_attribute("memory", path, "memory.limit_in_bytes", buf);
                        else
                                r = cg_set_attribute("memory", path, "memory.max", buf);
                } else {
                        if (cg_unified() <= 0)
                                r = cg_set_attribute("memory", path, "memory.limit_in_bytes", "-1");
                        else
                                r = cg_set_attribute("memory", path, "memory.max", "max");
                }

                if (r < 0)
                        log_full_errno(IN_SET(r, -ENOENT, -EROFS, -EACCES) ? LOG_DEBUG : LOG_WARNING, r,
                                       "Failed to set memory.limit_in_bytes/memory.max on %s: %m", path);
        }
        if ((mask & CGROUP_MASK_DEVICES) && !is_root) {
                CGroupDeviceAllow *a;

                /* Changing the devices list of a populated cgroup
                 * might result in EINVAL, hence ignore EINVAL here. */

                if (c->device_allow || c->device_policy != CGROUP_AUTO)
                        r = cg_set_attribute("devices", path, "devices.deny", "a");
                else
                        r = cg_set_attribute("devices", path, "devices.allow", "a");
                if (r < 0)
                        log_full_errno(IN_SET(r, -ENOENT, -EROFS, -EINVAL, -EACCES) ? LOG_DEBUG : LOG_WARNING, r,
                                       "Failed to reset devices.list on %s: %m", path);

                if (c->device_policy == CGROUP_CLOSED ||
                    (c->device_policy == CGROUP_AUTO && c->device_allow)) {
                        static const char auto_devices[] =
                                "/dev/null\0" "rwm\0"
                                "/dev/zero\0" "rwm\0"
                                "/dev/full\0" "rwm\0"
                                "/dev/random\0" "rwm\0"
                                "/dev/urandom\0" "rwm\0"
                                "/dev/tty\0" "rwm\0"
                                "/dev/pts/ptmx\0" "rw\0"; /* /dev/pts/ptmx may not be duplicated, but accessed */
                        const char *x, *y;

                        NULSTR_FOREACH_PAIR(x, y, auto_devices)
                                whitelist_device(path, x, y);

                        whitelist_major(path, "pts", 'c', "rw");
                        whitelist_major(path, "kdbus", 'c', "rw");
                        whitelist_major(path, "kdbus/*", 'c', "rw");
                }

                LIST_FOREACH(device_allow, a, c->device_allow) {
                        char acc[4];
                        unsigned k = 0;

                        if (a->r)
                                acc[k++] = 'r';
                        if (a->w)
                                acc[k++] = 'w';
                        if (a->m)
                                acc[k++] = 'm';

                        if (k == 0)
                                continue;

                        acc[k++] = 0;

                        if (startswith(a->path, "/dev/"))
                                whitelist_device(path, a->path, acc);
                        else if (startswith(a->path, "block-"))
                                whitelist_major(path, a->path + 6, 'b', acc);
                        else if (startswith(a->path, "char-"))
                                whitelist_major(path, a->path + 5, 'c', acc);
                        else
                                log_debug("Ignoring device %s while writing cgroup attribute.", a->path);
                }
        }
        if ((mask & CGROUP_MASK_PIDS) && !is_root) {
                if (c->tasks_max != (uint64_t) -1) {
                        char buf[DECIMAL_STR_MAX(uint64_t) + 2];

                        sprintf(buf, "%" PRIu64 "\n", c->tasks_max);
                        r = cg_set_attribute("pids", path, "pids.max", buf);
                } else
                        r = cg_set_attribute("pids", path, "pids.max", "max");

                if (r < 0)
                        log_full_errno(IN_SET(r, -ENOENT, -EROFS, -EACCES) ? LOG_DEBUG : LOG_WARNING, r,
                                       "Failed to set pids.max on %s: %m", path);
        }
}
CGroupMask cgroup_context_get_mask(CGroupContext *c) {
        CGroupMask mask = 0;

        /* Figure out which controllers we need */

        if (c->cpu_accounting ||
            c->cpu_shares != CGROUP_CPU_SHARES_INVALID ||
            c->startup_cpu_shares != CGROUP_CPU_SHARES_INVALID ||
            c->cpu_quota_per_sec_usec != USEC_INFINITY)
                mask |= CGROUP_MASK_CPUACCT | CGROUP_MASK_CPU;

        if (c->blockio_accounting ||
            c->blockio_weight != CGROUP_BLKIO_WEIGHT_INVALID ||
            c->startup_blockio_weight != CGROUP_BLKIO_WEIGHT_INVALID ||
            c->blockio_device_weights ||
            c->blockio_device_bandwidths)
                mask |= CGROUP_MASK_BLKIO;

        if (c->memory_accounting ||
            c->memory_limit != (uint64_t) -1)
                mask |= CGROUP_MASK_MEMORY;

        if (c->device_allow ||
            c->device_policy != CGROUP_AUTO)
                mask |= CGROUP_MASK_DEVICES;

        if (c->tasks_accounting ||
            c->tasks_max != (uint64_t) -1)
                mask |= CGROUP_MASK_PIDS;

        return mask;
}
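/* Example (illustrative, not from the original source): a context
 * with only MemoryLimit= and TasksMax= set yields
 * CGROUP_MASK_MEMORY|CGROUP_MASK_PIDS, so only the "memory" and
 * "pids" controllers need to be realized for the unit's cgroup. */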
CGroupMask unit_get_own_mask(Unit *u) {
        CGroupContext *c;

        /* Returns the mask of controllers the unit needs for itself */

        c = unit_get_cgroup_context(u);
        if (!c)
                return 0;

        /* If delegation is turned on, then turn on all cgroups,
         * unless we are on the legacy hierarchy and the process we
         * fork into it is known to drop privileges, and hence
         * shouldn't get access to the controllers.
         *
         * Note that on the unified hierarchy it is safe to delegate
         * controllers to unprivileged services. */

        if (c->delegate) {
                ExecContext *e;

                e = unit_get_exec_context(u);
                if (!e ||
                    exec_context_maintains_privileges(e) ||
                    cg_unified() > 0)
                        return _CGROUP_MASK_ALL;
        }

        return cgroup_context_get_mask(c);
}
CGroupMask unit_get_members_mask(Unit *u) {

        /* Returns the mask of controllers all of the unit's children
         * require, merged */

        if (u->cgroup_members_mask_valid)
                return u->cgroup_members_mask;

        u->cgroup_members_mask = 0;

        if (u->type == UNIT_SLICE) {
                Unit *member;
                Iterator i;

                SET_FOREACH(member, u->dependencies[UNIT_BEFORE], i) {
                        if (member == u)
                                continue;

                        if (UNIT_DEREF(member->slice) != u)
                                continue;

                        u->cgroup_members_mask |=
                                unit_get_own_mask(member) |
                                unit_get_members_mask(member);
                }
        }

        u->cgroup_members_mask_valid = true;
        return u->cgroup_members_mask;
}
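/* Illustration with a hypothetical unit tree: if a.slice contains
 * b.service, which needs the cpu controller, and c.service, which
 * needs the memory controller, then unit_get_members_mask(a.slice)
 * is the union of both, while each service's own mask remains just
 * what its own context requires. */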
CGroupMask unit_get_siblings_mask(Unit *u) {

        /* Returns the mask of controllers all of the unit's siblings
         * require, i.e. the members mask of the unit's parent slice
         * if there is one. */

        if (UNIT_ISSET(u->slice))
                return unit_get_members_mask(UNIT_DEREF(u->slice));

        return unit_get_own_mask(u) | unit_get_members_mask(u);
}

CGroupMask unit_get_subtree_mask(Unit *u) {

        /* Returns the mask of this subtree, meaning of the group
         * itself and its children. */

        return unit_get_own_mask(u) | unit_get_members_mask(u);
}

CGroupMask unit_get_target_mask(Unit *u) {
        CGroupMask mask;

        /* This returns the cgroup mask of all controllers to enable
         * for a specific cgroup, i.e. everything it needs itself,
         * plus all that its children need, plus all that its siblings
         * need. This is primarily useful on the legacy cgroup
         * hierarchy, where we need to duplicate each cgroup in each
         * hierarchy that shall be enabled for it. */

        mask = unit_get_own_mask(u) | unit_get_members_mask(u) | unit_get_siblings_mask(u);
        mask &= u->manager->cgroup_supported;

        return mask;
}

CGroupMask unit_get_enable_mask(Unit *u) {
        CGroupMask mask;

        /* This returns the cgroup mask of all controllers to enable
         * for the children of a specific cgroup. This is primarily
         * useful for the unified cgroup hierarchy, where each cgroup
         * controls which controllers are enabled for its children. */

        mask = unit_get_members_mask(u);
        mask &= u->manager->cgroup_supported;

        return mask;
}
/* Recurse from a unit up through its containing slices, propagating
 * mask bits upward. A unit is also member of itself. */
void unit_update_cgroup_members_masks(Unit *u) {
        CGroupMask m;
        bool more;

        /* Calculate subtree mask */
        m = unit_get_subtree_mask(u);

        /* See if anything changed from the previous invocation. If
         * not, we're done. */
        if (u->cgroup_subtree_mask_valid && m == u->cgroup_subtree_mask)
                return;

        more =
                u->cgroup_subtree_mask_valid &&
                ((m & ~u->cgroup_subtree_mask) != 0) &&
                ((~m & u->cgroup_subtree_mask) == 0);

        u->cgroup_subtree_mask = m;
        u->cgroup_subtree_mask_valid = true;

        if (UNIT_ISSET(u->slice)) {
                Unit *s = UNIT_DEREF(u->slice);

                if (more)
                        /* There's more set now than before. We
                         * propagate the new mask to the parent's mask
                         * (not caring if it actually was valid or
                         * not). */
                        s->cgroup_members_mask |= m;
                else
                        /* There's less set now than before (or we
                         * don't know), we need to recalculate
                         * everything, so let's invalidate the
                         * parent's members mask */
                        s->cgroup_members_mask_valid = false;

                /* And now make sure that this change also hits our
                 * grandparents */
                unit_update_cgroup_members_masks(s);
        }
}
static const char *migrate_callback(CGroupMask mask, void *userdata) {
        Unit *u = userdata;

        while (u) {
                if (u->cgroup_path &&
                    u->cgroup_realized &&
                    (u->cgroup_realized_mask & mask) == mask)
                        return u->cgroup_path;

                u = UNIT_DEREF(u->slice);
        }

        return NULL;
}
char *unit_default_cgroup_path(Unit *u) {
        _cleanup_free_ char *escaped = NULL, *slice = NULL;
        int r;

        if (unit_has_name(u, SPECIAL_ROOT_SLICE))
                return strdup(u->manager->cgroup_root);

        if (UNIT_ISSET(u->slice) && !unit_has_name(UNIT_DEREF(u->slice), SPECIAL_ROOT_SLICE)) {
                r = cg_slice_to_path(UNIT_DEREF(u->slice)->id, &slice);
                if (r < 0)
                        return NULL;
        }

        escaped = cg_escape(u->id);
        if (!escaped)
                return NULL;

        if (slice)
                return strjoin(u->manager->cgroup_root, "/", slice, "/", escaped, NULL);
        else
                return strjoin(u->manager->cgroup_root, "/", escaped, NULL);
}
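/* Example (hypothetical names): for a unit "foo.service" placed in
 * "bar.slice" with a cgroup root of "", cg_slice_to_path() maps the
 * slice name to "bar.slice" and the result is "/bar.slice/foo.service".
 * Unit names that would clash with kernel attribute files are escaped
 * by cg_escape(), e.g. by prefixing an underscore. */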
int unit_set_cgroup_path(Unit *u, const char *path) {
        _cleanup_free_ char *p = NULL;
        int r;

        if (path) {
                p = strdup(path);
                if (!p)
                        return -ENOMEM;
        }

        if (streq_ptr(u->cgroup_path, p))
                return 0;

        if (p) {
                r = hashmap_put(u->manager->cgroup_unit, p, u);
                if (r < 0)
                        return r;
        }

        unit_release_cgroup(u);

        u->cgroup_path = p;
        p = NULL;

        return 0;
}
int unit_watch_cgroup(Unit *u) {
        _cleanup_free_ char *populated = NULL;
        int r;

        if (!u->cgroup_path)
                return 0;

        if (u->cgroup_inotify_wd >= 0)
                return 0;

        /* Only applies to the unified hierarchy */
        r = cg_unified();
        if (r < 0)
                return log_unit_error_errno(u, r, "Failed to detect whether the unified hierarchy is used: %m");
        if (r == 0)
                return 0;

        /* Don't watch the root slice, it's pointless. */
        if (unit_has_name(u, SPECIAL_ROOT_SLICE))
                return 0;

        r = hashmap_ensure_allocated(&u->manager->cgroup_inotify_wd_unit, &trivial_hash_ops);
        if (r < 0)
                return log_oom();

        r = cg_get_path(SYSTEMD_CGROUP_CONTROLLER, u->cgroup_path, "cgroup.populated", &populated);
        if (r < 0)
                return log_oom();

        u->cgroup_inotify_wd = inotify_add_watch(u->manager->cgroup_inotify_fd, populated, IN_MODIFY);
        if (u->cgroup_inotify_wd < 0) {

                if (errno == ENOENT) /* If the directory is already
                                      * gone we don't need to track
                                      * it, so this is not an error */
                        return 0;

                return log_unit_error_errno(u, errno, "Failed to add inotify watch descriptor for control group %s: %m", u->cgroup_path);
        }

        r = hashmap_put(u->manager->cgroup_inotify_wd_unit, INT_TO_PTR(u->cgroup_inotify_wd), u);
        if (r < 0)
                return log_unit_error_errno(u, r, "Failed to add inotify watch descriptor to hash map: %m");

        return 0;
}
static int unit_create_cgroup(
                Unit *u,
                CGroupMask target_mask,
                CGroupMask enable_mask) {

        CGroupContext *c;
        int r;

        c = unit_get_cgroup_context(u);
        if (!c)
                return 0;

        if (!u->cgroup_path) {
                _cleanup_free_ char *path = NULL;

                path = unit_default_cgroup_path(u);
                if (!path)
                        return log_oom();

                r = unit_set_cgroup_path(u, path);
                if (r == -EEXIST)
                        return log_unit_error_errno(u, r, "Control group %s exists already.", path);
                if (r < 0)
                        return log_unit_error_errno(u, r, "Failed to set unit's control group path to %s: %m", path);
        }

        /* First, create our own group */
        r = cg_create_everywhere(u->manager->cgroup_supported, target_mask, u->cgroup_path);
        if (r < 0)
                return log_unit_error_errno(u, r, "Failed to create cgroup %s: %m", u->cgroup_path);

        /* Start watching it */
        (void) unit_watch_cgroup(u);

        /* Enable all controllers we need */
        r = cg_enable_everywhere(u->manager->cgroup_supported, enable_mask, u->cgroup_path);
        if (r < 0)
                log_unit_warning_errno(u, r, "Failed to enable controllers on cgroup %s, ignoring: %m", u->cgroup_path);

        /* Keep track that this is now realized */
        u->cgroup_realized = true;
        u->cgroup_realized_mask = target_mask;

        if (u->type != UNIT_SLICE && !c->delegate) {

                /* Then, possibly move things over, but not if
                 * subgroups may contain processes, which is the case
                 * for slice and delegation units. */
                r = cg_migrate_everywhere(u->manager->cgroup_supported, u->cgroup_path, u->cgroup_path, migrate_callback, u);
                if (r < 0)
                        log_unit_warning_errno(u, r, "Failed to migrate cgroup to %s, ignoring: %m", u->cgroup_path);
        }

        return 0;
}
int unit_attach_pids_to_cgroup(Unit *u) {
        int r;

        r = unit_realize_cgroup(u);
        if (r < 0)
                return r;

        r = cg_attach_many_everywhere(u->manager->cgroup_supported, u->cgroup_path, u->pids, migrate_callback, u);
        if (r < 0)
                return r;

        return 0;
}

static bool unit_has_mask_realized(Unit *u, CGroupMask target_mask) {
        return u->cgroup_realized && u->cgroup_realized_mask == target_mask;
}

/* Check if necessary controllers and attributes for a unit are in place.
 *
 * If not, create paths, move processes over, and set attributes.
 *
 * Returns 0 on success and < 0 on failure. */
static int unit_realize_cgroup_now(Unit *u, ManagerState state) {
        CGroupMask target_mask, enable_mask;
        int r;

        if (u->in_cgroup_queue) {
                LIST_REMOVE(cgroup_queue, u->manager->cgroup_queue, u);
                u->in_cgroup_queue = false;
        }

        target_mask = unit_get_target_mask(u);
        if (unit_has_mask_realized(u, target_mask))
                return 0;

        /* First, realize parents */
        if (UNIT_ISSET(u->slice)) {
                r = unit_realize_cgroup_now(UNIT_DEREF(u->slice), state);
                if (r < 0)
                        return r;
        }

        /* And then do the real work */
        enable_mask = unit_get_enable_mask(u);
        r = unit_create_cgroup(u, target_mask, enable_mask);
        if (r < 0)
                return r;

        /* Finally, apply the necessary attributes. */
        cgroup_context_apply(unit_get_cgroup_context(u), target_mask, u->cgroup_path, state);

        return 0;
}
static void unit_add_to_cgroup_queue(Unit *u) {

        if (u->in_cgroup_queue)
                return;

        LIST_PREPEND(cgroup_queue, u->manager->cgroup_queue, u);
        u->in_cgroup_queue = true;
}

unsigned manager_dispatch_cgroup_queue(Manager *m) {
        ManagerState state;
        unsigned n = 0;
        Unit *i;
        int r;

        state = manager_state(m);

        while ((i = m->cgroup_queue)) {
                assert(i->in_cgroup_queue);

                r = unit_realize_cgroup_now(i, state);
                if (r < 0)
                        log_warning_errno(r, "Failed to realize cgroups for queued unit %s, ignoring: %m", i->id);

                n++;
        }

        return n;
}
static void unit_queue_siblings(Unit *u) {
        Unit *slice;

        /* This adds the siblings of the specified unit and the
         * siblings of all parent units to the cgroup queue. (But
         * neither the specified unit itself nor the parents.) */

        while ((slice = UNIT_DEREF(u->slice))) {
                Iterator i;
                Unit *m;

                SET_FOREACH(m, slice->dependencies[UNIT_BEFORE], i) {
                        if (m == u)
                                continue;

                        /* Skip units that have a dependency on the slice
                         * but aren't actually in it. */
                        if (UNIT_DEREF(m->slice) != slice)
                                continue;

                        /* No point in doing cgroup application for units
                         * without active processes. */
                        if (UNIT_IS_INACTIVE_OR_FAILED(unit_active_state(m)))
                                continue;

                        /* If the unit doesn't need any new controllers
                         * and has current ones realized, it doesn't need
                         * any changes. */
                        if (unit_has_mask_realized(m, unit_get_target_mask(m)))
                                continue;

                        unit_add_to_cgroup_queue(m);
                }

                u = slice;
        }
}
int unit_realize_cgroup(Unit *u) {

        if (!UNIT_HAS_CGROUP_CONTEXT(u))
                return 0;

        /* So, here's the deal: when realizing the cgroups for this
         * unit, we need to first create all parents, but there's more
         * actually: for the weight-based controllers we also need to
         * make sure that all our siblings (i.e. units that are in the
         * same slice as we are) have cgroups, too. Otherwise, things
         * would become very uneven as each of their processes would
         * get as much resources as all our group together. This call
         * will synchronously create the parent cgroups, but will
         * defer work on the siblings to the next event loop
         * iteration. */

        /* Add all sibling slices to the cgroup queue. */
        unit_queue_siblings(u);

        /* And realize this one now (and apply the values) */
        return unit_realize_cgroup_now(u, manager_state(u->manager));
}
void unit_release_cgroup(Unit *u) {

        /* Forgets all cgroup details for this cgroup */

        if (u->cgroup_path) {
                (void) hashmap_remove(u->manager->cgroup_unit, u->cgroup_path);
                u->cgroup_path = mfree(u->cgroup_path);
        }

        if (u->cgroup_inotify_wd >= 0) {
                if (inotify_rm_watch(u->manager->cgroup_inotify_fd, u->cgroup_inotify_wd) < 0)
                        log_unit_debug_errno(u, errno, "Failed to remove cgroup inotify watch %i for %s, ignoring: %m", u->cgroup_inotify_wd, u->id);

                (void) hashmap_remove(u->manager->cgroup_inotify_wd_unit, INT_TO_PTR(u->cgroup_inotify_wd));
                u->cgroup_inotify_wd = -1;
        }
}
void unit_prune_cgroup(Unit *u) {
        bool is_root_slice;
        int r;

        /* Removes the cgroup, if empty and possible, and stops watching it. */

        if (!u->cgroup_path)
                return;

        is_root_slice = unit_has_name(u, SPECIAL_ROOT_SLICE);

        r = cg_trim_everywhere(u->manager->cgroup_supported, u->cgroup_path, !is_root_slice);
        if (r < 0) {
                log_debug_errno(r, "Failed to destroy cgroup %s, ignoring: %m", u->cgroup_path);
                return;
        }

        if (is_root_slice)
                return;

        unit_release_cgroup(u);

        u->cgroup_realized = false;
        u->cgroup_realized_mask = 0;
}
int unit_search_main_pid(Unit *u, pid_t *ret) {
        _cleanup_fclose_ FILE *f = NULL;
        pid_t pid = 0, npid, mypid;
        int r;

        if (!u->cgroup_path)
                return -ENXIO;

        r = cg_enumerate_processes(SYSTEMD_CGROUP_CONTROLLER, u->cgroup_path, &f);
        if (r < 0)
                return r;

        mypid = getpid();
        while (cg_read_pid(f, &npid) > 0) {
                pid_t ppid;

                if (npid == pid)
                        continue;

                /* Ignore processes that aren't our kids */
                if (get_process_ppid(npid, &ppid) >= 0 && ppid != mypid)
                        continue;

                if (pid != 0)
                        /* Dang, there's more than one daemonized PID
                         * in this group, so we don't know what process
                         * is the main process. */
                        return -ENODATA;

                pid = npid;
        }

        *ret = pid;
        return 0;
}
static int unit_watch_pids_in_path(Unit *u, const char *path) {
        _cleanup_closedir_ DIR *d = NULL;
        _cleanup_fclose_ FILE *f = NULL;
        int ret = 0, r;

        r = cg_enumerate_processes(SYSTEMD_CGROUP_CONTROLLER, path, &f);
        if (r < 0)
                ret = r;
        else {
                pid_t pid;

                while ((r = cg_read_pid(f, &pid)) > 0) {
                        r = unit_watch_pid(u, pid);
                        if (r < 0 && ret >= 0)
                                ret = r;
                }

                if (r < 0 && ret >= 0)
                        ret = r;
        }

        r = cg_enumerate_subgroups(SYSTEMD_CGROUP_CONTROLLER, path, &d);
        if (r < 0) {
                if (ret >= 0)
                        ret = r;
        } else {
                char *fn;

                while ((r = cg_read_subgroup(d, &fn)) > 0) {
                        _cleanup_free_ char *p = NULL;

                        p = strjoin(path, "/", fn, NULL);
                        free(fn);

                        if (!p)
                                return -ENOMEM;

                        r = unit_watch_pids_in_path(u, p);
                        if (r < 0 && ret >= 0)
                                ret = r;
                }

                if (r < 0 && ret >= 0)
                        ret = r;
        }

        return ret;
}
int unit_watch_all_pids(Unit *u) {

        /* Adds all PIDs from our cgroup to the set of PIDs we
         * watch. This is a fallback logic for cases where we do not
         * get reliable cgroup empty notifications: we try to use
         * SIGCHLD as replacement. */

        if (!u->cgroup_path)
                return -ENOENT;

        if (cg_unified() > 0) /* On unified we can use proper notifications */
                return 0;

        return unit_watch_pids_in_path(u, u->cgroup_path);
}
int unit_notify_cgroup_empty(Unit *u) {
        int r;

        if (!u->cgroup_path)
                return 0;

        r = cg_is_empty_recursive(SYSTEMD_CGROUP_CONTROLLER, u->cgroup_path);
        if (r <= 0)
                return r;

        unit_add_to_gc_queue(u);

        if (UNIT_VTABLE(u)->notify_cgroup_empty)
                UNIT_VTABLE(u)->notify_cgroup_empty(u);

        return 0;
}
static int on_cgroup_inotify_event(sd_event_source *s, int fd, uint32_t revents, void *userdata) {
        Manager *m = userdata;

        for (;;) {
                union inotify_event_buffer buffer;
                struct inotify_event *e;
                ssize_t l;

                l = read(fd, &buffer, sizeof(buffer));
                if (l < 0) {
                        if (errno == EINTR || errno == EAGAIN)
                                return 0;

                        return log_error_errno(errno, "Failed to read control group inotify events: %m");
                }

                FOREACH_INOTIFY_EVENT(e, buffer, l) {
                        Unit *u;

                        if (e->wd < 0)
                                /* Queue overflow has no watch descriptor */
                                continue;

                        if (e->mask & IN_IGNORED)
                                /* The watch was just removed */
                                continue;

                        u = hashmap_get(m->cgroup_inotify_wd_unit, INT_TO_PTR(e->wd));
                        if (!u) /* Note that inotify might deliver
                                 * events for a watch even after it
                                 * was removed, because it was queued
                                 * before the removal. Let's ignore
                                 * this here safely. */
                                continue;

                        (void) unit_notify_cgroup_empty(u);
                }
        }
}
int manager_setup_cgroup(Manager *m) {
        _cleanup_free_ char *path = NULL;
        CGroupController c;
        int r, unified;
        char *e;

        /* 1. Determine hierarchy */
        m->cgroup_root = mfree(m->cgroup_root);
        r = cg_pid_get_path(SYSTEMD_CGROUP_CONTROLLER, 0, &m->cgroup_root);
        if (r < 0)
                return log_error_errno(r, "Cannot determine cgroup we are running in: %m");

#if 0 /// elogind does not support systemd scopes and slices
        /* Chop off the init scope, if we are already located in it */
        e = endswith(m->cgroup_root, "/" SPECIAL_INIT_SCOPE);

        /* LEGACY: Also chop off the system slice if we are in
         * it. This is to support live upgrades from older systemd
         * versions where PID 1 was moved there. Also see
         * cg_get_root_path(). */
        if (!e && m->running_as == MANAGER_SYSTEM) {
                e = endswith(m->cgroup_root, "/" SPECIAL_SYSTEM_SLICE);
                if (!e)
                        e = endswith(m->cgroup_root, "/system"); /* even more legacy */
        }
        if (e)
                *e = 0;
#endif // 0

        /* And make sure to store away the root value without trailing
         * slash, even for the root dir, so that we can easily prepend
         * it everywhere. */
        while ((e = endswith(m->cgroup_root, "/")))
                *e = 0;
        log_debug_elogind("Cgroup Controller \"%s\" -> root \"%s\"",
                          SYSTEMD_CGROUP_CONTROLLER, m->cgroup_root);

        /* 2. Show data */
        r = cg_get_path(SYSTEMD_CGROUP_CONTROLLER, m->cgroup_root, NULL, &path);
        if (r < 0)
                return log_error_errno(r, "Cannot find cgroup mount point: %m");

        unified = cg_unified();
        if (unified < 0)
                return log_error_errno(unified, "Couldn't determine if we are running in the unified hierarchy: %m");
        if (unified > 0)
                log_debug("Unified cgroup hierarchy is located at %s.", path);
        else
                log_debug("Using cgroup controller " SYSTEMD_CGROUP_CONTROLLER ". File system hierarchy is at %s.", path);
        const char *scope_path;

        /* 3. Install agent */
        if (unified) {

                /* In the unified hierarchy we can get
                 * cgroup empty notifications via inotify. */

#if 0 /// elogind does not support the unified hierarchy, yet.
                m->cgroup_inotify_event_source = sd_event_source_unref(m->cgroup_inotify_event_source);
                safe_close(m->cgroup_inotify_fd);

                m->cgroup_inotify_fd = inotify_init1(IN_NONBLOCK|IN_CLOEXEC);
                if (m->cgroup_inotify_fd < 0)
                        return log_error_errno(errno, "Failed to create control group inotify object: %m");

                r = sd_event_add_io(m->event, &m->cgroup_inotify_event_source, m->cgroup_inotify_fd, EPOLLIN, on_cgroup_inotify_event, m);
                if (r < 0)
                        return log_error_errno(r, "Failed to watch control group inotify object: %m");

                r = sd_event_source_set_priority(m->cgroup_inotify_event_source, SD_EVENT_PRIORITY_IDLE - 5);
                if (r < 0)
                        return log_error_errno(r, "Failed to set priority of inotify event source: %m");

                (void) sd_event_source_set_description(m->cgroup_inotify_event_source, "cgroup-inotify");
#else
                return log_error_errno(EOPNOTSUPP, "Unified cgroup hierarchy not supported: %m");
#endif // 0

        } else if (m->running_as == MANAGER_SYSTEM) {

                /* On the legacy hierarchy we only get
                 * notifications via cgroup agents. (Which
                 * isn't really reliable, since it does not
                 * generate events when control groups with
                 * children run empty.) */

                r = cg_install_release_agent(SYSTEMD_CGROUP_CONTROLLER, ELOGIND_CGROUP_AGENT_PATH);
                if (r < 0)
                        log_warning_errno(r, "Failed to install release agent, ignoring: %m");
                else if (r > 0)
                        log_debug("Installed release agent.");
                else
                        log_debug("Release agent already installed.");
        }
#if 0 /// elogind is not meant to run in systemd init scope
        /* 4. Make sure we are in the special "init.scope" unit in the root slice. */
        scope_path = strjoina(m->cgroup_root, "/" SPECIAL_INIT_SCOPE);
        r = cg_create_and_attach(SYSTEMD_CGROUP_CONTROLLER, scope_path, 0);
#else
        if (streq(SYSTEMD_CGROUP_CONTROLLER, "name=elogind"))
                // we are our own cgroup controller
                scope_path = strjoina("");
        else if (streq(m->cgroup_root, "/elogind"))
                // root already is our cgroup
                scope_path = strjoina(m->cgroup_root);
        else
                // we have to create our own group
                scope_path = strjoina(m->cgroup_root, "/elogind");
        r = cg_create_and_attach(SYSTEMD_CGROUP_CONTROLLER, scope_path, 0);
#endif // 0
        if (r < 0)
                return log_error_errno(r, "Failed to create %s control group: %m", scope_path);
        log_debug_elogind("Created control group \"%s\"", scope_path);

        /* also, move all other userspace processes remaining
         * in the root cgroup into that scope. */
        if (!streq(m->cgroup_root, scope_path)) {
                r = cg_migrate(SYSTEMD_CGROUP_CONTROLLER, m->cgroup_root, SYSTEMD_CGROUP_CONTROLLER, scope_path, false);
                if (r < 0)
                        log_warning_errno(r, "Couldn't move remaining userspace processes, ignoring: %m");
        }

        /* 5. And pin it, so that it cannot be unmounted */
        safe_close(m->pin_cgroupfs_fd);
        m->pin_cgroupfs_fd = open(path, O_RDONLY|O_CLOEXEC|O_DIRECTORY|O_NOCTTY|O_NONBLOCK);
        if (m->pin_cgroupfs_fd < 0)
                return log_error_errno(errno, "Failed to open pin file: %m");
        /* 6. Always enable hierarchical support if it exists... */
        if (!unified)
                (void) cg_set_attribute("memory", "/", "memory.use_hierarchy", "1");

        /* 7. Figure out which controllers are supported */
        r = cg_mask_supported(&m->cgroup_supported);
        if (r < 0)
                return log_error_errno(r, "Failed to determine supported controllers: %m");

        for (c = 0; c < _CGROUP_CONTROLLER_MAX; c++)
                log_debug("Controller '%s' supported: %s", cgroup_controller_to_string(c), yes_no(m->cgroup_supported & CGROUP_CONTROLLER_TO_MASK(c)));

        return 0;
}
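/* Note on the loop above: CGroupController values are indices, not
 * bit masks, so they have to be converted via
 * CGROUP_CONTROLLER_TO_MASK() (essentially 1U << c) before being
 * tested against cgroup_supported; testing "m->cgroup_supported & c"
 * directly would misreport which controllers are available. */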
void manager_shutdown_cgroup(Manager *m, bool delete) {

        /* We can't really delete the group, since we are in it. But
         * let's trim it. */
        if (delete && m->cgroup_root)
                (void) cg_trim(SYSTEMD_CGROUP_CONTROLLER, m->cgroup_root, false);

#if 0 /// elogind does not support the unified hierarchy, yet.
        m->cgroup_inotify_wd_unit = hashmap_free(m->cgroup_inotify_wd_unit);

        m->cgroup_inotify_event_source = sd_event_source_unref(m->cgroup_inotify_event_source);
        m->cgroup_inotify_fd = safe_close(m->cgroup_inotify_fd);
#endif // 0

        m->pin_cgroupfs_fd = safe_close(m->pin_cgroupfs_fd);

        m->cgroup_root = mfree(m->cgroup_root);
}
#if 0 /// UNNEEDED by elogind
Unit* manager_get_unit_by_cgroup(Manager *m, const char *cgroup) {
        char *p;
        Unit *u;

        u = hashmap_get(m->cgroup_unit, cgroup);
        if (u)
                return u;

        p = strdupa(cgroup);
        for (;;) {
                char *e;

                e = strrchr(p, '/');
                if (!e || e == p)
                        return hashmap_get(m->cgroup_unit, SPECIAL_ROOT_SLICE);

                *e = 0;

                u = hashmap_get(m->cgroup_unit, p);
                if (u)
                        return u;
        }
}
Unit *manager_get_unit_by_pid_cgroup(Manager *m, pid_t pid) {
        _cleanup_free_ char *cgroup = NULL;
        int r;

        if (pid <= 0)
                return NULL;

        r = cg_pid_get_path(SYSTEMD_CGROUP_CONTROLLER, pid, &cgroup);
        if (r < 0)
                return NULL;

        return manager_get_unit_by_cgroup(m, cgroup);
}

Unit *manager_get_unit_by_pid(Manager *m, pid_t pid) {
        Unit *u;

        if (pid <= 0)
                return NULL;

        if (pid == 1)
                return hashmap_get(m->units, SPECIAL_INIT_SCOPE);

        u = hashmap_get(m->watch_pids1, PID_TO_PTR(pid));
        if (u)
                return u;

        u = hashmap_get(m->watch_pids2, PID_TO_PTR(pid));
        if (u)
                return u;

        return manager_get_unit_by_pid_cgroup(m, pid);
}

int manager_notify_cgroup_empty(Manager *m, const char *cgroup) {
        Unit *u;

        u = manager_get_unit_by_cgroup(m, cgroup);
        if (!u)
                return 0;

        return unit_notify_cgroup_empty(u);
}
int unit_get_memory_current(Unit *u, uint64_t *ret) {
        _cleanup_free_ char *v = NULL;
        int r;

        if (!u->cgroup_path)
                return -ENODATA;

        if ((u->cgroup_realized_mask & CGROUP_MASK_MEMORY) == 0)
                return -ENODATA;

        if (cg_unified() <= 0)
                r = cg_get_attribute("memory", u->cgroup_path, "memory.usage_in_bytes", &v);
        else
                r = cg_get_attribute("memory", u->cgroup_path, "memory.current", &v);
        if (r == -ENOENT)
                return -ENODATA;
        if (r < 0)
                return r;

        return safe_atou64(v, ret);
}

int unit_get_tasks_current(Unit *u, uint64_t *ret) {
        _cleanup_free_ char *v = NULL;
        int r;

        if (!u->cgroup_path)
                return -ENODATA;

        if ((u->cgroup_realized_mask & CGROUP_MASK_PIDS) == 0)
                return -ENODATA;

        r = cg_get_attribute("pids", u->cgroup_path, "pids.current", &v);
        if (r == -ENOENT)
                return -ENODATA;
        if (r < 0)
                return r;

        return safe_atou64(v, ret);
}
static int unit_get_cpu_usage_raw(Unit *u, nsec_t *ret) {
        _cleanup_free_ char *v = NULL;
        uint64_t ns;
        int r;

        if (!u->cgroup_path)
                return -ENODATA;

        if ((u->cgroup_realized_mask & CGROUP_MASK_CPUACCT) == 0)
                return -ENODATA;

        r = cg_get_attribute("cpuacct", u->cgroup_path, "cpuacct.usage", &v);
        if (r == -ENOENT)
                return -ENODATA;
        if (r < 0)
                return r;

        r = safe_atou64(v, &ns);
        if (r < 0)
                return r;

        *ret = ns;
        return 0;
}

int unit_get_cpu_usage(Unit *u, nsec_t *ret) {
        nsec_t ns;
        int r;

        r = unit_get_cpu_usage_raw(u, &ns);
        if (r < 0)
                return r;

        if (ns > u->cpuacct_usage_base)
                ns -= u->cpuacct_usage_base;
        else
                ns = 0;

        *ret = ns;
        return 0;
}
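/* Usage pattern (sketch): unit_reset_cpu_usage() below stores the
 * current raw cpuacct counter as cpuacct_usage_base, so a later
 * unit_get_cpu_usage() reports only the CPU time consumed since the
 * reset, e.g. since the unit was last started. */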
int unit_reset_cpu_usage(Unit *u) {
        nsec_t ns;
        int r;

        r = unit_get_cpu_usage_raw(u, &ns);
        if (r < 0) {
                u->cpuacct_usage_base = 0;
                return r;
        }

        u->cpuacct_usage_base = ns;
        return 0;
}
bool unit_cgroup_delegate(Unit *u) {
        CGroupContext *c;

        c = unit_get_cgroup_context(u);
        if (!c)
                return false;

        return c->delegate;
}

void unit_invalidate_cgroup(Unit *u, CGroupMask m) {

        if (!UNIT_HAS_CGROUP_CONTEXT(u))
                return;

        if ((u->cgroup_realized_mask & m) == 0)
                return;

        u->cgroup_realized_mask &= ~m;
        unit_add_to_cgroup_queue(u);
}

void manager_invalidate_startup_units(Manager *m) {
        Iterator i;
        Unit *u;

        SET_FOREACH(u, m->startup_units, i)
                unit_invalidate_cgroup(u, CGROUP_MASK_CPU|CGROUP_MASK_BLKIO);
}
static const char* const cgroup_device_policy_table[_CGROUP_DEVICE_POLICY_MAX] = {
        [CGROUP_AUTO] = "auto",
        [CGROUP_CLOSED] = "closed",
        [CGROUP_STRICT] = "strict",
};

DEFINE_STRING_TABLE_LOOKUP(cgroup_device_policy, CGroupDevicePolicy);