1 /* SPDX-License-Identifier: LGPL-2.1+ */
3 This file is part of systemd.
5 Copyright 2013 Lennart Poettering
7 systemd is free software; you can redistribute it and/or modify it
8 under the terms of the GNU Lesser General Public License as published by
9 the Free Software Foundation; either version 2.1 of the License, or
10 (at your option) any later version.
12 systemd is distributed in the hope that it will be useful, but
13 WITHOUT ANY WARRANTY; without even the implied warranty of
14 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
15 Lesser General Public License for more details.
17 You should have received a copy of the GNU Lesser General Public License
18 along with systemd; If not, see <http://www.gnu.org/licenses/>.
24 #include "alloc-util.h"
25 //#include "blockdev-util.h"
26 //#include "bpf-firewall.h"
27 #include "cgroup-util.h"
32 #include "parse-util.h"
33 #include "path-util.h"
34 #include "process-util.h"
35 //#include "procfs-util.h"
36 //#include "special.h"
37 #include "stdio-util.h"
38 #include "string-table.h"
39 #include "string-util.h"
/* Fixed CFS period used when translating CPUQuota= into cpu.max /
 * cpu.cfs_period_us: 100ms, expressed in microseconds. */
#define CGROUP_CPU_QUOTA_PERIOD_USEC ((usec_t) 100 * USEC_PER_MSEC)
/* Returns true if 'u' manages the root cgroup, i.e. its cgroup path is empty
 * or "/". NOTE(review): this listing is elided — asserts and the closing
 * brace of this function are not visible here. */
bool unit_has_root_cgroup(Unit *u) {

        /* Returns whether this unit manages the root cgroup. Note that this is different from being named "-.slice",
         * as inside of containers the root slice won't be identical to the root cgroup. */

        return isempty(u->cgroup_path) || path_equal(u->cgroup_path, "/");
#if 0 /// UNNEEDED by elogind
/* Emit a single, one-time warning the first time legacy<->unified cgroup
 * setting translation happens; subsequent calls are no-ops.
 * NOTE(review): the early 'return;' taken when the warning was already
 * emitted is elided in this listing. */
static void cgroup_compat_warn(void) {
        static bool cgroup_compat_warned = false;

        if (cgroup_compat_warned)

        log_warning("cgroup compatibility translation between legacy and unified hierarchy settings activated. See cgroup-compat debug messages for details.");
        cgroup_compat_warned = true;
/* Log a cgroup-compat translation: raise the one-time global warning, then
 * emit a per-unit debug message with the details.
 * NOTE(review): the trailing '} while (0)' of this macro is elided here. */
#define log_cgroup_compat(unit, fmt, ...) do { \
        cgroup_compat_warn(); \
        log_unit_debug(unit, "cgroup-compat: " fmt, ##__VA_ARGS__); \
71 void cgroup_context_init(CGroupContext *c) {
74 /* Initialize everything to the kernel defaults, assuming the
75 * structure is preinitialized to 0 */
77 c->cpu_weight = CGROUP_WEIGHT_INVALID;
78 c->startup_cpu_weight = CGROUP_WEIGHT_INVALID;
79 c->cpu_quota_per_sec_usec = USEC_INFINITY;
81 c->cpu_shares = CGROUP_CPU_SHARES_INVALID;
82 c->startup_cpu_shares = CGROUP_CPU_SHARES_INVALID;
84 c->memory_high = CGROUP_LIMIT_MAX;
85 c->memory_max = CGROUP_LIMIT_MAX;
86 c->memory_swap_max = CGROUP_LIMIT_MAX;
88 c->memory_limit = CGROUP_LIMIT_MAX;
90 c->io_weight = CGROUP_WEIGHT_INVALID;
91 c->startup_io_weight = CGROUP_WEIGHT_INVALID;
93 c->blockio_weight = CGROUP_BLKIO_WEIGHT_INVALID;
94 c->startup_blockio_weight = CGROUP_BLKIO_WEIGHT_INVALID;
96 c->tasks_max = (uint64_t) -1;
/* Unlink 'a' from the context's DeviceAllow= list.
 * NOTE(review): the free() of the entry's path and of the entry itself is
 * elided in this listing. */
void cgroup_context_free_device_allow(CGroupContext *c, CGroupDeviceAllow *a) {
        LIST_REMOVE(device_allow, c->device_allow, a);
/* Unlink 'w' from the context's IODeviceWeight= list.
 * NOTE(review): the free() of the entry is elided in this listing. */
void cgroup_context_free_io_device_weight(CGroupContext *c, CGroupIODeviceWeight *w) {
        LIST_REMOVE(device_weights, c->io_device_weights, w);
/* Unlink 'l' from the context's IO*Max= (device limit) list.
 * NOTE(review): the free() of the entry is elided in this listing. */
void cgroup_context_free_io_device_limit(CGroupContext *c, CGroupIODeviceLimit *l) {
        LIST_REMOVE(device_limits, c->io_device_limits, l);
/* Unlink 'w' from the context's BlockIODeviceWeight= list.
 * NOTE(review): the free() of the entry is elided in this listing. */
void cgroup_context_free_blockio_device_weight(CGroupContext *c, CGroupBlockIODeviceWeight *w) {
        LIST_REMOVE(device_weights, c->blockio_device_weights, w);
/* Unlink 'b' from the context's BlockIO{Read|Write}Bandwidth= list.
 * NOTE(review): the free() of the entry is elided in this listing. */
void cgroup_context_free_blockio_device_bandwidth(CGroupContext *c, CGroupBlockIODeviceBandwidth *b) {
        LIST_REMOVE(device_bandwidths, c->blockio_device_bandwidths, b);
/* Release all dynamically allocated members of a CGroupContext: the IO and
 * blkio device lists, the device-allow list and both IP access lists. The
 * struct itself is owned by the caller and is not freed. */
void cgroup_context_done(CGroupContext *c) {

        while (c->io_device_weights)
                cgroup_context_free_io_device_weight(c, c->io_device_weights);

        while (c->io_device_limits)
                cgroup_context_free_io_device_limit(c, c->io_device_limits);

        while (c->blockio_device_weights)
                cgroup_context_free_blockio_device_weight(c, c->blockio_device_weights);

        while (c->blockio_device_bandwidths)
                cgroup_context_free_blockio_device_bandwidth(c, c->blockio_device_bandwidths);

        while (c->device_allow)
                cgroup_context_free_device_allow(c, c->device_allow);

        /* ip_address_access_free_all() returns NULL, resetting the heads. */
        c->ip_address_allow = ip_address_access_free_all(c->ip_address_allow);
        c->ip_address_deny = ip_address_access_free_all(c->ip_address_deny);
/* Dump the complete cgroup configuration of 'c' to stream 'f', one
 * "Key=value" line per setting, each line prefixed with 'prefix' (caller-
 * chosen indentation; NULL is treated as "").
 * NOTE(review): this listing is elided — asserts, the 'fprintf(f,' opener of
 * the big format string below, and several fprintf calls inside the loops
 * are not visible here. */
void cgroup_context_dump(CGroupContext *c, FILE* f, const char *prefix) {
        CGroupIODeviceLimit *il;
        CGroupIODeviceWeight *iw;
        CGroupBlockIODeviceBandwidth *b;
        CGroupBlockIODeviceWeight *w;
        CGroupDeviceAllow *a;
        IPAddressAccessItem *iaai;
        char u[FORMAT_TIMESPAN_MAX]; /* scratch buffer for format_timespan() */

        prefix = strempty(prefix);

        /* One format directive pair (format string line + argument line below)
         * per scalar setting. */
                "%sCPUAccounting=%s\n"
                "%sIOAccounting=%s\n"
                "%sBlockIOAccounting=%s\n"
                "%sMemoryAccounting=%s\n"
                "%sTasksAccounting=%s\n"
                "%sIPAccounting=%s\n"
                "%sCPUWeight=%" PRIu64 "\n"
                "%sStartupCPUWeight=%" PRIu64 "\n"
                "%sCPUShares=%" PRIu64 "\n"
                "%sStartupCPUShares=%" PRIu64 "\n"
                "%sCPUQuotaPerSecSec=%s\n"
                "%sIOWeight=%" PRIu64 "\n"
                "%sStartupIOWeight=%" PRIu64 "\n"
                "%sBlockIOWeight=%" PRIu64 "\n"
                "%sStartupBlockIOWeight=%" PRIu64 "\n"
                "%sMemoryLow=%" PRIu64 "\n"
                "%sMemoryHigh=%" PRIu64 "\n"
                "%sMemoryMax=%" PRIu64 "\n"
                "%sMemorySwapMax=%" PRIu64 "\n"
                "%sMemoryLimit=%" PRIu64 "\n"
                "%sTasksMax=%" PRIu64 "\n"
                "%sDevicePolicy=%s\n"
                prefix, yes_no(c->cpu_accounting),
                prefix, yes_no(c->io_accounting),
                prefix, yes_no(c->blockio_accounting),
                prefix, yes_no(c->memory_accounting),
                prefix, yes_no(c->tasks_accounting),
                prefix, yes_no(c->ip_accounting),
                prefix, c->cpu_weight,
                prefix, c->startup_cpu_weight,
                prefix, c->cpu_shares,
                prefix, c->startup_cpu_shares,
                prefix, format_timespan(u, sizeof(u), c->cpu_quota_per_sec_usec, 1),
                prefix, c->io_weight,
                prefix, c->startup_io_weight,
                prefix, c->blockio_weight,
                prefix, c->startup_blockio_weight,
                prefix, c->memory_low,
                prefix, c->memory_high,
                prefix, c->memory_max,
                prefix, c->memory_swap_max,
                prefix, c->memory_limit,
                prefix, c->tasks_max,
                prefix, cgroup_device_policy_to_string(c->device_policy),
                prefix, yes_no(c->delegate));

        /* Render the delegated controller mask as a string (best effort). */
        _cleanup_free_ char *t = NULL;

        (void) cg_mask_to_string(c->delegate_controllers, &t);

        fprintf(f, "%sDelegateControllers=%s\n",

        /* One line per DeviceAllow= entry: path plus r/w/m access flags. */
        LIST_FOREACH(device_allow, a, c->device_allow)
                "%sDeviceAllow=%s %s%s%s\n",
                a->r ? "r" : "", a->w ? "w" : "", a->m ? "m" : "");

        LIST_FOREACH(device_weights, iw, c->io_device_weights)
                "%sIODeviceWeight=%s %" PRIu64,

        /* Dump only the limits that differ from their defaults. */
        LIST_FOREACH(device_limits, il, c->io_device_limits) {
                char buf[FORMAT_BYTES_MAX];
                CGroupIOLimitType type;

                for (type = 0; type < _CGROUP_IO_LIMIT_TYPE_MAX; type++)
                        if (il->limits[type] != cgroup_io_limit_defaults[type])
                                cgroup_io_limit_type_to_string(type),
                                format_bytes(buf, sizeof(buf), il->limits[type]));

        LIST_FOREACH(device_weights, w, c->blockio_device_weights)
                "%sBlockIODeviceWeight=%s %" PRIu64,

        LIST_FOREACH(device_bandwidths, b, c->blockio_device_bandwidths) {
                char buf[FORMAT_BYTES_MAX];

                if (b->rbps != CGROUP_LIMIT_MAX)
                        "%sBlockIOReadBandwidth=%s %s\n",
                        format_bytes(buf, sizeof(buf), b->rbps));
                if (b->wbps != CGROUP_LIMIT_MAX)
                        "%sBlockIOWriteBandwidth=%s %s\n",
                        format_bytes(buf, sizeof(buf), b->wbps));

        /* IP allow/deny lists, one CIDR entry per line. */
        LIST_FOREACH(items, iaai, c->ip_address_allow) {
                _cleanup_free_ char *k = NULL;

                (void) in_addr_to_string(iaai->family, &iaai->address, &k);
                fprintf(f, "%sIPAddressAllow=%s/%u\n", prefix, strnull(k), iaai->prefixlen);

        LIST_FOREACH(items, iaai, c->ip_address_deny) {
                _cleanup_free_ char *k = NULL;

                (void) in_addr_to_string(iaai->family, &iaai->address, &k);
                fprintf(f, "%sIPAddressDeny=%s/%u\n", prefix, strnull(k), iaai->prefixlen);
/* Resolve path 'p' to the dev_t of the block device backing it. For a block
 * device node the node itself is used; for a regular file the device of the
 * containing filesystem is used, resolved to the whole disk if possible.
 * Returns < 0 on failure.
 * NOTE(review): this listing is elided — the stat() call, the assignments to
 * *dev and the success/failure return paths are not visible here. */
static int lookup_block_device(const char *p, dev_t *dev) {

                return log_warning_errno(errno, "Couldn't stat device %s: %m", p);

        if (S_ISBLK(st.st_mode))
        else if (major(st.st_dev) != 0) {
                /* If this is not a device node then find the block
                 * device this file is stored on */

                /* If this is a partition, try to get the originating
                (void) block_get_whole_disk(*dev, dev);

                log_warning("%s is not a block device and file system block device cannot be determined or is not local.", p);
/* Add a single device node to the cgroup's devices.allow list.
 * 'node' is the device path; a leading '-' marks it optional (silently
 * skipped when missing). 'acc' is the access string ("r", "w", "m" combos).
 * Errors writing the attribute are logged, read-only/missing cgroups only at
 * debug level.
 * NOTE(review): this listing is elided — the xsprintf() composing 'buf' and
 * several return statements are not visible here. */
static int whitelist_device(const char *path, const char *node, const char *acc) {
        /* "c "/"b " + "major:minor" + " " + access flags */
        char buf[2+DECIMAL_STR_MAX(dev_t)*2+2+4];
        bool ignore_notfound;

        if (node[0] == '-') {
                /* Non-existent paths starting with "-" must be silently ignored */

                ignore_notfound = true;
                ignore_notfound = false;

        if (stat(node, &st) < 0) {
                if (errno == ENOENT && ignore_notfound)

                return log_warning_errno(errno, "Couldn't stat device %s: %m", node);

        if (!S_ISCHR(st.st_mode) && !S_ISBLK(st.st_mode)) {
                log_warning("%s is not a device.", node);

                 S_ISCHR(st.st_mode) ? 'c' : 'b',
                 major(st.st_rdev), minor(st.st_rdev),

        r = cg_set_attribute("devices", path, "devices.allow", buf);
                log_full_errno(IN_SET(r, -ENOENT, -EROFS, -EINVAL, -EACCES) ? LOG_DEBUG : LOG_WARNING, r,
                               "Failed to set devices.allow on %s: %m", path);
/* Whitelist all devices whose driver name matches the fnmatch() pattern
 * 'name' for device class 'type' ('b' for block, 'c' for char), by parsing
 * /proc/devices and writing one devices.allow entry per matching major.
 * NOTE(review): this listing is heavily elided — section-state tracking,
 * line trimming, the xsprintf() composing 'buf' and the success return are
 * not visible here. */
static int whitelist_major(const char *path, const char *name, char type, const char *acc) {
        _cleanup_fclose_ FILE *f = NULL;

        assert(IN_SET(type, 'b', 'c'));

        f = fopen("/proc/devices", "re");
                return log_warning_errno(errno, "Cannot open /proc/devices to resolve %s (%c): %m", name, type);

        FOREACH_LINE(line, f, goto fail) {
                char buf[2+DECIMAL_STR_MAX(unsigned)+3+4], *p, *w;

                /* /proc/devices has two sections, headed by these lines. */
                if (type == 'c' && streq(line, "Character devices:")) {

                if (type == 'b' && streq(line, "Block devices:")) {

                /* Split "<major> <driver-name>" into number and name. */
                w = strpbrk(p, WHITESPACE);

                r = safe_atou(p, &maj);

                w += strspn(w, WHITESPACE);

                if (fnmatch(name, w, 0) != 0)

        r = cg_set_attribute("devices", path, "devices.allow", buf);
                log_full_errno(IN_SET(r, -ENOENT, -EROFS, -EINVAL, -EACCES) ? LOG_DEBUG : LOG_WARNING, r,
                               "Failed to set devices.allow on %s: %m", path);

        return log_warning_errno(errno, "Failed to read /proc/devices: %m");
451 static bool cgroup_context_has_cpu_weight(CGroupContext *c) {
452 return c->cpu_weight != CGROUP_WEIGHT_INVALID ||
453 c->startup_cpu_weight != CGROUP_WEIGHT_INVALID;
456 static bool cgroup_context_has_cpu_shares(CGroupContext *c) {
457 return c->cpu_shares != CGROUP_CPU_SHARES_INVALID ||
458 c->startup_cpu_shares != CGROUP_CPU_SHARES_INVALID;
461 static uint64_t cgroup_context_cpu_weight(CGroupContext *c, ManagerState state) {
462 if (IN_SET(state, MANAGER_STARTING, MANAGER_INITIALIZING) &&
463 c->startup_cpu_weight != CGROUP_WEIGHT_INVALID)
464 return c->startup_cpu_weight;
465 else if (c->cpu_weight != CGROUP_WEIGHT_INVALID)
466 return c->cpu_weight;
468 return CGROUP_WEIGHT_DEFAULT;
471 static uint64_t cgroup_context_cpu_shares(CGroupContext *c, ManagerState state) {
472 if (IN_SET(state, MANAGER_STARTING, MANAGER_INITIALIZING) &&
473 c->startup_cpu_shares != CGROUP_CPU_SHARES_INVALID)
474 return c->startup_cpu_shares;
475 else if (c->cpu_shares != CGROUP_CPU_SHARES_INVALID)
476 return c->cpu_shares;
478 return CGROUP_CPU_SHARES_DEFAULT;
/* Write the unified-hierarchy CPU attributes: cpu.weight, and cpu.max built
 * from the per-second quota scaled to the fixed 100ms period ("max" if no
 * quota is set). Write failures are logged; missing/read-only cgroups only
 * at debug level.
 * NOTE(review): this listing is elided — the 'int r;' declaration, the
 * 'if (r < 0)' guards and the 'else' of the quota branch are not visible. */
static void cgroup_apply_unified_cpu_config(Unit *u, uint64_t weight, uint64_t quota) {
        /* Large enough for either "weight\n" or "quota period\n". */
        char buf[MAX(DECIMAL_STR_MAX(uint64_t) + 1, (DECIMAL_STR_MAX(usec_t) + 1) * 2)];

        xsprintf(buf, "%" PRIu64 "\n", weight);
        r = cg_set_attribute("cpu", u->cgroup_path, "cpu.weight", buf);
                log_unit_full(u, IN_SET(r, -ENOENT, -EROFS, -EACCES) ? LOG_DEBUG : LOG_WARNING, r,
                              "Failed to set cpu.weight: %m");

        if (quota != USEC_INFINITY)
                /* Scale the per-second quota down to the 100ms period. */
                xsprintf(buf, USEC_FMT " " USEC_FMT "\n",
                         quota * CGROUP_CPU_QUOTA_PERIOD_USEC / USEC_PER_SEC, CGROUP_CPU_QUOTA_PERIOD_USEC);
                xsprintf(buf, "max " USEC_FMT "\n", CGROUP_CPU_QUOTA_PERIOD_USEC);

        r = cg_set_attribute("cpu", u->cgroup_path, "cpu.max", buf);
                log_unit_full(u, IN_SET(r, -ENOENT, -EROFS, -EACCES) ? LOG_DEBUG : LOG_WARNING, r,
                              "Failed to set cpu.max: %m");
/* Write the legacy-hierarchy CPU attributes: cpu.shares, the fixed
 * cpu.cfs_period_us, and cpu.cfs_quota_us derived from the per-second quota
 * ("-1" to disable when no quota is configured). Write failures are logged;
 * missing/read-only cgroups only at debug level.
 * NOTE(review): this listing is elided — the 'int r;' declaration, the
 * 'if (r < 0)' guards and the 'else' of the quota branch are not visible. */
static void cgroup_apply_legacy_cpu_config(Unit *u, uint64_t shares, uint64_t quota) {
        char buf[MAX(DECIMAL_STR_MAX(uint64_t), DECIMAL_STR_MAX(usec_t)) + 1];

        xsprintf(buf, "%" PRIu64 "\n", shares);
        r = cg_set_attribute("cpu", u->cgroup_path, "cpu.shares", buf);
                log_unit_full(u, IN_SET(r, -ENOENT, -EROFS, -EACCES) ? LOG_DEBUG : LOG_WARNING, r,
                              "Failed to set cpu.shares: %m");

        /* The period is always the fixed 100ms. */
        xsprintf(buf, USEC_FMT "\n", CGROUP_CPU_QUOTA_PERIOD_USEC);
        r = cg_set_attribute("cpu", u->cgroup_path, "cpu.cfs_period_us", buf);
                log_unit_full(u, IN_SET(r, -ENOENT, -EROFS, -EACCES) ? LOG_DEBUG : LOG_WARNING, r,
                              "Failed to set cpu.cfs_period_us: %m");

        if (quota != USEC_INFINITY) {
                /* Scale the per-second quota down to the 100ms period. */
                xsprintf(buf, USEC_FMT "\n", quota * CGROUP_CPU_QUOTA_PERIOD_USEC / USEC_PER_SEC);
                r = cg_set_attribute("cpu", u->cgroup_path, "cpu.cfs_quota_us", buf);
                r = cg_set_attribute("cpu", u->cgroup_path, "cpu.cfs_quota_us", "-1");
                log_unit_full(u, IN_SET(r, -ENOENT, -EROFS, -EACCES) ? LOG_DEBUG : LOG_WARNING, r,
                              "Failed to set cpu.cfs_quota_us: %m");
530 static uint64_t cgroup_cpu_shares_to_weight(uint64_t shares) {
531 return CLAMP(shares * CGROUP_WEIGHT_DEFAULT / CGROUP_CPU_SHARES_DEFAULT,
532 CGROUP_WEIGHT_MIN, CGROUP_WEIGHT_MAX);
535 static uint64_t cgroup_cpu_weight_to_shares(uint64_t weight) {
536 return CLAMP(weight * CGROUP_CPU_SHARES_DEFAULT / CGROUP_WEIGHT_DEFAULT,
537 CGROUP_CPU_SHARES_MIN, CGROUP_CPU_SHARES_MAX);
/* Returns true if any unified-hierarchy IO setting is configured
 * (accounting, weights, per-device weights, ...).
 * NOTE(review): the final operand of this expression (presumably
 * c->io_device_limits) is elided in this listing. */
static bool cgroup_context_has_io_config(CGroupContext *c) {
        return c->io_accounting ||
                c->io_weight != CGROUP_WEIGHT_INVALID ||
                c->startup_io_weight != CGROUP_WEIGHT_INVALID ||
                c->io_device_weights ||
548 static bool cgroup_context_has_blockio_config(CGroupContext *c) {
549 return c->blockio_accounting ||
550 c->blockio_weight != CGROUP_BLKIO_WEIGHT_INVALID ||
551 c->startup_blockio_weight != CGROUP_BLKIO_WEIGHT_INVALID ||
552 c->blockio_device_weights ||
553 c->blockio_device_bandwidths;
/* Pick the effective IO weight: startup weight during manager startup,
 * otherwise the regular weight, default if neither is set.
 * NOTE(review): the 'return c->io_weight;' of the else-if branch is elided
 * in this listing. */
static uint64_t cgroup_context_io_weight(CGroupContext *c, ManagerState state) {
        if (IN_SET(state, MANAGER_STARTING, MANAGER_INITIALIZING) &&
            c->startup_io_weight != CGROUP_WEIGHT_INVALID)
                return c->startup_io_weight;
        else if (c->io_weight != CGROUP_WEIGHT_INVALID)
                return CGROUP_WEIGHT_DEFAULT;
566 static uint64_t cgroup_context_blkio_weight(CGroupContext *c, ManagerState state) {
567 if (IN_SET(state, MANAGER_STARTING, MANAGER_INITIALIZING) &&
568 c->startup_blockio_weight != CGROUP_BLKIO_WEIGHT_INVALID)
569 return c->startup_blockio_weight;
570 else if (c->blockio_weight != CGROUP_BLKIO_WEIGHT_INVALID)
571 return c->blockio_weight;
573 return CGROUP_BLKIO_WEIGHT_DEFAULT;
576 static uint64_t cgroup_weight_blkio_to_io(uint64_t blkio_weight) {
577 return CLAMP(blkio_weight * CGROUP_WEIGHT_DEFAULT / CGROUP_BLKIO_WEIGHT_DEFAULT,
578 CGROUP_WEIGHT_MIN, CGROUP_WEIGHT_MAX);
581 static uint64_t cgroup_weight_io_to_blkio(uint64_t io_weight) {
582 return CLAMP(io_weight * CGROUP_BLKIO_WEIGHT_DEFAULT / CGROUP_WEIGHT_DEFAULT,
583 CGROUP_BLKIO_WEIGHT_MIN, CGROUP_BLKIO_WEIGHT_MAX);
/* Write a per-device entry to the unified io.weight attribute for the block
 * device backing 'dev_path'. Failures to resolve the device or write the
 * attribute are logged (best effort).
 * NOTE(review): this listing is elided — the dev_t/r declarations, the early
 * return after a failed lookup and the 'if (r < 0)' guard are not visible. */
static void cgroup_apply_io_device_weight(Unit *u, const char *dev_path, uint64_t io_weight) {
        /* "major:minor weight\n" */
        char buf[DECIMAL_STR_MAX(dev_t)*2+2+DECIMAL_STR_MAX(uint64_t)+1];

        r = lookup_block_device(dev_path, &dev);

        xsprintf(buf, "%u:%u %" PRIu64 "\n", major(dev), minor(dev), io_weight);
        r = cg_set_attribute("io", u->cgroup_path, "io.weight", buf);
                log_unit_full(u, IN_SET(r, -ENOENT, -EROFS, -EACCES) ? LOG_DEBUG : LOG_WARNING, r,
                              "Failed to set io.weight: %m");
/* Write a per-device entry to the legacy blkio.weight_device attribute for
 * the block device backing 'dev_path'. Failures to resolve the device or
 * write the attribute are logged (best effort).
 * NOTE(review): this listing is elided — the dev_t/r declarations, the early
 * return after a failed lookup and the 'if (r < 0)' guard are not visible. */
static void cgroup_apply_blkio_device_weight(Unit *u, const char *dev_path, uint64_t blkio_weight) {
        /* "major:minor weight\n" */
        char buf[DECIMAL_STR_MAX(dev_t)*2+2+DECIMAL_STR_MAX(uint64_t)+1];

        r = lookup_block_device(dev_path, &dev);

        xsprintf(buf, "%u:%u %" PRIu64 "\n", major(dev), minor(dev), blkio_weight);
        r = cg_set_attribute("blkio", u->cgroup_path, "blkio.weight_device", buf);
                log_unit_full(u, IN_SET(r, -ENOENT, -EROFS, -EACCES) ? LOG_DEBUG : LOG_WARNING, r,
                              "Failed to set blkio.weight_device: %m");
/* Write all four IO limits (rbps/wbps/riops/wiops) for one device as a
 * single io.max line; unset limits are written as "max" or reset to "0"
 * depending on the limit's default. Returns (per the callers at
 * cgroup_context_apply()) whether any non-default limit was set, so the
 * caller can free entries without configuration.
 * NOTE(review): this listing is elided — the dev_t/r declarations, the
 * counter tracking non-default limits and the return statement are not
 * visible here. */
static unsigned cgroup_apply_io_device_limit(Unit *u, const char *dev_path, uint64_t *limits) {
        char limit_bufs[_CGROUP_IO_LIMIT_TYPE_MAX][DECIMAL_STR_MAX(uint64_t)];
        /* "major:minor" + four "key=value" fields */
        char buf[DECIMAL_STR_MAX(dev_t)*2+2+(6+DECIMAL_STR_MAX(uint64_t)+1)*4];
        CGroupIOLimitType type;

        r = lookup_block_device(dev_path, &dev);

        for (type = 0; type < _CGROUP_IO_LIMIT_TYPE_MAX; type++) {
                if (limits[type] != cgroup_io_limit_defaults[type]) {
                        xsprintf(limit_bufs[type], "%" PRIu64, limits[type]);
                        /* Default limit: render as "max" (for max-type defaults) or "0". */
                        xsprintf(limit_bufs[type], "%s", limits[type] == CGROUP_LIMIT_MAX ? "max" : "0");

        xsprintf(buf, "%u:%u rbps=%s wbps=%s riops=%s wiops=%s\n", major(dev), minor(dev),
                 limit_bufs[CGROUP_IO_RBPS_MAX], limit_bufs[CGROUP_IO_WBPS_MAX],
                 limit_bufs[CGROUP_IO_RIOPS_MAX], limit_bufs[CGROUP_IO_WIOPS_MAX]);
        r = cg_set_attribute("io", u->cgroup_path, "io.max", buf);
                log_unit_full(u, IN_SET(r, -ENOENT, -EROFS, -EACCES) ? LOG_DEBUG : LOG_WARNING, r,
                              "Failed to set io.max: %m");
649 static unsigned cgroup_apply_blkio_device_limit(Unit *u, const char *dev_path, uint64_t rbps, uint64_t wbps) {
650 char buf[DECIMAL_STR_MAX(dev_t)*2+2+DECIMAL_STR_MAX(uint64_t)+1];
655 r = lookup_block_device(dev_path, &dev);
659 if (rbps != CGROUP_LIMIT_MAX)
661 sprintf(buf, "%u:%u %" PRIu64 "\n", major(dev), minor(dev), rbps);
662 r = cg_set_attribute("blkio", u->cgroup_path, "blkio.throttle.read_bps_device", buf);
664 log_unit_full(u, IN_SET(r, -ENOENT, -EROFS, -EACCES) ? LOG_DEBUG : LOG_WARNING, r,
665 "Failed to set blkio.throttle.read_bps_device: %m");
667 if (wbps != CGROUP_LIMIT_MAX)
669 sprintf(buf, "%u:%u %" PRIu64 "\n", major(dev), minor(dev), wbps);
670 r = cg_set_attribute("blkio", u->cgroup_path, "blkio.throttle.write_bps_device", buf);
672 log_unit_full(u, IN_SET(r, -ENOENT, -EROFS, -EACCES) ? LOG_DEBUG : LOG_WARNING, r,
673 "Failed to set blkio.throttle.write_bps_device: %m");
678 static bool cgroup_context_has_unified_memory_config(CGroupContext *c) {
679 return c->memory_low > 0 || c->memory_high != CGROUP_LIMIT_MAX || c->memory_max != CGROUP_LIMIT_MAX || c->memory_swap_max != CGROUP_LIMIT_MAX;
/* Write a single unified-hierarchy memory attribute ('file', e.g.
 * "memory.max") for the unit: the numeric value, or "max" when unlimited.
 * Failures are logged; missing/read-only cgroups only at debug level.
 * NOTE(review): this listing is elided — the 'int r;' declaration and the
 * 'if (r < 0)' guard are not visible here. */
static void cgroup_apply_unified_memory_limit(Unit *u, const char *file, uint64_t v) {
        char buf[DECIMAL_STR_MAX(uint64_t) + 1] = "max"; /* default when v is the no-limit sentinel */

        if (v != CGROUP_LIMIT_MAX)
                xsprintf(buf, "%" PRIu64 "\n", v);

        r = cg_set_attribute("memory", u->cgroup_path, file, buf);
                log_unit_full(u, IN_SET(r, -ENOENT, -EROFS, -EACCES) ? LOG_DEBUG : LOG_WARNING, r,
                              "Failed to set %s: %m", file);
/* Compile and install the unit's BPF firewall program (IPAddressAllow=/
 * IPAddressDeny=). Slices are skipped, since cgroup/BPF is not recursive.
 * NOTE(review): this listing is elided — asserts, return statements and the
 * error check on bpf_firewall_compile() are not visible here. */
static void cgroup_apply_firewall(Unit *u) {

        if (u->type == UNIT_SLICE) /* Skip this for slice units, they are inner cgroup nodes, and since bpf/cgroup is
                                    * not recursive we don't ever touch the bpf on them */

        r = bpf_firewall_compile(u);

        (void) bpf_firewall_install(u);
/* Apply the unit's cgroup configuration to the kernel for every controller
 * selected in 'apply_mask': CPU (unified or legacy), IO, blkio, memory,
 * devices and pids, plus the BPF firewall. On the unified hierarchy, legacy
 * settings are translated to their unified equivalents and vice versa (with
 * a one-time compat warning). Errors caused by read-only cgroup trees
 * (containers) or missing cgroups (EROFS/ENOENT) are deliberately only
 * logged at debug level.
 * NOTE(review): this listing is heavily elided — the 'Unit *u' and
 * 'bool apply_bpf' parameters, several local declarations (is_root, c, path,
 * r, weight, shares, val, acc, x, y), many 'if (r < 0)' guards, else
 * branches, returns and closing braces are not visible here. Indentation
 * below is reconstructed and approximate. */
static void cgroup_context_apply(
                CGroupMask apply_mask,
                ManagerState state) {

        /* Nothing to do? Exit early! */
        if (apply_mask == 0 && !apply_bpf)

        /* Some cgroup attributes are not supported on the root cgroup, hence silently ignore */
        is_root = unit_has_root_cgroup(u);

        assert_se(c = unit_get_cgroup_context(u));
        assert_se(path = u->cgroup_path);

        if (is_root) /* Make sure we don't try to display messages with an empty path. */

        /* We generally ignore errors caused by read-only mounted
         * cgroup trees (assuming we are running in a container then),
         * and missing cgroups, i.e. EROFS and ENOENT. */

        if ((apply_mask & CGROUP_MASK_CPU) && !is_root) {
                bool has_weight, has_shares;

                has_weight = cgroup_context_has_cpu_weight(c);
                has_shares = cgroup_context_has_cpu_shares(c);

                if (cg_all_unified() > 0) {
                        /* Unified hierarchy: use CPUWeight=, translating CPUShares= if only that is set. */
                        weight = cgroup_context_cpu_weight(c, state);
                        else if (has_shares) {
                                uint64_t shares = cgroup_context_cpu_shares(c, state);

                                weight = cgroup_cpu_shares_to_weight(shares);

                                log_cgroup_compat(u, "Applying [Startup]CpuShares %" PRIu64 " as [Startup]CpuWeight %" PRIu64 " on %s",
                                                  shares, weight, path);
                                weight = CGROUP_WEIGHT_DEFAULT;

                        cgroup_apply_unified_cpu_config(u, weight, c->cpu_quota_per_sec_usec);
                        /* Legacy hierarchy: use CPUShares=, translating CPUWeight= if only that is set. */
                        uint64_t weight = cgroup_context_cpu_weight(c, state);

                        shares = cgroup_cpu_weight_to_shares(weight);

                        log_cgroup_compat(u, "Applying [Startup]CpuWeight %" PRIu64 " as [Startup]CpuShares %" PRIu64 " on %s",
                                          weight, shares, path);
                } else if (has_shares)
                        shares = cgroup_context_cpu_shares(c, state);
                        shares = CGROUP_CPU_SHARES_DEFAULT;

                cgroup_apply_legacy_cpu_config(u, shares, c->cpu_quota_per_sec_usec);

        if (apply_mask & CGROUP_MASK_IO) {
                bool has_io = cgroup_context_has_io_config(c);
                bool has_blockio = cgroup_context_has_blockio_config(c);

                        char buf[8+DECIMAL_STR_MAX(uint64_t)+1]; /* "default <weight>\n" */

                                weight = cgroup_context_io_weight(c, state);
                        else if (has_blockio) {
                                /* Only blkio settings present: translate them to io.weight. */
                                uint64_t blkio_weight = cgroup_context_blkio_weight(c, state);

                                weight = cgroup_weight_blkio_to_io(blkio_weight);

                                log_cgroup_compat(u, "Applying [Startup]BlockIOWeight %" PRIu64 " as [Startup]IOWeight %" PRIu64,
                                                  blkio_weight, weight);
                                weight = CGROUP_WEIGHT_DEFAULT;

                        xsprintf(buf, "default %" PRIu64 "\n", weight);
                        r = cg_set_attribute("io", path, "io.weight", buf);
                                log_unit_full(u, IN_SET(r, -ENOENT, -EROFS, -EACCES) ? LOG_DEBUG : LOG_WARNING, r,
                                              "Failed to set io.weight: %m");

                                CGroupIODeviceWeight *w;

                                /* FIXME: no way to reset this list */
                                LIST_FOREACH(device_weights, w, c->io_device_weights)
                                        cgroup_apply_io_device_weight(u, w->path, w->weight);
                        } else if (has_blockio) {
                                CGroupBlockIODeviceWeight *w;

                                /* FIXME: no way to reset this list */
                                LIST_FOREACH(device_weights, w, c->blockio_device_weights) {
                                        weight = cgroup_weight_blkio_to_io(w->weight);

                                        log_cgroup_compat(u, "Applying BlockIODeviceWeight %" PRIu64 " as IODeviceWeight %" PRIu64 " for %s",
                                                          w->weight, weight, w->path);

                                        cgroup_apply_io_device_weight(u, w->path, weight);

                /* Apply limits and free ones without config. */
                        CGroupIODeviceLimit *l, *next;

                        LIST_FOREACH_SAFE(device_limits, l, next, c->io_device_limits) {
                                if (!cgroup_apply_io_device_limit(u, l->path, l->limits))
                                        cgroup_context_free_io_device_limit(c, l);
                } else if (has_blockio) {
                        CGroupBlockIODeviceBandwidth *b, *next;

                        LIST_FOREACH_SAFE(device_bandwidths, b, next, c->blockio_device_bandwidths) {
                                /* Translate legacy rbps/wbps pairs into the io.max limit array. */
                                uint64_t limits[_CGROUP_IO_LIMIT_TYPE_MAX];
                                CGroupIOLimitType type;

                                for (type = 0; type < _CGROUP_IO_LIMIT_TYPE_MAX; type++)
                                        limits[type] = cgroup_io_limit_defaults[type];

                                limits[CGROUP_IO_RBPS_MAX] = b->rbps;
                                limits[CGROUP_IO_WBPS_MAX] = b->wbps;

                                log_cgroup_compat(u, "Applying BlockIO{Read|Write}Bandwidth %" PRIu64 " %" PRIu64 " as IO{Read|Write}BandwidthMax for %s",
                                                  b->rbps, b->wbps, b->path);

                                if (!cgroup_apply_io_device_limit(u, b->path, limits))
                                        cgroup_context_free_blockio_device_bandwidth(c, b);

        if (apply_mask & CGROUP_MASK_BLKIO) {
                bool has_io = cgroup_context_has_io_config(c);
                bool has_blockio = cgroup_context_has_blockio_config(c);

                        char buf[DECIMAL_STR_MAX(uint64_t)+1];

                                /* Only unified IO settings present: translate them to blkio. */
                                uint64_t io_weight = cgroup_context_io_weight(c, state);

                                weight = cgroup_weight_io_to_blkio(cgroup_context_io_weight(c, state));

                                log_cgroup_compat(u, "Applying [Startup]IOWeight %" PRIu64 " as [Startup]BlockIOWeight %" PRIu64,
                        } else if (has_blockio)
                                weight = cgroup_context_blkio_weight(c, state);
                                weight = CGROUP_BLKIO_WEIGHT_DEFAULT;

                        xsprintf(buf, "%" PRIu64 "\n", weight);
                        r = cg_set_attribute("blkio", path, "blkio.weight", buf);
                                log_unit_full(u, IN_SET(r, -ENOENT, -EROFS, -EACCES) ? LOG_DEBUG : LOG_WARNING, r,
                                              "Failed to set blkio.weight: %m");

                                CGroupIODeviceWeight *w;

                                /* FIXME: no way to reset this list */
                                LIST_FOREACH(device_weights, w, c->io_device_weights) {
                                        weight = cgroup_weight_io_to_blkio(w->weight);

                                        log_cgroup_compat(u, "Applying IODeviceWeight %" PRIu64 " as BlockIODeviceWeight %" PRIu64 " for %s",
                                                          w->weight, weight, w->path);

                                        cgroup_apply_blkio_device_weight(u, w->path, weight);
                        } else if (has_blockio) {
                                CGroupBlockIODeviceWeight *w;

                                /* FIXME: no way to reset this list */
                                LIST_FOREACH(device_weights, w, c->blockio_device_weights)
                                        cgroup_apply_blkio_device_weight(u, w->path, w->weight);

                /* Apply limits and free ones without config. */
                        CGroupIODeviceLimit *l, *next;

                        LIST_FOREACH_SAFE(device_limits, l, next, c->io_device_limits) {
                                log_cgroup_compat(u, "Applying IO{Read|Write}Bandwidth %" PRIu64 " %" PRIu64 " as BlockIO{Read|Write}BandwidthMax for %s",
                                                  l->limits[CGROUP_IO_RBPS_MAX], l->limits[CGROUP_IO_WBPS_MAX], l->path);

                                if (!cgroup_apply_blkio_device_limit(u, l->path, l->limits[CGROUP_IO_RBPS_MAX], l->limits[CGROUP_IO_WBPS_MAX]))
                                        cgroup_context_free_io_device_limit(c, l);
                } else if (has_blockio) {
                        CGroupBlockIODeviceBandwidth *b, *next;

                        LIST_FOREACH_SAFE(device_bandwidths, b, next, c->blockio_device_bandwidths)
                                if (!cgroup_apply_blkio_device_limit(u, b->path, b->rbps, b->wbps))
                                        cgroup_context_free_blockio_device_bandwidth(c, b);

        if ((apply_mask & CGROUP_MASK_MEMORY) && !is_root) {
                if (cg_all_unified() > 0) {
                        uint64_t max, swap_max = CGROUP_LIMIT_MAX;

                        if (cgroup_context_has_unified_memory_config(c)) {
                                swap_max = c->memory_swap_max;
                                /* Only legacy MemoryLimit= set: use it as MemoryMax=. */
                                max = c->memory_limit;

                                if (max != CGROUP_LIMIT_MAX)
                                        log_cgroup_compat(u, "Applying MemoryLimit %" PRIu64 " as MemoryMax", max);

                        cgroup_apply_unified_memory_limit(u, "memory.low", c->memory_low);
                        cgroup_apply_unified_memory_limit(u, "memory.high", c->memory_high);
                        cgroup_apply_unified_memory_limit(u, "memory.max", max);
                        cgroup_apply_unified_memory_limit(u, "memory.swap.max", swap_max);
                        /* Legacy hierarchy: collapse everything into memory.limit_in_bytes. */
                        char buf[DECIMAL_STR_MAX(uint64_t) + 1];

                        if (cgroup_context_has_unified_memory_config(c)) {
                                log_cgroup_compat(u, "Applying MemoryMax %" PRIi64 " as MemoryLimit", val);
                                val = c->memory_limit;

                        if (val == CGROUP_LIMIT_MAX)
                                strncpy(buf, "-1\n", sizeof(buf));
                                xsprintf(buf, "%" PRIu64 "\n", val);

                        r = cg_set_attribute("memory", path, "memory.limit_in_bytes", buf);
                                log_unit_full(u, IN_SET(r, -ENOENT, -EROFS, -EACCES) ? LOG_DEBUG : LOG_WARNING, r,
                                              "Failed to set memory.limit_in_bytes: %m");

        if ((apply_mask & CGROUP_MASK_DEVICES) && !is_root) {
                CGroupDeviceAllow *a;

                /* Changing the devices list of a populated cgroup
                 * might result in EINVAL, hence ignore EINVAL

                if (c->device_allow || c->device_policy != CGROUP_AUTO)
                        r = cg_set_attribute("devices", path, "devices.deny", "a");
                        r = cg_set_attribute("devices", path, "devices.allow", "a");
                        log_unit_full(u, IN_SET(r, -ENOENT, -EROFS, -EINVAL, -EACCES) ? LOG_DEBUG : LOG_WARNING, r,
                                      "Failed to reset devices.list: %m");

                if (c->device_policy == CGROUP_CLOSED ||
                    (c->device_policy == CGROUP_AUTO && c->device_allow)) {
                        /* Standard device nodes every unit may access; nulstr of path/access pairs. */
                        static const char auto_devices[] =
                                "/dev/null\0" "rwm\0"
                                "/dev/zero\0" "rwm\0"
                                "/dev/full\0" "rwm\0"
                                "/dev/random\0" "rwm\0"
                                "/dev/urandom\0" "rwm\0"
                                "/dev/ptmx\0" "rwm\0"
                                /* Allow /run/systemd/inaccessible/{chr,blk} devices for mapping InaccessiblePaths */
                                "-/run/systemd/inaccessible/chr\0" "rwm\0"
                                "-/run/systemd/inaccessible/blk\0" "rwm\0";

                        NULSTR_FOREACH_PAIR(x, y, auto_devices)
                                whitelist_device(path, x, y);

                        /* PTS (/dev/pts) devices may not be duplicated, but accessed */
                        whitelist_major(path, "pts", 'c', "rw");

                LIST_FOREACH(device_allow, a, c->device_allow) {
                        /* Dispatch on the entry type: a literal /dev path, or a
                         * "block-"/"char-" driver-name pattern. */
                        if (path_startswith(a->path, "/dev/"))
                                whitelist_device(path, a->path, acc);
                        else if ((val = startswith(a->path, "block-")))
                                whitelist_major(path, val, 'b', acc);
                        else if ((val = startswith(a->path, "char-")))
                                whitelist_major(path, val, 'c', acc);
                                log_unit_debug(u, "Ignoring device %s while writing cgroup attribute.", a->path);

        if (apply_mask & CGROUP_MASK_PIDS) {

                        /* So, the "pids" controller does not expose anything on the root cgroup, in order not to
                         * replicate knobs exposed elsewhere needlessly. We abstract this away here however, and when
                         * the knobs of the root cgroup are modified propagate this to the relevant sysctls. There's a
                         * non-obvious asymmetry however: unlike the cgroup properties we don't really want to take
                         * exclusive ownership of the sysctls, but we still want to honour things if the user sets
                         * limits. Hence we employ sort of a one-way strategy: when the user sets a bounded limit
                         * through us it counts. When the user afterwards unsets it again (i.e. sets it to unbounded)
                         * it also counts. But if the user never set a limit through us (i.e. we are the default of
                         * "unbounded") we leave things unmodified. For this we manage a global boolean that we turn on
                         * the first time we set a limit. Note that this boolean is flushed out on manager reload,
                         * which is desirable so that there's an offical way to release control of the sysctl from
                         * systemd: set the limit to unbounded and reload. */

                        if (c->tasks_max != CGROUP_LIMIT_MAX) {
                                u->manager->sysctl_pid_max_changed = true;
                                r = procfs_tasks_set_limit(c->tasks_max);
                        } else if (u->manager->sysctl_pid_max_changed)
                                r = procfs_tasks_set_limit(TASKS_MAX);

                                log_unit_full(u, IN_SET(r, -ENOENT, -EROFS, -EACCES) ? LOG_DEBUG : LOG_WARNING, r,
                                              "Failed to write to tasks limit sysctls: %m");

                if (c->tasks_max != CGROUP_LIMIT_MAX) {
                        char buf[DECIMAL_STR_MAX(uint64_t) + 2];

                        /* NOTE(review): bare sprintf here; siblings in this file use xsprintf — buffer is sized to fit. */
                        sprintf(buf, "%" PRIu64 "\n", c->tasks_max);
                        r = cg_set_attribute("pids", path, "pids.max", buf);
                        r = cg_set_attribute("pids", path, "pids.max", "max");
                                log_unit_full(u, IN_SET(r, -ENOENT, -EROFS, -EACCES) ? LOG_DEBUG : LOG_WARNING, r,
                                              "Failed to set pids.max: %m");

                cgroup_apply_firewall(u);
/* Compute which cgroup controllers are needed to realize configuration 'c':
 * a controller is included whenever any of its settings deviates from the
 * default or accounting for it is requested.
 * NOTE(review): the 'return mask;' and closing brace are elided in this
 * listing. */
CGroupMask cgroup_context_get_mask(CGroupContext *c) {
        CGroupMask mask = 0;

        /* Figure out which controllers we need */

        if (c->cpu_accounting ||
            cgroup_context_has_cpu_weight(c) ||
            cgroup_context_has_cpu_shares(c) ||
            c->cpu_quota_per_sec_usec != USEC_INFINITY)
                mask |= CGROUP_MASK_CPUACCT | CGROUP_MASK_CPU;

        if (cgroup_context_has_io_config(c) || cgroup_context_has_blockio_config(c))
                mask |= CGROUP_MASK_IO | CGROUP_MASK_BLKIO;

        if (c->memory_accounting ||
            c->memory_limit != CGROUP_LIMIT_MAX ||
            cgroup_context_has_unified_memory_config(c))
                mask |= CGROUP_MASK_MEMORY;

        if (c->device_allow ||
            c->device_policy != CGROUP_AUTO)
                mask |= CGROUP_MASK_DEVICES;

        if (c->tasks_accounting ||
            c->tasks_max != CGROUP_LIMIT_MAX)
                mask |= CGROUP_MASK_PIDS;
/* Returns the controllers unit 'u' needs for itself: those implied by its
 * cgroup context plus any delegated controllers.
 * NOTE(review): the declaration of 'c' and the early return for units
 * without a cgroup context are elided in this listing. */
CGroupMask unit_get_own_mask(Unit *u) {

        /* Returns the mask of controllers the unit needs for itself */

        c = unit_get_cgroup_context(u);

        return cgroup_context_get_mask(c) | unit_get_delegate_mask(u);
/* Returns the controllers to delegate to the unit's own processes, or 0 for
 * slices, units without delegation, or — on the legacy hierarchy — units
 * whose processes drop privileges (delegation there would be unsafe).
 * NOTE(review): the declarations of 'c'/'e', the delegation check and
 * several returns are elided in this listing. */
CGroupMask unit_get_delegate_mask(Unit *u) {

        /* If delegation is turned on, then turn on selected controllers, unless we are on the legacy hierarchy and the
         * process we fork into is known to drop privileges, and hence shouldn't get access to the controllers.
         *
         * Note that on the unified hierarchy it is safe to delegate controllers to unprivileged services. */

        if (u->type == UNIT_SLICE)

        c = unit_get_cgroup_context(u);

        if (cg_all_unified() <= 0) {

                e = unit_get_exec_context(u);
                if (e && !exec_context_maintains_privileges(e))

        return c->delegate_controllers;
/* Returns the merged controller mask required by all children of 'u'
 * (recursively, via unit_get_subtree_mask()). The result is cached in
 * u->cgroup_members_mask; only slices can have members.
 * NOTE(review): the Iterator/void* declarations of the HASHMAP_FOREACH_KEY
 * loop and some continue/brace lines are elided in this listing. */
CGroupMask unit_get_members_mask(Unit *u) {

        /* Returns the mask of controllers all of the unit's children require, merged */

        if (u->cgroup_members_mask_valid)
                return u->cgroup_members_mask; /* cached hit */

        u->cgroup_members_mask = 0;

        if (u->type == UNIT_SLICE) {

                /* Iterate over everything ordered Before= us and keep only units in our slice. */
                HASHMAP_FOREACH_KEY(v, member, u->dependencies[UNIT_BEFORE], i) {

                        if (UNIT_DEREF(member->slice) != u)

                        u->cgroup_members_mask |= unit_get_subtree_mask(member); /* note that this calls ourselves again, for the children */

        u->cgroup_members_mask_valid = true;
        return u->cgroup_members_mask;
/* Controller mask required by all siblings, i.e. the members mask of the
 * parent slice; a top-level unit falls back to its own subtree mask. */
1180 CGroupMask unit_get_siblings_mask(Unit *u) {
1183 /* Returns the mask of controllers all of the unit's siblings
1184 * require, i.e. the members mask of the unit's parent slice
1185 * if there is one. */
1187 if (UNIT_ISSET(u->slice))
1188 return unit_get_members_mask(UNIT_DEREF(u->slice));
1190 return unit_get_subtree_mask(u); /* we are the top-level slice */
/* Mask for the whole subtree rooted at this unit: own needs plus children's. */
1193 CGroupMask unit_get_subtree_mask(Unit *u) {
1195 /* Returns the mask of this subtree, meaning of the group
1196 * itself and its children. */
1198 return unit_get_own_mask(u) | unit_get_members_mask(u);
/* Full set of controllers to enable for this unit's cgroup (own + children +
 * siblings), clamped to what the manager detected as supported.
 * NOTE(review): the `return mask;` line is elided from this dump. */
1201 CGroupMask unit_get_target_mask(Unit *u) {
1204 /* This returns the cgroup mask of all controllers to enable
1205 * for a specific cgroup, i.e. everything it needs itself,
1206 * plus all that its children need, plus all that its siblings
1207 * need. This is primarily useful on the legacy cgroup
1208 * hierarchy, where we need to duplicate each cgroup in each
1209 * hierarchy that shall be enabled for it. */
1211 mask = unit_get_own_mask(u) | unit_get_members_mask(u) | unit_get_siblings_mask(u);
1212 mask &= u->manager->cgroup_supported;
/* Controllers to enable for the *children* of this unit's cgroup (the
 * members mask), clamped to supported controllers. Used to fill
 * cgroup.subtree_control on the unified hierarchy. */
1217 CGroupMask unit_get_enable_mask(Unit *u) {
1220 /* This returns the cgroup mask of all controllers to enable
1221 * for the children of a specific cgroup. This is primarily
1222 * useful for the unified cgroup hierarchy, where each cgroup
1223 * controls which controllers are enabled for its children. */
1225 mask = unit_get_members_mask(u);
1226 mask &= u->manager->cgroup_supported;
/* Whether this unit needs a BPF firewall program: true if it (or any parent
 * slice) configures IP accounting or an IP access list. Slices themselves
 * never get BPF attached. NOTE(review): the return statements are elided
 * from this dump. */
1231 bool unit_get_needs_bpf(Unit *u) {
1236 /* We never attach BPF to slice units, as they are inner cgroup nodes and cgroup/BPF is not recursive at the
1238 if (u->type == UNIT_SLICE)
1241 c = unit_get_cgroup_context(u);
1245 if (c->ip_accounting ||
1246 c->ip_address_allow ||
1250 /* If any parent slice has an IP access list defined, it applies too */
1251 for (p = UNIT_DEREF(u->slice); p; p = UNIT_DEREF(p->slice)) {
1252 c = unit_get_cgroup_context(p);
1256 if (c->ip_address_allow ||
1264 /* Recurse from a unit up through its containing slices, propagating
1265 * mask bits upward. A unit is also member of itself. */
1266 void unit_update_cgroup_members_masks(Unit *u) {
1272 /* Calculate subtree mask */
1273 m = unit_get_subtree_mask(u);
1275 /* See if anything changed from the previous invocation. If
1276 * not, we're done. */
1277 if (u->cgroup_subtree_mask_valid && m == u->cgroup_subtree_mask)
/* True iff bits were only added relative to the previous (valid) mask;
 * in that case the parent's cached mask can be updated in place. */
1281 u->cgroup_subtree_mask_valid &&
1282 ((m & ~u->cgroup_subtree_mask) != 0) &&
1283 ((~m & u->cgroup_subtree_mask) == 0);
1285 u->cgroup_subtree_mask = m;
1286 u->cgroup_subtree_mask_valid = true;
1288 if (UNIT_ISSET(u->slice)) {
1289 Unit *s = UNIT_DEREF(u->slice);
1292 /* There's more set now than before. We
1293 * propagate the new mask to the parent's mask
1294 * (not caring if it actually was valid or
1297 s->cgroup_members_mask |= m;
1300 /* There's less set now than before (or we
1301 * don't know), we need to recalculate
1302 * everything, so let's invalidate the
1303 * parent's members mask */
1305 s->cgroup_members_mask_valid = false;
1307 /* And now make sure that this change also hits our
/* Recurse upward so every ancestor slice sees the change. */
1309 unit_update_cgroup_members_masks(s);
/* Callback for cg_migrate_everywhere(): walk up the slice chain and return
 * the path of the nearest ancestor whose realized mask covers `mask`.
 * NOTE(review): the terminating return for the no-match case is elided. */
1313 static const char *migrate_callback(CGroupMask mask, void *userdata) {
1320 if (u->cgroup_path &&
1321 u->cgroup_realized &&
1322 (u->cgroup_realized_mask & mask) == mask)
1323 return u->cgroup_path;
1325 u = UNIT_DEREF(u->slice);
/* Build the default cgroup path for a unit: <cgroup_root>[/<slice-path>]/<escaped-id>.
 * The root slice maps directly to the manager's cgroup root. Returns a newly
 * allocated string (caller frees); NULL on OOM. */
1331 char *unit_default_cgroup_path(Unit *u) {
1332 _cleanup_free_ char *escaped = NULL, *slice = NULL;
1337 if (unit_has_name(u, SPECIAL_ROOT_SLICE))
1338 return strdup(u->manager->cgroup_root);
/* Non-root parent slice contributes an intermediate path component. */
1340 if (UNIT_ISSET(u->slice) && !unit_has_name(UNIT_DEREF(u->slice), SPECIAL_ROOT_SLICE)) {
1341 r = cg_slice_to_path(UNIT_DEREF(u->slice)->id, &slice);
1346 escaped = cg_escape(u->id);
1351 return strjoin(u->manager->cgroup_root, "/", slice, "/",
1354 return strjoin(u->manager->cgroup_root, "/", escaped);
/* Set/replace the unit's cgroup path and (re)register it in the manager's
 * cgroup_unit hashmap. No-op when the path is unchanged; hashmap_put failing
 * (e.g. duplicate path) is propagated to the caller. */
1357 int unit_set_cgroup_path(Unit *u, const char *path) {
1358 _cleanup_free_ char *p = NULL;
1370 if (streq_ptr(u->cgroup_path, p))
1374 r = hashmap_put(u->manager->cgroup_unit, p, u);
/* Drop the previous registration/watch before adopting the new path. */
1379 unit_release_cgroup(u);
/* Install an inotify watch on the unit's cgroup.events file so we learn when
 * the cgroup runs empty. Unified-hierarchy only; idempotent (returns early
 * if a watch descriptor is already set). ENOENT on a vanished cgroup is
 * treated as non-fatal. */
1387 int unit_watch_cgroup(Unit *u) {
1388 _cleanup_free_ char *events = NULL;
1393 if (!u->cgroup_path)
1396 if (u->cgroup_inotify_wd >= 0)
1399 /* Only applies to the unified hierarchy */
1400 r = cg_unified_controller(SYSTEMD_CGROUP_CONTROLLER);
1402 return log_error_errno(r, "Failed to determine whether the name=systemd hierarchy is unified: %m");
1406 /* Don't watch the root slice, it's pointless. */
1407 if (unit_has_name(u, SPECIAL_ROOT_SLICE))
1410 r = hashmap_ensure_allocated(&u->manager->cgroup_inotify_wd_unit, &trivial_hash_ops);
1414 r = cg_get_path(SYSTEMD_CGROUP_CONTROLLER, u->cgroup_path, "cgroup.events", &events);
/* IN_MODIFY on cgroup.events fires whenever the "populated" state changes. */
1418 u->cgroup_inotify_wd = inotify_add_watch(u->manager->cgroup_inotify_fd, events, IN_MODIFY);
1419 if (u->cgroup_inotify_wd < 0) {
1421 if (errno == ENOENT) /* If the directory is already
1422 * gone we don't need to track
1423 * it, so this is not an error */
1426 return log_unit_error_errno(u, errno, "Failed to add inotify watch descriptor for control group %s: %m", u->cgroup_path);
/* Map watch descriptor -> unit so the inotify dispatcher can find us. */
1429 r = hashmap_put(u->manager->cgroup_inotify_wd_unit, INT_TO_PTR(u->cgroup_inotify_wd), u);
1431 return log_unit_error_errno(u, r, "Failed to add inotify watch descriptor to hash map: %m");
/* Ensure the unit has a cgroup path assigned: compute the default path and
 * register it. Units without a cgroup context get none. -EEXIST from
 * unit_set_cgroup_path() means the path is already taken by another unit. */
1436 int unit_pick_cgroup_path(Unit *u) {
1437 _cleanup_free_ char *path = NULL;
1445 if (!UNIT_HAS_CGROUP_CONTEXT(u))
1448 path = unit_default_cgroup_path(u);
1452 r = unit_set_cgroup_path(u, path);
1454 return log_unit_error_errno(u, r, "Control group %s exists already.", path);
1456 return log_unit_error_errno(u, r, "Failed to set unit's control group path to %s: %m", path);
/* Create (and realize) the unit's cgroup in all relevant hierarchies:
 * pick a path, mkdir everywhere for target_mask, start watching it, enable
 * controllers per enable_mask, record the realized state, and — for non-slice,
 * non-delegating units — migrate stray processes into the fresh cgroup.
 * Controller-enable and migration failures are logged but non-fatal. */
1461 static int unit_create_cgroup(
1463 CGroupMask target_mask,
1464 CGroupMask enable_mask,
1472 c = unit_get_cgroup_context(u);
1476 /* Figure out our cgroup path */
1477 r = unit_pick_cgroup_path(u);
1481 /* First, create our own group */
1482 r = cg_create_everywhere(u->manager->cgroup_supported, target_mask, u->cgroup_path);
1484 return log_unit_error_errno(u, r, "Failed to create cgroup %s: %m", u->cgroup_path);
1486 /* Start watching it */
1487 (void) unit_watch_cgroup(u);
1489 /* Enable all controllers we need */
1490 r = cg_enable_everywhere(u->manager->cgroup_supported, enable_mask, u->cgroup_path);
1492 log_unit_warning_errno(u, r, "Failed to enable controllers on cgroup %s, ignoring: %m", u->cgroup_path);
1494 /* Keep track that this is now realized */
1495 u->cgroup_realized = true;
1496 u->cgroup_realized_mask = target_mask;
1497 u->cgroup_enabled_mask = enable_mask;
1498 u->cgroup_bpf_state = needs_bpf ? UNIT_CGROUP_BPF_ON : UNIT_CGROUP_BPF_OFF;
1500 if (u->type != UNIT_SLICE && !c->delegate) {
1502 /* Then, possibly move things over, but not if
1503 * subgroups may contain processes, which is the case
1504 * for slice and delegation units. */
1505 r = cg_migrate_everywhere(u->manager->cgroup_supported, u->cgroup_path, u->cgroup_path, migrate_callback, u);
/* NOTE(review): message reads "migrate cgroup from to %s" — looks like a
 * missing source-path word in the format string; confirm against upstream. */
1507 log_unit_warning_errno(u, r, "Failed to migrate cgroup from to %s, ignoring: %m", u->cgroup_path);
/* Realize the unit's cgroup, then attach all PIDs in u->pids to it in every
 * supported hierarchy (using migrate_callback for fallback paths). */
1513 int unit_attach_pids_to_cgroup(Unit *u) {
1517 r = unit_realize_cgroup(u);
1521 r = cg_attach_many_everywhere(u->manager->cgroup_supported, u->cgroup_path, u->pids, migrate_callback, u);
/* Stamp the unit's invocation ID onto its cgroup as the "trusted.invocation_id"
 * extended attribute (system manager only, and only when an invocation ID is
 * set). Failure is logged at debug level and otherwise ignored. */
1528 static void cgroup_xattr_apply(Unit *u) {
1529 char ids[SD_ID128_STRING_MAX];
1534 if (!MANAGER_IS_SYSTEM(u->manager))
1537 if (sd_id128_is_null(u->invocation_id))
/* 32 = length of the unformatted 128-bit ID rendered as hex (no dashes). */
1540 r = cg_set_xattr(SYSTEMD_CGROUP_CONTROLLER, u->cgroup_path,
1541 "trusted.invocation_id",
1542 sd_id128_to_string(u->invocation_id, ids), 32,
1545 log_unit_debug_errno(u, r, "Failed to set invocation ID on control group %s, ignoring: %m", u->cgroup_path);
/* True when the unit's cgroup is already realized exactly as requested:
 * same target mask, same enable mask, and BPF state matching needs_bpf. */
1548 static bool unit_has_mask_realized(
1550 CGroupMask target_mask,
1551 CGroupMask enable_mask,
1556 return u->cgroup_realized &&
1557 u->cgroup_realized_mask == target_mask &&
1558 u->cgroup_enabled_mask == enable_mask &&
1559 ((needs_bpf && u->cgroup_bpf_state == UNIT_CGROUP_BPF_ON) ||
1560 (!needs_bpf && u->cgroup_bpf_state == UNIT_CGROUP_BPF_OFF));
/* Queue the unit for (re)realization of its cgroup; idempotent. */
1563 static void unit_add_to_cgroup_realize_queue(Unit *u) {
1566 if (u->in_cgroup_realize_queue)
1569 LIST_PREPEND(cgroup_realize_queue, u->manager->cgroup_realize_queue, u);
1570 u->in_cgroup_realize_queue = true;
/* Remove the unit from the realize queue; idempotent. */
1573 static void unit_remove_from_cgroup_realize_queue(Unit *u) {
1576 if (!u->in_cgroup_realize_queue)
1579 LIST_REMOVE(cgroup_realize_queue, u->manager->cgroup_realize_queue, u);
1580 u->in_cgroup_realize_queue = false;
1584 /* Check if necessary controllers and attributes for a unit are in place.
1586 * If so, do nothing.
1587 * If not, create paths, move processes over, and set attributes.
1589 * Returns 0 on success and < 0 on failure. */
1590 static int unit_realize_cgroup_now(Unit *u, ManagerState state) {
1591 CGroupMask target_mask, enable_mask;
1592 bool needs_bpf, apply_bpf;
/* De-queue first, so a failure below doesn't leave us queued forever. */
1597 unit_remove_from_cgroup_realize_queue(u);
1599 target_mask = unit_get_target_mask(u);
1600 enable_mask = unit_get_enable_mask(u);
1601 needs_bpf = unit_get_needs_bpf(u);
1603 if (unit_has_mask_realized(u, target_mask, enable_mask, needs_bpf))
1606 /* Make sure we apply the BPF filters either when one is configured, or if none is configured but previously
1607 * the state was anything but off. This way, if a unit with a BPF filter applied is reconfigured to lose it
1608 * this will trickle down properly to cgroupfs. */
1609 apply_bpf = needs_bpf || u->cgroup_bpf_state != UNIT_CGROUP_BPF_OFF;
1611 /* First, realize parents */
1612 if (UNIT_ISSET(u->slice)) {
1613 r = unit_realize_cgroup_now(UNIT_DEREF(u->slice), state);
1618 /* And then do the real work */
1619 r = unit_create_cgroup(u, target_mask, enable_mask, needs_bpf);
1623 /* Finally, apply the necessary attributes. */
1624 cgroup_context_apply(u, target_mask, apply_bpf, state);
1625 cgroup_xattr_apply(u);
/* Drain the manager's cgroup realize queue. Inactive/failed units are only
 * de-queued; realization failures are logged and processing continues.
 * NOTE(review): the counter increment/return of dispatched units is elided
 * from this dump. */
1630 unsigned manager_dispatch_cgroup_realize_queue(Manager *m) {
1638 state = manager_state(m);
1640 while ((i = m->cgroup_realize_queue)) {
1641 assert(i->in_cgroup_realize_queue);
1643 if (UNIT_IS_INACTIVE_OR_FAILED(unit_active_state(i))) {
1644 /* Maybe things changed, and the unit is not actually active anymore? */
1645 unit_remove_from_cgroup_realize_queue(i);
1649 r = unit_realize_cgroup_now(i, state);
1651 log_warning_errno(r, "Failed to realize cgroups for queued unit %s, ignoring: %m", i->id);
/* Walk up through containing slices, queueing every active sibling whose
 * realized cgroup state no longer matches its required masks. The unit
 * itself and the slices themselves are not queued. */
1659 static void unit_add_siblings_to_cgroup_realize_queue(Unit *u) {
1662 /* This adds the siblings of the specified unit and the
1663 * siblings of all parent units to the cgroup queue. (But
1664 * neither the specified unit itself nor the parents.) */
1666 while ((slice = UNIT_DEREF(u->slice))) {
1671 HASHMAP_FOREACH_KEY(v, m, u->dependencies[UNIT_BEFORE], i) {
1675 /* Skip units that have a dependency on the slice
1676 * but aren't actually in it. */
1677 if (UNIT_DEREF(m->slice) != slice)
1680 /* No point in doing cgroup application for units
1681 * without active processes. */
1682 if (UNIT_IS_INACTIVE_OR_FAILED(unit_active_state(m)))
1685 /* If the unit doesn't need any new controllers
1686 * and has current ones realized, it doesn't need
1688 if (unit_has_mask_realized(m,
1689 unit_get_target_mask(m),
1690 unit_get_enable_mask(m),
1691 unit_get_needs_bpf(m)))
1694 unit_add_to_cgroup_realize_queue(m);
/* Public entry point: realize this unit's cgroup synchronously (including
 * parents) and queue siblings for deferred realization, so that weight-based
 * controllers stay balanced across the slice. */
1701 int unit_realize_cgroup(Unit *u) {
1704 if (!UNIT_HAS_CGROUP_CONTEXT(u))
1707 /* So, here's the deal: when realizing the cgroups for this
1708 * unit, we need to first create all parents, but there's more
1709 * actually: for the weight-based controllers we also need to
1710 * make sure that all our siblings (i.e. units that are in the
1711 * same slice as we are) have cgroups, too. Otherwise, things
1712 * would become very uneven as each of their processes would
1713 * get as much resources as all our group together. This call
1714 * will synchronously create the parent cgroups, but will
1715 * defer work on the siblings to the next event loop
1718 /* Add all sibling slices to the cgroup queue. */
1719 unit_add_siblings_to_cgroup_realize_queue(u);
1721 /* And realize this one now (and apply the values) */
1722 return unit_realize_cgroup_now(u, manager_state(u->manager));
/* Forget all cgroup bookkeeping for the unit: unregister and free the path,
 * and tear down the inotify watch (both the kernel watch and our wd->unit
 * hashmap entry). Does not touch the cgroup on disk. */
1725 void unit_release_cgroup(Unit *u) {
1728 /* Forgets all cgroup details for this cgroup */
1730 if (u->cgroup_path) {
1731 (void) hashmap_remove(u->manager->cgroup_unit, u->cgroup_path);
1732 u->cgroup_path = mfree(u->cgroup_path);
1735 if (u->cgroup_inotify_wd >= 0) {
1736 if (inotify_rm_watch(u->manager->cgroup_inotify_fd, u->cgroup_inotify_wd) < 0)
1737 log_unit_debug_errno(u, errno, "Failed to remove cgroup inotify watch %i for %s, ignoring", u->cgroup_inotify_wd, u->id);
1739 (void) hashmap_remove(u->manager->cgroup_inotify_wd_unit, INT_TO_PTR(u->cgroup_inotify_wd));
1740 u->cgroup_inotify_wd = -1;
/* Remove the unit's cgroup (if empty/possible), caching the last CPU usage
 * first, then drop all in-memory cgroup state. The root slice is trimmed
 * but never deleted itself. */
1744 void unit_prune_cgroup(Unit *u) {
1750 /* Removes the cgroup, if empty and possible, and stops watching it. */
1752 if (!u->cgroup_path)
1755 (void) unit_get_cpu_usage(u, NULL); /* Cache the last CPU usage value before we destroy the cgroup */
1757 is_root_slice = unit_has_name(u, SPECIAL_ROOT_SLICE);
1759 r = cg_trim_everywhere(u->manager->cgroup_supported, u->cgroup_path, !is_root_slice);
1761 log_unit_debug_errno(u, r, "Failed to destroy cgroup %s, ignoring: %m", u->cgroup_path);
1768 unit_release_cgroup(u);
1770 u->cgroup_realized = false;
1771 u->cgroup_realized_mask = 0;
1772 u->cgroup_enabled_mask = 0;
/* Scan the unit's cgroup for a plausible main PID: enumerate processes,
 * ignore ones not parented by us, and fail if more than one candidate
 * remains. NOTE(review): the tail of this function (storing *ret, return)
 * is elided from this dump. */
1775 int unit_search_main_pid(Unit *u, pid_t *ret) {
1776 _cleanup_fclose_ FILE *f = NULL;
1777 pid_t pid = 0, npid, mypid;
1783 if (!u->cgroup_path)
1786 r = cg_enumerate_processes(SYSTEMD_CGROUP_CONTROLLER, u->cgroup_path, &f);
1790 mypid = getpid_cached();
1791 while (cg_read_pid(f, &npid) > 0) {
1797 /* Ignore processes that aren't our kids */
1798 if (get_process_ppid(npid, &ppid) >= 0 && ppid != mypid)
1802 /* Dang, there's more than one daemonized PID
1803 in this group, so we don't know what process
1804 is the main process. */
/* Recursively add every PID under `path` (this cgroup and all subgroups) to
 * the unit's watched-PID set. The first error is remembered in `ret` but
 * enumeration continues, so as many PIDs as possible get watched. */
1815 static int unit_watch_pids_in_path(Unit *u, const char *path) {
1816 _cleanup_closedir_ DIR *d = NULL;
1817 _cleanup_fclose_ FILE *f = NULL;
1823 r = cg_enumerate_processes(SYSTEMD_CGROUP_CONTROLLER, path, &f);
1829 while ((r = cg_read_pid(f, &pid)) > 0) {
1830 r = unit_watch_pid(u, pid);
1831 if (r < 0 && ret >= 0)
1835 if (r < 0 && ret >= 0)
/* Now recurse into child cgroups. */
1839 r = cg_enumerate_subgroups(SYSTEMD_CGROUP_CONTROLLER, path, &d);
1846 while ((r = cg_read_subgroup(d, &fn)) > 0) {
1847 _cleanup_free_ char *p = NULL;
1849 p = strjoin(path, "/", fn);
1855 r = unit_watch_pids_in_path(u, p);
1856 if (r < 0 && ret >= 0)
1860 if (r < 0 && ret >= 0)
/* Watch every PID in the unit's cgroup via SIGCHLD — a fallback for the
 * legacy hierarchy where cgroup-empty notifications are unreliable. Skipped
 * entirely on the unified hierarchy (inotify notifications suffice). */
1867 int unit_watch_all_pids(Unit *u) {
1872 /* Adds all PIDs from our cgroup to the set of PIDs we
1873 * watch. This is a fallback logic for cases where we do not
1874 * get reliable cgroup empty notifications: we try to use
1875 * SIGCHLD as replacement. */
1877 if (!u->cgroup_path)
1880 r = cg_unified_controller(SYSTEMD_CGROUP_CONTROLLER);
1883 if (r > 0) /* On unified we can use proper notifications */
1886 return unit_watch_pids_in_path(u, u->cgroup_path);
/* Deferred-event handler: pop one unit off the cgroup-empty queue, re-arm
 * the event source if more units remain, then let the unit GC-check itself
 * and dispatch its notify_cgroup_empty vtable hook. One unit per dispatch
 * keeps the event loop responsive. */
1889 static int on_cgroup_empty_event(sd_event_source *s, void *userdata) {
1890 Manager *m = userdata;
1897 u = m->cgroup_empty_queue;
1901 assert(u->in_cgroup_empty_queue);
1902 u->in_cgroup_empty_queue = false;
1903 LIST_REMOVE(cgroup_empty_queue, m->cgroup_empty_queue, u);
1905 if (m->cgroup_empty_queue) {
1906 /* More stuff queued, let's make sure we remain enabled */
1907 r = sd_event_source_set_enabled(s, SD_EVENT_ONESHOT);
1909 log_debug_errno(r, "Failed to reenable cgroup empty event source: %m");
1912 unit_add_to_gc_queue(u);
1914 if (UNIT_VTABLE(u)->notify_cgroup_empty)
1915 UNIT_VTABLE(u)->notify_cgroup_empty(u);
/* Verify the unit's cgroup is really empty, then enqueue it on the
 * cgroup-empty queue and arm the (oneshot) defer event source. Idempotent. */
1920 void unit_add_to_cgroup_empty_queue(Unit *u) {
1925 /* Note that there are four different ways how cgroup empty events reach us:
1927 * 1. On the unified hierarchy we get an inotify event on the cgroup
1929 * 2. On the legacy hierarchy, when running in system mode, we get a datagram on the cgroup agent socket
1931 * 3. On the legacy hierarchy, when running in user mode, we get a D-Bus signal on the system bus
1933 * 4. On the legacy hierarchy, in service units we start watching all processes of the cgroup for SIGCHLD as
1934 * soon as we get one SIGCHLD, to deal with unreliable cgroup notifications.
1936 * Regardless which way we got the notification, we'll verify it here, and then add it to a separate
1937 * queue. This queue will be dispatched at a lower priority than the SIGCHLD handler, so that we always use
1938 * SIGCHLD if we can get it first, and only use the cgroup empty notifications if there's no SIGCHLD pending
1939 * (which might happen if the cgroup doesn't contain processes that are our own child, which is typically the
1940 * case for scope units). */
1942 if (u->in_cgroup_empty_queue)
1945 /* Let's verify that the cgroup is really empty */
1946 if (!u->cgroup_path)
1948 r = cg_is_empty_recursive(SYSTEMD_CGROUP_CONTROLLER, u->cgroup_path);
1950 log_unit_debug_errno(u, r, "Failed to determine whether cgroup %s is empty: %m", u->cgroup_path);
1956 LIST_PREPEND(cgroup_empty_queue, u->manager->cgroup_empty_queue, u);
1957 u->in_cgroup_empty_queue = true;
1959 /* Trigger the defer event */
1960 r = sd_event_source_set_enabled(u->manager->cgroup_empty_event_source, SD_EVENT_ONESHOT);
1962 log_debug_errno(r, "Failed to enable cgroup empty event source: %m");
/* I/O handler for the cgroup inotify fd (unified hierarchy): read a batch of
 * events, map each watch descriptor back to its unit, and queue that unit
 * for the cgroup-empty check. EINTR/EAGAIN on read terminates the batch;
 * IN_IGNORED and unknown wds are skipped. */
1965 static int on_cgroup_inotify_event(sd_event_source *s, int fd, uint32_t revents, void *userdata) {
1966 Manager *m = userdata;
1973 union inotify_event_buffer buffer;
1974 struct inotify_event *e;
1977 l = read(fd, &buffer, sizeof(buffer));
1979 if (IN_SET(errno, EINTR, EAGAIN))
1982 return log_error_errno(errno, "Failed to read control group inotify events: %m");
1985 FOREACH_INOTIFY_EVENT(e, buffer, l) {
1989 /* Queue overflow has no watch descriptor */
1992 if (e->mask & IN_IGNORED)
1993 /* The watch was just removed */
1996 u = hashmap_get(m->cgroup_inotify_wd_unit, INT_TO_PTR(e->wd));
1997 if (!u) /* Not that inotify might deliver
1998 * events for a watch even after it
1999 * was removed, because it was queued
2000 * before the removal. Let's ignore
2001 * this here safely. */
2004 unit_add_to_cgroup_empty_queue(u);
/* One-time manager setup of the cgroup environment:
 *   1. determine the cgroup root (elogind: from PID 1, not from self);
 *   2. detect unified vs. legacy hierarchy layout;
 *   3/4. (systemd-only, under #if 0) empty-event defer source and inotify
 *        watcher or legacy release agent;
 *   5. create/attach our own scope/group (elogind variant below);
 *   6. pin the cgroupfs mount with an O_DIRECTORY fd;
 *   7. enable memory.use_hierarchy on pure-legacy setups;
 *   8. probe and log the supported controller mask.
 * Note the #if 0 markers: large parts are systemd code retained for
 * reference but compiled out in elogind. */
2010 int manager_setup_cgroup(Manager *m) {
2011 _cleanup_free_ char *path = NULL;
2012 const char *scope_path;
2015 #if 0 /// UNNEEDED by elogind
2021 /* 1. Determine hierarchy */
2022 m->cgroup_root = mfree(m->cgroup_root);
2023 #if 0 /// elogind is not init and must therefore search for PID 1 instead of self.
2024 r = cg_pid_get_path(SYSTEMD_CGROUP_CONTROLLER, 0, &m->cgroup_root);
2026 r = cg_pid_get_path(SYSTEMD_CGROUP_CONTROLLER, 1, &m->cgroup_root);
2029 return log_error_errno(r, "Cannot determine cgroup we are running in: %m");
2031 #if 0 /// elogind does not support systemd scopes and slices
2032 /* Chop off the init scope, if we are already located in it */
2033 e = endswith(m->cgroup_root, "/" SPECIAL_INIT_SCOPE);
2035 /* LEGACY: Also chop off the system slice if we are in
2036 * it. This is to support live upgrades from older systemd
2037 * versions where PID 1 was moved there. Also see
2038 * cg_get_root_path(). */
2039 if (!e && MANAGER_IS_SYSTEM(m)) {
2040 e = endswith(m->cgroup_root, "/" SPECIAL_SYSTEM_SLICE);
2042 e = endswith(m->cgroup_root, "/system"); /* even more legacy */
2048 log_debug_elogind("Cgroup Controller \"%s\" -> root \"%s\"",
2049 SYSTEMD_CGROUP_CONTROLLER, m->cgroup_root);
2050 /* And make sure to store away the root value without trailing slash, even for the root dir, so that we can
2051 * easily prepend it everywhere. */
2052 delete_trailing_chars(m->cgroup_root, "/");
/* 2. Detect where the hierarchy is mounted and whether it is unified. */
2055 r = cg_get_path(SYSTEMD_CGROUP_CONTROLLER, m->cgroup_root, NULL, &path);
2057 return log_error_errno(r, "Cannot find cgroup mount point: %m");
2059 r = cg_unified_flush();
2061 return log_error_errno(r, "Couldn't determine if we are running in the unified hierarchy: %m");
2063 all_unified = cg_all_unified();
2064 if (all_unified < 0)
2065 return log_error_errno(all_unified, "Couldn't determine whether we are in all unified mode: %m");
2066 if (all_unified > 0)
2067 log_debug("Unified cgroup hierarchy is located at %s.", path);
2069 r = cg_unified_controller(SYSTEMD_CGROUP_CONTROLLER);
2071 return log_error_errno(r, "Failed to determine whether systemd's own controller is in unified mode: %m");
2073 log_debug("Unified cgroup hierarchy is located at %s. Controllers are on legacy hierarchies.", path);
2075 log_debug("Using cgroup controller " SYSTEMD_CGROUP_CONTROLLER_LEGACY ". File system hierarchy is at %s.", path);
2078 #if 0 /// elogind is not init, and does not install the agent here.
2079 /* 3. Allocate cgroup empty defer event source */
2080 m->cgroup_empty_event_source = sd_event_source_unref(m->cgroup_empty_event_source);
2081 r = sd_event_add_defer(m->event, &m->cgroup_empty_event_source, on_cgroup_empty_event, m);
2083 return log_error_errno(r, "Failed to create cgroup empty event source: %m");
2085 r = sd_event_source_set_priority(m->cgroup_empty_event_source, SD_EVENT_PRIORITY_NORMAL-5);
2087 return log_error_errno(r, "Failed to set priority of cgroup empty event source: %m");
2089 r = sd_event_source_set_enabled(m->cgroup_empty_event_source, SD_EVENT_OFF);
2091 return log_error_errno(r, "Failed to disable cgroup empty event source: %m");
2093 (void) sd_event_source_set_description(m->cgroup_empty_event_source, "cgroup-empty");
2095 /* 4. Install notifier inotify object, or agent */
2096 if (cg_unified_controller(SYSTEMD_CGROUP_CONTROLLER) > 0) {
2098 /* In the unified hierarchy we can get cgroup empty notifications via inotify. */
2100 m->cgroup_inotify_event_source = sd_event_source_unref(m->cgroup_inotify_event_source);
2101 safe_close(m->cgroup_inotify_fd);
2103 m->cgroup_inotify_fd = inotify_init1(IN_NONBLOCK|IN_CLOEXEC);
2104 if (m->cgroup_inotify_fd < 0)
2105 return log_error_errno(errno, "Failed to create control group inotify object: %m");
2107 r = sd_event_add_io(m->event, &m->cgroup_inotify_event_source, m->cgroup_inotify_fd, EPOLLIN, on_cgroup_inotify_event, m);
2109 return log_error_errno(r, "Failed to watch control group inotify object: %m");
2111 /* Process cgroup empty notifications early, but after service notifications and SIGCHLD. Also
2112 * see handling of cgroup agent notifications, for the classic cgroup hierarchy support. */
2113 r = sd_event_source_set_priority(m->cgroup_inotify_event_source, SD_EVENT_PRIORITY_NORMAL-4);
2115 return log_error_errno(r, "Failed to set priority of inotify event source: %m");
2117 (void) sd_event_source_set_description(m->cgroup_inotify_event_source, "cgroup-inotify");
2119 } else if (MANAGER_IS_SYSTEM(m) && m->test_run_flags == 0) {
2121 /* On the legacy hierarchy we only get notifications via cgroup agents. (Which isn't really reliable,
2122 * since it does not generate events when control groups with children run empty. */
2124 r = cg_install_release_agent(SYSTEMD_CGROUP_CONTROLLER, SYSTEMD_CGROUP_AGENT_PATH);
2126 log_warning_errno(r, "Failed to install release agent, ignoring: %m");
2128 log_debug("Installed release agent.");
2130 log_debug("Release agent already installed.");
2133 /* 5. Make sure we are in the special "init.scope" unit in the root slice. */
2134 scope_path = strjoina(m->cgroup_root, "/" SPECIAL_INIT_SCOPE);
2135 r = cg_create_and_attach(SYSTEMD_CGROUP_CONTROLLER, scope_path, 0);
2138 * This method is in core, and normally called by systemd
2139 * being init. As elogind is never init, we can not install
2140 * our agent here. We do so when mounting our cgroup file
2141 * system, so only if elogind is its own tiny controller.
2142 * Further, elogind is not meant to run in systemd init scope. */
2143 if (MANAGER_IS_SYSTEM(m))
2144 // we are our own cgroup controller
2145 scope_path = strjoina("");
2146 else if (streq(m->cgroup_root, "/elogind"))
2147 // root already is our cgroup
2148 scope_path = strjoina(m->cgroup_root);
2150 // we have to create our own group
2151 scope_path = strjoina(m->cgroup_root, "/elogind");
2152 r = cg_create_and_attach(SYSTEMD_CGROUP_CONTROLLER, scope_path, 0);
2155 return log_error_errno(r, "Failed to create %s control group: %m", scope_path);
2156 log_debug_elogind("Created control group \"%s\"", scope_path)
2158 #if 0 /// elogind is not a "sub-controller" like systemd, so migration is not needed.
2159 /* Also, move all other userspace processes remaining in the root cgroup into that scope. */
2160 r = cg_migrate(SYSTEMD_CGROUP_CONTROLLER, m->cgroup_root, SYSTEMD_CGROUP_CONTROLLER, scope_path, 0);
2162 log_warning_errno(r, "Couldn't move remaining userspace processes, ignoring: %m");
2165 /* 6. And pin it, so that it cannot be unmounted */
2166 safe_close(m->pin_cgroupfs_fd);
2167 m->pin_cgroupfs_fd = open(path, O_RDONLY|O_CLOEXEC|O_DIRECTORY|O_NOCTTY|O_NONBLOCK);
2168 if (m->pin_cgroupfs_fd < 0)
2169 return log_error_errno(errno, "Failed to open pin file: %m");
2171 /* 7. Always enable hierarchical support if it exists... */
2172 if (!all_unified && m->test_run_flags == 0)
2173 (void) cg_set_attribute("memory", "/", "memory.use_hierarchy", "1");
2175 /* 8. Figure out which controllers are supported, and log about it */
2176 r = cg_mask_supported(&m->cgroup_supported);
2178 return log_error_errno(r, "Failed to determine supported controllers: %m");
2179 for (c = 0; c < _CGROUP_CONTROLLER_MAX; c++)
2180 log_debug("Controller '%s' supported: %s", cgroup_controller_to_string(c), yes_no(m->cgroup_supported & CGROUP_CONTROLLER_TO_MASK(c)));
/* Manager teardown: optionally trim (not delete — we live inside it) the
 * root cgroup, free the event sources / inotify resources, close the
 * cgroupfs pin fd and release the root path string. */
2185 void manager_shutdown_cgroup(Manager *m, bool delete) {
2188 /* We can't really delete the group, since we are in it. But
2190 if (delete && m->cgroup_root)
2191 (void) cg_trim(SYSTEMD_CGROUP_CONTROLLER, m->cgroup_root, false);
2193 #if 0 /// elogind is not init
2194 m->cgroup_empty_event_source = sd_event_source_unref(m->cgroup_empty_event_source);
2196 m->cgroup_inotify_wd_unit = hashmap_free(m->cgroup_inotify_wd_unit);
2198 m->cgroup_inotify_event_source = sd_event_source_unref(m->cgroup_inotify_event_source);
2199 m->cgroup_inotify_fd = safe_close(m->cgroup_inotify_fd);
2202 m->pin_cgroupfs_fd = safe_close(m->pin_cgroupfs_fd);
2204 m->cgroup_root = mfree(m->cgroup_root);
2207 #if 0 /// UNNEEDED by elogind
/* Map a cgroup path to its owning unit: exact lookup first, then walk up
 * the path component by component; an exhausted path resolves to the root
 * slice. (Inside #if 0 — systemd-only in elogind builds.) */
2208 Unit* manager_get_unit_by_cgroup(Manager *m, const char *cgroup) {
2215 u = hashmap_get(m->cgroup_unit, cgroup);
2219 p = strdupa(cgroup);
2223 e = strrchr(p, '/');
2225 return hashmap_get(m->cgroup_unit, SPECIAL_ROOT_SLICE);
2229 u = hashmap_get(m->cgroup_unit, p);
/* Resolve a PID to a unit by looking up the PID's cgroup path. */
2235 Unit *manager_get_unit_by_pid_cgroup(Manager *m, pid_t pid) {
2236 _cleanup_free_ char *cgroup = NULL;
2244 r = cg_pid_get_path(SYSTEMD_CGROUP_CONTROLLER, pid, &cgroup);
2248 return manager_get_unit_by_cgroup(m, cgroup);
/* Resolve a PID to a unit: PID 1 maps to init.scope, then the two
 * watch_pids maps are consulted, with a cgroup-based lookup as fallback. */
2251 Unit *manager_get_unit_by_pid(Manager *m, pid_t pid) {
2260 return hashmap_get(m->units, SPECIAL_INIT_SCOPE);
2262 u = hashmap_get(m->watch_pids1, PID_TO_PTR(pid));
2266 u = hashmap_get(m->watch_pids2, PID_TO_PTR(pid));
2270 return manager_get_unit_by_pid_cgroup(m, pid);
2274 #if 0 /// elogind must substitute this with its own variant
/* systemd variant (compiled out for elogind, see #if 0 above): handle an
 * explicit cgroup-empty notification by resolving the cgroup to a unit and
 * queueing it for the empty check. */
2275 int manager_notify_cgroup_empty(Manager *m, const char *cgroup) {
2281 /* Called on the legacy hierarchy whenever we get an explicit cgroup notification from the cgroup agent process
2282 * or from the --system instance */
2284 log_debug("Got cgroup empty notification for: %s", cgroup);
2286 u = manager_get_unit_by_cgroup(m, cgroup);
2290 unit_add_to_cgroup_empty_queue(u);
/* elogind variant: a cgroup-empty notification maps to a session (keyed by
 * cgroup path) which is then finalized; unknown cgroups are just logged. */
2294 int manager_notify_cgroup_empty(Manager *m, const char *cgroup) {
2300 log_debug("Got cgroup empty notification for: %s", cgroup);
2302 s = hashmap_get(m->sessions, cgroup);
2305 session_finalize(s);
2308 log_warning("Session not found: %s", cgroup);
2313 #if 0 /// UNNEEDED by elogind
/* Read the unit's current memory usage into *ret. Requires memory
 * accounting, an existing cgroup path, and the memory controller realized.
 * Unified hierarchy reads memory.current, legacy reads
 * memory.usage_in_bytes. */
2314 int unit_get_memory_current(Unit *u, uint64_t *ret) {
2315 _cleanup_free_ char *v = NULL;
2321 if (!UNIT_CGROUP_BOOL(u, memory_accounting))
2324 if (!u->cgroup_path)
2327 if ((u->cgroup_realized_mask & CGROUP_MASK_MEMORY) == 0)
2330 r = cg_all_unified();
2334 r = cg_get_attribute("memory", u->cgroup_path, "memory.current", &v);
2336 r = cg_get_attribute("memory", u->cgroup_path, "memory.usage_in_bytes", &v);
2342 return safe_atou64(v, ret);
/* Read the unit's current task count into *ret from pids.current. Requires
 * tasks accounting and a realized pids controller; the root cgroup is
 * special-cased via /proc. */
2345 int unit_get_tasks_current(Unit *u, uint64_t *ret) {
2346 _cleanup_free_ char *v = NULL;
2352 if (!UNIT_CGROUP_BOOL(u, tasks_accounting))
2355 if (!u->cgroup_path)
2358 if ((u->cgroup_realized_mask & CGROUP_MASK_PIDS) == 0)
2361 /* The root cgroup doesn't expose this information, let's get it from /proc instead */
2362 if (unit_has_root_cgroup(u))
2363 return procfs_tasks_get_current(ret);
2365 r = cg_get_attribute("pids", u->cgroup_path, "pids.current", &v);
2371 return safe_atou64(v, ret);
/* Read the unit's raw cumulative CPU time in nanoseconds. On the unified
 * hierarchy, usage_usec from cpu.stat (microseconds, converted); on legacy,
 * cpuacct.usage (already nanoseconds). Requires the respective controller
 * to be realized. NOTE(review): the `*ret = ns; return 0;` tail is elided
 * from this dump. */
2374 static int unit_get_cpu_usage_raw(Unit *u, nsec_t *ret) {
2375 _cleanup_free_ char *v = NULL;
2382 if (!u->cgroup_path)
2385 r = cg_all_unified();
2389 const char *keys[] = { "usage_usec", NULL };
2390 _cleanup_free_ char *val = NULL;
2393 if ((u->cgroup_realized_mask & CGROUP_MASK_CPU) == 0)
2396 r = cg_get_keyed_attribute("cpu", u->cgroup_path, "cpu.stat", keys, &val);
2400 r = safe_atou64(val, &us);
2404 ns = us * NSEC_PER_USEC;
2406 if ((u->cgroup_realized_mask & CGROUP_MASK_CPUACCT) == 0)
2409 r = cg_get_attribute("cpuacct", u->cgroup_path, "cpuacct.usage", &v);
2415 r = safe_atou64(v, &ns);
/* CPU usage since unit start: raw counter minus the base captured at start.
 * Falls back to the cached last value when the cgroup is gone (-ENODATA);
 * calling with ret == NULL just refreshes the cache. */
2424 int unit_get_cpu_usage(Unit *u, nsec_t *ret) {
2430 /* Retrieve the current CPU usage counter. This will subtract the CPU counter taken when the unit was
2431 * started. If the cgroup has been removed already, returns the last cached value. To cache the value, simply
2432 * call this function with a NULL return value. */
2434 if (!UNIT_CGROUP_BOOL(u, cpu_accounting))
2437 r = unit_get_cpu_usage_raw(u, &ns);
2438 if (r == -ENODATA && u->cpu_usage_last != NSEC_INFINITY) {
2439 /* If we can't get the CPU usage anymore (because the cgroup was already removed, for example), use our
2443 *ret = u->cpu_usage_last;
/* Subtract the baseline taken at unit start (clamping below zero). */
2449 if (ns > u->cpu_usage_base)
2450 ns -= u->cpu_usage_base;
2454 u->cpu_usage_last = ns;
/* Fetch one IP accounting counter (ingress/egress bytes or packets) from the
 * unit's BPF maps, adding any counters serialized from a previous runtime.
 * Refused for slices (IP accounting is not recursive) and when IP
 * accounting is off. */
2461 int unit_get_ip_accounting(
2463 CGroupIPAccountingMetric metric,
2470 assert(metric >= 0);
2471 assert(metric < _CGROUP_IP_ACCOUNTING_METRIC_MAX);
2474 /* IP accounting is currently not recursive, and hence we refuse to return any data for slice nodes. Slices are
2475 * inner cgroup nodes and hence have no processes directly attached, hence their counters would be zero
2476 * anyway. And if we block this now we can later open this up, if the kernel learns recursive BPF cgroup
2478 if (u->type == UNIT_SLICE)
2481 if (!UNIT_CGROUP_BOOL(u, ip_accounting))
/* Pick the ingress or egress map fd depending on the requested metric. */
2484 fd = IN_SET(metric, CGROUP_IP_INGRESS_BYTES, CGROUP_IP_INGRESS_PACKETS) ?
2485 u->ip_accounting_ingress_map_fd :
2486 u->ip_accounting_egress_map_fd;
/* Byte metrics read the first counter, packet metrics the second. */
2490 if (IN_SET(metric, CGROUP_IP_INGRESS_BYTES, CGROUP_IP_EGRESS_BYTES))
2491 r = bpf_firewall_read_accounting(fd, &value, NULL);
2493 r = bpf_firewall_read_accounting(fd, NULL, &value);
2497 /* Add in additional metrics from a previous runtime. Note that when reexecing/reloading the daemon we compile
2498 * all BPF programs and maps anew, but serialize the old counters. When deserializing we store them in the
2499 * ip_accounting_extra[] field, and add them in here transparently. */
2501 *ret = value + u->ip_accounting_extra[metric];
/* Reset CPU accounting: invalidate the cached last value and re-baseline
 * cpu_usage_base to the current raw counter (0 if unreadable). */
2506 int unit_reset_cpu_accounting(Unit *u) {
2512 u->cpu_usage_last = NSEC_INFINITY;
2514 r = unit_get_cpu_usage_raw(u, &ns);
2516 u->cpu_usage_base = 0;
2520 u->cpu_usage_base = ns;
/* Reset IP accounting: zero both BPF map counters (where the fds exist) and
 * clear the serialized extra counters. Returns the first error of the two
 * reset calls, if any. */
2524 int unit_reset_ip_accounting(Unit *u) {
2529 if (u->ip_accounting_ingress_map_fd >= 0)
2530 r = bpf_firewall_reset_accounting(u->ip_accounting_ingress_map_fd);
2532 if (u->ip_accounting_egress_map_fd >= 0)
2533 q = bpf_firewall_reset_accounting(u->ip_accounting_egress_map_fd);
2535 zero(u->ip_accounting_extra);
2537 return r < 0 ? r : q;
/* Mark controllers in `m` as no longer realized for the unit and queue a
 * re-realization. Legacy/unified compat pairs (io+blkio, cpu+cpuacct) are
 * always invalidated together. */
2540 void unit_invalidate_cgroup(Unit *u, CGroupMask m) {
2543 if (!UNIT_HAS_CGROUP_CONTEXT(u))
2549 /* always invalidate compat pairs together */
2550 if (m & (CGROUP_MASK_IO | CGROUP_MASK_BLKIO))
2551 m |= CGROUP_MASK_IO | CGROUP_MASK_BLKIO;
2553 if (m & (CGROUP_MASK_CPU | CGROUP_MASK_CPUACCT))
2554 m |= CGROUP_MASK_CPU | CGROUP_MASK_CPUACCT;
2556 if ((u->cgroup_realized_mask & m) == 0) /* NOP? */
2559 u->cgroup_realized_mask &= ~m;
2560 unit_add_to_cgroup_realize_queue(u);
/* Invalidate the unit's BPF program state and queue re-realization. For a
 * slice this recurses into direct members, since children's effective IP
 * access lists include the slice's own. */
2563 void unit_invalidate_cgroup_bpf(Unit *u) {
2566 if (!UNIT_HAS_CGROUP_CONTEXT(u))
2569 if (u->cgroup_bpf_state == UNIT_CGROUP_BPF_INVALIDATED) /* NOP? */
2572 u->cgroup_bpf_state = UNIT_CGROUP_BPF_INVALIDATED;
2573 unit_add_to_cgroup_realize_queue(u);
2575 /* If we are a slice unit, we also need to put compile a new BPF program for all our children, as the IP access
2576 * list of our children includes our own. */
2577 if (u->type == UNIT_SLICE) {
2582 HASHMAP_FOREACH_KEY(v, member, u->dependencies[UNIT_BEFORE], i) {
2586 if (UNIT_DEREF(member->slice) != u)
2589 unit_invalidate_cgroup_bpf(member);
/* Invalidate the startup-tunable controllers (cpu/io/blkio) on every unit
 * with StartupCPUWeight= etc., so they get re-applied for the new manager
 * state (startup vs. normal). */
2594 void manager_invalidate_startup_units(Manager *m) {
2600 SET_FOREACH(u, m->startup_units, i)
2601 unit_invalidate_cgroup(u, CGROUP_MASK_CPU|CGROUP_MASK_IO|CGROUP_MASK_BLKIO);
/* String table for CGroupDevicePolicy and its generated to/from-string
 * converters (used by config parsing and bus properties). */
2604 static const char* const cgroup_device_policy_table[_CGROUP_DEVICE_POLICY_MAX] = {
2605 [CGROUP_AUTO] = "auto",
2606 [CGROUP_CLOSED] = "closed",
2607 [CGROUP_STRICT] = "strict",
2610 DEFINE_STRING_TABLE_LOOKUP(cgroup_device_policy, CGroupDevicePolicy);