/* SPDX-License-Identifier: LGPL-2.1+ */
/***
  This file is part of systemd.

  Copyright 2013 Lennart Poettering

  systemd is free software; you can redistribute it and/or modify it
  under the terms of the GNU Lesser General Public License as published by
  the Free Software Foundation; either version 2.1 of the License, or
  (at your option) any later version.

  systemd is distributed in the hope that it will be useful, but
  WITHOUT ANY WARRANTY; without even the implied warranty of
  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
  Lesser General Public License for more details.

  You should have received a copy of the GNU Lesser General Public License
  along with systemd; If not, see <http://www.gnu.org/licenses/>.
***/
#include "alloc-util.h"
//#include "blockdev-util.h"
//#include "bpf-firewall.h"
#include "cgroup-util.h"
#include "parse-util.h"
#include "path-util.h"
#include "process-util.h"
//#include "special.h"
#include "stdio-util.h"
#include "string-table.h"
#include "string-util.h"

#define CGROUP_CPU_QUOTA_PERIOD_USEC ((usec_t) 100 * USEC_PER_MSEC)
#if 0 /// UNNEEDED by elogind
static void cgroup_compat_warn(void) {
        static bool cgroup_compat_warned = false;

        if (cgroup_compat_warned)
                return;

        log_warning("cgroup compatibility translation between legacy and unified hierarchy settings activated. See cgroup-compat debug messages for details.");
        cgroup_compat_warned = true;
}

#define log_cgroup_compat(unit, fmt, ...) do {                                  \
                cgroup_compat_warn();                                           \
                log_unit_debug(unit, "cgroup-compat: " fmt, ##__VA_ARGS__);     \
        } while (false)
void cgroup_context_init(CGroupContext *c) {
        assert(c);

        /* Initialize everything to the kernel defaults, assuming the
         * structure is preinitialized to 0 */

        c->cpu_weight = CGROUP_WEIGHT_INVALID;
        c->startup_cpu_weight = CGROUP_WEIGHT_INVALID;
        c->cpu_quota_per_sec_usec = USEC_INFINITY;

        c->cpu_shares = CGROUP_CPU_SHARES_INVALID;
        c->startup_cpu_shares = CGROUP_CPU_SHARES_INVALID;

        c->memory_high = CGROUP_LIMIT_MAX;
        c->memory_max = CGROUP_LIMIT_MAX;
        c->memory_swap_max = CGROUP_LIMIT_MAX;

        c->memory_limit = CGROUP_LIMIT_MAX;

        c->io_weight = CGROUP_WEIGHT_INVALID;
        c->startup_io_weight = CGROUP_WEIGHT_INVALID;

        c->blockio_weight = CGROUP_BLKIO_WEIGHT_INVALID;
        c->startup_blockio_weight = CGROUP_BLKIO_WEIGHT_INVALID;

        c->tasks_max = (uint64_t) -1;
}
void cgroup_context_free_device_allow(CGroupContext *c, CGroupDeviceAllow *a) {
        LIST_REMOVE(device_allow, c->device_allow, a);
        free(a->path);
        free(a);
}

void cgroup_context_free_io_device_weight(CGroupContext *c, CGroupIODeviceWeight *w) {
        LIST_REMOVE(device_weights, c->io_device_weights, w);
        free(w->path);
        free(w);
}

void cgroup_context_free_io_device_limit(CGroupContext *c, CGroupIODeviceLimit *l) {
        LIST_REMOVE(device_limits, c->io_device_limits, l);
        free(l->path);
        free(l);
}

void cgroup_context_free_blockio_device_weight(CGroupContext *c, CGroupBlockIODeviceWeight *w) {
        LIST_REMOVE(device_weights, c->blockio_device_weights, w);
        free(w->path);
        free(w);
}

void cgroup_context_free_blockio_device_bandwidth(CGroupContext *c, CGroupBlockIODeviceBandwidth *b) {
        LIST_REMOVE(device_bandwidths, c->blockio_device_bandwidths, b);
        free(b->path);
        free(b);
}
void cgroup_context_done(CGroupContext *c) {
        assert(c);

        while (c->io_device_weights)
                cgroup_context_free_io_device_weight(c, c->io_device_weights);

        while (c->io_device_limits)
                cgroup_context_free_io_device_limit(c, c->io_device_limits);

        while (c->blockio_device_weights)
                cgroup_context_free_blockio_device_weight(c, c->blockio_device_weights);

        while (c->blockio_device_bandwidths)
                cgroup_context_free_blockio_device_bandwidth(c, c->blockio_device_bandwidths);

        while (c->device_allow)
                cgroup_context_free_device_allow(c, c->device_allow);

        c->ip_address_allow = ip_address_access_free_all(c->ip_address_allow);
        c->ip_address_deny = ip_address_access_free_all(c->ip_address_deny);
}
void cgroup_context_dump(CGroupContext *c, FILE* f, const char *prefix) {
        CGroupIODeviceLimit *il;
        CGroupIODeviceWeight *iw;
        CGroupBlockIODeviceBandwidth *b;
        CGroupBlockIODeviceWeight *w;
        CGroupDeviceAllow *a;
        IPAddressAccessItem *iaai;
        char u[FORMAT_TIMESPAN_MAX];

        assert(c);
        assert(f);

        prefix = strempty(prefix);

        fprintf(f,
                "%sCPUAccounting=%s\n"
                "%sIOAccounting=%s\n"
                "%sBlockIOAccounting=%s\n"
                "%sMemoryAccounting=%s\n"
                "%sTasksAccounting=%s\n"
                "%sIPAccounting=%s\n"
                "%sCPUWeight=%" PRIu64 "\n"
                "%sStartupCPUWeight=%" PRIu64 "\n"
                "%sCPUShares=%" PRIu64 "\n"
                "%sStartupCPUShares=%" PRIu64 "\n"
                "%sCPUQuotaPerSecSec=%s\n"
                "%sIOWeight=%" PRIu64 "\n"
                "%sStartupIOWeight=%" PRIu64 "\n"
                "%sBlockIOWeight=%" PRIu64 "\n"
                "%sStartupBlockIOWeight=%" PRIu64 "\n"
                "%sMemoryLow=%" PRIu64 "\n"
                "%sMemoryHigh=%" PRIu64 "\n"
                "%sMemoryMax=%" PRIu64 "\n"
                "%sMemorySwapMax=%" PRIu64 "\n"
                "%sMemoryLimit=%" PRIu64 "\n"
                "%sTasksMax=%" PRIu64 "\n"
                "%sDevicePolicy=%s\n"
                "%sDelegate=%s\n",
                prefix, yes_no(c->cpu_accounting),
                prefix, yes_no(c->io_accounting),
                prefix, yes_no(c->blockio_accounting),
                prefix, yes_no(c->memory_accounting),
                prefix, yes_no(c->tasks_accounting),
                prefix, yes_no(c->ip_accounting),
                prefix, c->cpu_weight,
                prefix, c->startup_cpu_weight,
                prefix, c->cpu_shares,
                prefix, c->startup_cpu_shares,
                prefix, format_timespan(u, sizeof(u), c->cpu_quota_per_sec_usec, 1),
                prefix, c->io_weight,
                prefix, c->startup_io_weight,
                prefix, c->blockio_weight,
                prefix, c->startup_blockio_weight,
                prefix, c->memory_low,
                prefix, c->memory_high,
                prefix, c->memory_max,
                prefix, c->memory_swap_max,
                prefix, c->memory_limit,
                prefix, c->tasks_max,
                prefix, cgroup_device_policy_to_string(c->device_policy),
                prefix, yes_no(c->delegate));

        if (c->delegate) {
                _cleanup_free_ char *t = NULL;

                (void) cg_mask_to_string(c->delegate_controllers, &t);

                fprintf(f, "%sDelegateControllers=%s\n",
                        prefix,
                        strempty(t));
        }

        LIST_FOREACH(device_allow, a, c->device_allow)
                fprintf(f,
                        "%sDeviceAllow=%s %s%s%s\n",
                        prefix,
                        a->path,
                        a->r ? "r" : "", a->w ? "w" : "", a->m ? "m" : "");

        LIST_FOREACH(device_weights, iw, c->io_device_weights)
                fprintf(f,
                        "%sIODeviceWeight=%s %" PRIu64 "\n",
                        prefix,
                        iw->path,
                        iw->weight);

        LIST_FOREACH(device_limits, il, c->io_device_limits) {
                char buf[FORMAT_BYTES_MAX];
                CGroupIOLimitType type;

                for (type = 0; type < _CGROUP_IO_LIMIT_TYPE_MAX; type++)
                        if (il->limits[type] != cgroup_io_limit_defaults[type])
                                fprintf(f,
                                        "%s%s=%s %s\n",
                                        prefix,
                                        cgroup_io_limit_type_to_string(type),
                                        il->path,
                                        format_bytes(buf, sizeof(buf), il->limits[type]));
        }

        LIST_FOREACH(device_weights, w, c->blockio_device_weights)
                fprintf(f,
                        "%sBlockIODeviceWeight=%s %" PRIu64 "\n",
                        prefix,
                        w->path,
                        w->weight);

        LIST_FOREACH(device_bandwidths, b, c->blockio_device_bandwidths) {
                char buf[FORMAT_BYTES_MAX];

                if (b->rbps != CGROUP_LIMIT_MAX)
                        fprintf(f,
                                "%sBlockIOReadBandwidth=%s %s\n",
                                prefix,
                                b->path,
                                format_bytes(buf, sizeof(buf), b->rbps));
                if (b->wbps != CGROUP_LIMIT_MAX)
                        fprintf(f,
                                "%sBlockIOWriteBandwidth=%s %s\n",
                                prefix,
                                b->path,
                                format_bytes(buf, sizeof(buf), b->wbps));
        }

        LIST_FOREACH(items, iaai, c->ip_address_allow) {
                _cleanup_free_ char *k = NULL;

                (void) in_addr_to_string(iaai->family, &iaai->address, &k);
                fprintf(f, "%sIPAddressAllow=%s/%u\n", prefix, strnull(k), iaai->prefixlen);
        }

        LIST_FOREACH(items, iaai, c->ip_address_deny) {
                _cleanup_free_ char *k = NULL;

                (void) in_addr_to_string(iaai->family, &iaai->address, &k);
                fprintf(f, "%sIPAddressDeny=%s/%u\n", prefix, strnull(k), iaai->prefixlen);
        }
}
static int lookup_block_device(const char *p, dev_t *dev) {
        struct stat st;

        if (stat(p, &st) < 0)
                return log_warning_errno(errno, "Couldn't stat device %s: %m", p);

        if (S_ISBLK(st.st_mode))
                *dev = st.st_rdev;
        else if (major(st.st_dev) != 0) {
                /* If this is not a device node then find the block
                 * device this file is stored on */
                *dev = st.st_dev;

                /* If this is a partition, try to get the originating
                 * block device */
                (void) block_get_whole_disk(*dev, dev);
        } else {
                log_warning("%s is not a block device and file system block device cannot be determined or is not local.", p);
                return -ENODEV;
        }

        return 0;
}
static int whitelist_device(const char *path, const char *node, const char *acc) {
        char buf[2+DECIMAL_STR_MAX(dev_t)*2+2+4];
        struct stat st;
        bool ignore_notfound;
        int r;

        if (node[0] == '-') {
                /* Non-existent paths starting with "-" must be silently ignored */
                node++;
                ignore_notfound = true;
        } else
                ignore_notfound = false;

        if (stat(node, &st) < 0) {
                if (errno == ENOENT && ignore_notfound)
                        return 0;

                return log_warning_errno(errno, "Couldn't stat device %s: %m", node);
        }

        if (!S_ISCHR(st.st_mode) && !S_ISBLK(st.st_mode)) {
                log_warning("%s is not a device.", node);
                return -ENODEV;
        }

        sprintf(buf,
                "%c %u:%u %s",
                S_ISCHR(st.st_mode) ? 'c' : 'b',
                major(st.st_rdev), minor(st.st_rdev),
                acc);

        r = cg_set_attribute("devices", path, "devices.allow", buf);
        if (r < 0)
                log_full_errno(IN_SET(r, -ENOENT, -EROFS, -EINVAL, -EACCES) ? LOG_DEBUG : LOG_WARNING, r,
                               "Failed to set devices.allow on %s: %m", path);

        return r;
}
static int whitelist_major(const char *path, const char *name, char type, const char *acc) {
        _cleanup_fclose_ FILE *f = NULL;
        char line[LINE_MAX];
        bool good = false;
        int r;

        assert(IN_SET(type, 'b', 'c'));

        f = fopen("/proc/devices", "re");
        if (!f)
                return log_warning_errno(errno, "Cannot open /proc/devices to resolve %s (%c): %m", name, type);

        FOREACH_LINE(line, f, goto fail) {
                char buf[2+DECIMAL_STR_MAX(unsigned)+3+4], *p, *w;
                unsigned maj;

                truncate_nl(line);

                if (type == 'c' && streq(line, "Character devices:")) {
                        good = true;
                        continue;
                }

                if (type == 'b' && streq(line, "Block devices:")) {
                        good = true;
                        continue;
                }

                if (isempty(line)) {
                        good = false;
                        continue;
                }

                if (!good)
                        continue;

                p = strstrip(line);

                w = strpbrk(p, WHITESPACE);
                if (!w)
                        continue;
                *w = 0;

                r = safe_atou(p, &maj);
                if (r < 0 || maj <= 0)
                        continue;

                w++;
                w += strspn(w, WHITESPACE);

                if (fnmatch(name, w, 0) != 0)
                        continue;

                sprintf(buf, "%c %u:* %s", type, maj, acc);

                r = cg_set_attribute("devices", path, "devices.allow", buf);
                if (r < 0)
                        log_full_errno(IN_SET(r, -ENOENT, -EROFS, -EINVAL, -EACCES) ? LOG_DEBUG : LOG_WARNING, r,
                                       "Failed to set devices.allow on %s: %m", path);
        }

        return 0;

fail:
        return log_warning_errno(errno, "Failed to read /proc/devices: %m");
}
static bool cgroup_context_has_cpu_weight(CGroupContext *c) {
        return c->cpu_weight != CGROUP_WEIGHT_INVALID ||
                c->startup_cpu_weight != CGROUP_WEIGHT_INVALID;
}

static bool cgroup_context_has_cpu_shares(CGroupContext *c) {
        return c->cpu_shares != CGROUP_CPU_SHARES_INVALID ||
                c->startup_cpu_shares != CGROUP_CPU_SHARES_INVALID;
}

static uint64_t cgroup_context_cpu_weight(CGroupContext *c, ManagerState state) {
        if (IN_SET(state, MANAGER_STARTING, MANAGER_INITIALIZING) &&
            c->startup_cpu_weight != CGROUP_WEIGHT_INVALID)
                return c->startup_cpu_weight;
        else if (c->cpu_weight != CGROUP_WEIGHT_INVALID)
                return c->cpu_weight;
        else
                return CGROUP_WEIGHT_DEFAULT;
}

static uint64_t cgroup_context_cpu_shares(CGroupContext *c, ManagerState state) {
        if (IN_SET(state, MANAGER_STARTING, MANAGER_INITIALIZING) &&
            c->startup_cpu_shares != CGROUP_CPU_SHARES_INVALID)
                return c->startup_cpu_shares;
        else if (c->cpu_shares != CGROUP_CPU_SHARES_INVALID)
                return c->cpu_shares;
        else
                return CGROUP_CPU_SHARES_DEFAULT;
}
static void cgroup_apply_unified_cpu_config(Unit *u, uint64_t weight, uint64_t quota) {
        char buf[MAX(DECIMAL_STR_MAX(uint64_t) + 1, (DECIMAL_STR_MAX(usec_t) + 1) * 2)];
        int r;

        xsprintf(buf, "%" PRIu64 "\n", weight);
        r = cg_set_attribute("cpu", u->cgroup_path, "cpu.weight", buf);
        if (r < 0)
                log_unit_full(u, IN_SET(r, -ENOENT, -EROFS, -EACCES) ? LOG_DEBUG : LOG_WARNING, r,
                              "Failed to set cpu.weight: %m");

        if (quota != USEC_INFINITY)
                xsprintf(buf, USEC_FMT " " USEC_FMT "\n",
                         quota * CGROUP_CPU_QUOTA_PERIOD_USEC / USEC_PER_SEC, CGROUP_CPU_QUOTA_PERIOD_USEC);
        else
                xsprintf(buf, "max " USEC_FMT "\n", CGROUP_CPU_QUOTA_PERIOD_USEC);

        r = cg_set_attribute("cpu", u->cgroup_path, "cpu.max", buf);
        if (r < 0)
                log_unit_full(u, IN_SET(r, -ENOENT, -EROFS, -EACCES) ? LOG_DEBUG : LOG_WARNING, r,
                              "Failed to set cpu.max: %m");
}
static void cgroup_apply_legacy_cpu_config(Unit *u, uint64_t shares, uint64_t quota) {
        char buf[MAX(DECIMAL_STR_MAX(uint64_t), DECIMAL_STR_MAX(usec_t)) + 1];
        int r;

        xsprintf(buf, "%" PRIu64 "\n", shares);
        r = cg_set_attribute("cpu", u->cgroup_path, "cpu.shares", buf);
        if (r < 0)
                log_unit_full(u, IN_SET(r, -ENOENT, -EROFS, -EACCES) ? LOG_DEBUG : LOG_WARNING, r,
                              "Failed to set cpu.shares: %m");

        xsprintf(buf, USEC_FMT "\n", CGROUP_CPU_QUOTA_PERIOD_USEC);
        r = cg_set_attribute("cpu", u->cgroup_path, "cpu.cfs_period_us", buf);
        if (r < 0)
                log_unit_full(u, IN_SET(r, -ENOENT, -EROFS, -EACCES) ? LOG_DEBUG : LOG_WARNING, r,
                              "Failed to set cpu.cfs_period_us: %m");

        if (quota != USEC_INFINITY) {
                xsprintf(buf, USEC_FMT "\n", quota * CGROUP_CPU_QUOTA_PERIOD_USEC / USEC_PER_SEC);
                r = cg_set_attribute("cpu", u->cgroup_path, "cpu.cfs_quota_us", buf);
        } else
                r = cg_set_attribute("cpu", u->cgroup_path, "cpu.cfs_quota_us", "-1");
        if (r < 0)
                log_unit_full(u, IN_SET(r, -ENOENT, -EROFS, -EACCES) ? LOG_DEBUG : LOG_WARNING, r,
                              "Failed to set cpu.cfs_quota_us: %m");
}
static uint64_t cgroup_cpu_shares_to_weight(uint64_t shares) {
        return CLAMP(shares * CGROUP_WEIGHT_DEFAULT / CGROUP_CPU_SHARES_DEFAULT,
                     CGROUP_WEIGHT_MIN, CGROUP_WEIGHT_MAX);
}

static uint64_t cgroup_cpu_weight_to_shares(uint64_t weight) {
        return CLAMP(weight * CGROUP_CPU_SHARES_DEFAULT / CGROUP_WEIGHT_DEFAULT,
                     CGROUP_CPU_SHARES_MIN, CGROUP_CPU_SHARES_MAX);
}
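
/* Worked example (illustrative, not part of the original source): with the
 * stock constants CGROUP_CPU_SHARES_DEFAULT=1024 and CGROUP_WEIGHT_DEFAULT=100,
 * legacy CPUShares=2048 maps to unified CPUWeight=200, and CPUWeight=50 maps
 * back to CPUShares=512; CLAMP() keeps the result inside each attribute's
 * valid range. */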
static bool cgroup_context_has_io_config(CGroupContext *c) {
        return c->io_accounting ||
                c->io_weight != CGROUP_WEIGHT_INVALID ||
                c->startup_io_weight != CGROUP_WEIGHT_INVALID ||
                c->io_device_weights ||
                c->io_device_limits;
}

static bool cgroup_context_has_blockio_config(CGroupContext *c) {
        return c->blockio_accounting ||
                c->blockio_weight != CGROUP_BLKIO_WEIGHT_INVALID ||
                c->startup_blockio_weight != CGROUP_BLKIO_WEIGHT_INVALID ||
                c->blockio_device_weights ||
                c->blockio_device_bandwidths;
}
static uint64_t cgroup_context_io_weight(CGroupContext *c, ManagerState state) {
        if (IN_SET(state, MANAGER_STARTING, MANAGER_INITIALIZING) &&
            c->startup_io_weight != CGROUP_WEIGHT_INVALID)
                return c->startup_io_weight;
        else if (c->io_weight != CGROUP_WEIGHT_INVALID)
                return c->io_weight;
        else
                return CGROUP_WEIGHT_DEFAULT;
}

static uint64_t cgroup_context_blkio_weight(CGroupContext *c, ManagerState state) {
        if (IN_SET(state, MANAGER_STARTING, MANAGER_INITIALIZING) &&
            c->startup_blockio_weight != CGROUP_BLKIO_WEIGHT_INVALID)
                return c->startup_blockio_weight;
        else if (c->blockio_weight != CGROUP_BLKIO_WEIGHT_INVALID)
                return c->blockio_weight;
        else
                return CGROUP_BLKIO_WEIGHT_DEFAULT;
}
static uint64_t cgroup_weight_blkio_to_io(uint64_t blkio_weight) {
        return CLAMP(blkio_weight * CGROUP_WEIGHT_DEFAULT / CGROUP_BLKIO_WEIGHT_DEFAULT,
                     CGROUP_WEIGHT_MIN, CGROUP_WEIGHT_MAX);
}

static uint64_t cgroup_weight_io_to_blkio(uint64_t io_weight) {
        return CLAMP(io_weight * CGROUP_BLKIO_WEIGHT_DEFAULT / CGROUP_WEIGHT_DEFAULT,
                     CGROUP_BLKIO_WEIGHT_MIN, CGROUP_BLKIO_WEIGHT_MAX);
}
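
/* Worked example (illustrative, not part of the original source): with
 * CGROUP_BLKIO_WEIGHT_DEFAULT=500 and CGROUP_WEIGHT_DEFAULT=100 the two
 * defaults map onto each other, and BlockIOWeight=1000 converts to
 * IOWeight=200 (and IOWeight=200 back to BlockIOWeight=1000). */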
static void cgroup_apply_io_device_weight(Unit *u, const char *dev_path, uint64_t io_weight) {
        char buf[DECIMAL_STR_MAX(dev_t)*2+2+DECIMAL_STR_MAX(uint64_t)+1];
        dev_t dev;
        int r;

        r = lookup_block_device(dev_path, &dev);
        if (r < 0)
                return;

        xsprintf(buf, "%u:%u %" PRIu64 "\n", major(dev), minor(dev), io_weight);
        r = cg_set_attribute("io", u->cgroup_path, "io.weight", buf);
        if (r < 0)
                log_unit_full(u, IN_SET(r, -ENOENT, -EROFS, -EACCES) ? LOG_DEBUG : LOG_WARNING, r,
                              "Failed to set io.weight: %m");
}
static void cgroup_apply_blkio_device_weight(Unit *u, const char *dev_path, uint64_t blkio_weight) {
        char buf[DECIMAL_STR_MAX(dev_t)*2+2+DECIMAL_STR_MAX(uint64_t)+1];
        dev_t dev;
        int r;

        r = lookup_block_device(dev_path, &dev);
        if (r < 0)
                return;

        xsprintf(buf, "%u:%u %" PRIu64 "\n", major(dev), minor(dev), blkio_weight);
        r = cg_set_attribute("blkio", u->cgroup_path, "blkio.weight_device", buf);
        if (r < 0)
                log_unit_full(u, IN_SET(r, -ENOENT, -EROFS, -EACCES) ? LOG_DEBUG : LOG_WARNING, r,
                              "Failed to set blkio.weight_device: %m");
}
static unsigned cgroup_apply_io_device_limit(Unit *u, const char *dev_path, uint64_t *limits) {
        char limit_bufs[_CGROUP_IO_LIMIT_TYPE_MAX][DECIMAL_STR_MAX(uint64_t)];
        char buf[DECIMAL_STR_MAX(dev_t)*2+2+(6+DECIMAL_STR_MAX(uint64_t)+1)*4];
        CGroupIOLimitType type;
        dev_t dev;
        unsigned n = 0;
        int r;

        r = lookup_block_device(dev_path, &dev);
        if (r < 0)
                return 0;

        for (type = 0; type < _CGROUP_IO_LIMIT_TYPE_MAX; type++) {
                if (limits[type] != cgroup_io_limit_defaults[type]) {
                        xsprintf(limit_bufs[type], "%" PRIu64, limits[type]);
                        n++;
                } else
                        xsprintf(limit_bufs[type], "%s", limits[type] == CGROUP_LIMIT_MAX ? "max" : "0");
        }

        xsprintf(buf, "%u:%u rbps=%s wbps=%s riops=%s wiops=%s\n", major(dev), minor(dev),
                 limit_bufs[CGROUP_IO_RBPS_MAX], limit_bufs[CGROUP_IO_WBPS_MAX],
                 limit_bufs[CGROUP_IO_RIOPS_MAX], limit_bufs[CGROUP_IO_WIOPS_MAX]);
        r = cg_set_attribute("io", u->cgroup_path, "io.max", buf);
        if (r < 0)
                log_unit_full(u, IN_SET(r, -ENOENT, -EROFS, -EACCES) ? LOG_DEBUG : LOG_WARNING, r,
                              "Failed to set io.max: %m");

        return n;
}
static unsigned cgroup_apply_blkio_device_limit(Unit *u, const char *dev_path, uint64_t rbps, uint64_t wbps) {
        char buf[DECIMAL_STR_MAX(dev_t)*2+2+DECIMAL_STR_MAX(uint64_t)+1];
        dev_t dev;
        unsigned n = 0;
        int r;

        r = lookup_block_device(dev_path, &dev);
        if (r < 0)
                return 0;

        if (rbps != CGROUP_LIMIT_MAX)
                n++;
        sprintf(buf, "%u:%u %" PRIu64 "\n", major(dev), minor(dev), rbps);
        r = cg_set_attribute("blkio", u->cgroup_path, "blkio.throttle.read_bps_device", buf);
        if (r < 0)
                log_unit_full(u, IN_SET(r, -ENOENT, -EROFS, -EACCES) ? LOG_DEBUG : LOG_WARNING, r,
                              "Failed to set blkio.throttle.read_bps_device: %m");

        if (wbps != CGROUP_LIMIT_MAX)
                n++;
        sprintf(buf, "%u:%u %" PRIu64 "\n", major(dev), minor(dev), wbps);
        r = cg_set_attribute("blkio", u->cgroup_path, "blkio.throttle.write_bps_device", buf);
        if (r < 0)
                log_unit_full(u, IN_SET(r, -ENOENT, -EROFS, -EACCES) ? LOG_DEBUG : LOG_WARNING, r,
                              "Failed to set blkio.throttle.write_bps_device: %m");

        return n;
}
static bool cgroup_context_has_unified_memory_config(CGroupContext *c) {
        return c->memory_low > 0 || c->memory_high != CGROUP_LIMIT_MAX || c->memory_max != CGROUP_LIMIT_MAX || c->memory_swap_max != CGROUP_LIMIT_MAX;
}
static void cgroup_apply_unified_memory_limit(Unit *u, const char *file, uint64_t v) {
        char buf[DECIMAL_STR_MAX(uint64_t) + 1] = "max";
        int r;

        if (v != CGROUP_LIMIT_MAX)
                xsprintf(buf, "%" PRIu64 "\n", v);

        r = cg_set_attribute("memory", u->cgroup_path, file, buf);
        if (r < 0)
                log_unit_full(u, IN_SET(r, -ENOENT, -EROFS, -EACCES) ? LOG_DEBUG : LOG_WARNING, r,
                              "Failed to set %s: %m", file);
}
static void cgroup_apply_firewall(Unit *u) {
        int r;

        assert(u);

        if (u->type == UNIT_SLICE) /* Skip this for slice units, they are inner cgroup nodes, and since bpf/cgroup is
                                    * not recursive we don't ever touch the bpf on them */
                return;

        r = bpf_firewall_compile(u);
        if (r < 0)
                return;

        (void) bpf_firewall_install(u);
}
static void cgroup_context_apply(
                Unit *u,
                CGroupMask apply_mask,
                bool apply_bpf,
                ManagerState state) {

        const char *path;
        CGroupContext *c;
        bool is_root;
        int r;

        assert(u);

        c = unit_get_cgroup_context(u);
        path = u->cgroup_path;

        assert(c);
        assert(path);

        /* Nothing to do? Exit early! */
        if (apply_mask == 0 && !apply_bpf)
                return;

        /* Some cgroup attributes are not supported on the root cgroup,
         * hence silently ignore */
        is_root = isempty(path) || path_equal(path, "/");
        if (is_root)
                /* Make sure we don't try to display messages with an empty path. */
                path = "/";

        /* We generally ignore errors caused by read-only mounted
         * cgroup trees (assuming we are running in a container then),
         * and missing cgroups, i.e. EROFS and ENOENT. */

        if ((apply_mask & CGROUP_MASK_CPU) && !is_root) {
                bool has_weight, has_shares;

                has_weight = cgroup_context_has_cpu_weight(c);
                has_shares = cgroup_context_has_cpu_shares(c);

                if (cg_all_unified() > 0) {
                        uint64_t weight;

                        if (has_weight)
                                weight = cgroup_context_cpu_weight(c, state);
                        else if (has_shares) {
                                uint64_t shares = cgroup_context_cpu_shares(c, state);

                                weight = cgroup_cpu_shares_to_weight(shares);

                                log_cgroup_compat(u, "Applying [Startup]CPUShares %" PRIu64 " as [Startup]CPUWeight %" PRIu64 " on %s",
                                                  shares, weight, path);
                        } else
                                weight = CGROUP_WEIGHT_DEFAULT;

                        cgroup_apply_unified_cpu_config(u, weight, c->cpu_quota_per_sec_usec);
                } else {
                        uint64_t shares;

                        if (has_weight) {
                                uint64_t weight = cgroup_context_cpu_weight(c, state);

                                shares = cgroup_cpu_weight_to_shares(weight);

                                log_cgroup_compat(u, "Applying [Startup]CPUWeight %" PRIu64 " as [Startup]CPUShares %" PRIu64 " on %s",
                                                  weight, shares, path);
                        } else if (has_shares)
                                shares = cgroup_context_cpu_shares(c, state);
                        else
                                shares = CGROUP_CPU_SHARES_DEFAULT;

                        cgroup_apply_legacy_cpu_config(u, shares, c->cpu_quota_per_sec_usec);
                }
        }
        if (apply_mask & CGROUP_MASK_IO) {
                bool has_io = cgroup_context_has_io_config(c);
                bool has_blockio = cgroup_context_has_blockio_config(c);

                if (!is_root) {
                        char buf[8+DECIMAL_STR_MAX(uint64_t)+1];
                        uint64_t weight;

                        if (has_io)
                                weight = cgroup_context_io_weight(c, state);
                        else if (has_blockio) {
                                uint64_t blkio_weight = cgroup_context_blkio_weight(c, state);

                                weight = cgroup_weight_blkio_to_io(blkio_weight);

                                log_cgroup_compat(u, "Applying [Startup]BlockIOWeight %" PRIu64 " as [Startup]IOWeight %" PRIu64,
                                                  blkio_weight, weight);
                        } else
                                weight = CGROUP_WEIGHT_DEFAULT;

                        xsprintf(buf, "default %" PRIu64 "\n", weight);
                        r = cg_set_attribute("io", path, "io.weight", buf);
                        if (r < 0)
                                log_unit_full(u, IN_SET(r, -ENOENT, -EROFS, -EACCES) ? LOG_DEBUG : LOG_WARNING, r,
                                              "Failed to set io.weight: %m");

                        if (has_io) {
                                CGroupIODeviceWeight *w;

                                /* FIXME: no way to reset this list */
                                LIST_FOREACH(device_weights, w, c->io_device_weights)
                                        cgroup_apply_io_device_weight(u, w->path, w->weight);
                        } else if (has_blockio) {
                                CGroupBlockIODeviceWeight *w;

                                /* FIXME: no way to reset this list */
                                LIST_FOREACH(device_weights, w, c->blockio_device_weights) {
                                        weight = cgroup_weight_blkio_to_io(w->weight);

                                        log_cgroup_compat(u, "Applying BlockIODeviceWeight %" PRIu64 " as IODeviceWeight %" PRIu64 " for %s",
                                                          w->weight, weight, w->path);

                                        cgroup_apply_io_device_weight(u, w->path, weight);
                                }
                        }
                }

                /* Apply limits and free ones without config. */
                if (has_io) {
                        CGroupIODeviceLimit *l, *next;

                        LIST_FOREACH_SAFE(device_limits, l, next, c->io_device_limits) {
                                if (!cgroup_apply_io_device_limit(u, l->path, l->limits))
                                        cgroup_context_free_io_device_limit(c, l);
                        }
                } else if (has_blockio) {
                        CGroupBlockIODeviceBandwidth *b, *next;

                        LIST_FOREACH_SAFE(device_bandwidths, b, next, c->blockio_device_bandwidths) {
                                uint64_t limits[_CGROUP_IO_LIMIT_TYPE_MAX];
                                CGroupIOLimitType type;

                                for (type = 0; type < _CGROUP_IO_LIMIT_TYPE_MAX; type++)
                                        limits[type] = cgroup_io_limit_defaults[type];

                                limits[CGROUP_IO_RBPS_MAX] = b->rbps;
                                limits[CGROUP_IO_WBPS_MAX] = b->wbps;

                                log_cgroup_compat(u, "Applying BlockIO{Read|Write}Bandwidth %" PRIu64 " %" PRIu64 " as IO{Read|Write}BandwidthMax for %s",
                                                  b->rbps, b->wbps, b->path);

                                if (!cgroup_apply_io_device_limit(u, b->path, limits))
                                        cgroup_context_free_blockio_device_bandwidth(c, b);
                        }
                }
        }
        if (apply_mask & CGROUP_MASK_BLKIO) {
                bool has_io = cgroup_context_has_io_config(c);
                bool has_blockio = cgroup_context_has_blockio_config(c);

                if (!is_root) {
                        char buf[DECIMAL_STR_MAX(uint64_t)+1];
                        uint64_t weight;

                        if (has_io) {
                                uint64_t io_weight = cgroup_context_io_weight(c, state);

                                weight = cgroup_weight_io_to_blkio(io_weight);

                                log_cgroup_compat(u, "Applying [Startup]IOWeight %" PRIu64 " as [Startup]BlockIOWeight %" PRIu64,
                                                  io_weight, weight);
                        } else if (has_blockio)
                                weight = cgroup_context_blkio_weight(c, state);
                        else
                                weight = CGROUP_BLKIO_WEIGHT_DEFAULT;

                        xsprintf(buf, "%" PRIu64 "\n", weight);
                        r = cg_set_attribute("blkio", path, "blkio.weight", buf);
                        if (r < 0)
                                log_unit_full(u, IN_SET(r, -ENOENT, -EROFS, -EACCES) ? LOG_DEBUG : LOG_WARNING, r,
                                              "Failed to set blkio.weight: %m");

                        if (has_io) {
                                CGroupIODeviceWeight *w;

                                /* FIXME: no way to reset this list */
                                LIST_FOREACH(device_weights, w, c->io_device_weights) {
                                        weight = cgroup_weight_io_to_blkio(w->weight);

                                        log_cgroup_compat(u, "Applying IODeviceWeight %" PRIu64 " as BlockIODeviceWeight %" PRIu64 " for %s",
                                                          w->weight, weight, w->path);

                                        cgroup_apply_blkio_device_weight(u, w->path, weight);
                                }
                        } else if (has_blockio) {
                                CGroupBlockIODeviceWeight *w;

                                /* FIXME: no way to reset this list */
                                LIST_FOREACH(device_weights, w, c->blockio_device_weights)
                                        cgroup_apply_blkio_device_weight(u, w->path, w->weight);
                        }
                }

                /* Apply limits and free ones without config. */
                if (has_io) {
                        CGroupIODeviceLimit *l, *next;

                        LIST_FOREACH_SAFE(device_limits, l, next, c->io_device_limits) {
                                log_cgroup_compat(u, "Applying IO{Read|Write}Bandwidth %" PRIu64 " %" PRIu64 " as BlockIO{Read|Write}BandwidthMax for %s",
                                                  l->limits[CGROUP_IO_RBPS_MAX], l->limits[CGROUP_IO_WBPS_MAX], l->path);

                                if (!cgroup_apply_blkio_device_limit(u, l->path, l->limits[CGROUP_IO_RBPS_MAX], l->limits[CGROUP_IO_WBPS_MAX]))
                                        cgroup_context_free_io_device_limit(c, l);
                        }
                } else if (has_blockio) {
                        CGroupBlockIODeviceBandwidth *b, *next;

                        LIST_FOREACH_SAFE(device_bandwidths, b, next, c->blockio_device_bandwidths)
                                if (!cgroup_apply_blkio_device_limit(u, b->path, b->rbps, b->wbps))
                                        cgroup_context_free_blockio_device_bandwidth(c, b);
                }
        }
        if ((apply_mask & CGROUP_MASK_MEMORY) && !is_root) {
                if (cg_all_unified() > 0) {
                        uint64_t max, swap_max = CGROUP_LIMIT_MAX;

                        if (cgroup_context_has_unified_memory_config(c)) {
                                max = c->memory_max;
                                swap_max = c->memory_swap_max;
                        } else {
                                max = c->memory_limit;

                                if (max != CGROUP_LIMIT_MAX)
                                        log_cgroup_compat(u, "Applying MemoryLimit %" PRIu64 " as MemoryMax", max);
                        }

                        cgroup_apply_unified_memory_limit(u, "memory.low", c->memory_low);
                        cgroup_apply_unified_memory_limit(u, "memory.high", c->memory_high);
                        cgroup_apply_unified_memory_limit(u, "memory.max", max);
                        cgroup_apply_unified_memory_limit(u, "memory.swap.max", swap_max);
                } else {
                        char buf[DECIMAL_STR_MAX(uint64_t) + 1];
                        uint64_t val;

                        if (cgroup_context_has_unified_memory_config(c)) {
                                val = c->memory_max;
                                log_cgroup_compat(u, "Applying MemoryMax %" PRIu64 " as MemoryLimit", val);
                        } else
                                val = c->memory_limit;

                        if (val == CGROUP_LIMIT_MAX)
                                strncpy(buf, "-1\n", sizeof(buf));
                        else
                                xsprintf(buf, "%" PRIu64 "\n", val);

                        r = cg_set_attribute("memory", path, "memory.limit_in_bytes", buf);
                        if (r < 0)
                                log_unit_full(u, IN_SET(r, -ENOENT, -EROFS, -EACCES) ? LOG_DEBUG : LOG_WARNING, r,
                                              "Failed to set memory.limit_in_bytes: %m");
                }
        }
        if ((apply_mask & CGROUP_MASK_DEVICES) && !is_root) {
                CGroupDeviceAllow *a;

                /* Changing the devices list of a populated cgroup
                 * might result in EINVAL, hence ignore EINVAL
                 * here. */

                if (c->device_allow || c->device_policy != CGROUP_AUTO)
                        r = cg_set_attribute("devices", path, "devices.deny", "a");
                else
                        r = cg_set_attribute("devices", path, "devices.allow", "a");
                if (r < 0)
                        log_unit_full(u, IN_SET(r, -ENOENT, -EROFS, -EINVAL, -EACCES) ? LOG_DEBUG : LOG_WARNING, r,
                                      "Failed to reset devices.list: %m");

                if (c->device_policy == CGROUP_CLOSED ||
                    (c->device_policy == CGROUP_AUTO && c->device_allow)) {
                        static const char auto_devices[] =
                                "/dev/null\0" "rwm\0"
                                "/dev/zero\0" "rwm\0"
                                "/dev/full\0" "rwm\0"
                                "/dev/random\0" "rwm\0"
                                "/dev/urandom\0" "rwm\0"
                                "/dev/tty\0" "rwm\0"
                                "/dev/ptmx\0" "rwm\0"
                                /* Allow /run/systemd/inaccessible/{chr,blk} devices for mapping InaccessiblePaths */
                                "-/run/systemd/inaccessible/chr\0" "rwm\0"
                                "-/run/systemd/inaccessible/blk\0" "rwm\0";

                        const char *x, *y;

                        NULSTR_FOREACH_PAIR(x, y, auto_devices)
                                whitelist_device(path, x, y);

                        /* PTS (/dev/pts) devices may not be duplicated, but accessed */
                        whitelist_major(path, "pts", 'c', "rw");
                }

                LIST_FOREACH(device_allow, a, c->device_allow) {
                        char acc[4], *val;
                        unsigned k = 0;

                        if (a->r)
                                acc[k++] = 'r';
                        if (a->w)
                                acc[k++] = 'w';
                        if (a->m)
                                acc[k++] = 'm';

                        if (k == 0)
                                continue;

                        acc[k++] = 0;

                        if (path_startswith(a->path, "/dev/"))
                                whitelist_device(path, a->path, acc);
                        else if ((val = startswith(a->path, "block-")))
                                whitelist_major(path, val, 'b', acc);
                        else if ((val = startswith(a->path, "char-")))
                                whitelist_major(path, val, 'c', acc);
                        else
                                log_unit_debug(u, "Ignoring device %s while writing cgroup attribute.", a->path);
                }
        }
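
        /* Illustrative example (not part of the original source):
         * DeviceAllow=block-loop rw takes the "block-" branch and calls
         * whitelist_major(path, "loop", 'b', "rw"), which resolves the major
         * number via /proc/devices (7 on a typical system) and writes
         * "b 7:* rw" to devices.allow. */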
        if ((apply_mask & CGROUP_MASK_PIDS) && !is_root) {

                if (c->tasks_max != CGROUP_LIMIT_MAX) {
                        char buf[DECIMAL_STR_MAX(uint64_t) + 2];

                        sprintf(buf, "%" PRIu64 "\n", c->tasks_max);
                        r = cg_set_attribute("pids", path, "pids.max", buf);
                } else
                        r = cg_set_attribute("pids", path, "pids.max", "max");
                if (r < 0)
                        log_unit_full(u, IN_SET(r, -ENOENT, -EROFS, -EACCES) ? LOG_DEBUG : LOG_WARNING, r,
                                      "Failed to set pids.max: %m");
        }

        if (apply_bpf)
                cgroup_apply_firewall(u);
}
CGroupMask cgroup_context_get_mask(CGroupContext *c) {
        CGroupMask mask = 0;

        /* Figure out which controllers we need */

        if (c->cpu_accounting ||
            cgroup_context_has_cpu_weight(c) ||
            cgroup_context_has_cpu_shares(c) ||
            c->cpu_quota_per_sec_usec != USEC_INFINITY)
                mask |= CGROUP_MASK_CPUACCT | CGROUP_MASK_CPU;

        if (cgroup_context_has_io_config(c) || cgroup_context_has_blockio_config(c))
                mask |= CGROUP_MASK_IO | CGROUP_MASK_BLKIO;

        if (c->memory_accounting ||
            c->memory_limit != CGROUP_LIMIT_MAX ||
            cgroup_context_has_unified_memory_config(c))
                mask |= CGROUP_MASK_MEMORY;

        if (c->device_allow ||
            c->device_policy != CGROUP_AUTO)
                mask |= CGROUP_MASK_DEVICES;

        if (c->tasks_accounting ||
            c->tasks_max != (uint64_t) -1)
                mask |= CGROUP_MASK_PIDS;

        return mask;
}
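
/* Illustrative example (not part of the original source): a context with only
 * TasksMax= and MemoryMax= configured yields
 *     CGROUP_MASK_PIDS | CGROUP_MASK_MEMORY,
 * so only the pids and memory controllers need to be realized for the unit. */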
CGroupMask unit_get_own_mask(Unit *u) {
        CGroupContext *c;

        /* Returns the mask of controllers the unit needs for itself */

        c = unit_get_cgroup_context(u);
        if (!c)
                return 0;

        return cgroup_context_get_mask(c) | unit_get_delegate_mask(u);
}
CGroupMask unit_get_delegate_mask(Unit *u) {
        CGroupContext *c;

        /* If delegation is turned on, then turn on selected controllers, unless we are on the legacy hierarchy and the
         * process we fork into is known to drop privileges, and hence shouldn't get access to the controllers.
         *
         * Note that on the unified hierarchy it is safe to delegate controllers to unprivileged services. */

        if (u->type == UNIT_SLICE)
                return 0;

        c = unit_get_cgroup_context(u);
        if (!c || !c->delegate)
                return 0;

        if (cg_all_unified() <= 0) {
                ExecContext *e;

                e = unit_get_exec_context(u);
                if (e && !exec_context_maintains_privileges(e))
                        return 0;
        }

        return c->delegate_controllers;
}
CGroupMask unit_get_members_mask(Unit *u) {
        assert(u);

        /* Returns the mask of controllers all of the unit's children require, merged */

        if (u->cgroup_members_mask_valid)
                return u->cgroup_members_mask;

        u->cgroup_members_mask = 0;

        if (u->type == UNIT_SLICE) {
                void *v;
                Unit *member;
                Iterator i;

                HASHMAP_FOREACH_KEY(v, member, u->dependencies[UNIT_BEFORE], i) {

                        if (member == u)
                                continue;

                        if (UNIT_DEREF(member->slice) != u)
                                continue;

                        u->cgroup_members_mask |= unit_get_subtree_mask(member); /* note that this calls ourselves again, for the children */
                }
        }

        u->cgroup_members_mask_valid = true;
        return u->cgroup_members_mask;
}
CGroupMask unit_get_siblings_mask(Unit *u) {
        assert(u);

        /* Returns the mask of controllers all of the unit's siblings
         * require, i.e. the members mask of the unit's parent slice
         * if there is one. */

        if (UNIT_ISSET(u->slice))
                return unit_get_members_mask(UNIT_DEREF(u->slice));

        return unit_get_subtree_mask(u); /* we are the top-level slice */
}

CGroupMask unit_get_subtree_mask(Unit *u) {

        /* Returns the mask of this subtree, meaning of the group
         * itself and its children. */

        return unit_get_own_mask(u) | unit_get_members_mask(u);
}

CGroupMask unit_get_target_mask(Unit *u) {
        CGroupMask mask;

        /* This returns the cgroup mask of all controllers to enable
         * for a specific cgroup, i.e. everything it needs itself,
         * plus all that its children need, plus all that its siblings
         * need. This is primarily useful on the legacy cgroup
         * hierarchy, where we need to duplicate each cgroup in each
         * hierarchy that shall be enabled for it. */

        mask = unit_get_own_mask(u) | unit_get_members_mask(u) | unit_get_siblings_mask(u);
        mask &= u->manager->cgroup_supported;

        return mask;
}

CGroupMask unit_get_enable_mask(Unit *u) {
        CGroupMask mask;

        /* This returns the cgroup mask of all controllers to enable
         * for the children of a specific cgroup. This is primarily
         * useful for the unified cgroup hierarchy, where each cgroup
         * controls which controllers are enabled for its children. */

        mask = unit_get_members_mask(u);
        mask &= u->manager->cgroup_supported;

        return mask;
}
bool unit_get_needs_bpf(Unit *u) {
        CGroupContext *c;
        Unit *p;

        assert(u);

        /* We never attach BPF to slice units, as they are inner cgroup nodes and cgroup/BPF is not recursive at the
         * moment. */
        if (u->type == UNIT_SLICE)
                return false;

        c = unit_get_cgroup_context(u);
        if (!c)
                return false;

        if (c->ip_accounting ||
            c->ip_address_allow ||
            c->ip_address_deny)
                return true;

        /* If any parent slice has an IP access list defined, it applies too */
        for (p = UNIT_DEREF(u->slice); p; p = UNIT_DEREF(p->slice)) {
                c = unit_get_cgroup_context(p);
                if (!c)
                        return false;

                if (c->ip_address_allow ||
                    c->ip_address_deny)
                        return true;
        }

        return false;
}
/* Recurse from a unit up through its containing slices, propagating
 * mask bits upward. A unit is also member of itself. */
void unit_update_cgroup_members_masks(Unit *u) {
        CGroupMask m;
        bool more;

        assert(u);

        /* Calculate subtree mask */
        m = unit_get_subtree_mask(u);

        /* See if anything changed from the previous invocation. If
         * not, we're done. */
        if (u->cgroup_subtree_mask_valid && m == u->cgroup_subtree_mask)
                return;

        more =
                u->cgroup_subtree_mask_valid &&
                ((m & ~u->cgroup_subtree_mask) != 0) &&
                ((~m & u->cgroup_subtree_mask) == 0);

        u->cgroup_subtree_mask = m;
        u->cgroup_subtree_mask_valid = true;

        if (UNIT_ISSET(u->slice)) {
                Unit *s = UNIT_DEREF(u->slice);

                if (more)
                        /* There's more set now than before. We
                         * propagate the new mask to the parent's mask
                         * (not caring if it actually was valid or
                         * not) */
                        s->cgroup_members_mask |= m;
                else
                        /* There's less set now than before (or we
                         * don't know), we need to recalculate
                         * everything, so let's invalidate the
                         * parent's members mask */
                        s->cgroup_members_mask_valid = false;

                /* And now make sure that this change also hits our
                 * grandparents */
                unit_update_cgroup_members_masks(s);
        }
}
static const char *migrate_callback(CGroupMask mask, void *userdata) {
        Unit *u = userdata;

        assert(mask != 0);
        assert(u);

        while (u) {
                if (u->cgroup_path &&
                    u->cgroup_realized &&
                    (u->cgroup_realized_mask & mask) == mask)
                        return u->cgroup_path;

                u = UNIT_DEREF(u->slice);
        }

        return NULL;
}
char *unit_default_cgroup_path(Unit *u) {
        _cleanup_free_ char *escaped = NULL, *slice = NULL;
        int r;

        assert(u);

        if (unit_has_name(u, SPECIAL_ROOT_SLICE))
                return strdup(u->manager->cgroup_root);

        if (UNIT_ISSET(u->slice) && !unit_has_name(UNIT_DEREF(u->slice), SPECIAL_ROOT_SLICE)) {
                r = cg_slice_to_path(UNIT_DEREF(u->slice)->id, &slice);
                if (r < 0)
                        return NULL;
        }

        escaped = cg_escape(u->id);
        if (!escaped)
                return NULL;

        if (slice)
                return strjoin(u->manager->cgroup_root, "/", slice, "/",
                               escaped);
        else
                return strjoin(u->manager->cgroup_root, "/", escaped);
}
int unit_set_cgroup_path(Unit *u, const char *path) {
        _cleanup_free_ char *p = NULL;
        int r;

        assert(u);

        if (path) {
                p = strdup(path);
                if (!p)
                        return -ENOMEM;
        }

        if (streq_ptr(u->cgroup_path, p))
                return 0;

        if (p) {
                r = hashmap_put(u->manager->cgroup_unit, p, u);
                if (r < 0)
                        return r;
        }

        unit_release_cgroup(u);

        u->cgroup_path = p;
        p = NULL;

        return 0;
}
int unit_watch_cgroup(Unit *u) {
        _cleanup_free_ char *events = NULL;
        int r;

        assert(u);

        if (!u->cgroup_path)
                return 0;

        if (u->cgroup_inotify_wd >= 0)
                return 0;

        /* Only applies to the unified hierarchy */
        r = cg_unified_controller(SYSTEMD_CGROUP_CONTROLLER);
        if (r < 0)
                return log_error_errno(r, "Failed to determine whether the name=systemd hierarchy is unified: %m");
        if (r == 0)
                return 0;

        /* Don't watch the root slice, it's pointless. */
        if (unit_has_name(u, SPECIAL_ROOT_SLICE))
                return 0;

        r = hashmap_ensure_allocated(&u->manager->cgroup_inotify_wd_unit, &trivial_hash_ops);
        if (r < 0)
                return log_oom();

        r = cg_get_path(SYSTEMD_CGROUP_CONTROLLER, u->cgroup_path, "cgroup.events", &events);
        if (r < 0)
                return log_oom();

        u->cgroup_inotify_wd = inotify_add_watch(u->manager->cgroup_inotify_fd, events, IN_MODIFY);
        if (u->cgroup_inotify_wd < 0) {

                if (errno == ENOENT) /* If the directory is already
                                      * gone we don't need to track
                                      * it, so this is not an error */
                        return 0;

                return log_unit_error_errno(u, errno, "Failed to add inotify watch descriptor for control group %s: %m", u->cgroup_path);
        }

        r = hashmap_put(u->manager->cgroup_inotify_wd_unit, INT_TO_PTR(u->cgroup_inotify_wd), u);
        if (r < 0)
                return log_unit_error_errno(u, r, "Failed to add inotify watch descriptor to hash map: %m");

        return 0;
}
int unit_pick_cgroup_path(Unit *u) {
        _cleanup_free_ char *path = NULL;
        int r;

        assert(u);

        if (u->cgroup_path)
                return 0;

        if (!UNIT_HAS_CGROUP_CONTEXT(u))
                return -EINVAL;

        path = unit_default_cgroup_path(u);
        if (!path)
                return log_oom();

        r = unit_set_cgroup_path(u, path);
        if (r == -EEXIST)
                return log_unit_error_errno(u, r, "Control group %s exists already.", path);
        if (r < 0)
                return log_unit_error_errno(u, r, "Failed to set unit's control group path to %s: %m", path);

        return 0;
}
static int unit_create_cgroup(
                Unit *u,
                CGroupMask target_mask,
                CGroupMask enable_mask,
                bool needs_bpf) {

        CGroupContext *c;
        int r;

        assert(u);

        c = unit_get_cgroup_context(u);
        if (!c)
                return 0;

        /* Figure out our cgroup path */
        r = unit_pick_cgroup_path(u);
        if (r < 0)
                return r;

        /* First, create our own group */
        r = cg_create_everywhere(u->manager->cgroup_supported, target_mask, u->cgroup_path);
        if (r < 0)
                return log_unit_error_errno(u, r, "Failed to create cgroup %s: %m", u->cgroup_path);

        /* Start watching it */
        (void) unit_watch_cgroup(u);

        /* Enable all controllers we need */
        r = cg_enable_everywhere(u->manager->cgroup_supported, enable_mask, u->cgroup_path);
        if (r < 0)
                log_unit_warning_errno(u, r, "Failed to enable controllers on cgroup %s, ignoring: %m", u->cgroup_path);

        /* Keep track that this is now realized */
        u->cgroup_realized = true;
        u->cgroup_realized_mask = target_mask;
        u->cgroup_enabled_mask = enable_mask;
        u->cgroup_bpf_state = needs_bpf ? UNIT_CGROUP_BPF_ON : UNIT_CGROUP_BPF_OFF;

        if (u->type != UNIT_SLICE && !c->delegate) {

                /* Then, possibly move things over, but not if
                 * subgroups may contain processes, which is the case
                 * for slice and delegation units. */
                r = cg_migrate_everywhere(u->manager->cgroup_supported, u->cgroup_path, u->cgroup_path, migrate_callback, u);
                if (r < 0)
                        log_unit_warning_errno(u, r, "Failed to migrate cgroup processes to %s, ignoring: %m", u->cgroup_path);
        }

        return 0;
}
int unit_attach_pids_to_cgroup(Unit *u) {
        int r;

        assert(u);

        r = unit_realize_cgroup(u);
        if (r < 0)
                return r;

        r = cg_attach_many_everywhere(u->manager->cgroup_supported, u->cgroup_path, u->pids, migrate_callback, u);
        if (r < 0)
                return r;

        return 0;
}
static void cgroup_xattr_apply(Unit *u) {
        char ids[SD_ID128_STRING_MAX];
        int r;

        assert(u);

        if (!MANAGER_IS_SYSTEM(u->manager))
                return;

        if (sd_id128_is_null(u->invocation_id))
                return;

        r = cg_set_xattr(SYSTEMD_CGROUP_CONTROLLER, u->cgroup_path,
                         "trusted.invocation_id",
                         sd_id128_to_string(u->invocation_id, ids), 32,
                         0);
        if (r < 0)
                log_unit_debug_errno(u, r, "Failed to set invocation ID on control group %s, ignoring: %m", u->cgroup_path);
}
static bool unit_has_mask_realized(
                Unit *u,
                CGroupMask target_mask,
                CGroupMask enable_mask,
                bool needs_bpf) {

        assert(u);

        return u->cgroup_realized &&
                u->cgroup_realized_mask == target_mask &&
                u->cgroup_enabled_mask == enable_mask &&
                ((needs_bpf && u->cgroup_bpf_state == UNIT_CGROUP_BPF_ON) ||
                 (!needs_bpf && u->cgroup_bpf_state == UNIT_CGROUP_BPF_OFF));
}
static void unit_add_to_cgroup_realize_queue(Unit *u) {
        assert(u);

        if (u->in_cgroup_realize_queue)
                return;

        LIST_PREPEND(cgroup_realize_queue, u->manager->cgroup_realize_queue, u);
        u->in_cgroup_realize_queue = true;
}

static void unit_remove_from_cgroup_realize_queue(Unit *u) {
        assert(u);

        if (!u->in_cgroup_realize_queue)
                return;

        LIST_REMOVE(cgroup_realize_queue, u->manager->cgroup_realize_queue, u);
        u->in_cgroup_realize_queue = false;
}
/* Check if necessary controllers and attributes for a unit are in place.
 *
 * If so, do nothing.
 * If not, create paths, move processes over, and set attributes.
 *
 * Returns 0 on success and < 0 on failure. */
static int unit_realize_cgroup_now(Unit *u, ManagerState state) {
        CGroupMask target_mask, enable_mask;
        bool needs_bpf, apply_bpf;
        int r;

        assert(u);

        unit_remove_from_cgroup_realize_queue(u);

        target_mask = unit_get_target_mask(u);
        enable_mask = unit_get_enable_mask(u);
        needs_bpf = unit_get_needs_bpf(u);

        if (unit_has_mask_realized(u, target_mask, enable_mask, needs_bpf))
                return 0;

        /* Make sure we apply the BPF filters either when one is configured, or if none is configured but previously
         * the state was anything but off. This way, if a unit with a BPF filter applied is reconfigured to lose it
         * this will trickle down properly to cgroupfs. */
        apply_bpf = needs_bpf || u->cgroup_bpf_state != UNIT_CGROUP_BPF_OFF;

        /* First, realize parents */
        if (UNIT_ISSET(u->slice)) {
                r = unit_realize_cgroup_now(UNIT_DEREF(u->slice), state);
                if (r < 0)
                        return r;
        }

        /* And then do the real work */
        r = unit_create_cgroup(u, target_mask, enable_mask, needs_bpf);
        if (r < 0)
                return r;

        /* Finally, apply the necessary attributes. */
        cgroup_context_apply(u, target_mask, apply_bpf, state);
        cgroup_xattr_apply(u);

        return 0;
}
unsigned manager_dispatch_cgroup_realize_queue(Manager *m) {
        ManagerState state;
        unsigned n = 0;
        Unit *i;
        int r;

        assert(m);

        state = manager_state(m);

        while ((i = m->cgroup_realize_queue)) {
                assert(i->in_cgroup_realize_queue);

                if (UNIT_IS_INACTIVE_OR_FAILED(unit_active_state(i))) {
                        /* Maybe things changed, and the unit is not actually active anymore? */
                        unit_remove_from_cgroup_realize_queue(i);
                        continue;
                }

                r = unit_realize_cgroup_now(i, state);
                if (r < 0)
                        log_warning_errno(r, "Failed to realize cgroups for queued unit %s, ignoring: %m", i->id);

                n++;
        }

        return n;
}
static void unit_add_siblings_to_cgroup_realize_queue(Unit *u) {
        Unit *slice;

        /* This adds the siblings of the specified unit and the
         * siblings of all parent units to the cgroup queue. (But
         * neither the specified unit itself nor the parents.) */

        while ((slice = UNIT_DEREF(u->slice))) {
                Iterator i;
                Unit *m;
                void *v;

                HASHMAP_FOREACH_KEY(v, m, u->dependencies[UNIT_BEFORE], i) {
                        if (m == u)
                                continue;

                        /* Skip units that have a dependency on the slice
                         * but aren't actually in it. */
                        if (UNIT_DEREF(m->slice) != slice)
                                continue;

                        /* No point in doing cgroup application for units
                         * without active processes. */
                        if (UNIT_IS_INACTIVE_OR_FAILED(unit_active_state(m)))
                                continue;

                        /* If the unit doesn't need any new controllers
                         * and has current ones realized, it doesn't need
                         * any changes. */
                        if (unit_has_mask_realized(m,
                                                   unit_get_target_mask(m),
                                                   unit_get_enable_mask(m),
                                                   unit_get_needs_bpf(m)))
                                continue;

                        unit_add_to_cgroup_realize_queue(m);
                }

                u = slice;
        }
}
int unit_realize_cgroup(Unit *u) {
        assert(u);

        if (!UNIT_HAS_CGROUP_CONTEXT(u))
                return 0;

        /* So, here's the deal: when realizing the cgroups for this
         * unit, we need to first create all parents, but there's more
         * actually: for the weight-based controllers we also need to
         * make sure that all our siblings (i.e. units that are in the
         * same slice as we are) have cgroups, too. Otherwise, things
         * would become very uneven as each of their processes would
         * get as much resources as all our group together. This call
         * will synchronously create the parent cgroups, but will
         * defer work on the siblings to the next event loop
         * iteration. */

        /* Add all sibling slices to the cgroup queue. */
        unit_add_siblings_to_cgroup_realize_queue(u);

        /* And realize this one now (and apply the values) */
        return unit_realize_cgroup_now(u, manager_state(u->manager));
}
void unit_release_cgroup(Unit *u) {
        assert(u);

        /* Forgets all cgroup details for this cgroup */

        if (u->cgroup_path) {
                (void) hashmap_remove(u->manager->cgroup_unit, u->cgroup_path);
                u->cgroup_path = mfree(u->cgroup_path);
        }

        if (u->cgroup_inotify_wd >= 0) {
                if (inotify_rm_watch(u->manager->cgroup_inotify_fd, u->cgroup_inotify_wd) < 0)
                        log_unit_debug_errno(u, errno, "Failed to remove cgroup inotify watch %i for %s, ignoring: %m", u->cgroup_inotify_wd, u->id);

                (void) hashmap_remove(u->manager->cgroup_inotify_wd_unit, INT_TO_PTR(u->cgroup_inotify_wd));
                u->cgroup_inotify_wd = -1;
        }
}
void unit_prune_cgroup(Unit *u) {
        int r;
        bool is_root_slice;

        assert(u);

        /* Removes the cgroup, if empty and possible, and stops watching it. */

        if (!u->cgroup_path)
                return;

        (void) unit_get_cpu_usage(u, NULL); /* Cache the last CPU usage value before we destroy the cgroup */

        is_root_slice = unit_has_name(u, SPECIAL_ROOT_SLICE);

        r = cg_trim_everywhere(u->manager->cgroup_supported, u->cgroup_path, !is_root_slice);
        if (r < 0) {
                log_unit_debug_errno(u, r, "Failed to destroy cgroup %s, ignoring: %m", u->cgroup_path);
                return;
        }

        if (is_root_slice)
                return;

        unit_release_cgroup(u);

        u->cgroup_realized = false;
        u->cgroup_realized_mask = 0;
        u->cgroup_enabled_mask = 0;
}
int unit_search_main_pid(Unit *u, pid_t *ret) {
        _cleanup_fclose_ FILE *f = NULL;
        pid_t pid = 0, npid, mypid;
        int r;

        assert(u);
        assert(ret);

        if (!u->cgroup_path)
                return -ENXIO;

        r = cg_enumerate_processes(SYSTEMD_CGROUP_CONTROLLER, u->cgroup_path, &f);
        if (r < 0)
                return r;

        mypid = getpid_cached();
        while (cg_read_pid(f, &npid) > 0) {
                pid_t ppid;

                if (npid == pid)
                        continue;

                /* Ignore processes that aren't our kids */
                if (get_process_ppid(npid, &ppid) >= 0 && ppid != mypid)
                        continue;

                if (pid != 0)
                        /* Dang, there's more than one daemonized PID
                         * in this group, so we don't know what process
                         * is the main process. */
                        return -ENODATA;

                pid = npid;
        }

        *ret = pid;
        return 0;
}
static int unit_watch_pids_in_path(Unit *u, const char *path) {
        _cleanup_closedir_ DIR *d = NULL;
        _cleanup_fclose_ FILE *f = NULL;
        int ret = 0, r;

        assert(u);
        assert(path);

        r = cg_enumerate_processes(SYSTEMD_CGROUP_CONTROLLER, path, &f);
        if (r < 0)
                ret = r;
        else {
                pid_t pid;

                while ((r = cg_read_pid(f, &pid)) > 0) {
                        r = unit_watch_pid(u, pid);
                        if (r < 0 && ret >= 0)
                                ret = r;
                }

                if (r < 0 && ret >= 0)
                        ret = r;
        }

        r = cg_enumerate_subgroups(SYSTEMD_CGROUP_CONTROLLER, path, &d);
        if (r < 0) {
                if (ret >= 0)
                        ret = r;
        } else {
                char *fn;

                while ((r = cg_read_subgroup(d, &fn)) > 0) {
                        _cleanup_free_ char *p = NULL;

                        p = strjoin(path, "/", fn);
                        free(fn);

                        if (!p)
                                return -ENOMEM;

                        r = unit_watch_pids_in_path(u, p);
                        if (r < 0 && ret >= 0)
                                ret = r;
                }

                if (r < 0 && ret >= 0)
                        ret = r;
        }

        return ret;
}
int unit_synthesize_cgroup_empty_event(Unit *u) {
        int r;

        assert(u);

        /* Enqueue a synthetic cgroup empty event if this unit doesn't watch any PIDs anymore. This is compatibility
         * support for non-unified systems where notifications aren't reliable, and hence need to take whatever we can
         * get as notification source as soon as we stopped having any useful PIDs to watch for. */

        if (!u->cgroup_path)
                return -ENOENT;

        r = cg_unified_controller(SYSTEMD_CGROUP_CONTROLLER);
        if (r < 0)
                return r;
        if (r > 0) /* On unified we have reliable notifications, and don't need this */
                return 0;

        if (!set_isempty(u->pids))
                return 0;

        unit_add_to_cgroup_empty_queue(u);
        return 0;
}
int unit_watch_all_pids(Unit *u) {
        int r;

        assert(u);

        /* Adds all PIDs from our cgroup to the set of PIDs we
         * watch. This is a fallback logic for cases where we do not
         * get reliable cgroup empty notifications: we try to use
         * SIGCHLD as replacement. */

        if (!u->cgroup_path)
                return -ENOENT;

        r = cg_unified_controller(SYSTEMD_CGROUP_CONTROLLER);
        if (r < 0)
                return r;
        if (r > 0) /* On unified we can use proper notifications */
                return 0;

        return unit_watch_pids_in_path(u, u->cgroup_path);
}
static int on_cgroup_empty_event(sd_event_source *s, void *userdata) {
        Manager *m = userdata;
        Unit *u;
        int r;

        assert(s);
        assert(m);

        u = m->cgroup_empty_queue;
        if (!u)
                return 0;

        assert(u->in_cgroup_empty_queue);
        u->in_cgroup_empty_queue = false;
        LIST_REMOVE(cgroup_empty_queue, m->cgroup_empty_queue, u);

        if (m->cgroup_empty_queue) {
                /* More stuff queued, let's make sure we remain enabled */
                r = sd_event_source_set_enabled(s, SD_EVENT_ONESHOT);
                if (r < 0)
                        log_debug_errno(r, "Failed to reenable cgroup empty event source: %m");
        }

        unit_add_to_gc_queue(u);

        if (UNIT_VTABLE(u)->notify_cgroup_empty)
                UNIT_VTABLE(u)->notify_cgroup_empty(u);

        return 0;
}
void unit_add_to_cgroup_empty_queue(Unit *u) {
        int r;

        assert(u);

        /* Note that there are four different ways how cgroup empty events reach us:
         *
         * 1. On the unified hierarchy we get an inotify event on the cgroup
         *
         * 2. On the legacy hierarchy, when running in system mode, we get a datagram on the cgroup agent socket
         *
         * 3. On the legacy hierarchy, when running in user mode, we get a D-Bus signal on the system bus
         *
         * 4. On the legacy hierarchy, in service units we start watching all processes of the cgroup for SIGCHLD as
         *    soon as we get one SIGCHLD, to deal with unreliable cgroup notifications.
         *
         * Regardless which way we got the notification, we'll verify it here, and then add it to a separate
         * queue. This queue will be dispatched at a lower priority than the SIGCHLD handler, so that we always use
         * SIGCHLD if we can get it first, and only use the cgroup empty notifications if there's no SIGCHLD pending
         * (which might happen if the cgroup doesn't contain processes that are our own child, which is typically the
         * case for scope units). */

        if (u->in_cgroup_empty_queue)
                return;

        /* Let's verify that the cgroup is really empty */
        if (!u->cgroup_path)
                return;
        r = cg_is_empty_recursive(SYSTEMD_CGROUP_CONTROLLER, u->cgroup_path);
        if (r < 0) {
                log_unit_debug_errno(u, r, "Failed to determine whether cgroup %s is empty: %m", u->cgroup_path);
                return;
        }
        if (r == 0)
                return;

        LIST_PREPEND(cgroup_empty_queue, u->manager->cgroup_empty_queue, u);
        u->in_cgroup_empty_queue = true;

        /* Trigger the defer event */
        r = sd_event_source_set_enabled(u->manager->cgroup_empty_event_source, SD_EVENT_ONESHOT);
        if (r < 0)
                log_debug_errno(r, "Failed to enable cgroup empty event source: %m");
}
static int on_cgroup_inotify_event(sd_event_source *s, int fd, uint32_t revents, void *userdata) {
        Manager *m = userdata;

        assert(s);
        assert(fd >= 0);
        assert(m);

        for (;;) {
                union inotify_event_buffer buffer;
                struct inotify_event *e;
                ssize_t l;

                l = read(fd, &buffer, sizeof(buffer));
                if (l < 0) {
                        if (IN_SET(errno, EINTR, EAGAIN))
                                return 0;

                        return log_error_errno(errno, "Failed to read control group inotify events: %m");
                }

                FOREACH_INOTIFY_EVENT(e, buffer, l) {
                        Unit *u;

                        if (e->wd < 0)
                                /* Queue overflow has no watch descriptor */
                                continue;

                        if (e->mask & IN_IGNORED)
                                /* The watch was just removed */
                                continue;

                        u = hashmap_get(m->cgroup_inotify_wd_unit, INT_TO_PTR(e->wd));
                        if (!u) /* Note that inotify might deliver
                                 * events for a watch even after it
                                 * was removed, because it was queued
                                 * before the removal. Let's ignore
                                 * this here safely. */
                                continue;

                        unit_add_to_cgroup_empty_queue(u);
                }
        }
}
int manager_setup_cgroup(Manager *m) {
        _cleanup_free_ char *path = NULL;
        const char *scope_path;
        CGroupController c;
        int r, all_unified;
#if 0 /// UNNEEDED by elogind
        char *e;
#endif // 0

        assert(m);

        /* 1. Determine hierarchy */
        m->cgroup_root = mfree(m->cgroup_root);
#if 0 /// elogind is not init and must therefore search for PID 1 instead of self.
        r = cg_pid_get_path(SYSTEMD_CGROUP_CONTROLLER, 0, &m->cgroup_root);
#else
        r = cg_pid_get_path(SYSTEMD_CGROUP_CONTROLLER, 1, &m->cgroup_root);
#endif // 0
        if (r < 0)
                return log_error_errno(r, "Cannot determine cgroup we are running in: %m");

#if 0 /// elogind does not support systemd scopes and slices
        /* Chop off the init scope, if we are already located in it */
        e = endswith(m->cgroup_root, "/" SPECIAL_INIT_SCOPE);

        /* LEGACY: Also chop off the system slice if we are in
         * it. This is to support live upgrades from older systemd
         * versions where PID 1 was moved there. Also see
         * cg_get_root_path(). */
        if (!e && MANAGER_IS_SYSTEM(m)) {
                e = endswith(m->cgroup_root, "/" SPECIAL_SYSTEM_SLICE);
                if (!e)
                        e = endswith(m->cgroup_root, "/system"); /* even more legacy */
        }
        if (e)
                *e = 0;
#endif // 0

        log_debug_elogind("Cgroup Controller \"%s\" -> root \"%s\"",
                          SYSTEMD_CGROUP_CONTROLLER, m->cgroup_root);
        /* And make sure to store away the root value without trailing slash, even for the root dir, so that we can
         * easily prepend it everywhere. */
        delete_trailing_chars(m->cgroup_root, "/");

        /* 2. Show data */
        r = cg_get_path(SYSTEMD_CGROUP_CONTROLLER, m->cgroup_root, NULL, &path);
        if (r < 0)
                return log_error_errno(r, "Cannot find cgroup mount point: %m");

        r = cg_unified_flush();
        if (r < 0)
                return log_error_errno(r, "Couldn't determine if we are running in the unified hierarchy: %m");

        all_unified = cg_all_unified();
        if (all_unified < 0)
                return log_error_errno(all_unified, "Couldn't determine whether we are in all unified mode: %m");
        if (all_unified > 0)
                log_debug("Unified cgroup hierarchy is located at %s.", path);
        else {
                r = cg_unified_controller(SYSTEMD_CGROUP_CONTROLLER);
                if (r < 0)
                        return log_error_errno(r, "Failed to determine whether systemd's own controller is in unified mode: %m");
                if (r > 0)
                        log_debug("Unified cgroup hierarchy is located at %s. Controllers are on legacy hierarchies.", path);
                else
                        log_debug("Using cgroup controller " SYSTEMD_CGROUP_CONTROLLER_LEGACY ". File system hierarchy is at %s.", path);
        }

#if 0 /// elogind is not init, and does not install the agent here.
        /* 3. Allocate cgroup empty defer event source */
        m->cgroup_empty_event_source = sd_event_source_unref(m->cgroup_empty_event_source);
        r = sd_event_add_defer(m->event, &m->cgroup_empty_event_source, on_cgroup_empty_event, m);
        if (r < 0)
                return log_error_errno(r, "Failed to create cgroup empty event source: %m");

        r = sd_event_source_set_priority(m->cgroup_empty_event_source, SD_EVENT_PRIORITY_NORMAL-5);
        if (r < 0)
                return log_error_errno(r, "Failed to set priority of cgroup empty event source: %m");

        r = sd_event_source_set_enabled(m->cgroup_empty_event_source, SD_EVENT_OFF);
        if (r < 0)
                return log_error_errno(r, "Failed to disable cgroup empty event source: %m");

        (void) sd_event_source_set_description(m->cgroup_empty_event_source, "cgroup-empty");

        /* 4. Install notifier inotify object, or agent */
        if (cg_unified_controller(SYSTEMD_CGROUP_CONTROLLER) > 0) {

                /* In the unified hierarchy we can get cgroup empty notifications via inotify. */

                m->cgroup_inotify_event_source = sd_event_source_unref(m->cgroup_inotify_event_source);
                safe_close(m->cgroup_inotify_fd);

                m->cgroup_inotify_fd = inotify_init1(IN_NONBLOCK|IN_CLOEXEC);
                if (m->cgroup_inotify_fd < 0)
                        return log_error_errno(errno, "Failed to create control group inotify object: %m");

                r = sd_event_add_io(m->event, &m->cgroup_inotify_event_source, m->cgroup_inotify_fd, EPOLLIN, on_cgroup_inotify_event, m);
                if (r < 0)
                        return log_error_errno(r, "Failed to watch control group inotify object: %m");

                /* Process cgroup empty notifications early, but after service notifications and SIGCHLD. Also
                 * see handling of cgroup agent notifications, for the classic cgroup hierarchy support. */
                r = sd_event_source_set_priority(m->cgroup_inotify_event_source, SD_EVENT_PRIORITY_NORMAL-4);
                if (r < 0)
                        return log_error_errno(r, "Failed to set priority of inotify event source: %m");

                (void) sd_event_source_set_description(m->cgroup_inotify_event_source, "cgroup-inotify");

        } else if (MANAGER_IS_SYSTEM(m) && m->test_run_flags == 0) {

                /* On the legacy hierarchy we only get notifications via cgroup agents. (Which isn't really reliable,
                 * since it does not generate events when control groups with children run empty.) */

                r = cg_install_release_agent(SYSTEMD_CGROUP_CONTROLLER, SYSTEMD_CGROUP_AGENT_PATH);
                if (r < 0)
                        log_warning_errno(r, "Failed to install release agent, ignoring: %m");
                else if (r > 0)
                        log_debug("Installed release agent.");
                else
                        log_debug("Release agent already installed.");
        }

        /* 5. Make sure we are in the special "init.scope" unit in the root slice. */
        scope_path = strjoina(m->cgroup_root, "/" SPECIAL_INIT_SCOPE);
        r = cg_create_and_attach(SYSTEMD_CGROUP_CONTROLLER, scope_path, 0);
#else
        /* Note:
         * This method is in core, and normally called by systemd
         * being init. As elogind is never init, we can not install
         * our agent here. We do so when mounting our cgroup file
         * system, so only if elogind is its own tiny controller.
         * Further, elogind is not meant to run in systemd init scope. */
        if (MANAGER_IS_SYSTEM(m))
                // we are our own cgroup controller
                scope_path = strjoina("");
        else if (streq(m->cgroup_root, "/elogind"))
                // root already is our cgroup
                scope_path = strjoina(m->cgroup_root);
        else
                // we have to create our own group
                scope_path = strjoina(m->cgroup_root, "/elogind");
        r = cg_create_and_attach(SYSTEMD_CGROUP_CONTROLLER, scope_path, 0);
#endif // 0
        if (r < 0)
                return log_error_errno(r, "Failed to create %s control group: %m", scope_path);
        log_debug_elogind("Created control group \"%s\"", scope_path);

#if 0 /// elogind is not a "sub-controller" like systemd, so migration is not needed.
        /* Also, move all other userspace processes remaining in the root cgroup into that scope. */
        r = cg_migrate(SYSTEMD_CGROUP_CONTROLLER, m->cgroup_root, SYSTEMD_CGROUP_CONTROLLER, scope_path, 0);
        if (r < 0)
                log_warning_errno(r, "Couldn't move remaining userspace processes, ignoring: %m");
#endif // 0

        /* 6. And pin it, so that it cannot be unmounted */
        safe_close(m->pin_cgroupfs_fd);
        m->pin_cgroupfs_fd = open(path, O_RDONLY|O_CLOEXEC|O_DIRECTORY|O_NOCTTY|O_NONBLOCK);
        if (m->pin_cgroupfs_fd < 0)
                return log_error_errno(errno, "Failed to open pin file: %m");

        /* 7. Always enable hierarchical support if it exists... */
        if (!all_unified && m->test_run_flags == 0)
                (void) cg_set_attribute("memory", "/", "memory.use_hierarchy", "1");

        /* 8. Figure out which controllers are supported, and log about it */
        r = cg_mask_supported(&m->cgroup_supported);
        if (r < 0)
                return log_error_errno(r, "Failed to determine supported controllers: %m");
        for (c = 0; c < _CGROUP_CONTROLLER_MAX; c++)
                log_debug("Controller '%s' supported: %s", cgroup_controller_to_string(c), yes_no(m->cgroup_supported & CGROUP_CONTROLLER_TO_MASK(c)));

        return 0;
}
void manager_shutdown_cgroup(Manager *m, bool delete) {
        assert(m);

        /* We can't really delete the group, since we are in it. But
         * let's trim it. */
        if (delete && m->cgroup_root)
                (void) cg_trim(SYSTEMD_CGROUP_CONTROLLER, m->cgroup_root, false);

#if 0 /// elogind is not init
        m->cgroup_empty_event_source = sd_event_source_unref(m->cgroup_empty_event_source);

        m->cgroup_inotify_wd_unit = hashmap_free(m->cgroup_inotify_wd_unit);

        m->cgroup_inotify_event_source = sd_event_source_unref(m->cgroup_inotify_event_source);
        m->cgroup_inotify_fd = safe_close(m->cgroup_inotify_fd);
#endif // 0

        m->pin_cgroupfs_fd = safe_close(m->pin_cgroupfs_fd);

        m->cgroup_root = mfree(m->cgroup_root);
}
#if 0 /// UNNEEDED by elogind
Unit* manager_get_unit_by_cgroup(Manager *m, const char *cgroup) {
        char *p;
        Unit *u;

        assert(m);
        assert(cgroup);

        u = hashmap_get(m->cgroup_unit, cgroup);
        if (u)
                return u;

        /* Walk the path upwards, dropping one component at a time, until a unit owns the prefix. */
        p = strdupa(cgroup);
        for (;;) {
                char *e;

                e = strrchr(p, '/');
                if (!e || e == p)
                        return hashmap_get(m->cgroup_unit, SPECIAL_ROOT_SLICE);

                *e = 0;

                u = hashmap_get(m->cgroup_unit, p);
                if (u)
                        return u;
        }
}
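
/* Worked example for the walk above (hypothetical paths): a lookup for "/system.slice/foo.service/sub"
 * consults m->cgroup_unit with ever shorter prefixes until one matches:
 *
 *     "/system.slice/foo.service/sub"   -> miss
 *     "/system.slice/foo.service"       -> hit, the unit for foo.service is returned
 *
 * and if no prefix matches at all, the SPECIAL_ROOT_SLICE entry serves as the fallback. A minimal usage
 * sketch, assuming "m" is a valid Manager: */
#if 0 /// Example sketch only, never compiled
        Unit *owner = manager_get_unit_by_cgroup(m, "/system.slice/foo.service/sub");
#endif // 0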
Unit *manager_get_unit_by_pid_cgroup(Manager *m, pid_t pid) {
        _cleanup_free_ char *cgroup = NULL;
        int r;

        assert(m);

        if (pid <= 0)
                return NULL;

        r = cg_pid_get_path(SYSTEMD_CGROUP_CONTROLLER, pid, &cgroup);
        if (r < 0)
                return NULL;

        return manager_get_unit_by_cgroup(m, cgroup);
}
Unit *manager_get_unit_by_pid(Manager *m, pid_t pid) {
        Unit *u;

        assert(m);

        if (pid <= 0)
                return NULL;

        if (pid == getpid_cached())
                return hashmap_get(m->units, SPECIAL_INIT_SCOPE);

        u = hashmap_get(m->watch_pids1, PID_TO_PTR(pid));
        if (u)
                return u;

        u = hashmap_get(m->watch_pids2, PID_TO_PTR(pid));
        if (u)
                return u;

        return manager_get_unit_by_pid_cgroup(m, pid);
}
#endif // 0
#if 0 /// elogind must substitute this with its own variant
int manager_notify_cgroup_empty(Manager *m, const char *cgroup) {
        Unit *u;

        assert(m);
        assert(cgroup);

        /* Called on the legacy hierarchy whenever we get an explicit cgroup notification from the cgroup agent process
         * or from the --system instance */

        log_debug("Got cgroup empty notification for: %s", cgroup);

        u = manager_get_unit_by_cgroup(m, cgroup);
        if (!u)
                return 0;

        unit_add_to_cgroup_empty_queue(u);
        return 1;
}
#else
int manager_notify_cgroup_empty(Manager *m, const char *cgroup) {
        Session *s;

        assert(m);
        assert(cgroup);

        log_debug("Got cgroup empty notification for: %s", cgroup);

        s = hashmap_get(m->sessions, cgroup);

        if (s)
                session_finalize(s);
        else
                log_warning("Session not found: %s", cgroup);

        return 0;
}
#endif // 0
#if 0 /// UNNEEDED by elogind
int unit_get_memory_current(Unit *u, uint64_t *ret) {
        _cleanup_free_ char *v = NULL;
        int r;

        assert(u);
        assert(ret);

        if (!UNIT_CGROUP_BOOL(u, memory_accounting))
                return -ENODATA;

        if (!u->cgroup_path)
                return -ENODATA;

        if ((u->cgroup_realized_mask & CGROUP_MASK_MEMORY) == 0)
                return -ENODATA;

        r = cg_all_unified();
        if (r < 0)
                return r;
        if (r > 0)
                r = cg_get_attribute("memory", u->cgroup_path, "memory.current", &v);
        else
                r = cg_get_attribute("memory", u->cgroup_path, "memory.usage_in_bytes", &v);
        if (r == -ENOENT)
                return -ENODATA;
        if (r < 0)
                return r;

        return safe_atou64(v, ret);
}
int unit_get_tasks_current(Unit *u, uint64_t *ret) {
        _cleanup_free_ char *v = NULL;
        int r;

        assert(u);
        assert(ret);

        if (!UNIT_CGROUP_BOOL(u, tasks_accounting))
                return -ENODATA;

        if (!u->cgroup_path)
                return -ENODATA;

        if ((u->cgroup_realized_mask & CGROUP_MASK_PIDS) == 0)
                return -ENODATA;

        r = cg_get_attribute("pids", u->cgroup_path, "pids.current", &v);
        if (r == -ENOENT)
                return -ENODATA;
        if (r < 0)
                return r;

        return safe_atou64(v, ret);
}
static int unit_get_cpu_usage_raw(Unit *u, nsec_t *ret) {
        _cleanup_free_ char *v = NULL;
        uint64_t ns;
        int r;

        assert(u);
        assert(ret);

        if (!u->cgroup_path)
                return -ENODATA;

        r = cg_all_unified();
        if (r < 0)
                return r;
        if (r > 0) {
                const char *keys[] = { "usage_usec", NULL };
                _cleanup_free_ char *val = NULL;
                uint64_t us;

                if ((u->cgroup_realized_mask & CGROUP_MASK_CPU) == 0)
                        return -ENODATA;

                r = cg_get_keyed_attribute("cpu", u->cgroup_path, "cpu.stat", keys, &val);
                if (r < 0)
                        return r;

                r = safe_atou64(val, &us);
                if (r < 0)
                        return r;

                ns = us * NSEC_PER_USEC;
        } else {
                if ((u->cgroup_realized_mask & CGROUP_MASK_CPUACCT) == 0)
                        return -ENODATA;

                r = cg_get_attribute("cpuacct", u->cgroup_path, "cpuacct.usage", &v);
                if (r == -ENOENT)
                        return -ENODATA;
                if (r < 0)
                        return r;

                r = safe_atou64(v, &ns);
                if (r < 0)
                        return r;
        }

        *ret = ns;
        return 0;
}
int unit_get_cpu_usage(Unit *u, nsec_t *ret) {
        nsec_t ns;
        int r;

        assert(u);

        /* Retrieve the current CPU usage counter. This will subtract the CPU counter taken when the unit was
         * started. If the cgroup has been removed already, returns the last cached value. To cache the value, simply
         * call this function with a NULL return value. */

        if (!UNIT_CGROUP_BOOL(u, cpu_accounting))
                return -ENODATA;

        r = unit_get_cpu_usage_raw(u, &ns);
        if (r == -ENODATA && u->cpu_usage_last != NSEC_INFINITY) {
                /* If we can't get the CPU usage anymore (because the cgroup was already removed, for example), use our
                 * cached value. */

                if (ret)
                        *ret = u->cpu_usage_last;
                return 0;
        }
        if (r < 0)
                return r;

        if (ns > u->cpu_usage_base)
                ns -= u->cpu_usage_base;
        else
                ns = 0;

        u->cpu_usage_last = ns;
        if (ret)
                *ret = ns;

        return 0;
}
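
/* Usage sketch for the caching behaviour documented above (illustrative only, "u" stands for some unit with
 * CPU accounting enabled): passing NULL refreshes u->cpu_usage_last without returning anything, so the final
 * counter survives the removal of the cgroup: */
#if 0 /// Example sketch only, never compiled
        nsec_t ns;

        (void) unit_get_cpu_usage(u, NULL);     /* caches the current counter in u->cpu_usage_last */
        /* ... the cgroup is removed, e.g. because the unit stopped ... */
        if (unit_get_cpu_usage(u, &ns) >= 0)    /* now served from the cached value */
                log_unit_debug(u, "Unit consumed %" PRIu64 " ns of CPU time.", (uint64_t) ns);
#endif // 0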
int unit_get_ip_accounting(
                Unit *u,
                CGroupIPAccountingMetric metric,
                uint64_t *ret) {

        uint64_t value;
        int fd, r;

        assert(u);
        assert(metric >= 0);
        assert(metric < _CGROUP_IP_ACCOUNTING_METRIC_MAX);
        assert(ret);

        /* IP accounting is currently not recursive, and hence we refuse to return any data for slice nodes. Slices are
         * inner cgroup nodes and hence have no processes directly attached, hence their counters would be zero
         * anyway. And if we block this now we can later open this up, if the kernel learns recursive BPF cgroup
         * filters. */
        if (u->type == UNIT_SLICE)
                return -ENODATA;

        if (!UNIT_CGROUP_BOOL(u, ip_accounting))
                return -ENODATA;

        fd = IN_SET(metric, CGROUP_IP_INGRESS_BYTES, CGROUP_IP_INGRESS_PACKETS) ?
                u->ip_accounting_ingress_map_fd :
                u->ip_accounting_egress_map_fd;
        if (fd < 0)
                return -ENODATA;

        if (IN_SET(metric, CGROUP_IP_INGRESS_BYTES, CGROUP_IP_EGRESS_BYTES))
                r = bpf_firewall_read_accounting(fd, &value, NULL);
        else
                r = bpf_firewall_read_accounting(fd, NULL, &value);
        if (r < 0)
                return r;

        /* Add in additional metrics from a previous runtime. Note that when reexecing/reloading the daemon we compile
         * all BPF programs and maps anew, but serialize the old counters. When deserializing we store them in the
         * ip_accounting_extra[] field, and add them in here transparently. */

        *ret = value + u->ip_accounting_extra[metric];

        return 0;
}
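
/* Sketch of the hand-over described in the comment above (hypothetical, the real code lives in the unit
 * (de)serialization paths): before reexec the accumulated total is captured, after reexec it is restored
 * into ip_accounting_extra[] while the freshly compiled BPF maps start counting from zero again, so queries
 * transparently report the carried-over total plus new traffic: */
#if 0 /// Example sketch only, never compiled
        uint64_t v;

        if (unit_get_ip_accounting(u, CGROUP_IP_INGRESS_BYTES, &v) >= 0)
                u->ip_accounting_extra[CGROUP_IP_INGRESS_BYTES] = v;    /* value to serialize */
        /* ... reexec: new BPF maps at 0, deserialized total stored in ip_accounting_extra[] ... */
        /* unit_get_ip_accounting() now returns new_map_counter + ip_accounting_extra[metric]. */
#endif // 0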
int unit_reset_cpu_accounting(Unit *u) {
        nsec_t ns;
        int r;

        assert(u);

        u->cpu_usage_last = NSEC_INFINITY;

        r = unit_get_cpu_usage_raw(u, &ns);
        if (r < 0) {
                u->cpu_usage_base = 0;
                return r;
        }

        u->cpu_usage_base = ns;
        return 0;
}
int unit_reset_ip_accounting(Unit *u) {
        int r = 0, q = 0;

        assert(u);

        if (u->ip_accounting_ingress_map_fd >= 0)
                r = bpf_firewall_reset_accounting(u->ip_accounting_ingress_map_fd);

        if (u->ip_accounting_egress_map_fd >= 0)
                q = bpf_firewall_reset_accounting(u->ip_accounting_egress_map_fd);

        zero(u->ip_accounting_extra);

        return r < 0 ? r : q;
}
void unit_invalidate_cgroup(Unit *u, CGroupMask m) {
        assert(u);

        if (!UNIT_HAS_CGROUP_CONTEXT(u))
                return;

        if (m == 0)
                return;

        /* always invalidate compat pairs together */
        if (m & (CGROUP_MASK_IO | CGROUP_MASK_BLKIO))
                m |= CGROUP_MASK_IO | CGROUP_MASK_BLKIO;

        if (m & (CGROUP_MASK_CPU | CGROUP_MASK_CPUACCT))
                m |= CGROUP_MASK_CPU | CGROUP_MASK_CPUACCT;

        if ((u->cgroup_realized_mask & m) == 0) /* NOP? */
                return;

        u->cgroup_realized_mask &= ~m;
        unit_add_to_cgroup_realize_queue(u);
}
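
/* Example of the pairing above (illustrative, "u" stands for some unit with a cgroup context): asking to
 * invalidate only the unified-hierarchy IO settings also queues the legacy blkio settings for rewriting,
 * since both are generated from the same configuration: */
#if 0 /// Example sketch only, never compiled
        unit_invalidate_cgroup(u, CGROUP_MASK_IO);
        /* u->cgroup_realized_mask now lacks both CGROUP_MASK_IO and CGROUP_MASK_BLKIO,
         * and u sits on the cgroup realize queue. */
#endif // 0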
void unit_invalidate_cgroup_bpf(Unit *u) {
        assert(u);

        if (!UNIT_HAS_CGROUP_CONTEXT(u))
                return;

        if (u->cgroup_bpf_state == UNIT_CGROUP_BPF_INVALIDATED) /* NOP? */
                return;

        u->cgroup_bpf_state = UNIT_CGROUP_BPF_INVALIDATED;
        unit_add_to_cgroup_realize_queue(u);

        /* If we are a slice unit, we also need to compile a new BPF program for all our children, as the IP access
         * list of our children includes our own. */
        if (u->type == UNIT_SLICE) {
                Unit *member;
                Iterator i;
                void *v;

                HASHMAP_FOREACH_KEY(v, member, u->dependencies[UNIT_BEFORE], i) {
                        if (member == u)
                                continue;

                        if (UNIT_DEREF(member->slice) != u)
                                continue;

                        unit_invalidate_cgroup_bpf(member);
                }
        }
}
void manager_invalidate_startup_units(Manager *m) {
        Iterator i;
        Unit *u;

        assert(m);

        SET_FOREACH(u, m->startup_units, i)
                unit_invalidate_cgroup(u, CGROUP_MASK_CPU|CGROUP_MASK_IO|CGROUP_MASK_BLKIO);
}
#endif // 0
static const char* const cgroup_device_policy_table[_CGROUP_DEVICE_POLICY_MAX] = {
        [CGROUP_AUTO] = "auto",
        [CGROUP_CLOSED] = "closed",
        [CGROUP_STRICT] = "strict",
};

DEFINE_STRING_TABLE_LOOKUP(cgroup_device_policy, CGroupDevicePolicy);
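
/* For reference, DEFINE_STRING_TABLE_LOOKUP() from string-table.h generates the usual converter pair over
 * the table above, so callers get (sketch of the generated API): */
#if 0 /// Example sketch only, never compiled
        const char *s = cgroup_device_policy_to_string(CGROUP_CLOSED);     /* -> "closed" */
        CGroupDevicePolicy p = cgroup_device_policy_from_string("strict"); /* -> CGROUP_STRICT */
#endif // 0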