/* SPDX-License-Identifier: LGPL-2.1+ */
/***
  This file is part of systemd.

  Copyright 2013 Lennart Poettering

  systemd is free software; you can redistribute it and/or modify it
  under the terms of the GNU Lesser General Public License as published by
  the Free Software Foundation; either version 2.1 of the License, or
  (at your option) any later version.

  systemd is distributed in the hope that it will be useful, but
  WITHOUT ANY WARRANTY; without even the implied warranty of
  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
  Lesser General Public License for more details.

  You should have received a copy of the GNU Lesser General Public License
  along with systemd; If not, see <http://www.gnu.org/licenses/>.
***/
24 #include "alloc-util.h"
25 //#include "blockdev-util.h"
26 //#include "bpf-firewall.h"
27 #include "cgroup-util.h"
32 #include "parse-util.h"
33 #include "path-util.h"
34 #include "process-util.h"
35 //#include "special.h"
36 #include "stdio-util.h"
37 #include "string-table.h"
38 #include "string-util.h"
40 #define CGROUP_CPU_QUOTA_PERIOD_USEC ((usec_t) 100 * USEC_PER_MSEC)
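/*
 * Illustrative arithmetic, not part of the original source: with this 100ms
 * period, a unit configured with CPUQuota=50% (i.e. cpu_quota_per_sec_usec
 * of 500ms) is written out below as
 *
 *     quota * CGROUP_CPU_QUOTA_PERIOD_USEC / USEC_PER_SEC
 *         = 500000 * 100000 / 1000000 = 50000,
 *
 * i.e. "50000 100000" in "cpu.max" (unified) and "50000" in
 * "cpu.cfs_quota_us" (legacy).
 */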
42 bool unit_has_root_cgroup(Unit *u) {
45 /* Returns whether this unit manages the root cgroup. Note that this is different from being named "-.slice",
46 * as inside of containers the root slice won't be identical to the root cgroup. */
51 return isempty(u->cgroup_path) || path_equal(u->cgroup_path, "/");
54 #if 0 /// UNNEEDED by elogind
55 static void cgroup_compat_warn(void) {
56 static bool cgroup_compat_warned = false;
if (cgroup_compat_warned)
return;
61 log_warning("cgroup compatibility translation between legacy and unified hierarchy settings activated. See cgroup-compat debug messages for details.");
62 cgroup_compat_warned = true;
65 #define log_cgroup_compat(unit, fmt, ...) do { \
66 cgroup_compat_warn(); \
log_unit_debug(unit, "cgroup-compat: " fmt, ##__VA_ARGS__); \
} while (false)
70 void cgroup_context_init(CGroupContext *c) {
73 /* Initialize everything to the kernel defaults, assuming the
74 * structure is preinitialized to 0 */
76 c->cpu_weight = CGROUP_WEIGHT_INVALID;
77 c->startup_cpu_weight = CGROUP_WEIGHT_INVALID;
78 c->cpu_quota_per_sec_usec = USEC_INFINITY;
80 c->cpu_shares = CGROUP_CPU_SHARES_INVALID;
81 c->startup_cpu_shares = CGROUP_CPU_SHARES_INVALID;
83 c->memory_high = CGROUP_LIMIT_MAX;
84 c->memory_max = CGROUP_LIMIT_MAX;
85 c->memory_swap_max = CGROUP_LIMIT_MAX;
87 c->memory_limit = CGROUP_LIMIT_MAX;
89 c->io_weight = CGROUP_WEIGHT_INVALID;
90 c->startup_io_weight = CGROUP_WEIGHT_INVALID;
92 c->blockio_weight = CGROUP_BLKIO_WEIGHT_INVALID;
93 c->startup_blockio_weight = CGROUP_BLKIO_WEIGHT_INVALID;
95 c->tasks_max = (uint64_t) -1;
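/*
 * A minimal usage sketch (assumed, not from the original source): callers
 * zero the structure and then let this function install the "unset"
 * markers, so 0 never masquerades as a configured limit:
 *
 *     CGroupContext c = {};
 *     cgroup_context_init(&c);
 *     assert(c.memory_max == CGROUP_LIMIT_MAX);
 *     assert(c.cpu_quota_per_sec_usec == USEC_INFINITY);
 */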
98 void cgroup_context_free_device_allow(CGroupContext *c, CGroupDeviceAllow *a) {
102 LIST_REMOVE(device_allow, c->device_allow, a);
107 void cgroup_context_free_io_device_weight(CGroupContext *c, CGroupIODeviceWeight *w) {
111 LIST_REMOVE(device_weights, c->io_device_weights, w);
116 void cgroup_context_free_io_device_limit(CGroupContext *c, CGroupIODeviceLimit *l) {
120 LIST_REMOVE(device_limits, c->io_device_limits, l);
125 void cgroup_context_free_blockio_device_weight(CGroupContext *c, CGroupBlockIODeviceWeight *w) {
129 LIST_REMOVE(device_weights, c->blockio_device_weights, w);
134 void cgroup_context_free_blockio_device_bandwidth(CGroupContext *c, CGroupBlockIODeviceBandwidth *b) {
138 LIST_REMOVE(device_bandwidths, c->blockio_device_bandwidths, b);
143 void cgroup_context_done(CGroupContext *c) {
146 while (c->io_device_weights)
147 cgroup_context_free_io_device_weight(c, c->io_device_weights);
149 while (c->io_device_limits)
150 cgroup_context_free_io_device_limit(c, c->io_device_limits);
152 while (c->blockio_device_weights)
153 cgroup_context_free_blockio_device_weight(c, c->blockio_device_weights);
155 while (c->blockio_device_bandwidths)
156 cgroup_context_free_blockio_device_bandwidth(c, c->blockio_device_bandwidths);
158 while (c->device_allow)
159 cgroup_context_free_device_allow(c, c->device_allow);
161 c->ip_address_allow = ip_address_access_free_all(c->ip_address_allow);
162 c->ip_address_deny = ip_address_access_free_all(c->ip_address_deny);
165 void cgroup_context_dump(CGroupContext *c, FILE* f, const char *prefix) {
166 CGroupIODeviceLimit *il;
167 CGroupIODeviceWeight *iw;
168 CGroupBlockIODeviceBandwidth *b;
169 CGroupBlockIODeviceWeight *w;
170 CGroupDeviceAllow *a;
171 IPAddressAccessItem *iaai;
172 char u[FORMAT_TIMESPAN_MAX];
177 prefix = strempty(prefix);
180 "%sCPUAccounting=%s\n"
181 "%sIOAccounting=%s\n"
182 "%sBlockIOAccounting=%s\n"
183 "%sMemoryAccounting=%s\n"
184 "%sTasksAccounting=%s\n"
185 "%sIPAccounting=%s\n"
186 "%sCPUWeight=%" PRIu64 "\n"
187 "%sStartupCPUWeight=%" PRIu64 "\n"
188 "%sCPUShares=%" PRIu64 "\n"
189 "%sStartupCPUShares=%" PRIu64 "\n"
190 "%sCPUQuotaPerSecSec=%s\n"
191 "%sIOWeight=%" PRIu64 "\n"
192 "%sStartupIOWeight=%" PRIu64 "\n"
193 "%sBlockIOWeight=%" PRIu64 "\n"
194 "%sStartupBlockIOWeight=%" PRIu64 "\n"
195 "%sMemoryLow=%" PRIu64 "\n"
196 "%sMemoryHigh=%" PRIu64 "\n"
197 "%sMemoryMax=%" PRIu64 "\n"
198 "%sMemorySwapMax=%" PRIu64 "\n"
199 "%sMemoryLimit=%" PRIu64 "\n"
200 "%sTasksMax=%" PRIu64 "\n"
201 "%sDevicePolicy=%s\n"
203 prefix, yes_no(c->cpu_accounting),
204 prefix, yes_no(c->io_accounting),
205 prefix, yes_no(c->blockio_accounting),
206 prefix, yes_no(c->memory_accounting),
207 prefix, yes_no(c->tasks_accounting),
208 prefix, yes_no(c->ip_accounting),
209 prefix, c->cpu_weight,
210 prefix, c->startup_cpu_weight,
211 prefix, c->cpu_shares,
212 prefix, c->startup_cpu_shares,
213 prefix, format_timespan(u, sizeof(u), c->cpu_quota_per_sec_usec, 1),
214 prefix, c->io_weight,
215 prefix, c->startup_io_weight,
216 prefix, c->blockio_weight,
217 prefix, c->startup_blockio_weight,
218 prefix, c->memory_low,
219 prefix, c->memory_high,
220 prefix, c->memory_max,
221 prefix, c->memory_swap_max,
222 prefix, c->memory_limit,
223 prefix, c->tasks_max,
224 prefix, cgroup_device_policy_to_string(c->device_policy),
225 prefix, yes_no(c->delegate));
if (c->delegate) {
_cleanup_free_ char *t = NULL;
230 (void) cg_mask_to_string(c->delegate_controllers, &t);
fprintf(f, "%sDelegateControllers=%s\n", prefix, strempty(t));
237 LIST_FOREACH(device_allow, a, c->device_allow)
239 "%sDeviceAllow=%s %s%s%s\n",
242 a->r ? "r" : "", a->w ? "w" : "", a->m ? "m" : "");
244 LIST_FOREACH(device_weights, iw, c->io_device_weights)
246 "%sIODeviceWeight=%s %" PRIu64,
251 LIST_FOREACH(device_limits, il, c->io_device_limits) {
252 char buf[FORMAT_BYTES_MAX];
253 CGroupIOLimitType type;
255 for (type = 0; type < _CGROUP_IO_LIMIT_TYPE_MAX; type++)
256 if (il->limits[type] != cgroup_io_limit_defaults[type])
260 cgroup_io_limit_type_to_string(type),
262 format_bytes(buf, sizeof(buf), il->limits[type]));
265 LIST_FOREACH(device_weights, w, c->blockio_device_weights)
267 "%sBlockIODeviceWeight=%s %" PRIu64,
272 LIST_FOREACH(device_bandwidths, b, c->blockio_device_bandwidths) {
273 char buf[FORMAT_BYTES_MAX];
275 if (b->rbps != CGROUP_LIMIT_MAX)
277 "%sBlockIOReadBandwidth=%s %s\n",
280 format_bytes(buf, sizeof(buf), b->rbps));
281 if (b->wbps != CGROUP_LIMIT_MAX)
283 "%sBlockIOWriteBandwidth=%s %s\n",
286 format_bytes(buf, sizeof(buf), b->wbps));
289 LIST_FOREACH(items, iaai, c->ip_address_allow) {
290 _cleanup_free_ char *k = NULL;
292 (void) in_addr_to_string(iaai->family, &iaai->address, &k);
293 fprintf(f, "%sIPAddressAllow=%s/%u\n", prefix, strnull(k), iaai->prefixlen);
296 LIST_FOREACH(items, iaai, c->ip_address_deny) {
297 _cleanup_free_ char *k = NULL;
299 (void) in_addr_to_string(iaai->family, &iaai->address, &k);
300 fprintf(f, "%sIPAddressDeny=%s/%u\n", prefix, strnull(k), iaai->prefixlen);
304 static int lookup_block_device(const char *p, dev_t *dev) {
313 return log_warning_errno(errno, "Couldn't stat device %s: %m", p);
if (S_ISBLK(st.st_mode))
*dev = st.st_rdev;
317 else if (major(st.st_dev) != 0) {
/* If this is not a device node then find the block
* device this file is stored on */
*dev = st.st_dev;

/* If this is a partition, try to get the originating block device */
324 (void) block_get_whole_disk(*dev, dev);
} else {
log_warning("%s is not a block device and file system block device cannot be determined or is not local.", p);
333 static int whitelist_device(const char *path, const char *node, const char *acc) {
334 char buf[2+DECIMAL_STR_MAX(dev_t)*2+2+4];
336 bool ignore_notfound;
if (node[0] == '-') {
/* Non-existent paths starting with "-" must be silently ignored */
node++;
ignore_notfound = true;
} else
ignore_notfound = false;
349 if (stat(node, &st) < 0) {
if (errno == ENOENT && ignore_notfound)
return 0;
353 return log_warning_errno(errno, "Couldn't stat device %s: %m", node);
356 if (!S_ISCHR(st.st_mode) && !S_ISBLK(st.st_mode)) {
log_warning("%s is not a device.", node);
return -ENODEV;
363 S_ISCHR(st.st_mode) ? 'c' : 'b',
364 major(st.st_rdev), minor(st.st_rdev),
367 r = cg_set_attribute("devices", path, "devices.allow", buf);
369 log_full_errno(IN_SET(r, -ENOENT, -EROFS, -EINVAL, -EACCES) ? LOG_DEBUG : LOG_WARNING, r,
370 "Failed to set devices.allow on %s: %m", path);
375 static int whitelist_major(const char *path, const char *name, char type, const char *acc) {
376 _cleanup_fclose_ FILE *f = NULL;
383 assert(IN_SET(type, 'b', 'c'));
385 f = fopen("/proc/devices", "re");
387 return log_warning_errno(errno, "Cannot open /proc/devices to resolve %s (%c): %m", name, type);
389 FOREACH_LINE(line, f, goto fail) {
390 char buf[2+DECIMAL_STR_MAX(unsigned)+3+4], *p, *w;
395 if (type == 'c' && streq(line, "Character devices:")) {
400 if (type == 'b' && streq(line, "Block devices:")) {
415 w = strpbrk(p, WHITESPACE);
420 r = safe_atou(p, &maj);
427 w += strspn(w, WHITESPACE);
if (fnmatch(name, w, 0) != 0)
continue;
438 r = cg_set_attribute("devices", path, "devices.allow", buf);
440 log_full_errno(IN_SET(r, -ENOENT, -EROFS, -EINVAL, -EACCES) ? LOG_DEBUG : LOG_WARNING, r,
441 "Failed to set devices.allow on %s: %m", path);
fail:
return log_warning_errno(errno, "Failed to read /proc/devices: %m");
450 static bool cgroup_context_has_cpu_weight(CGroupContext *c) {
451 return c->cpu_weight != CGROUP_WEIGHT_INVALID ||
452 c->startup_cpu_weight != CGROUP_WEIGHT_INVALID;
455 static bool cgroup_context_has_cpu_shares(CGroupContext *c) {
456 return c->cpu_shares != CGROUP_CPU_SHARES_INVALID ||
457 c->startup_cpu_shares != CGROUP_CPU_SHARES_INVALID;
460 static uint64_t cgroup_context_cpu_weight(CGroupContext *c, ManagerState state) {
461 if (IN_SET(state, MANAGER_STARTING, MANAGER_INITIALIZING) &&
462 c->startup_cpu_weight != CGROUP_WEIGHT_INVALID)
463 return c->startup_cpu_weight;
464 else if (c->cpu_weight != CGROUP_WEIGHT_INVALID)
465 return c->cpu_weight;
467 return CGROUP_WEIGHT_DEFAULT;
470 static uint64_t cgroup_context_cpu_shares(CGroupContext *c, ManagerState state) {
471 if (IN_SET(state, MANAGER_STARTING, MANAGER_INITIALIZING) &&
472 c->startup_cpu_shares != CGROUP_CPU_SHARES_INVALID)
473 return c->startup_cpu_shares;
474 else if (c->cpu_shares != CGROUP_CPU_SHARES_INVALID)
475 return c->cpu_shares;
477 return CGROUP_CPU_SHARES_DEFAULT;
480 static void cgroup_apply_unified_cpu_config(Unit *u, uint64_t weight, uint64_t quota) {
481 char buf[MAX(DECIMAL_STR_MAX(uint64_t) + 1, (DECIMAL_STR_MAX(usec_t) + 1) * 2)];
484 xsprintf(buf, "%" PRIu64 "\n", weight);
485 r = cg_set_attribute("cpu", u->cgroup_path, "cpu.weight", buf);
487 log_unit_full(u, IN_SET(r, -ENOENT, -EROFS, -EACCES) ? LOG_DEBUG : LOG_WARNING, r,
488 "Failed to set cpu.weight: %m");
490 if (quota != USEC_INFINITY)
491 xsprintf(buf, USEC_FMT " " USEC_FMT "\n",
492 quota * CGROUP_CPU_QUOTA_PERIOD_USEC / USEC_PER_SEC, CGROUP_CPU_QUOTA_PERIOD_USEC);
494 xsprintf(buf, "max " USEC_FMT "\n", CGROUP_CPU_QUOTA_PERIOD_USEC);
496 r = cg_set_attribute("cpu", u->cgroup_path, "cpu.max", buf);
499 log_unit_full(u, IN_SET(r, -ENOENT, -EROFS, -EACCES) ? LOG_DEBUG : LOG_WARNING, r,
500 "Failed to set cpu.max: %m");
503 static void cgroup_apply_legacy_cpu_config(Unit *u, uint64_t shares, uint64_t quota) {
504 char buf[MAX(DECIMAL_STR_MAX(uint64_t), DECIMAL_STR_MAX(usec_t)) + 1];
507 xsprintf(buf, "%" PRIu64 "\n", shares);
508 r = cg_set_attribute("cpu", u->cgroup_path, "cpu.shares", buf);
510 log_unit_full(u, IN_SET(r, -ENOENT, -EROFS, -EACCES) ? LOG_DEBUG : LOG_WARNING, r,
511 "Failed to set cpu.shares: %m");
513 xsprintf(buf, USEC_FMT "\n", CGROUP_CPU_QUOTA_PERIOD_USEC);
514 r = cg_set_attribute("cpu", u->cgroup_path, "cpu.cfs_period_us", buf);
516 log_unit_full(u, IN_SET(r, -ENOENT, -EROFS, -EACCES) ? LOG_DEBUG : LOG_WARNING, r,
517 "Failed to set cpu.cfs_period_us: %m");
519 if (quota != USEC_INFINITY) {
520 xsprintf(buf, USEC_FMT "\n", quota * CGROUP_CPU_QUOTA_PERIOD_USEC / USEC_PER_SEC);
521 r = cg_set_attribute("cpu", u->cgroup_path, "cpu.cfs_quota_us", buf);
} else
r = cg_set_attribute("cpu", u->cgroup_path, "cpu.cfs_quota_us", "-1");
525 log_unit_full(u, IN_SET(r, -ENOENT, -EROFS, -EACCES) ? LOG_DEBUG : LOG_WARNING, r,
526 "Failed to set cpu.cfs_quota_us: %m");
529 static uint64_t cgroup_cpu_shares_to_weight(uint64_t shares) {
530 return CLAMP(shares * CGROUP_WEIGHT_DEFAULT / CGROUP_CPU_SHARES_DEFAULT,
531 CGROUP_WEIGHT_MIN, CGROUP_WEIGHT_MAX);
534 static uint64_t cgroup_cpu_weight_to_shares(uint64_t weight) {
535 return CLAMP(weight * CGROUP_CPU_SHARES_DEFAULT / CGROUP_WEIGHT_DEFAULT,
536 CGROUP_CPU_SHARES_MIN, CGROUP_CPU_SHARES_MAX);
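/*
 * Illustrative round trip of the two conversions above, anchored at the
 * defaults (shares 1024 <-> weight 100):
 *
 *     cgroup_cpu_shares_to_weight(2048) == 200
 *     cgroup_cpu_weight_to_shares(50) == 512
 *
 * Both results are clamped to the respective MIN/MAX ranges, so the
 * mapping is lossy at the extremes.
 */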
539 static bool cgroup_context_has_io_config(CGroupContext *c) {
540 return c->io_accounting ||
541 c->io_weight != CGROUP_WEIGHT_INVALID ||
542 c->startup_io_weight != CGROUP_WEIGHT_INVALID ||
543 c->io_device_weights ||
547 static bool cgroup_context_has_blockio_config(CGroupContext *c) {
548 return c->blockio_accounting ||
549 c->blockio_weight != CGROUP_BLKIO_WEIGHT_INVALID ||
550 c->startup_blockio_weight != CGROUP_BLKIO_WEIGHT_INVALID ||
551 c->blockio_device_weights ||
552 c->blockio_device_bandwidths;
555 static uint64_t cgroup_context_io_weight(CGroupContext *c, ManagerState state) {
556 if (IN_SET(state, MANAGER_STARTING, MANAGER_INITIALIZING) &&
557 c->startup_io_weight != CGROUP_WEIGHT_INVALID)
558 return c->startup_io_weight;
559 else if (c->io_weight != CGROUP_WEIGHT_INVALID)
562 return CGROUP_WEIGHT_DEFAULT;
565 static uint64_t cgroup_context_blkio_weight(CGroupContext *c, ManagerState state) {
566 if (IN_SET(state, MANAGER_STARTING, MANAGER_INITIALIZING) &&
567 c->startup_blockio_weight != CGROUP_BLKIO_WEIGHT_INVALID)
568 return c->startup_blockio_weight;
569 else if (c->blockio_weight != CGROUP_BLKIO_WEIGHT_INVALID)
570 return c->blockio_weight;
572 return CGROUP_BLKIO_WEIGHT_DEFAULT;
575 static uint64_t cgroup_weight_blkio_to_io(uint64_t blkio_weight) {
576 return CLAMP(blkio_weight * CGROUP_WEIGHT_DEFAULT / CGROUP_BLKIO_WEIGHT_DEFAULT,
577 CGROUP_WEIGHT_MIN, CGROUP_WEIGHT_MAX);
580 static uint64_t cgroup_weight_io_to_blkio(uint64_t io_weight) {
581 return CLAMP(io_weight * CGROUP_BLKIO_WEIGHT_DEFAULT / CGROUP_WEIGHT_DEFAULT,
582 CGROUP_BLKIO_WEIGHT_MIN, CGROUP_BLKIO_WEIGHT_MAX);
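/*
 * Same scheme as the CPU conversion above, but anchored at blkio weight
 * 500 <-> io weight 100; illustratively, cgroup_weight_blkio_to_io(1000)
 * yields 200 and cgroup_weight_io_to_blkio(10) yields 50.
 */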
585 static void cgroup_apply_io_device_weight(Unit *u, const char *dev_path, uint64_t io_weight) {
586 char buf[DECIMAL_STR_MAX(dev_t)*2+2+DECIMAL_STR_MAX(uint64_t)+1];
590 r = lookup_block_device(dev_path, &dev);
594 xsprintf(buf, "%u:%u %" PRIu64 "\n", major(dev), minor(dev), io_weight);
595 r = cg_set_attribute("io", u->cgroup_path, "io.weight", buf);
597 log_unit_full(u, IN_SET(r, -ENOENT, -EROFS, -EACCES) ? LOG_DEBUG : LOG_WARNING, r,
598 "Failed to set io.weight: %m");
601 static void cgroup_apply_blkio_device_weight(Unit *u, const char *dev_path, uint64_t blkio_weight) {
602 char buf[DECIMAL_STR_MAX(dev_t)*2+2+DECIMAL_STR_MAX(uint64_t)+1];
606 r = lookup_block_device(dev_path, &dev);
610 xsprintf(buf, "%u:%u %" PRIu64 "\n", major(dev), minor(dev), blkio_weight);
611 r = cg_set_attribute("blkio", u->cgroup_path, "blkio.weight_device", buf);
613 log_unit_full(u, IN_SET(r, -ENOENT, -EROFS, -EACCES) ? LOG_DEBUG : LOG_WARNING, r,
614 "Failed to set blkio.weight_device: %m");
617 static unsigned cgroup_apply_io_device_limit(Unit *u, const char *dev_path, uint64_t *limits) {
618 char limit_bufs[_CGROUP_IO_LIMIT_TYPE_MAX][DECIMAL_STR_MAX(uint64_t)];
619 char buf[DECIMAL_STR_MAX(dev_t)*2+2+(6+DECIMAL_STR_MAX(uint64_t)+1)*4];
620 CGroupIOLimitType type;
625 r = lookup_block_device(dev_path, &dev);
629 for (type = 0; type < _CGROUP_IO_LIMIT_TYPE_MAX; type++) {
if (limits[type] != cgroup_io_limit_defaults[type]) {
xsprintf(limit_bufs[type], "%" PRIu64, limits[type]);
n++;
} else
xsprintf(limit_bufs[type], "%s", limits[type] == CGROUP_LIMIT_MAX ? "max" : "0");
}
638 xsprintf(buf, "%u:%u rbps=%s wbps=%s riops=%s wiops=%s\n", major(dev), minor(dev),
639 limit_bufs[CGROUP_IO_RBPS_MAX], limit_bufs[CGROUP_IO_WBPS_MAX],
640 limit_bufs[CGROUP_IO_RIOPS_MAX], limit_bufs[CGROUP_IO_WIOPS_MAX]);
641 r = cg_set_attribute("io", u->cgroup_path, "io.max", buf);
643 log_unit_full(u, IN_SET(r, -ENOENT, -EROFS, -EACCES) ? LOG_DEBUG : LOG_WARNING, r,
644 "Failed to set io.max: %m");
648 static unsigned cgroup_apply_blkio_device_limit(Unit *u, const char *dev_path, uint64_t rbps, uint64_t wbps) {
649 char buf[DECIMAL_STR_MAX(dev_t)*2+2+DECIMAL_STR_MAX(uint64_t)+1];
654 r = lookup_block_device(dev_path, &dev);
if (rbps != CGROUP_LIMIT_MAX)
n++;
sprintf(buf, "%u:%u %" PRIu64 "\n", major(dev), minor(dev), rbps);
661 r = cg_set_attribute("blkio", u->cgroup_path, "blkio.throttle.read_bps_device", buf);
663 log_unit_full(u, IN_SET(r, -ENOENT, -EROFS, -EACCES) ? LOG_DEBUG : LOG_WARNING, r,
664 "Failed to set blkio.throttle.read_bps_device: %m");
if (wbps != CGROUP_LIMIT_MAX)
n++;
sprintf(buf, "%u:%u %" PRIu64 "\n", major(dev), minor(dev), wbps);
669 r = cg_set_attribute("blkio", u->cgroup_path, "blkio.throttle.write_bps_device", buf);
671 log_unit_full(u, IN_SET(r, -ENOENT, -EROFS, -EACCES) ? LOG_DEBUG : LOG_WARNING, r,
672 "Failed to set blkio.throttle.write_bps_device: %m");
677 static bool cgroup_context_has_unified_memory_config(CGroupContext *c) {
678 return c->memory_low > 0 || c->memory_high != CGROUP_LIMIT_MAX || c->memory_max != CGROUP_LIMIT_MAX || c->memory_swap_max != CGROUP_LIMIT_MAX;
681 static void cgroup_apply_unified_memory_limit(Unit *u, const char *file, uint64_t v) {
682 char buf[DECIMAL_STR_MAX(uint64_t) + 1] = "max";
685 if (v != CGROUP_LIMIT_MAX)
686 xsprintf(buf, "%" PRIu64 "\n", v);
688 r = cg_set_attribute("memory", u->cgroup_path, file, buf);
690 log_unit_full(u, IN_SET(r, -ENOENT, -EROFS, -EACCES) ? LOG_DEBUG : LOG_WARNING, r,
691 "Failed to set %s: %m", file);
694 static void cgroup_apply_firewall(Unit *u) {
699 if (u->type == UNIT_SLICE) /* Skip this for slice units, they are inner cgroup nodes, and since bpf/cgroup is
700 * not recursive we don't ever touch the bpf on them */
703 r = bpf_firewall_compile(u);
707 (void) bpf_firewall_install(u);
711 static void cgroup_context_apply(
713 CGroupMask apply_mask,
715 ManagerState state) {
724 /* Nothing to do? Exit early! */
725 if (apply_mask == 0 && !apply_bpf)
728 /* Some cgroup attributes are not supported on the root cgroup, hence silently ignore */
729 is_root = unit_has_root_cgroup(u);
731 assert_se(c = unit_get_cgroup_context(u));
732 assert_se(path = u->cgroup_path);
if (is_root) /* Make sure we don't try to display messages with an empty path. */
path = "/";
737 /* We generally ignore errors caused by read-only mounted
738 * cgroup trees (assuming we are running in a container then),
739 * and missing cgroups, i.e. EROFS and ENOENT. */
741 if ((apply_mask & CGROUP_MASK_CPU) && !is_root) {
742 bool has_weight, has_shares;
744 has_weight = cgroup_context_has_cpu_weight(c);
745 has_shares = cgroup_context_has_cpu_shares(c);
if (cg_all_unified() > 0) {
uint64_t weight;

if (has_weight)
weight = cgroup_context_cpu_weight(c, state);
else if (has_shares) {
uint64_t shares = cgroup_context_cpu_shares(c, state);

weight = cgroup_cpu_shares_to_weight(shares);

log_cgroup_compat(u, "Applying [Startup]CpuShares %" PRIu64 " as [Startup]CpuWeight %" PRIu64 " on %s",
shares, weight, path);
} else
weight = CGROUP_WEIGHT_DEFAULT;

cgroup_apply_unified_cpu_config(u, weight, c->cpu_quota_per_sec_usec);
} else {
uint64_t shares;

if (has_weight) {
uint64_t weight = cgroup_context_cpu_weight(c, state);

shares = cgroup_cpu_weight_to_shares(weight);

log_cgroup_compat(u, "Applying [Startup]CpuWeight %" PRIu64 " as [Startup]CpuShares %" PRIu64 " on %s",
weight, shares, path);
} else if (has_shares)
shares = cgroup_context_cpu_shares(c, state);
else
shares = CGROUP_CPU_SHARES_DEFAULT;

cgroup_apply_legacy_cpu_config(u, shares, c->cpu_quota_per_sec_usec);
}
782 if (apply_mask & CGROUP_MASK_IO) {
783 bool has_io = cgroup_context_has_io_config(c);
784 bool has_blockio = cgroup_context_has_blockio_config(c);
787 char buf[8+DECIMAL_STR_MAX(uint64_t)+1];
if (has_io)
weight = cgroup_context_io_weight(c, state);
792 else if (has_blockio) {
793 uint64_t blkio_weight = cgroup_context_blkio_weight(c, state);
795 weight = cgroup_weight_blkio_to_io(blkio_weight);
797 log_cgroup_compat(u, "Applying [Startup]BlockIOWeight %" PRIu64 " as [Startup]IOWeight %" PRIu64,
798 blkio_weight, weight);
} else
weight = CGROUP_WEIGHT_DEFAULT;
802 xsprintf(buf, "default %" PRIu64 "\n", weight);
803 r = cg_set_attribute("io", path, "io.weight", buf);
805 log_unit_full(u, IN_SET(r, -ENOENT, -EROFS, -EACCES) ? LOG_DEBUG : LOG_WARNING, r,
806 "Failed to set io.weight: %m");
809 CGroupIODeviceWeight *w;
811 /* FIXME: no way to reset this list */
812 LIST_FOREACH(device_weights, w, c->io_device_weights)
813 cgroup_apply_io_device_weight(u, w->path, w->weight);
814 } else if (has_blockio) {
815 CGroupBlockIODeviceWeight *w;
817 /* FIXME: no way to reset this list */
818 LIST_FOREACH(device_weights, w, c->blockio_device_weights) {
819 weight = cgroup_weight_blkio_to_io(w->weight);
821 log_cgroup_compat(u, "Applying BlockIODeviceWeight %" PRIu64 " as IODeviceWeight %" PRIu64 " for %s",
822 w->weight, weight, w->path);
824 cgroup_apply_io_device_weight(u, w->path, weight);
829 /* Apply limits and free ones without config. */
831 CGroupIODeviceLimit *l, *next;
833 LIST_FOREACH_SAFE(device_limits, l, next, c->io_device_limits) {
834 if (!cgroup_apply_io_device_limit(u, l->path, l->limits))
835 cgroup_context_free_io_device_limit(c, l);
837 } else if (has_blockio) {
838 CGroupBlockIODeviceBandwidth *b, *next;
840 LIST_FOREACH_SAFE(device_bandwidths, b, next, c->blockio_device_bandwidths) {
841 uint64_t limits[_CGROUP_IO_LIMIT_TYPE_MAX];
842 CGroupIOLimitType type;
844 for (type = 0; type < _CGROUP_IO_LIMIT_TYPE_MAX; type++)
845 limits[type] = cgroup_io_limit_defaults[type];
847 limits[CGROUP_IO_RBPS_MAX] = b->rbps;
848 limits[CGROUP_IO_WBPS_MAX] = b->wbps;
850 log_cgroup_compat(u, "Applying BlockIO{Read|Write}Bandwidth %" PRIu64 " %" PRIu64 " as IO{Read|Write}BandwidthMax for %s",
851 b->rbps, b->wbps, b->path);
853 if (!cgroup_apply_io_device_limit(u, b->path, limits))
854 cgroup_context_free_blockio_device_bandwidth(c, b);
859 if (apply_mask & CGROUP_MASK_BLKIO) {
860 bool has_io = cgroup_context_has_io_config(c);
861 bool has_blockio = cgroup_context_has_blockio_config(c);
864 char buf[DECIMAL_STR_MAX(uint64_t)+1];
if (has_io) {
uint64_t io_weight = cgroup_context_io_weight(c, state);
870 weight = cgroup_weight_io_to_blkio(cgroup_context_io_weight(c, state));
872 log_cgroup_compat(u, "Applying [Startup]IOWeight %" PRIu64 " as [Startup]BlockIOWeight %" PRIu64,
874 } else if (has_blockio)
875 weight = cgroup_context_blkio_weight(c, state);
else
weight = CGROUP_BLKIO_WEIGHT_DEFAULT;
879 xsprintf(buf, "%" PRIu64 "\n", weight);
880 r = cg_set_attribute("blkio", path, "blkio.weight", buf);
882 log_unit_full(u, IN_SET(r, -ENOENT, -EROFS, -EACCES) ? LOG_DEBUG : LOG_WARNING, r,
883 "Failed to set blkio.weight: %m");
886 CGroupIODeviceWeight *w;
888 /* FIXME: no way to reset this list */
889 LIST_FOREACH(device_weights, w, c->io_device_weights) {
890 weight = cgroup_weight_io_to_blkio(w->weight);
892 log_cgroup_compat(u, "Applying IODeviceWeight %" PRIu64 " as BlockIODeviceWeight %" PRIu64 " for %s",
893 w->weight, weight, w->path);
895 cgroup_apply_blkio_device_weight(u, w->path, weight);
897 } else if (has_blockio) {
898 CGroupBlockIODeviceWeight *w;
900 /* FIXME: no way to reset this list */
901 LIST_FOREACH(device_weights, w, c->blockio_device_weights)
902 cgroup_apply_blkio_device_weight(u, w->path, w->weight);
906 /* Apply limits and free ones without config. */
908 CGroupIODeviceLimit *l, *next;
910 LIST_FOREACH_SAFE(device_limits, l, next, c->io_device_limits) {
911 log_cgroup_compat(u, "Applying IO{Read|Write}Bandwidth %" PRIu64 " %" PRIu64 " as BlockIO{Read|Write}BandwidthMax for %s",
912 l->limits[CGROUP_IO_RBPS_MAX], l->limits[CGROUP_IO_WBPS_MAX], l->path);
914 if (!cgroup_apply_blkio_device_limit(u, l->path, l->limits[CGROUP_IO_RBPS_MAX], l->limits[CGROUP_IO_WBPS_MAX]))
915 cgroup_context_free_io_device_limit(c, l);
917 } else if (has_blockio) {
918 CGroupBlockIODeviceBandwidth *b, *next;
920 LIST_FOREACH_SAFE(device_bandwidths, b, next, c->blockio_device_bandwidths)
921 if (!cgroup_apply_blkio_device_limit(u, b->path, b->rbps, b->wbps))
922 cgroup_context_free_blockio_device_bandwidth(c, b);
926 if ((apply_mask & CGROUP_MASK_MEMORY) && !is_root) {
927 if (cg_all_unified() > 0) {
928 uint64_t max, swap_max = CGROUP_LIMIT_MAX;
930 if (cgroup_context_has_unified_memory_config(c)) {
max = c->memory_max;
swap_max = c->memory_swap_max;
} else {
max = c->memory_limit;
936 if (max != CGROUP_LIMIT_MAX)
937 log_cgroup_compat(u, "Applying MemoryLimit %" PRIu64 " as MemoryMax", max);
940 cgroup_apply_unified_memory_limit(u, "memory.low", c->memory_low);
941 cgroup_apply_unified_memory_limit(u, "memory.high", c->memory_high);
942 cgroup_apply_unified_memory_limit(u, "memory.max", max);
943 cgroup_apply_unified_memory_limit(u, "memory.swap.max", swap_max);
945 char buf[DECIMAL_STR_MAX(uint64_t) + 1];
if (cgroup_context_has_unified_memory_config(c)) {
val = c->memory_max;
log_cgroup_compat(u, "Applying MemoryMax %" PRIu64 " as MemoryLimit", val);
} else
val = c->memory_limit;
954 if (val == CGROUP_LIMIT_MAX)
955 strncpy(buf, "-1\n", sizeof(buf));
else
xsprintf(buf, "%" PRIu64 "\n", val);
959 r = cg_set_attribute("memory", path, "memory.limit_in_bytes", buf);
961 log_unit_full(u, IN_SET(r, -ENOENT, -EROFS, -EACCES) ? LOG_DEBUG : LOG_WARNING, r,
962 "Failed to set memory.limit_in_bytes: %m");
966 if ((apply_mask & CGROUP_MASK_DEVICES) && !is_root) {
967 CGroupDeviceAllow *a;
969 /* Changing the devices list of a populated cgroup
* might result in EINVAL, hence ignore EINVAL here. */
973 if (c->device_allow || c->device_policy != CGROUP_AUTO)
974 r = cg_set_attribute("devices", path, "devices.deny", "a");
else
r = cg_set_attribute("devices", path, "devices.allow", "a");
978 log_unit_full(u, IN_SET(r, -ENOENT, -EROFS, -EINVAL, -EACCES) ? LOG_DEBUG : LOG_WARNING, r,
979 "Failed to reset devices.list: %m");
981 if (c->device_policy == CGROUP_CLOSED ||
982 (c->device_policy == CGROUP_AUTO && c->device_allow)) {
983 static const char auto_devices[] =
984 "/dev/null\0" "rwm\0"
985 "/dev/zero\0" "rwm\0"
986 "/dev/full\0" "rwm\0"
987 "/dev/random\0" "rwm\0"
988 "/dev/urandom\0" "rwm\0"
990 "/dev/ptmx\0" "rwm\0"
991 /* Allow /run/systemd/inaccessible/{chr,blk} devices for mapping InaccessiblePaths */
992 "-/run/systemd/inaccessible/chr\0" "rwm\0"
993 "-/run/systemd/inaccessible/blk\0" "rwm\0";
997 NULSTR_FOREACH_PAIR(x, y, auto_devices)
998 whitelist_device(path, x, y);
1000 /* PTS (/dev/pts) devices may not be duplicated, but accessed */
1001 whitelist_major(path, "pts", 'c', "rw");
1004 LIST_FOREACH(device_allow, a, c->device_allow) {
1020 if (path_startswith(a->path, "/dev/"))
1021 whitelist_device(path, a->path, acc);
1022 else if ((val = startswith(a->path, "block-")))
1023 whitelist_major(path, val, 'b', acc);
1024 else if ((val = startswith(a->path, "char-")))
1025 whitelist_major(path, val, 'c', acc);
else
log_unit_debug(u, "Ignoring device %s while writing cgroup attribute.", a->path);
1031 if ((apply_mask & CGROUP_MASK_PIDS) && !is_root) {
1033 if (c->tasks_max != CGROUP_LIMIT_MAX) {
1034 char buf[DECIMAL_STR_MAX(uint64_t) + 2];
1036 sprintf(buf, "%" PRIu64 "\n", c->tasks_max);
1037 r = cg_set_attribute("pids", path, "pids.max", buf);
} else
r = cg_set_attribute("pids", path, "pids.max", "max");
1042 log_unit_full(u, IN_SET(r, -ENOENT, -EROFS, -EACCES) ? LOG_DEBUG : LOG_WARNING, r,
1043 "Failed to set pids.max: %m");
1047 cgroup_apply_firewall(u);
1050 CGroupMask cgroup_context_get_mask(CGroupContext *c) {
1051 CGroupMask mask = 0;
1053 /* Figure out which controllers we need */
1055 if (c->cpu_accounting ||
1056 cgroup_context_has_cpu_weight(c) ||
1057 cgroup_context_has_cpu_shares(c) ||
1058 c->cpu_quota_per_sec_usec != USEC_INFINITY)
1059 mask |= CGROUP_MASK_CPUACCT | CGROUP_MASK_CPU;
1061 if (cgroup_context_has_io_config(c) || cgroup_context_has_blockio_config(c))
1062 mask |= CGROUP_MASK_IO | CGROUP_MASK_BLKIO;
1064 if (c->memory_accounting ||
1065 c->memory_limit != CGROUP_LIMIT_MAX ||
1066 cgroup_context_has_unified_memory_config(c))
1067 mask |= CGROUP_MASK_MEMORY;
1069 if (c->device_allow ||
1070 c->device_policy != CGROUP_AUTO)
1071 mask |= CGROUP_MASK_DEVICES;
1073 if (c->tasks_accounting ||
1074 c->tasks_max != CGROUP_LIMIT_MAX)
1075 mask |= CGROUP_MASK_PIDS;
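/*
 * Illustrative example: a unit that only sets MemoryMax= and TasksMax=
 * ends up with CGROUP_MASK_MEMORY|CGROUP_MASK_PIDS here, so only those
 * controllers are realized for its cgroup.
 */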
1080 CGroupMask unit_get_own_mask(Unit *u) {
1083 /* Returns the mask of controllers the unit needs for itself */
1085 c = unit_get_cgroup_context(u);
1089 return cgroup_context_get_mask(c) | unit_get_delegate_mask(u);
1092 CGroupMask unit_get_delegate_mask(Unit *u) {
1095 /* If delegation is turned on, then turn on selected controllers, unless we are on the legacy hierarchy and the
1096 * process we fork into is known to drop privileges, and hence shouldn't get access to the controllers.
1098 * Note that on the unified hierarchy it is safe to delegate controllers to unprivileged services. */
1100 if (u->type == UNIT_SLICE)
1103 c = unit_get_cgroup_context(u);
1110 if (cg_all_unified() <= 0) {
1113 e = unit_get_exec_context(u);
1114 if (e && !exec_context_maintains_privileges(e))
1118 return c->delegate_controllers;
1121 CGroupMask unit_get_members_mask(Unit *u) {
1124 /* Returns the mask of controllers all of the unit's children require, merged */
1126 if (u->cgroup_members_mask_valid)
1127 return u->cgroup_members_mask;
1129 u->cgroup_members_mask = 0;
1131 if (u->type == UNIT_SLICE) {
1136 HASHMAP_FOREACH_KEY(v, member, u->dependencies[UNIT_BEFORE], i) {
if (UNIT_DEREF(member->slice) != u)
continue;
1144 u->cgroup_members_mask |= unit_get_subtree_mask(member); /* note that this calls ourselves again, for the children */
1148 u->cgroup_members_mask_valid = true;
1149 return u->cgroup_members_mask;
1152 CGroupMask unit_get_siblings_mask(Unit *u) {
1155 /* Returns the mask of controllers all of the unit's siblings
1156 * require, i.e. the members mask of the unit's parent slice
1157 * if there is one. */
1159 if (UNIT_ISSET(u->slice))
1160 return unit_get_members_mask(UNIT_DEREF(u->slice));
1162 return unit_get_subtree_mask(u); /* we are the top-level slice */
1165 CGroupMask unit_get_subtree_mask(Unit *u) {
1167 /* Returns the mask of this subtree, meaning of the group
1168 * itself and its children. */
1170 return unit_get_own_mask(u) | unit_get_members_mask(u);
1173 CGroupMask unit_get_target_mask(Unit *u) {
1176 /* This returns the cgroup mask of all controllers to enable
1177 * for a specific cgroup, i.e. everything it needs itself,
1178 * plus all that its children need, plus all that its siblings
1179 * need. This is primarily useful on the legacy cgroup
1180 * hierarchy, where we need to duplicate each cgroup in each
1181 * hierarchy that shall be enabled for it. */
1183 mask = unit_get_own_mask(u) | unit_get_members_mask(u) | unit_get_siblings_mask(u);
1184 mask &= u->manager->cgroup_supported;
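/*
 * A sketch with hypothetical units: if a.slice contains b.service
 * (needs CPU) and c.service (needs MEMORY), then for b.service own=CPU
 * and siblings=CPU|MEMORY, so the target mask is CPU|MEMORY and the
 * whole sibling group gets cgroups in both hierarchies on legacy setups.
 */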
1189 CGroupMask unit_get_enable_mask(Unit *u) {
1192 /* This returns the cgroup mask of all controllers to enable
1193 * for the children of a specific cgroup. This is primarily
1194 * useful for the unified cgroup hierarchy, where each cgroup
1195 * controls which controllers are enabled for its children. */
1197 mask = unit_get_members_mask(u);
1198 mask &= u->manager->cgroup_supported;
1203 bool unit_get_needs_bpf(Unit *u) {
1208 /* We never attach BPF to slice units, as they are inner cgroup nodes and cgroup/BPF is not recursive at the
1210 if (u->type == UNIT_SLICE)
1213 c = unit_get_cgroup_context(u);
1217 if (c->ip_accounting ||
1218 c->ip_address_allow ||
1222 /* If any parent slice has an IP access list defined, it applies too */
1223 for (p = UNIT_DEREF(u->slice); p; p = UNIT_DEREF(p->slice)) {
1224 c = unit_get_cgroup_context(p);
1228 if (c->ip_address_allow ||
1236 /* Recurse from a unit up through its containing slices, propagating
* mask bits upward. A unit is also a member of itself. */
1238 void unit_update_cgroup_members_masks(Unit *u) {
1244 /* Calculate subtree mask */
1245 m = unit_get_subtree_mask(u);
1247 /* See if anything changed from the previous invocation. If
1248 * not, we're done. */
1249 if (u->cgroup_subtree_mask_valid && m == u->cgroup_subtree_mask)
1253 u->cgroup_subtree_mask_valid &&
1254 ((m & ~u->cgroup_subtree_mask) != 0) &&
1255 ((~m & u->cgroup_subtree_mask) == 0);
1257 u->cgroup_subtree_mask = m;
1258 u->cgroup_subtree_mask_valid = true;
1260 if (UNIT_ISSET(u->slice)) {
1261 Unit *s = UNIT_DEREF(u->slice);
1264 /* There's more set now than before. We
1265 * propagate the new mask to the parent's mask
1266 * (not caring if it actually was valid or
1269 s->cgroup_members_mask |= m;
1272 /* There's less set now than before (or we
1273 * don't know), we need to recalculate
1274 * everything, so let's invalidate the
1275 * parent's members mask */
1277 s->cgroup_members_mask_valid = false;
1279 /* And now make sure that this change also hits our
1281 unit_update_cgroup_members_masks(s);
1285 static const char *migrate_callback(CGroupMask mask, void *userdata) {
1292 if (u->cgroup_path &&
1293 u->cgroup_realized &&
1294 (u->cgroup_realized_mask & mask) == mask)
1295 return u->cgroup_path;
1297 u = UNIT_DEREF(u->slice);
1303 char *unit_default_cgroup_path(Unit *u) {
1304 _cleanup_free_ char *escaped = NULL, *slice = NULL;
1309 if (unit_has_name(u, SPECIAL_ROOT_SLICE))
1310 return strdup(u->manager->cgroup_root);
1312 if (UNIT_ISSET(u->slice) && !unit_has_name(UNIT_DEREF(u->slice), SPECIAL_ROOT_SLICE)) {
1313 r = cg_slice_to_path(UNIT_DEREF(u->slice)->id, &slice);
1318 escaped = cg_escape(u->id);
1323 return strjoin(u->manager->cgroup_root, "/", slice, "/",
1326 return strjoin(u->manager->cgroup_root, "/", escaped);
1329 int unit_set_cgroup_path(Unit *u, const char *path) {
1330 _cleanup_free_ char *p = NULL;
1342 if (streq_ptr(u->cgroup_path, p))
1346 r = hashmap_put(u->manager->cgroup_unit, p, u);
1351 unit_release_cgroup(u);
1359 int unit_watch_cgroup(Unit *u) {
1360 _cleanup_free_ char *events = NULL;
1365 if (!u->cgroup_path)
1368 if (u->cgroup_inotify_wd >= 0)
1371 /* Only applies to the unified hierarchy */
1372 r = cg_unified_controller(SYSTEMD_CGROUP_CONTROLLER);
1374 return log_error_errno(r, "Failed to determine whether the name=systemd hierarchy is unified: %m");
1378 /* Don't watch the root slice, it's pointless. */
1379 if (unit_has_name(u, SPECIAL_ROOT_SLICE))
1382 r = hashmap_ensure_allocated(&u->manager->cgroup_inotify_wd_unit, &trivial_hash_ops);
1386 r = cg_get_path(SYSTEMD_CGROUP_CONTROLLER, u->cgroup_path, "cgroup.events", &events);
1390 u->cgroup_inotify_wd = inotify_add_watch(u->manager->cgroup_inotify_fd, events, IN_MODIFY);
1391 if (u->cgroup_inotify_wd < 0) {
1393 if (errno == ENOENT) /* If the directory is already
1394 * gone we don't need to track
* it, so this is not an error */
return 0;
1398 return log_unit_error_errno(u, errno, "Failed to add inotify watch descriptor for control group %s: %m", u->cgroup_path);
1401 r = hashmap_put(u->manager->cgroup_inotify_wd_unit, INT_TO_PTR(u->cgroup_inotify_wd), u);
1403 return log_unit_error_errno(u, r, "Failed to add inotify watch descriptor to hash map: %m");
1408 int unit_pick_cgroup_path(Unit *u) {
1409 _cleanup_free_ char *path = NULL;
1417 if (!UNIT_HAS_CGROUP_CONTEXT(u))
1420 path = unit_default_cgroup_path(u);
1424 r = unit_set_cgroup_path(u, path);
1426 return log_unit_error_errno(u, r, "Control group %s exists already.", path);
1428 return log_unit_error_errno(u, r, "Failed to set unit's control group path to %s: %m", path);
1433 static int unit_create_cgroup(
1435 CGroupMask target_mask,
1436 CGroupMask enable_mask,
1444 c = unit_get_cgroup_context(u);
1448 /* Figure out our cgroup path */
1449 r = unit_pick_cgroup_path(u);
1453 /* First, create our own group */
1454 r = cg_create_everywhere(u->manager->cgroup_supported, target_mask, u->cgroup_path);
1456 return log_unit_error_errno(u, r, "Failed to create cgroup %s: %m", u->cgroup_path);
1458 /* Start watching it */
1459 (void) unit_watch_cgroup(u);
1461 /* Enable all controllers we need */
1462 r = cg_enable_everywhere(u->manager->cgroup_supported, enable_mask, u->cgroup_path);
if (r < 0)
log_unit_warning_errno(u, r, "Failed to enable controllers on cgroup %s, ignoring: %m", u->cgroup_path);
1466 /* Keep track that this is now realized */
1467 u->cgroup_realized = true;
1468 u->cgroup_realized_mask = target_mask;
1469 u->cgroup_enabled_mask = enable_mask;
1470 u->cgroup_bpf_state = needs_bpf ? UNIT_CGROUP_BPF_ON : UNIT_CGROUP_BPF_OFF;
1472 if (u->type != UNIT_SLICE && !c->delegate) {
1474 /* Then, possibly move things over, but not if
1475 * subgroups may contain processes, which is the case
1476 * for slice and delegation units. */
1477 r = cg_migrate_everywhere(u->manager->cgroup_supported, u->cgroup_path, u->cgroup_path, migrate_callback, u);
if (r < 0)
log_unit_warning_errno(u, r, "Failed to migrate cgroup processes to %s, ignoring: %m", u->cgroup_path);
1485 int unit_attach_pids_to_cgroup(Unit *u) {
1489 r = unit_realize_cgroup(u);
1493 r = cg_attach_many_everywhere(u->manager->cgroup_supported, u->cgroup_path, u->pids, migrate_callback, u);
1500 static void cgroup_xattr_apply(Unit *u) {
1501 char ids[SD_ID128_STRING_MAX];
1506 if (!MANAGER_IS_SYSTEM(u->manager))
1509 if (sd_id128_is_null(u->invocation_id))
1512 r = cg_set_xattr(SYSTEMD_CGROUP_CONTROLLER, u->cgroup_path,
1513 "trusted.invocation_id",
1514 sd_id128_to_string(u->invocation_id, ids), 32,
1517 log_unit_debug_errno(u, r, "Failed to set invocation ID on control group %s, ignoring: %m", u->cgroup_path);
1520 static bool unit_has_mask_realized(
1522 CGroupMask target_mask,
1523 CGroupMask enable_mask,
1528 return u->cgroup_realized &&
1529 u->cgroup_realized_mask == target_mask &&
1530 u->cgroup_enabled_mask == enable_mask &&
1531 ((needs_bpf && u->cgroup_bpf_state == UNIT_CGROUP_BPF_ON) ||
1532 (!needs_bpf && u->cgroup_bpf_state == UNIT_CGROUP_BPF_OFF));
1535 static void unit_add_to_cgroup_realize_queue(Unit *u) {
1538 if (u->in_cgroup_realize_queue)
1541 LIST_PREPEND(cgroup_realize_queue, u->manager->cgroup_realize_queue, u);
1542 u->in_cgroup_realize_queue = true;
1545 static void unit_remove_from_cgroup_realize_queue(Unit *u) {
1548 if (!u->in_cgroup_realize_queue)
1551 LIST_REMOVE(cgroup_realize_queue, u->manager->cgroup_realize_queue, u);
1552 u->in_cgroup_realize_queue = false;
1556 /* Check if necessary controllers and attributes for a unit are in place.
1558 * If so, do nothing.
1559 * If not, create paths, move processes over, and set attributes.
1561 * Returns 0 on success and < 0 on failure. */
1562 static int unit_realize_cgroup_now(Unit *u, ManagerState state) {
1563 CGroupMask target_mask, enable_mask;
1564 bool needs_bpf, apply_bpf;
1569 unit_remove_from_cgroup_realize_queue(u);
1571 target_mask = unit_get_target_mask(u);
1572 enable_mask = unit_get_enable_mask(u);
1573 needs_bpf = unit_get_needs_bpf(u);
1575 if (unit_has_mask_realized(u, target_mask, enable_mask, needs_bpf))
1578 /* Make sure we apply the BPF filters either when one is configured, or if none is configured but previously
1579 * the state was anything but off. This way, if a unit with a BPF filter applied is reconfigured to lose it
1580 * this will trickle down properly to cgroupfs. */
1581 apply_bpf = needs_bpf || u->cgroup_bpf_state != UNIT_CGROUP_BPF_OFF;
1583 /* First, realize parents */
1584 if (UNIT_ISSET(u->slice)) {
1585 r = unit_realize_cgroup_now(UNIT_DEREF(u->slice), state);
1590 /* And then do the real work */
1591 r = unit_create_cgroup(u, target_mask, enable_mask, needs_bpf);
1595 /* Finally, apply the necessary attributes. */
1596 cgroup_context_apply(u, target_mask, apply_bpf, state);
1597 cgroup_xattr_apply(u);
1602 unsigned manager_dispatch_cgroup_realize_queue(Manager *m) {
1610 state = manager_state(m);
1612 while ((i = m->cgroup_realize_queue)) {
1613 assert(i->in_cgroup_realize_queue);
1615 if (UNIT_IS_INACTIVE_OR_FAILED(unit_active_state(i))) {
1616 /* Maybe things changed, and the unit is not actually active anymore? */
1617 unit_remove_from_cgroup_realize_queue(i);
1621 r = unit_realize_cgroup_now(i, state);
if (r < 0)
log_warning_errno(r, "Failed to realize cgroups for queued unit %s, ignoring: %m", i->id);
1631 static void unit_add_siblings_to_cgroup_realize_queue(Unit *u) {
1634 /* This adds the siblings of the specified unit and the
1635 * siblings of all parent units to the cgroup queue. (But
1636 * neither the specified unit itself nor the parents.) */
1638 while ((slice = UNIT_DEREF(u->slice))) {
1643 HASHMAP_FOREACH_KEY(v, m, u->dependencies[UNIT_BEFORE], i) {
1647 /* Skip units that have a dependency on the slice
1648 * but aren't actually in it. */
if (UNIT_DEREF(m->slice) != slice)
continue;
1652 /* No point in doing cgroup application for units
1653 * without active processes. */
if (UNIT_IS_INACTIVE_OR_FAILED(unit_active_state(m)))
continue;
1657 /* If the unit doesn't need any new controllers
1658 * and has current ones realized, it doesn't need
1660 if (unit_has_mask_realized(m,
1661 unit_get_target_mask(m),
1662 unit_get_enable_mask(m),
unit_get_needs_bpf(m)))
continue;
1666 unit_add_to_cgroup_realize_queue(m);
1673 int unit_realize_cgroup(Unit *u) {
1676 if (!UNIT_HAS_CGROUP_CONTEXT(u))
1679 /* So, here's the deal: when realizing the cgroups for this
1680 * unit, we need to first create all parents, but there's more
1681 * actually: for the weight-based controllers we also need to
1682 * make sure that all our siblings (i.e. units that are in the
1683 * same slice as we are) have cgroups, too. Otherwise, things
1684 * would become very uneven as each of their processes would
1685 * get as much resources as all our group together. This call
1686 * will synchronously create the parent cgroups, but will
1687 * defer work on the siblings to the next event loop
1690 /* Add all sibling slices to the cgroup queue. */
1691 unit_add_siblings_to_cgroup_realize_queue(u);
1693 /* And realize this one now (and apply the values) */
1694 return unit_realize_cgroup_now(u, manager_state(u->manager));
1697 void unit_release_cgroup(Unit *u) {
1700 /* Forgets all cgroup details for this cgroup */
1702 if (u->cgroup_path) {
1703 (void) hashmap_remove(u->manager->cgroup_unit, u->cgroup_path);
1704 u->cgroup_path = mfree(u->cgroup_path);
1707 if (u->cgroup_inotify_wd >= 0) {
1708 if (inotify_rm_watch(u->manager->cgroup_inotify_fd, u->cgroup_inotify_wd) < 0)
1709 log_unit_debug_errno(u, errno, "Failed to remove cgroup inotify watch %i for %s, ignoring", u->cgroup_inotify_wd, u->id);
1711 (void) hashmap_remove(u->manager->cgroup_inotify_wd_unit, INT_TO_PTR(u->cgroup_inotify_wd));
1712 u->cgroup_inotify_wd = -1;
1716 void unit_prune_cgroup(Unit *u) {
1722 /* Removes the cgroup, if empty and possible, and stops watching it. */
1724 if (!u->cgroup_path)
1727 (void) unit_get_cpu_usage(u, NULL); /* Cache the last CPU usage value before we destroy the cgroup */
1729 is_root_slice = unit_has_name(u, SPECIAL_ROOT_SLICE);
1731 r = cg_trim_everywhere(u->manager->cgroup_supported, u->cgroup_path, !is_root_slice);
if (r < 0)
log_unit_debug_errno(u, r, "Failed to destroy cgroup %s, ignoring: %m", u->cgroup_path);
1740 unit_release_cgroup(u);
1742 u->cgroup_realized = false;
1743 u->cgroup_realized_mask = 0;
1744 u->cgroup_enabled_mask = 0;
1747 int unit_search_main_pid(Unit *u, pid_t *ret) {
1748 _cleanup_fclose_ FILE *f = NULL;
1749 pid_t pid = 0, npid, mypid;
1755 if (!u->cgroup_path)
1758 r = cg_enumerate_processes(SYSTEMD_CGROUP_CONTROLLER, u->cgroup_path, &f);
1762 mypid = getpid_cached();
1763 while (cg_read_pid(f, &npid) > 0) {
1769 /* Ignore processes that aren't our kids */
if (get_process_ppid(npid, &ppid) >= 0 && ppid != mypid)
continue;
1774 /* Dang, there's more than one daemonized PID
1775 in this group, so we don't know what process
1776 is the main process. */
1787 static int unit_watch_pids_in_path(Unit *u, const char *path) {
1788 _cleanup_closedir_ DIR *d = NULL;
1789 _cleanup_fclose_ FILE *f = NULL;
1795 r = cg_enumerate_processes(SYSTEMD_CGROUP_CONTROLLER, path, &f);
1801 while ((r = cg_read_pid(f, &pid)) > 0) {
1802 r = unit_watch_pid(u, pid);
1803 if (r < 0 && ret >= 0)
1807 if (r < 0 && ret >= 0)
1811 r = cg_enumerate_subgroups(SYSTEMD_CGROUP_CONTROLLER, path, &d);
1818 while ((r = cg_read_subgroup(d, &fn)) > 0) {
1819 _cleanup_free_ char *p = NULL;
1821 p = strjoin(path, "/", fn);
1827 r = unit_watch_pids_in_path(u, p);
1828 if (r < 0 && ret >= 0)
1832 if (r < 0 && ret >= 0)
1839 int unit_watch_all_pids(Unit *u) {
1844 /* Adds all PIDs from our cgroup to the set of PIDs we
1845 * watch. This is a fallback logic for cases where we do not
1846 * get reliable cgroup empty notifications: we try to use
1847 * SIGCHLD as replacement. */
1849 if (!u->cgroup_path)
1852 r = cg_unified_controller(SYSTEMD_CGROUP_CONTROLLER);
if (r > 0) /* On unified we can use proper notifications */
return 0;
1858 return unit_watch_pids_in_path(u, u->cgroup_path);
1861 static int on_cgroup_empty_event(sd_event_source *s, void *userdata) {
1862 Manager *m = userdata;
1869 u = m->cgroup_empty_queue;
1873 assert(u->in_cgroup_empty_queue);
1874 u->in_cgroup_empty_queue = false;
1875 LIST_REMOVE(cgroup_empty_queue, m->cgroup_empty_queue, u);
1877 if (m->cgroup_empty_queue) {
1878 /* More stuff queued, let's make sure we remain enabled */
1879 r = sd_event_source_set_enabled(s, SD_EVENT_ONESHOT);
if (r < 0)
log_debug_errno(r, "Failed to reenable cgroup empty event source: %m");
1884 unit_add_to_gc_queue(u);
1886 if (UNIT_VTABLE(u)->notify_cgroup_empty)
1887 UNIT_VTABLE(u)->notify_cgroup_empty(u);
1892 void unit_add_to_cgroup_empty_queue(Unit *u) {
1897 /* Note that there are four different ways how cgroup empty events reach us:
1899 * 1. On the unified hierarchy we get an inotify event on the cgroup
1901 * 2. On the legacy hierarchy, when running in system mode, we get a datagram on the cgroup agent socket
1903 * 3. On the legacy hierarchy, when running in user mode, we get a D-Bus signal on the system bus
1905 * 4. On the legacy hierarchy, in service units we start watching all processes of the cgroup for SIGCHLD as
1906 * soon as we get one SIGCHLD, to deal with unreliable cgroup notifications.
1908 * Regardless which way we got the notification, we'll verify it here, and then add it to a separate
1909 * queue. This queue will be dispatched at a lower priority than the SIGCHLD handler, so that we always use
1910 * SIGCHLD if we can get it first, and only use the cgroup empty notifications if there's no SIGCHLD pending
1911 * (which might happen if the cgroup doesn't contain processes that are our own child, which is typically the
1912 * case for scope units). */
1914 if (u->in_cgroup_empty_queue)
1917 /* Let's verify that the cgroup is really empty */
1918 if (!u->cgroup_path)
1920 r = cg_is_empty_recursive(SYSTEMD_CGROUP_CONTROLLER, u->cgroup_path);
if (r < 0)
log_unit_debug_errno(u, r, "Failed to determine whether cgroup %s is empty: %m", u->cgroup_path);
1928 LIST_PREPEND(cgroup_empty_queue, u->manager->cgroup_empty_queue, u);
1929 u->in_cgroup_empty_queue = true;
1931 /* Trigger the defer event */
1932 r = sd_event_source_set_enabled(u->manager->cgroup_empty_event_source, SD_EVENT_ONESHOT);
if (r < 0)
log_debug_errno(r, "Failed to enable cgroup empty event source: %m");
1937 static int on_cgroup_inotify_event(sd_event_source *s, int fd, uint32_t revents, void *userdata) {
1938 Manager *m = userdata;
1945 union inotify_event_buffer buffer;
1946 struct inotify_event *e;
1949 l = read(fd, &buffer, sizeof(buffer));
1951 if (IN_SET(errno, EINTR, EAGAIN))
1954 return log_error_errno(errno, "Failed to read control group inotify events: %m");
1957 FOREACH_INOTIFY_EVENT(e, buffer, l) {
1961 /* Queue overflow has no watch descriptor */
1964 if (e->mask & IN_IGNORED)
/* The watch was just removed */
continue;
1968 u = hashmap_get(m->cgroup_inotify_wd_unit, INT_TO_PTR(e->wd));
if (!u) /* Note that inotify might deliver
1970 * events for a watch even after it
1971 * was removed, because it was queued
1972 * before the removal. Let's ignore
1973 * this here safely. */
1976 unit_add_to_cgroup_empty_queue(u);
1982 int manager_setup_cgroup(Manager *m) {
1983 _cleanup_free_ char *path = NULL;
1984 const char *scope_path;
1987 #if 0 /// UNNEEDED by elogind
1993 /* 1. Determine hierarchy */
1994 m->cgroup_root = mfree(m->cgroup_root);
1995 #if 0 /// elogind is not init and must therefore search for PID 1 instead of self.
r = cg_pid_get_path(SYSTEMD_CGROUP_CONTROLLER, 0, &m->cgroup_root);
#else
r = cg_pid_get_path(SYSTEMD_CGROUP_CONTROLLER, 1, &m->cgroup_root);
#endif // 0
if (r < 0)
return log_error_errno(r, "Cannot determine cgroup we are running in: %m");
2003 #if 0 /// elogind does not support systemd scopes and slices
2004 /* Chop off the init scope, if we are already located in it */
2005 e = endswith(m->cgroup_root, "/" SPECIAL_INIT_SCOPE);
2007 /* LEGACY: Also chop off the system slice if we are in
2008 * it. This is to support live upgrades from older systemd
2009 * versions where PID 1 was moved there. Also see
2010 * cg_get_root_path(). */
2011 if (!e && MANAGER_IS_SYSTEM(m)) {
2012 e = endswith(m->cgroup_root, "/" SPECIAL_SYSTEM_SLICE);
if (!e)
e = endswith(m->cgroup_root, "/system"); /* even more legacy */
2020 log_debug_elogind("Cgroup Controller \"%s\" -> root \"%s\"",
2021 SYSTEMD_CGROUP_CONTROLLER, m->cgroup_root);
2022 /* And make sure to store away the root value without trailing slash, even for the root dir, so that we can
2023 * easily prepend it everywhere. */
2024 delete_trailing_chars(m->cgroup_root, "/");
2027 r = cg_get_path(SYSTEMD_CGROUP_CONTROLLER, m->cgroup_root, NULL, &path);
if (r < 0)
return log_error_errno(r, "Cannot find cgroup mount point: %m");
2031 r = cg_unified_flush();
if (r < 0)
return log_error_errno(r, "Couldn't determine if we are running in the unified hierarchy: %m");
2035 all_unified = cg_all_unified();
2036 if (all_unified < 0)
2037 return log_error_errno(all_unified, "Couldn't determine whether we are in all unified mode: %m");
2038 if (all_unified > 0)
2039 log_debug("Unified cgroup hierarchy is located at %s.", path);
else {
r = cg_unified_controller(SYSTEMD_CGROUP_CONTROLLER);
if (r < 0)
return log_error_errno(r, "Failed to determine whether systemd's own controller is in unified mode: %m");
if (r > 0)
log_debug("Unified cgroup hierarchy is located at %s. Controllers are on legacy hierarchies.", path);
else
log_debug("Using cgroup controller " SYSTEMD_CGROUP_CONTROLLER_LEGACY ". File system hierarchy is at %s.", path);
2050 #if 0 /// elogind is not init, and does not install the agent here.
2051 /* 3. Allocate cgroup empty defer event source */
2052 m->cgroup_empty_event_source = sd_event_source_unref(m->cgroup_empty_event_source);
2053 r = sd_event_add_defer(m->event, &m->cgroup_empty_event_source, on_cgroup_empty_event, m);
if (r < 0)
return log_error_errno(r, "Failed to create cgroup empty event source: %m");
2057 r = sd_event_source_set_priority(m->cgroup_empty_event_source, SD_EVENT_PRIORITY_NORMAL-5);
if (r < 0)
return log_error_errno(r, "Failed to set priority of cgroup empty event source: %m");
2061 r = sd_event_source_set_enabled(m->cgroup_empty_event_source, SD_EVENT_OFF);
if (r < 0)
return log_error_errno(r, "Failed to disable cgroup empty event source: %m");
2065 (void) sd_event_source_set_description(m->cgroup_empty_event_source, "cgroup-empty");
2067 /* 4. Install notifier inotify object, or agent */
2068 if (cg_unified_controller(SYSTEMD_CGROUP_CONTROLLER) > 0) {
2070 /* In the unified hierarchy we can get cgroup empty notifications via inotify. */
2072 m->cgroup_inotify_event_source = sd_event_source_unref(m->cgroup_inotify_event_source);
2073 safe_close(m->cgroup_inotify_fd);
2075 m->cgroup_inotify_fd = inotify_init1(IN_NONBLOCK|IN_CLOEXEC);
2076 if (m->cgroup_inotify_fd < 0)
2077 return log_error_errno(errno, "Failed to create control group inotify object: %m");
2079 r = sd_event_add_io(m->event, &m->cgroup_inotify_event_source, m->cgroup_inotify_fd, EPOLLIN, on_cgroup_inotify_event, m);
if (r < 0)
return log_error_errno(r, "Failed to watch control group inotify object: %m");
2083 /* Process cgroup empty notifications early, but after service notifications and SIGCHLD. Also
2084 * see handling of cgroup agent notifications, for the classic cgroup hierarchy support. */
2085 r = sd_event_source_set_priority(m->cgroup_inotify_event_source, SD_EVENT_PRIORITY_NORMAL-4);
if (r < 0)
return log_error_errno(r, "Failed to set priority of inotify event source: %m");
2089 (void) sd_event_source_set_description(m->cgroup_inotify_event_source, "cgroup-inotify");
2091 } else if (MANAGER_IS_SYSTEM(m) && m->test_run_flags == 0) {
2093 /* On the legacy hierarchy we only get notifications via cgroup agents. (Which isn't really reliable,
* since it does not generate events when control groups with children run empty.) */
2096 r = cg_install_release_agent(SYSTEMD_CGROUP_CONTROLLER, SYSTEMD_CGROUP_AGENT_PATH);
if (r < 0)
log_warning_errno(r, "Failed to install release agent, ignoring: %m");
else if (r > 0)
log_debug("Installed release agent.");
else if (r == 0)
log_debug("Release agent already installed.");
2105 /* 5. Make sure we are in the special "init.scope" unit in the root slice. */
2106 scope_path = strjoina(m->cgroup_root, "/" SPECIAL_INIT_SCOPE);
2107 r = cg_create_and_attach(SYSTEMD_CGROUP_CONTROLLER, scope_path, 0);
2110 * This method is in core, and normally called by systemd
2111 * being init. As elogind is never init, we can not install
2112 * our agent here. We do so when mounting our cgroup file
2113 * system, so only if elogind is its own tiny controller.
2114 * Further, elogind is not meant to run in systemd init scope. */
2115 if (MANAGER_IS_SYSTEM(m))
2116 // we are our own cgroup controller
2117 scope_path = strjoina("");
2118 else if (streq(m->cgroup_root, "/elogind"))
2119 // root already is our cgroup
2120 scope_path = strjoina(m->cgroup_root);
2122 // we have to create our own group
2123 scope_path = strjoina(m->cgroup_root, "/elogind");
2124 r = cg_create_and_attach(SYSTEMD_CGROUP_CONTROLLER, scope_path, 0);
if (r < 0)
return log_error_errno(r, "Failed to create %s control group: %m", scope_path);
2128 log_debug_elogind("Created control group \"%s\"", scope_path);
2130 #if 0 /// elogind is not a "sub-controller" like systemd, so migration is not needed.
2131 /* Also, move all other userspace processes remaining in the root cgroup into that scope. */
2132 r = cg_migrate(SYSTEMD_CGROUP_CONTROLLER, m->cgroup_root, SYSTEMD_CGROUP_CONTROLLER, scope_path, 0);
if (r < 0)
log_warning_errno(r, "Couldn't move remaining userspace processes, ignoring: %m");
2137 /* 6. And pin it, so that it cannot be unmounted */
2138 safe_close(m->pin_cgroupfs_fd);
2139 m->pin_cgroupfs_fd = open(path, O_RDONLY|O_CLOEXEC|O_DIRECTORY|O_NOCTTY|O_NONBLOCK);
2140 if (m->pin_cgroupfs_fd < 0)
2141 return log_error_errno(errno, "Failed to open pin file: %m");
2143 /* 7. Always enable hierarchical support if it exists... */
2144 if (!all_unified && m->test_run_flags == 0)
2145 (void) cg_set_attribute("memory", "/", "memory.use_hierarchy", "1");
2147 /* 8. Figure out which controllers are supported, and log about it */
2148 r = cg_mask_supported(&m->cgroup_supported);
if (r < 0)
return log_error_errno(r, "Failed to determine supported controllers: %m");
2151 for (c = 0; c < _CGROUP_CONTROLLER_MAX; c++)
2152 log_debug("Controller '%s' supported: %s", cgroup_controller_to_string(c), yes_no(m->cgroup_supported & CGROUP_CONTROLLER_TO_MASK(c)));
2157 void manager_shutdown_cgroup(Manager *m, bool delete) {
/* We can't really delete the group, since we are in it. But let's trim it. */
2162 if (delete && m->cgroup_root)
2163 (void) cg_trim(SYSTEMD_CGROUP_CONTROLLER, m->cgroup_root, false);
2165 #if 0 /// elogind is not init
2166 m->cgroup_empty_event_source = sd_event_source_unref(m->cgroup_empty_event_source);
2168 m->cgroup_inotify_wd_unit = hashmap_free(m->cgroup_inotify_wd_unit);
2170 m->cgroup_inotify_event_source = sd_event_source_unref(m->cgroup_inotify_event_source);
2171 m->cgroup_inotify_fd = safe_close(m->cgroup_inotify_fd);
2174 m->pin_cgroupfs_fd = safe_close(m->pin_cgroupfs_fd);
2176 m->cgroup_root = mfree(m->cgroup_root);
2179 #if 0 /// UNNEEDED by elogind
2180 Unit* manager_get_unit_by_cgroup(Manager *m, const char *cgroup) {
2187 u = hashmap_get(m->cgroup_unit, cgroup);
2191 p = strdupa(cgroup);
2195 e = strrchr(p, '/');
2197 return hashmap_get(m->cgroup_unit, SPECIAL_ROOT_SLICE);
2201 u = hashmap_get(m->cgroup_unit, p);
2207 Unit *manager_get_unit_by_pid_cgroup(Manager *m, pid_t pid) {
2208 _cleanup_free_ char *cgroup = NULL;
2216 r = cg_pid_get_path(SYSTEMD_CGROUP_CONTROLLER, pid, &cgroup);
2220 return manager_get_unit_by_cgroup(m, cgroup);
2223 Unit *manager_get_unit_by_pid(Manager *m, pid_t pid) {
2232 return hashmap_get(m->units, SPECIAL_INIT_SCOPE);
2234 u = hashmap_get(m->watch_pids1, PID_TO_PTR(pid));
2238 u = hashmap_get(m->watch_pids2, PID_TO_PTR(pid));
2242 return manager_get_unit_by_pid_cgroup(m, pid);
2246 #if 0 /// elogind must substitute this with its own variant
2247 int manager_notify_cgroup_empty(Manager *m, const char *cgroup) {
2253 /* Called on the legacy hierarchy whenever we get an explicit cgroup notification from the cgroup agent process
2254 * or from the --system instance */
2256 log_debug("Got cgroup empty notification for: %s", cgroup);
2258 u = manager_get_unit_by_cgroup(m, cgroup);
2262 unit_add_to_cgroup_empty_queue(u);
2266 int manager_notify_cgroup_empty(Manager *m, const char *cgroup) {
2272 log_debug("Got cgroup empty notification for: %s", cgroup);
2274 s = hashmap_get(m->sessions, cgroup);
2277 session_finalize(s);
2280 log_warning("Session not found: %s", cgroup);
2285 #if 0 /// UNNEEDED by elogind
2286 int unit_get_memory_current(Unit *u, uint64_t *ret) {
2287 _cleanup_free_ char *v = NULL;
2293 if (!UNIT_CGROUP_BOOL(u, memory_accounting))
2296 if (!u->cgroup_path)
2299 if ((u->cgroup_realized_mask & CGROUP_MASK_MEMORY) == 0)
2302 r = cg_all_unified();
2306 r = cg_get_attribute("memory", u->cgroup_path, "memory.current", &v);
2308 r = cg_get_attribute("memory", u->cgroup_path, "memory.usage_in_bytes", &v);
2314 return safe_atou64(v, ret);
2317 int unit_get_tasks_current(Unit *u, uint64_t *ret) {
2318 _cleanup_free_ char *v = NULL;
2324 if (!UNIT_CGROUP_BOOL(u, tasks_accounting))
2327 if (!u->cgroup_path)
2330 if ((u->cgroup_realized_mask & CGROUP_MASK_PIDS) == 0)
2333 r = cg_get_attribute("pids", u->cgroup_path, "pids.current", &v);
2339 return safe_atou64(v, ret);
2342 static int unit_get_cpu_usage_raw(Unit *u, nsec_t *ret) {
2343 _cleanup_free_ char *v = NULL;
2350 if (!u->cgroup_path)
2353 r = cg_all_unified();
2357 const char *keys[] = { "usage_usec", NULL };
2358 _cleanup_free_ char *val = NULL;
2361 if ((u->cgroup_realized_mask & CGROUP_MASK_CPU) == 0)
2364 r = cg_get_keyed_attribute("cpu", u->cgroup_path, "cpu.stat", keys, &val);
2368 r = safe_atou64(val, &us);
2372 ns = us * NSEC_PER_USEC;
2374 if ((u->cgroup_realized_mask & CGROUP_MASK_CPUACCT) == 0)
2377 r = cg_get_attribute("cpuacct", u->cgroup_path, "cpuacct.usage", &v);
2383 r = safe_atou64(v, &ns);
2392 int unit_get_cpu_usage(Unit *u, nsec_t *ret) {
2398 /* Retrieve the current CPU usage counter. This will subtract the CPU counter taken when the unit was
2399 * started. If the cgroup has been removed already, returns the last cached value. To cache the value, simply
2400 * call this function with a NULL return value. */
2402 if (!UNIT_CGROUP_BOOL(u, cpu_accounting))
2405 r = unit_get_cpu_usage_raw(u, &ns);
2406 if (r == -ENODATA && u->cpu_usage_last != NSEC_INFINITY) {
2407 /* If we can't get the CPU usage anymore (because the cgroup was already removed, for example), use our
2411 *ret = u->cpu_usage_last;
2417 if (ns > u->cpu_usage_base)
2418 ns -= u->cpu_usage_base;
2422 u->cpu_usage_last = ns;
2429 int unit_get_ip_accounting(
2431 CGroupIPAccountingMetric metric,
2438 assert(metric >= 0);
2439 assert(metric < _CGROUP_IP_ACCOUNTING_METRIC_MAX);
2442 /* IP accounting is currently not recursive, and hence we refuse to return any data for slice nodes. Slices are
2443 * inner cgroup nodes and hence have no processes directly attached, hence their counters would be zero
* anyway. And if we block this now we can later open this up, if the kernel learns recursive BPF cgroup filtering. */
2446 if (u->type == UNIT_SLICE)
2449 if (!UNIT_CGROUP_BOOL(u, ip_accounting))
2452 fd = IN_SET(metric, CGROUP_IP_INGRESS_BYTES, CGROUP_IP_INGRESS_PACKETS) ?
2453 u->ip_accounting_ingress_map_fd :
2454 u->ip_accounting_egress_map_fd;
2458 if (IN_SET(metric, CGROUP_IP_INGRESS_BYTES, CGROUP_IP_EGRESS_BYTES))
2459 r = bpf_firewall_read_accounting(fd, &value, NULL);
2461 r = bpf_firewall_read_accounting(fd, NULL, &value);
2465 /* Add in additional metrics from a previous runtime. Note that when reexecing/reloading the daemon we compile
2466 * all BPF programs and maps anew, but serialize the old counters. When deserializing we store them in the
2467 * ip_accounting_extra[] field, and add them in here transparently. */
2469 *ret = value + u->ip_accounting_extra[metric];
2474 int unit_reset_cpu_accounting(Unit *u) {
2480 u->cpu_usage_last = NSEC_INFINITY;
2482 r = unit_get_cpu_usage_raw(u, &ns);
2484 u->cpu_usage_base = 0;
2488 u->cpu_usage_base = ns;
2492 int unit_reset_ip_accounting(Unit *u) {
2497 if (u->ip_accounting_ingress_map_fd >= 0)
2498 r = bpf_firewall_reset_accounting(u->ip_accounting_ingress_map_fd);
2500 if (u->ip_accounting_egress_map_fd >= 0)
2501 q = bpf_firewall_reset_accounting(u->ip_accounting_egress_map_fd);
2503 zero(u->ip_accounting_extra);
2505 return r < 0 ? r : q;
2508 void unit_invalidate_cgroup(Unit *u, CGroupMask m) {
2511 if (!UNIT_HAS_CGROUP_CONTEXT(u))
2517 /* always invalidate compat pairs together */
2518 if (m & (CGROUP_MASK_IO | CGROUP_MASK_BLKIO))
2519 m |= CGROUP_MASK_IO | CGROUP_MASK_BLKIO;
2521 if (m & (CGROUP_MASK_CPU | CGROUP_MASK_CPUACCT))
2522 m |= CGROUP_MASK_CPU | CGROUP_MASK_CPUACCT;
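/*
 * Illustrative consequence: unit_invalidate_cgroup(u, CGROUP_MASK_IO)
 * also marks CGROUP_MASK_BLKIO dirty, so the unified and legacy
 * attributes are always rewritten together and cannot drift apart.
 */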
2524 if ((u->cgroup_realized_mask & m) == 0) /* NOP? */
2527 u->cgroup_realized_mask &= ~m;
2528 unit_add_to_cgroup_realize_queue(u);
2531 void unit_invalidate_cgroup_bpf(Unit *u) {
2534 if (!UNIT_HAS_CGROUP_CONTEXT(u))
2537 if (u->cgroup_bpf_state == UNIT_CGROUP_BPF_INVALIDATED) /* NOP? */
2540 u->cgroup_bpf_state = UNIT_CGROUP_BPF_INVALIDATED;
2541 unit_add_to_cgroup_realize_queue(u);
/* If we are a slice unit, we also need to compile a new BPF program for all our children, as the IP access
2544 * list of our children includes our own. */
2545 if (u->type == UNIT_SLICE) {
2550 HASHMAP_FOREACH_KEY(v, member, u->dependencies[UNIT_BEFORE], i) {
if (UNIT_DEREF(member->slice) != u)
continue;
2557 unit_invalidate_cgroup_bpf(member);
2562 void manager_invalidate_startup_units(Manager *m) {
2568 SET_FOREACH(u, m->startup_units, i)
2569 unit_invalidate_cgroup(u, CGROUP_MASK_CPU|CGROUP_MASK_IO|CGROUP_MASK_BLKIO);
2572 static const char* const cgroup_device_policy_table[_CGROUP_DEVICE_POLICY_MAX] = {
2573 [CGROUP_AUTO] = "auto",
2574 [CGROUP_CLOSED] = "closed",
2575 [CGROUP_STRICT] = "strict",
2578 DEFINE_STRING_TABLE_LOOKUP(cgroup_device_policy, CGroupDevicePolicy);
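/*
 * DEFINE_STRING_TABLE_LOOKUP generates cgroup_device_policy_to_string()
 * and cgroup_device_policy_from_string() from the table above; e.g.
 * cgroup_device_policy_from_string("closed") returns CGROUP_CLOSED, and
 * unknown strings map to _CGROUP_DEVICE_POLICY_INVALID.
 */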