/* SPDX-License-Identifier: LGPL-2.1+ */
/***
  This file is part of systemd.

  Copyright 2013 Lennart Poettering

  systemd is free software; you can redistribute it and/or modify it
  under the terms of the GNU Lesser General Public License as published by
  the Free Software Foundation; either version 2.1 of the License, or
  (at your option) any later version.

  systemd is distributed in the hope that it will be useful, but
  WITHOUT ANY WARRANTY; without even the implied warranty of
  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  Lesser General Public License for more details.

  You should have received a copy of the GNU Lesser General Public License
  along with systemd; If not, see <http://www.gnu.org/licenses/>.
***/
#include <fcntl.h>
#include <fnmatch.h>

#include "alloc-util.h"
//#include "blockdev-util.h"
//#include "bpf-firewall.h"
#include "cgroup-util.h"
#include "cgroup.h"
#include "fd-util.h"
#include "fileio.h"
#include "fs-util.h"
#include "parse-util.h"
#include "path-util.h"
#include "process-util.h"
//#include "procfs-util.h"
//#include "special.h"
#include "stdio-util.h"
#include "string-table.h"
#include "string-util.h"
#define CGROUP_CPU_QUOTA_PERIOD_USEC ((usec_t) 100 * USEC_PER_MSEC)
bool unit_has_root_cgroup(Unit *u) {
        assert(u);

        /* Returns whether this unit manages the root cgroup. Note that this is different from being named "-.slice",
         * as inside of containers the root slice won't be identical to the root cgroup. */

        return isempty(u->cgroup_path) || path_equal(u->cgroup_path, "/");
}
#if 0 /// UNNEEDED by elogind
static void cgroup_compat_warn(void) {
        static bool cgroup_compat_warned = false;

        if (cgroup_compat_warned)
                return;

        log_warning("cgroup compatibility translation between legacy and unified hierarchy settings activated. See cgroup-compat debug messages for details.");
        cgroup_compat_warned = true;
}

#define log_cgroup_compat(unit, fmt, ...) do {                                  \
                cgroup_compat_warn();                                           \
                log_unit_debug(unit, "cgroup-compat: " fmt, ##__VA_ARGS__);     \
        } while (false)
#endif // 0
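/* Note on the macro above: it deliberately splits the logging in two. The loud
 * warning that a legacy<->unified translation is happening at all is emitted only
 * once per process, while the per-unit details are logged at debug level on every
 * translation. */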
void cgroup_context_init(CGroupContext *c) {
        assert(c);

        /* Initialize everything to the kernel defaults, assuming the
         * structure is preinitialized to 0 */

        c->cpu_weight = CGROUP_WEIGHT_INVALID;
        c->startup_cpu_weight = CGROUP_WEIGHT_INVALID;
        c->cpu_quota_per_sec_usec = USEC_INFINITY;

        c->cpu_shares = CGROUP_CPU_SHARES_INVALID;
        c->startup_cpu_shares = CGROUP_CPU_SHARES_INVALID;

        c->memory_high = CGROUP_LIMIT_MAX;
        c->memory_max = CGROUP_LIMIT_MAX;
        c->memory_swap_max = CGROUP_LIMIT_MAX;

        c->memory_limit = CGROUP_LIMIT_MAX;

        c->io_weight = CGROUP_WEIGHT_INVALID;
        c->startup_io_weight = CGROUP_WEIGHT_INVALID;

        c->blockio_weight = CGROUP_BLKIO_WEIGHT_INVALID;
        c->startup_blockio_weight = CGROUP_BLKIO_WEIGHT_INVALID;

        c->tasks_max = (uint64_t) -1;
}
void cgroup_context_free_device_allow(CGroupContext *c, CGroupDeviceAllow *a) {
        assert(c);
        assert(a);

        LIST_REMOVE(device_allow, c->device_allow, a);
        free(a->path);
        free(a);
}

void cgroup_context_free_io_device_weight(CGroupContext *c, CGroupIODeviceWeight *w) {
        assert(c);
        assert(w);

        LIST_REMOVE(device_weights, c->io_device_weights, w);
        free(w->path);
        free(w);
}

void cgroup_context_free_io_device_limit(CGroupContext *c, CGroupIODeviceLimit *l) {
        assert(c);
        assert(l);

        LIST_REMOVE(device_limits, c->io_device_limits, l);
        free(l->path);
        free(l);
}

void cgroup_context_free_blockio_device_weight(CGroupContext *c, CGroupBlockIODeviceWeight *w) {
        assert(c);
        assert(w);

        LIST_REMOVE(device_weights, c->blockio_device_weights, w);
        free(w->path);
        free(w);
}

void cgroup_context_free_blockio_device_bandwidth(CGroupContext *c, CGroupBlockIODeviceBandwidth *b) {
        assert(c);
        assert(b);

        LIST_REMOVE(device_bandwidths, c->blockio_device_bandwidths, b);
        free(b->path);
        free(b);
}
void cgroup_context_done(CGroupContext *c) {
        assert(c);

        while (c->io_device_weights)
                cgroup_context_free_io_device_weight(c, c->io_device_weights);

        while (c->io_device_limits)
                cgroup_context_free_io_device_limit(c, c->io_device_limits);

        while (c->blockio_device_weights)
                cgroup_context_free_blockio_device_weight(c, c->blockio_device_weights);

        while (c->blockio_device_bandwidths)
                cgroup_context_free_blockio_device_bandwidth(c, c->blockio_device_bandwidths);

        while (c->device_allow)
                cgroup_context_free_device_allow(c, c->device_allow);

        c->ip_address_allow = ip_address_access_free_all(c->ip_address_allow);
        c->ip_address_deny = ip_address_access_free_all(c->ip_address_deny);
}
void cgroup_context_dump(CGroupContext *c, FILE* f, const char *prefix) {
        CGroupIODeviceLimit *il;
        CGroupIODeviceWeight *iw;
        CGroupBlockIODeviceBandwidth *b;
        CGroupBlockIODeviceWeight *w;
        CGroupDeviceAllow *a;
        IPAddressAccessItem *iaai;
        char u[FORMAT_TIMESPAN_MAX];

        assert(c);
        assert(f);

        prefix = strempty(prefix);

        fprintf(f,
                "%sCPUAccounting=%s\n"
                "%sIOAccounting=%s\n"
                "%sBlockIOAccounting=%s\n"
                "%sMemoryAccounting=%s\n"
                "%sTasksAccounting=%s\n"
                "%sIPAccounting=%s\n"
                "%sCPUWeight=%" PRIu64 "\n"
                "%sStartupCPUWeight=%" PRIu64 "\n"
                "%sCPUShares=%" PRIu64 "\n"
                "%sStartupCPUShares=%" PRIu64 "\n"
                "%sCPUQuotaPerSecSec=%s\n"
                "%sIOWeight=%" PRIu64 "\n"
                "%sStartupIOWeight=%" PRIu64 "\n"
                "%sBlockIOWeight=%" PRIu64 "\n"
                "%sStartupBlockIOWeight=%" PRIu64 "\n"
                "%sMemoryLow=%" PRIu64 "\n"
                "%sMemoryHigh=%" PRIu64 "\n"
                "%sMemoryMax=%" PRIu64 "\n"
                "%sMemorySwapMax=%" PRIu64 "\n"
                "%sMemoryLimit=%" PRIu64 "\n"
                "%sTasksMax=%" PRIu64 "\n"
                "%sDevicePolicy=%s\n"
                "%sDelegate=%s\n",
                prefix, yes_no(c->cpu_accounting),
                prefix, yes_no(c->io_accounting),
                prefix, yes_no(c->blockio_accounting),
                prefix, yes_no(c->memory_accounting),
                prefix, yes_no(c->tasks_accounting),
                prefix, yes_no(c->ip_accounting),
                prefix, c->cpu_weight,
                prefix, c->startup_cpu_weight,
                prefix, c->cpu_shares,
                prefix, c->startup_cpu_shares,
                prefix, format_timespan(u, sizeof(u), c->cpu_quota_per_sec_usec, 1),
                prefix, c->io_weight,
                prefix, c->startup_io_weight,
                prefix, c->blockio_weight,
                prefix, c->startup_blockio_weight,
                prefix, c->memory_low,
                prefix, c->memory_high,
                prefix, c->memory_max,
                prefix, c->memory_swap_max,
                prefix, c->memory_limit,
                prefix, c->tasks_max,
                prefix, cgroup_device_policy_to_string(c->device_policy),
                prefix, yes_no(c->delegate));

        if (c->delegate) {
                _cleanup_free_ char *t = NULL;

                (void) cg_mask_to_string(c->delegate_controllers, &t);

                fprintf(f, "%sDelegateControllers=%s\n",
                        prefix,
                        strempty(t));
        }

        LIST_FOREACH(device_allow, a, c->device_allow)
                fprintf(f,
                        "%sDeviceAllow=%s %s%s%s\n",
                        prefix,
                        a->path,
                        a->r ? "r" : "", a->w ? "w" : "", a->m ? "m" : "");

        LIST_FOREACH(device_weights, iw, c->io_device_weights)
                fprintf(f,
                        "%sIODeviceWeight=%s %" PRIu64 "\n",
                        prefix,
                        iw->path,
                        iw->weight);

        LIST_FOREACH(device_limits, il, c->io_device_limits) {
                char buf[FORMAT_BYTES_MAX];
                CGroupIOLimitType type;

                for (type = 0; type < _CGROUP_IO_LIMIT_TYPE_MAX; type++)
                        if (il->limits[type] != cgroup_io_limit_defaults[type])
                                fprintf(f,
                                        "%s%s=%s %s\n",
                                        prefix,
                                        cgroup_io_limit_type_to_string(type),
                                        il->path,
                                        format_bytes(buf, sizeof(buf), il->limits[type]));
        }

        LIST_FOREACH(device_weights, w, c->blockio_device_weights)
                fprintf(f,
                        "%sBlockIODeviceWeight=%s %" PRIu64 "\n",
                        prefix,
                        w->path,
                        w->weight);

        LIST_FOREACH(device_bandwidths, b, c->blockio_device_bandwidths) {
                char buf[FORMAT_BYTES_MAX];

                if (b->rbps != CGROUP_LIMIT_MAX)
                        fprintf(f,
                                "%sBlockIOReadBandwidth=%s %s\n",
                                prefix,
                                b->path,
                                format_bytes(buf, sizeof(buf), b->rbps));
                if (b->wbps != CGROUP_LIMIT_MAX)
                        fprintf(f,
                                "%sBlockIOWriteBandwidth=%s %s\n",
                                prefix,
                                b->path,
                                format_bytes(buf, sizeof(buf), b->wbps));
        }

        LIST_FOREACH(items, iaai, c->ip_address_allow) {
                _cleanup_free_ char *k = NULL;

                (void) in_addr_to_string(iaai->family, &iaai->address, &k);
                fprintf(f, "%sIPAddressAllow=%s/%u\n", prefix, strnull(k), iaai->prefixlen);
        }

        LIST_FOREACH(items, iaai, c->ip_address_deny) {
                _cleanup_free_ char *k = NULL;

                (void) in_addr_to_string(iaai->family, &iaai->address, &k);
                fprintf(f, "%sIPAddressDeny=%s/%u\n", prefix, strnull(k), iaai->prefixlen);
        }
}
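/* Illustrative sample of the dump above for a freshly initialized context (the
 * values reflect the kernel defaults set in cgroup_context_init()):
 *
 *     CPUAccounting=no
 *     CPUWeight=18446744073709551615      (i.e. CGROUP_WEIGHT_INVALID)
 *     CPUQuotaPerSecSec=infinity
 *     TasksMax=18446744073709551615
 *     DevicePolicy=auto
 */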
static int lookup_block_device(const char *p, dev_t *dev) {
        struct stat st;
        int r;

        assert(p);
        assert(dev);

        r = stat(p, &st);
        if (r < 0)
                return log_warning_errno(errno, "Couldn't stat device %s: %m", p);

        if (S_ISBLK(st.st_mode))
                *dev = st.st_rdev;
        else if (major(st.st_dev) != 0) {
                /* If this is not a device node then find the block
                 * device this file is stored on */
                *dev = st.st_dev;

                /* If this is a partition, try to get the originating
                 * block device */
                (void) block_get_whole_disk(*dev, dev);
        } else {
                log_warning("%s is not a block device and file system block device cannot be determined or is not local.", p);
                return -ENODEV;
        }

        return 0;
}
static int whitelist_device(const char *path, const char *node, const char *acc) {
        char buf[2+DECIMAL_STR_MAX(dev_t)*2+2+4];
        struct stat st;
        bool ignore_notfound;
        int r;

        assert(path);
        assert(acc);

        if (node[0] == '-') {
                /* Non-existent paths starting with "-" must be silently ignored */
                node++;
                ignore_notfound = true;
        } else
                ignore_notfound = false;

        if (stat(node, &st) < 0) {
                if (errno == ENOENT && ignore_notfound)
                        return 0;

                return log_warning_errno(errno, "Couldn't stat device %s: %m", node);
        }

        if (!S_ISCHR(st.st_mode) && !S_ISBLK(st.st_mode)) {
                log_warning("%s is not a device.", node);
                return -ENODEV;
        }

        sprintf(buf,
                "%c %u:%u %s",
                S_ISCHR(st.st_mode) ? 'c' : 'b',
                major(st.st_rdev), minor(st.st_rdev),
                acc);

        r = cg_set_attribute("devices", path, "devices.allow", buf);
        if (r < 0)
                log_full_errno(IN_SET(r, -ENOENT, -EROFS, -EINVAL, -EACCES) ? LOG_DEBUG : LOG_WARNING, r,
                               "Failed to set devices.allow on %s: %m", path);

        return r;
}
static int whitelist_major(const char *path, const char *name, char type, const char *acc) {
        _cleanup_fclose_ FILE *f = NULL;
        char line[LINE_MAX];
        bool good = false;
        int r;

        assert(path);
        assert(acc);
        assert(IN_SET(type, 'b', 'c'));

        f = fopen("/proc/devices", "re");
        if (!f)
                return log_warning_errno(errno, "Cannot open /proc/devices to resolve %s (%c): %m", name, type);
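        /* /proc/devices lists one major number and driver name per line, grouped
         * under "Character devices:" and "Block devices:" headers, e.g.:
         *
         *   Character devices:
         *     1 mem
         *     5 /dev/tty
         *
         *   Block devices:
         *     8 sd
         *
         * The loop below scans for the requested group and matches the driver
         * name against the pattern via fnmatch(). */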
        FOREACH_LINE(line, f, goto fail) {
                char buf[2+DECIMAL_STR_MAX(unsigned)+3+4], *p, *w;
                unsigned maj;

                truncate_nl(line);

                if (type == 'c' && streq(line, "Character devices:")) {
                        good = true;
                        continue;
                }

                if (type == 'b' && streq(line, "Block devices:")) {
                        good = true;
                        continue;
                }

                if (isempty(line)) {
                        good = false;
                        continue;
                }

                if (!good)
                        continue;

                p = strstrip(line);

                w = strpbrk(p, WHITESPACE);
                if (!w)
                        continue;
                *w = 0;

                r = safe_atou(p, &maj);
                if (r < 0)
                        continue;
                if (maj <= 0)
                        continue;

                w++;
                w += strspn(w, WHITESPACE);

                if (fnmatch(name, w, 0) != 0)
                        continue;

                sprintf(buf,
                        "%c %u:* %s",
                        type,
                        maj,
                        acc);

                r = cg_set_attribute("devices", path, "devices.allow", buf);
                if (r < 0)
                        log_full_errno(IN_SET(r, -ENOENT, -EROFS, -EINVAL, -EACCES) ? LOG_DEBUG : LOG_WARNING, r,
                                       "Failed to set devices.allow on %s: %m", path);
        }

        return 0;

fail:
        return log_warning_errno(errno, "Failed to read /proc/devices: %m");
}
static bool cgroup_context_has_cpu_weight(CGroupContext *c) {
        return c->cpu_weight != CGROUP_WEIGHT_INVALID ||
                c->startup_cpu_weight != CGROUP_WEIGHT_INVALID;
}

static bool cgroup_context_has_cpu_shares(CGroupContext *c) {
        return c->cpu_shares != CGROUP_CPU_SHARES_INVALID ||
                c->startup_cpu_shares != CGROUP_CPU_SHARES_INVALID;
}
static uint64_t cgroup_context_cpu_weight(CGroupContext *c, ManagerState state) {
        if (IN_SET(state, MANAGER_STARTING, MANAGER_INITIALIZING) &&
            c->startup_cpu_weight != CGROUP_WEIGHT_INVALID)
                return c->startup_cpu_weight;
        else if (c->cpu_weight != CGROUP_WEIGHT_INVALID)
                return c->cpu_weight;
        else
                return CGROUP_WEIGHT_DEFAULT;
}

static uint64_t cgroup_context_cpu_shares(CGroupContext *c, ManagerState state) {
        if (IN_SET(state, MANAGER_STARTING, MANAGER_INITIALIZING) &&
            c->startup_cpu_shares != CGROUP_CPU_SHARES_INVALID)
                return c->startup_cpu_shares;
        else if (c->cpu_shares != CGROUP_CPU_SHARES_INVALID)
                return c->cpu_shares;
        else
                return CGROUP_CPU_SHARES_DEFAULT;
}
static void cgroup_apply_unified_cpu_config(Unit *u, uint64_t weight, uint64_t quota) {
        char buf[MAX(DECIMAL_STR_MAX(uint64_t) + 1, (DECIMAL_STR_MAX(usec_t) + 1) * 2)];
        int r;

        xsprintf(buf, "%" PRIu64 "\n", weight);
        r = cg_set_attribute("cpu", u->cgroup_path, "cpu.weight", buf);
        if (r < 0)
                log_unit_full(u, IN_SET(r, -ENOENT, -EROFS, -EACCES) ? LOG_DEBUG : LOG_WARNING, r,
                              "Failed to set cpu.weight: %m");
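        /* cpu.max on the unified hierarchy takes "QUOTA PERIOD" in microseconds, or
         * "max PERIOD" for no limit. Example: CPUQuota=50% yields
         * cpu_quota_per_sec_usec == 500000, which with the fixed 100ms period used
         * here is written as "50000 100000". */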
        if (quota != USEC_INFINITY)
                xsprintf(buf, USEC_FMT " " USEC_FMT "\n",
                         quota * CGROUP_CPU_QUOTA_PERIOD_USEC / USEC_PER_SEC, CGROUP_CPU_QUOTA_PERIOD_USEC);
        else
                xsprintf(buf, "max " USEC_FMT "\n", CGROUP_CPU_QUOTA_PERIOD_USEC);

        r = cg_set_attribute("cpu", u->cgroup_path, "cpu.max", buf);
        if (r < 0)
                log_unit_full(u, IN_SET(r, -ENOENT, -EROFS, -EACCES) ? LOG_DEBUG : LOG_WARNING, r,
                              "Failed to set cpu.max: %m");
}
static void cgroup_apply_legacy_cpu_config(Unit *u, uint64_t shares, uint64_t quota) {
        char buf[MAX(DECIMAL_STR_MAX(uint64_t), DECIMAL_STR_MAX(usec_t)) + 1];
        int r;

        xsprintf(buf, "%" PRIu64 "\n", shares);
        r = cg_set_attribute("cpu", u->cgroup_path, "cpu.shares", buf);
        if (r < 0)
                log_unit_full(u, IN_SET(r, -ENOENT, -EROFS, -EACCES) ? LOG_DEBUG : LOG_WARNING, r,
                              "Failed to set cpu.shares: %m");

        xsprintf(buf, USEC_FMT "\n", CGROUP_CPU_QUOTA_PERIOD_USEC);
        r = cg_set_attribute("cpu", u->cgroup_path, "cpu.cfs_period_us", buf);
        if (r < 0)
                log_unit_full(u, IN_SET(r, -ENOENT, -EROFS, -EACCES) ? LOG_DEBUG : LOG_WARNING, r,
                              "Failed to set cpu.cfs_period_us: %m");

        if (quota != USEC_INFINITY) {
                xsprintf(buf, USEC_FMT "\n", quota * CGROUP_CPU_QUOTA_PERIOD_USEC / USEC_PER_SEC);
                r = cg_set_attribute("cpu", u->cgroup_path, "cpu.cfs_quota_us", buf);
        } else
                r = cg_set_attribute("cpu", u->cgroup_path, "cpu.cfs_quota_us", "-1");
        if (r < 0)
                log_unit_full(u, IN_SET(r, -ENOENT, -EROFS, -EACCES) ? LOG_DEBUG : LOG_WARNING, r,
                              "Failed to set cpu.cfs_quota_us: %m");
}
static uint64_t cgroup_cpu_shares_to_weight(uint64_t shares) {
        return CLAMP(shares * CGROUP_WEIGHT_DEFAULT / CGROUP_CPU_SHARES_DEFAULT,
                     CGROUP_WEIGHT_MIN, CGROUP_WEIGHT_MAX);
}

static uint64_t cgroup_cpu_weight_to_shares(uint64_t weight) {
        return CLAMP(weight * CGROUP_CPU_SHARES_DEFAULT / CGROUP_WEIGHT_DEFAULT,
                     CGROUP_CPU_SHARES_MIN, CGROUP_CPU_SHARES_MAX);
}
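/* Example of the linear mapping implemented above: the legacy default of 1024
 * shares maps to the unified default weight of 100, 2048 shares map to weight 200,
 * and the results are clamped to the valid weight range [1, 10000] resp. the valid
 * shares range [2, 262144]. */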
static bool cgroup_context_has_io_config(CGroupContext *c) {
        return c->io_accounting ||
                c->io_weight != CGROUP_WEIGHT_INVALID ||
                c->startup_io_weight != CGROUP_WEIGHT_INVALID ||
                c->io_device_weights ||
                c->io_device_limits;
}

static bool cgroup_context_has_blockio_config(CGroupContext *c) {
        return c->blockio_accounting ||
                c->blockio_weight != CGROUP_BLKIO_WEIGHT_INVALID ||
                c->startup_blockio_weight != CGROUP_BLKIO_WEIGHT_INVALID ||
                c->blockio_device_weights ||
                c->blockio_device_bandwidths;
}
static uint64_t cgroup_context_io_weight(CGroupContext *c, ManagerState state) {
        if (IN_SET(state, MANAGER_STARTING, MANAGER_INITIALIZING) &&
            c->startup_io_weight != CGROUP_WEIGHT_INVALID)
                return c->startup_io_weight;
        else if (c->io_weight != CGROUP_WEIGHT_INVALID)
                return c->io_weight;
        else
                return CGROUP_WEIGHT_DEFAULT;
}

static uint64_t cgroup_context_blkio_weight(CGroupContext *c, ManagerState state) {
        if (IN_SET(state, MANAGER_STARTING, MANAGER_INITIALIZING) &&
            c->startup_blockio_weight != CGROUP_BLKIO_WEIGHT_INVALID)
                return c->startup_blockio_weight;
        else if (c->blockio_weight != CGROUP_BLKIO_WEIGHT_INVALID)
                return c->blockio_weight;
        else
                return CGROUP_BLKIO_WEIGHT_DEFAULT;
}
static uint64_t cgroup_weight_blkio_to_io(uint64_t blkio_weight) {
        return CLAMP(blkio_weight * CGROUP_WEIGHT_DEFAULT / CGROUP_BLKIO_WEIGHT_DEFAULT,
                     CGROUP_WEIGHT_MIN, CGROUP_WEIGHT_MAX);
}

static uint64_t cgroup_weight_io_to_blkio(uint64_t io_weight) {
        return CLAMP(io_weight * CGROUP_BLKIO_WEIGHT_DEFAULT / CGROUP_WEIGHT_DEFAULT,
                     CGROUP_BLKIO_WEIGHT_MIN, CGROUP_BLKIO_WEIGHT_MAX);
}
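/* Same idea for the block I/O controller: the legacy blkio.weight range is
 * [10, 1000] with a default of 500, scaled linearly onto the unified io.weight
 * range [1, 10000] with its default of 100, so blkio weight 500 <-> io weight 100. */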
static void cgroup_apply_io_device_weight(Unit *u, const char *dev_path, uint64_t io_weight) {
        char buf[DECIMAL_STR_MAX(dev_t)*2+2+DECIMAL_STR_MAX(uint64_t)+1];
        dev_t dev;
        int r;

        r = lookup_block_device(dev_path, &dev);
        if (r < 0)
                return;

        xsprintf(buf, "%u:%u %" PRIu64 "\n", major(dev), minor(dev), io_weight);
        r = cg_set_attribute("io", u->cgroup_path, "io.weight", buf);
        if (r < 0)
                log_unit_full(u, IN_SET(r, -ENOENT, -EROFS, -EACCES) ? LOG_DEBUG : LOG_WARNING, r,
                              "Failed to set io.weight: %m");
}
static void cgroup_apply_blkio_device_weight(Unit *u, const char *dev_path, uint64_t blkio_weight) {
        char buf[DECIMAL_STR_MAX(dev_t)*2+2+DECIMAL_STR_MAX(uint64_t)+1];
        dev_t dev;
        int r;

        r = lookup_block_device(dev_path, &dev);
        if (r < 0)
                return;

        xsprintf(buf, "%u:%u %" PRIu64 "\n", major(dev), minor(dev), blkio_weight);
        r = cg_set_attribute("blkio", u->cgroup_path, "blkio.weight_device", buf);
        if (r < 0)
                log_unit_full(u, IN_SET(r, -ENOENT, -EROFS, -EACCES) ? LOG_DEBUG : LOG_WARNING, r,
                              "Failed to set blkio.weight_device: %m");
}
static unsigned cgroup_apply_io_device_limit(Unit *u, const char *dev_path, uint64_t *limits) {
        char limit_bufs[_CGROUP_IO_LIMIT_TYPE_MAX][DECIMAL_STR_MAX(uint64_t)];
        char buf[DECIMAL_STR_MAX(dev_t)*2+2+(6+DECIMAL_STR_MAX(uint64_t)+1)*4];
        CGroupIOLimitType type;
        dev_t dev;
        unsigned n = 0;
        int r;

        r = lookup_block_device(dev_path, &dev);
        if (r < 0)
                return 0;

        for (type = 0; type < _CGROUP_IO_LIMIT_TYPE_MAX; type++) {
                if (limits[type] != cgroup_io_limit_defaults[type]) {
                        xsprintf(limit_bufs[type], "%" PRIu64, limits[type]);
                        n++;
                } else
                        xsprintf(limit_bufs[type], "%s", limits[type] == CGROUP_LIMIT_MAX ? "max" : "0");
        }
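        /* The assembled io.max line sets all four limit types at once, e.g.
         * "8:0 rbps=1000000 wbps=max riops=max wiops=max" for a 1 MB/s read limit
         * on /dev/sda. */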
        xsprintf(buf, "%u:%u rbps=%s wbps=%s riops=%s wiops=%s\n", major(dev), minor(dev),
                 limit_bufs[CGROUP_IO_RBPS_MAX], limit_bufs[CGROUP_IO_WBPS_MAX],
                 limit_bufs[CGROUP_IO_RIOPS_MAX], limit_bufs[CGROUP_IO_WIOPS_MAX]);
        r = cg_set_attribute("io", u->cgroup_path, "io.max", buf);
        if (r < 0)
                log_unit_full(u, IN_SET(r, -ENOENT, -EROFS, -EACCES) ? LOG_DEBUG : LOG_WARNING, r,
                              "Failed to set io.max: %m");
        return n;
}
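/* Note that both device limit helpers return the number of limits that differ
 * from the defaults; the callers in cgroup_context_apply() use a zero return to
 * drop configuration entries that no longer set anything. */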
static unsigned cgroup_apply_blkio_device_limit(Unit *u, const char *dev_path, uint64_t rbps, uint64_t wbps) {
        char buf[DECIMAL_STR_MAX(dev_t)*2+2+DECIMAL_STR_MAX(uint64_t)+1];
        unsigned n = 0;
        dev_t dev;
        int r;

        r = lookup_block_device(dev_path, &dev);
        if (r < 0)
                return 0;

        if (rbps != CGROUP_LIMIT_MAX)
                n++;
        sprintf(buf, "%u:%u %" PRIu64 "\n", major(dev), minor(dev), rbps);
        r = cg_set_attribute("blkio", u->cgroup_path, "blkio.throttle.read_bps_device", buf);
        if (r < 0)
                log_unit_full(u, IN_SET(r, -ENOENT, -EROFS, -EACCES) ? LOG_DEBUG : LOG_WARNING, r,
                              "Failed to set blkio.throttle.read_bps_device: %m");

        if (wbps != CGROUP_LIMIT_MAX)
                n++;
        sprintf(buf, "%u:%u %" PRIu64 "\n", major(dev), minor(dev), wbps);
        r = cg_set_attribute("blkio", u->cgroup_path, "blkio.throttle.write_bps_device", buf);
        if (r < 0)
                log_unit_full(u, IN_SET(r, -ENOENT, -EROFS, -EACCES) ? LOG_DEBUG : LOG_WARNING, r,
                              "Failed to set blkio.throttle.write_bps_device: %m");

        return n;
}
static bool cgroup_context_has_unified_memory_config(CGroupContext *c) {
        return c->memory_low > 0 || c->memory_high != CGROUP_LIMIT_MAX || c->memory_max != CGROUP_LIMIT_MAX || c->memory_swap_max != CGROUP_LIMIT_MAX;
}
static void cgroup_apply_unified_memory_limit(Unit *u, const char *file, uint64_t v) {
        char buf[DECIMAL_STR_MAX(uint64_t) + 1] = "max";
        int r;

        if (v != CGROUP_LIMIT_MAX)
                xsprintf(buf, "%" PRIu64 "\n", v);

        r = cg_set_attribute("memory", u->cgroup_path, file, buf);
        if (r < 0)
                log_unit_full(u, IN_SET(r, -ENOENT, -EROFS, -EACCES) ? LOG_DEBUG : LOG_WARNING, r,
                              "Failed to set %s: %m", file);
}
static void cgroup_apply_firewall(Unit *u) {
        int r;

        assert(u);

        if (u->type == UNIT_SLICE) /* Skip this for slice units, they are inner cgroup nodes, and since bpf/cgroup is
                                    * not recursive we don't ever touch the bpf on them */
                return;

        r = bpf_firewall_compile(u);
        if (r < 0)
                return;

        (void) bpf_firewall_install(u);
}
static void cgroup_context_apply(
                Unit *u,
                CGroupMask apply_mask,
                bool apply_bpf,
                ManagerState state) {

        const char *path;
        CGroupContext *c;
        bool is_root;
        int r;

        assert(u);

        /* Nothing to do? Exit early! */
        if (apply_mask == 0 && !apply_bpf)
                return;

        /* Some cgroup attributes are not supported on the root cgroup, hence silently ignore */
        is_root = unit_has_root_cgroup(u);

        assert_se(c = unit_get_cgroup_context(u));
        assert_se(path = u->cgroup_path);

        if (is_root) /* Make sure we don't try to display messages with an empty path. */
                path = "/";

        /* We generally ignore errors caused by read-only mounted
         * cgroup trees (assuming we are running in a container then),
         * and missing cgroups, i.e. EROFS and ENOENT. */

        if ((apply_mask & CGROUP_MASK_CPU) && !is_root) {
                bool has_weight, has_shares;

                has_weight = cgroup_context_has_cpu_weight(c);
                has_shares = cgroup_context_has_cpu_shares(c);

                if (cg_all_unified() > 0) {
                        uint64_t weight;

                        if (has_weight)
                                weight = cgroup_context_cpu_weight(c, state);
                        else if (has_shares) {
                                uint64_t shares = cgroup_context_cpu_shares(c, state);

                                weight = cgroup_cpu_shares_to_weight(shares);

                                log_cgroup_compat(u, "Applying [Startup]CpuShares %" PRIu64 " as [Startup]CpuWeight %" PRIu64 " on %s",
                                                  shares, weight, path);
                        } else
                                weight = CGROUP_WEIGHT_DEFAULT;

                        cgroup_apply_unified_cpu_config(u, weight, c->cpu_quota_per_sec_usec);
                } else {
                        uint64_t shares;

                        if (has_weight) {
                                uint64_t weight = cgroup_context_cpu_weight(c, state);

                                shares = cgroup_cpu_weight_to_shares(weight);

                                log_cgroup_compat(u, "Applying [Startup]CpuWeight %" PRIu64 " as [Startup]CpuShares %" PRIu64 " on %s",
                                                  weight, shares, path);
                        } else if (has_shares)
                                shares = cgroup_context_cpu_shares(c, state);
                        else
                                shares = CGROUP_CPU_SHARES_DEFAULT;

                        cgroup_apply_legacy_cpu_config(u, shares, c->cpu_quota_per_sec_usec);
                }
        }
        if (apply_mask & CGROUP_MASK_IO) {
                bool has_io = cgroup_context_has_io_config(c);
                bool has_blockio = cgroup_context_has_blockio_config(c);

                if (!is_root) {
                        char buf[8+DECIMAL_STR_MAX(uint64_t)+1];
                        uint64_t weight;

                        if (has_io)
                                weight = cgroup_context_io_weight(c, state);
                        else if (has_blockio) {
                                uint64_t blkio_weight = cgroup_context_blkio_weight(c, state);

                                weight = cgroup_weight_blkio_to_io(blkio_weight);

                                log_cgroup_compat(u, "Applying [Startup]BlockIOWeight %" PRIu64 " as [Startup]IOWeight %" PRIu64,
                                                  blkio_weight, weight);
                        } else
                                weight = CGROUP_WEIGHT_DEFAULT;

                        xsprintf(buf, "default %" PRIu64 "\n", weight);
                        r = cg_set_attribute("io", path, "io.weight", buf);
                        if (r < 0)
                                log_unit_full(u, IN_SET(r, -ENOENT, -EROFS, -EACCES) ? LOG_DEBUG : LOG_WARNING, r,
                                              "Failed to set io.weight: %m");

                        if (has_io) {
                                CGroupIODeviceWeight *w;

                                /* FIXME: no way to reset this list */
                                LIST_FOREACH(device_weights, w, c->io_device_weights)
                                        cgroup_apply_io_device_weight(u, w->path, w->weight);
                        } else if (has_blockio) {
                                CGroupBlockIODeviceWeight *w;

                                /* FIXME: no way to reset this list */
                                LIST_FOREACH(device_weights, w, c->blockio_device_weights) {
                                        weight = cgroup_weight_blkio_to_io(w->weight);

                                        log_cgroup_compat(u, "Applying BlockIODeviceWeight %" PRIu64 " as IODeviceWeight %" PRIu64 " for %s",
                                                          w->weight, weight, w->path);

                                        cgroup_apply_io_device_weight(u, w->path, weight);
                                }
                        }
                }

                /* Apply limits and free ones without config. */
                if (has_io) {
                        CGroupIODeviceLimit *l, *next;

                        LIST_FOREACH_SAFE(device_limits, l, next, c->io_device_limits) {
                                if (!cgroup_apply_io_device_limit(u, l->path, l->limits))
                                        cgroup_context_free_io_device_limit(c, l);
                        }
                } else if (has_blockio) {
                        CGroupBlockIODeviceBandwidth *b, *next;

                        LIST_FOREACH_SAFE(device_bandwidths, b, next, c->blockio_device_bandwidths) {
                                uint64_t limits[_CGROUP_IO_LIMIT_TYPE_MAX];
                                CGroupIOLimitType type;

                                for (type = 0; type < _CGROUP_IO_LIMIT_TYPE_MAX; type++)
                                        limits[type] = cgroup_io_limit_defaults[type];

                                limits[CGROUP_IO_RBPS_MAX] = b->rbps;
                                limits[CGROUP_IO_WBPS_MAX] = b->wbps;

                                log_cgroup_compat(u, "Applying BlockIO{Read|Write}Bandwidth %" PRIu64 " %" PRIu64 " as IO{Read|Write}BandwidthMax for %s",
                                                  b->rbps, b->wbps, b->path);

                                if (!cgroup_apply_io_device_limit(u, b->path, limits))
                                        cgroup_context_free_blockio_device_bandwidth(c, b);
                        }
                }
        }
        if (apply_mask & CGROUP_MASK_BLKIO) {
                bool has_io = cgroup_context_has_io_config(c);
                bool has_blockio = cgroup_context_has_blockio_config(c);

                if (!is_root) {
                        char buf[DECIMAL_STR_MAX(uint64_t)+1];
                        uint64_t weight;

                        if (has_io) {
                                uint64_t io_weight = cgroup_context_io_weight(c, state);

                                weight = cgroup_weight_io_to_blkio(cgroup_context_io_weight(c, state));

                                log_cgroup_compat(u, "Applying [Startup]IOWeight %" PRIu64 " as [Startup]BlockIOWeight %" PRIu64,
                                                  io_weight, weight);
                        } else if (has_blockio)
                                weight = cgroup_context_blkio_weight(c, state);
                        else
                                weight = CGROUP_BLKIO_WEIGHT_DEFAULT;

                        xsprintf(buf, "%" PRIu64 "\n", weight);
                        r = cg_set_attribute("blkio", path, "blkio.weight", buf);
                        if (r < 0)
                                log_unit_full(u, IN_SET(r, -ENOENT, -EROFS, -EACCES) ? LOG_DEBUG : LOG_WARNING, r,
                                              "Failed to set blkio.weight: %m");

                        if (has_io) {
                                CGroupIODeviceWeight *w;

                                /* FIXME: no way to reset this list */
                                LIST_FOREACH(device_weights, w, c->io_device_weights) {
                                        weight = cgroup_weight_io_to_blkio(w->weight);

                                        log_cgroup_compat(u, "Applying IODeviceWeight %" PRIu64 " as BlockIODeviceWeight %" PRIu64 " for %s",
                                                          w->weight, weight, w->path);

                                        cgroup_apply_blkio_device_weight(u, w->path, weight);
                                }
                        } else if (has_blockio) {
                                CGroupBlockIODeviceWeight *w;

                                /* FIXME: no way to reset this list */
                                LIST_FOREACH(device_weights, w, c->blockio_device_weights)
                                        cgroup_apply_blkio_device_weight(u, w->path, w->weight);
                        }
                }

                /* Apply limits and free ones without config. */
                if (has_io) {
                        CGroupIODeviceLimit *l, *next;

                        LIST_FOREACH_SAFE(device_limits, l, next, c->io_device_limits) {
                                log_cgroup_compat(u, "Applying IO{Read|Write}Bandwidth %" PRIu64 " %" PRIu64 " as BlockIO{Read|Write}BandwidthMax for %s",
                                                  l->limits[CGROUP_IO_RBPS_MAX], l->limits[CGROUP_IO_WBPS_MAX], l->path);

                                if (!cgroup_apply_blkio_device_limit(u, l->path, l->limits[CGROUP_IO_RBPS_MAX], l->limits[CGROUP_IO_WBPS_MAX]))
                                        cgroup_context_free_io_device_limit(c, l);
                        }
                } else if (has_blockio) {
                        CGroupBlockIODeviceBandwidth *b, *next;

                        LIST_FOREACH_SAFE(device_bandwidths, b, next, c->blockio_device_bandwidths)
                                if (!cgroup_apply_blkio_device_limit(u, b->path, b->rbps, b->wbps))
                                        cgroup_context_free_blockio_device_bandwidth(c, b);
                }
        }
        if ((apply_mask & CGROUP_MASK_MEMORY) && !is_root) {
                if (cg_all_unified() > 0) {
                        uint64_t max, swap_max = CGROUP_LIMIT_MAX;

                        if (cgroup_context_has_unified_memory_config(c)) {
                                max = c->memory_max;
                                swap_max = c->memory_swap_max;
                        } else {
                                max = c->memory_limit;

                                if (max != CGROUP_LIMIT_MAX)
                                        log_cgroup_compat(u, "Applying MemoryLimit %" PRIu64 " as MemoryMax", max);
                        }

                        cgroup_apply_unified_memory_limit(u, "memory.low", c->memory_low);
                        cgroup_apply_unified_memory_limit(u, "memory.high", c->memory_high);
                        cgroup_apply_unified_memory_limit(u, "memory.max", max);
                        cgroup_apply_unified_memory_limit(u, "memory.swap.max", swap_max);
                } else {
                        char buf[DECIMAL_STR_MAX(uint64_t) + 1];
                        uint64_t val;

                        if (cgroup_context_has_unified_memory_config(c)) {
                                val = c->memory_max;
                                log_cgroup_compat(u, "Applying MemoryMax %" PRIi64 " as MemoryLimit", val);
                        } else
                                val = c->memory_limit;

                        if (val == CGROUP_LIMIT_MAX)
                                strncpy(buf, "-1\n", sizeof(buf));
                        else
                                xsprintf(buf, "%" PRIu64 "\n", val);

                        r = cg_set_attribute("memory", path, "memory.limit_in_bytes", buf);
                        if (r < 0)
                                log_unit_full(u, IN_SET(r, -ENOENT, -EROFS, -EACCES) ? LOG_DEBUG : LOG_WARNING, r,
                                              "Failed to set memory.limit_in_bytes: %m");
                }
        }
        if ((apply_mask & CGROUP_MASK_DEVICES) && !is_root) {
                CGroupDeviceAllow *a;

                /* Changing the devices list of a populated cgroup
                 * might result in EINVAL, hence ignore EINVAL
                 * here. */

                if (c->device_allow || c->device_policy != CGROUP_AUTO)
                        r = cg_set_attribute("devices", path, "devices.deny", "a");
                else
                        r = cg_set_attribute("devices", path, "devices.allow", "a");
                if (r < 0)
                        log_unit_full(u, IN_SET(r, -ENOENT, -EROFS, -EINVAL, -EACCES) ? LOG_DEBUG : LOG_WARNING, r,
                                      "Failed to reset devices.list: %m");

                if (c->device_policy == CGROUP_CLOSED ||
                    (c->device_policy == CGROUP_AUTO && c->device_allow)) {
                        static const char auto_devices[] =
                                "/dev/null\0" "rwm\0"
                                "/dev/zero\0" "rwm\0"
                                "/dev/full\0" "rwm\0"
                                "/dev/random\0" "rwm\0"
                                "/dev/urandom\0" "rwm\0"
                                "/dev/tty\0" "rwm\0"
                                "/dev/ptmx\0" "rwm\0"
                                /* Allow /run/systemd/inaccessible/{chr,blk} devices for mapping InaccessiblePaths */
                                "-/run/systemd/inaccessible/chr\0" "rwm\0"
                                "-/run/systemd/inaccessible/blk\0" "rwm\0";
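                        /* Each entry above is a NUL-separated path/permission pair;
                         * a leading "-" marks devices that may legitimately be
                         * missing. The resulting devices.allow lines look like
                         * "c 1:3 rwm" (here for /dev/null). */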
                        const char *x, *y;

                        NULSTR_FOREACH_PAIR(x, y, auto_devices)
                                whitelist_device(path, x, y);

                        /* PTS (/dev/pts) devices may not be duplicated, but accessed */
                        whitelist_major(path, "pts", 'c', "rw");
                }

                LIST_FOREACH(device_allow, a, c->device_allow) {
                        char acc[4], *val;
                        unsigned k = 0;

                        if (a->r)
                                acc[k++] = 'r';
                        if (a->w)
                                acc[k++] = 'w';
                        if (a->m)
                                acc[k++] = 'm';

                        if (k == 0)
                                continue;

                        acc[k++] = 0;

                        if (path_startswith(a->path, "/dev/"))
                                whitelist_device(path, a->path, acc);
                        else if ((val = startswith(a->path, "block-")))
                                whitelist_major(path, val, 'b', acc);
                        else if ((val = startswith(a->path, "char-")))
                                whitelist_major(path, val, 'c', acc);
                        else
                                log_unit_debug(u, "Ignoring device %s while writing cgroup attribute.", a->path);
                }
        }
        if (apply_mask & CGROUP_MASK_PIDS) {

                if (is_root) {
                        /* So, the "pids" controller does not expose anything on the root cgroup, in order not to
                         * replicate knobs exposed elsewhere needlessly. We abstract this away here however, and when
                         * the knobs of the root cgroup are modified propagate this to the relevant sysctls. There's a
                         * non-obvious asymmetry however: unlike the cgroup properties we don't really want to take
                         * exclusive ownership of the sysctls, but we still want to honour things if the user sets
                         * limits. Hence we employ sort of a one-way strategy: when the user sets a bounded limit
                         * through us it counts. When the user afterwards unsets it again (i.e. sets it to unbounded)
                         * it also counts. But if the user never set a limit through us (i.e. we are the default of
                         * "unbounded") we leave things unmodified. For this we manage a global boolean that we turn on
                         * the first time we set a limit. Note that this boolean is flushed out on manager reload,
                         * which is desirable so that there's an official way to release control of the sysctl from
                         * systemd: set the limit to unbounded and reload. */

                        if (c->tasks_max != CGROUP_LIMIT_MAX) {
                                u->manager->sysctl_pid_max_changed = true;
                                r = procfs_tasks_set_limit(c->tasks_max);
                        } else if (u->manager->sysctl_pid_max_changed)
                                r = procfs_tasks_set_limit(TASKS_MAX);
                        else
                                r = 0;

                        if (r < 0)
                                log_unit_full(u, IN_SET(r, -ENOENT, -EROFS, -EACCES) ? LOG_DEBUG : LOG_WARNING, r,
                                              "Failed to write to tasks limit sysctls: %m");

                } else {
                        if (c->tasks_max != CGROUP_LIMIT_MAX) {
                                char buf[DECIMAL_STR_MAX(uint64_t) + 2];

                                sprintf(buf, "%" PRIu64 "\n", c->tasks_max);
                                r = cg_set_attribute("pids", path, "pids.max", buf);
                        } else
                                r = cg_set_attribute("pids", path, "pids.max", "max");
                        if (r < 0)
                                log_unit_full(u, IN_SET(r, -ENOENT, -EROFS, -EACCES) ? LOG_DEBUG : LOG_WARNING, r,
                                              "Failed to set pids.max: %m");
                }
        }

        if (apply_bpf)
                cgroup_apply_firewall(u);
}
CGroupMask cgroup_context_get_mask(CGroupContext *c) {
        CGroupMask mask = 0;

        /* Figure out which controllers we need */

        if (c->cpu_accounting ||
            cgroup_context_has_cpu_weight(c) ||
            cgroup_context_has_cpu_shares(c) ||
            c->cpu_quota_per_sec_usec != USEC_INFINITY)
                mask |= CGROUP_MASK_CPUACCT | CGROUP_MASK_CPU;

        if (cgroup_context_has_io_config(c) || cgroup_context_has_blockio_config(c))
                mask |= CGROUP_MASK_IO | CGROUP_MASK_BLKIO;

        if (c->memory_accounting ||
            c->memory_limit != CGROUP_LIMIT_MAX ||
            cgroup_context_has_unified_memory_config(c))
                mask |= CGROUP_MASK_MEMORY;

        if (c->device_allow ||
            c->device_policy != CGROUP_AUTO)
                mask |= CGROUP_MASK_DEVICES;

        if (c->tasks_accounting ||
            c->tasks_max != CGROUP_LIMIT_MAX)
                mask |= CGROUP_MASK_PIDS;

        return mask;
}
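/* For example, a context with only TasksMax= and MemoryMax= configured comes out
 * of the function above as CGROUP_MASK_MEMORY|CGROUP_MASK_PIDS. */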
CGroupMask unit_get_own_mask(Unit *u) {
        CGroupContext *c;

        /* Returns the mask of controllers the unit needs for itself */

        c = unit_get_cgroup_context(u);
        if (!c)
                return 0;

        return cgroup_context_get_mask(c) | unit_get_delegate_mask(u);
}
CGroupMask unit_get_delegate_mask(Unit *u) {
        CGroupContext *c;

        /* If delegation is turned on, then turn on selected controllers, unless we are on the legacy hierarchy and the
         * process we fork into is known to drop privileges, and hence shouldn't get access to the controllers.
         *
         * Note that on the unified hierarchy it is safe to delegate controllers to unprivileged services. */

        if (u->type == UNIT_SLICE)
                return 0;

        c = unit_get_cgroup_context(u);
        if (!c)
                return 0;

        if (!c->delegate)
                return 0;

        if (cg_all_unified() <= 0) {
                ExecContext *e;

                e = unit_get_exec_context(u);
                if (e && !exec_context_maintains_privileges(e))
                        return 0;
        }

        return c->delegate_controllers;
}
CGroupMask unit_get_members_mask(Unit *u) {
        assert(u);

        /* Returns the mask of controllers all of the unit's children require, merged */

        if (u->cgroup_members_mask_valid)
                return u->cgroup_members_mask;

        u->cgroup_members_mask = 0;

        if (u->type == UNIT_SLICE) {
                void *v;
                Unit *member;
                Iterator i;

                HASHMAP_FOREACH_KEY(v, member, u->dependencies[UNIT_BEFORE], i) {

                        if (member == u)
                                continue;

                        if (UNIT_DEREF(member->slice) != u)
                                continue;

                        u->cgroup_members_mask |= unit_get_subtree_mask(member); /* note that this calls ourselves again, for the children */
                }
        }

        u->cgroup_members_mask_valid = true;
        return u->cgroup_members_mask;
}
CGroupMask unit_get_siblings_mask(Unit *u) {
        assert(u);

        /* Returns the mask of controllers all of the unit's siblings
         * require, i.e. the members mask of the unit's parent slice
         * if there is one. */

        if (UNIT_ISSET(u->slice))
                return unit_get_members_mask(UNIT_DEREF(u->slice));

        return unit_get_subtree_mask(u); /* we are the top-level slice */
}
CGroupMask unit_get_subtree_mask(Unit *u) {

        /* Returns the mask of this subtree, meaning of the group
         * itself and its children. */

        return unit_get_own_mask(u) | unit_get_members_mask(u);
}
CGroupMask unit_get_target_mask(Unit *u) {
        CGroupMask mask;

        /* This returns the cgroup mask of all controllers to enable
         * for a specific cgroup, i.e. everything it needs itself,
         * plus all that its children need, plus all that its siblings
         * need. This is primarily useful on the legacy cgroup
         * hierarchy, where we need to duplicate each cgroup in each
         * hierarchy that shall be enabled for it. */

        mask = unit_get_own_mask(u) | unit_get_members_mask(u) | unit_get_siblings_mask(u);
        mask &= u->manager->cgroup_supported;

        return mask;
}
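/* Illustration of how these masks combine: for a service in system.slice that only
 * configures MemoryMax=, unit_get_own_mask() yields CGROUP_MASK_MEMORY, the members
 * mask of system.slice merges the masks of all its member units, and the target
 * mask above additionally folds in the siblings mask, so that weight-based
 * controllers stay comparable across all units of the slice. */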
CGroupMask unit_get_enable_mask(Unit *u) {
        CGroupMask mask;

        /* This returns the cgroup mask of all controllers to enable
         * for the children of a specific cgroup. This is primarily
         * useful for the unified cgroup hierarchy, where each cgroup
         * controls which controllers are enabled for its children. */

        mask = unit_get_members_mask(u);
        mask &= u->manager->cgroup_supported;

        return mask;
}
bool unit_get_needs_bpf(Unit *u) {
        CGroupContext *c;
        Unit *p;
        assert(u);

        /* We never attach BPF to slice units, as they are inner cgroup nodes and cgroup/BPF is not recursive at the
         * moment. */
        if (u->type == UNIT_SLICE)
                return false;

        c = unit_get_cgroup_context(u);
        if (!c)
                return false;

        if (c->ip_accounting ||
            c->ip_address_allow ||
            c->ip_address_deny)
                return true;

        /* If any parent slice has an IP access list defined, it applies too */
        for (p = UNIT_DEREF(u->slice); p; p = UNIT_DEREF(p->slice)) {
                c = unit_get_cgroup_context(p);
                if (!c)
                        return false;

                if (c->ip_address_allow ||
                    c->ip_address_deny)
                        return true;
        }

        return false;
}
/* Recurse from a unit up through its containing slices, propagating
 * mask bits upward. A unit is also member of itself. */
void unit_update_cgroup_members_masks(Unit *u) {
        CGroupMask m;
        bool more;

        assert(u);

        /* Calculate subtree mask */
        m = unit_get_subtree_mask(u);

        /* See if anything changed from the previous invocation. If
         * not, we're done. */
        if (u->cgroup_subtree_mask_valid && m == u->cgroup_subtree_mask)
                return;

        more =
                u->cgroup_subtree_mask_valid &&
                ((m & ~u->cgroup_subtree_mask) != 0) &&
                ((~m & u->cgroup_subtree_mask) == 0);

        u->cgroup_subtree_mask = m;
        u->cgroup_subtree_mask_valid = true;

        if (UNIT_ISSET(u->slice)) {
                Unit *s = UNIT_DEREF(u->slice);

                if (more)
                        /* There's more set now than before. We
                         * propagate the new mask to the parent's mask
                         * (not caring if it actually was valid or
                         * not) */

                        s->cgroup_members_mask |= m;

                else
                        /* There's less set now than before (or we
                         * don't know), we need to recalculate
                         * everything, so let's invalidate the
                         * parent's members mask */

                        s->cgroup_members_mask_valid = false;

                /* And now make sure that this change also hits our
                 * grandparents */
                unit_update_cgroup_members_masks(s);
        }
}
static const char *migrate_callback(CGroupMask mask, void *userdata) {
        Unit *u = userdata;

        assert(mask != 0);
        assert(u);

        while (u) {
                if (u->cgroup_path &&
                    u->cgroup_realized &&
                    (u->cgroup_realized_mask & mask) == mask)
                        return u->cgroup_path;

                u = UNIT_DEREF(u->slice);
        }

        return NULL;
}
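/* The callback above is handed to cg_migrate_everywhere() and
 * cg_attach_many_everywhere() below: for a controller mask the unit did not
 * realize itself it walks up the slice chain and returns the closest ancestor
 * cgroup that was realized with that mask. */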
char *unit_default_cgroup_path(Unit *u) {
        _cleanup_free_ char *escaped = NULL, *slice = NULL;
        int r;

        assert(u);

        if (unit_has_name(u, SPECIAL_ROOT_SLICE))
                return strdup(u->manager->cgroup_root);

        if (UNIT_ISSET(u->slice) && !unit_has_name(UNIT_DEREF(u->slice), SPECIAL_ROOT_SLICE)) {
                r = cg_slice_to_path(UNIT_DEREF(u->slice)->id, &slice);
                if (r < 0)
                        return NULL;
        }

        escaped = cg_escape(u->id);
        if (!escaped)
                return NULL;

        if (slice)
                return strjoin(u->manager->cgroup_root, "/", slice, "/",
                               escaped);
        else
                return strjoin(u->manager->cgroup_root, "/", escaped);
}
int unit_set_cgroup_path(Unit *u, const char *path) {
        _cleanup_free_ char *p = NULL;
        int r;

        assert(u);

        if (path) {
                p = strdup(path);
                if (!p)
                        return -ENOMEM;
        } else
                p = NULL;

        if (streq_ptr(u->cgroup_path, p))
                return 0;

        if (p) {
                r = hashmap_put(u->manager->cgroup_unit, p, u);
                if (r < 0)
                        return r;
        }

        unit_release_cgroup(u);

        u->cgroup_path = p;
        p = NULL;

        return 0;
}
int unit_watch_cgroup(Unit *u) {
        _cleanup_free_ char *events = NULL;
        int r;

        assert(u);

        if (!u->cgroup_path)
                return 0;

        if (u->cgroup_inotify_wd >= 0)
                return 0;

        /* Only applies to the unified hierarchy */
        r = cg_unified_controller(SYSTEMD_CGROUP_CONTROLLER);
        if (r < 0)
                return log_error_errno(r, "Failed to determine whether the name=systemd hierarchy is unified: %m");
        if (r == 0)
                return 0;

        /* Don't watch the root slice, it's pointless. */
        if (unit_has_name(u, SPECIAL_ROOT_SLICE))
                return 0;

        r = hashmap_ensure_allocated(&u->manager->cgroup_inotify_wd_unit, &trivial_hash_ops);
        if (r < 0)
                return log_oom();

        r = cg_get_path(SYSTEMD_CGROUP_CONTROLLER, u->cgroup_path, "cgroup.events", &events);
        if (r < 0)
                return log_oom();

        u->cgroup_inotify_wd = inotify_add_watch(u->manager->cgroup_inotify_fd, events, IN_MODIFY);
        if (u->cgroup_inotify_wd < 0) {

                if (errno == ENOENT) /* If the directory is already
                                      * gone we don't need to track
                                      * it, so this is not an error */
                        return 0;

                return log_unit_error_errno(u, errno, "Failed to add inotify watch descriptor for control group %s: %m", u->cgroup_path);
        }

        r = hashmap_put(u->manager->cgroup_inotify_wd_unit, INT_TO_PTR(u->cgroup_inotify_wd), u);
        if (r < 0)
                return log_unit_error_errno(u, r, "Failed to add inotify watch descriptor to hash map: %m");

        return 0;
}
int unit_pick_cgroup_path(Unit *u) {
        _cleanup_free_ char *path = NULL;
        int r;

        assert(u);

        if (u->cgroup_path)
                return 0;

        if (!UNIT_HAS_CGROUP_CONTEXT(u))
                return -EINVAL;

        path = unit_default_cgroup_path(u);
        if (!path)
                return log_oom();

        r = unit_set_cgroup_path(u, path);
        if (r == -EEXIST)
                return log_unit_error_errno(u, r, "Control group %s exists already.", path);
        if (r < 0)
                return log_unit_error_errno(u, r, "Failed to set unit's control group path to %s: %m", path);

        return 0;
}
static int unit_create_cgroup(
                Unit *u,
                CGroupMask target_mask,
                CGroupMask enable_mask,
                bool needs_bpf) {

        CGroupContext *c;
        int r;

        assert(u);

        c = unit_get_cgroup_context(u);
        if (!c)
                return 0;

        /* Figure out our cgroup path */
        r = unit_pick_cgroup_path(u);
        if (r < 0)
                return r;

        /* First, create our own group */
        r = cg_create_everywhere(u->manager->cgroup_supported, target_mask, u->cgroup_path);
        if (r < 0)
                return log_unit_error_errno(u, r, "Failed to create cgroup %s: %m", u->cgroup_path);

        /* Start watching it */
        (void) unit_watch_cgroup(u);

        /* Enable all controllers we need */
        r = cg_enable_everywhere(u->manager->cgroup_supported, enable_mask, u->cgroup_path);
        if (r < 0)
                log_unit_warning_errno(u, r, "Failed to enable controllers on cgroup %s, ignoring: %m", u->cgroup_path);

        /* Keep track that this is now realized */
        u->cgroup_realized = true;
        u->cgroup_realized_mask = target_mask;
        u->cgroup_enabled_mask = enable_mask;
        u->cgroup_bpf_state = needs_bpf ? UNIT_CGROUP_BPF_ON : UNIT_CGROUP_BPF_OFF;

        if (u->type != UNIT_SLICE && !c->delegate) {

                /* Then, possibly move things over, but not if
                 * subgroups may contain processes, which is the case
                 * for slice and delegation units. */
                r = cg_migrate_everywhere(u->manager->cgroup_supported, u->cgroup_path, u->cgroup_path, migrate_callback, u);
                if (r < 0)
                        log_unit_warning_errno(u, r, "Failed to migrate cgroup to %s, ignoring: %m", u->cgroup_path);
        }

        return 0;
}
int unit_attach_pids_to_cgroup(Unit *u) {
        int r;
        assert(u);

        r = unit_realize_cgroup(u);
        if (r < 0)
                return r;

        r = cg_attach_many_everywhere(u->manager->cgroup_supported, u->cgroup_path, u->pids, migrate_callback, u);
        if (r < 0)
                return r;

        return 0;
}
static void cgroup_xattr_apply(Unit *u) {
        char ids[SD_ID128_STRING_MAX];
        int r;

        assert(u);

        if (!MANAGER_IS_SYSTEM(u->manager))
                return;

        if (sd_id128_is_null(u->invocation_id))
                return;

        r = cg_set_xattr(SYSTEMD_CGROUP_CONTROLLER, u->cgroup_path,
                         "trusted.invocation_id",
                         sd_id128_to_string(u->invocation_id, ids), 32,
                         0);
        if (r < 0)
                log_unit_debug_errno(u, r, "Failed to set invocation ID on control group %s, ignoring: %m", u->cgroup_path);
}
static bool unit_has_mask_realized(
                Unit *u,
                CGroupMask target_mask,
                CGroupMask enable_mask,
                bool needs_bpf) {

        assert(u);

        return u->cgroup_realized &&
                u->cgroup_realized_mask == target_mask &&
                u->cgroup_enabled_mask == enable_mask &&
                ((needs_bpf && u->cgroup_bpf_state == UNIT_CGROUP_BPF_ON) ||
                 (!needs_bpf && u->cgroup_bpf_state == UNIT_CGROUP_BPF_OFF));
}
static void unit_add_to_cgroup_realize_queue(Unit *u) {
        assert(u);

        if (u->in_cgroup_realize_queue)
                return;

        LIST_PREPEND(cgroup_realize_queue, u->manager->cgroup_realize_queue, u);
        u->in_cgroup_realize_queue = true;
}

static void unit_remove_from_cgroup_realize_queue(Unit *u) {
        assert(u);

        if (!u->in_cgroup_realize_queue)
                return;

        LIST_REMOVE(cgroup_realize_queue, u->manager->cgroup_realize_queue, u);
        u->in_cgroup_realize_queue = false;
}
/* Check if necessary controllers and attributes for a unit are in place.
 *
 * If so, do nothing.
 * If not, create paths, move processes over, and set attributes.
 *
 * Returns 0 on success and < 0 on failure. */
static int unit_realize_cgroup_now(Unit *u, ManagerState state) {
        CGroupMask target_mask, enable_mask;
        bool needs_bpf, apply_bpf;
        int r;

        assert(u);

        unit_remove_from_cgroup_realize_queue(u);

        target_mask = unit_get_target_mask(u);
        enable_mask = unit_get_enable_mask(u);
        needs_bpf = unit_get_needs_bpf(u);

        if (unit_has_mask_realized(u, target_mask, enable_mask, needs_bpf))
                return 0;

        /* Make sure we apply the BPF filters either when one is configured, or if none is configured but previously
         * the state was anything but off. This way, if a unit with a BPF filter applied is reconfigured to lose it
         * this will trickle down properly to cgroupfs. */
        apply_bpf = needs_bpf || u->cgroup_bpf_state != UNIT_CGROUP_BPF_OFF;

        /* First, realize parents */
        if (UNIT_ISSET(u->slice)) {
                r = unit_realize_cgroup_now(UNIT_DEREF(u->slice), state);
                if (r < 0)
                        return r;
        }

        /* And then do the real work */
        r = unit_create_cgroup(u, target_mask, enable_mask, needs_bpf);
        if (r < 0)
                return r;

        /* Finally, apply the necessary attributes. */
        cgroup_context_apply(u, target_mask, apply_bpf, state);
        cgroup_xattr_apply(u);

        return 0;
}
unsigned manager_dispatch_cgroup_realize_queue(Manager *m) {
        ManagerState state;
        unsigned n = 0;
        Unit *i;
        int r;

        assert(m);

        state = manager_state(m);

        while ((i = m->cgroup_realize_queue)) {
                assert(i->in_cgroup_realize_queue);

                if (UNIT_IS_INACTIVE_OR_FAILED(unit_active_state(i))) {
                        /* Maybe things changed, and the unit is not actually active anymore? */
                        unit_remove_from_cgroup_realize_queue(i);
                        continue;
                }

                r = unit_realize_cgroup_now(i, state);
                if (r < 0)
                        log_warning_errno(r, "Failed to realize cgroups for queued unit %s, ignoring: %m", i->id);

                n++;
        }

        return n;
}
static void unit_add_siblings_to_cgroup_realize_queue(Unit *u) {
        Unit *slice;

        /* This adds the siblings of the specified unit and the
         * siblings of all parent units to the cgroup queue. (But
         * neither the specified unit itself nor the parents.) */

        while ((slice = UNIT_DEREF(u->slice))) {
                Iterator i;
                Unit *m;
                void *v;

                HASHMAP_FOREACH_KEY(v, m, slice->dependencies[UNIT_BEFORE], i) {
                        if (m == u)
                                continue;

                        /* Skip units that have a dependency on the slice
                         * but aren't actually in it. */
                        if (UNIT_DEREF(m->slice) != slice)
                                continue;

                        /* No point in doing cgroup application for units
                         * without active processes. */
                        if (UNIT_IS_INACTIVE_OR_FAILED(unit_active_state(m)))
                                continue;

                        /* If the unit doesn't need any new controllers
                         * and has current ones realized, it doesn't need
                         * any changes. */
                        if (unit_has_mask_realized(m,
                                                   unit_get_target_mask(m),
                                                   unit_get_enable_mask(m),
                                                   unit_get_needs_bpf(m)))
                                continue;

                        unit_add_to_cgroup_realize_queue(m);
                }

                u = slice;
        }
}
int unit_realize_cgroup(Unit *u) {
        assert(u);

        if (!UNIT_HAS_CGROUP_CONTEXT(u))
                return 0;

        /* So, here's the deal: when realizing the cgroups for this
         * unit, we need to first create all parents, but there's more
         * actually: for the weight-based controllers we also need to
         * make sure that all our siblings (i.e. units that are in the
         * same slice as we are) have cgroups, too. Otherwise, things
         * would become very uneven as each of their processes would
         * get as much resources as all our group together. This call
         * will synchronously create the parent cgroups, but will
         * defer work on the siblings to the next event loop
         * iteration. */

        /* Add all sibling slices to the cgroup queue. */
        unit_add_siblings_to_cgroup_realize_queue(u);

        /* And realize this one now (and apply the values) */
        return unit_realize_cgroup_now(u, manager_state(u->manager));
}
void unit_release_cgroup(Unit *u) {
        assert(u);

        /* Forgets all cgroup details for this cgroup */

        if (u->cgroup_path) {
                (void) hashmap_remove(u->manager->cgroup_unit, u->cgroup_path);
                u->cgroup_path = mfree(u->cgroup_path);
        }

        if (u->cgroup_inotify_wd >= 0) {
                if (inotify_rm_watch(u->manager->cgroup_inotify_fd, u->cgroup_inotify_wd) < 0)
                        log_unit_debug_errno(u, errno, "Failed to remove cgroup inotify watch %i for %s, ignoring", u->cgroup_inotify_wd, u->id);

                (void) hashmap_remove(u->manager->cgroup_inotify_wd_unit, INT_TO_PTR(u->cgroup_inotify_wd));
                u->cgroup_inotify_wd = -1;
        }
}
void unit_prune_cgroup(Unit *u) {
        int r;
        bool is_root_slice;

        assert(u);

        /* Removes the cgroup, if empty and possible, and stops watching it. */

        if (!u->cgroup_path)
                return;

        (void) unit_get_cpu_usage(u, NULL); /* Cache the last CPU usage value before we destroy the cgroup */

        is_root_slice = unit_has_name(u, SPECIAL_ROOT_SLICE);

        r = cg_trim_everywhere(u->manager->cgroup_supported, u->cgroup_path, !is_root_slice);
        if (r < 0) {
                log_unit_debug_errno(u, r, "Failed to destroy cgroup %s, ignoring: %m", u->cgroup_path);
                return;
        }

        if (is_root_slice)
                return;

        unit_release_cgroup(u);

        u->cgroup_realized = false;
        u->cgroup_realized_mask = 0;
        u->cgroup_enabled_mask = 0;
}
int unit_search_main_pid(Unit *u, pid_t *ret) {
        _cleanup_fclose_ FILE *f = NULL;
        pid_t pid = 0, npid, mypid;
        int r;

        assert(u);
        assert(ret);

        if (!u->cgroup_path)
                return -ENXIO;

        r = cg_enumerate_processes(SYSTEMD_CGROUP_CONTROLLER, u->cgroup_path, &f);
        if (r < 0)
                return r;

        mypid = getpid_cached();
        while (cg_read_pid(f, &npid) > 0) {
                pid_t ppid;

                if (npid == pid)
                        continue;

                /* Ignore processes that aren't our kids */
                if (get_process_ppid(npid, &ppid) >= 0 && ppid != mypid)
                        continue;

                if (pid != 0)
                        /* Dang, there's more than one daemonized PID
                         * in this group, so we don't know what process
                         * is the main process. */
                        return -ENODATA;

                pid = npid;
        }

        *ret = pid;
        return 0;
}
static int unit_watch_pids_in_path(Unit *u, const char *path) {
        _cleanup_closedir_ DIR *d = NULL;
        _cleanup_fclose_ FILE *f = NULL;
        int ret = 0, r;

        assert(u);
        assert(path);

        r = cg_enumerate_processes(SYSTEMD_CGROUP_CONTROLLER, path, &f);
        if (r < 0)
                ret = r;
        else {
                pid_t pid;

                while ((r = cg_read_pid(f, &pid)) > 0) {
                        r = unit_watch_pid(u, pid);
                        if (r < 0 && ret >= 0)
                                ret = r;
                }

                if (r < 0 && ret >= 0)
                        ret = r;
        }

        r = cg_enumerate_subgroups(SYSTEMD_CGROUP_CONTROLLER, path, &d);
        if (r < 0) {
                if (ret >= 0)
                        ret = r;
        } else {
                char *fn;

                while ((r = cg_read_subgroup(d, &fn)) > 0) {
                        _cleanup_free_ char *p = NULL;

                        p = strjoin(path, "/", fn);
                        free(fn);

                        if (!p)
                                return -ENOMEM;

                        r = unit_watch_pids_in_path(u, p);
                        if (r < 0 && ret >= 0)
                                ret = r;
                }

                if (r < 0 && ret >= 0)
                        ret = r;
        }

        return ret;
}
int unit_synthesize_cgroup_empty_event(Unit *u) {
        int r;

        assert(u);

        /* Enqueue a synthetic cgroup empty event if this unit doesn't watch any PIDs anymore. This is compatibility
         * support for non-unified systems where notifications aren't reliable, and hence need to take whatever we can
         * get as notification source as soon as we stopped having any useful PIDs to watch for. */

        if (!u->cgroup_path)
                return -ENOENT;

        r = cg_unified_controller(SYSTEMD_CGROUP_CONTROLLER);
        if (r < 0)
                return r;
        if (r > 0) /* On unified we have reliable notifications, and don't need this */
                return 0;

        if (!set_isempty(u->pids))
                return 0;

        unit_add_to_cgroup_empty_queue(u);
        return 0;
}
int unit_watch_all_pids(Unit *u) {
        int r;

        assert(u);

        /* Adds all PIDs from our cgroup to the set of PIDs we
         * watch. This is a fallback logic for cases where we do not
         * get reliable cgroup empty notifications: we try to use
         * SIGCHLD as replacement. */

        if (!u->cgroup_path)
                return -ENOENT;

        r = cg_unified_controller(SYSTEMD_CGROUP_CONTROLLER);
        if (r < 0)
                return r;
        if (r > 0) /* On unified we can use proper notifications */
                return 0;

        return unit_watch_pids_in_path(u, u->cgroup_path);
}
static int on_cgroup_empty_event(sd_event_source *s, void *userdata) {
        Manager *m = userdata;
        Unit *u;
        int r;

        assert(s);
        assert(m);

        u = m->cgroup_empty_queue;
        if (!u)
                return 0;

        assert(u->in_cgroup_empty_queue);
        u->in_cgroup_empty_queue = false;
        LIST_REMOVE(cgroup_empty_queue, m->cgroup_empty_queue, u);

        if (m->cgroup_empty_queue) {
                /* More stuff queued, let's make sure we remain enabled */
                r = sd_event_source_set_enabled(s, SD_EVENT_ONESHOT);
                if (r < 0)
                        log_debug_errno(r, "Failed to reenable cgroup empty event source: %m");
        }

        unit_add_to_gc_queue(u);

        if (UNIT_VTABLE(u)->notify_cgroup_empty)
                UNIT_VTABLE(u)->notify_cgroup_empty(u);

        return 0;
}
void unit_add_to_cgroup_empty_queue(Unit *u) {
        int r;

        assert(u);

        /* Note that there are four different ways how cgroup empty events reach us:
         *
         * 1. On the unified hierarchy we get an inotify event on the cgroup
         *
         * 2. On the legacy hierarchy, when running in system mode, we get a datagram on the cgroup agent socket
         *
         * 3. On the legacy hierarchy, when running in user mode, we get a D-Bus signal on the system bus
         *
         * 4. On the legacy hierarchy, in service units we start watching all processes of the cgroup for SIGCHLD as
         *    soon as we get one SIGCHLD, to deal with unreliable cgroup notifications.
         *
         * Regardless which way we got the notification, we'll verify it here, and then add it to a separate
         * queue. This queue will be dispatched at a lower priority than the SIGCHLD handler, so that we always use
         * SIGCHLD if we can get it first, and only use the cgroup empty notifications if there's no SIGCHLD pending
         * (which might happen if the cgroup doesn't contain processes that are our own child, which is typically the
         * case for scope units). */

        if (u->in_cgroup_empty_queue)
                return;

        /* Let's verify that the cgroup is really empty */
        if (!u->cgroup_path)
                return;
        r = cg_is_empty_recursive(SYSTEMD_CGROUP_CONTROLLER, u->cgroup_path);
        if (r < 0) {
                log_unit_debug_errno(u, r, "Failed to determine whether cgroup %s is empty: %m", u->cgroup_path);
                return;
        }
        if (r == 0)
                return;

        LIST_PREPEND(cgroup_empty_queue, u->manager->cgroup_empty_queue, u);
        u->in_cgroup_empty_queue = true;

        /* Trigger the defer event */
        r = sd_event_source_set_enabled(u->manager->cgroup_empty_event_source, SD_EVENT_ONESHOT);
        if (r < 0)
                log_debug_errno(r, "Failed to enable cgroup empty event source: %m");
}
static int on_cgroup_inotify_event(sd_event_source *s, int fd, uint32_t revents, void *userdata) {
        Manager *m = userdata;

        assert(s);
        assert(fd >= 0);
        assert(m);

        for (;;) {
                union inotify_event_buffer buffer;
                struct inotify_event *e;
                ssize_t l;

                l = read(fd, &buffer, sizeof(buffer));
                if (l < 0) {
                        if (IN_SET(errno, EINTR, EAGAIN))
                                return 0;

                        return log_error_errno(errno, "Failed to read control group inotify events: %m");
                }

                FOREACH_INOTIFY_EVENT(e, buffer, l) {
                        Unit *u;

                        if (e->wd < 0)
                                /* Queue overflow has no watch descriptor */
                                continue;

                        if (e->mask & IN_IGNORED)
                                /* The watch was just removed */
                                continue;

                        u = hashmap_get(m->cgroup_inotify_wd_unit, INT_TO_PTR(e->wd));
                        if (!u) /* Note that inotify might deliver
                                 * events for a watch even after it
                                 * was removed, because it was queued
                                 * before the removal. Let's ignore
                                 * this here safely. */
                                continue;

                        unit_add_to_cgroup_empty_queue(u);
                }
        }
}
2035 int manager_setup_cgroup(Manager *m) {
2036 _cleanup_free_ char *path = NULL;
2037 const char *scope_path;
2040 #if 0 /// UNNEEDED by elogind
2046 /* 1. Determine hierarchy */
2047 m->cgroup_root = mfree(m->cgroup_root);
2048 #if 0 /// elogind is not init and must therefore search for PID 1 instead of self.
2049 r = cg_pid_get_path(SYSTEMD_CGROUP_CONTROLLER, 0, &m->cgroup_root);
2051 r = cg_pid_get_path(SYSTEMD_CGROUP_CONTROLLER, 1, &m->cgroup_root);
2054 return log_error_errno(r, "Cannot determine cgroup we are running in: %m");
2056 #if 0 /// elogind does not support systemd scopes and slices
2057 /* Chop off the init scope, if we are already located in it */
2058 e = endswith(m->cgroup_root, "/" SPECIAL_INIT_SCOPE);
2060 /* LEGACY: Also chop off the system slice if we are in
2061 * it. This is to support live upgrades from older systemd
2062 * versions where PID 1 was moved there. Also see
2063 * cg_get_root_path(). */
2064 if (!e && MANAGER_IS_SYSTEM(m)) {
2065 e = endswith(m->cgroup_root, "/" SPECIAL_SYSTEM_SLICE);
2067 e = endswith(m->cgroup_root, "/system"); /* even more legacy */
2073 log_debug_elogind("Cgroup Controller \"%s\" -> root \"%s\"",
2074 SYSTEMD_CGROUP_CONTROLLER, m->cgroup_root);
2075 /* And make sure to store away the root value without trailing slash, even for the root dir, so that we can
2076 * easily prepend it everywhere. */
2077 delete_trailing_chars(m->cgroup_root, "/");
2080 r = cg_get_path(SYSTEMD_CGROUP_CONTROLLER, m->cgroup_root, NULL, &path);
2082 return log_error_errno(r, "Cannot find cgroup mount point: %m");
2084 r = cg_unified_flush();
2086 return log_error_errno(r, "Couldn't determine if we are running in the unified hierarchy: %m");
2088 all_unified = cg_all_unified();
2089 if (all_unified < 0)
2090 return log_error_errno(all_unified, "Couldn't determine whether we are in all unified mode: %m");
2091 if (all_unified > 0)
2092 log_debug("Unified cgroup hierarchy is located at %s.", path);
2094 r = cg_unified_controller(SYSTEMD_CGROUP_CONTROLLER);
2096 return log_error_errno(r, "Failed to determine whether systemd's own controller is in unified mode: %m");
2098 log_debug("Unified cgroup hierarchy is located at %s. Controllers are on legacy hierarchies.", path);
2100 log_debug("Using cgroup controller " SYSTEMD_CGROUP_CONTROLLER_LEGACY ". File system hierarchy is at %s.", path);
2103 #if 0 /// elogind is not init, and does not install the agent here.
2104 /* 3. Allocate cgroup empty defer event source */
2105 m->cgroup_empty_event_source = sd_event_source_unref(m->cgroup_empty_event_source);
2106 r = sd_event_add_defer(m->event, &m->cgroup_empty_event_source, on_cgroup_empty_event, m);
2108 return log_error_errno(r, "Failed to create cgroup empty event source: %m");
2110 r = sd_event_source_set_priority(m->cgroup_empty_event_source, SD_EVENT_PRIORITY_NORMAL-5);
2112 return log_error_errno(r, "Failed to set priority of cgroup empty event source: %m");
2114 r = sd_event_source_set_enabled(m->cgroup_empty_event_source, SD_EVENT_OFF);
2116 return log_error_errno(r, "Failed to disable cgroup empty event source: %m");
2118 (void) sd_event_source_set_description(m->cgroup_empty_event_source, "cgroup-empty");
2120 /* 4. Install notifier inotify object, or agent */
2121 if (cg_unified_controller(SYSTEMD_CGROUP_CONTROLLER) > 0) {
2123 /* In the unified hierarchy we can get cgroup empty notifications via inotify. */
2125 m->cgroup_inotify_event_source = sd_event_source_unref(m->cgroup_inotify_event_source);
2126 safe_close(m->cgroup_inotify_fd);
2128 m->cgroup_inotify_fd = inotify_init1(IN_NONBLOCK|IN_CLOEXEC);
2129 if (m->cgroup_inotify_fd < 0)
2130 return log_error_errno(errno, "Failed to create control group inotify object: %m");
2132 r = sd_event_add_io(m->event, &m->cgroup_inotify_event_source, m->cgroup_inotify_fd, EPOLLIN, on_cgroup_inotify_event, m);
2134 return log_error_errno(r, "Failed to watch control group inotify object: %m");
2136 /* Process cgroup empty notifications early, but after service notifications and SIGCHLD. Also
2137 * see handling of cgroup agent notifications, for the classic cgroup hierarchy support. */
2138 r = sd_event_source_set_priority(m->cgroup_inotify_event_source, SD_EVENT_PRIORITY_NORMAL-4);
2140 return log_error_errno(r, "Failed to set priority of inotify event source: %m");
2142 (void) sd_event_source_set_description(m->cgroup_inotify_event_source, "cgroup-inotify");
2144 } else if (MANAGER_IS_SYSTEM(m) && m->test_run_flags == 0) {
2146 /* On the legacy hierarchy we only get notifications via cgroup agents. (Which isn't really reliable,
2147 * since it does not generate events when control groups with children run empty. */
2149 r = cg_install_release_agent(SYSTEMD_CGROUP_CONTROLLER, SYSTEMD_CGROUP_AGENT_PATH);
2151 log_warning_errno(r, "Failed to install release agent, ignoring: %m");
2153 log_debug("Installed release agent.");
2155 log_debug("Release agent already installed.");

        /* 5. Make sure we are in the special "init.scope" unit in the root slice. */
        scope_path = strjoina(m->cgroup_root, "/" SPECIAL_INIT_SCOPE);
        r = cg_create_and_attach(SYSTEMD_CGROUP_CONTROLLER, scope_path, 0);
#else
        /* Note:
         * This method is in core, and normally called by systemd
         * being init. As elogind is never init, we can not install
         * our agent here. We do so when mounting our cgroup file
         * system, so only if elogind is its own tiny controller.
         * Further, elogind is not meant to run in systemd init scope. */
        if (MANAGER_IS_SYSTEM(m))
                // we are our own cgroup controller
                scope_path = strjoina("");
        else if (streq(m->cgroup_root, "/elogind"))
                // root already is our cgroup
                scope_path = strjoina(m->cgroup_root);
        else
                // we have to create our own group
                scope_path = strjoina(m->cgroup_root, "/elogind");
        r = cg_create_and_attach(SYSTEMD_CGROUP_CONTROLLER, scope_path, 0);
#endif // 0
        if (r < 0)
                return log_error_errno(r, "Failed to create %s control group: %m", scope_path);
        log_debug_elogind("Created control group \"%s\"", scope_path);
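
        /* Three cases, in short: as its own cgroup controller elogind attaches at the hierarchy root; if
         * the root handed to us already is "/elogind" it attaches there directly; otherwise it creates a
         * private "/elogind" subgroup below whatever root it was given. */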

#if 0 /// elogind is not a "sub-controller" like systemd, so migration is not needed.
        /* Also, move all other userspace processes remaining in the root cgroup into that scope. */
        r = cg_migrate(SYSTEMD_CGROUP_CONTROLLER, m->cgroup_root, SYSTEMD_CGROUP_CONTROLLER, scope_path, 0);
        if (r < 0)
                log_warning_errno(r, "Couldn't move remaining userspace processes, ignoring: %m");
#endif // 0

        /* 6. And pin it, so that it cannot be unmounted */
        safe_close(m->pin_cgroupfs_fd);
        m->pin_cgroupfs_fd = open(path, O_RDONLY|O_CLOEXEC|O_DIRECTORY|O_NOCTTY|O_NONBLOCK);
        if (m->pin_cgroupfs_fd < 0)
                return log_error_errno(errno, "Failed to open pin file: %m");

        /* 7. Always enable hierarchical support if it exists... */
        if (!all_unified && m->test_run_flags == 0)
                (void) cg_set_attribute("memory", "/", "memory.use_hierarchy", "1");

        /* 8. Figure out which controllers are supported, and log about it */
        r = cg_mask_supported(&m->cgroup_supported);
        if (r < 0)
                return log_error_errno(r, "Failed to determine supported controllers: %m");
        for (c = 0; c < _CGROUP_CONTROLLER_MAX; c++)
                log_debug("Controller '%s' supported: %s", cgroup_controller_to_string(c), yes_no(m->cgroup_supported & CGROUP_CONTROLLER_TO_MASK(c)));

        return 0;
}

void manager_shutdown_cgroup(Manager *m, bool delete) {
        assert(m);

        /* We can't really delete the group, since we are in it. But
         * let's trim it. */
        if (delete && m->cgroup_root)
                (void) cg_trim(SYSTEMD_CGROUP_CONTROLLER, m->cgroup_root, false);

#if 0 /// elogind is not init
        m->cgroup_empty_event_source = sd_event_source_unref(m->cgroup_empty_event_source);

        m->cgroup_inotify_wd_unit = hashmap_free(m->cgroup_inotify_wd_unit);

        m->cgroup_inotify_event_source = sd_event_source_unref(m->cgroup_inotify_event_source);
        m->cgroup_inotify_fd = safe_close(m->cgroup_inotify_fd);
#endif // 0

        m->pin_cgroupfs_fd = safe_close(m->pin_cgroupfs_fd);

        m->cgroup_root = mfree(m->cgroup_root);
}

#if 0 /// UNNEEDED by elogind
Unit* manager_get_unit_by_cgroup(Manager *m, const char *cgroup) {
        char *p;
        Unit *u;

        u = hashmap_get(m->cgroup_unit, cgroup);
        if (u)
                return u;

        /* If there's no direct match, walk the path upwards until we find a unit owning a parent. */
        p = strdupa(cgroup);
        for (;;) {
                char *e;

                e = strrchr(p, '/');
                if (!e || e == p)
                        return hashmap_get(m->cgroup_unit, SPECIAL_ROOT_SLICE);

                *e = 0;
                u = hashmap_get(m->cgroup_unit, p);
                if (u)
                        return u;
        }
}
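
/* Example: a lookup of "/user.slice/user-1000.slice/session-2.scope/foo" tries the full path first, then
 * "/user.slice/user-1000.slice/session-2.scope", then "/user.slice/user-1000.slice", and so on, falling
 * back to the root slice once the path cannot be shortened any further. */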

Unit *manager_get_unit_by_pid_cgroup(Manager *m, pid_t pid) {
        _cleanup_free_ char *cgroup = NULL;

        assert(m);

        if (!pid_is_valid(pid))
                return NULL;

        if (cg_pid_get_path(SYSTEMD_CGROUP_CONTROLLER, pid, &cgroup) < 0)
                return NULL;

        return manager_get_unit_by_cgroup(m, cgroup);
}

Unit *manager_get_unit_by_pid(Manager *m, pid_t pid) {
        Unit *u, **array;

        assert(m);

        /* Note that a process might be owned by multiple units, we return only one here, which is good enough for most
         * cases, though not strictly correct. We prefer the one reported by cgroup membership, as that's the most
         * relevant one as children of the process will be assigned to that one, too, before all else. */

        if (!pid_is_valid(pid))
                return NULL;

        if (pid == getpid_cached())
                return hashmap_get(m->units, SPECIAL_INIT_SCOPE);

        u = manager_get_unit_by_pid_cgroup(m, pid);
        if (u)
                return u;

        u = hashmap_get(m->watch_pids, PID_TO_PTR(pid));
        if (u)
                return u;

        array = hashmap_get(m->watch_pids, PID_TO_PTR(-pid));
        if (array)
                return array[0];

        return NULL;
}
#endif // 0

#if 0 /// elogind must substitute this with its own variant
int manager_notify_cgroup_empty(Manager *m, const char *cgroup) {
        Unit *u;

        assert(m);
        assert(cgroup);

        /* Called on the legacy hierarchy whenever we get an explicit cgroup notification from the cgroup agent process
         * or from the --system instance */

        log_debug("Got cgroup empty notification for: %s", cgroup);

        u = manager_get_unit_by_cgroup(m, cgroup);
        if (!u)
                return 0;

        unit_add_to_cgroup_empty_queue(u);
        return 1;
}
#else
int manager_notify_cgroup_empty(Manager *m, const char *cgroup) {
        Session *s;

        assert(m);
        assert(cgroup);

        log_debug("Got cgroup empty notification for: %s", cgroup);

        s = hashmap_get(m->sessions, cgroup);

        if (s) {
                session_finalize(s);
                session_free(s);
        } else
                log_warning("Session not found: %s", cgroup);

        return 0;
}
#endif // 0
#if 0 /// UNNEEDED by elogind
int unit_get_memory_current(Unit *u, uint64_t *ret) {
        _cleanup_free_ char *v = NULL;
        int r;

        assert(u);
        assert(ret);

        if (!UNIT_CGROUP_BOOL(u, memory_accounting))
                return -ENODATA;

        if (!u->cgroup_path)
                return -ENODATA;

        /* The root cgroup doesn't expose this information, let's get it from /proc instead */
        if (unit_has_root_cgroup(u))
                return procfs_memory_get_current(ret);

        if ((u->cgroup_realized_mask & CGROUP_MASK_MEMORY) == 0)
                return -ENODATA;

        r = cg_all_unified();
        if (r < 0)
                return r;
        if (r > 0)
                r = cg_get_attribute("memory", u->cgroup_path, "memory.current", &v);
        else
                r = cg_get_attribute("memory", u->cgroup_path, "memory.usage_in_bytes", &v);
        if (r == -ENOENT)
                return -ENODATA;
        if (r < 0)
                return r;

        return safe_atou64(v, ret);
}
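
/* Only the attribute name differs between the hierarchies here: cgroup v2 exposes the counter as
 * "memory.current", cgroup v1 as "memory.usage_in_bytes"; both are plain byte counts, so parsing is
 * shared. */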

int unit_get_tasks_current(Unit *u, uint64_t *ret) {
        _cleanup_free_ char *v = NULL;
        int r;

        assert(u);
        assert(ret);

        if (!UNIT_CGROUP_BOOL(u, tasks_accounting))
                return -ENODATA;

        if (!u->cgroup_path)
                return -ENODATA;

        /* The root cgroup doesn't expose this information, let's get it from /proc instead */
        if (unit_has_root_cgroup(u))
                return procfs_tasks_get_current(ret);

        if ((u->cgroup_realized_mask & CGROUP_MASK_PIDS) == 0)
                return -ENODATA;

        r = cg_get_attribute("pids", u->cgroup_path, "pids.current", &v);
        if (r == -ENOENT)
                return -ENODATA;
        if (r < 0)
                return r;

        return safe_atou64(v, ret);
}

static int unit_get_cpu_usage_raw(Unit *u, nsec_t *ret) {
        _cleanup_free_ char *v = NULL;
        uint64_t ns;
        int r;

        assert(u);
        assert(ret);

        if (!u->cgroup_path)
                return -ENODATA;

        /* The root cgroup doesn't expose this information, let's get it from /proc instead */
        if (unit_has_root_cgroup(u))
                return procfs_cpu_get_usage(ret);

        r = cg_all_unified();
        if (r < 0)
                return r;
        if (r > 0) {
                _cleanup_free_ char *val = NULL;
                uint64_t us;

                if ((u->cgroup_realized_mask & CGROUP_MASK_CPU) == 0)
                        return -ENODATA;

                r = cg_get_keyed_attribute("cpu", u->cgroup_path, "cpu.stat", STRV_MAKE("usage_usec"), &val);
                if (IN_SET(r, -ENOENT, -ENXIO))
                        return -ENODATA;
                if (r < 0)
                        return r;

                r = safe_atou64(val, &us);
                if (r < 0)
                        return r;

                ns = us * NSEC_PER_USEC;
        } else {
                if ((u->cgroup_realized_mask & CGROUP_MASK_CPUACCT) == 0)
                        return -ENODATA;

                r = cg_get_attribute("cpuacct", u->cgroup_path, "cpuacct.usage", &v);
                if (r == -ENOENT)
                        return -ENODATA;
                if (r < 0)
                        return r;

                r = safe_atou64(v, &ns);
                if (r < 0)
                        return r;
        }

        *ret = ns;
        return 0;
}
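
/* Note the unit mismatch between the two hierarchies: "usage_usec" from cgroup v2's cpu.stat is in
 * microseconds and is scaled by NSEC_PER_USEC above, while cgroup v1's "cpuacct.usage" already reports
 * nanoseconds. */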

int unit_get_cpu_usage(Unit *u, nsec_t *ret) {
        nsec_t ns;
        int r;

        assert(u);

        /* Retrieve the current CPU usage counter. This will subtract the CPU counter taken when the unit was
         * started. If the cgroup has been removed already, returns the last cached value. To cache the value, simply
         * call this function with a NULL return value. */

        if (!UNIT_CGROUP_BOOL(u, cpu_accounting))
                return -ENODATA;

        r = unit_get_cpu_usage_raw(u, &ns);
        if (r == -ENODATA && u->cpu_usage_last != NSEC_INFINITY) {
                /* If we can't get the CPU usage anymore (because the cgroup was already removed, for example), use our
                 * cached value. */

                if (ret)
                        *ret = u->cpu_usage_last;
                return 0;
        }
        if (r < 0)
                return r;

        if (ns > u->cpu_usage_base)
                ns -= u->cpu_usage_base;
        else
                ns = 0;

        u->cpu_usage_last = ns;
        if (ret)
                *ret = ns;

        return 0;
}

int unit_get_ip_accounting(
                Unit *u,
                CGroupIPAccountingMetric metric,
                uint64_t *ret) {

        uint64_t value;
        int fd, r;

        assert(u);
        assert(metric >= 0);
        assert(metric < _CGROUP_IP_ACCOUNTING_METRIC_MAX);
        assert(ret);

        /* IP accounting is currently not recursive, and hence we refuse to return any data for slice nodes. Slices are
         * inner cgroup nodes and hence have no processes directly attached, hence their counters would be zero
         * anyway. And if we block this now we can later open this up, if the kernel learns recursive BPF cgroup
         * filters. */
        if (u->type == UNIT_SLICE)
                return -ENODATA;

        if (!UNIT_CGROUP_BOOL(u, ip_accounting))
                return -ENODATA;

        fd = IN_SET(metric, CGROUP_IP_INGRESS_BYTES, CGROUP_IP_INGRESS_PACKETS) ?
                u->ip_accounting_ingress_map_fd :
                u->ip_accounting_egress_map_fd;
        if (fd < 0)
                return -ENODATA;

        if (IN_SET(metric, CGROUP_IP_INGRESS_BYTES, CGROUP_IP_EGRESS_BYTES))
                r = bpf_firewall_read_accounting(fd, &value, NULL);
        else
                r = bpf_firewall_read_accounting(fd, NULL, &value);
        if (r < 0)
                return r;

        /* Add in additional metrics from a previous runtime. Note that when reexecing/reloading the daemon we compile
         * all BPF programs and maps anew, but serialize the old counters. When deserializing we store them in the
         * ip_accounting_extra[] field, and add them in here transparently. */

        *ret = value + u->ip_accounting_extra[metric];

        return r;
}
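
/* Each map fd refers to a per-cgroup BPF accounting map that holds both a byte and a packet counter,
 * which is why bpf_firewall_read_accounting() takes two result pointers and we pick one per metric. */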

int unit_reset_cpu_accounting(Unit *u) {
        nsec_t ns;
        int r;

        assert(u);

        u->cpu_usage_last = NSEC_INFINITY;

        r = unit_get_cpu_usage_raw(u, &ns);
        if (r < 0) {
                u->cpu_usage_base = 0;
                return r;
        }

        u->cpu_usage_base = ns;
        return 0;
}

int unit_reset_ip_accounting(Unit *u) {
        int r = 0, q = 0;

        assert(u);

        if (u->ip_accounting_ingress_map_fd >= 0)
                r = bpf_firewall_reset_accounting(u->ip_accounting_ingress_map_fd);

        if (u->ip_accounting_egress_map_fd >= 0)
                q = bpf_firewall_reset_accounting(u->ip_accounting_egress_map_fd);

        zero(u->ip_accounting_extra);

        return r < 0 ? r : q;
}

void unit_invalidate_cgroup(Unit *u, CGroupMask m) {
        assert(u);

        if (!UNIT_HAS_CGROUP_CONTEXT(u))
                return;

        if (m == 0)
                return;

        /* always invalidate compat pairs together */
        if (m & (CGROUP_MASK_IO | CGROUP_MASK_BLKIO))
                m |= CGROUP_MASK_IO | CGROUP_MASK_BLKIO;

        if (m & (CGROUP_MASK_CPU | CGROUP_MASK_CPUACCT))
                m |= CGROUP_MASK_CPU | CGROUP_MASK_CPUACCT;

        if ((u->cgroup_realized_mask & m) == 0) /* NOP? */
                return;

        u->cgroup_realized_mask &= ~m;
        unit_add_to_cgroup_realize_queue(u);
}
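
/* Dropping both bits of a compat pair mirrors the translation applied when the cgroup is realized: a
 * changed IO setting has to rewrite the legacy "blkio" attributes too, and vice versa, and the same goes
 * for "cpu" and "cpuacct". */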

void unit_invalidate_cgroup_bpf(Unit *u) {
        assert(u);

        if (!UNIT_HAS_CGROUP_CONTEXT(u))
                return;

        if (u->cgroup_bpf_state == UNIT_CGROUP_BPF_INVALIDATED) /* NOP? */
                return;

        u->cgroup_bpf_state = UNIT_CGROUP_BPF_INVALIDATED;
        unit_add_to_cgroup_realize_queue(u);

        /* If we are a slice unit, we also need to compile a new BPF program for all our children, as the IP access
         * list of our children includes our own. */
        if (u->type == UNIT_SLICE) {
                Unit *member;
                Iterator i;
                void *v;

                HASHMAP_FOREACH_KEY(v, member, u->dependencies[UNIT_BEFORE], i) {
                        if (member == u)
                                continue;

                        if (UNIT_DEREF(member->slice) != u)
                                continue;

                        unit_invalidate_cgroup_bpf(member);
                }
        }
}

void manager_invalidate_startup_units(Manager *m) {
        Iterator i;
        Unit *u;

        assert(m);

        SET_FOREACH(u, m->startup_units, i)
                unit_invalidate_cgroup(u, CGROUP_MASK_CPU|CGROUP_MASK_IO|CGROUP_MASK_BLKIO);
}
#endif // 0

static const char* const cgroup_device_policy_table[_CGROUP_DEVICE_POLICY_MAX] = {
        [CGROUP_AUTO] = "auto",
        [CGROUP_CLOSED] = "closed",
        [CGROUP_STRICT] = "strict",
};

DEFINE_STRING_TABLE_LOOKUP(cgroup_device_policy, CGroupDevicePolicy);