1 /* SPDX-License-Identifier: LGPL-2.1+ */
3 This file is part of systemd.
5 Copyright 2013 Lennart Poettering
7 systemd is free software; you can redistribute it and/or modify it
8 under the terms of the GNU Lesser General Public License as published by
9 the Free Software Foundation; either version 2.1 of the License, or
10 (at your option) any later version.
12 systemd is distributed in the hope that it will be useful, but
13 WITHOUT ANY WARRANTY; without even the implied warranty of
14 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
15 Lesser General Public License for more details.
17 You should have received a copy of the GNU Lesser General Public License
18 along with systemd; If not, see <http://www.gnu.org/licenses/>.
24 #include "alloc-util.h"
25 //#include "blockdev-util.h"
26 //#include "bpf-firewall.h"
27 //#include "bus-error.h"
28 #include "cgroup-util.h"
33 #include "parse-util.h"
34 #include "path-util.h"
35 #include "process-util.h"
36 //#include "procfs-util.h"
37 //#include "special.h"
38 #include "stdio-util.h"
39 #include "string-table.h"
40 #include "string-util.h"
/* Scheduling period used when translating CPUQuota= into cpu.max /
 * cpu.cfs_{period,quota}_us: 100ms. */
43 #define CGROUP_CPU_QUOTA_PERIOD_USEC ((usec_t) 100 * USEC_PER_MSEC)
45 bool manager_owns_root_cgroup(Manager *m) {
48         /* Returns true if we are managing the root cgroup. Note that it isn't sufficient to just check whether the
49         * group root path equals "/" since that will also be the case if CLONE_NEWCGROUP is in the mix. Since there
50         * appears to be no nice way to detect whether we are in a CLONE_NEWCGROUP namespace we instead just check if
51         * we run in any kind of container virtualization. */
/* Inside a container we never own the host's root cgroup. */
53         if (detect_container() > 0)
/* Owning the root cgroup means our cgroup root path is unset or "/". */
56         return isempty(m->cgroup_root) || path_equal(m->cgroup_root, "/");
59 #if 0 /// UNNEEDED by elogind
60 bool unit_has_root_cgroup(Unit *u) {
63         /* Returns whether this unit manages the root cgroup. This will return true if this unit is the root slice and
64         * the manager manages the root cgroup. */
66         if (!manager_owns_root_cgroup(u->manager))
/* Only the root slice itself can be the unit that manages the root cgroup. */
69         return unit_has_name(u, SPECIAL_ROOT_SLICE);
/* Emit the generic legacy<->unified compatibility-translation warning at most
 * once per process lifetime; details go to the per-unit debug log instead. */
72 static void cgroup_compat_warn(void) {
73         static bool cgroup_compat_warned = false;
/* Already warned once, stay silent from now on. */
75         if (cgroup_compat_warned)
78         log_warning("cgroup compatibility translation between legacy and unified hierarchy settings activated. "
79                     "See cgroup-compat debug messages for details.");
81         cgroup_compat_warned = true;
/* Log one cgroup compatibility translation: trigger the one-shot global
 * warning, then emit the concrete details as a per-unit debug message. */
84 #define log_cgroup_compat(unit, fmt, ...) do { \
85         cgroup_compat_warn(); \
86         log_unit_debug(unit, "cgroup-compat: " fmt, ##__VA_ARGS__); \
89 void cgroup_context_init(CGroupContext *c) {
92         /* Initialize everything to the kernel defaults, assuming the
93         * structure is preinitialized to 0 */
/* Unified-hierarchy CPU settings: "invalid" == not configured. */
95         c->cpu_weight = CGROUP_WEIGHT_INVALID;
96         c->startup_cpu_weight = CGROUP_WEIGHT_INVALID;
97         c->cpu_quota_per_sec_usec = USEC_INFINITY;
/* Legacy-hierarchy CPU settings. */
99         c->cpu_shares = CGROUP_CPU_SHARES_INVALID;
100         c->startup_cpu_shares = CGROUP_CPU_SHARES_INVALID;
/* Memory limits default to "no limit". */
102         c->memory_high = CGROUP_LIMIT_MAX;
103         c->memory_max = CGROUP_LIMIT_MAX;
104         c->memory_swap_max = CGROUP_LIMIT_MAX;
106         c->memory_limit = CGROUP_LIMIT_MAX;
/* Unified "io" and legacy "blkio" weights, again "invalid" == unset. */
108         c->io_weight = CGROUP_WEIGHT_INVALID;
109         c->startup_io_weight = CGROUP_WEIGHT_INVALID;
111         c->blockio_weight = CGROUP_BLKIO_WEIGHT_INVALID;
112         c->startup_blockio_weight = CGROUP_BLKIO_WEIGHT_INVALID;
/* (uint64_t) -1 == CGROUP_LIMIT_MAX, i.e. no task limit. */
114         c->tasks_max = (uint64_t) -1;
/* Unlink one DeviceAllow= entry from the context's list (the entry's memory
 * is released in the elided remainder of this helper). */
117 void cgroup_context_free_device_allow(CGroupContext *c, CGroupDeviceAllow *a) {
121         LIST_REMOVE(device_allow, c->device_allow, a);
/* Unlink one IODeviceWeight= entry from the context's list. */
126 void cgroup_context_free_io_device_weight(CGroupContext *c, CGroupIODeviceWeight *w) {
130         LIST_REMOVE(device_weights, c->io_device_weights, w);
/* Unlink one IO*Max= (per-device limit) entry from the context's list. */
135 void cgroup_context_free_io_device_limit(CGroupContext *c, CGroupIODeviceLimit *l) {
139         LIST_REMOVE(device_limits, c->io_device_limits, l);
/* Unlink one BlockIODeviceWeight= entry from the context's list. */
144 void cgroup_context_free_blockio_device_weight(CGroupContext *c, CGroupBlockIODeviceWeight *w) {
148         LIST_REMOVE(device_weights, c->blockio_device_weights, w);
/* Unlink one BlockIO{Read|Write}Bandwidth= entry from the context's list. */
153 void cgroup_context_free_blockio_device_bandwidth(CGroupContext *c, CGroupBlockIODeviceBandwidth *b) {
157         LIST_REMOVE(device_bandwidths, c->blockio_device_bandwidths, b);
/* Release all dynamically allocated members of a CGroupContext: every
 * per-device list entry plus the IP address allow/deny lists. */
162 void cgroup_context_done(CGroupContext *c) {
165         while (c->io_device_weights)
166                 cgroup_context_free_io_device_weight(c, c->io_device_weights);
168         while (c->io_device_limits)
169                 cgroup_context_free_io_device_limit(c, c->io_device_limits);
171         while (c->blockio_device_weights)
172                 cgroup_context_free_blockio_device_weight(c, c->blockio_device_weights);
174         while (c->blockio_device_bandwidths)
175                 cgroup_context_free_blockio_device_bandwidth(c, c->blockio_device_bandwidths);
177         while (c->device_allow)
178                 cgroup_context_free_device_allow(c, c->device_allow);
/* ip_address_access_free_all() returns NULL, resetting the list heads. */
180         c->ip_address_allow = ip_address_access_free_all(c->ip_address_allow);
181         c->ip_address_deny = ip_address_access_free_all(c->ip_address_deny);
/* Dump the whole cgroup configuration of a context to f, one "Key=value"
 * line per setting, each line prepended with the given prefix string. */
184 void cgroup_context_dump(CGroupContext *c, FILE* f, const char *prefix) {
185         CGroupIODeviceLimit *il;
186         CGroupIODeviceWeight *iw;
187         CGroupBlockIODeviceBandwidth *b;
188         CGroupBlockIODeviceWeight *w;
189         CGroupDeviceAllow *a;
190         IPAddressAccessItem *iaai;
191         char u[FORMAT_TIMESPAN_MAX];
/* Allow callers to pass NULL for "no prefix". */
196         prefix = strempty(prefix);
/* Scalar settings first, as one big formatted write. */
199                 "%sCPUAccounting=%s\n"
200                 "%sIOAccounting=%s\n"
201                 "%sBlockIOAccounting=%s\n"
202                 "%sMemoryAccounting=%s\n"
203                 "%sTasksAccounting=%s\n"
204                 "%sIPAccounting=%s\n"
205                 "%sCPUWeight=%" PRIu64 "\n"
206                 "%sStartupCPUWeight=%" PRIu64 "\n"
207                 "%sCPUShares=%" PRIu64 "\n"
208                 "%sStartupCPUShares=%" PRIu64 "\n"
209                 "%sCPUQuotaPerSecSec=%s\n"
210                 "%sIOWeight=%" PRIu64 "\n"
211                 "%sStartupIOWeight=%" PRIu64 "\n"
212                 "%sBlockIOWeight=%" PRIu64 "\n"
213                 "%sStartupBlockIOWeight=%" PRIu64 "\n"
214                 "%sMemoryLow=%" PRIu64 "\n"
215                 "%sMemoryHigh=%" PRIu64 "\n"
216                 "%sMemoryMax=%" PRIu64 "\n"
217                 "%sMemorySwapMax=%" PRIu64 "\n"
218                 "%sMemoryLimit=%" PRIu64 "\n"
219                 "%sTasksMax=%" PRIu64 "\n"
220                 "%sDevicePolicy=%s\n"
222                 prefix, yes_no(c->cpu_accounting),
223                 prefix, yes_no(c->io_accounting),
224                 prefix, yes_no(c->blockio_accounting),
225                 prefix, yes_no(c->memory_accounting),
226                 prefix, yes_no(c->tasks_accounting),
227                 prefix, yes_no(c->ip_accounting),
228                 prefix, c->cpu_weight,
229                 prefix, c->startup_cpu_weight,
230                 prefix, c->cpu_shares,
231                 prefix, c->startup_cpu_shares,
232                 prefix, format_timespan(u, sizeof(u), c->cpu_quota_per_sec_usec, 1),
233                 prefix, c->io_weight,
234                 prefix, c->startup_io_weight,
235                 prefix, c->blockio_weight,
236                 prefix, c->startup_blockio_weight,
237                 prefix, c->memory_low,
238                 prefix, c->memory_high,
239                 prefix, c->memory_max,
240                 prefix, c->memory_swap_max,
241                 prefix, c->memory_limit,
242                 prefix, c->tasks_max,
243                 prefix, cgroup_device_policy_to_string(c->device_policy),
244                 prefix, yes_no(c->delegate));
/* Delegated controllers are rendered as a space separated string. */
247                 _cleanup_free_ char *t = NULL;
249                 (void) cg_mask_to_string(c->delegate_controllers, &t);
251                 fprintf(f, "%sDelegateControllers=%s\n",
/* One line per entry for each of the per-device lists. */
256         LIST_FOREACH(device_allow, a, c->device_allow)
258                         "%sDeviceAllow=%s %s%s%s\n",
261                         a->r ? "r" : "", a->w ? "w" : "", a->m ? "m" : "");
263         LIST_FOREACH(device_weights, iw, c->io_device_weights)
265                         "%sIODeviceWeight=%s %" PRIu64,
270         LIST_FOREACH(device_limits, il, c->io_device_limits) {
271                 char buf[FORMAT_BYTES_MAX];
272                 CGroupIOLimitType type;
/* Only limits that deviate from the built-in defaults are dumped. */
274                 for (type = 0; type < _CGROUP_IO_LIMIT_TYPE_MAX; type++)
275                         if (il->limits[type] != cgroup_io_limit_defaults[type])
279                                         cgroup_io_limit_type_to_string(type),
281                                         format_bytes(buf, sizeof(buf), il->limits[type]));
284         LIST_FOREACH(device_weights, w, c->blockio_device_weights)
286                         "%sBlockIODeviceWeight=%s %" PRIu64,
291         LIST_FOREACH(device_bandwidths, b, c->blockio_device_bandwidths) {
292                 char buf[FORMAT_BYTES_MAX];
/* CGROUP_LIMIT_MAX means "no bandwidth limit configured". */
294                 if (b->rbps != CGROUP_LIMIT_MAX)
296                                 "%sBlockIOReadBandwidth=%s %s\n",
299                                 format_bytes(buf, sizeof(buf), b->rbps));
300                 if (b->wbps != CGROUP_LIMIT_MAX)
302                                 "%sBlockIOWriteBandwidth=%s %s\n",
305                                 format_bytes(buf, sizeof(buf), b->wbps));
/* IP access lists are printed in address/prefixlen notation. */
308         LIST_FOREACH(items, iaai, c->ip_address_allow) {
309                 _cleanup_free_ char *k = NULL;
311                 (void) in_addr_to_string(iaai->family, &iaai->address, &k);
312                 fprintf(f, "%sIPAddressAllow=%s/%u\n", prefix, strnull(k), iaai->prefixlen);
315         LIST_FOREACH(items, iaai, c->ip_address_deny) {
316                 _cleanup_free_ char *k = NULL;
318                 (void) in_addr_to_string(iaai->family, &iaai->address, &k);
319                 fprintf(f, "%sIPAddressDeny=%s/%u\n", prefix, strnull(k), iaai->prefixlen);
/* Resolve the path p to the dev_t of the block device backing it. Block
 * device nodes are used directly; for other local files the device of the
 * containing file system is used, reduced to its whole disk if it turns out
 * to be a partition. Returns < 0 on failure (logged as warning). */
323 static int lookup_block_device(const char *p, dev_t *dev) {
332                 return log_warning_errno(errno, "Couldn't stat device %s: %m", p);
334         if (S_ISBLK(st.st_mode))
/* st_dev major 0 indicates a virtual/non-local file system. */
336         else if (major(st.st_dev) != 0) {
337                 /* If this is not a device node then find the block
338                 * device this file is stored on */
341                 /* If this is a partition, try to get the originating
343                 (void) block_get_whole_disk(*dev, dev);
345                 log_warning("%s is not a block device and file system block device cannot be determined or is not local.", p);
/* Add one entry to the cgroup-v1 "devices" controller whitelist at path:
 * resolves the device node and writes "<c|b> <major>:<minor> <acc>" to
 * devices.allow, where acc is a combination of "r", "w" and "m". A node path
 * starting with '-' makes a missing node silently ignored. */
352 static int whitelist_device(const char *path, const char *node, const char *acc) {
353         char buf[2+DECIMAL_STR_MAX(dev_t)*2+2+4];
355         bool ignore_notfound;
361         if (node[0] == '-') {
362                 /* Non-existent paths starting with "-" must be silently ignored */
364                 ignore_notfound = true;
366                 ignore_notfound = false;
368         if (stat(node, &st) < 0) {
369                 if (errno == ENOENT && ignore_notfound)
372                 return log_warning_errno(errno, "Couldn't stat device %s: %m", node);
/* Only character and block device nodes make sense here. */
375         if (!S_ISCHR(st.st_mode) && !S_ISBLK(st.st_mode)) {
376                 log_warning("%s is not a device.", node);
382                 S_ISCHR(st.st_mode) ? 'c' : 'b',
383                 major(st.st_rdev), minor(st.st_rdev),
386         r = cg_set_attribute("devices", path, "devices.allow", buf);
/* Expected failures (read-only/missing cgroup etc.) are only debug-logged. */
388                 log_full_errno(IN_SET(r, -ENOENT, -EROFS, -EINVAL, -EACCES) ? LOG_DEBUG : LOG_WARNING, r,
389                                "Failed to set devices.allow on %s: %m", path);
/* Whitelist all devices whose driver name matches the fnmatch() pattern
 * 'name' in the cgroup-v1 "devices" controller at path. The major numbers are
 * resolved by scanning /proc/devices; type selects the 'c' (character) or
 * 'b' (block) section of that file. */
394 static int whitelist_major(const char *path, const char *name, char type, const char *acc) {
395         _cleanup_fclose_ FILE *f = NULL;
402         assert(IN_SET(type, 'b', 'c'));
404         f = fopen("/proc/devices", "re");
406                 return log_warning_errno(errno, "Cannot open /proc/devices to resolve %s (%c): %m", name, type);
408         FOREACH_LINE(line, f, goto fail) {
409                 char buf[2+DECIMAL_STR_MAX(unsigned)+3+4], *p, *w;
/* Track which section of /proc/devices we are currently in. */
414                 if (type == 'c' && streq(line, "Character devices:")) {
419                 if (type == 'b' && streq(line, "Block devices:")) {
/* Each entry line is "<major> <name>"; split it at the whitespace. */
434                 w = strpbrk(p, WHITESPACE);
439                 r = safe_atou(p, &maj);
446                 w += strspn(w, WHITESPACE);
/* Skip entries whose driver name does not match the pattern. */
448                 if (fnmatch(name, w, 0) != 0)
457                 r = cg_set_attribute("devices", path, "devices.allow", buf);
459                         log_full_errno(IN_SET(r, -ENOENT, -EROFS, -EINVAL, -EACCES) ? LOG_DEBUG : LOG_WARNING, r,
460                                        "Failed to set devices.allow on %s: %m", path);
466         return log_warning_errno(errno, "Failed to read /proc/devices: %m");
/* True if any unified-hierarchy CPU weight (normal or startup) is configured. */
469 static bool cgroup_context_has_cpu_weight(CGroupContext *c) {
470         return c->cpu_weight != CGROUP_WEIGHT_INVALID ||
471                 c->startup_cpu_weight != CGROUP_WEIGHT_INVALID;
/* True if any legacy-hierarchy CPU shares (normal or startup) are configured. */
474 static bool cgroup_context_has_cpu_shares(CGroupContext *c) {
475         return c->cpu_shares != CGROUP_CPU_SHARES_INVALID ||
476                 c->startup_cpu_shares != CGROUP_CPU_SHARES_INVALID;
/* Pick the effective CPU weight: the startup weight while the manager is
 * starting up, the regular weight otherwise, the kernel default if unset. */
479 static uint64_t cgroup_context_cpu_weight(CGroupContext *c, ManagerState state) {
480         if (IN_SET(state, MANAGER_STARTING, MANAGER_INITIALIZING) &&
481             c->startup_cpu_weight != CGROUP_WEIGHT_INVALID)
482                 return c->startup_cpu_weight;
483         else if (c->cpu_weight != CGROUP_WEIGHT_INVALID)
484                 return c->cpu_weight;
486                 return CGROUP_WEIGHT_DEFAULT;
/* Pick the effective legacy CPU shares, analogous to
 * cgroup_context_cpu_weight() but for the v1 "cpu.shares" attribute. */
489 static uint64_t cgroup_context_cpu_shares(CGroupContext *c, ManagerState state) {
490         if (IN_SET(state, MANAGER_STARTING, MANAGER_INITIALIZING) &&
491             c->startup_cpu_shares != CGROUP_CPU_SHARES_INVALID)
492                 return c->startup_cpu_shares;
493         else if (c->cpu_shares != CGROUP_CPU_SHARES_INVALID)
494                 return c->cpu_shares;
496                 return CGROUP_CPU_SHARES_DEFAULT;
/* Write the unified-hierarchy CPU attributes: "cpu.weight" and "cpu.max"
 * (quota per CGROUP_CPU_QUOTA_PERIOD_USEC, or "max" when unlimited). */
499 static void cgroup_apply_unified_cpu_config(Unit *u, uint64_t weight, uint64_t quota) {
/* buf must be large enough for either "cpu.weight" or "cpu.max" content. */
500         char buf[MAX(DECIMAL_STR_MAX(uint64_t) + 1, (DECIMAL_STR_MAX(usec_t) + 1) * 2)];
503         xsprintf(buf, "%" PRIu64 "\n", weight);
504         r = cg_set_attribute("cpu", u->cgroup_path, "cpu.weight", buf);
506                 log_unit_full(u, IN_SET(r, -ENOENT, -EROFS, -EACCES) ? LOG_DEBUG : LOG_WARNING, r,
507                               "Failed to set cpu.weight: %m");
/* Scale the per-second quota to the configured period length. */
509         if (quota != USEC_INFINITY)
510                 xsprintf(buf, USEC_FMT " " USEC_FMT "\n",
511                          quota * CGROUP_CPU_QUOTA_PERIOD_USEC / USEC_PER_SEC, CGROUP_CPU_QUOTA_PERIOD_USEC);
513                 xsprintf(buf, "max " USEC_FMT "\n", CGROUP_CPU_QUOTA_PERIOD_USEC);
515         r = cg_set_attribute("cpu", u->cgroup_path, "cpu.max", buf);
518                 log_unit_full(u, IN_SET(r, -ENOENT, -EROFS, -EACCES) ? LOG_DEBUG : LOG_WARNING, r,
519                               "Failed to set cpu.max: %m");
/* Write the legacy-hierarchy CPU attributes: "cpu.shares", plus the CFS
 * bandwidth pair cpu.cfs_period_us/cpu.cfs_quota_us ("-1" == no quota). */
522 static void cgroup_apply_legacy_cpu_config(Unit *u, uint64_t shares, uint64_t quota) {
523         char buf[MAX(DECIMAL_STR_MAX(uint64_t), DECIMAL_STR_MAX(usec_t)) + 1];
526         xsprintf(buf, "%" PRIu64 "\n", shares);
527         r = cg_set_attribute("cpu", u->cgroup_path, "cpu.shares", buf);
529                 log_unit_full(u, IN_SET(r, -ENOENT, -EROFS, -EACCES) ? LOG_DEBUG : LOG_WARNING, r,
530                               "Failed to set cpu.shares: %m");
/* The period is always written, even if no quota is set. */
532         xsprintf(buf, USEC_FMT "\n", CGROUP_CPU_QUOTA_PERIOD_USEC);
533         r = cg_set_attribute("cpu", u->cgroup_path, "cpu.cfs_period_us", buf);
535                 log_unit_full(u, IN_SET(r, -ENOENT, -EROFS, -EACCES) ? LOG_DEBUG : LOG_WARNING, r,
536                               "Failed to set cpu.cfs_period_us: %m");
538         if (quota != USEC_INFINITY) {
/* Scale the per-second quota to the configured period length. */
539                 xsprintf(buf, USEC_FMT "\n", quota * CGROUP_CPU_QUOTA_PERIOD_USEC / USEC_PER_SEC);
540                 r = cg_set_attribute("cpu", u->cgroup_path, "cpu.cfs_quota_us", buf);
542                 r = cg_set_attribute("cpu", u->cgroup_path, "cpu.cfs_quota_us", "-1");
544                 log_unit_full(u, IN_SET(r, -ENOENT, -EROFS, -EACCES) ? LOG_DEBUG : LOG_WARNING, r,
545                               "Failed to set cpu.cfs_quota_us: %m");
/* Map a legacy cpu.shares value onto the unified cpu.weight scale, clamped
 * to the valid weight range. */
548 static uint64_t cgroup_cpu_shares_to_weight(uint64_t shares) {
549         return CLAMP(shares * CGROUP_WEIGHT_DEFAULT / CGROUP_CPU_SHARES_DEFAULT,
550                      CGROUP_WEIGHT_MIN, CGROUP_WEIGHT_MAX);
/* Inverse of cgroup_cpu_shares_to_weight(): map a unified cpu.weight onto
 * the legacy cpu.shares scale, clamped to the valid shares range. */
553 static uint64_t cgroup_cpu_weight_to_shares(uint64_t weight) {
554         return CLAMP(weight * CGROUP_CPU_SHARES_DEFAULT / CGROUP_WEIGHT_DEFAULT,
555                      CGROUP_CPU_SHARES_MIN, CGROUP_CPU_SHARES_MAX);
/* True if any unified-hierarchy "io" controller setting is configured. */
558 static bool cgroup_context_has_io_config(CGroupContext *c) {
559         return c->io_accounting ||
560                 c->io_weight != CGROUP_WEIGHT_INVALID ||
561                 c->startup_io_weight != CGROUP_WEIGHT_INVALID ||
562                 c->io_device_weights ||
/* True if any legacy-hierarchy "blkio" controller setting is configured. */
566 static bool cgroup_context_has_blockio_config(CGroupContext *c) {
567         return c->blockio_accounting ||
568                 c->blockio_weight != CGROUP_BLKIO_WEIGHT_INVALID ||
569                 c->startup_blockio_weight != CGROUP_BLKIO_WEIGHT_INVALID ||
570                 c->blockio_device_weights ||
571                 c->blockio_device_bandwidths;
/* Pick the effective unified io.weight: startup weight while starting up,
 * regular weight otherwise, the default if neither is configured. */
574 static uint64_t cgroup_context_io_weight(CGroupContext *c, ManagerState state) {
575         if (IN_SET(state, MANAGER_STARTING, MANAGER_INITIALIZING) &&
576             c->startup_io_weight != CGROUP_WEIGHT_INVALID)
577                 return c->startup_io_weight;
578         else if (c->io_weight != CGROUP_WEIGHT_INVALID)
581                 return CGROUP_WEIGHT_DEFAULT;
/* Pick the effective legacy blkio.weight, analogous to
 * cgroup_context_io_weight() but for the v1 "blkio" controller. */
584 static uint64_t cgroup_context_blkio_weight(CGroupContext *c, ManagerState state) {
585         if (IN_SET(state, MANAGER_STARTING, MANAGER_INITIALIZING) &&
586             c->startup_blockio_weight != CGROUP_BLKIO_WEIGHT_INVALID)
587                 return c->startup_blockio_weight;
588         else if (c->blockio_weight != CGROUP_BLKIO_WEIGHT_INVALID)
589                 return c->blockio_weight;
591                 return CGROUP_BLKIO_WEIGHT_DEFAULT;
/* Map a legacy blkio weight onto the unified io.weight scale, clamped. */
594 static uint64_t cgroup_weight_blkio_to_io(uint64_t blkio_weight) {
595         return CLAMP(blkio_weight * CGROUP_WEIGHT_DEFAULT / CGROUP_BLKIO_WEIGHT_DEFAULT,
596                      CGROUP_WEIGHT_MIN, CGROUP_WEIGHT_MAX);
/* Inverse of cgroup_weight_blkio_to_io(): unified io weight -> legacy blkio
 * weight, clamped to the valid blkio range. */
599 static uint64_t cgroup_weight_io_to_blkio(uint64_t io_weight) {
600         return CLAMP(io_weight * CGROUP_BLKIO_WEIGHT_DEFAULT / CGROUP_WEIGHT_DEFAULT,
601                      CGROUP_BLKIO_WEIGHT_MIN, CGROUP_BLKIO_WEIGHT_MAX);
/* Write one per-device entry "<maj>:<min> <weight>" to the unified
 * "io.weight" attribute, resolving dev_path to a device number first. */
604 static void cgroup_apply_io_device_weight(Unit *u, const char *dev_path, uint64_t io_weight) {
605         char buf[DECIMAL_STR_MAX(dev_t)*2+2+DECIMAL_STR_MAX(uint64_t)+1];
609         r = lookup_block_device(dev_path, &dev);
613         xsprintf(buf, "%u:%u %" PRIu64 "\n", major(dev), minor(dev), io_weight);
614         r = cg_set_attribute("io", u->cgroup_path, "io.weight", buf);
616                 log_unit_full(u, IN_SET(r, -ENOENT, -EROFS, -EACCES) ? LOG_DEBUG : LOG_WARNING, r,
617                               "Failed to set io.weight: %m");
/* Legacy counterpart of cgroup_apply_io_device_weight(): write one
 * "<maj>:<min> <weight>" entry to "blkio.weight_device". */
620 static void cgroup_apply_blkio_device_weight(Unit *u, const char *dev_path, uint64_t blkio_weight) {
621         char buf[DECIMAL_STR_MAX(dev_t)*2+2+DECIMAL_STR_MAX(uint64_t)+1];
625         r = lookup_block_device(dev_path, &dev);
629         xsprintf(buf, "%u:%u %" PRIu64 "\n", major(dev), minor(dev), blkio_weight);
630         r = cg_set_attribute("blkio", u->cgroup_path, "blkio.weight_device", buf);
632                 log_unit_full(u, IN_SET(r, -ENOENT, -EROFS, -EACCES) ? LOG_DEBUG : LOG_WARNING, r,
633                               "Failed to set blkio.weight_device: %m");
/* Write all four bandwidth/IOPS limits for one device as a single
 * "<maj>:<min> rbps=... wbps=... riops=... wiops=..." line to the unified
 * "io.max" attribute. Unconfigured limits are written as "max" (or "0"). */
636 static unsigned cgroup_apply_io_device_limit(Unit *u, const char *dev_path, uint64_t *limits) {
637         char limit_bufs[_CGROUP_IO_LIMIT_TYPE_MAX][DECIMAL_STR_MAX(uint64_t)];
638         char buf[DECIMAL_STR_MAX(dev_t)*2+2+(6+DECIMAL_STR_MAX(uint64_t)+1)*4];
639         CGroupIOLimitType type;
644         r = lookup_block_device(dev_path, &dev);
/* Format each of the four limit values into its own string buffer. */
648         for (type = 0; type < _CGROUP_IO_LIMIT_TYPE_MAX; type++) {
649                 if (limits[type] != cgroup_io_limit_defaults[type]) {
650                         xsprintf(limit_bufs[type], "%" PRIu64, limits[type]);
653                         xsprintf(limit_bufs[type], "%s", limits[type] == CGROUP_LIMIT_MAX ? "max" : "0");
657         xsprintf(buf, "%u:%u rbps=%s wbps=%s riops=%s wiops=%s\n", major(dev), minor(dev),
658                  limit_bufs[CGROUP_IO_RBPS_MAX], limit_bufs[CGROUP_IO_WBPS_MAX],
659                  limit_bufs[CGROUP_IO_RIOPS_MAX], limit_bufs[CGROUP_IO_WIOPS_MAX]);
660         r = cg_set_attribute("io", u->cgroup_path, "io.max", buf);
662                 log_unit_full(u, IN_SET(r, -ENOENT, -EROFS, -EACCES) ? LOG_DEBUG : LOG_WARNING, r,
663                               "Failed to set io.max: %m");
/* Legacy counterpart of cgroup_apply_io_device_limit(): write read and write
 * bytes-per-second limits for one device to the v1 blkio.throttle.*
 * attributes. CGROUP_LIMIT_MAX means "no limit configured". */
667 static unsigned cgroup_apply_blkio_device_limit(Unit *u, const char *dev_path, uint64_t rbps, uint64_t wbps) {
668         char buf[DECIMAL_STR_MAX(dev_t)*2+2+DECIMAL_STR_MAX(uint64_t)+1];
673         r = lookup_block_device(dev_path, &dev);
677         if (rbps != CGROUP_LIMIT_MAX)
679         sprintf(buf, "%u:%u %" PRIu64 "\n", major(dev), minor(dev), rbps);
680         r = cg_set_attribute("blkio", u->cgroup_path, "blkio.throttle.read_bps_device", buf);
682                 log_unit_full(u, IN_SET(r, -ENOENT, -EROFS, -EACCES) ? LOG_DEBUG : LOG_WARNING, r,
683                               "Failed to set blkio.throttle.read_bps_device: %m");
685         if (wbps != CGROUP_LIMIT_MAX)
687         sprintf(buf, "%u:%u %" PRIu64 "\n", major(dev), minor(dev), wbps);
688         r = cg_set_attribute("blkio", u->cgroup_path, "blkio.throttle.write_bps_device", buf);
690                 log_unit_full(u, IN_SET(r, -ENOENT, -EROFS, -EACCES) ? LOG_DEBUG : LOG_WARNING, r,
691                               "Failed to set blkio.throttle.write_bps_device: %m");
/* True if any unified-hierarchy memory setting (MemoryLow=, MemoryHigh=,
 * MemoryMax=, MemorySwapMax=) deviates from its default. */
696 static bool cgroup_context_has_unified_memory_config(CGroupContext *c) {
697         return c->memory_low > 0 || c->memory_high != CGROUP_LIMIT_MAX || c->memory_max != CGROUP_LIMIT_MAX || c->memory_swap_max != CGROUP_LIMIT_MAX;
/* Write one unified memory attribute (e.g. "memory.max"): the numeric value
 * v, or the literal string "max" when v is CGROUP_LIMIT_MAX. */
700 static void cgroup_apply_unified_memory_limit(Unit *u, const char *file, uint64_t v) {
/* buf is pre-initialized to "max" and only overwritten for finite limits. */
701         char buf[DECIMAL_STR_MAX(uint64_t) + 1] = "max";
704         if (v != CGROUP_LIMIT_MAX)
705                 xsprintf(buf, "%" PRIu64 "\n", v);
707         r = cg_set_attribute("memory", u->cgroup_path, file, buf);
709                 log_unit_full(u, IN_SET(r, -ENOENT, -EROFS, -EACCES) ? LOG_DEBUG : LOG_WARNING, r,
710                               "Failed to set %s: %m", file);
/* Compile and install the unit's BPF firewall program, best-effort: a failed
 * compilation simply skips the install step. */
713 static void cgroup_apply_firewall(Unit *u) {
716         /* Best-effort: let's apply IP firewalling and/or accounting if that's enabled */
718         if (bpf_firewall_compile(u) < 0)
721         (void) bpf_firewall_install(u);
/* Central worker that pushes a unit's cgroup configuration into the kernel.
 * For every controller requested in apply_mask it writes the relevant
 * attributes, translating between the unified ("cpu", "io", "memory") and
 * legacy ("cpu.shares", "blkio", "memory.limit_in_bytes") hierarchies in both
 * directions where only one side was configured; apply_bpf additionally
 * triggers BPF firewall setup. */
724 static void cgroup_context_apply(
726                 CGroupMask apply_mask,
728                 ManagerState state) {
737         /* Nothing to do? Exit early! */
738         if (apply_mask == 0 && !apply_bpf)
741         /* Some cgroup attributes are not supported on the root cgroup, hence silently ignore */
742         is_root = unit_has_root_cgroup(u);
744         assert_se(c = unit_get_cgroup_context(u));
745         assert_se(path = u->cgroup_path);
747         if (is_root) /* Make sure we don't try to display messages with an empty path. */
750         /* We generally ignore errors caused by read-only mounted
751         * cgroup trees (assuming we are running in a container then),
752         * and missing cgroups, i.e. EROFS and ENOENT. */
/* --- CPU controller: pick weight (unified) or shares (legacy), converting
 * from the other hierarchy's settings if necessary. --- */
754         if ((apply_mask & CGROUP_MASK_CPU) && !is_root) {
755                 bool has_weight, has_shares;
757                 has_weight = cgroup_context_has_cpu_weight(c);
758                 has_shares = cgroup_context_has_cpu_shares(c);
760                 if (cg_all_unified() > 0) {
764                                 weight = cgroup_context_cpu_weight(c, state);
765                         else if (has_shares) {
766                                 uint64_t shares = cgroup_context_cpu_shares(c, state);
768                                 weight = cgroup_cpu_shares_to_weight(shares);
770                                 log_cgroup_compat(u, "Applying [Startup]CpuShares %" PRIu64 " as [Startup]CpuWeight %" PRIu64 " on %s",
771                                                   shares, weight, path);
773                                 weight = CGROUP_WEIGHT_DEFAULT;
775                         cgroup_apply_unified_cpu_config(u, weight, c->cpu_quota_per_sec_usec);
780                                 uint64_t weight = cgroup_context_cpu_weight(c, state);
782                                 shares = cgroup_cpu_weight_to_shares(weight);
784                                 log_cgroup_compat(u, "Applying [Startup]CpuWeight %" PRIu64 " as [Startup]CpuShares %" PRIu64 " on %s",
785                                                   weight, shares, path);
786                         } else if (has_shares)
787                                 shares = cgroup_context_cpu_shares(c, state);
789                                 shares = CGROUP_CPU_SHARES_DEFAULT;
791                         cgroup_apply_legacy_cpu_config(u, shares, c->cpu_quota_per_sec_usec);
/* --- Unified "io" controller: default weight, per-device weights and
 * limits, falling back to translated blkio settings. --- */
795         if (apply_mask & CGROUP_MASK_IO) {
796                 bool has_io = cgroup_context_has_io_config(c);
797                 bool has_blockio = cgroup_context_has_blockio_config(c);
800                         char buf[8+DECIMAL_STR_MAX(uint64_t)+1];
804                                 weight = cgroup_context_io_weight(c, state);
805                         else if (has_blockio) {
806                                 uint64_t blkio_weight = cgroup_context_blkio_weight(c, state);
808                                 weight = cgroup_weight_blkio_to_io(blkio_weight);
810                                 log_cgroup_compat(u, "Applying [Startup]BlockIOWeight %" PRIu64 " as [Startup]IOWeight %" PRIu64,
811                                                   blkio_weight, weight);
813                                 weight = CGROUP_WEIGHT_DEFAULT;
815                         xsprintf(buf, "default %" PRIu64 "\n", weight);
816                         r = cg_set_attribute("io", path, "io.weight", buf);
818                                 log_unit_full(u, IN_SET(r, -ENOENT, -EROFS, -EACCES) ? LOG_DEBUG : LOG_WARNING, r,
819                                               "Failed to set io.weight: %m");
822                                 CGroupIODeviceWeight *w;
824                                 /* FIXME: no way to reset this list */
825                                 LIST_FOREACH(device_weights, w, c->io_device_weights)
826                                         cgroup_apply_io_device_weight(u, w->path, w->weight);
827                         } else if (has_blockio) {
828                                 CGroupBlockIODeviceWeight *w;
830                                 /* FIXME: no way to reset this list */
831                                 LIST_FOREACH(device_weights, w, c->blockio_device_weights) {
832                                         weight = cgroup_weight_blkio_to_io(w->weight);
834                                         log_cgroup_compat(u, "Applying BlockIODeviceWeight %" PRIu64 " as IODeviceWeight %" PRIu64 " for %s",
835                                                           w->weight, weight, w->path);
837                                         cgroup_apply_io_device_weight(u, w->path, weight);
842                 /* Apply limits and free ones without config. */
844                         CGroupIODeviceLimit *l, *next;
846                         LIST_FOREACH_SAFE(device_limits, l, next, c->io_device_limits) {
847                                 if (!cgroup_apply_io_device_limit(u, l->path, l->limits))
848                                         cgroup_context_free_io_device_limit(c, l);
850                 } else if (has_blockio) {
851                         CGroupBlockIODeviceBandwidth *b, *next;
853                         LIST_FOREACH_SAFE(device_bandwidths, b, next, c->blockio_device_bandwidths) {
854                                 uint64_t limits[_CGROUP_IO_LIMIT_TYPE_MAX];
855                                 CGroupIOLimitType type;
/* Start from the defaults, then fill in the two blkio bandwidth values. */
857                                 for (type = 0; type < _CGROUP_IO_LIMIT_TYPE_MAX; type++)
858                                         limits[type] = cgroup_io_limit_defaults[type];
860                                 limits[CGROUP_IO_RBPS_MAX] = b->rbps;
861                                 limits[CGROUP_IO_WBPS_MAX] = b->wbps;
863                                 log_cgroup_compat(u, "Applying BlockIO{Read|Write}Bandwidth %" PRIu64 " %" PRIu64 " as IO{Read|Write}BandwidthMax for %s",
864                                                   b->rbps, b->wbps, b->path);
866                                 if (!cgroup_apply_io_device_limit(u, b->path, limits))
867                                         cgroup_context_free_blockio_device_bandwidth(c, b);
/* --- Legacy "blkio" controller: mirror of the "io" branch above, with the
 * translation running in the opposite direction. --- */
872         if (apply_mask & CGROUP_MASK_BLKIO) {
873                 bool has_io = cgroup_context_has_io_config(c);
874                 bool has_blockio = cgroup_context_has_blockio_config(c);
877                         char buf[DECIMAL_STR_MAX(uint64_t)+1];
881                                 uint64_t io_weight = cgroup_context_io_weight(c, state);
883                                 weight = cgroup_weight_io_to_blkio(cgroup_context_io_weight(c, state));
885                                 log_cgroup_compat(u, "Applying [Startup]IOWeight %" PRIu64 " as [Startup]BlockIOWeight %" PRIu64,
887                         } else if (has_blockio)
888                                 weight = cgroup_context_blkio_weight(c, state);
890                                 weight = CGROUP_BLKIO_WEIGHT_DEFAULT;
892                         xsprintf(buf, "%" PRIu64 "\n", weight);
893                         r = cg_set_attribute("blkio", path, "blkio.weight", buf);
895                                 log_unit_full(u, IN_SET(r, -ENOENT, -EROFS, -EACCES) ? LOG_DEBUG : LOG_WARNING, r,
896                                               "Failed to set blkio.weight: %m");
899                                 CGroupIODeviceWeight *w;
901                                 /* FIXME: no way to reset this list */
902                                 LIST_FOREACH(device_weights, w, c->io_device_weights) {
903                                         weight = cgroup_weight_io_to_blkio(w->weight);
905                                         log_cgroup_compat(u, "Applying IODeviceWeight %" PRIu64 " as BlockIODeviceWeight %" PRIu64 " for %s",
906                                                           w->weight, weight, w->path);
908                                         cgroup_apply_blkio_device_weight(u, w->path, weight);
910                         } else if (has_blockio) {
911                                 CGroupBlockIODeviceWeight *w;
913                                 /* FIXME: no way to reset this list */
914                                 LIST_FOREACH(device_weights, w, c->blockio_device_weights)
915                                         cgroup_apply_blkio_device_weight(u, w->path, w->weight);
919                 /* Apply limits and free ones without config. */
921                         CGroupIODeviceLimit *l, *next;
923                         LIST_FOREACH_SAFE(device_limits, l, next, c->io_device_limits) {
924                                 log_cgroup_compat(u, "Applying IO{Read|Write}Bandwidth %" PRIu64 " %" PRIu64 " as BlockIO{Read|Write}BandwidthMax for %s",
925                                                   l->limits[CGROUP_IO_RBPS_MAX], l->limits[CGROUP_IO_WBPS_MAX], l->path);
927                                 if (!cgroup_apply_blkio_device_limit(u, l->path, l->limits[CGROUP_IO_RBPS_MAX], l->limits[CGROUP_IO_WBPS_MAX]))
928                                         cgroup_context_free_io_device_limit(c, l);
930                 } else if (has_blockio) {
931                         CGroupBlockIODeviceBandwidth *b, *next;
933                         LIST_FOREACH_SAFE(device_bandwidths, b, next, c->blockio_device_bandwidths)
934                                 if (!cgroup_apply_blkio_device_limit(u, b->path, b->rbps, b->wbps))
935                                         cgroup_context_free_blockio_device_bandwidth(c, b);
/* --- Memory controller: unified memory.{low,high,max,swap.max} vs. legacy
 * memory.limit_in_bytes, translating between MemoryLimit= and MemoryMax=. --- */
939         if ((apply_mask & CGROUP_MASK_MEMORY) && !is_root) {
940                 if (cg_all_unified() > 0) {
941                         uint64_t max, swap_max = CGROUP_LIMIT_MAX;
943                         if (cgroup_context_has_unified_memory_config(c)) {
945                                 swap_max = c->memory_swap_max;
947                                 max = c->memory_limit;
949                                 if (max != CGROUP_LIMIT_MAX)
950                                         log_cgroup_compat(u, "Applying MemoryLimit %" PRIu64 " as MemoryMax", max);
953                         cgroup_apply_unified_memory_limit(u, "memory.low", c->memory_low);
954                         cgroup_apply_unified_memory_limit(u, "memory.high", c->memory_high);
955                         cgroup_apply_unified_memory_limit(u, "memory.max", max);
956                         cgroup_apply_unified_memory_limit(u, "memory.swap.max", swap_max);
958                         char buf[DECIMAL_STR_MAX(uint64_t) + 1];
961                         if (cgroup_context_has_unified_memory_config(c)) {
963                                 log_cgroup_compat(u, "Applying MemoryMax %" PRIi64 " as MemoryLimit", val);
965                                 val = c->memory_limit;
967                         if (val == CGROUP_LIMIT_MAX)
968                                 strncpy(buf, "-1\n", sizeof(buf));
970                                 xsprintf(buf, "%" PRIu64 "\n", val);
972                         r = cg_set_attribute("memory", path, "memory.limit_in_bytes", buf);
974                                 log_unit_full(u, IN_SET(r, -ENOENT, -EROFS, -EACCES) ? LOG_DEBUG : LOG_WARNING, r,
975                                               "Failed to set memory.limit_in_bytes: %m");
/* --- Devices controller (cgroup v1 only): reset the list, then re-add the
 * static whitelist and all DeviceAllow= entries. --- */
979         if ((apply_mask & CGROUP_MASK_DEVICES) && !is_root) {
980                 CGroupDeviceAllow *a;
982                 /* Changing the devices list of a populated cgroup
983                 * might result in EINVAL, hence ignore EINVAL
986                 if (c->device_allow || c->device_policy != CGROUP_AUTO)
987                         r = cg_set_attribute("devices", path, "devices.deny", "a");
989                         r = cg_set_attribute("devices", path, "devices.allow", "a");
991                         log_unit_full(u, IN_SET(r, -ENOENT, -EROFS, -EINVAL, -EACCES) ? LOG_DEBUG : LOG_WARNING, r,
992                                       "Failed to reset devices.list: %m");
994                 if (c->device_policy == CGROUP_CLOSED ||
995                     (c->device_policy == CGROUP_AUTO && c->device_allow)) {
996                         static const char auto_devices[] =
997                                 "/dev/null\0" "rwm\0"
998                                 "/dev/zero\0" "rwm\0"
999                                 "/dev/full\0" "rwm\0"
1000                                 "/dev/random\0" "rwm\0"
1001                                 "/dev/urandom\0" "rwm\0"
1002                                 "/dev/tty\0" "rwm\0"
1003                                 "/dev/ptmx\0" "rwm\0"
1004                                 /* Allow /run/systemd/inaccessible/{chr,blk} devices for mapping InaccessiblePaths */
1005                                 "-/run/systemd/inaccessible/chr\0" "rwm\0"
1006                                 "-/run/systemd/inaccessible/blk\0" "rwm\0";
1010                         NULSTR_FOREACH_PAIR(x, y, auto_devices)
1011                                 whitelist_device(path, x, y);
1013                         /* PTS (/dev/pts) devices may not be duplicated, but accessed */
1014                         whitelist_major(path, "pts", 'c', "rw");
/* DeviceAllow= entries may be node paths, or "block-"/"char-" driver names. */
1017                 LIST_FOREACH(device_allow, a, c->device_allow) {
1033                         if (path_startswith(a->path, "/dev/"))
1034                                 whitelist_device(path, a->path, acc);
1035                         else if ((val = startswith(a->path, "block-")))
1036                                 whitelist_major(path, val, 'b', acc);
1037                         else if ((val = startswith(a->path, "char-")))
1038                                 whitelist_major(path, val, 'c', acc);
1040                                 log_unit_debug(u, "Ignoring device %s while writing cgroup attribute.", a->path);
/* --- pids controller: sysctl-based limit on the root cgroup, pids.max
 * elsewhere. --- */
1044         if (apply_mask & CGROUP_MASK_PIDS) {
1047                         /* So, the "pids" controller does not expose anything on the root cgroup, in order not to
1048                         * replicate knobs exposed elsewhere needlessly. We abstract this away here however, and when
1049                         * the knobs of the root cgroup are modified propagate this to the relevant sysctls. There's a
1050                         * non-obvious asymmetry however: unlike the cgroup properties we don't really want to take
1051                         * exclusive ownership of the sysctls, but we still want to honour things if the user sets
1052                         * limits. Hence we employ sort of a one-way strategy: when the user sets a bounded limit
1053                         * through us it counts. When the user afterwards unsets it again (i.e. sets it to unbounded)
1054                         * it also counts. But if the user never set a limit through us (i.e. we are the default of
1055                         * "unbounded") we leave things unmodified. For this we manage a global boolean that we turn on
1056                         * the first time we set a limit. Note that this boolean is flushed out on manager reload,
1057                         * which is desirable so that there's an offical way to release control of the sysctl from
1058                         * systemd: set the limit to unbounded and reload. */
1060                         if (c->tasks_max != CGROUP_LIMIT_MAX) {
1061                                 u->manager->sysctl_pid_max_changed = true;
1062                                 r = procfs_tasks_set_limit(c->tasks_max);
1063                         } else if (u->manager->sysctl_pid_max_changed)
1064                                 r = procfs_tasks_set_limit(TASKS_MAX);
1069                                 log_unit_full(u, IN_SET(r, -ENOENT, -EROFS, -EACCES) ? LOG_DEBUG : LOG_WARNING, r,
1070                                               "Failed to write to tasks limit sysctls: %m");
1073                 if (c->tasks_max != CGROUP_LIMIT_MAX) {
1074                         char buf[DECIMAL_STR_MAX(uint64_t) + 2];
1076                         sprintf(buf, "%" PRIu64 "\n", c->tasks_max);
1077                         r = cg_set_attribute("pids", path, "pids.max", buf);
1079                         r = cg_set_attribute("pids", path, "pids.max", "max");
1081                         log_unit_full(u, IN_SET(r, -ENOENT, -EROFS, -EACCES) ? LOG_DEBUG : LOG_WARNING, r,
1082                                       "Failed to set pids.max: %m");
/* Finally, BPF-based IP firewalling/accounting. */
1087                 cgroup_apply_firewall(u);
/* Compute the set of cgroup controllers this context's settings require,
 * enabling both the unified and legacy variant of each affected subsystem. */
1090 CGroupMask cgroup_context_get_mask(CGroupContext *c) {
1091         CGroupMask mask = 0;
1093         /* Figure out which controllers we need */
1095         if (c->cpu_accounting ||
1096             cgroup_context_has_cpu_weight(c) ||
1097             cgroup_context_has_cpu_shares(c) ||
1098             c->cpu_quota_per_sec_usec != USEC_INFINITY)
1099                 mask |= CGROUP_MASK_CPUACCT | CGROUP_MASK_CPU;
1101         if (cgroup_context_has_io_config(c) || cgroup_context_has_blockio_config(c))
1102                 mask |= CGROUP_MASK_IO | CGROUP_MASK_BLKIO;
1104         if (c->memory_accounting ||
1105             c->memory_limit != CGROUP_LIMIT_MAX ||
1106             cgroup_context_has_unified_memory_config(c))
1107                 mask |= CGROUP_MASK_MEMORY;
1109         if (c->device_allow ||
1110             c->device_policy != CGROUP_AUTO)
1111                 mask |= CGROUP_MASK_DEVICES;
1113         if (c->tasks_accounting ||
1114             c->tasks_max != CGROUP_LIMIT_MAX)
1115                 mask |= CGROUP_MASK_PIDS;
1120 CGroupMask unit_get_own_mask(Unit *u) {
1123         /* Returns the mask of controllers the unit needs for itself */
1125         c = unit_get_cgroup_context(u);
/* Own needs = what the context's settings require + delegated controllers. */
1129         return cgroup_context_get_mask(c) | unit_get_delegate_mask(u);
1132 CGroupMask unit_get_delegate_mask(Unit *u) {
1135         /* If delegation is turned on, then turn on selected controllers, unless we are on the legacy hierarchy and the
1136         * process we fork into is known to drop privileges, and hence shouldn't get access to the controllers.
1138         * Note that on the unified hierarchy it is safe to delegate controllers to unprivileged services. */
1140         if (!unit_cgroup_delegate(u))
/* On the legacy hierarchy, don't delegate to privilege-dropping services. */
1143         if (cg_all_unified() <= 0) {
1146                 e = unit_get_exec_context(u);
1147                 if (e && !exec_context_maintains_privileges(e))
1151         assert_se(c = unit_get_cgroup_context(u));
1152         return c->delegate_controllers;
/* Merged controller requirements of all the unit's children. The result is
 * cached in u->cgroup_members_mask (validity flag: cgroup_members_mask_valid)
 * and recomputed only after invalidation. Only slices have members; for them
 * we walk the UNIT_BEFORE dependency map and include each unit whose slice
 * reference points back at us. */
1155 CGroupMask unit_get_members_mask(Unit *u) {
1158 /* Returns the mask of controllers all of the unit's children require, merged */
1160 if (u->cgroup_members_mask_valid)
1161 return u->cgroup_members_mask;
1163 u->cgroup_members_mask = 0;
1165 if (u->type == UNIT_SLICE) {
1170 HASHMAP_FOREACH_KEY(v, member, u->dependencies[UNIT_BEFORE], i) {
1175 if (UNIT_DEREF(member->slice) != u)
1178 u->cgroup_members_mask |= unit_get_subtree_mask(member); /* note that this calls ourselves again, for the children */
1182 u->cgroup_members_mask_valid = true;
1183 return u->cgroup_members_mask;
/* Controllers required by all siblings of the unit, i.e. the members mask of
 * its parent slice; a unit without a slice is treated as the top-level slice
 * and falls back to its own subtree mask. */
1186 CGroupMask unit_get_siblings_mask(Unit *u) {
1189 /* Returns the mask of controllers all of the unit's siblings
1190 * require, i.e. the members mask of the unit's parent slice
1191 * if there is one. */
1193 if (UNIT_ISSET(u->slice))
1194 return unit_get_members_mask(UNIT_DEREF(u->slice));
1196 return unit_get_subtree_mask(u); /* we are the top-level slice */
/* Mask of the whole subtree rooted at this unit: the unit's own requirements
 * plus everything its children require. */
1199 CGroupMask unit_get_subtree_mask(Unit *u) {
1201 /* Returns the mask of this subtree, meaning of the group
1202 * itself and its children. */
1204 return unit_get_own_mask(u) | unit_get_members_mask(u);
/* Full set of controllers to enable for this unit's cgroup (own + children +
 * siblings), clamped to what the running kernel actually supports. */
1207 CGroupMask unit_get_target_mask(Unit *u) {
1210 /* This returns the cgroup mask of all controllers to enable
1211 * for a specific cgroup, i.e. everything it needs itself,
1212 * plus all that its children need, plus all that its siblings
1213 * need. This is primarily useful on the legacy cgroup
1214 * hierarchy, where we need to duplicate each cgroup in each
1215 * hierarchy that shall be enabled for it. */
1217 mask = unit_get_own_mask(u) | unit_get_members_mask(u) | unit_get_siblings_mask(u);
1218 mask &= u->manager->cgroup_supported;
/* Controllers to enable for the *children* of this unit's cgroup (written to
 * cgroup.subtree_control on the unified hierarchy), clamped to kernel
 * support. */
1223 CGroupMask unit_get_enable_mask(Unit *u) {
1226 /* This returns the cgroup mask of all controllers to enable
1227 * for the children of a specific cgroup. This is primarily
1228 * useful for the unified cgroup hierarchy, where each cgroup
1229 * controls which controllers are enabled for its children. */
1231 mask = unit_get_members_mask(u);
1232 mask &= u->manager->cgroup_supported;
/* Whether this unit needs a BPF firewall program: true if it enables IP
 * accounting or defines an IP access list itself, or if any parent slice
 * defines one (parent lists apply to children too). */
1237 bool unit_get_needs_bpf(Unit *u) {
1242 c = unit_get_cgroup_context(u);
1246 if (c->ip_accounting ||
1247 c->ip_address_allow ||
1251 /* If any parent slice has an IP access list defined, it applies too */
1252 for (p = UNIT_DEREF(u->slice); p; p = UNIT_DEREF(p->slice)) {
1253 c = unit_get_cgroup_context(p);
1257 if (c->ip_address_allow ||
1265 /* Recurse from a unit up through its containing slices, propagating
1266 * mask bits upward. A unit is also member of itself. */
1267 void unit_update_cgroup_members_masks(Unit *u) {
1273 /* Calculate subtree mask */
1274 m = unit_get_subtree_mask(u);
1276 /* See if anything changed from the previous invocation. If
1277 * not, we're done. */
1278 if (u->cgroup_subtree_mask_valid && m == u->cgroup_subtree_mask)
/* Determine whether the new mask is a strict superset of the cached one: in
 * that case the parent's members mask can be updated in place with |=,
 * otherwise the parent's cache has to be invalidated and recomputed. */
1282 u->cgroup_subtree_mask_valid &&
1283 ((m & ~u->cgroup_subtree_mask) != 0) &&
1284 ((~m & u->cgroup_subtree_mask) == 0);
1286 u->cgroup_subtree_mask = m;
1287 u->cgroup_subtree_mask_valid = true;
1289 if (UNIT_ISSET(u->slice)) {
1290 Unit *s = UNIT_DEREF(u->slice);
1293 /* There's more set now than before. We
1294 * propagate the new mask to the parent's mask
1295 * (not caring if it actually was valid or
1298 s->cgroup_members_mask |= m;
1301 /* There's less set now than before (or we
1302 * don't know), we need to recalculate
1303 * everything, so let's invalidate the
1304 * parent's members mask */
1306 s->cgroup_members_mask_valid = false;
1308 /* And now make sure that this change also hits our
1310 unit_update_cgroup_members_masks(s);
/* Walk up the slice chain and return the cgroup path of the closest unit
 * that is realized with all controllers in `mask` available. Falls back to
 * parent slices when the unit itself doesn't qualify. */
1314 const char *unit_get_realized_cgroup_path(Unit *u, CGroupMask mask) {
1316 /* Returns the realized cgroup path of the specified unit where all specified controllers are available. */
1320 if (u->cgroup_path &&
1321 u->cgroup_realized &&
1322 (u->cgroup_realized_mask & mask) == mask)
1323 return u->cgroup_path;
1325 u = UNIT_DEREF(u->slice);
/* cg_migrate_everywhere() callback: resolve a realized cgroup path for the
 * Unit passed as userdata. */
1331 static const char *migrate_callback(CGroupMask mask, void *userdata) {
1332 return unit_get_realized_cgroup_path(userdata, mask);
/* Build the default cgroup path for a unit: manager root for the root slice,
 * otherwise <root>/<slice-path>/<escaped-unit-id> (slice component omitted
 * when the unit lives directly under the root slice). Returns a newly
 * allocated string owned by the caller. */
1335 char *unit_default_cgroup_path(Unit *u) {
1336 _cleanup_free_ char *escaped = NULL, *slice = NULL;
1341 if (unit_has_name(u, SPECIAL_ROOT_SLICE))
1342 return strdup(u->manager->cgroup_root);
1344 if (UNIT_ISSET(u->slice) && !unit_has_name(UNIT_DEREF(u->slice), SPECIAL_ROOT_SLICE)) {
1345 r = cg_slice_to_path(UNIT_DEREF(u->slice)->id, &slice);
1350 escaped = cg_escape(u->id);
1355 return strjoin(u->manager->cgroup_root, "/", slice, "/",
1358 return strjoin(u->manager->cgroup_root, "/", escaped);
/* Record `path` as the unit's cgroup path and register it in the manager's
 * cgroup->unit hashmap; a no-op when the path is unchanged. Releases any
 * previously stored cgroup first. */
1361 int unit_set_cgroup_path(Unit *u, const char *path) {
1362 _cleanup_free_ char *p = NULL;
1374 if (streq_ptr(u->cgroup_path, p))
1378 r = hashmap_put(u->manager->cgroup_unit, p, u);
1383 unit_release_cgroup(u);
/* Install an inotify watch on the unit's cgroup.events file so we learn when
 * the cgroup runs empty. Unified-hierarchy only; the root slice and units
 * without a cgroup path are skipped, and a watch that already exists is kept.
 * ENOENT on a vanished cgroup is treated as success. */
1391 int unit_watch_cgroup(Unit *u) {
1392 _cleanup_free_ char *events = NULL;
1397 if (!u->cgroup_path)
1400 if (u->cgroup_inotify_wd >= 0)
1403 /* Only applies to the unified hierarchy */
1404 r = cg_unified_controller(SYSTEMD_CGROUP_CONTROLLER);
1406 return log_error_errno(r, "Failed to determine whether the name=systemd hierarchy is unified: %m");
1410 /* Don't watch the root slice, it's pointless. */
1411 if (unit_has_name(u, SPECIAL_ROOT_SLICE))
1414 r = hashmap_ensure_allocated(&u->manager->cgroup_inotify_wd_unit, &trivial_hash_ops);
1418 r = cg_get_path(SYSTEMD_CGROUP_CONTROLLER, u->cgroup_path, "cgroup.events", &events);
1422 u->cgroup_inotify_wd = inotify_add_watch(u->manager->cgroup_inotify_fd, events, IN_MODIFY);
1423 if (u->cgroup_inotify_wd < 0) {
1425 if (errno == ENOENT) /* If the directory is already
1426 * gone we don't need to track
1427 * it, so this is not an error */
1430 return log_unit_error_errno(u, errno, "Failed to add inotify watch descriptor for control group %s: %m", u->cgroup_path);
1433 r = hashmap_put(u->manager->cgroup_inotify_wd_unit, INT_TO_PTR(u->cgroup_inotify_wd), u);
1435 return log_unit_error_errno(u, r, "Failed to add inotify watch descriptor to hash map: %m");
/* Choose and store the unit's cgroup path (the default derived from its
 * slice chain). EEXIST from unit_set_cgroup_path() means another unit
 * already claimed that path. */
1440 int unit_pick_cgroup_path(Unit *u) {
1441 _cleanup_free_ char *path = NULL;
1449 if (!UNIT_HAS_CGROUP_CONTEXT(u))
1452 path = unit_default_cgroup_path(u);
1456 r = unit_set_cgroup_path(u, path);
1458 return log_unit_error_errno(u, r, "Control group %s exists already.", path);
1460 return log_unit_error_errno(u, r, "Failed to set unit's control group path to %s: %m", path);
/* Realize the unit's cgroup on disk: pick a path, create the group in every
 * needed hierarchy, start watching it, enable the requested controllers
 * (failure to enable is logged but non-fatal), record the realized state,
 * and finally migrate stray processes into it — except for slices and
 * delegated units, whose subgroups may own processes we must not touch. */
1465 static int unit_create_cgroup(
1467 CGroupMask target_mask,
1468 CGroupMask enable_mask,
1476 c = unit_get_cgroup_context(u);
1480 /* Figure out our cgroup path */
1481 r = unit_pick_cgroup_path(u);
1485 /* First, create our own group */
1486 r = cg_create_everywhere(u->manager->cgroup_supported, target_mask, u->cgroup_path);
1488 return log_unit_error_errno(u, r, "Failed to create cgroup %s: %m", u->cgroup_path);
1490 /* Start watching it */
1491 (void) unit_watch_cgroup(u);
1493 /* Enable all controllers we need */
1494 r = cg_enable_everywhere(u->manager->cgroup_supported, enable_mask, u->cgroup_path);
1496 log_unit_warning_errno(u, r, "Failed to enable controllers on cgroup %s, ignoring: %m", u->cgroup_path);
1498 /* Keep track that this is now realized */
1499 u->cgroup_realized = true;
1500 u->cgroup_realized_mask = target_mask;
1501 u->cgroup_enabled_mask = enable_mask;
1502 u->cgroup_bpf_state = needs_bpf ? UNIT_CGROUP_BPF_ON : UNIT_CGROUP_BPF_OFF;
1504 if (u->type != UNIT_SLICE && !unit_cgroup_delegate(u)) {
1506 /* Then, possibly move things over, but not if
1507 * subgroups may contain processes, which is the case
1508 * for slice and delegation units. */
1509 r = cg_migrate_everywhere(u->manager->cgroup_supported, u->cgroup_path, u->cgroup_path, migrate_callback, u);
/* NOTE(review): the message below reads "from to %s" — looks like a typo for
 * "to %s"; it is a runtime string, so left untouched here. */
1511 log_unit_warning_errno(u, r, "Failed to migrate cgroup from to %s, ignoring: %m", u->cgroup_path);
/* Fallback used by user managers: ask the privileged system instance over
 * D-Bus (AttachProcessesToUnit) to move `pid` into our cgroup at
 * `suffix_path`, for cases where we lack permission to do it ourselves.
 * Requires a system bus connection and a realized cgroup path. */
1517 static int unit_attach_pid_to_cgroup_via_bus(Unit *u, pid_t pid, const char *suffix_path) {
1518 _cleanup_(sd_bus_error_free) sd_bus_error error = SD_BUS_ERROR_NULL;
1524 if (MANAGER_IS_SYSTEM(u->manager))
1527 if (!u->manager->system_bus)
1530 if (!u->cgroup_path)
1533 /* Determine this unit's cgroup path relative to our cgroup root */
1534 pp = path_startswith(u->cgroup_path, u->manager->cgroup_root);
1538 pp = strjoina("/", pp, suffix_path);
1539 path_kill_slashes(pp);
1541 r = sd_bus_call_method(u->manager->system_bus,
1542 "org.freedesktop.systemd1",
1543 "/org/freedesktop/systemd1",
1544 "org.freedesktop.systemd1.Manager",
1545 "AttachProcessesToUnit",
1548 NULL /* empty unit name means client's unit, i.e. us */, pp, 1, (uint32_t) pid);
1550 return log_unit_debug_errno(u, r, "Failed to attach unit process " PID_FMT " via the bus: %s", pid, bus_error_message(&error, r));
/* Attach a set of PIDs to this unit's cgroup (optionally a sub-path given by
 * `suffix_path`). Realizes the cgroup first, then for each PID: attach on
 * the main (name=systemd) hierarchy, falling back to asking the system
 * instance via D-Bus on EPERM/EACCES in user managers; on the legacy
 * hierarchies, additionally attach per controller — honoring the suffix only
 * for delegated+realized controllers, otherwise using the nearest realized
 * ancestor cgroup. Returns the first error seen, continuing past failures. */
1555 int unit_attach_pids_to_cgroup(Unit *u, Set *pids, const char *suffix_path) {
1556 CGroupMask delegated_mask;
1564 if (!UNIT_HAS_CGROUP_CONTEXT(u))
1567 if (set_isempty(pids))
1570 r = unit_realize_cgroup(u);
1574 if (isempty(suffix_path))
1577 p = strjoina(u->cgroup_path, "/", suffix_path);
1579 delegated_mask = unit_get_delegate_mask(u);
1582 SET_FOREACH(pidp, pids, i) {
1583 pid_t pid = PTR_TO_PID(pidp);
1586 /* First, attach the PID to the main cgroup hierarchy */
1587 q = cg_attach(SYSTEMD_CGROUP_CONTROLLER, p, pid);
1589 log_unit_debug_errno(u, q, "Couldn't move process " PID_FMT " to requested cgroup '%s': %m", pid, p);
1591 if (MANAGER_IS_USER(u->manager) && IN_SET(q, -EPERM, -EACCES)) {
1594 /* If we are in a user instance, and we can't move the process ourselves due to
1595 * permission problems, let's ask the system instance about it instead. Since it's more
1596 * privileged it might be able to move the process across the leaves of a subtree who's
1597 * top node is not owned by us. */
1599 z = unit_attach_pid_to_cgroup_via_bus(u, pid, suffix_path);
1601 log_unit_debug_errno(u, z, "Couldn't move process " PID_FMT " to requested cgroup '%s' via the system bus either: %m", pid, p);
1603 continue; /* When the bus thing worked via the bus we are fully done for this PID. */
1607 r = q; /* Remember first error */
1612 q = cg_all_unified();
1618 /* In the legacy hierarchy, attach the process to the request cgroup if possible, and if not to the
1619 * innermost realized one */
1621 for (c = 0; c < _CGROUP_CONTROLLER_MAX; c++) {
1622 CGroupMask bit = CGROUP_CONTROLLER_TO_MASK(c);
1623 const char *realized;
1625 if (!(u->manager->cgroup_supported & bit))
1628 /* If this controller is delegated and realized, honour the caller's request for the cgroup suffix. */
1629 if (delegated_mask & u->cgroup_realized_mask & bit) {
1630 q = cg_attach(cgroup_controller_to_string(c), p, pid);
1632 continue; /* Success! */
1634 log_unit_debug_errno(u, q, "Failed to attach PID " PID_FMT " to requested cgroup %s in controller %s, falling back to unit's cgroup: %m",
1635 pid, p, cgroup_controller_to_string(c));
1638 /* So this controller is either not delegate or realized, or something else weird happened. In
1639 * that case let's attach the PID at least to the closest cgroup up the tree that is
1641 realized = unit_get_realized_cgroup_path(u, bit);
1643 continue; /* Not even realized in the root slice? Then let's not bother */
1645 q = cg_attach(cgroup_controller_to_string(c), realized, pid);
1647 log_unit_debug_errno(u, q, "Failed to attach PID " PID_FMT " to realized cgroup %s in controller %s, ignoring: %m",
1648 pid, realized, cgroup_controller_to_string(c));
/* Stamp the unit's invocation ID onto its cgroup as the
 * "trusted.invocation_id" xattr (system manager only, and only once an
 * invocation ID exists). Failure is logged at debug level and ignored. */
1655 static void cgroup_xattr_apply(Unit *u) {
1656 char ids[SD_ID128_STRING_MAX];
1661 if (!MANAGER_IS_SYSTEM(u->manager))
1664 if (sd_id128_is_null(u->invocation_id))
1667 r = cg_set_xattr(SYSTEMD_CGROUP_CONTROLLER, u->cgroup_path,
1668 "trusted.invocation_id",
1669 sd_id128_to_string(u->invocation_id, ids), 32,
1672 log_unit_debug_errno(u, r, "Failed to set invocation ID on control group %s, ignoring: %m", u->cgroup_path);
/* True if the unit's cgroup is already realized exactly as requested:
 * realized flag set, target and enable masks equal, and the BPF state
 * matching the needs_bpf request. */
1675 static bool unit_has_mask_realized(
1677 CGroupMask target_mask,
1678 CGroupMask enable_mask,
1683 return u->cgroup_realized &&
1684 u->cgroup_realized_mask == target_mask &&
1685 u->cgroup_enabled_mask == enable_mask &&
1686 ((needs_bpf && u->cgroup_bpf_state == UNIT_CGROUP_BPF_ON) ||
1687 (!needs_bpf && u->cgroup_bpf_state == UNIT_CGROUP_BPF_OFF));
/* Enqueue the unit on the manager's cgroup realize queue (idempotent). */
1690 static void unit_add_to_cgroup_realize_queue(Unit *u) {
1693 if (u->in_cgroup_realize_queue)
1696 LIST_PREPEND(cgroup_realize_queue, u->manager->cgroup_realize_queue, u);
1697 u->in_cgroup_realize_queue = true;
/* Dequeue the unit from the manager's cgroup realize queue (idempotent). */
1700 static void unit_remove_from_cgroup_realize_queue(Unit *u) {
1703 if (!u->in_cgroup_realize_queue)
1706 LIST_REMOVE(cgroup_realize_queue, u->manager->cgroup_realize_queue, u);
1707 u->in_cgroup_realize_queue = false;
1711 /* Check if necessary controllers and attributes for a unit are in place.
1713 * If so, do nothing.
1714 * If not, create paths, move processes over, and set attributes.
1716 * Returns 0 on success and < 0 on failure. */
1717 static int unit_realize_cgroup_now(Unit *u, ManagerState state) {
1718 CGroupMask target_mask, enable_mask;
1719 bool needs_bpf, apply_bpf;
1724 unit_remove_from_cgroup_realize_queue(u);
1726 target_mask = unit_get_target_mask(u);
1727 enable_mask = unit_get_enable_mask(u);
1728 needs_bpf = unit_get_needs_bpf(u);
1730 if (unit_has_mask_realized(u, target_mask, enable_mask, needs_bpf))
1733 /* Make sure we apply the BPF filters either when one is configured, or if none is configured but previously
1734 * the state was anything but off. This way, if a unit with a BPF filter applied is reconfigured to lose it
1735 * this will trickle down properly to cgroupfs. */
1736 apply_bpf = needs_bpf || u->cgroup_bpf_state != UNIT_CGROUP_BPF_OFF;
1738 /* First, realize parents */
1739 if (UNIT_ISSET(u->slice)) {
1740 r = unit_realize_cgroup_now(UNIT_DEREF(u->slice), state);
1745 /* And then do the real work */
1746 r = unit_create_cgroup(u, target_mask, enable_mask, needs_bpf);
1750 /* Finally, apply the necessary attributes. */
1751 cgroup_context_apply(u, target_mask, apply_bpf, state);
1752 cgroup_xattr_apply(u);
/* Drain the manager's cgroup realize queue, realizing each queued unit;
 * units that turned inactive in the meantime are just dequeued. Realization
 * failures are logged and skipped, not propagated. */
1757 unsigned manager_dispatch_cgroup_realize_queue(Manager *m) {
1765 state = manager_state(m);
1767 while ((i = m->cgroup_realize_queue)) {
1768 assert(i->in_cgroup_realize_queue);
1770 if (UNIT_IS_INACTIVE_OR_FAILED(unit_active_state(i))) {
1771 /* Maybe things changed, and the unit is not actually active anymore? */
1772 unit_remove_from_cgroup_realize_queue(i);
1776 r = unit_realize_cgroup_now(i, state);
1778 log_warning_errno(r, "Failed to realize cgroups for queued unit %s, ignoring: %m", i->id);
/* Queue for realization all siblings of the unit and of each of its
 * ancestors — but never the unit itself or the parents. Units that merely
 * depend on a slice without being in it, inactive units, and units already
 * fully realized are skipped. */
1786 static void unit_add_siblings_to_cgroup_realize_queue(Unit *u) {
1789 /* This adds the siblings of the specified unit and the
1790 * siblings of all parent units to the cgroup queue. (But
1791 * neither the specified unit itself nor the parents.) */
1793 while ((slice = UNIT_DEREF(u->slice))) {
1798 HASHMAP_FOREACH_KEY(v, m, u->dependencies[UNIT_BEFORE], i) {
1802 /* Skip units that have a dependency on the slice
1803 * but aren't actually in it. */
1804 if (UNIT_DEREF(m->slice) != slice)
1807 /* No point in doing cgroup application for units
1808 * without active processes. */
1809 if (UNIT_IS_INACTIVE_OR_FAILED(unit_active_state(m)))
1812 /* If the unit doesn't need any new controllers
1813 * and has current ones realized, it doesn't need
1815 if (unit_has_mask_realized(m,
1816 unit_get_target_mask(m),
1817 unit_get_enable_mask(m),
1818 unit_get_needs_bpf(m)))
1821 unit_add_to_cgroup_realize_queue(m);
/* Public entry point: realize this unit's cgroup synchronously (parents
 * included), while deferring sibling realization to the realize queue so the
 * weight-based controllers stay balanced across the slice. */
1828 int unit_realize_cgroup(Unit *u) {
1831 if (!UNIT_HAS_CGROUP_CONTEXT(u))
1834 /* So, here's the deal: when realizing the cgroups for this
1835 * unit, we need to first create all parents, but there's more
1836 * actually: for the weight-based controllers we also need to
1837 * make sure that all our siblings (i.e. units that are in the
1838 * same slice as we are) have cgroups, too. Otherwise, things
1839 * would become very uneven as each of their processes would
1840 * get as much resources as all our group together. This call
1841 * will synchronously create the parent cgroups, but will
1842 * defer work on the siblings to the next event loop
1845 /* Add all sibling slices to the cgroup queue. */
1846 unit_add_siblings_to_cgroup_realize_queue(u);
1848 /* And realize this one now (and apply the values) */
1849 return unit_realize_cgroup_now(u, manager_state(u->manager));
/* Forget all cgroup bookkeeping for the unit: drop it from the
 * cgroup->unit map, free the stored path, and remove + unregister its
 * inotify watch if one is installed. Does not touch the cgroup on disk. */
1852 void unit_release_cgroup(Unit *u) {
1855 /* Forgets all cgroup details for this cgroup */
1857 if (u->cgroup_path) {
1858 (void) hashmap_remove(u->manager->cgroup_unit, u->cgroup_path);
1859 u->cgroup_path = mfree(u->cgroup_path);
1862 if (u->cgroup_inotify_wd >= 0) {
1863 if (inotify_rm_watch(u->manager->cgroup_inotify_fd, u->cgroup_inotify_wd) < 0)
1864 log_unit_debug_errno(u, errno, "Failed to remove cgroup inotify watch %i for %s, ignoring", u->cgroup_inotify_wd, u->id);
1866 (void) hashmap_remove(u->manager->cgroup_inotify_wd_unit, INT_TO_PTR(u->cgroup_inotify_wd));
1867 u->cgroup_inotify_wd = -1;
/* Remove the unit's cgroup from disk if empty (keeping the root slice's
 * group itself), after caching the final CPU usage, then forget all
 * realization state. Trim failures are logged at debug level and ignored. */
1871 void unit_prune_cgroup(Unit *u) {
1877 /* Removes the cgroup, if empty and possible, and stops watching it. */
1879 if (!u->cgroup_path)
1882 (void) unit_get_cpu_usage(u, NULL); /* Cache the last CPU usage value before we destroy the cgroup */
1884 is_root_slice = unit_has_name(u, SPECIAL_ROOT_SLICE);
1886 r = cg_trim_everywhere(u->manager->cgroup_supported, u->cgroup_path, !is_root_slice);
1888 log_unit_debug_errno(u, r, "Failed to destroy cgroup %s, ignoring: %m", u->cgroup_path);
1895 unit_release_cgroup(u);
1897 u->cgroup_realized = false;
1898 u->cgroup_realized_mask = 0;
1899 u->cgroup_enabled_mask = 0;
/* Try to guess the unit's main PID by enumerating its cgroup: candidates are
 * processes whose parent is NOT us (i.e. daemonized children); finding more
 * than one such process means the main PID is ambiguous. */
1902 int unit_search_main_pid(Unit *u, pid_t *ret) {
1903 _cleanup_fclose_ FILE *f = NULL;
1904 pid_t pid = 0, npid, mypid;
1910 if (!u->cgroup_path)
1913 r = cg_enumerate_processes(SYSTEMD_CGROUP_CONTROLLER, u->cgroup_path, &f);
1917 mypid = getpid_cached();
1918 while (cg_read_pid(f, &npid) > 0) {
1924 /* Ignore processes that aren't our kids */
1925 if (get_process_ppid(npid, &ppid) >= 0 && ppid != mypid)
1929 /* Dang, there's more than one daemonized PID
1930 in this group, so we don't know what process
1931 is the main process. */
/* Recursively add every PID under `path` (and its subgroups) to the unit's
 * watched-PID set. Errors are collected into `ret` (first failure wins) but
 * enumeration continues so we watch as much as possible. */
1942 static int unit_watch_pids_in_path(Unit *u, const char *path) {
1943 _cleanup_closedir_ DIR *d = NULL;
1944 _cleanup_fclose_ FILE *f = NULL;
1950 r = cg_enumerate_processes(SYSTEMD_CGROUP_CONTROLLER, path, &f);
1956 while ((r = cg_read_pid(f, &pid)) > 0) {
1957 r = unit_watch_pid(u, pid);
1958 if (r < 0 && ret >= 0)
1962 if (r < 0 && ret >= 0)
1966 r = cg_enumerate_subgroups(SYSTEMD_CGROUP_CONTROLLER, path, &d);
1973 while ((r = cg_read_subgroup(d, &fn)) > 0) {
1974 _cleanup_free_ char *p = NULL;
1976 p = strjoin(path, "/", fn);
1982 r = unit_watch_pids_in_path(u, p);
1983 if (r < 0 && ret >= 0)
1987 if (r < 0 && ret >= 0)
/* On the legacy hierarchy (where empty notifications are unreliable),
 * enqueue a synthetic cgroup-empty event once the unit has no watched PIDs
 * left. On the unified hierarchy this is unnecessary and skipped. */
1994 int unit_synthesize_cgroup_empty_event(Unit *u) {
1999 /* Enqueue a synthetic cgroup empty event if this unit doesn't watch any PIDs anymore. This is compatibility
2000 * support for non-unified systems where notifications aren't reliable, and hence need to take whatever we can
2001 * get as notification source as soon as we stopped having any useful PIDs to watch for. */
2003 if (!u->cgroup_path)
2006 r = cg_unified_controller(SYSTEMD_CGROUP_CONTROLLER);
2009 if (r > 0) /* On unified we have reliable notifications, and don't need this */
2012 if (!set_isempty(u->pids))
2015 unit_add_to_cgroup_empty_queue(u);
/* Legacy-hierarchy fallback: watch every PID in the unit's cgroup via
 * SIGCHLD, since reliable cgroup-empty notifications are unavailable there.
 * No-op on the unified hierarchy or without a cgroup path. */
2019 int unit_watch_all_pids(Unit *u) {
2024 /* Adds all PIDs from our cgroup to the set of PIDs we
2025 * watch. This is a fallback logic for cases where we do not
2026 * get reliable cgroup empty notifications: we try to use
2027 * SIGCHLD as replacement. */
2029 if (!u->cgroup_path)
2032 r = cg_unified_controller(SYSTEMD_CGROUP_CONTROLLER);
2035 if (r > 0) /* On unified we can use proper notifications */
2038 return unit_watch_pids_in_path(u, u->cgroup_path);
/* Defer-event handler: pop one unit off the manager's cgroup-empty queue,
 * re-arm the event source if more are queued (so we process one unit per
 * event-loop iteration), then GC-queue the unit and invoke its
 * notify_cgroup_empty vtable hook. */
2041 static int on_cgroup_empty_event(sd_event_source *s, void *userdata) {
2042 Manager *m = userdata;
2049 u = m->cgroup_empty_queue;
2053 assert(u->in_cgroup_empty_queue);
2054 u->in_cgroup_empty_queue = false;
2055 LIST_REMOVE(cgroup_empty_queue, m->cgroup_empty_queue, u);
2057 if (m->cgroup_empty_queue) {
2058 /* More stuff queued, let's make sure we remain enabled */
2059 r = sd_event_source_set_enabled(s, SD_EVENT_ONESHOT);
2061 log_debug_errno(r, "Failed to reenable cgroup empty event source: %m");
2064 unit_add_to_gc_queue(u);
2066 if (UNIT_VTABLE(u)->notify_cgroup_empty)
2067 UNIT_VTABLE(u)->notify_cgroup_empty(u);
/* Verify the unit's cgroup is really empty and, if so, enqueue it on the
 * cgroup-empty queue and arm the deferred dispatcher. All notification paths
 * (inotify, agent datagram, D-Bus signal, SIGCHLD fallback) funnel through
 * here so the low-priority queue can prefer SIGCHLD when both race. */
2072 void unit_add_to_cgroup_empty_queue(Unit *u) {
2077 /* Note that there are four different ways how cgroup empty events reach us:
2079 * 1. On the unified hierarchy we get an inotify event on the cgroup
2081 * 2. On the legacy hierarchy, when running in system mode, we get a datagram on the cgroup agent socket
2083 * 3. On the legacy hierarchy, when running in user mode, we get a D-Bus signal on the system bus
2085 * 4. On the legacy hierarchy, in service units we start watching all processes of the cgroup for SIGCHLD as
2086 * soon as we get one SIGCHLD, to deal with unreliable cgroup notifications.
2088 * Regardless which way we got the notification, we'll verify it here, and then add it to a separate
2089 * queue. This queue will be dispatched at a lower priority than the SIGCHLD handler, so that we always use
2090 * SIGCHLD if we can get it first, and only use the cgroup empty notifications if there's no SIGCHLD pending
2091 * (which might happen if the cgroup doesn't contain processes that are our own child, which is typically the
2092 * case for scope units). */
2094 if (u->in_cgroup_empty_queue)
2097 /* Let's verify that the cgroup is really empty */
2098 if (!u->cgroup_path)
2100 r = cg_is_empty_recursive(SYSTEMD_CGROUP_CONTROLLER, u->cgroup_path);
2102 log_unit_debug_errno(u, r, "Failed to determine whether cgroup %s is empty: %m", u->cgroup_path);
2108 LIST_PREPEND(cgroup_empty_queue, u->manager->cgroup_empty_queue, u);
2109 u->in_cgroup_empty_queue = true;
2111 /* Trigger the defer event */
2112 r = sd_event_source_set_enabled(u->manager->cgroup_empty_event_source, SD_EVENT_ONESHOT);
2114 log_debug_errno(r, "Failed to enable cgroup empty event source: %m");
/* IO-event handler for the cgroup inotify fd (unified hierarchy): drain the
 * fd, map each watch descriptor back to its Unit, and queue a cgroup-empty
 * check for it. EINTR/EAGAIN on read mean "done for now". */
2117 static int on_cgroup_inotify_event(sd_event_source *s, int fd, uint32_t revents, void *userdata) {
2118 Manager *m = userdata;
2125 union inotify_event_buffer buffer;
2126 struct inotify_event *e;
2129 l = read(fd, &buffer, sizeof(buffer));
2131 if (IN_SET(errno, EINTR, EAGAIN))
2134 return log_error_errno(errno, "Failed to read control group inotify events: %m");
2137 FOREACH_INOTIFY_EVENT(e, buffer, l) {
2141 /* Queue overflow has no watch descriptor */
2144 if (e->mask & IN_IGNORED)
2145 /* The watch was just removed */
2148 u = hashmap_get(m->cgroup_inotify_wd_unit, INT_TO_PTR(e->wd));
2149 if (!u) /* Note that inotify might deliver
2150 * events for a watch even after it
2151 * was removed, because it was queued
2152 * before the removal. Let's ignore
2153 * this here safely. */
2156 unit_add_to_cgroup_empty_queue(u);
/* One-time cgroup setup for the manager: determine our cgroup root, detect
 * unified vs. legacy hierarchy, install the empty-event machinery (inotify
 * on unified, release agent on legacy for systemd builds; elogind installs
 * its agent elsewhere), create/attach our own scope group, pin the cgroupfs
 * mount, enable hierarchical memory accounting on legacy, and probe which
 * controllers the kernel supports. The #if 0 sections mark systemd-only
 * code paths disabled in this elogind fork. */
2162 int manager_setup_cgroup(Manager *m) {
2163 _cleanup_free_ char *path = NULL;
2164 const char *scope_path;
2167 #if 0 /// UNNEEDED by elogind
2173 /* 1. Determine hierarchy */
2174 m->cgroup_root = mfree(m->cgroup_root);
2175 #if 0 /// elogind is not init and must therefore search for PID 1 instead of self.
2176 r = cg_pid_get_path(SYSTEMD_CGROUP_CONTROLLER, 0, &m->cgroup_root);
2178 r = cg_pid_get_path(SYSTEMD_CGROUP_CONTROLLER, 1, &m->cgroup_root);
2181 return log_error_errno(r, "Cannot determine cgroup we are running in: %m");
2183 #if 0 /// elogind does not support systemd scopes and slices
2184 /* Chop off the init scope, if we are already located in it */
2185 e = endswith(m->cgroup_root, "/" SPECIAL_INIT_SCOPE);
2187 /* LEGACY: Also chop off the system slice if we are in
2188 * it. This is to support live upgrades from older systemd
2189 * versions where PID 1 was moved there. Also see
2190 * cg_get_root_path(). */
2191 if (!e && MANAGER_IS_SYSTEM(m)) {
2192 e = endswith(m->cgroup_root, "/" SPECIAL_SYSTEM_SLICE);
2194 e = endswith(m->cgroup_root, "/system"); /* even more legacy */
2200 log_debug_elogind("Cgroup Controller \"%s\" -> root \"%s\"",
2201 SYSTEMD_CGROUP_CONTROLLER, m->cgroup_root);
2202 /* And make sure to store away the root value without trailing slash, even for the root dir, so that we can
2203 * easily prepend it everywhere. */
2204 delete_trailing_chars(m->cgroup_root, "/");
/* 2. Detect unified vs. legacy hierarchy layout and log the result. */
2207 r = cg_get_path(SYSTEMD_CGROUP_CONTROLLER, m->cgroup_root, NULL, &path);
2209 return log_error_errno(r, "Cannot find cgroup mount point: %m");
2211 r = cg_unified_flush();
2213 return log_error_errno(r, "Couldn't determine if we are running in the unified hierarchy: %m");
2215 all_unified = cg_all_unified();
2216 if (all_unified < 0)
2217 return log_error_errno(all_unified, "Couldn't determine whether we are in all unified mode: %m");
2218 if (all_unified > 0)
2219 log_debug("Unified cgroup hierarchy is located at %s.", path);
2221 r = cg_unified_controller(SYSTEMD_CGROUP_CONTROLLER);
2223 return log_error_errno(r, "Failed to determine whether systemd's own controller is in unified mode: %m");
2225 log_debug("Unified cgroup hierarchy is located at %s. Controllers are on legacy hierarchies.", path);
2227 log_debug("Using cgroup controller " SYSTEMD_CGROUP_CONTROLLER_LEGACY ". File system hierarchy is at %s.", path);
2230 #if 0 /// elogind is not init, and does not install the agent here.
2231 /* 3. Allocate cgroup empty defer event source */
2232 m->cgroup_empty_event_source = sd_event_source_unref(m->cgroup_empty_event_source);
2233 r = sd_event_add_defer(m->event, &m->cgroup_empty_event_source, on_cgroup_empty_event, m);
2235 return log_error_errno(r, "Failed to create cgroup empty event source: %m");
2237 r = sd_event_source_set_priority(m->cgroup_empty_event_source, SD_EVENT_PRIORITY_NORMAL-5);
2239 return log_error_errno(r, "Failed to set priority of cgroup empty event source: %m");
2241 r = sd_event_source_set_enabled(m->cgroup_empty_event_source, SD_EVENT_OFF);
2243 return log_error_errno(r, "Failed to disable cgroup empty event source: %m");
2245 (void) sd_event_source_set_description(m->cgroup_empty_event_source, "cgroup-empty");
2247 /* 4. Install notifier inotify object, or agent */
2248 if (cg_unified_controller(SYSTEMD_CGROUP_CONTROLLER) > 0) {
2250 /* In the unified hierarchy we can get cgroup empty notifications via inotify. */
2252 m->cgroup_inotify_event_source = sd_event_source_unref(m->cgroup_inotify_event_source);
2253 safe_close(m->cgroup_inotify_fd);
2255 m->cgroup_inotify_fd = inotify_init1(IN_NONBLOCK|IN_CLOEXEC);
2256 if (m->cgroup_inotify_fd < 0)
2257 return log_error_errno(errno, "Failed to create control group inotify object: %m");
2259 r = sd_event_add_io(m->event, &m->cgroup_inotify_event_source, m->cgroup_inotify_fd, EPOLLIN, on_cgroup_inotify_event, m);
2261 return log_error_errno(r, "Failed to watch control group inotify object: %m");
2263 /* Process cgroup empty notifications early, but after service notifications and SIGCHLD. Also
2264 * see handling of cgroup agent notifications, for the classic cgroup hierarchy support. */
2265 r = sd_event_source_set_priority(m->cgroup_inotify_event_source, SD_EVENT_PRIORITY_NORMAL-4);
2267 return log_error_errno(r, "Failed to set priority of inotify event source: %m");
2269 (void) sd_event_source_set_description(m->cgroup_inotify_event_source, "cgroup-inotify");
2271 } else if (MANAGER_IS_SYSTEM(m) && m->test_run_flags == 0) {
2273 /* On the legacy hierarchy we only get notifications via cgroup agents. (Which isn't really reliable,
2274 * since it does not generate events when control groups with children run empty. */
2276 r = cg_install_release_agent(SYSTEMD_CGROUP_CONTROLLER, SYSTEMD_CGROUP_AGENT_PATH);
2278 log_warning_errno(r, "Failed to install release agent, ignoring: %m");
2280 log_debug("Installed release agent.");
2282 log_debug("Release agent already installed.");
2285 /* 5. Make sure we are in the special "init.scope" unit in the root slice. */
2286 scope_path = strjoina(m->cgroup_root, "/" SPECIAL_INIT_SCOPE);
2287 r = cg_create_and_attach(SYSTEMD_CGROUP_CONTROLLER, scope_path, 0);
2289 /* Also, move all other userspace processes remaining in the root cgroup into that scope. */
2290 r = cg_migrate(SYSTEMD_CGROUP_CONTROLLER, m->cgroup_root, SYSTEMD_CGROUP_CONTROLLER, scope_path, 0);
2292 log_warning_errno(r, "Couldn't move remaining userspace processes, ignoring: %m");
/* elogind replacement for steps 3-5: create/join its own control group
 * instead of init.scope, since elogind never runs as init. */
2295 * This method is in core, and normally called by systemd
2296 * being init. As elogind is never init, we can not install
2297 * our agent here. We do so when mounting our cgroup file
2298 * system, so only if elogind is its own tiny controller.
2299 * Further, elogind is not meant to run in systemd init scope. */
2300 if (MANAGER_IS_SYSTEM(m))
2301 // we are our own cgroup controller
2302 scope_path = strjoina("");
2303 else if (streq(m->cgroup_root, "/elogind"))
2304 // root already is our cgroup
2305 scope_path = strjoina(m->cgroup_root)
2307 // we have to create our own group
2308 scope_path = strjoina(m->cgroup_root, "/elogind");
2309 r = cg_create_and_attach(SYSTEMD_CGROUP_CONTROLLER, scope_path, 0);
2311 log_debug_elogind("Created control group \"%s\"", scope_path);
2313 #if 0 /// elogind is not a "sub-controller" like systemd, so migration is not needed.
2315 /* 6. And pin it, so that it cannot be unmounted */
2316 safe_close(m->pin_cgroupfs_fd);
2317 m->pin_cgroupfs_fd = open(path, O_RDONLY|O_CLOEXEC|O_DIRECTORY|O_NOCTTY|O_NONBLOCK);
2318 if (m->pin_cgroupfs_fd < 0)
2319 return log_error_errno(errno, "Failed to open pin file: %m");
2321 } else if (r < 0 && !m->test_run_flags)
2322 return log_error_errno(r, "Failed to create %s control group: %m", scope_path);
2324 /* 7. Always enable hierarchical support if it exists... */
2325 if (!all_unified && m->test_run_flags == 0)
2326 (void) cg_set_attribute("memory", "/", "memory.use_hierarchy", "1");
2328 /* 8. Figure out which controllers are supported, and log about it */
2329 r = cg_mask_supported(&m->cgroup_supported);
2331 return log_error_errno(r, "Failed to determine supported controllers: %m");
2332 for (c = 0; c < _CGROUP_CONTROLLER_MAX; c++)
2333 log_debug("Controller '%s' supported: %s", cgroup_controller_to_string(c), yes_no(m->cgroup_supported & CGROUP_CONTROLLER_TO_MASK(c)));
/* Tear down the manager's cgroup state on shutdown: optionally trim our
 * cgroup tree (we can't delete the group we ourselves sit in), release the
 * event sources/fds used for notifications, unpin cgroupfs, and free the
 * stored root path. */
2338 void manager_shutdown_cgroup(Manager *m, bool delete) {
2341 /* We can't really delete the group, since we are in it. But
2343 if (delete && m->cgroup_root)
2344 (void) cg_trim(SYSTEMD_CGROUP_CONTROLLER, m->cgroup_root, false);
2346 #if 0 /// elogind is not init
2347 m->cgroup_empty_event_source = sd_event_source_unref(m->cgroup_empty_event_source);
2349 m->cgroup_inotify_wd_unit = hashmap_free(m->cgroup_inotify_wd_unit);
2351 m->cgroup_inotify_event_source = sd_event_source_unref(m->cgroup_inotify_event_source);
2352 m->cgroup_inotify_fd = safe_close(m->cgroup_inotify_fd);
2355 m->pin_cgroupfs_fd = safe_close(m->pin_cgroupfs_fd);
2357 m->cgroup_root = mfree(m->cgroup_root);
2360 #if 0 /// UNNEEDED by elogind
/* Map a cgroup path to its owning Unit: exact lookup first, then walk up the
 * path component by component (truncating at each '/') until a registered
 * ancestor is found, defaulting to the root slice. */
2361 Unit* manager_get_unit_by_cgroup(Manager *m, const char *cgroup) {
2368 u = hashmap_get(m->cgroup_unit, cgroup);
2372 p = strdupa(cgroup);
2376 e = strrchr(p, '/');
2378 return hashmap_get(m->cgroup_unit, SPECIAL_ROOT_SLICE);
2382 u = hashmap_get(m->cgroup_unit, p);
/* Resolve a PID to a Unit via its cgroup membership; NULL for invalid PIDs
 * or when the PID's cgroup path can't be determined. */
2388 Unit *manager_get_unit_by_pid_cgroup(Manager *m, pid_t pid) {
2389 _cleanup_free_ char *cgroup = NULL;
2393 if (!pid_is_valid(pid))
2396 if (cg_pid_get_path(SYSTEMD_CGROUP_CONTROLLER, pid, &cgroup) < 0)
2399 return manager_get_unit_by_cgroup(m, cgroup);
/* Resolve a PID to a Unit, preferring cgroup membership, then the watched
 * PID maps (positive and negated keys). Our own PID maps to the init
 * scope. A process may belong to several units; only one is returned. */
2402 Unit *manager_get_unit_by_pid(Manager *m, pid_t pid) {
2407 /* Note that a process might be owned by multiple units, we return only one here, which is good enough for most
2408 * cases, though not strictly correct. We prefer the one reported by cgroup membership, as that's the most
2409 * relevant one as children of the process will be assigned to that one, too, before all else. */
2411 if (!pid_is_valid(pid))
2414 if (pid == getpid_cached())
2415 return hashmap_get(m->units, SPECIAL_INIT_SCOPE);
2417 u = manager_get_unit_by_pid_cgroup(m, pid);
2421 u = hashmap_get(m->watch_pids, PID_TO_PTR(pid));
2425 array = hashmap_get(m->watch_pids, PID_TO_PTR(-pid));
2433 #if 0 /// elogind must substitute this with its own variant
/* systemd variant (compiled out here): map the emptied cgroup to its Unit and
 * queue it for empty-cgroup processing. NOTE(review): the guard/return lines
 * between the lookup and the queueing call are elided in this excerpt. */
2434 int manager_notify_cgroup_empty(Manager *m, const char *cgroup) {
2440 /* Called on the legacy hierarchy whenever we get an explicit cgroup notification from the cgroup agent process
2441 * or from the --system instance */
2443 log_debug("Got cgroup empty notification for: %s", cgroup);
2445 u = manager_get_unit_by_cgroup(m, cgroup);
2449 unit_add_to_cgroup_empty_queue(u);
/* elogind variant: a cgroup maps to a login session rather than a unit.
 * Look the session up by its cgroup name and finalize it; warn when no
 * matching session exists. NOTE(review): control-flow lines between the
 * lookup, the finalize call and the warning are elided in this excerpt. */
2453 int manager_notify_cgroup_empty(Manager *m, const char *cgroup) {
2459 log_debug("Got cgroup empty notification for: %s", cgroup);
2461 s = hashmap_get(m->sessions, cgroup);
2464 session_finalize(s);
2467 log_warning("Session not found: %s", cgroup);
2472 #if 0 /// UNNEEDED by elogind
/* Return the unit's current memory usage in *ret. Reads "memory.current" on
 * the unified (v2) hierarchy, "memory.usage_in_bytes" on legacy (v1), and
 * /proc for the root cgroup. NOTE(review): early-return lines after the
 * guards are elided in this excerpt (presumably returning -ENODATA). */
2473 int unit_get_memory_current(Unit *u, uint64_t *ret) {
2474 _cleanup_free_ char *v = NULL;
/* Guards: accounting must be enabled and the unit must have a cgroup. */
2480 if (!UNIT_CGROUP_BOOL(u, memory_accounting))
2483 if (!u->cgroup_path)
2486 /* The root cgroup doesn't expose this information, let's get it from /proc instead */
2487 if (unit_has_root_cgroup(u))
2488 return procfs_memory_get_current(ret);
/* Without the memory controller realized there is nothing to read. */
2490 if ((u->cgroup_realized_mask & CGROUP_MASK_MEMORY) == 0)
/* Pick the attribute name depending on cgroup v1 vs. v2. */
2493 r = cg_all_unified();
2497 r = cg_get_attribute("memory", u->cgroup_path, "memory.current", &v);
2499 r = cg_get_attribute("memory", u->cgroup_path, "memory.usage_in_bytes", &v);
2505 return safe_atou64(v, ret);
/* Return the unit's current task (PID) count in *ret, read from the
 * "pids.current" attribute, or from /proc for the root cgroup.
 * NOTE(review): early-return lines after the guards are elided in this
 * excerpt (presumably returning -ENODATA). */
2508 int unit_get_tasks_current(Unit *u, uint64_t *ret) {
2509 _cleanup_free_ char *v = NULL;
/* Guards: accounting must be enabled and the unit must have a cgroup. */
2515 if (!UNIT_CGROUP_BOOL(u, tasks_accounting))
2518 if (!u->cgroup_path)
2521 /* The root cgroup doesn't expose this information, let's get it from /proc instead */
2522 if (unit_has_root_cgroup(u))
2523 return procfs_tasks_get_current(ret);
/* Without the pids controller realized there is nothing to read. */
2525 if ((u->cgroup_realized_mask & CGROUP_MASK_PIDS) == 0)
2528 r = cg_get_attribute("pids", u->cgroup_path, "pids.current", &v);
2534 return safe_atou64(v, ret);
/* Read the unit's raw (un-adjusted) CPU time in nanoseconds into *ret.
 * On cgroup v2 this parses the "usage_usec" key of cpu.stat (microseconds,
 * converted to ns); on v1 it reads cpuacct.usage (already in ns); the root
 * cgroup is served from /proc. NOTE(review): braces, error returns and the
 * final *ret assignment are elided in this excerpt. */
2537 static int unit_get_cpu_usage_raw(Unit *u, nsec_t *ret) {
2538 _cleanup_free_ char *v = NULL;
2545 if (!u->cgroup_path)
2548 /* The root cgroup doesn't expose this information, let's get it from /proc instead */
2549 if (unit_has_root_cgroup(u))
2550 return procfs_cpu_get_usage(ret);
/* Branch on unified (v2) vs. legacy (v1) hierarchy. */
2552 r = cg_all_unified();
2556 _cleanup_free_ char *val = NULL;
2559 if ((u->cgroup_realized_mask & CGROUP_MASK_CPU) == 0)
2562 r = cg_get_keyed_attribute("cpu", u->cgroup_path, "cpu.stat", STRV_MAKE("usage_usec"), &val);
/* -ENOENT/-ENXIO mean the attribute isn't there (yet) — treated specially. */
2565 if (IN_SET(r, -ENOENT, -ENXIO))
2568 r = safe_atou64(val, &us);
/* v2 reports microseconds; convert to nanoseconds. */
2572 ns = us * NSEC_PER_USEC;
/* Legacy path: cpuacct controller, value already in nanoseconds. */
2574 if ((u->cgroup_realized_mask & CGROUP_MASK_CPUACCT) == 0)
2577 r = cg_get_attribute("cpuacct", u->cgroup_path, "cpuacct.usage", &v);
2583 r = safe_atou64(v, &ns);
/* Public CPU-usage accessor: raw counter minus the base taken at unit start,
 * with a cached fallback when the cgroup is already gone. NOTE(review):
 * several guard/return lines are elided in this excerpt. */
2592 int unit_get_cpu_usage(Unit *u, nsec_t *ret) {
2598 /* Retrieve the current CPU usage counter. This will subtract the CPU counter taken when the unit was
2599 * started. If the cgroup has been removed already, returns the last cached value. To cache the value, simply
2600 * call this function with a NULL return value. */
2602 if (!UNIT_CGROUP_BOOL(u, cpu_accounting))
2605 r = unit_get_cpu_usage_raw(u, &ns);
2606 if (r == -ENODATA && u->cpu_usage_last != NSEC_INFINITY) {
2607 /* If we can't get the CPU usage anymore (because the cgroup was already removed, for example), use our last cached value instead. */
2611 *ret = u->cpu_usage_last;
/* Clamp at zero: never report a negative delta if the raw counter is below base. */
2617 if (ns > u->cpu_usage_base)
2618 ns -= u->cpu_usage_base;
/* Cache the adjusted value for the -ENODATA fallback above. */
2622 u->cpu_usage_last = ns;
/* Return one IP-accounting counter (ingress/egress x bytes/packets) for the
 * unit, read from the matching BPF firewall map and topped up with counters
 * deserialized from a previous runtime. NOTE(review): the Unit* parameter
 * line, several guards and the final return are elided in this excerpt. */
2629 int unit_get_ip_accounting(
2631 CGroupIPAccountingMetric metric,
2638 assert(metric >= 0);
2639 assert(metric < _CGROUP_IP_ACCOUNTING_METRIC_MAX);
2642 if (!UNIT_CGROUP_BOOL(u, ip_accounting))
/* Ingress metrics read from the ingress map fd, everything else from egress. */
2645 fd = IN_SET(metric, CGROUP_IP_INGRESS_BYTES, CGROUP_IP_INGRESS_PACKETS) ?
2646 u->ip_accounting_ingress_map_fd :
2647 u->ip_accounting_egress_map_fd;
/* Byte metrics use the first out-parameter, packet metrics the second. */
2651 if (IN_SET(metric, CGROUP_IP_INGRESS_BYTES, CGROUP_IP_EGRESS_BYTES))
2652 r = bpf_firewall_read_accounting(fd, &value, NULL);
2654 r = bpf_firewall_read_accounting(fd, NULL, &value);
2658 /* Add in additional metrics from a previous runtime. Note that when reexecing/reloading the daemon we compile
2659 * all BPF programs and maps anew, but serialize the old counters. When deserializing we store them in the
2660 * ip_accounting_extra[] field, and add them in here transparently. */
2662 *ret = value + u->ip_accounting_extra[metric];
/* Reset CPU accounting: invalidate the cached last value and rebase the
 * counter at the current raw reading (0 when the raw read fails).
 * NOTE(review): control-flow lines around the base assignment are elided. */
2667 int unit_reset_cpu_accounting(Unit *u) {
/* NSEC_INFINITY marks "no cached value" for unit_get_cpu_usage(). */
2673 u->cpu_usage_last = NSEC_INFINITY;
2675 r = unit_get_cpu_usage_raw(u, &ns);
2677 u->cpu_usage_base = 0;
2681 u->cpu_usage_base = ns;
/* Reset IP accounting: zero both BPF firewall maps (when their fds are open)
 * and clear the deserialized extra counters. Returns the first error of the
 * two reset calls, preferring the ingress one. */
2685 int unit_reset_ip_accounting(Unit *u) {
2690 if (u->ip_accounting_ingress_map_fd >= 0)
2691 r = bpf_firewall_reset_accounting(u->ip_accounting_ingress_map_fd);
2693 if (u->ip_accounting_egress_map_fd >= 0)
2694 q = bpf_firewall_reset_accounting(u->ip_accounting_egress_map_fd);
/* Drop counters carried over from a previous runtime as well. */
2696 zero(u->ip_accounting_extra);
2698 return r < 0 ? r : q;
/* Mark the given controller mask as no longer realized for this unit and
 * queue the unit for cgroup re-realization. Controllers that exist as
 * v1/v2 compat pairs are always invalidated together. */
2701 void unit_invalidate_cgroup(Unit *u, CGroupMask m) {
2704 if (!UNIT_HAS_CGROUP_CONTEXT(u))
2710 /* always invalidate compat pairs together */
2711 if (m & (CGROUP_MASK_IO | CGROUP_MASK_BLKIO))
2712 m |= CGROUP_MASK_IO | CGROUP_MASK_BLKIO;
2714 if (m & (CGROUP_MASK_CPU | CGROUP_MASK_CPUACCT))
2715 m |= CGROUP_MASK_CPU | CGROUP_MASK_CPUACCT;
2717 if ((u->cgroup_realized_mask & m) == 0) /* NOP? */
2720 u->cgroup_realized_mask &= ~m;
2721 unit_add_to_cgroup_realize_queue(u);
/* Invalidate the unit's BPF (IP firewall) program state and queue it for
 * re-realization. For slices, recurse into member units, since a child's
 * effective IP access list includes its slice's. NOTE(review): loop body
 * lines and guards are partially elided in this excerpt. */
2724 void unit_invalidate_cgroup_bpf(Unit *u) {
2727 if (!UNIT_HAS_CGROUP_CONTEXT(u))
/* Early exit if already invalidated — avoids redundant queueing and recursion. */
2730 if (u->cgroup_bpf_state == UNIT_CGROUP_BPF_INVALIDATED) /* NOP? */
2733 u->cgroup_bpf_state = UNIT_CGROUP_BPF_INVALIDATED;
2734 unit_add_to_cgroup_realize_queue(u);
2736 /* If we are a slice unit, we also need to put compile a new BPF program for all our children, as the IP access
2737 * list of our children includes our own. */
2738 if (u->type == UNIT_SLICE) {
/* Iterate UNIT_BEFORE dependencies, but only recurse into direct slice members. */
2743 HASHMAP_FOREACH_KEY(v, member, u->dependencies[UNIT_BEFORE], i) {
2747 if (UNIT_DEREF(member->slice) != u)
2750 unit_invalidate_cgroup_bpf(member);
/* Whether cgroup delegation is enabled for this unit: requires the unit type
 * to support delegation at all, then consults its cgroup context.
 * NOTE(review): the return statements and context check are elided in this
 * excerpt — presumably `return c->delegate;` or similar; confirm against the
 * full source. */
2755 bool unit_cgroup_delegate(Unit *u) {
2760 if (!UNIT_VTABLE(u)->can_delegate)
2763 c = unit_get_cgroup_context(u);
/* Invalidate CPU/IO/BLKIO cgroup state of every unit in the startup set, so
 * their Startup*-property resource settings get (re)applied on the next
 * realization pass. */
2770 void manager_invalidate_startup_units(Manager *m) {
2776 SET_FOREACH(u, m->startup_units, i)
2777 unit_invalidate_cgroup(u, CGROUP_MASK_CPU|CGROUP_MASK_IO|CGROUP_MASK_BLKIO);
/* String table mapping CGroupDevicePolicy enum values to their config-file
 * names; DEFINE_STRING_TABLE_LOOKUP generates the to_string/from_string
 * helpers. NOTE(review): the table's closing "};" line is elided in this
 * excerpt. */
2780 static const char* const cgroup_device_policy_table[_CGROUP_DEVICE_POLICY_MAX] = {
2781 [CGROUP_AUTO] = "auto",
2782 [CGROUP_CLOSED] = "closed",
2783 [CGROUP_STRICT] = "strict",
2786 DEFINE_STRING_TABLE_LOOKUP(cgroup_device_policy, CGroupDevicePolicy);