/* SPDX-License-Identifier: LGPL-2.1+ */
/***
  This file is part of systemd.

  Copyright 2013 Lennart Poettering

  systemd is free software; you can redistribute it and/or modify it
  under the terms of the GNU Lesser General Public License as published by
  the Free Software Foundation; either version 2.1 of the License, or
  (at your option) any later version.

  systemd is distributed in the hope that it will be useful, but
  WITHOUT ANY WARRANTY; without even the implied warranty of
  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
  Lesser General Public License for more details.

  You should have received a copy of the GNU Lesser General Public License
  along with systemd; If not, see <http://www.gnu.org/licenses/>.
***/
#include "alloc-util.h"
//#include "blockdev-util.h"
//#include "bpf-firewall.h"
//#include "bus-error.h"
#include "cgroup-util.h"
#include "parse-util.h"
#include "path-util.h"
#include "process-util.h"
//#include "procfs-util.h"
//#include "special.h"
#include "stdio-util.h"
#include "string-table.h"
#include "string-util.h"

#define CGROUP_CPU_QUOTA_PERIOD_USEC ((usec_t) 100 * USEC_PER_MSEC)
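
/* Worked example (illustrative values, not from the original source): CPUQuota=20% is
 * stored as cpu_quota_per_sec_usec=200000, i.e. 200ms of CPU time per second. Scaled to
 * the fixed 100ms period above, the quota writes below compute
 * 200000 * 100000 / 1000000 = 20000us of runtime per 100000us period. */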
bool manager_owns_root_cgroup(Manager *m) {

        /* Returns true if we are managing the root cgroup. Note that it isn't sufficient to just check whether the
         * group root path equals "/" since that will also be the case if CLONE_NEWCGROUP is in the mix. Since there
         * appears to be no nice way to detect whether we are in a CLONE_NEWCGROUP namespace we instead just check if
         * we run in any kind of container virtualization. */

        if (detect_container() > 0)
                return false;

        return isempty(m->cgroup_root) || path_equal(m->cgroup_root, "/");
}
#if 0 /// UNNEEDED by elogind
bool unit_has_root_cgroup(Unit *u) {

        /* Returns whether this unit manages the root cgroup. This will return true if this unit is the root slice and
         * the manager manages the root cgroup. */

        if (!manager_owns_root_cgroup(u->manager))
                return false;

        return unit_has_name(u, SPECIAL_ROOT_SLICE);
}
#endif // 0
static void cgroup_compat_warn(void) {
        static bool cgroup_compat_warned = false;

        if (cgroup_compat_warned)
                return;

        log_warning("cgroup compatibility translation between legacy and unified hierarchy settings activated. "
                    "See cgroup-compat debug messages for details.");

        cgroup_compat_warned = true;
}
#define log_cgroup_compat(unit, fmt, ...) do {                                  \
                cgroup_compat_warn();                                           \
                log_unit_debug(unit, "cgroup-compat: " fmt, ##__VA_ARGS__);     \
        } while (false)
void cgroup_context_init(CGroupContext *c) {

        /* Initialize everything to the kernel defaults, assuming the
         * structure is preinitialized to 0 */

        c->cpu_weight = CGROUP_WEIGHT_INVALID;
        c->startup_cpu_weight = CGROUP_WEIGHT_INVALID;
        c->cpu_quota_per_sec_usec = USEC_INFINITY;

        c->cpu_shares = CGROUP_CPU_SHARES_INVALID;
        c->startup_cpu_shares = CGROUP_CPU_SHARES_INVALID;

        c->memory_high = CGROUP_LIMIT_MAX;
        c->memory_max = CGROUP_LIMIT_MAX;
        c->memory_swap_max = CGROUP_LIMIT_MAX;

        c->memory_limit = CGROUP_LIMIT_MAX;

        c->io_weight = CGROUP_WEIGHT_INVALID;
        c->startup_io_weight = CGROUP_WEIGHT_INVALID;

        c->blockio_weight = CGROUP_BLKIO_WEIGHT_INVALID;
        c->startup_blockio_weight = CGROUP_BLKIO_WEIGHT_INVALID;

        c->tasks_max = (uint64_t) -1;
}
void cgroup_context_free_device_allow(CGroupContext *c, CGroupDeviceAllow *a) {
        LIST_REMOVE(device_allow, c->device_allow, a);
        free(a->path);
        free(a);
}

void cgroup_context_free_io_device_weight(CGroupContext *c, CGroupIODeviceWeight *w) {
        LIST_REMOVE(device_weights, c->io_device_weights, w);
        free(w->path);
        free(w);
}

void cgroup_context_free_io_device_limit(CGroupContext *c, CGroupIODeviceLimit *l) {
        LIST_REMOVE(device_limits, c->io_device_limits, l);
        free(l->path);
        free(l);
}

void cgroup_context_free_blockio_device_weight(CGroupContext *c, CGroupBlockIODeviceWeight *w) {
        LIST_REMOVE(device_weights, c->blockio_device_weights, w);
        free(w->path);
        free(w);
}

void cgroup_context_free_blockio_device_bandwidth(CGroupContext *c, CGroupBlockIODeviceBandwidth *b) {
        LIST_REMOVE(device_bandwidths, c->blockio_device_bandwidths, b);
        free(b->path);
        free(b);
}
void cgroup_context_done(CGroupContext *c) {

        while (c->io_device_weights)
                cgroup_context_free_io_device_weight(c, c->io_device_weights);

        while (c->io_device_limits)
                cgroup_context_free_io_device_limit(c, c->io_device_limits);

        while (c->blockio_device_weights)
                cgroup_context_free_blockio_device_weight(c, c->blockio_device_weights);

        while (c->blockio_device_bandwidths)
                cgroup_context_free_blockio_device_bandwidth(c, c->blockio_device_bandwidths);

        while (c->device_allow)
                cgroup_context_free_device_allow(c, c->device_allow);

        c->ip_address_allow = ip_address_access_free_all(c->ip_address_allow);
        c->ip_address_deny = ip_address_access_free_all(c->ip_address_deny);
}
void cgroup_context_dump(CGroupContext *c, FILE* f, const char *prefix) {
        CGroupIODeviceLimit *il;
        CGroupIODeviceWeight *iw;
        CGroupBlockIODeviceBandwidth *b;
        CGroupBlockIODeviceWeight *w;
        CGroupDeviceAllow *a;
        IPAddressAccessItem *iaai;
        char u[FORMAT_TIMESPAN_MAX];

        prefix = strempty(prefix);

        fprintf(f,
                "%sCPUAccounting=%s\n"
                "%sIOAccounting=%s\n"
                "%sBlockIOAccounting=%s\n"
                "%sMemoryAccounting=%s\n"
                "%sTasksAccounting=%s\n"
                "%sIPAccounting=%s\n"
                "%sCPUWeight=%" PRIu64 "\n"
                "%sStartupCPUWeight=%" PRIu64 "\n"
                "%sCPUShares=%" PRIu64 "\n"
                "%sStartupCPUShares=%" PRIu64 "\n"
                "%sCPUQuotaPerSecSec=%s\n"
                "%sIOWeight=%" PRIu64 "\n"
                "%sStartupIOWeight=%" PRIu64 "\n"
                "%sBlockIOWeight=%" PRIu64 "\n"
                "%sStartupBlockIOWeight=%" PRIu64 "\n"
                "%sMemoryLow=%" PRIu64 "\n"
                "%sMemoryHigh=%" PRIu64 "\n"
                "%sMemoryMax=%" PRIu64 "\n"
                "%sMemorySwapMax=%" PRIu64 "\n"
                "%sMemoryLimit=%" PRIu64 "\n"
                "%sTasksMax=%" PRIu64 "\n"
                "%sDevicePolicy=%s\n"
                "%sDelegate=%s\n",
                prefix, yes_no(c->cpu_accounting),
                prefix, yes_no(c->io_accounting),
                prefix, yes_no(c->blockio_accounting),
                prefix, yes_no(c->memory_accounting),
                prefix, yes_no(c->tasks_accounting),
                prefix, yes_no(c->ip_accounting),
                prefix, c->cpu_weight,
                prefix, c->startup_cpu_weight,
                prefix, c->cpu_shares,
                prefix, c->startup_cpu_shares,
                prefix, format_timespan(u, sizeof(u), c->cpu_quota_per_sec_usec, 1),
                prefix, c->io_weight,
                prefix, c->startup_io_weight,
                prefix, c->blockio_weight,
                prefix, c->startup_blockio_weight,
                prefix, c->memory_low,
                prefix, c->memory_high,
                prefix, c->memory_max,
                prefix, c->memory_swap_max,
                prefix, c->memory_limit,
                prefix, c->tasks_max,
                prefix, cgroup_device_policy_to_string(c->device_policy),
                prefix, yes_no(c->delegate));

        if (c->delegate) {
                _cleanup_free_ char *t = NULL;

                (void) cg_mask_to_string(c->delegate_controllers, &t);

                fprintf(f, "%sDelegateControllers=%s\n",
                        prefix,
                        strempty(t));
        }

        LIST_FOREACH(device_allow, a, c->device_allow)
                fprintf(f,
                        "%sDeviceAllow=%s %s%s%s\n",
                        prefix,
                        a->path,
                        a->r ? "r" : "", a->w ? "w" : "", a->m ? "m" : "");

        LIST_FOREACH(device_weights, iw, c->io_device_weights)
                fprintf(f,
                        "%sIODeviceWeight=%s %" PRIu64,
                        prefix,
                        iw->path,
                        iw->weight);

        LIST_FOREACH(device_limits, il, c->io_device_limits) {
                char buf[FORMAT_BYTES_MAX];
                CGroupIOLimitType type;

                for (type = 0; type < _CGROUP_IO_LIMIT_TYPE_MAX; type++)
                        if (il->limits[type] != cgroup_io_limit_defaults[type])
                                fprintf(f,
                                        "%s%s=%s %s\n",
                                        prefix,
                                        cgroup_io_limit_type_to_string(type),
                                        il->path,
                                        format_bytes(buf, sizeof(buf), il->limits[type]));
        }

        LIST_FOREACH(device_weights, w, c->blockio_device_weights)
                fprintf(f,
                        "%sBlockIODeviceWeight=%s %" PRIu64,
                        prefix,
                        w->path,
                        w->weight);

        LIST_FOREACH(device_bandwidths, b, c->blockio_device_bandwidths) {
                char buf[FORMAT_BYTES_MAX];

                if (b->rbps != CGROUP_LIMIT_MAX)
                        fprintf(f,
                                "%sBlockIOReadBandwidth=%s %s\n",
                                prefix,
                                b->path,
                                format_bytes(buf, sizeof(buf), b->rbps));
                if (b->wbps != CGROUP_LIMIT_MAX)
                        fprintf(f,
                                "%sBlockIOWriteBandwidth=%s %s\n",
                                prefix,
                                b->path,
                                format_bytes(buf, sizeof(buf), b->wbps));
        }

        LIST_FOREACH(items, iaai, c->ip_address_allow) {
                _cleanup_free_ char *k = NULL;

                (void) in_addr_to_string(iaai->family, &iaai->address, &k);
                fprintf(f, "%sIPAddressAllow=%s/%u\n", prefix, strnull(k), iaai->prefixlen);
        }

        LIST_FOREACH(items, iaai, c->ip_address_deny) {
                _cleanup_free_ char *k = NULL;

                (void) in_addr_to_string(iaai->family, &iaai->address, &k);
                fprintf(f, "%sIPAddressDeny=%s/%u\n", prefix, strnull(k), iaai->prefixlen);
        }
}
static int lookup_block_device(const char *p, dev_t *dev) {
        struct stat st;

        if (stat(p, &st) < 0)
                return log_warning_errno(errno, "Couldn't stat device %s: %m", p);

        if (S_ISBLK(st.st_mode))
                *dev = st.st_rdev;
        else if (major(st.st_dev) != 0) {
                /* If this is not a device node then find the block
                 * device this file is stored on */
                *dev = st.st_dev;

                /* If this is a partition, try to get the originating
                 * block device */
                (void) block_get_whole_disk(*dev, dev);
        } else {
                log_warning("%s is not a block device and file system block device cannot be determined or is not local.", p);
                return -ENODEV;
        }

        return 0;
}
static int whitelist_device(const char *path, const char *node, const char *acc) {
        char buf[2+DECIMAL_STR_MAX(dev_t)*2+2+4];
        struct stat st;
        bool ignore_notfound;
        int r;

        if (node[0] == '-') {
                /* Non-existent paths starting with "-" must be silently ignored */
                node++;
                ignore_notfound = true;
        } else
                ignore_notfound = false;

        if (stat(node, &st) < 0) {
                if (errno == ENOENT && ignore_notfound)
                        return 0;

                return log_warning_errno(errno, "Couldn't stat device %s: %m", node);
        }

        if (!S_ISCHR(st.st_mode) && !S_ISBLK(st.st_mode)) {
                log_warning("%s is not a device.", node);
                return -ENODEV;
        }

        sprintf(buf,
                "%c %u:%u %s",
                S_ISCHR(st.st_mode) ? 'c' : 'b',
                major(st.st_rdev), minor(st.st_rdev),
                acc);

        r = cg_set_attribute("devices", path, "devices.allow", buf);
        if (r < 0)
                log_full_errno(IN_SET(r, -ENOENT, -EROFS, -EINVAL, -EACCES) ? LOG_DEBUG : LOG_WARNING, r,
                               "Failed to set devices.allow on %s: %m", path);

        return r;
}
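
/* Illustration (hypothetical device, not from the original source): whitelisting
 * node="/dev/null" (character device 1:3) with acc="rwm" formats the payload
 * "c 1:3 rwm" and writes it to the cgroup's devices.allow attribute above. */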
static int whitelist_major(const char *path, const char *name, char type, const char *acc) {
        _cleanup_fclose_ FILE *f = NULL;
        char line[LINE_MAX];
        bool good = false;
        int r;

        assert(IN_SET(type, 'b', 'c'));

        f = fopen("/proc/devices", "re");
        if (!f)
                return log_warning_errno(errno, "Cannot open /proc/devices to resolve %s (%c): %m", name, type);

        FOREACH_LINE(line, f, goto fail) {
                char buf[2+DECIMAL_STR_MAX(unsigned)+3+4], *p, *w;
                unsigned maj;

                truncate_nl(line);

                if (type == 'c' && streq(line, "Character devices:")) {
                        good = true;
                        continue;
                }

                if (type == 'b' && streq(line, "Block devices:")) {
                        good = true;
                        continue;
                }

                if (isempty(line)) {
                        good = false;
                        continue;
                }

                if (!good)
                        continue;

                p = strstrip(line);

                w = strpbrk(p, WHITESPACE);
                if (!w)
                        continue;
                *w = 0;

                r = safe_atou(p, &maj);
                if (r < 0)
                        continue;
                if (maj <= 0)
                        continue;

                w++;
                w += strspn(w, WHITESPACE);

                if (fnmatch(name, w, 0) != 0)
                        continue;

                sprintf(buf,
                        "%c %u:* %s",
                        type,
                        maj,
                        acc);

                r = cg_set_attribute("devices", path, "devices.allow", buf);
                if (r < 0)
                        log_full_errno(IN_SET(r, -ENOENT, -EROFS, -EINVAL, -EACCES) ? LOG_DEBUG : LOG_WARNING, r,
                                       "Failed to set devices.allow on %s: %m", path);
        }

        return 0;

fail:
        return log_warning_errno(errno, "Failed to read /proc/devices: %m");
}
static bool cgroup_context_has_cpu_weight(CGroupContext *c) {
        return c->cpu_weight != CGROUP_WEIGHT_INVALID ||
                c->startup_cpu_weight != CGROUP_WEIGHT_INVALID;
}

static bool cgroup_context_has_cpu_shares(CGroupContext *c) {
        return c->cpu_shares != CGROUP_CPU_SHARES_INVALID ||
                c->startup_cpu_shares != CGROUP_CPU_SHARES_INVALID;
}

static uint64_t cgroup_context_cpu_weight(CGroupContext *c, ManagerState state) {
        if (IN_SET(state, MANAGER_STARTING, MANAGER_INITIALIZING) &&
            c->startup_cpu_weight != CGROUP_WEIGHT_INVALID)
                return c->startup_cpu_weight;
        else if (c->cpu_weight != CGROUP_WEIGHT_INVALID)
                return c->cpu_weight;
        else
                return CGROUP_WEIGHT_DEFAULT;
}

static uint64_t cgroup_context_cpu_shares(CGroupContext *c, ManagerState state) {
        if (IN_SET(state, MANAGER_STARTING, MANAGER_INITIALIZING) &&
            c->startup_cpu_shares != CGROUP_CPU_SHARES_INVALID)
                return c->startup_cpu_shares;
        else if (c->cpu_shares != CGROUP_CPU_SHARES_INVALID)
                return c->cpu_shares;
        else
                return CGROUP_CPU_SHARES_DEFAULT;
}
static void cgroup_apply_unified_cpu_config(Unit *u, uint64_t weight, uint64_t quota) {
        char buf[MAX(DECIMAL_STR_MAX(uint64_t) + 1, (DECIMAL_STR_MAX(usec_t) + 1) * 2)];
        int r;

        xsprintf(buf, "%" PRIu64 "\n", weight);
        r = cg_set_attribute("cpu", u->cgroup_path, "cpu.weight", buf);
        if (r < 0)
                log_unit_full(u, IN_SET(r, -ENOENT, -EROFS, -EACCES) ? LOG_DEBUG : LOG_WARNING, r,
                              "Failed to set cpu.weight: %m");

        if (quota != USEC_INFINITY)
                xsprintf(buf, USEC_FMT " " USEC_FMT "\n",
                         quota * CGROUP_CPU_QUOTA_PERIOD_USEC / USEC_PER_SEC, CGROUP_CPU_QUOTA_PERIOD_USEC);
        else
                xsprintf(buf, "max " USEC_FMT "\n", CGROUP_CPU_QUOTA_PERIOD_USEC);

        r = cg_set_attribute("cpu", u->cgroup_path, "cpu.max", buf);
        if (r < 0)
                log_unit_full(u, IN_SET(r, -ENOENT, -EROFS, -EACCES) ? LOG_DEBUG : LOG_WARNING, r,
                              "Failed to set cpu.max: %m");
}
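
/* Illustration (assumed values): with quota=200000us/s and the 100ms period, the payload
 * written to cpu.max above is "20000 100000\n"; with USEC_INFINITY it is "max 100000\n". */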
static void cgroup_apply_legacy_cpu_config(Unit *u, uint64_t shares, uint64_t quota) {
        char buf[MAX(DECIMAL_STR_MAX(uint64_t), DECIMAL_STR_MAX(usec_t)) + 1];
        int r;

        xsprintf(buf, "%" PRIu64 "\n", shares);
        r = cg_set_attribute("cpu", u->cgroup_path, "cpu.shares", buf);
        if (r < 0)
                log_unit_full(u, IN_SET(r, -ENOENT, -EROFS, -EACCES) ? LOG_DEBUG : LOG_WARNING, r,
                              "Failed to set cpu.shares: %m");

        xsprintf(buf, USEC_FMT "\n", CGROUP_CPU_QUOTA_PERIOD_USEC);
        r = cg_set_attribute("cpu", u->cgroup_path, "cpu.cfs_period_us", buf);
        if (r < 0)
                log_unit_full(u, IN_SET(r, -ENOENT, -EROFS, -EACCES) ? LOG_DEBUG : LOG_WARNING, r,
                              "Failed to set cpu.cfs_period_us: %m");

        if (quota != USEC_INFINITY) {
                xsprintf(buf, USEC_FMT "\n", quota * CGROUP_CPU_QUOTA_PERIOD_USEC / USEC_PER_SEC);
                r = cg_set_attribute("cpu", u->cgroup_path, "cpu.cfs_quota_us", buf);
        } else
                r = cg_set_attribute("cpu", u->cgroup_path, "cpu.cfs_quota_us", "-1");
        if (r < 0)
                log_unit_full(u, IN_SET(r, -ENOENT, -EROFS, -EACCES) ? LOG_DEBUG : LOG_WARNING, r,
                              "Failed to set cpu.cfs_quota_us: %m");
}
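
/* Illustration (assumed values): the same 200000us/s quota writes "100000\n" to
 * cpu.cfs_period_us and "20000\n" to cpu.cfs_quota_us; an infinite quota writes "-1". */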
static uint64_t cgroup_cpu_shares_to_weight(uint64_t shares) {
        return CLAMP(shares * CGROUP_WEIGHT_DEFAULT / CGROUP_CPU_SHARES_DEFAULT,
                     CGROUP_WEIGHT_MIN, CGROUP_WEIGHT_MAX);
}

static uint64_t cgroup_cpu_weight_to_shares(uint64_t weight) {
        return CLAMP(weight * CGROUP_CPU_SHARES_DEFAULT / CGROUP_WEIGHT_DEFAULT,
                     CGROUP_CPU_SHARES_MIN, CGROUP_CPU_SHARES_MAX);
}
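
/* Worked examples of the linear mapping above (illustrative values): the shares default
 * 1024 maps to the weight default 100, shares=2048 maps to weight=200, and weight=50 maps
 * back to shares=512; out-of-range results are clamped by CLAMP(). */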
static bool cgroup_context_has_io_config(CGroupContext *c) {
        return c->io_accounting ||
                c->io_weight != CGROUP_WEIGHT_INVALID ||
                c->startup_io_weight != CGROUP_WEIGHT_INVALID ||
                c->io_device_weights ||
                c->io_device_limits;
}

static bool cgroup_context_has_blockio_config(CGroupContext *c) {
        return c->blockio_accounting ||
                c->blockio_weight != CGROUP_BLKIO_WEIGHT_INVALID ||
                c->startup_blockio_weight != CGROUP_BLKIO_WEIGHT_INVALID ||
                c->blockio_device_weights ||
                c->blockio_device_bandwidths;
}
static uint64_t cgroup_context_io_weight(CGroupContext *c, ManagerState state) {
        if (IN_SET(state, MANAGER_STARTING, MANAGER_INITIALIZING) &&
            c->startup_io_weight != CGROUP_WEIGHT_INVALID)
                return c->startup_io_weight;
        else if (c->io_weight != CGROUP_WEIGHT_INVALID)
                return c->io_weight;
        else
                return CGROUP_WEIGHT_DEFAULT;
}

static uint64_t cgroup_context_blkio_weight(CGroupContext *c, ManagerState state) {
        if (IN_SET(state, MANAGER_STARTING, MANAGER_INITIALIZING) &&
            c->startup_blockio_weight != CGROUP_BLKIO_WEIGHT_INVALID)
                return c->startup_blockio_weight;
        else if (c->blockio_weight != CGROUP_BLKIO_WEIGHT_INVALID)
                return c->blockio_weight;
        else
                return CGROUP_BLKIO_WEIGHT_DEFAULT;
}
static uint64_t cgroup_weight_blkio_to_io(uint64_t blkio_weight) {
        return CLAMP(blkio_weight * CGROUP_WEIGHT_DEFAULT / CGROUP_BLKIO_WEIGHT_DEFAULT,
                     CGROUP_WEIGHT_MIN, CGROUP_WEIGHT_MAX);
}

static uint64_t cgroup_weight_io_to_blkio(uint64_t io_weight) {
        return CLAMP(io_weight * CGROUP_BLKIO_WEIGHT_DEFAULT / CGROUP_WEIGHT_DEFAULT,
                     CGROUP_BLKIO_WEIGHT_MIN, CGROUP_BLKIO_WEIGHT_MAX);
}
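
/* Worked examples (illustrative values): the blkio default 500 maps to the io default
 * 100, blkio=1000 maps to io=200, and io=10 maps back to blkio=50. */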
static void cgroup_apply_io_device_weight(Unit *u, const char *dev_path, uint64_t io_weight) {
        char buf[DECIMAL_STR_MAX(dev_t)*2+2+DECIMAL_STR_MAX(uint64_t)+1];
        dev_t dev;
        int r;

        r = lookup_block_device(dev_path, &dev);
        if (r < 0)
                return;

        xsprintf(buf, "%u:%u %" PRIu64 "\n", major(dev), minor(dev), io_weight);
        r = cg_set_attribute("io", u->cgroup_path, "io.weight", buf);
        if (r < 0)
                log_unit_full(u, IN_SET(r, -ENOENT, -EROFS, -EACCES) ? LOG_DEBUG : LOG_WARNING, r,
                              "Failed to set io.weight: %m");
}

static void cgroup_apply_blkio_device_weight(Unit *u, const char *dev_path, uint64_t blkio_weight) {
        char buf[DECIMAL_STR_MAX(dev_t)*2+2+DECIMAL_STR_MAX(uint64_t)+1];
        dev_t dev;
        int r;

        r = lookup_block_device(dev_path, &dev);
        if (r < 0)
                return;

        xsprintf(buf, "%u:%u %" PRIu64 "\n", major(dev), minor(dev), blkio_weight);
        r = cg_set_attribute("blkio", u->cgroup_path, "blkio.weight_device", buf);
        if (r < 0)
                log_unit_full(u, IN_SET(r, -ENOENT, -EROFS, -EACCES) ? LOG_DEBUG : LOG_WARNING, r,
                              "Failed to set blkio.weight_device: %m");
}
static unsigned cgroup_apply_io_device_limit(Unit *u, const char *dev_path, uint64_t *limits) {
        char limit_bufs[_CGROUP_IO_LIMIT_TYPE_MAX][DECIMAL_STR_MAX(uint64_t)];
        char buf[DECIMAL_STR_MAX(dev_t)*2+2+(6+DECIMAL_STR_MAX(uint64_t)+1)*4];
        CGroupIOLimitType type;
        dev_t dev;
        unsigned n = 0;
        int r;

        r = lookup_block_device(dev_path, &dev);
        if (r < 0)
                return 0;

        for (type = 0; type < _CGROUP_IO_LIMIT_TYPE_MAX; type++) {
                if (limits[type] != cgroup_io_limit_defaults[type]) {
                        xsprintf(limit_bufs[type], "%" PRIu64, limits[type]);
                        n++;
                } else
                        xsprintf(limit_bufs[type], "%s", limits[type] == CGROUP_LIMIT_MAX ? "max" : "0");
        }

        xsprintf(buf, "%u:%u rbps=%s wbps=%s riops=%s wiops=%s\n", major(dev), minor(dev),
                 limit_bufs[CGROUP_IO_RBPS_MAX], limit_bufs[CGROUP_IO_WBPS_MAX],
                 limit_bufs[CGROUP_IO_RIOPS_MAX], limit_bufs[CGROUP_IO_WIOPS_MAX]);
        r = cg_set_attribute("io", u->cgroup_path, "io.max", buf);
        if (r < 0)
                log_unit_full(u, IN_SET(r, -ENOENT, -EROFS, -EACCES) ? LOG_DEBUG : LOG_WARNING, r,
                              "Failed to set io.max: %m");
        return n;
}
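
/* Illustration (hypothetical device 8:0, not from the original source): with only a read
 * limit of 1MiB/s configured, the line written to io.max above reads
 * "8:0 rbps=1048576 wbps=max riops=max wiops=max". */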
static unsigned cgroup_apply_blkio_device_limit(Unit *u, const char *dev_path, uint64_t rbps, uint64_t wbps) {
        char buf[DECIMAL_STR_MAX(dev_t)*2+2+DECIMAL_STR_MAX(uint64_t)+1];
        unsigned n = 0;
        dev_t dev;
        int r;

        r = lookup_block_device(dev_path, &dev);
        if (r < 0)
                return 0;

        if (rbps != CGROUP_LIMIT_MAX)
                n++;
        sprintf(buf, "%u:%u %" PRIu64 "\n", major(dev), minor(dev), rbps);
        r = cg_set_attribute("blkio", u->cgroup_path, "blkio.throttle.read_bps_device", buf);
        if (r < 0)
                log_unit_full(u, IN_SET(r, -ENOENT, -EROFS, -EACCES) ? LOG_DEBUG : LOG_WARNING, r,
                              "Failed to set blkio.throttle.read_bps_device: %m");

        if (wbps != CGROUP_LIMIT_MAX)
                n++;
        sprintf(buf, "%u:%u %" PRIu64 "\n", major(dev), minor(dev), wbps);
        r = cg_set_attribute("blkio", u->cgroup_path, "blkio.throttle.write_bps_device", buf);
        if (r < 0)
                log_unit_full(u, IN_SET(r, -ENOENT, -EROFS, -EACCES) ? LOG_DEBUG : LOG_WARNING, r,
                              "Failed to set blkio.throttle.write_bps_device: %m");

        return n;
}
static bool cgroup_context_has_unified_memory_config(CGroupContext *c) {
        return c->memory_low > 0 || c->memory_high != CGROUP_LIMIT_MAX || c->memory_max != CGROUP_LIMIT_MAX || c->memory_swap_max != CGROUP_LIMIT_MAX;
}

static void cgroup_apply_unified_memory_limit(Unit *u, const char *file, uint64_t v) {
        char buf[DECIMAL_STR_MAX(uint64_t) + 1] = "max";
        int r;

        if (v != CGROUP_LIMIT_MAX)
                xsprintf(buf, "%" PRIu64 "\n", v);

        r = cg_set_attribute("memory", u->cgroup_path, file, buf);
        if (r < 0)
                log_unit_full(u, IN_SET(r, -ENOENT, -EROFS, -EACCES) ? LOG_DEBUG : LOG_WARNING, r,
                              "Failed to set %s: %m", file);
}
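
/* Illustration (assumed value): MemoryMax=512M writes "536870912\n" to memory.max above,
 * while CGROUP_LIMIT_MAX leaves the preinitialized "max" in place. */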
static void cgroup_apply_firewall(Unit *u) {
        assert(u);

        /* Best-effort: let's apply IP firewalling and/or accounting if that's enabled */

        if (bpf_firewall_compile(u) < 0)
                return;

        (void) bpf_firewall_install(u);
}
static void cgroup_context_apply(
                Unit *u,
                CGroupMask apply_mask,
                bool apply_bpf,
                ManagerState state) {

        const char *path;
        CGroupContext *c;
        bool is_root;
        int r;

        assert(u);

        /* Nothing to do? Exit early! */
        if (apply_mask == 0 && !apply_bpf)
                return;

        /* Some cgroup attributes are not supported on the root cgroup, hence silently ignore */
        is_root = unit_has_root_cgroup(u);

        assert_se(c = unit_get_cgroup_context(u));
        assert_se(path = u->cgroup_path);

        if (is_root) /* Make sure we don't try to display messages with an empty path. */
                path = "/";

        /* We generally ignore errors caused by read-only mounted
         * cgroup trees (assuming we are running in a container then),
         * and missing cgroups, i.e. EROFS and ENOENT. */
        if ((apply_mask & CGROUP_MASK_CPU) && !is_root) {
                bool has_weight, has_shares;

                has_weight = cgroup_context_has_cpu_weight(c);
                has_shares = cgroup_context_has_cpu_shares(c);

                if (cg_all_unified() > 0) {
                        uint64_t weight;

                        if (has_weight)
                                weight = cgroup_context_cpu_weight(c, state);
                        else if (has_shares) {
                                uint64_t shares = cgroup_context_cpu_shares(c, state);

                                weight = cgroup_cpu_shares_to_weight(shares);

                                log_cgroup_compat(u, "Applying [Startup]CPUShares %" PRIu64 " as [Startup]CPUWeight %" PRIu64 " on %s",
                                                  shares, weight, path);
                        } else
                                weight = CGROUP_WEIGHT_DEFAULT;

                        cgroup_apply_unified_cpu_config(u, weight, c->cpu_quota_per_sec_usec);
                } else {
                        uint64_t shares;

                        if (has_weight) {
                                uint64_t weight = cgroup_context_cpu_weight(c, state);

                                shares = cgroup_cpu_weight_to_shares(weight);

                                log_cgroup_compat(u, "Applying [Startup]CPUWeight %" PRIu64 " as [Startup]CPUShares %" PRIu64 " on %s",
                                                  weight, shares, path);
                        } else if (has_shares)
                                shares = cgroup_context_cpu_shares(c, state);
                        else
                                shares = CGROUP_CPU_SHARES_DEFAULT;

                        cgroup_apply_legacy_cpu_config(u, shares, c->cpu_quota_per_sec_usec);
                }
        }
        if (apply_mask & CGROUP_MASK_IO) {
                bool has_io = cgroup_context_has_io_config(c);
                bool has_blockio = cgroup_context_has_blockio_config(c);

                if (!is_root) {
                        char buf[8+DECIMAL_STR_MAX(uint64_t)+1];
                        uint64_t weight;

                        if (has_io)
                                weight = cgroup_context_io_weight(c, state);
                        else if (has_blockio) {
                                uint64_t blkio_weight = cgroup_context_blkio_weight(c, state);

                                weight = cgroup_weight_blkio_to_io(blkio_weight);

                                log_cgroup_compat(u, "Applying [Startup]BlockIOWeight %" PRIu64 " as [Startup]IOWeight %" PRIu64,
                                                  blkio_weight, weight);
                        } else
                                weight = CGROUP_WEIGHT_DEFAULT;

                        xsprintf(buf, "default %" PRIu64 "\n", weight);
                        r = cg_set_attribute("io", path, "io.weight", buf);
                        if (r < 0)
                                log_unit_full(u, IN_SET(r, -ENOENT, -EROFS, -EACCES) ? LOG_DEBUG : LOG_WARNING, r,
                                              "Failed to set io.weight: %m");

                        if (has_io) {
                                CGroupIODeviceWeight *w;

                                /* FIXME: no way to reset this list */
                                LIST_FOREACH(device_weights, w, c->io_device_weights)
                                        cgroup_apply_io_device_weight(u, w->path, w->weight);
                        } else if (has_blockio) {
                                CGroupBlockIODeviceWeight *w;

                                /* FIXME: no way to reset this list */
                                LIST_FOREACH(device_weights, w, c->blockio_device_weights) {
                                        weight = cgroup_weight_blkio_to_io(w->weight);

                                        log_cgroup_compat(u, "Applying BlockIODeviceWeight %" PRIu64 " as IODeviceWeight %" PRIu64 " for %s",
                                                          w->weight, weight, w->path);

                                        cgroup_apply_io_device_weight(u, w->path, weight);
                                }
                        }
                }

                /* Apply limits and free ones without config. */
                if (has_io) {
                        CGroupIODeviceLimit *l, *next;

                        LIST_FOREACH_SAFE(device_limits, l, next, c->io_device_limits) {
                                if (!cgroup_apply_io_device_limit(u, l->path, l->limits))
                                        cgroup_context_free_io_device_limit(c, l);
                        }
                } else if (has_blockio) {
                        CGroupBlockIODeviceBandwidth *b, *next;

                        LIST_FOREACH_SAFE(device_bandwidths, b, next, c->blockio_device_bandwidths) {
                                uint64_t limits[_CGROUP_IO_LIMIT_TYPE_MAX];
                                CGroupIOLimitType type;

                                for (type = 0; type < _CGROUP_IO_LIMIT_TYPE_MAX; type++)
                                        limits[type] = cgroup_io_limit_defaults[type];

                                limits[CGROUP_IO_RBPS_MAX] = b->rbps;
                                limits[CGROUP_IO_WBPS_MAX] = b->wbps;

                                log_cgroup_compat(u, "Applying BlockIO{Read|Write}Bandwidth %" PRIu64 " %" PRIu64 " as IO{Read|Write}BandwidthMax for %s",
                                                  b->rbps, b->wbps, b->path);

                                if (!cgroup_apply_io_device_limit(u, b->path, limits))
                                        cgroup_context_free_blockio_device_bandwidth(c, b);
                        }
                }
        }
        if (apply_mask & CGROUP_MASK_BLKIO) {
                bool has_io = cgroup_context_has_io_config(c);
                bool has_blockio = cgroup_context_has_blockio_config(c);

                if (!is_root) {
                        char buf[DECIMAL_STR_MAX(uint64_t)+1];
                        uint64_t weight;

                        if (has_io) {
                                uint64_t io_weight = cgroup_context_io_weight(c, state);

                                weight = cgroup_weight_io_to_blkio(cgroup_context_io_weight(c, state));

                                log_cgroup_compat(u, "Applying [Startup]IOWeight %" PRIu64 " as [Startup]BlockIOWeight %" PRIu64,
                                                  io_weight, weight);
                        } else if (has_blockio)
                                weight = cgroup_context_blkio_weight(c, state);
                        else
                                weight = CGROUP_BLKIO_WEIGHT_DEFAULT;

                        xsprintf(buf, "%" PRIu64 "\n", weight);
                        r = cg_set_attribute("blkio", path, "blkio.weight", buf);
                        if (r < 0)
                                log_unit_full(u, IN_SET(r, -ENOENT, -EROFS, -EACCES) ? LOG_DEBUG : LOG_WARNING, r,
                                              "Failed to set blkio.weight: %m");

                        if (has_io) {
                                CGroupIODeviceWeight *w;

                                /* FIXME: no way to reset this list */
                                LIST_FOREACH(device_weights, w, c->io_device_weights) {
                                        weight = cgroup_weight_io_to_blkio(w->weight);

                                        log_cgroup_compat(u, "Applying IODeviceWeight %" PRIu64 " as BlockIODeviceWeight %" PRIu64 " for %s",
                                                          w->weight, weight, w->path);

                                        cgroup_apply_blkio_device_weight(u, w->path, weight);
                                }
                        } else if (has_blockio) {
                                CGroupBlockIODeviceWeight *w;

                                /* FIXME: no way to reset this list */
                                LIST_FOREACH(device_weights, w, c->blockio_device_weights)
                                        cgroup_apply_blkio_device_weight(u, w->path, w->weight);
                        }
                }

                /* Apply limits and free ones without config. */
                if (has_io) {
                        CGroupIODeviceLimit *l, *next;

                        LIST_FOREACH_SAFE(device_limits, l, next, c->io_device_limits) {
                                log_cgroup_compat(u, "Applying IO{Read|Write}Bandwidth %" PRIu64 " %" PRIu64 " as BlockIO{Read|Write}BandwidthMax for %s",
                                                  l->limits[CGROUP_IO_RBPS_MAX], l->limits[CGROUP_IO_WBPS_MAX], l->path);

                                if (!cgroup_apply_blkio_device_limit(u, l->path, l->limits[CGROUP_IO_RBPS_MAX], l->limits[CGROUP_IO_WBPS_MAX]))
                                        cgroup_context_free_io_device_limit(c, l);
                        }
                } else if (has_blockio) {
                        CGroupBlockIODeviceBandwidth *b, *next;

                        LIST_FOREACH_SAFE(device_bandwidths, b, next, c->blockio_device_bandwidths)
                                if (!cgroup_apply_blkio_device_limit(u, b->path, b->rbps, b->wbps))
                                        cgroup_context_free_blockio_device_bandwidth(c, b);
                }
        }
        if ((apply_mask & CGROUP_MASK_MEMORY) && !is_root) {
                if (cg_all_unified() > 0) {
                        uint64_t max, swap_max = CGROUP_LIMIT_MAX;

                        if (cgroup_context_has_unified_memory_config(c)) {
                                max = c->memory_max;
                                swap_max = c->memory_swap_max;
                        } else {
                                max = c->memory_limit;

                                if (max != CGROUP_LIMIT_MAX)
                                        log_cgroup_compat(u, "Applying MemoryLimit %" PRIu64 " as MemoryMax", max);
                        }

                        cgroup_apply_unified_memory_limit(u, "memory.low", c->memory_low);
                        cgroup_apply_unified_memory_limit(u, "memory.high", c->memory_high);
                        cgroup_apply_unified_memory_limit(u, "memory.max", max);
                        cgroup_apply_unified_memory_limit(u, "memory.swap.max", swap_max);
                } else {
                        char buf[DECIMAL_STR_MAX(uint64_t) + 1];
                        uint64_t val;

                        if (cgroup_context_has_unified_memory_config(c)) {
                                val = c->memory_max;
                                log_cgroup_compat(u, "Applying MemoryMax %" PRIu64 " as MemoryLimit", val);
                        } else
                                val = c->memory_limit;

                        if (val == CGROUP_LIMIT_MAX)
                                strncpy(buf, "-1\n", sizeof(buf));
                        else
                                xsprintf(buf, "%" PRIu64 "\n", val);

                        r = cg_set_attribute("memory", path, "memory.limit_in_bytes", buf);
                        if (r < 0)
                                log_unit_full(u, IN_SET(r, -ENOENT, -EROFS, -EACCES) ? LOG_DEBUG : LOG_WARNING, r,
                                              "Failed to set memory.limit_in_bytes: %m");
                }
        }
        if ((apply_mask & CGROUP_MASK_DEVICES) && !is_root) {
                CGroupDeviceAllow *a;

                /* Changing the devices list of a populated cgroup
                 * might result in EINVAL, hence ignore EINVAL
                 * here. */

                if (c->device_allow || c->device_policy != CGROUP_AUTO)
                        r = cg_set_attribute("devices", path, "devices.deny", "a");
                else
                        r = cg_set_attribute("devices", path, "devices.allow", "a");
                if (r < 0)
                        log_unit_full(u, IN_SET(r, -ENOENT, -EROFS, -EINVAL, -EACCES) ? LOG_DEBUG : LOG_WARNING, r,
                                      "Failed to reset devices.list: %m");

                if (c->device_policy == CGROUP_CLOSED ||
                    (c->device_policy == CGROUP_AUTO && c->device_allow)) {
                        static const char auto_devices[] =
                                "/dev/null\0" "rwm\0"
                                "/dev/zero\0" "rwm\0"
                                "/dev/full\0" "rwm\0"
                                "/dev/random\0" "rwm\0"
                                "/dev/urandom\0" "rwm\0"
                                "/dev/tty\0" "rwm\0"
                                "/dev/ptmx\0" "rwm\0"
                                /* Allow /run/systemd/inaccessible/{chr,blk} devices for mapping InaccessiblePaths */
                                "-/run/systemd/inaccessible/chr\0" "rwm\0"
                                "-/run/systemd/inaccessible/blk\0" "rwm\0";

                        const char *x, *y;

                        NULSTR_FOREACH_PAIR(x, y, auto_devices)
                                whitelist_device(path, x, y);

                        /* PTS (/dev/pts) devices may not be duplicated, but accessed */
                        whitelist_major(path, "pts", 'c', "rw");
                }

                LIST_FOREACH(device_allow, a, c->device_allow) {
                        char acc[4], *val;
                        unsigned k = 0;

                        if (a->r)
                                acc[k++] = 'r';
                        if (a->w)
                                acc[k++] = 'w';
                        if (a->m)
                                acc[k++] = 'm';

                        if (k == 0)
                                continue;

                        acc[k++] = 0;

                        if (path_startswith(a->path, "/dev/"))
                                whitelist_device(path, a->path, acc);
                        else if ((val = startswith(a->path, "block-")))
                                whitelist_major(path, val, 'b', acc);
                        else if ((val = startswith(a->path, "char-")))
                                whitelist_major(path, val, 'c', acc);
                        else
                                log_unit_debug(u, "Ignoring device %s while writing cgroup attribute.", a->path);
                }
        }
        if (apply_mask & CGROUP_MASK_PIDS) {

                if (is_root) {
                        /* So, the "pids" controller does not expose anything on the root cgroup, in order not to
                         * replicate knobs exposed elsewhere needlessly. We abstract this away here however, and when
                         * the knobs of the root cgroup are modified propagate this to the relevant sysctls. There's a
                         * non-obvious asymmetry however: unlike the cgroup properties we don't really want to take
                         * exclusive ownership of the sysctls, but we still want to honour things if the user sets
                         * limits. Hence we employ sort of a one-way strategy: when the user sets a bounded limit
                         * through us it counts. When the user afterwards unsets it again (i.e. sets it to unbounded)
                         * it also counts. But if the user never set a limit through us (i.e. we are the default of
                         * "unbounded") we leave things unmodified. For this we manage a global boolean that we turn on
                         * the first time we set a limit. Note that this boolean is flushed out on manager reload,
                         * which is desirable so that there's an official way to release control of the sysctl from
                         * systemd: set the limit to unbounded and reload. */

                        if (c->tasks_max != CGROUP_LIMIT_MAX) {
                                u->manager->sysctl_pid_max_changed = true;
                                r = procfs_tasks_set_limit(c->tasks_max);
                        } else if (u->manager->sysctl_pid_max_changed)
                                r = procfs_tasks_set_limit(TASKS_MAX);
                        else
                                r = 0;

                        if (r < 0)
                                log_unit_full(u, IN_SET(r, -ENOENT, -EROFS, -EACCES) ? LOG_DEBUG : LOG_WARNING, r,
                                              "Failed to write to tasks limit sysctls: %m");

                } else {
                        if (c->tasks_max != CGROUP_LIMIT_MAX) {
                                char buf[DECIMAL_STR_MAX(uint64_t) + 2];

                                sprintf(buf, "%" PRIu64 "\n", c->tasks_max);
                                r = cg_set_attribute("pids", path, "pids.max", buf);
                        } else
                                r = cg_set_attribute("pids", path, "pids.max", "max");
                        if (r < 0)
                                log_unit_full(u, IN_SET(r, -ENOENT, -EROFS, -EACCES) ? LOG_DEBUG : LOG_WARNING, r,
                                              "Failed to set pids.max: %m");
                }
        }
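
        /* Illustration (assumed value): TasksMax=4096 writes "4096\n" to pids.max above,
         * while CGROUP_LIMIT_MAX writes the literal string "max". */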
        if (apply_bpf)
                cgroup_apply_firewall(u);
}
CGroupMask cgroup_context_get_mask(CGroupContext *c) {
        CGroupMask mask = 0;

        /* Figure out which controllers we need */

        if (c->cpu_accounting ||
            cgroup_context_has_cpu_weight(c) ||
            cgroup_context_has_cpu_shares(c) ||
            c->cpu_quota_per_sec_usec != USEC_INFINITY)
                mask |= CGROUP_MASK_CPUACCT | CGROUP_MASK_CPU;

        if (cgroup_context_has_io_config(c) || cgroup_context_has_blockio_config(c))
                mask |= CGROUP_MASK_IO | CGROUP_MASK_BLKIO;

        if (c->memory_accounting ||
            c->memory_limit != CGROUP_LIMIT_MAX ||
            cgroup_context_has_unified_memory_config(c))
                mask |= CGROUP_MASK_MEMORY;

        if (c->device_allow ||
            c->device_policy != CGROUP_AUTO)
                mask |= CGROUP_MASK_DEVICES;

        if (c->tasks_accounting ||
            c->tasks_max != CGROUP_LIMIT_MAX)
                mask |= CGROUP_MASK_PIDS;

        return mask;
}
CGroupMask unit_get_own_mask(Unit *u) {
        CGroupContext *c;

        /* Returns the mask of controllers the unit needs for itself */

        c = unit_get_cgroup_context(u);
        if (!c)
                return 0;

        return cgroup_context_get_mask(c) | unit_get_delegate_mask(u);
}
CGroupMask unit_get_delegate_mask(Unit *u) {
        CGroupContext *c;

        /* If delegation is turned on, then turn on selected controllers, unless we are on the legacy hierarchy and the
         * process we fork into is known to drop privileges, and hence shouldn't get access to the controllers.
         *
         * Note that on the unified hierarchy it is safe to delegate controllers to unprivileged services. */

        if (!unit_cgroup_delegate(u))
                return 0;

        if (cg_all_unified() <= 0) {
                ExecContext *e;

                e = unit_get_exec_context(u);
                if (e && !exec_context_maintains_privileges(e))
                        return 0;
        }

        assert_se(c = unit_get_cgroup_context(u));
        return c->delegate_controllers;
}
CGroupMask unit_get_members_mask(Unit *u) {
        assert(u);

        /* Returns the mask of controllers all of the unit's children require, merged */

        if (u->cgroup_members_mask_valid)
                return u->cgroup_members_mask;

        u->cgroup_members_mask = 0;

        if (u->type == UNIT_SLICE) {
                void *v;
                Unit *member;
                Iterator i;

                HASHMAP_FOREACH_KEY(v, member, u->dependencies[UNIT_BEFORE], i) {

                        if (member == u)
                                continue;

                        if (UNIT_DEREF(member->slice) != u)
                                continue;

                        u->cgroup_members_mask |= unit_get_subtree_mask(member); /* note that this calls ourselves again, for the children */
                }
        }

        u->cgroup_members_mask_valid = true;
        return u->cgroup_members_mask;
}
CGroupMask unit_get_siblings_mask(Unit *u) {
        assert(u);

        /* Returns the mask of controllers all of the unit's siblings
         * require, i.e. the members mask of the unit's parent slice
         * if there is one. */

        if (UNIT_ISSET(u->slice))
                return unit_get_members_mask(UNIT_DEREF(u->slice));

        return unit_get_subtree_mask(u); /* we are the top-level slice */
}

CGroupMask unit_get_subtree_mask(Unit *u) {

        /* Returns the mask of this subtree, meaning of the group
         * itself and its children. */

        return unit_get_own_mask(u) | unit_get_members_mask(u);
}
CGroupMask unit_get_target_mask(Unit *u) {
        CGroupMask mask;

        /* This returns the cgroup mask of all controllers to enable
         * for a specific cgroup, i.e. everything it needs itself,
         * plus all that its children need, plus all that its siblings
         * need. This is primarily useful on the legacy cgroup
         * hierarchy, where we need to duplicate each cgroup in each
         * hierarchy that shall be enabled for it. */

        mask = unit_get_own_mask(u) | unit_get_members_mask(u) | unit_get_siblings_mask(u);
        mask &= u->manager->cgroup_supported;

        return mask;
}

CGroupMask unit_get_enable_mask(Unit *u) {
        CGroupMask mask;

        /* This returns the cgroup mask of all controllers to enable
         * for the children of a specific cgroup. This is primarily
         * useful for the unified cgroup hierarchy, where each cgroup
         * controls which controllers are enabled for its children. */

        mask = unit_get_members_mask(u);
        mask &= u->manager->cgroup_supported;

        return mask;
}
bool unit_get_needs_bpf(Unit *u) {
        CGroupContext *c;
        Unit *p;

        assert(u);

        c = unit_get_cgroup_context(u);
        if (!c)
                return false;

        if (c->ip_accounting ||
            c->ip_address_allow ||
            c->ip_address_deny)
                return true;

        /* If any parent slice has an IP access list defined, it applies too */
        for (p = UNIT_DEREF(u->slice); p; p = UNIT_DEREF(p->slice)) {
                c = unit_get_cgroup_context(p);
                if (!c)
                        return false;

                if (c->ip_address_allow ||
                    c->ip_address_deny)
                        return true;
        }

        return false;
}
/* Recurse from a unit up through its containing slices, propagating
 * mask bits upward. A unit is also member of itself. */
void unit_update_cgroup_members_masks(Unit *u) {
        CGroupMask m;
        bool more;

        assert(u);

        /* Calculate subtree mask */
        m = unit_get_subtree_mask(u);

        /* See if anything changed from the previous invocation. If
         * not, we're done. */
        if (u->cgroup_subtree_mask_valid && m == u->cgroup_subtree_mask)
                return;

        more =
                u->cgroup_subtree_mask_valid &&
                ((m & ~u->cgroup_subtree_mask) != 0) &&
                ((~m & u->cgroup_subtree_mask) == 0);

        u->cgroup_subtree_mask = m;
        u->cgroup_subtree_mask_valid = true;

        if (UNIT_ISSET(u->slice)) {
                Unit *s = UNIT_DEREF(u->slice);

                if (more)
                        /* There's more set now than before. We
                         * propagate the new mask to the parent's mask
                         * (not caring if it actually was valid or
                         * not) */
                        s->cgroup_members_mask |= m;
                else
                        /* There's less set now than before (or we
                         * don't know), we need to recalculate
                         * everything, so let's invalidate the
                         * parent's members mask */
                        s->cgroup_members_mask_valid = false;

                /* And now make sure that this change also hits our
                 * grandparents */
                unit_update_cgroup_members_masks(s);
        }
}
const char *unit_get_realized_cgroup_path(Unit *u, CGroupMask mask) {

        /* Returns the realized cgroup path of the specified unit where all specified controllers are available. */

        while (u) {

                if (u->cgroup_path &&
                    u->cgroup_realized &&
                    (u->cgroup_realized_mask & mask) == mask)
                        return u->cgroup_path;

                u = UNIT_DEREF(u->slice);
        }

        return NULL;
}

static const char *migrate_callback(CGroupMask mask, void *userdata) {
        return unit_get_realized_cgroup_path(userdata, mask);
}
char *unit_default_cgroup_path(Unit *u) {
        _cleanup_free_ char *escaped = NULL, *slice = NULL;
        int r;

        assert(u);

        if (unit_has_name(u, SPECIAL_ROOT_SLICE))
                return strdup(u->manager->cgroup_root);

        if (UNIT_ISSET(u->slice) && !unit_has_name(UNIT_DEREF(u->slice), SPECIAL_ROOT_SLICE)) {
                r = cg_slice_to_path(UNIT_DEREF(u->slice)->id, &slice);
                if (r < 0)
                        return NULL;
        }

        escaped = cg_escape(u->id);
        if (!escaped)
                return NULL;

        if (slice)
                return strjoin(u->manager->cgroup_root, "/", slice, "/",
                               escaped);
        else
                return strjoin(u->manager->cgroup_root, "/", escaped);
}
int unit_set_cgroup_path(Unit *u, const char *path) {
        _cleanup_free_ char *p = NULL;
        int r;

        assert(u);

        if (path) {
                p = strdup(path);
                if (!p)
                        return -ENOMEM;
        }

        if (streq_ptr(u->cgroup_path, p))
                return 0;

        if (p) {
                r = hashmap_put(u->manager->cgroup_unit, p, u);
                if (r < 0)
                        return r;
        }

        unit_release_cgroup(u);

        u->cgroup_path = TAKE_PTR(p);

        return 0;
}
int unit_watch_cgroup(Unit *u) {
        _cleanup_free_ char *events = NULL;
        int r;

        assert(u);

        if (!u->cgroup_path)
                return 0;

        if (u->cgroup_inotify_wd >= 0)
                return 0;

        /* Only applies to the unified hierarchy */
        r = cg_unified_controller(SYSTEMD_CGROUP_CONTROLLER);
        if (r < 0)
                return log_error_errno(r, "Failed to determine whether the name=systemd hierarchy is unified: %m");
        if (r == 0)
                return 0;

        /* Don't watch the root slice, it's pointless. */
        if (unit_has_name(u, SPECIAL_ROOT_SLICE))
                return 0;

        r = hashmap_ensure_allocated(&u->manager->cgroup_inotify_wd_unit, &trivial_hash_ops);
        if (r < 0)
                return log_oom();

        r = cg_get_path(SYSTEMD_CGROUP_CONTROLLER, u->cgroup_path, "cgroup.events", &events);
        if (r < 0)
                return log_oom();

        u->cgroup_inotify_wd = inotify_add_watch(u->manager->cgroup_inotify_fd, events, IN_MODIFY);
        if (u->cgroup_inotify_wd < 0) {

                if (errno == ENOENT) /* If the directory is already
                                      * gone we don't need to track
                                      * it, so this is not an error */
                        return 0;

                return log_unit_error_errno(u, errno, "Failed to add inotify watch descriptor for control group %s: %m", u->cgroup_path);
        }

        r = hashmap_put(u->manager->cgroup_inotify_wd_unit, INT_TO_PTR(u->cgroup_inotify_wd), u);
        if (r < 0)
                return log_unit_error_errno(u, r, "Failed to add inotify watch descriptor to hash map: %m");

        return 0;
}
int unit_pick_cgroup_path(Unit *u) {
        _cleanup_free_ char *path = NULL;
        int r;

        assert(u);

        if (u->cgroup_path)
                return 0;

        if (!UNIT_HAS_CGROUP_CONTEXT(u))
                return -EINVAL;

        path = unit_default_cgroup_path(u);
        if (!path)
                return log_oom();

        r = unit_set_cgroup_path(u, path);
        if (r == -EEXIST)
                return log_unit_error_errno(u, r, "Control group %s exists already.", path);
        if (r < 0)
                return log_unit_error_errno(u, r, "Failed to set unit's control group path to %s: %m", path);

        return 0;
}
static int unit_create_cgroup(
                Unit *u,
                CGroupMask target_mask,
                CGroupMask enable_mask,
                bool needs_bpf) {

        CGroupContext *c;
        int r;

        assert(u);

        c = unit_get_cgroup_context(u);
        if (!c)
                return 0;

        /* Figure out our cgroup path */
        r = unit_pick_cgroup_path(u);
        if (r < 0)
                return r;

        /* First, create our own group */
        r = cg_create_everywhere(u->manager->cgroup_supported, target_mask, u->cgroup_path);
        if (r < 0)
                return log_unit_error_errno(u, r, "Failed to create cgroup %s: %m", u->cgroup_path);

        /* Start watching it */
        (void) unit_watch_cgroup(u);

        /* Enable all controllers we need */
        r = cg_enable_everywhere(u->manager->cgroup_supported, enable_mask, u->cgroup_path);
        if (r < 0)
                log_unit_warning_errno(u, r, "Failed to enable controllers on cgroup %s, ignoring: %m", u->cgroup_path);

        /* Keep track that this is now realized */
        u->cgroup_realized = true;
        u->cgroup_realized_mask = target_mask;
        u->cgroup_enabled_mask = enable_mask;
        u->cgroup_bpf_state = needs_bpf ? UNIT_CGROUP_BPF_ON : UNIT_CGROUP_BPF_OFF;

        if (u->type != UNIT_SLICE && !unit_cgroup_delegate(u)) {

                /* Then, possibly move things over, but not if
                 * subgroups may contain processes, which is the case
                 * for slice and delegation units. */
                r = cg_migrate_everywhere(u->manager->cgroup_supported, u->cgroup_path, u->cgroup_path, migrate_callback, u);
                if (r < 0)
                        log_unit_warning_errno(u, r, "Failed to migrate cgroup processes to %s, ignoring: %m", u->cgroup_path);
        }

        return 0;
}
static int unit_attach_pid_to_cgroup_via_bus(Unit *u, pid_t pid, const char *suffix_path) {
        _cleanup_(sd_bus_error_free) sd_bus_error error = SD_BUS_ERROR_NULL;
        char *pp;
        int r;

        assert(u);

        if (MANAGER_IS_SYSTEM(u->manager))
                return -EINVAL;

        if (!u->manager->system_bus)
                return -EIO;

        if (!u->cgroup_path)
                return -EINVAL;

        /* Determine this unit's cgroup path relative to our cgroup root */
        pp = path_startswith(u->cgroup_path, u->manager->cgroup_root);
        if (!pp)
                return -EINVAL;

        pp = strjoina("/", pp, suffix_path);
        path_kill_slashes(pp);

        r = sd_bus_call_method(u->manager->system_bus,
                               "org.freedesktop.systemd1",
                               "/org/freedesktop/systemd1",
                               "org.freedesktop.systemd1.Manager",
                               "AttachProcessesToUnit",
                               &error, NULL,
                               "ssau",
                               NULL /* empty unit name means client's unit, i.e. us */, pp, 1, (uint32_t) pid);
        if (r < 0)
                return log_unit_debug_errno(u, r, "Failed to attach unit process " PID_FMT " via the bus: %s", pid, bus_error_message(&error, r));

        return 0;
}
int unit_attach_pids_to_cgroup(Unit *u, Set *pids, const char *suffix_path) {
        CGroupMask delegated_mask;
        const char *p;
        Iterator i;
        void *pidp;
        int r, q;

        assert(u);

        if (!UNIT_HAS_CGROUP_CONTEXT(u))
                return -EINVAL;

        if (set_isempty(pids))
                return 0;

        r = unit_realize_cgroup(u);
        if (r < 0)
                return r;

        if (isempty(suffix_path))
                p = u->cgroup_path;
        else
                p = strjoina(u->cgroup_path, "/", suffix_path);

        delegated_mask = unit_get_delegate_mask(u);

        r = 0;
        SET_FOREACH(pidp, pids, i) {
                pid_t pid = PTR_TO_PID(pidp);
                CGroupController c;

                /* First, attach the PID to the main cgroup hierarchy */
                q = cg_attach(SYSTEMD_CGROUP_CONTROLLER, p, pid);
                if (q < 0) {
                        log_unit_debug_errno(u, q, "Couldn't move process " PID_FMT " to requested cgroup '%s': %m", pid, p);

                        if (MANAGER_IS_USER(u->manager) && IN_SET(q, -EPERM, -EACCES)) {
                                int z;

                                /* If we are in a user instance, and we can't move the process ourselves due to
                                 * permission problems, let's ask the system instance about it instead. Since it's more
                                 * privileged it might be able to move the process across the leaves of a subtree whose
                                 * top node is not owned by us. */

                                z = unit_attach_pid_to_cgroup_via_bus(u, pid, suffix_path);
                                if (z < 0)
                                        log_unit_debug_errno(u, z, "Couldn't move process " PID_FMT " to requested cgroup '%s' via the system bus either: %m", pid, p);
                                else
                                        continue; /* When the bus thing worked via the bus we are fully done for this PID. */
                        }

                        if (r >= 0)
                                r = q; /* Remember first error */

                        continue;
                }

                q = cg_all_unified();
                if (q < 0)
                        return q;
                if (q > 0)
                        continue;

                /* In the legacy hierarchy, attach the process to the requested cgroup if possible, and if not to the
                 * innermost realized one */

                for (c = 0; c < _CGROUP_CONTROLLER_MAX; c++) {
                        CGroupMask bit = CGROUP_CONTROLLER_TO_MASK(c);
                        const char *realized;

                        if (!(u->manager->cgroup_supported & bit))
                                continue;

                        /* If this controller is delegated and realized, honour the caller's request for the cgroup suffix. */
                        if (delegated_mask & u->cgroup_realized_mask & bit) {
                                q = cg_attach(cgroup_controller_to_string(c), p, pid);
                                if (q >= 0)
                                        continue; /* Success! */

                                log_unit_debug_errno(u, q, "Failed to attach PID " PID_FMT " to requested cgroup %s in controller %s, falling back to unit's cgroup: %m",
                                                     pid, p, cgroup_controller_to_string(c));
                        }

                        /* So this controller is either not delegated or not realized, or something else weird happened. In
                         * that case let's attach the PID at least to the closest cgroup up the tree that is
                         * realized. */
                        realized = unit_get_realized_cgroup_path(u, bit);
                        if (!realized)
                                continue; /* Not even realized in the root slice? Then let's not bother */

                        q = cg_attach(cgroup_controller_to_string(c), realized, pid);
                        if (q < 0)
                                log_unit_debug_errno(u, q, "Failed to attach PID " PID_FMT " to realized cgroup %s in controller %s, ignoring: %m",
                                                     pid, realized, cgroup_controller_to_string(c));
                }
        }

        return r;
}
static void cgroup_xattr_apply(Unit *u) {
        char ids[SD_ID128_STRING_MAX];
        int r;

        assert(u);

        if (!MANAGER_IS_SYSTEM(u->manager))
                return;

        if (sd_id128_is_null(u->invocation_id))
                return;

        r = cg_set_xattr(SYSTEMD_CGROUP_CONTROLLER, u->cgroup_path,
                         "trusted.invocation_id",
                         sd_id128_to_string(u->invocation_id, ids), 32,
                         0);
        if (r < 0)
                log_unit_debug_errno(u, r, "Failed to set invocation ID on control group %s, ignoring: %m", u->cgroup_path);
}
static bool unit_has_mask_realized(
                Unit *u,
                CGroupMask target_mask,
                CGroupMask enable_mask,
                bool needs_bpf) {

        assert(u);

        return u->cgroup_realized &&
                u->cgroup_realized_mask == target_mask &&
                u->cgroup_enabled_mask == enable_mask &&
                ((needs_bpf && u->cgroup_bpf_state == UNIT_CGROUP_BPF_ON) ||
                 (!needs_bpf && u->cgroup_bpf_state == UNIT_CGROUP_BPF_OFF));
}

static void unit_add_to_cgroup_realize_queue(Unit *u) {
        assert(u);

        if (u->in_cgroup_realize_queue)
                return;

        LIST_PREPEND(cgroup_realize_queue, u->manager->cgroup_realize_queue, u);
        u->in_cgroup_realize_queue = true;
}

static void unit_remove_from_cgroup_realize_queue(Unit *u) {
        assert(u);

        if (!u->in_cgroup_realize_queue)
                return;

        LIST_REMOVE(cgroup_realize_queue, u->manager->cgroup_realize_queue, u);
        u->in_cgroup_realize_queue = false;
}
/* Check if necessary controllers and attributes for a unit are in place.
 *
 * If so, do nothing.
 * If not, create paths, move processes over, and set attributes.
 *
 * Returns 0 on success and < 0 on failure. */
static int unit_realize_cgroup_now(Unit *u, ManagerState state) {
        CGroupMask target_mask, enable_mask;
        bool needs_bpf, apply_bpf;
        int r;

        assert(u);

        unit_remove_from_cgroup_realize_queue(u);

        target_mask = unit_get_target_mask(u);
        enable_mask = unit_get_enable_mask(u);
        needs_bpf = unit_get_needs_bpf(u);

        if (unit_has_mask_realized(u, target_mask, enable_mask, needs_bpf))
                return 0;

        /* Make sure we apply the BPF filters either when one is configured, or if none is configured but previously
         * the state was anything but off. This way, if a unit with a BPF filter applied is reconfigured to lose it
         * this will trickle down properly to cgroupfs. */
        apply_bpf = needs_bpf || u->cgroup_bpf_state != UNIT_CGROUP_BPF_OFF;

        /* First, realize parents */
        if (UNIT_ISSET(u->slice)) {
                r = unit_realize_cgroup_now(UNIT_DEREF(u->slice), state);
                if (r < 0)
                        return r;
        }

        /* And then do the real work */
        r = unit_create_cgroup(u, target_mask, enable_mask, needs_bpf);
        if (r < 0)
                return r;

        /* Finally, apply the necessary attributes. */
        cgroup_context_apply(u, target_mask, apply_bpf, state);
        cgroup_xattr_apply(u);

        return 0;
}
unsigned manager_dispatch_cgroup_realize_queue(Manager *m) {
        ManagerState state;
        unsigned n = 0;
        Unit *i;
        int r;

        assert(m);

        state = manager_state(m);

        while ((i = m->cgroup_realize_queue)) {
                assert(i->in_cgroup_realize_queue);

                if (UNIT_IS_INACTIVE_OR_FAILED(unit_active_state(i))) {
                        /* Maybe things changed, and the unit is not actually active anymore? */
                        unit_remove_from_cgroup_realize_queue(i);
                        continue;
                }

                r = unit_realize_cgroup_now(i, state);
                if (r < 0)
                        log_warning_errno(r, "Failed to realize cgroups for queued unit %s, ignoring: %m", i->id);

                n++;
        }

        return n;
}
static void unit_add_siblings_to_cgroup_realize_queue(Unit *u) {
        Unit *slice;

        /* This adds the siblings of the specified unit and the
         * siblings of all parent units to the cgroup queue. (But
         * neither the specified unit itself nor the parents.) */

        while ((slice = UNIT_DEREF(u->slice))) {
                Iterator i;
                Unit *m;
                void *v;

                HASHMAP_FOREACH_KEY(v, m, u->dependencies[UNIT_BEFORE], i) {

                        /* Skip units that have a dependency on the slice
                         * but aren't actually in it. */
                        if (UNIT_DEREF(m->slice) != slice)
                                continue;

                        /* No point in doing cgroup application for units
                         * without active processes. */
                        if (UNIT_IS_INACTIVE_OR_FAILED(unit_active_state(m)))
                                continue;

                        /* If the unit doesn't need any new controllers
                         * and has current ones realized, it doesn't need
                         * any changes. */
                        if (unit_has_mask_realized(m,
                                                   unit_get_target_mask(m),
                                                   unit_get_enable_mask(m),
                                                   unit_get_needs_bpf(m)))
                                continue;

                        unit_add_to_cgroup_realize_queue(m);
                }

                u = slice;
        }
}
int unit_realize_cgroup(Unit *u) {
        assert(u);

        if (!UNIT_HAS_CGROUP_CONTEXT(u))
                return 0;

        /* So, here's the deal: when realizing the cgroups for this
         * unit, we need to first create all parents, but there's more
         * actually: for the weight-based controllers we also need to
         * make sure that all our siblings (i.e. units that are in the
         * same slice as we are) have cgroups, too. Otherwise, things
         * would become very uneven as each of their processes would
         * get as much resources as all our group together. This call
         * will synchronously create the parent cgroups, but will
         * defer work on the siblings to the next event loop
         * iteration. */

        /* Add all sibling slices to the cgroup queue. */
        unit_add_siblings_to_cgroup_realize_queue(u);

        /* And realize this one now (and apply the values) */
        return unit_realize_cgroup_now(u, manager_state(u->manager));
}
void unit_release_cgroup(Unit *u) {
        assert(u);

        /* Forgets all cgroup details for this cgroup */

        if (u->cgroup_path) {
                (void) hashmap_remove(u->manager->cgroup_unit, u->cgroup_path);
                u->cgroup_path = mfree(u->cgroup_path);
        }

        if (u->cgroup_inotify_wd >= 0) {
                if (inotify_rm_watch(u->manager->cgroup_inotify_fd, u->cgroup_inotify_wd) < 0)
                        log_unit_debug_errno(u, errno, "Failed to remove cgroup inotify watch %i for %s, ignoring", u->cgroup_inotify_wd, u->id);

                (void) hashmap_remove(u->manager->cgroup_inotify_wd_unit, INT_TO_PTR(u->cgroup_inotify_wd));
                u->cgroup_inotify_wd = -1;
        }
}
void unit_prune_cgroup(Unit *u) {
        int r;
        bool is_root_slice;

        assert(u);

        /* Removes the cgroup, if empty and possible, and stops watching it. */

        if (!u->cgroup_path)
                return;

        (void) unit_get_cpu_usage(u, NULL); /* Cache the last CPU usage value before we destroy the cgroup */

        is_root_slice = unit_has_name(u, SPECIAL_ROOT_SLICE);

        r = cg_trim_everywhere(u->manager->cgroup_supported, u->cgroup_path, !is_root_slice);
        if (r < 0) {
                log_unit_debug_errno(u, r, "Failed to destroy cgroup %s, ignoring: %m", u->cgroup_path);
                return;
        }

        if (is_root_slice)
                return;

        unit_release_cgroup(u);

        u->cgroup_realized = false;
        u->cgroup_realized_mask = 0;
        u->cgroup_enabled_mask = 0;
}
int unit_search_main_pid(Unit *u, pid_t *ret) {
        _cleanup_fclose_ FILE *f = NULL;
        pid_t pid = 0, npid, mypid;
        int r;

        assert(u);
        assert(ret);

        if (!u->cgroup_path)
                return -ENXIO;

        r = cg_enumerate_processes(SYSTEMD_CGROUP_CONTROLLER, u->cgroup_path, &f);
        if (r < 0)
                return r;

        mypid = getpid_cached();
        while (cg_read_pid(f, &npid) > 0) {
                pid_t ppid;

                if (npid == pid)
                        continue;

                /* Ignore processes that aren't our kids */
                if (get_process_ppid(npid, &ppid) >= 0 && ppid != mypid)
                        continue;

                if (pid != 0)
                        /* Dang, there's more than one daemonized PID
                        in this group, so we don't know what process
                        is the main process. */

                        return -ENODATA;

                pid = npid;
        }

        *ret = pid;
        return 0;
}
static int unit_watch_pids_in_path(Unit *u, const char *path) {
        _cleanup_closedir_ DIR *d = NULL;
        _cleanup_fclose_ FILE *f = NULL;
        int ret = 0, r;

        assert(u);
        assert(path);

        r = cg_enumerate_processes(SYSTEMD_CGROUP_CONTROLLER, path, &f);
        if (r < 0)
                ret = r;
        else {
                pid_t pid;

                while ((r = cg_read_pid(f, &pid)) > 0) {
                        r = unit_watch_pid(u, pid);
                        if (r < 0 && ret >= 0)
                                ret = r;
                }

                if (r < 0 && ret >= 0)
                        ret = r;
        }

        r = cg_enumerate_subgroups(SYSTEMD_CGROUP_CONTROLLER, path, &d);
        if (r < 0) {
                if (ret >= 0)
                        ret = r;
        } else {
                char *fn;

                while ((r = cg_read_subgroup(d, &fn)) > 0) {
                        _cleanup_free_ char *p = NULL;

                        p = strjoin(path, "/", fn);
                        free(fn);

                        if (!p)
                                return -ENOMEM;

                        r = unit_watch_pids_in_path(u, p);
                        if (r < 0 && ret >= 0)
                                ret = r;
                }

                if (r < 0 && ret >= 0)
                        ret = r;
        }

        return ret;
}
int unit_synthesize_cgroup_empty_event(Unit *u) {
        int r;

        assert(u);

        /* Enqueue a synthetic cgroup empty event if this unit doesn't watch any PIDs anymore. This is compatibility
         * support for non-unified systems where notifications aren't reliable, and hence need to take whatever we can
         * get as notification source as soon as we stopped having any useful PIDs to watch for. */

        if (!u->cgroup_path)
                return -ENOENT;

        r = cg_unified_controller(SYSTEMD_CGROUP_CONTROLLER);
        if (r < 0)
                return r;
        if (r > 0) /* On unified we have reliable notifications, and don't need this */
                return 0;

        if (!set_isempty(u->pids))
                return 0;

        unit_add_to_cgroup_empty_queue(u);
        return 0;
}
int unit_watch_all_pids(Unit *u) {
        int r;

        assert(u);

        /* Adds all PIDs from our cgroup to the set of PIDs we
         * watch. This is a fallback logic for cases where we do not
         * get reliable cgroup empty notifications: we try to use
         * SIGCHLD as replacement. */

        if (!u->cgroup_path)
                return -ENOENT;

        r = cg_unified_controller(SYSTEMD_CGROUP_CONTROLLER);
        if (r < 0)
                return r;
        if (r > 0) /* On unified we can use proper notifications */
                return 0;

        return unit_watch_pids_in_path(u, u->cgroup_path);
}
static int on_cgroup_empty_event(sd_event_source *s, void *userdata) {
        Manager *m = userdata;
        Unit *u;
        int r;

        assert(s);
        assert(m);

        u = m->cgroup_empty_queue;
        if (!u)
                return 0;

        assert(u->in_cgroup_empty_queue);
        u->in_cgroup_empty_queue = false;
        LIST_REMOVE(cgroup_empty_queue, m->cgroup_empty_queue, u);

        if (m->cgroup_empty_queue) {
                /* More stuff queued, let's make sure we remain enabled */
                r = sd_event_source_set_enabled(s, SD_EVENT_ONESHOT);
                if (r < 0)
                        log_debug_errno(r, "Failed to reenable cgroup empty event source: %m");
        }

        unit_add_to_gc_queue(u);

        if (UNIT_VTABLE(u)->notify_cgroup_empty)
                UNIT_VTABLE(u)->notify_cgroup_empty(u);

        return 0;
}
void unit_add_to_cgroup_empty_queue(Unit *u) {
        int r;

        assert(u);

        /* Note that there are four different ways how cgroup empty events reach us:
         *
         * 1. On the unified hierarchy we get an inotify event on the cgroup
         *
         * 2. On the legacy hierarchy, when running in system mode, we get a datagram on the cgroup agent socket
         *
         * 3. On the legacy hierarchy, when running in user mode, we get a D-Bus signal on the system bus
         *
         * 4. On the legacy hierarchy, in service units we start watching all processes of the cgroup for SIGCHLD as
         *    soon as we get one SIGCHLD, to deal with unreliable cgroup notifications.
         *
         * Regardless which way we got the notification, we'll verify it here, and then add it to a separate
         * queue. This queue will be dispatched at a lower priority than the SIGCHLD handler, so that we always use
         * SIGCHLD if we can get it first, and only use the cgroup empty notifications if there's no SIGCHLD pending
         * (which might happen if the cgroup doesn't contain processes that are our own child, which is typically the
         * case for scope units). */

        if (u->in_cgroup_empty_queue)
                return;

        /* Let's verify that the cgroup is really empty */
        if (!u->cgroup_path)
                return;
        r = cg_is_empty_recursive(SYSTEMD_CGROUP_CONTROLLER, u->cgroup_path);
        if (r < 0) {
                log_unit_debug_errno(u, r, "Failed to determine whether cgroup %s is empty: %m", u->cgroup_path);
                return;
        }
        if (r == 0)
                return;

        LIST_PREPEND(cgroup_empty_queue, u->manager->cgroup_empty_queue, u);
        u->in_cgroup_empty_queue = true;

        /* Trigger the defer event */
        r = sd_event_source_set_enabled(u->manager->cgroup_empty_event_source, SD_EVENT_ONESHOT);
        if (r < 0)
                log_debug_errno(r, "Failed to enable cgroup empty event source: %m");
}
static int on_cgroup_inotify_event(sd_event_source *s, int fd, uint32_t revents, void *userdata) {
        Manager *m = userdata;

        assert(s);
        assert(fd >= 0);
        assert(m);

        for (;;) {
                union inotify_event_buffer buffer;
                struct inotify_event *e;
                ssize_t l;

                l = read(fd, &buffer, sizeof(buffer));
                if (l < 0) {
                        if (IN_SET(errno, EINTR, EAGAIN))
                                return 0;

                        return log_error_errno(errno, "Failed to read control group inotify events: %m");
                }

                FOREACH_INOTIFY_EVENT(e, buffer, l) {
                        Unit *u;

                        if (e->wd < 0)
                                /* Queue overflow has no watch descriptor */
                                continue;

                        if (e->mask & IN_IGNORED)
                                /* The watch was just removed */
                                continue;

                        u = hashmap_get(m->cgroup_inotify_wd_unit, INT_TO_PTR(e->wd));
                        if (!u) /* Note that inotify might deliver
                                 * events for a watch even after it
                                 * was removed, because it was queued
                                 * before the removal. Let's ignore
                                 * this here safely. */
                                continue;

                        unit_add_to_cgroup_empty_queue(u);
                }
        }
}
2161 int manager_setup_cgroup(Manager *m) {
2162 _cleanup_free_ char *path = NULL;
2163 const char *scope_path;
2166 #if 0 /// UNNEEDED by elogind
2172 /* 1. Determine hierarchy */
2173 m->cgroup_root = mfree(m->cgroup_root);
2174 #if 0 /// elogind is not init and must therefore search for PID 1 instead of self.
2175 r = cg_pid_get_path(SYSTEMD_CGROUP_CONTROLLER, 0, &m->cgroup_root);
2177 r = cg_pid_get_path(SYSTEMD_CGROUP_CONTROLLER, 1, &m->cgroup_root);
2180 return log_error_errno(r, "Cannot determine cgroup we are running in: %m");
2182 #if 0 /// elogind does not support systemd scopes and slices
2183 /* Chop off the init scope, if we are already located in it */
2184 e = endswith(m->cgroup_root, "/" SPECIAL_INIT_SCOPE);
2186 /* LEGACY: Also chop off the system slice if we are in
2187 * it. This is to support live upgrades from older systemd
2188 * versions where PID 1 was moved there. Also see
2189 * cg_get_root_path(). */
2190 if (!e && MANAGER_IS_SYSTEM(m)) {
2191 e = endswith(m->cgroup_root, "/" SPECIAL_SYSTEM_SLICE);
2193 e = endswith(m->cgroup_root, "/system"); /* even more legacy */
2199 log_debug_elogind("Cgroup Controller \"%s\" -> root \"%s\"",
2200 SYSTEMD_CGROUP_CONTROLLER, m->cgroup_root);
2201 /* And make sure to store away the root value without trailing slash, even for the root dir, so that we can
2202 * easily prepend it everywhere. */
2203 delete_trailing_chars(m->cgroup_root, "/");
2206 r = cg_get_path(SYSTEMD_CGROUP_CONTROLLER, m->cgroup_root, NULL, &path);
2208 return log_error_errno(r, "Cannot find cgroup mount point: %m");
2210 r = cg_unified_flush();
2212 return log_error_errno(r, "Couldn't determine if we are running in the unified hierarchy: %m");
2214 all_unified = cg_all_unified();
2215 if (all_unified < 0)
2216 return log_error_errno(all_unified, "Couldn't determine whether we are in all unified mode: %m");
2217 if (all_unified > 0)
2218 log_debug("Unified cgroup hierarchy is located at %s.", path);
2220 r = cg_unified_controller(SYSTEMD_CGROUP_CONTROLLER);
2222 return log_error_errno(r, "Failed to determine whether systemd's own controller is in unified mode: %m");
2224 log_debug("Unified cgroup hierarchy is located at %s. Controllers are on legacy hierarchies.", path);
2226 log_debug("Using cgroup controller " SYSTEMD_CGROUP_CONTROLLER_LEGACY ". File system hierarchy is at %s.", path);
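/* The three layouts distinguished above (all-unified, hybrid, legacy) are
 * probed internally by cg_unified_flush(); a rough standalone approximation
 * is to statfs() the cgroup mount and look at the filesystem magic. This is
 * a simplification of what the real helpers do, shown for orientation. */
#if 0 /// Illustrative sketch only, not part of this file
#include <linux/magic.h>
#include <stdio.h>
#include <sys/vfs.h>

int main(void) {
        struct statfs fs;

        if (statfs("/sys/fs/cgroup", &fs) < 0)
                return 1;

        if (fs.f_type == CGROUP2_SUPER_MAGIC)
                printf("unified: cgroup2 is mounted directly on /sys/fs/cgroup\n");
        else if (fs.f_type == TMPFS_MAGIC)
                printf("legacy or hybrid: tmpfs with per-controller mounts below\n");
        return 0;
}
#endif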
2229 #if 0 /// elogind is not init, and does not install the agent here.
2230 /* 3. Allocate cgroup empty defer event source */
2231 m->cgroup_empty_event_source = sd_event_source_unref(m->cgroup_empty_event_source);
2232 r = sd_event_add_defer(m->event, &m->cgroup_empty_event_source, on_cgroup_empty_event, m);
2234 return log_error_errno(r, "Failed to create cgroup empty event source: %m");
2236 r = sd_event_source_set_priority(m->cgroup_empty_event_source, SD_EVENT_PRIORITY_NORMAL-5);
2238 return log_error_errno(r, "Failed to set priority of cgroup empty event source: %m");
2240 r = sd_event_source_set_enabled(m->cgroup_empty_event_source, SD_EVENT_OFF);
2242 return log_error_errno(r, "Failed to disable cgroup empty event source: %m");
2244 (void) sd_event_source_set_description(m->cgroup_empty_event_source, "cgroup-empty");
2246 /* 4. Install notifier inotify object, or agent */
2247 if (cg_unified_controller(SYSTEMD_CGROUP_CONTROLLER) > 0) {
2249 /* In the unified hierarchy we can get cgroup empty notifications via inotify. */
2251 m->cgroup_inotify_event_source = sd_event_source_unref(m->cgroup_inotify_event_source);
2252 safe_close(m->cgroup_inotify_fd);
2254 m->cgroup_inotify_fd = inotify_init1(IN_NONBLOCK|IN_CLOEXEC);
2255 if (m->cgroup_inotify_fd < 0)
2256 return log_error_errno(errno, "Failed to create control group inotify object: %m");
2258 r = sd_event_add_io(m->event, &m->cgroup_inotify_event_source, m->cgroup_inotify_fd, EPOLLIN, on_cgroup_inotify_event, m);
2260 return log_error_errno(r, "Failed to watch control group inotify object: %m");
2262 /* Process cgroup empty notifications early, but after service notifications and SIGCHLD. Also
2263 * see handling of cgroup agent notifications, for the classic cgroup hierarchy support. */
2264 r = sd_event_source_set_priority(m->cgroup_inotify_event_source, SD_EVENT_PRIORITY_NORMAL-4);
2266 return log_error_errno(r, "Failed to set priority of inotify event source: %m");
2268 (void) sd_event_source_set_description(m->cgroup_inotify_event_source, "cgroup-inotify");
2270 } else if (MANAGER_IS_SYSTEM(m) && m->test_run_flags == 0) {
2272 /* On the legacy hierarchy we only get notifications via cgroup agents. (Which isn't really reliable,
2273 * since it does not generate events when control groups with children run empty.) */
2275 r = cg_install_release_agent(SYSTEMD_CGROUP_CONTROLLER, SYSTEMD_CGROUP_AGENT_PATH);
2277 log_warning_errno(r, "Failed to install release agent, ignoring: %m");
2279 log_debug("Installed release agent.");
2281 log_debug("Release agent already installed.");
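/* On the legacy hierarchy the kernel itself spawns the configured agent
 * whenever a cgroup runs empty: the agent binary is written into the
 * "release_agent" file at the hierarchy root, and "notify_on_release" must
 * be set to 1. A rough standalone equivalent of what
 * cg_install_release_agent() arranges; both paths below are illustrative,
 * and the real helper checks the current values before writing. */
#if 0 /// Illustrative sketch only, not part of this file
#include <stdio.h>

static int write_one_line(const char *path, const char *s) {
        FILE *f = fopen(path, "we");
        if (!f)
                return -1;
        fputs(s, f);
        fputc('\n', f);
        return fclose(f);
}

int main(void) {
        if (write_one_line("/sys/fs/cgroup/elogind/release_agent",
                           "/usr/libexec/elogind-cgroups-agent") < 0)
                return 1;
        return write_one_line("/sys/fs/cgroup/elogind/notify_on_release", "1") < 0;
}
#endif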
2284 /* 5. Make sure we are in the special "init.scope" unit in the root slice. */
2285 scope_path = strjoina(m->cgroup_root, "/" SPECIAL_INIT_SCOPE);
2286 r = cg_create_and_attach(SYSTEMD_CGROUP_CONTROLLER, scope_path, 0);
2288 /* Also, move all other userspace processes remaining in the root cgroup into that scope. */
2289 r = cg_migrate(SYSTEMD_CGROUP_CONTROLLER, m->cgroup_root, SYSTEMD_CGROUP_CONTROLLER, scope_path, 0);
2291 log_warning_errno(r, "Couldn't move remaining userspace processes, ignoring: %m");
2294 * This method is in core, and normally called by systemd
2295 * being init. As elogind is never init, we cannot install
2296 * our agent here. We do so when mounting our cgroup file
2297 * system, that is, only if elogind is its own tiny controller.
2298 * Further, elogind is not meant to run in the systemd init scope. */
2299 if (MANAGER_IS_SYSTEM(m))
2300 // we are our own cgroup controller
2301 scope_path = strjoina("");
2302 else if (streq(m->cgroup_root, "/elogind"))
2303 // root already is our cgroup
2304 scope_path = strjoina(m->cgroup_root);
2306 // we have to create our own group
2307 scope_path = strjoina(m->cgroup_root, "/elogind");
2308 r = cg_create_and_attach(SYSTEMD_CGROUP_CONTROLLER, scope_path, 0);
2310 log_debug_elogind("Created control group \"%s\"", scope_path);
2312 /* 6. And pin it, so that it cannot be unmounted */
2313 safe_close(m->pin_cgroupfs_fd);
2314 m->pin_cgroupfs_fd = open(path, O_RDONLY|O_CLOEXEC|O_DIRECTORY|O_NOCTTY|O_NONBLOCK);
2315 if (m->pin_cgroupfs_fd < 0)
2316 return log_error_errno(errno, "Failed to open pin file: %m");
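/* Step 6 in isolation: holding an O_DIRECTORY fd open on a mount point
 * "pins" the filesystem, i.e. a plain umount() of it fails with EBUSY for
 * as long as the fd lives (a lazy umount2(..., MNT_DETACH) still works).
 * The path is illustrative. */
#if 0 /// Illustrative sketch only, not part of this file
#include <fcntl.h>
#include <unistd.h>

int main(void) {
        int pin_fd;

        pin_fd = open("/sys/fs/cgroup", O_RDONLY|O_CLOEXEC|O_DIRECTORY|O_NOCTTY|O_NONBLOCK);
        if (pin_fd < 0)
                return 1;

        /* While pin_fd is open, the mount cannot be unmounted normally. */
        pause();

        close(pin_fd);
        return 0;
}
#endif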
2318 #if 0 /// this is from the cgroup migration above that elogind does not need.
2319 } else if (r < 0 && !m->test_run_flags)
2320 return log_error_errno(r, "Failed to create %s control group: %m", scope_path);
2323 /* 7. Always enable hierarchical support if it exists... */
2324 if (!all_unified && m->test_run_flags == 0)
2325 (void) cg_set_attribute("memory", "/", "memory.use_hierarchy", "1");
2327 /* 8. Figure out which controllers are supported, and log about it */
2328 r = cg_mask_supported(&m->cgroup_supported);
2330 return log_error_errno(r, "Failed to determine supported controllers: %m");
2331 for (c = 0; c < _CGROUP_CONTROLLER_MAX; c++)
2332 log_debug("Controller '%s' supported: %s", cgroup_controller_to_string(c), yes_no(m->cgroup_supported & CGROUP_CONTROLLER_TO_MASK(c)));
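/* The logging loop above in isolation: supported controllers are a bitmask
 * with one bit per controller enum value. Everything here (names, the
 * example mask) is a stand-in for cg_mask_supported()'s real output. */
#if 0 /// Illustrative sketch only, not part of this file
#include <stdio.h>

static const char *const names[] = { "cpu", "cpuacct", "io", "blkio", "memory", "devices", "pids" };

int main(void) {
        unsigned supported = 0x73; /* made-up result of a support probe */

        for (unsigned c = 0; c < sizeof(names)/sizeof(names[0]); c++)
                printf("Controller '%s' supported: %s\n",
                       names[c], (supported & (1u << c)) ? "yes" : "no");
        return 0;
}
#endif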
2337 void manager_shutdown_cgroup(Manager *m, bool delete) {
2340 #if 0 /// elogind is not init
2341 /* We can't really delete the group, since we are in it. But we can trim it. */
2343 if (delete && m->cgroup_root)
2344 (void) cg_trim(SYSTEMD_CGROUP_CONTROLLER, m->cgroup_root, false);
2346 m->cgroup_empty_event_source = sd_event_source_unref(m->cgroup_empty_event_source);
2348 m->cgroup_inotify_wd_unit = hashmap_free(m->cgroup_inotify_wd_unit);
2350 m->cgroup_inotify_event_source = sd_event_source_unref(m->cgroup_inotify_event_source);
2351 m->cgroup_inotify_fd = safe_close(m->cgroup_inotify_fd);
2354 m->pin_cgroupfs_fd = safe_close(m->pin_cgroupfs_fd);
2356 m->cgroup_root = mfree(m->cgroup_root);
2359 #if 0 /// UNNEEDED by elogind
2360 Unit* manager_get_unit_by_cgroup(Manager *m, const char *cgroup) {
2367 u = hashmap_get(m->cgroup_unit, cgroup);
2371 p = strdupa(cgroup);
2375 e = strrchr(p, '/');
2377 return hashmap_get(m->cgroup_unit, SPECIAL_ROOT_SLICE);
2381 u = hashmap_get(m->cgroup_unit, p);
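/* The elided tail of manager_get_unit_by_cgroup() walks the path upwards:
 * when the exact cgroup path has no unit, the last "/component" is chopped
 * off and the lookup retried, falling back to the root slice at the top.
 * A standalone sketch of that walk, with the hashmap replaced by a stub: */
#if 0 /// Illustrative sketch only, not part of this file
#include <stdio.h>
#include <string.h>

/* Stand-in for hashmap_get(m->cgroup_unit, p); nonzero means "unit found". */
static int lookup(const char *p) {
        return strcmp(p, "/system.slice") == 0;
}

int main(void) {
        char p[] = "/system.slice/foo.service/sub";

        for (;;) {
                char *e;

                if (lookup(p)) {
                        printf("owning unit found at: %s\n", p);
                        return 0;
                }

                e = strrchr(p, '/');
                if (!e || e == p) {
                        printf("no unit on the path, fall back to the root slice\n");
                        return 0;
                }
                *e = 0; /* retry with the parent path */
        }
}
#endif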
2387 Unit *manager_get_unit_by_pid_cgroup(Manager *m, pid_t pid) {
2388 _cleanup_free_ char *cgroup = NULL;
2392 if (!pid_is_valid(pid))
2395 if (cg_pid_get_path(SYSTEMD_CGROUP_CONTROLLER, pid, &cgroup) < 0)
2398 return manager_get_unit_by_cgroup(m, cgroup);
2401 Unit *manager_get_unit_by_pid(Manager *m, pid_t pid) {
2406 /* Note that a process might be owned by multiple units; we return only one here, which is good enough for most
2407 * cases, though not strictly correct. We prefer the one reported by cgroup membership, as that's the most
2408 * relevant one as children of the process will be assigned to that one, too, before all else. */
2410 if (!pid_is_valid(pid))
2413 if (pid == getpid_cached())
2414 return hashmap_get(m->units, SPECIAL_INIT_SCOPE);
2416 u = manager_get_unit_by_pid_cgroup(m, pid);
2420 u = hashmap_get(m->watch_pids, PID_TO_PTR(pid));
2424 array = hashmap_get(m->watch_pids, PID_TO_PTR(-pid));
2432 #if 0 /// elogind must substitute this with its own variant
2433 int manager_notify_cgroup_empty(Manager *m, const char *cgroup) {
2439 /* Called on the legacy hierarchy whenever we get an explicit cgroup notification from the cgroup agent process
2440 * or from the --system instance */
2442 log_debug("Got cgroup empty notification for: %s", cgroup);
2444 u = manager_get_unit_by_cgroup(m, cgroup);
2448 unit_add_to_cgroup_empty_queue(u);
2452 int manager_notify_cgroup_empty(Manager *m, const char *cgroup) {
2458 log_debug("Got cgroup empty notification for: %s", cgroup);
2460 s = hashmap_get(m->sessions, cgroup);
2463 session_finalize(s);
2466 log_warning("Session not found: %s", cgroup);
2471 #if 0 /// UNNEEDED by elogind
2472 int unit_get_memory_current(Unit *u, uint64_t *ret) {
2473 _cleanup_free_ char *v = NULL;
2479 if (!UNIT_CGROUP_BOOL(u, memory_accounting))
2482 if (!u->cgroup_path)
2485 /* The root cgroup doesn't expose this information, let's get it from /proc instead */
2486 if (unit_has_root_cgroup(u))
2487 return procfs_memory_get_current(ret);
2489 if ((u->cgroup_realized_mask & CGROUP_MASK_MEMORY) == 0)
2492 r = cg_all_unified();
2496 r = cg_get_attribute("memory", u->cgroup_path, "memory.current", &v);
2498 r = cg_get_attribute("memory", u->cgroup_path, "memory.usage_in_bytes", &v);
2504 return safe_atou64(v, ret);
2507 int unit_get_tasks_current(Unit *u, uint64_t *ret) {
2508 _cleanup_free_ char *v = NULL;
2514 if (!UNIT_CGROUP_BOOL(u, tasks_accounting))
2517 if (!u->cgroup_path)
2520 /* The root cgroup doesn't expose this information, let's get it from /proc instead */
2521 if (unit_has_root_cgroup(u))
2522 return procfs_tasks_get_current(ret);
2524 if ((u->cgroup_realized_mask & CGROUP_MASK_PIDS) == 0)
2527 r = cg_get_attribute("pids", u->cgroup_path, "pids.current", &v);
2533 return safe_atou64(v, ret);
2536 static int unit_get_cpu_usage_raw(Unit *u, nsec_t *ret) {
2537 _cleanup_free_ char *v = NULL;
2544 if (!u->cgroup_path)
2547 /* The root cgroup doesn't expose this information, let's get it from /proc instead */
2548 if (unit_has_root_cgroup(u))
2549 return procfs_cpu_get_usage(ret);
2551 r = cg_all_unified();
2555 _cleanup_free_ char *val = NULL;
2558 if ((u->cgroup_realized_mask & CGROUP_MASK_CPU) == 0)
2561 r = cg_get_keyed_attribute("cpu", u->cgroup_path, "cpu.stat", STRV_MAKE("usage_usec"), &val);
2564 if (IN_SET(r, -ENOENT, -ENXIO))
2567 r = safe_atou64(val, &us);
2571 ns = us * NSEC_PER_USEC;
2573 if ((u->cgroup_realized_mask & CGROUP_MASK_CPUACCT) == 0)
2576 r = cg_get_attribute("cpuacct", u->cgroup_path, "cpuacct.usage", &v);
2582 r = safe_atou64(v, &ns);
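/* As the two branches above show, the unified hierarchy reports CPU time
 * as the "usage_usec" key of cpu.stat in microseconds, while the legacy
 * cpuacct controller reports nanoseconds, so the unified value is scaled
 * by NSEC_PER_USEC (1000). A sketch of the unified-side parsing, with a
 * made-up cgroup path: */
#if 0 /// Illustrative sketch only, not part of this file
#include <inttypes.h>
#include <stdio.h>
#include <string.h>

int main(void) {
        char key[64];
        uint64_t us;
        FILE *f;

        f = fopen("/sys/fs/cgroup/system.slice/cpu.stat", "re");
        if (!f)
                return 1;

        /* cpu.stat is a flat "key value" table, one entry per line. */
        while (fscanf(f, "%63s %" SCNu64, key, &us) == 2)
                if (strcmp(key, "usage_usec") == 0)
                        printf("cpu time: %" PRIu64 " ns\n", us * 1000);

        fclose(f);
        return 0;
}
#endif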
2591 int unit_get_cpu_usage(Unit *u, nsec_t *ret) {
2597 /* Retrieve the current CPU usage counter. This will subtract the CPU counter taken when the unit was
2598 * started. If the cgroup has been removed already, returns the last cached value. To cache the value, simply
2599 * call this function with a NULL return value. */
2601 if (!UNIT_CGROUP_BOOL(u, cpu_accounting))
2604 r = unit_get_cpu_usage_raw(u, &ns);
2605 if (r == -ENODATA && u->cpu_usage_last != NSEC_INFINITY) {
2606 /* If we can't get the CPU usage anymore (because the cgroup was already removed, for example), use our cached value. */
2610 *ret = u->cpu_usage_last;
2616 if (ns > u->cpu_usage_base)
2617 ns -= u->cpu_usage_base;
2621 u->cpu_usage_last = ns;
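/* The accounting pattern above, reduced to its core: remember a baseline
 * when the counter is reset, report raw minus baseline, and cache the last
 * good reading so it can still be served once the raw source (the cgroup)
 * is gone. All values are invented for the example. */
#if 0 /// Illustrative sketch only, not part of this file
#include <inttypes.h>
#include <stdio.h>

static uint64_t base = 0, last = UINT64_MAX;

/* ok == 0 simulates the -ENODATA case above (cgroup already removed). */
static uint64_t report(uint64_t raw, int ok) {
        if (!ok)
                return last != UINT64_MAX ? last : 0;
        last = raw > base ? raw - base : 0;
        return last;
}

int main(void) {
        base = 100; /* reset: remember the baseline */
        printf("%" PRIu64 "\n", report(250, 1)); /* 150 */
        printf("%" PRIu64 "\n", report(0, 0));   /* still 150, cached */
        return 0;
}
#endif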
2628 int unit_get_ip_accounting(
2630 CGroupIPAccountingMetric metric,
2637 assert(metric >= 0);
2638 assert(metric < _CGROUP_IP_ACCOUNTING_METRIC_MAX);
2641 if (!UNIT_CGROUP_BOOL(u, ip_accounting))
2644 fd = IN_SET(metric, CGROUP_IP_INGRESS_BYTES, CGROUP_IP_INGRESS_PACKETS) ?
2645 u->ip_accounting_ingress_map_fd :
2646 u->ip_accounting_egress_map_fd;
2650 if (IN_SET(metric, CGROUP_IP_INGRESS_BYTES, CGROUP_IP_EGRESS_BYTES))
2651 r = bpf_firewall_read_accounting(fd, &value, NULL);
2653 r = bpf_firewall_read_accounting(fd, NULL, &value);
2657 /* Add in additional metrics from a previous runtime. Note that when reexecing/reloading the daemon we compile
2658 * all BPF programs and maps anew, but serialize the old counters. When deserializing we store them in the
2659 * ip_accounting_extra[] field, and add them in here transparently. */
2661 *ret = value + u->ip_accounting_extra[metric];
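/* The fd selection and carry-over above in isolation: ingress metrics read
 * the ingress map, egress metrics the egress map, and counters serialized
 * by the previous runtime are added on top transparently. Every name and
 * value below is a stand-in (the real read goes through
 * bpf_firewall_read_accounting()). */
#if 0 /// Illustrative sketch only, not part of this file
#include <inttypes.h>
#include <stdio.h>

enum metric { INGRESS_BYTES, INGRESS_PACKETS, EGRESS_BYTES, EGRESS_PACKETS, METRIC_MAX };

static uint64_t extra[METRIC_MAX]; /* deserialized from the previous runtime */

static uint64_t read_counter(int fd) { (void) fd; return 1000; } /* BPF map read stub */

int main(void) {
        enum metric m = EGRESS_BYTES;
        int ingress_fd = 3, egress_fd = 4; /* made-up map fds */
        int fd = (m == INGRESS_BYTES || m == INGRESS_PACKETS) ? ingress_fd : egress_fd;

        extra[EGRESS_BYTES] = 500;
        printf("metric %d: %" PRIu64 "\n", (int) m, read_counter(fd) + extra[m]);
        return 0;
}
#endif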
2666 int unit_reset_cpu_accounting(Unit *u) {
2672 u->cpu_usage_last = NSEC_INFINITY;
2674 r = unit_get_cpu_usage_raw(u, &ns);
2676 u->cpu_usage_base = 0;
2680 u->cpu_usage_base = ns;
2684 int unit_reset_ip_accounting(Unit *u) {
2689 if (u->ip_accounting_ingress_map_fd >= 0)
2690 r = bpf_firewall_reset_accounting(u->ip_accounting_ingress_map_fd);
2692 if (u->ip_accounting_egress_map_fd >= 0)
2693 q = bpf_firewall_reset_accounting(u->ip_accounting_egress_map_fd);
2695 zero(u->ip_accounting_extra);
2697 return r < 0 ? r : q;
2700 void unit_invalidate_cgroup(Unit *u, CGroupMask m) {
2703 if (!UNIT_HAS_CGROUP_CONTEXT(u))
2709 /* always invalidate compat pairs together */
2710 if (m & (CGROUP_MASK_IO | CGROUP_MASK_BLKIO))
2711 m |= CGROUP_MASK_IO | CGROUP_MASK_BLKIO;
2713 if (m & (CGROUP_MASK_CPU | CGROUP_MASK_CPUACCT))
2714 m |= CGROUP_MASK_CPU | CGROUP_MASK_CPUACCT;
2716 if ((u->cgroup_realized_mask & m) == 0) /* NOP? */
2719 u->cgroup_realized_mask &= ~m;
2720 unit_add_to_cgroup_realize_queue(u);
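/* The pair widening above in isolation: if either member of a compat pair
 * is invalidated, both are, so legacy and unified settings never go out of
 * sync. Constants are simplified stand-ins for the CGROUP_MASK_* values. */
#if 0 /// Illustrative sketch only, not part of this file
#include <stdio.h>

enum { MASK_CPU = 1 << 0, MASK_CPUACCT = 1 << 1, MASK_IO = 1 << 2, MASK_BLKIO = 1 << 3 };

int main(void) {
        unsigned m = MASK_IO; /* caller asked to invalidate IO only */

        if (m & (MASK_IO | MASK_BLKIO))
                m |= MASK_IO | MASK_BLKIO;
        if (m & (MASK_CPU | MASK_CPUACCT))
                m |= MASK_CPU | MASK_CPUACCT;

        printf("widened mask: %#x\n", m); /* 0xc: IO and BLKIO together */
        return 0;
}
#endif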
2723 void unit_invalidate_cgroup_bpf(Unit *u) {
2726 if (!UNIT_HAS_CGROUP_CONTEXT(u))
2729 if (u->cgroup_bpf_state == UNIT_CGROUP_BPF_INVALIDATED) /* NOP? */
2732 u->cgroup_bpf_state = UNIT_CGROUP_BPF_INVALIDATED;
2733 unit_add_to_cgroup_realize_queue(u);
2735 /* If we are a slice unit, we also need to compile a new BPF program for all our children, as the IP access
2736 * list of our children includes our own. */
2737 if (u->type == UNIT_SLICE) {
2742 HASHMAP_FOREACH_KEY(v, member, u->dependencies[UNIT_BEFORE], i) {
2746 if (UNIT_DEREF(member->slice) != u)
2749 unit_invalidate_cgroup_bpf(member);
2754 bool unit_cgroup_delegate(Unit *u) {
2759 if (!UNIT_VTABLE(u)->can_delegate)
2762 c = unit_get_cgroup_context(u);
2769 void manager_invalidate_startup_units(Manager *m) {
2775 SET_FOREACH(u, m->startup_units, i)
2776 unit_invalidate_cgroup(u, CGROUP_MASK_CPU|CGROUP_MASK_IO|CGROUP_MASK_BLKIO);
2779 static const char* const cgroup_device_policy_table[_CGROUP_DEVICE_POLICY_MAX] = {
2780 [CGROUP_AUTO] = "auto",
2781 [CGROUP_CLOSED] = "closed",
2782 [CGROUP_STRICT] = "strict",
2785 DEFINE_STRING_TABLE_LOOKUP(cgroup_device_policy, CGroupDevicePolicy);