1 /* SPDX-License-Identifier: LGPL-2.1+ */
3 This file is part of systemd.
5 Copyright 2013 Lennart Poettering
7 systemd is free software; you can redistribute it and/or modify it
8 under the terms of the GNU Lesser General Public License as published by
9 the Free Software Foundation; either version 2.1 of the License, or
10 (at your option) any later version.
12 systemd is distributed in the hope that it will be useful, but
13 WITHOUT ANY WARRANTY; without even the implied warranty of
14 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
15 Lesser General Public License for more details.
17 You should have received a copy of the GNU Lesser General Public License
18 along with systemd; If not, see <http://www.gnu.org/licenses/>.
24 #include "alloc-util.h"
25 //#include "blockdev-util.h"
26 //#include "bpf-firewall.h"
27 //#include "bus-error.h"
28 #include "cgroup-util.h"
33 #include "parse-util.h"
34 #include "path-util.h"
35 #include "process-util.h"
36 //#include "procfs-util.h"
37 //#include "special.h"
38 #include "stdio-util.h"
39 #include "string-table.h"
40 #include "string-util.h"
/* CFS bandwidth period used for CPUQuota=: 100ms, expressed in microseconds. */
42 #define CGROUP_CPU_QUOTA_PERIOD_USEC ((usec_t) 100 * USEC_PER_MSEC)
44 bool unit_has_root_cgroup(Unit *u) {
47         /* Returns whether this unit manages the root cgroup. Note that this is different from being named "-.slice",
48          * as inside of containers the root slice won't be identical to the root cgroup. */
/* An unset (empty) cgroup_path is treated the same as "/": both denote the root cgroup. */
53         return isempty(u->cgroup_path) || path_equal(u->cgroup_path, "/");
56 #if 0 /// UNNEEDED by elogind
/* Emit the legacy<->unified translation warning at most once per process;
 * guarded by a static flag that is never reset. */
57 static void cgroup_compat_warn(void) {
58         static bool cgroup_compat_warned = false;
60         if (cgroup_compat_warned)
63         log_warning("cgroup compatibility translation between legacy and unified hierarchy settings activated. See cgroup-compat debug messages for details.");
64         cgroup_compat_warned = true;
/* Logs one compat-translation event: triggers the one-time warning above, then
 * a per-call debug line with the details. */
67 #define log_cgroup_compat(unit, fmt, ...) do {                                  \
68                 cgroup_compat_warn();                                           \
69                 log_unit_debug(unit, "cgroup-compat: " fmt, ##__VA_ARGS__);     \
/* Fill a zero-initialized CGroupContext with the "unset"/kernel-default sentinel
 * values for every limit and weight field. */
72 void cgroup_context_init(CGroupContext *c) {
75         /* Initialize everything to the kernel defaults, assuming the
76          * structure is preinitialized to 0 */
78         c->cpu_weight = CGROUP_WEIGHT_INVALID;
79         c->startup_cpu_weight = CGROUP_WEIGHT_INVALID;
80         c->cpu_quota_per_sec_usec = USEC_INFINITY;
82         c->cpu_shares = CGROUP_CPU_SHARES_INVALID;
83         c->startup_cpu_shares = CGROUP_CPU_SHARES_INVALID;
85         c->memory_high = CGROUP_LIMIT_MAX;
86         c->memory_max = CGROUP_LIMIT_MAX;
87         c->memory_swap_max = CGROUP_LIMIT_MAX;
89         c->memory_limit = CGROUP_LIMIT_MAX;
91         c->io_weight = CGROUP_WEIGHT_INVALID;
92         c->startup_io_weight = CGROUP_WEIGHT_INVALID;
94         c->blockio_weight = CGROUP_BLKIO_WEIGHT_INVALID;
95         c->startup_blockio_weight = CGROUP_BLKIO_WEIGHT_INVALID;
/* (uint64_t) -1 — presumably identical to CGROUP_LIMIT_MAX; verify, and prefer
 * the named constant for consistency with the fields above. */
97         c->tasks_max = (uint64_t) -1;
/* The five helpers below each unlink one entry from the corresponding list in
 * the context (the elided remainder presumably frees the entry — verify). */
100 void cgroup_context_free_device_allow(CGroupContext *c, CGroupDeviceAllow *a) {
104         LIST_REMOVE(device_allow, c->device_allow, a);
109 void cgroup_context_free_io_device_weight(CGroupContext *c, CGroupIODeviceWeight *w) {
113         LIST_REMOVE(device_weights, c->io_device_weights, w);
118 void cgroup_context_free_io_device_limit(CGroupContext *c, CGroupIODeviceLimit *l) {
122         LIST_REMOVE(device_limits, c->io_device_limits, l);
127 void cgroup_context_free_blockio_device_weight(CGroupContext *c, CGroupBlockIODeviceWeight *w) {
131         LIST_REMOVE(device_weights, c->blockio_device_weights, w);
136 void cgroup_context_free_blockio_device_bandwidth(CGroupContext *c, CGroupBlockIODeviceBandwidth *b) {
140         LIST_REMOVE(device_bandwidths, c->blockio_device_bandwidths, b);
/* Release everything a CGroupContext owns: drain each device list via its
 * free helper (each call removes the head, so the loops terminate), then free
 * both IP access lists. */
145 void cgroup_context_done(CGroupContext *c) {
148         while (c->io_device_weights)
149                 cgroup_context_free_io_device_weight(c, c->io_device_weights);
151         while (c->io_device_limits)
152                 cgroup_context_free_io_device_limit(c, c->io_device_limits);
154         while (c->blockio_device_weights)
155                 cgroup_context_free_blockio_device_weight(c, c->blockio_device_weights);
157         while (c->blockio_device_bandwidths)
158                 cgroup_context_free_blockio_device_bandwidth(c, c->blockio_device_bandwidths);
160         while (c->device_allow)
161                 cgroup_context_free_device_allow(c, c->device_allow);
163         c->ip_address_allow = ip_address_access_free_all(c->ip_address_allow);
164         c->ip_address_deny = ip_address_access_free_all(c->ip_address_deny);
/* Dump all settings of a CGroupContext to f in "Key=Value" form, each line
 * prefixed with `prefix` (NULL is treated as ""). Used for debugging/
 * introspection output; the format is not parsed back. */
167 void cgroup_context_dump(CGroupContext *c, FILE* f, const char *prefix) {
168         CGroupIODeviceLimit *il;
169         CGroupIODeviceWeight *iw;
170         CGroupBlockIODeviceBandwidth *b;
171         CGroupBlockIODeviceWeight *w;
172         CGroupDeviceAllow *a;
173         IPAddressAccessItem *iaai;
174         char u[FORMAT_TIMESPAN_MAX];
179         prefix = strempty(prefix);
182                 "%sCPUAccounting=%s\n"
183                 "%sIOAccounting=%s\n"
184                 "%sBlockIOAccounting=%s\n"
185                 "%sMemoryAccounting=%s\n"
186                 "%sTasksAccounting=%s\n"
187                 "%sIPAccounting=%s\n"
188                 "%sCPUWeight=%" PRIu64 "\n"
189                 "%sStartupCPUWeight=%" PRIu64 "\n"
190                 "%sCPUShares=%" PRIu64 "\n"
191                 "%sStartupCPUShares=%" PRIu64 "\n"
192                 "%sCPUQuotaPerSecSec=%s\n"
193                 "%sIOWeight=%" PRIu64 "\n"
194                 "%sStartupIOWeight=%" PRIu64 "\n"
195                 "%sBlockIOWeight=%" PRIu64 "\n"
196                 "%sStartupBlockIOWeight=%" PRIu64 "\n"
197                 "%sMemoryLow=%" PRIu64 "\n"
198                 "%sMemoryHigh=%" PRIu64 "\n"
199                 "%sMemoryMax=%" PRIu64 "\n"
200                 "%sMemorySwapMax=%" PRIu64 "\n"
201                 "%sMemoryLimit=%" PRIu64 "\n"
202                 "%sTasksMax=%" PRIu64 "\n"
203                 "%sDevicePolicy=%s\n"
205                 prefix, yes_no(c->cpu_accounting),
206                 prefix, yes_no(c->io_accounting),
207                 prefix, yes_no(c->blockio_accounting),
208                 prefix, yes_no(c->memory_accounting),
209                 prefix, yes_no(c->tasks_accounting),
210                 prefix, yes_no(c->ip_accounting),
211                 prefix, c->cpu_weight,
212                 prefix, c->startup_cpu_weight,
213                 prefix, c->cpu_shares,
214                 prefix, c->startup_cpu_shares,
215                 prefix, format_timespan(u, sizeof(u), c->cpu_quota_per_sec_usec, 1),
216                 prefix, c->io_weight,
217                 prefix, c->startup_io_weight,
218                 prefix, c->blockio_weight,
219                 prefix, c->startup_blockio_weight,
220                 prefix, c->memory_low,
221                 prefix, c->memory_high,
222                 prefix, c->memory_max,
223                 prefix, c->memory_swap_max,
224                 prefix, c->memory_limit,
225                 prefix, c->tasks_max,
226                 prefix, cgroup_device_policy_to_string(c->device_policy),
227                 prefix, yes_no(c->delegate));
/* Delegated controller mask is rendered as a space-separated string. */
230                 _cleanup_free_ char *t = NULL;
232                 (void) cg_mask_to_string(c->delegate_controllers, &t);
234                 fprintf(f, "%sDelegateControllers=%s\n",
/* One line per configured list entry follows for each of the device/IO lists. */
239         LIST_FOREACH(device_allow, a, c->device_allow)
241                         "%sDeviceAllow=%s %s%s%s\n",
244                         a->r ? "r" : "", a->w ? "w" : "", a->m ? "m" : "");
246         LIST_FOREACH(device_weights, iw, c->io_device_weights)
248                         "%sIODeviceWeight=%s %" PRIu64,
253         LIST_FOREACH(device_limits, il, c->io_device_limits) {
254                 char buf[FORMAT_BYTES_MAX];
255                 CGroupIOLimitType type;
/* Only limits that differ from the per-type default are printed. */
257                 for (type = 0; type < _CGROUP_IO_LIMIT_TYPE_MAX; type++)
258                         if (il->limits[type] != cgroup_io_limit_defaults[type])
262                                         cgroup_io_limit_type_to_string(type),
264                                         format_bytes(buf, sizeof(buf), il->limits[type]));
267         LIST_FOREACH(device_weights, w, c->blockio_device_weights)
269                         "%sBlockIODeviceWeight=%s %" PRIu64,
274         LIST_FOREACH(device_bandwidths, b, c->blockio_device_bandwidths) {
275                 char buf[FORMAT_BYTES_MAX];
/* CGROUP_LIMIT_MAX means "no limit configured", hence skipped. */
277                 if (b->rbps != CGROUP_LIMIT_MAX)
279                                 "%sBlockIOReadBandwidth=%s %s\n",
282                                 format_bytes(buf, sizeof(buf), b->rbps));
283                 if (b->wbps != CGROUP_LIMIT_MAX)
285                                 "%sBlockIOWriteBandwidth=%s %s\n",
288                                 format_bytes(buf, sizeof(buf), b->wbps));
291         LIST_FOREACH(items, iaai, c->ip_address_allow) {
292                 _cleanup_free_ char *k = NULL;
294                 (void) in_addr_to_string(iaai->family, &iaai->address, &k);
295                 fprintf(f, "%sIPAddressAllow=%s/%u\n", prefix, strnull(k), iaai->prefixlen);
298         LIST_FOREACH(items, iaai, c->ip_address_deny) {
299                 _cleanup_free_ char *k = NULL;
301                 (void) in_addr_to_string(iaai->family, &iaai->address, &k);
302                 fprintf(f, "%sIPAddressDeny=%s/%u\n", prefix, strnull(k), iaai->prefixlen);
/* Resolve path p to the block device (major:minor) backing it: a block device
 * node maps to itself; a regular file maps to the device of the file system it
 * lives on, resolved to the whole disk if it is a partition. Logs and returns
 * a negative errno-style value on failure (elided paths — verify). */
306 static int lookup_block_device(const char *p, dev_t *dev) {
315                 return log_warning_errno(errno, "Couldn't stat device %s: %m", p);
317         if (S_ISBLK(st.st_mode))
/* major() == 0 indicates a virtual/non-local file system with no usable device. */
319         else if (major(st.st_dev) != 0) {
320                 /* If this is not a device node then find the block
321                  * device this file is stored on */
324                 /* If this is a partition, try to get the originating
326                 (void) block_get_whole_disk(*dev, dev);
328                 log_warning("%s is not a block device and file system block device cannot be determined or is not local.", p);
/* Add one device node to the cgroup's devices.allow list. `acc` is the access
 * string ("r"/"w"/"m" combination). A node path starting with '-' marks the
 * entry as optional: ENOENT from stat() is then silently ignored. */
335 static int whitelist_device(const char *path, const char *node, const char *acc) {
336         char buf[2+DECIMAL_STR_MAX(dev_t)*2+2+4];
338         bool ignore_notfound;
344         if (node[0] == '-') {
345                 /* Non-existent paths starting with "-" must be silently ignored */
347                 ignore_notfound = true;
349                 ignore_notfound = false;
351         if (stat(node, &st) < 0) {
352                 if (errno == ENOENT && ignore_notfound)
355                 return log_warning_errno(errno, "Couldn't stat device %s: %m", node);
/* Only character and block devices are meaningful for the devices controller. */
358         if (!S_ISCHR(st.st_mode) && !S_ISBLK(st.st_mode)) {
359                 log_warning("%s is not a device.", node);
365                  S_ISCHR(st.st_mode) ? 'c' : 'b',
366                  major(st.st_rdev), minor(st.st_rdev),
369         r = cg_set_attribute("devices", path, "devices.allow", buf);
/* EROFS/ENOENT/EINVAL/EACCES are expected in containers / read-only trees, so demoted to debug. */
371                 log_full_errno(IN_SET(r, -ENOENT, -EROFS, -EINVAL, -EACCES) ? LOG_DEBUG : LOG_WARNING, r,
372                                "Failed to set devices.allow on %s: %m", path);
/* Whitelist every device whose driver name matches the fnmatch() pattern
 * `name` and whose type ('b' or 'c') matches, by scanning /proc/devices and
 * adding a "<type> <major>:* <acc>" rule per matching major number. */
377 static int whitelist_major(const char *path, const char *name, char type, const char *acc) {
378         _cleanup_fclose_ FILE *f = NULL;
385         assert(IN_SET(type, 'b', 'c'));
387         f = fopen("/proc/devices", "re");
389                 return log_warning_errno(errno, "Cannot open /proc/devices to resolve %s (%c): %m", name, type);
391         FOREACH_LINE(line, f, goto fail) {
392                 char buf[2+DECIMAL_STR_MAX(unsigned)+3+4], *p, *w;
/* Section headers in /proc/devices select whether following majors are char or block. */
397                 if (type == 'c' && streq(line, "Character devices:")) {
402                 if (type == 'b' && streq(line, "Block devices:")) {
/* Each entry line is "<major> <driver-name>": split on whitespace, parse the major. */
417                 w = strpbrk(p, WHITESPACE);
422                 r = safe_atou(p, &maj);
429                 w += strspn(w, WHITESPACE);
431                 if (fnmatch(name, w, 0) != 0)
440                 r = cg_set_attribute("devices", path, "devices.allow", buf);
442                         log_full_errno(IN_SET(r, -ENOENT, -EROFS, -EINVAL, -EACCES) ? LOG_DEBUG : LOG_WARNING, r,
443                                        "Failed to set devices.allow on %s: %m", path);
/* fail: label target of FOREACH_LINE's error branch. */
449         return log_warning_errno(errno, "Failed to read /proc/devices: %m");
/* True if either the normal or the startup unified-hierarchy CPU weight is set. */
452 static bool cgroup_context_has_cpu_weight(CGroupContext *c) {
453         return c->cpu_weight != CGROUP_WEIGHT_INVALID ||
454                 c->startup_cpu_weight != CGROUP_WEIGHT_INVALID;
/* True if either the normal or the startup legacy CPU shares value is set. */
457 static bool cgroup_context_has_cpu_shares(CGroupContext *c) {
458         return c->cpu_shares != CGROUP_CPU_SHARES_INVALID ||
459                 c->startup_cpu_shares != CGROUP_CPU_SHARES_INVALID;
/* Effective CPU weight: startup value during manager start-up/initialization,
 * else the configured weight, else the kernel default. */
462 static uint64_t cgroup_context_cpu_weight(CGroupContext *c, ManagerState state) {
463         if (IN_SET(state, MANAGER_STARTING, MANAGER_INITIALIZING) &&
464             c->startup_cpu_weight != CGROUP_WEIGHT_INVALID)
465                 return c->startup_cpu_weight;
466         else if (c->cpu_weight != CGROUP_WEIGHT_INVALID)
467                 return c->cpu_weight;
469                 return CGROUP_WEIGHT_DEFAULT;
/* Same selection logic as above, for legacy-hierarchy CPU shares. */
472 static uint64_t cgroup_context_cpu_shares(CGroupContext *c, ManagerState state) {
473         if (IN_SET(state, MANAGER_STARTING, MANAGER_INITIALIZING) &&
474             c->startup_cpu_shares != CGROUP_CPU_SHARES_INVALID)
475                 return c->startup_cpu_shares;
476         else if (c->cpu_shares != CGROUP_CPU_SHARES_INVALID)
477                 return c->cpu_shares;
479                 return CGROUP_CPU_SHARES_DEFAULT;
/* Write cpu.weight and cpu.max for the unit on the unified hierarchy.
 * quota is a per-second usec value; USEC_INFINITY means unlimited ("max"). */
482 static void cgroup_apply_unified_cpu_config(Unit *u, uint64_t weight, uint64_t quota) {
483         char buf[MAX(DECIMAL_STR_MAX(uint64_t) + 1, (DECIMAL_STR_MAX(usec_t) + 1) * 2)];
486         xsprintf(buf, "%" PRIu64 "\n", weight);
487         r = cg_set_attribute("cpu", u->cgroup_path, "cpu.weight", buf);
489                 log_unit_full(u, IN_SET(r, -ENOENT, -EROFS, -EACCES) ? LOG_DEBUG : LOG_WARNING, r,
490                               "Failed to set cpu.weight: %m");
/* cpu.max takes "<quota> <period>"; scale the per-second quota to the 100ms period. */
492         if (quota != USEC_INFINITY)
493                 xsprintf(buf, USEC_FMT " " USEC_FMT "\n",
494                          quota * CGROUP_CPU_QUOTA_PERIOD_USEC / USEC_PER_SEC, CGROUP_CPU_QUOTA_PERIOD_USEC);
496                 xsprintf(buf, "max " USEC_FMT "\n", CGROUP_CPU_QUOTA_PERIOD_USEC);
498         r = cg_set_attribute("cpu", u->cgroup_path, "cpu.max", buf);
501                 log_unit_full(u, IN_SET(r, -ENOENT, -EROFS, -EACCES) ? LOG_DEBUG : LOG_WARNING, r,
502                               "Failed to set cpu.max: %m");
/* Write cpu.shares, cpu.cfs_period_us and cpu.cfs_quota_us for the unit on the
 * legacy hierarchy. USEC_INFINITY quota maps to the kernel's "-1" (unlimited). */
505 static void cgroup_apply_legacy_cpu_config(Unit *u, uint64_t shares, uint64_t quota) {
506         char buf[MAX(DECIMAL_STR_MAX(uint64_t), DECIMAL_STR_MAX(usec_t)) + 1];
509         xsprintf(buf, "%" PRIu64 "\n", shares);
510         r = cg_set_attribute("cpu", u->cgroup_path, "cpu.shares", buf);
512                 log_unit_full(u, IN_SET(r, -ENOENT, -EROFS, -EACCES) ? LOG_DEBUG : LOG_WARNING, r,
513                               "Failed to set cpu.shares: %m");
/* The period is always pinned to CGROUP_CPU_QUOTA_PERIOD_USEC (100ms). */
515         xsprintf(buf, USEC_FMT "\n", CGROUP_CPU_QUOTA_PERIOD_USEC);
516         r = cg_set_attribute("cpu", u->cgroup_path, "cpu.cfs_period_us", buf);
518                 log_unit_full(u, IN_SET(r, -ENOENT, -EROFS, -EACCES) ? LOG_DEBUG : LOG_WARNING, r,
519                               "Failed to set cpu.cfs_period_us: %m");
521         if (quota != USEC_INFINITY) {
522                 xsprintf(buf, USEC_FMT "\n", quota * CGROUP_CPU_QUOTA_PERIOD_USEC / USEC_PER_SEC);
523                 r = cg_set_attribute("cpu", u->cgroup_path, "cpu.cfs_quota_us", buf);
525                 r = cg_set_attribute("cpu", u->cgroup_path, "cpu.cfs_quota_us", "-1");
527                 log_unit_full(u, IN_SET(r, -ENOENT, -EROFS, -EACCES) ? LOG_DEBUG : LOG_WARNING, r,
528                               "Failed to set cpu.cfs_quota_us: %m");
/* Linearly rescale a legacy cpu.shares value to a unified cpu.weight value,
 * clamped to the valid weight range; and the inverse conversion below. */
531 static uint64_t cgroup_cpu_shares_to_weight(uint64_t shares) {
532         return CLAMP(shares * CGROUP_WEIGHT_DEFAULT / CGROUP_CPU_SHARES_DEFAULT,
533                      CGROUP_WEIGHT_MIN, CGROUP_WEIGHT_MAX);
536 static uint64_t cgroup_cpu_weight_to_shares(uint64_t weight) {
537         return CLAMP(weight * CGROUP_CPU_SHARES_DEFAULT / CGROUP_WEIGHT_DEFAULT,
538                      CGROUP_CPU_SHARES_MIN, CGROUP_CPU_SHARES_MAX);
/* True if any unified-hierarchy IO setting is configured (accounting, weights,
 * per-device weights, or — in the elided continuation — per-device limits). */
541 static bool cgroup_context_has_io_config(CGroupContext *c) {
542         return c->io_accounting ||
543                 c->io_weight != CGROUP_WEIGHT_INVALID ||
544                 c->startup_io_weight != CGROUP_WEIGHT_INVALID ||
545                 c->io_device_weights ||
/* Legacy-hierarchy counterpart of the above, for the blkio controller. */
549 static bool cgroup_context_has_blockio_config(CGroupContext *c) {
550         return c->blockio_accounting ||
551                 c->blockio_weight != CGROUP_BLKIO_WEIGHT_INVALID ||
552                 c->startup_blockio_weight != CGROUP_BLKIO_WEIGHT_INVALID ||
553                 c->blockio_device_weights ||
554                 c->blockio_device_bandwidths;
/* Effective IO weight: startup value while the manager starts up, else the
 * configured value, else the default. */
557 static uint64_t cgroup_context_io_weight(CGroupContext *c, ManagerState state) {
558         if (IN_SET(state, MANAGER_STARTING, MANAGER_INITIALIZING) &&
559             c->startup_io_weight != CGROUP_WEIGHT_INVALID)
560                 return c->startup_io_weight;
561         else if (c->io_weight != CGROUP_WEIGHT_INVALID)
564                 return CGROUP_WEIGHT_DEFAULT;
/* Same selection logic for the legacy blkio weight. */
567 static uint64_t cgroup_context_blkio_weight(CGroupContext *c, ManagerState state) {
568         if (IN_SET(state, MANAGER_STARTING, MANAGER_INITIALIZING) &&
569             c->startup_blockio_weight != CGROUP_BLKIO_WEIGHT_INVALID)
570                 return c->startup_blockio_weight;
571         else if (c->blockio_weight != CGROUP_BLKIO_WEIGHT_INVALID)
572                 return c->blockio_weight;
574                 return CGROUP_BLKIO_WEIGHT_DEFAULT;
/* Linear rescaling between blkio (legacy) and io (unified) weight ranges,
 * clamped to each target's valid interval. */
577 static uint64_t cgroup_weight_blkio_to_io(uint64_t blkio_weight) {
578         return CLAMP(blkio_weight * CGROUP_WEIGHT_DEFAULT / CGROUP_BLKIO_WEIGHT_DEFAULT,
579                      CGROUP_WEIGHT_MIN, CGROUP_WEIGHT_MAX);
582 static uint64_t cgroup_weight_io_to_blkio(uint64_t io_weight) {
583         return CLAMP(io_weight * CGROUP_BLKIO_WEIGHT_DEFAULT / CGROUP_WEIGHT_DEFAULT,
584                      CGROUP_BLKIO_WEIGHT_MIN, CGROUP_BLKIO_WEIGHT_MAX);
/* Resolve dev_path to a block device and write a "major:minor weight" entry to
 * the unit's io.weight (unified) file. */
587 static void cgroup_apply_io_device_weight(Unit *u, const char *dev_path, uint64_t io_weight) {
588         char buf[DECIMAL_STR_MAX(dev_t)*2+2+DECIMAL_STR_MAX(uint64_t)+1];
592         r = lookup_block_device(dev_path, &dev);
596         xsprintf(buf, "%u:%u %" PRIu64 "\n", major(dev), minor(dev), io_weight);
597         r = cg_set_attribute("io", u->cgroup_path, "io.weight", buf);
599                 log_unit_full(u, IN_SET(r, -ENOENT, -EROFS, -EACCES) ? LOG_DEBUG : LOG_WARNING, r,
600                               "Failed to set io.weight: %m");
/* Legacy counterpart: write a "major:minor weight" entry to blkio.weight_device. */
603 static void cgroup_apply_blkio_device_weight(Unit *u, const char *dev_path, uint64_t blkio_weight) {
604         char buf[DECIMAL_STR_MAX(dev_t)*2+2+DECIMAL_STR_MAX(uint64_t)+1];
608         r = lookup_block_device(dev_path, &dev);
612         xsprintf(buf, "%u:%u %" PRIu64 "\n", major(dev), minor(dev), blkio_weight);
613         r = cg_set_attribute("blkio", u->cgroup_path, "blkio.weight_device", buf);
615                 log_unit_full(u, IN_SET(r, -ENOENT, -EROFS, -EACCES) ? LOG_DEBUG : LOG_WARNING, r,
616                               "Failed to set blkio.weight_device: %m");
/* Write one io.max line ("major:minor rbps=... wbps=... riops=... wiops=...")
 * for dev_path. Returns non-zero when at least one limit differs from the
 * default (elided return paths — verify); callers free entries that return 0. */
619 static unsigned cgroup_apply_io_device_limit(Unit *u, const char *dev_path, uint64_t *limits) {
620         char limit_bufs[_CGROUP_IO_LIMIT_TYPE_MAX][DECIMAL_STR_MAX(uint64_t)];
621         char buf[DECIMAL_STR_MAX(dev_t)*2+2+(6+DECIMAL_STR_MAX(uint64_t)+1)*4];
622         CGroupIOLimitType type;
627         r = lookup_block_device(dev_path, &dev);
631         for (type = 0; type < _CGROUP_IO_LIMIT_TYPE_MAX; type++) {
632                 if (limits[type] != cgroup_io_limit_defaults[type]) {
633                         xsprintf(limit_bufs[type], "%" PRIu64, limits[type]);
/* Default limits render as "max" (for MAX types) or "0", resetting the knob. */
636                         xsprintf(limit_bufs[type], "%s", limits[type] == CGROUP_LIMIT_MAX ? "max" : "0");
640         xsprintf(buf, "%u:%u rbps=%s wbps=%s riops=%s wiops=%s\n", major(dev), minor(dev),
641                  limit_bufs[CGROUP_IO_RBPS_MAX], limit_bufs[CGROUP_IO_WBPS_MAX],
642                  limit_bufs[CGROUP_IO_RIOPS_MAX], limit_bufs[CGROUP_IO_WIOPS_MAX]);
643         r = cg_set_attribute("io", u->cgroup_path, "io.max", buf);
645                 log_unit_full(u, IN_SET(r, -ENOENT, -EROFS, -EACCES) ? LOG_DEBUG : LOG_WARNING, r,
646                               "Failed to set io.max: %m");
/* Legacy counterpart: write read/write bandwidth limits to the two
 * blkio.throttle.*_bps_device files; CGROUP_LIMIT_MAX means "not configured". */
650 static unsigned cgroup_apply_blkio_device_limit(Unit *u, const char *dev_path, uint64_t rbps, uint64_t wbps) {
651         char buf[DECIMAL_STR_MAX(dev_t)*2+2+DECIMAL_STR_MAX(uint64_t)+1];
656         r = lookup_block_device(dev_path, &dev);
660         if (rbps != CGROUP_LIMIT_MAX)
662         sprintf(buf, "%u:%u %" PRIu64 "\n", major(dev), minor(dev), rbps);
663         r = cg_set_attribute("blkio", u->cgroup_path, "blkio.throttle.read_bps_device", buf);
665                 log_unit_full(u, IN_SET(r, -ENOENT, -EROFS, -EACCES) ? LOG_DEBUG : LOG_WARNING, r,
666                               "Failed to set blkio.throttle.read_bps_device: %m");
668         if (wbps != CGROUP_LIMIT_MAX)
670         sprintf(buf, "%u:%u %" PRIu64 "\n", major(dev), minor(dev), wbps);
671         r = cg_set_attribute("blkio", u->cgroup_path, "blkio.throttle.write_bps_device", buf);
673                 log_unit_full(u, IN_SET(r, -ENOENT, -EROFS, -EACCES) ? LOG_DEBUG : LOG_WARNING, r,
674                               "Failed to set blkio.throttle.write_bps_device: %m");
/* True if any of the unified-hierarchy memory knobs (low/high/max/swap-max)
 * has been configured away from its default. */
679 static bool cgroup_context_has_unified_memory_config(CGroupContext *c) {
680         return c->memory_low > 0 || c->memory_high != CGROUP_LIMIT_MAX || c->memory_max != CGROUP_LIMIT_MAX || c->memory_swap_max != CGROUP_LIMIT_MAX;
/* Write one unified memory attribute (`file` is e.g. "memory.max");
 * CGROUP_LIMIT_MAX is rendered as the literal "max". */
683 static void cgroup_apply_unified_memory_limit(Unit *u, const char *file, uint64_t v) {
/* NOTE(review): buf holds DECIMAL_STR_MAX(uint64_t)+1 bytes but xsprintf below
 * appends "\n" after the digits — confirm the size accounts for the newline. */
684         char buf[DECIMAL_STR_MAX(uint64_t) + 1] = "max";
687         if (v != CGROUP_LIMIT_MAX)
688                 xsprintf(buf, "%" PRIu64 "\n", v);
690         r = cg_set_attribute("memory", u->cgroup_path, file, buf);
692                 log_unit_full(u, IN_SET(r, -ENOENT, -EROFS, -EACCES) ? LOG_DEBUG : LOG_WARNING, r,
693                               "Failed to set %s: %m", file);
/* Compile and install the unit's BPF firewall program; best-effort, a failed
 * compile simply skips installation. */
696 static void cgroup_apply_firewall(Unit *u) {
699         /* Best-effort: let's apply IP firewalling and/or accounting if that's enabled */
701         if (bpf_firewall_compile(u) < 0)
704         (void) bpf_firewall_install(u);
/* Push the unit's CGroupContext into the kernel: for every controller selected
 * in apply_mask, translate the configured settings (with legacy<->unified
 * compat conversion where needed) into cgroupfs attribute writes, and apply
 * the BPF firewall. Root-cgroup-unsupported attributes are skipped there. */
707 static void cgroup_context_apply(
709                 CGroupMask apply_mask,
711                 ManagerState state) {
720         /* Nothing to do? Exit early! */
721         if (apply_mask == 0 && !apply_bpf)
724         /* Some cgroup attributes are not supported on the root cgroup, hence silently ignore */
725         is_root = unit_has_root_cgroup(u);
727         assert_se(c = unit_get_cgroup_context(u));
728         assert_se(path = u->cgroup_path);
730         if (is_root) /* Make sure we don't try to display messages with an empty path. */
733         /* We generally ignore errors caused by read-only mounted
734          * cgroup trees (assuming we are running in a container then),
735          * and missing cgroups, i.e. EROFS and ENOENT. */
737         if ((apply_mask & CGROUP_MASK_CPU) && !is_root) {
738                 bool has_weight, has_shares;
740                 has_weight = cgroup_context_has_cpu_weight(c);
741                 has_shares = cgroup_context_has_cpu_shares(c);
/* Unified hierarchy: prefer native weight; otherwise translate legacy shares. */
743                 if (cg_all_unified() > 0) {
747                                 weight = cgroup_context_cpu_weight(c, state);
748                         else if (has_shares) {
749                                 uint64_t shares = cgroup_context_cpu_shares(c, state);
751                                 weight = cgroup_cpu_shares_to_weight(shares);
753                                 log_cgroup_compat(u, "Applying [Startup]CpuShares %" PRIu64 " as [Startup]CpuWeight %" PRIu64 " on %s",
754                                                   shares, weight, path);
756                                 weight = CGROUP_WEIGHT_DEFAULT;
758                         cgroup_apply_unified_cpu_config(u, weight, c->cpu_quota_per_sec_usec);
/* Legacy hierarchy: the mirror-image translation, weight -> shares. */
763                                 uint64_t weight = cgroup_context_cpu_weight(c, state);
765                                 shares = cgroup_cpu_weight_to_shares(weight);
767                                 log_cgroup_compat(u, "Applying [Startup]CpuWeight %" PRIu64 " as [Startup]CpuShares %" PRIu64 " on %s",
768                                                   weight, shares, path);
769                         } else if (has_shares)
770                                 shares = cgroup_context_cpu_shares(c, state);
772                                 shares = CGROUP_CPU_SHARES_DEFAULT;
774                         cgroup_apply_legacy_cpu_config(u, shares, c->cpu_quota_per_sec_usec);
/* Unified "io" controller: apply IO settings natively, or translated blkio settings. */
778         if (apply_mask & CGROUP_MASK_IO) {
779                 bool has_io = cgroup_context_has_io_config(c);
780                 bool has_blockio = cgroup_context_has_blockio_config(c);
783                         char buf[8+DECIMAL_STR_MAX(uint64_t)+1];
787                                 weight = cgroup_context_io_weight(c, state);
788                         else if (has_blockio) {
789                                 uint64_t blkio_weight = cgroup_context_blkio_weight(c, state);
791                                 weight = cgroup_weight_blkio_to_io(blkio_weight);
793                                 log_cgroup_compat(u, "Applying [Startup]BlockIOWeight %" PRIu64 " as [Startup]IOWeight %" PRIu64,
794                                                   blkio_weight, weight);
796                                 weight = CGROUP_WEIGHT_DEFAULT;
798                         xsprintf(buf, "default %" PRIu64 "\n", weight);
799                         r = cg_set_attribute("io", path, "io.weight", buf);
801                                 log_unit_full(u, IN_SET(r, -ENOENT, -EROFS, -EACCES) ? LOG_DEBUG : LOG_WARNING, r,
802                                               "Failed to set io.weight: %m");
805                                 CGroupIODeviceWeight *w;
807                                 /* FIXME: no way to reset this list */
808                                 LIST_FOREACH(device_weights, w, c->io_device_weights)
809                                         cgroup_apply_io_device_weight(u, w->path, w->weight);
810                         } else if (has_blockio) {
811                                 CGroupBlockIODeviceWeight *w;
813                                 /* FIXME: no way to reset this list */
814                                 LIST_FOREACH(device_weights, w, c->blockio_device_weights) {
815                                         weight = cgroup_weight_blkio_to_io(w->weight);
817                                         log_cgroup_compat(u, "Applying BlockIODeviceWeight %" PRIu64 " as IODeviceWeight %" PRIu64 " for %s",
818                                                           w->weight, weight, w->path);
820                                         cgroup_apply_io_device_weight(u, w->path, weight);
825                 /* Apply limits and free ones without config. */
827                         CGroupIODeviceLimit *l, *next;
829                         LIST_FOREACH_SAFE(device_limits, l, next, c->io_device_limits) {
830                                 if (!cgroup_apply_io_device_limit(u, l->path, l->limits))
831                                         cgroup_context_free_io_device_limit(c, l);
833                 } else if (has_blockio) {
834                         CGroupBlockIODeviceBandwidth *b, *next;
836                         LIST_FOREACH_SAFE(device_bandwidths, b, next, c->blockio_device_bandwidths) {
837                                 uint64_t limits[_CGROUP_IO_LIMIT_TYPE_MAX];
838                                 CGroupIOLimitType type;
840                                 for (type = 0; type < _CGROUP_IO_LIMIT_TYPE_MAX; type++)
841                                         limits[type] = cgroup_io_limit_defaults[type];
843                                 limits[CGROUP_IO_RBPS_MAX] = b->rbps;
844                                 limits[CGROUP_IO_WBPS_MAX] = b->wbps;
846                                 log_cgroup_compat(u, "Applying BlockIO{Read|Write}Bandwidth %" PRIu64 " %" PRIu64 " as IO{Read|Write}BandwidthMax for %s",
847                                                   b->rbps, b->wbps, b->path);
849                                 if (!cgroup_apply_io_device_limit(u, b->path, limits))
850                                         cgroup_context_free_blockio_device_bandwidth(c, b);
/* Legacy "blkio" controller: same settings, translated in the other direction. */
855         if (apply_mask & CGROUP_MASK_BLKIO) {
856                 bool has_io = cgroup_context_has_io_config(c);
857                 bool has_blockio = cgroup_context_has_blockio_config(c);
860                         char buf[DECIMAL_STR_MAX(uint64_t)+1];
864                                 uint64_t io_weight = cgroup_context_io_weight(c, state);
866                                 weight = cgroup_weight_io_to_blkio(cgroup_context_io_weight(c, state));
868                                 log_cgroup_compat(u, "Applying [Startup]IOWeight %" PRIu64 " as [Startup]BlockIOWeight %" PRIu64,
870                         } else if (has_blockio)
871                                 weight = cgroup_context_blkio_weight(c, state);
873                                 weight = CGROUP_BLKIO_WEIGHT_DEFAULT;
875                         xsprintf(buf, "%" PRIu64 "\n", weight);
876                         r = cg_set_attribute("blkio", path, "blkio.weight", buf);
878                                 log_unit_full(u, IN_SET(r, -ENOENT, -EROFS, -EACCES) ? LOG_DEBUG : LOG_WARNING, r,
879                                               "Failed to set blkio.weight: %m");
882                                 CGroupIODeviceWeight *w;
884                                 /* FIXME: no way to reset this list */
885                                 LIST_FOREACH(device_weights, w, c->io_device_weights) {
886                                         weight = cgroup_weight_io_to_blkio(w->weight);
888                                         log_cgroup_compat(u, "Applying IODeviceWeight %" PRIu64 " as BlockIODeviceWeight %" PRIu64 " for %s",
889                                                           w->weight, weight, w->path);
891                                         cgroup_apply_blkio_device_weight(u, w->path, weight);
893                         } else if (has_blockio) {
894                                 CGroupBlockIODeviceWeight *w;
896                                 /* FIXME: no way to reset this list */
897                                 LIST_FOREACH(device_weights, w, c->blockio_device_weights)
898                                         cgroup_apply_blkio_device_weight(u, w->path, w->weight);
902                 /* Apply limits and free ones without config. */
904                         CGroupIODeviceLimit *l, *next;
906                         LIST_FOREACH_SAFE(device_limits, l, next, c->io_device_limits) {
907                                 log_cgroup_compat(u, "Applying IO{Read|Write}Bandwidth %" PRIu64 " %" PRIu64 " as BlockIO{Read|Write}BandwidthMax for %s",
908                                                   l->limits[CGROUP_IO_RBPS_MAX], l->limits[CGROUP_IO_WBPS_MAX], l->path);
910                                 if (!cgroup_apply_blkio_device_limit(u, l->path, l->limits[CGROUP_IO_RBPS_MAX], l->limits[CGROUP_IO_WBPS_MAX]))
911                                         cgroup_context_free_io_device_limit(c, l);
913                 } else if (has_blockio) {
914                         CGroupBlockIODeviceBandwidth *b, *next;
916                         LIST_FOREACH_SAFE(device_bandwidths, b, next, c->blockio_device_bandwidths)
917                                 if (!cgroup_apply_blkio_device_limit(u, b->path, b->rbps, b->wbps))
918                                         cgroup_context_free_blockio_device_bandwidth(c, b);
/* Memory controller: unified knobs natively, or MemoryLimit= translated to memory.max. */
922         if ((apply_mask & CGROUP_MASK_MEMORY) && !is_root) {
923                 if (cg_all_unified() > 0) {
924                         uint64_t max, swap_max = CGROUP_LIMIT_MAX;
926                         if (cgroup_context_has_unified_memory_config(c)) {
928                                 swap_max = c->memory_swap_max;
930                                 max = c->memory_limit;
932                                 if (max != CGROUP_LIMIT_MAX)
933                                         log_cgroup_compat(u, "Applying MemoryLimit %" PRIu64 " as MemoryMax", max);
936                         cgroup_apply_unified_memory_limit(u, "memory.low", c->memory_low);
937                         cgroup_apply_unified_memory_limit(u, "memory.high", c->memory_high);
938                         cgroup_apply_unified_memory_limit(u, "memory.max", max);
939                         cgroup_apply_unified_memory_limit(u, "memory.swap.max", swap_max);
941                         char buf[DECIMAL_STR_MAX(uint64_t) + 1];
944                         if (cgroup_context_has_unified_memory_config(c)) {
/* NOTE(review): PRIi64 is used here although memory values elsewhere in this
 * file are uint64_t printed with PRIu64 — confirm val's type and fix if so. */
946                                 log_cgroup_compat(u, "Applying MemoryMax %" PRIi64 " as MemoryLimit", val);
948                                 val = c->memory_limit;
950                         if (val == CGROUP_LIMIT_MAX)
951                                 strncpy(buf, "-1\n", sizeof(buf));
953                                 xsprintf(buf, "%" PRIu64 "\n", val);
955                         r = cg_set_attribute("memory", path, "memory.limit_in_bytes", buf);
957                                 log_unit_full(u, IN_SET(r, -ENOENT, -EROFS, -EACCES) ? LOG_DEBUG : LOG_WARNING, r,
958                                               "Failed to set memory.limit_in_bytes: %m");
/* Devices controller: reset the list, then whitelist standard nodes and the
 * user-configured entries according to the device policy. */
962         if ((apply_mask & CGROUP_MASK_DEVICES) && !is_root) {
963                 CGroupDeviceAllow *a;
965                 /* Changing the devices list of a populated cgroup
966                  * might result in EINVAL, hence ignore EINVAL
969                 if (c->device_allow || c->device_policy != CGROUP_AUTO)
970                         r = cg_set_attribute("devices", path, "devices.deny", "a");
972                         r = cg_set_attribute("devices", path, "devices.allow", "a");
974                         log_unit_full(u, IN_SET(r, -ENOENT, -EROFS, -EINVAL, -EACCES) ? LOG_DEBUG : LOG_WARNING, r,
975                                       "Failed to reset devices.list: %m");
977                 if (c->device_policy == CGROUP_CLOSED ||
978                     (c->device_policy == CGROUP_AUTO && c->device_allow)) {
979                         static const char auto_devices[] =
980                                 "/dev/null\0" "rwm\0"
981                                 "/dev/zero\0" "rwm\0"
982                                 "/dev/full\0" "rwm\0"
983                                 "/dev/random\0" "rwm\0"
984                                 "/dev/urandom\0" "rwm\0"
986                                 "/dev/ptmx\0" "rwm\0"
987                                 /* Allow /run/systemd/inaccessible/{chr,blk} devices for mapping InaccessiblePaths */
988                                 "-/run/systemd/inaccessible/chr\0" "rwm\0"
989                                 "-/run/systemd/inaccessible/blk\0" "rwm\0";
993                         NULSTR_FOREACH_PAIR(x, y, auto_devices)
994                                 whitelist_device(path, x, y);
996                         /* PTS (/dev/pts) devices may not be duplicated, but accessed */
997                         whitelist_major(path, "pts", 'c', "rw");
/* User entries: literal /dev paths, or "block-<driver>"/"char-<driver>" majors. */
1000                 LIST_FOREACH(device_allow, a, c->device_allow) {
1016                         if (path_startswith(a->path, "/dev/"))
1017                                 whitelist_device(path, a->path, acc);
1018                         else if ((val = startswith(a->path, "block-")))
1019                                 whitelist_major(path, val, 'b', acc);
1020                         else if ((val = startswith(a->path, "char-")))
1021                                 whitelist_major(path, val, 'c', acc);
1023                                 log_unit_debug(u, "Ignoring device %s while writing cgroup attribute.", a->path);
1027         if (apply_mask & CGROUP_MASK_PIDS) {
1030                         /* So, the "pids" controller does not expose anything on the root cgroup, in order not to
1031                          * replicate knobs exposed elsewhere needlessly. We abstract this away here however, and when
1032                          * the knobs of the root cgroup are modified propagate this to the relevant sysctls. There's a
1033                          * non-obvious asymmetry however: unlike the cgroup properties we don't really want to take
1034                          * exclusive ownership of the sysctls, but we still want to honour things if the user sets
1035                          * limits. Hence we employ sort of a one-way strategy: when the user sets a bounded limit
1036                          * through us it counts. When the user afterwards unsets it again (i.e. sets it to unbounded)
1037                          * it also counts. But if the user never set a limit through us (i.e. we are the default of
1038                          * "unbounded") we leave things unmodified. For this we manage a global boolean that we turn on
1039                          * the first time we set a limit. Note that this boolean is flushed out on manager reload,
1040                          * which is desirable so that there's an offical way to release control of the sysctl from
1041                          * systemd: set the limit to unbounded and reload. */
1043                         if (c->tasks_max != CGROUP_LIMIT_MAX) {
1044                                 u->manager->sysctl_pid_max_changed = true;
1045                                 r = procfs_tasks_set_limit(c->tasks_max);
1046                         } else if (u->manager->sysctl_pid_max_changed)
1047                                 r = procfs_tasks_set_limit(TASKS_MAX);
1052                                 log_unit_full(u, IN_SET(r, -ENOENT, -EROFS, -EACCES) ? LOG_DEBUG : LOG_WARNING, r,
1053                                               "Failed to write to tasks limit sysctls: %m");
/* Non-root path: write the pids.max attribute ("max" = unlimited). */
1056                         if (c->tasks_max != CGROUP_LIMIT_MAX) {
1057                                 char buf[DECIMAL_STR_MAX(uint64_t) + 2];
1059                                 sprintf(buf, "%" PRIu64 "\n", c->tasks_max);
1060                                 r = cg_set_attribute("pids", path, "pids.max", buf);
1062                                 r = cg_set_attribute("pids", path, "pids.max", "max");
1064                                 log_unit_full(u, IN_SET(r, -ENOENT, -EROFS, -EACCES) ? LOG_DEBUG : LOG_WARNING, r,
1065                                               "Failed to set pids.max: %m");
1070                 cgroup_apply_firewall(u);
/* Compute which controllers a CGroupContext requires, based on which settings
 * differ from their defaults. CPU and IO settings pull in both the legacy and
 * unified variants of their controllers. */
1073 CGroupMask cgroup_context_get_mask(CGroupContext *c) {
1074         CGroupMask mask = 0;
1076         /* Figure out which controllers we need */
1078         if (c->cpu_accounting ||
1079             cgroup_context_has_cpu_weight(c) ||
1080             cgroup_context_has_cpu_shares(c) ||
1081             c->cpu_quota_per_sec_usec != USEC_INFINITY)
1082                 mask |= CGROUP_MASK_CPUACCT | CGROUP_MASK_CPU;
1084         if (cgroup_context_has_io_config(c) || cgroup_context_has_blockio_config(c))
1085                 mask |= CGROUP_MASK_IO | CGROUP_MASK_BLKIO;
1087         if (c->memory_accounting ||
1088             c->memory_limit != CGROUP_LIMIT_MAX ||
1089             cgroup_context_has_unified_memory_config(c))
1090                 mask |= CGROUP_MASK_MEMORY;
1092         if (c->device_allow ||
1093             c->device_policy != CGROUP_AUTO)
1094                 mask |= CGROUP_MASK_DEVICES;
1096         if (c->tasks_accounting ||
1097             c->tasks_max != CGROUP_LIMIT_MAX)
1098                 mask |= CGROUP_MASK_PIDS;
/* Controllers this unit itself needs: its context's mask plus anything it delegates. */
1103 CGroupMask unit_get_own_mask(Unit *u) {
1106         /* Returns the mask of controllers the unit needs for itself */
1108         c = unit_get_cgroup_context(u);
1112         return cgroup_context_get_mask(c) | unit_get_delegate_mask(u);
/* Controllers to hand over to the unit's own processes when Delegate= is on.
 * On the legacy hierarchy delegation is withheld from privilege-dropping services. */
1115 CGroupMask unit_get_delegate_mask(Unit *u) {
1118         /* If delegation is turned on, then turn on selected controllers, unless we are on the legacy hierarchy and the
1119          * process we fork into is known to drop privileges, and hence shouldn't get access to the controllers.
1121          * Note that on the unified hierarchy it is safe to delegate controllers to unprivileged services. */
1123         if (!unit_cgroup_delegate(u))
1126         if (cg_all_unified() <= 0) {
1129                 e = unit_get_exec_context(u);
1130                 if (e && !exec_context_maintains_privileges(e))
1134         assert_se(c = unit_get_cgroup_context(u));
1135         return c->delegate_controllers;
/* Merged controller mask of all children; cached in u->cgroup_members_mask and
 * invalidated again by unit_update_cgroup_members_masks(). */
1138 CGroupMask unit_get_members_mask(Unit *u) {
1141         /* Returns the mask of controllers all of the unit's children require, merged */
1143         if (u->cgroup_members_mask_valid)
1144                 return u->cgroup_members_mask;
1146         u->cgroup_members_mask = 0;
         /* Only slices have cgroup children; iterate units ordered Before us and keep those in this slice. */
1148         if (u->type == UNIT_SLICE) {
1153                 HASHMAP_FOREACH_KEY(v, member, u->dependencies[UNIT_BEFORE], i) {
1158                         if (UNIT_DEREF(member->slice) != u)
1161                         u->cgroup_members_mask |= unit_get_subtree_mask(member); /* note that this calls ourselves again, for the children */
1165         u->cgroup_members_mask_valid = true;
1166         return u->cgroup_members_mask;
/* Controllers required by all siblings, i.e. the parent slice's members mask. */
1169 CGroupMask unit_get_siblings_mask(Unit *u) {
1172         /* Returns the mask of controllers all of the unit's siblings
1173          * require, i.e. the members mask of the unit's parent slice
1174          * if there is one. */
1176         if (UNIT_ISSET(u->slice))
1177                 return unit_get_members_mask(UNIT_DEREF(u->slice));
1179         return unit_get_subtree_mask(u); /* we are the top-level slice */
/* Controllers needed by this unit plus everything below it. */
1182 CGroupMask unit_get_subtree_mask(Unit *u) {
1184         /* Returns the mask of this subtree, meaning of the group
1185          * itself and its children. */
1187         return unit_get_own_mask(u) | unit_get_members_mask(u);
/* Full set of controllers to realize for this unit's cgroup, clamped to what the system supports. */
1190 CGroupMask unit_get_target_mask(Unit *u) {
1193         /* This returns the cgroup mask of all controllers to enable
1194          * for a specific cgroup, i.e. everything it needs itself,
1195          * plus all that its children need, plus all that its siblings
1196          * need. This is primarily useful on the legacy cgroup
1197          * hierarchy, where we need to duplicate each cgroup in each
1198          * hierarchy that shall be enabled for it. */
1200         mask = unit_get_own_mask(u) | unit_get_members_mask(u) | unit_get_siblings_mask(u);
         /* Never ask for controllers the kernel/manager doesn't actually support. */
1201         mask &= u->manager->cgroup_supported;
/* Controllers to enable for the children of this unit's cgroup (unified hierarchy's
 * cgroup.subtree_control semantics), clamped to supported controllers. */
1206 CGroupMask unit_get_enable_mask(Unit *u) {
1209         /* This returns the cgroup mask of all controllers to enable
1210          * for the children of a specific cgroup. This is primarily
1211          * useful for the unified cgroup hierarchy, where each cgroup
1212          * controls which controllers are enabled for its children. */
1214         mask = unit_get_members_mask(u);
1215         mask &= u->manager->cgroup_supported;
/* Whether this unit needs a BPF firewall program: IP accounting or an IP access
 * list on the unit itself, or an access list on any ancestor slice. */
1220 bool unit_get_needs_bpf(Unit *u) {
1225         c = unit_get_cgroup_context(u);
1229         if (c->ip_accounting ||
1230             c->ip_address_allow ||
1234         /* If any parent slice has an IP access list defined, it applies too */
1235         for (p = UNIT_DEREF(u->slice); p; p = UNIT_DEREF(p->slice)) {
1236                 c = unit_get_cgroup_context(p);
1240                 if (c->ip_address_allow ||
1248 /* Recurse from a unit up through its containing slices, propagating
1249  * mask bits upward. A unit is also member of itself. */
1250 void unit_update_cgroup_members_masks(Unit *u) {
1256         /* Calculate subtree mask */
1257         m = unit_get_subtree_mask(u);
1259         /* See if anything changed from the previous invocation. If
1260          * not, we're done. */
1261         if (u->cgroup_subtree_mask_valid && m == u->cgroup_subtree_mask)
         /* True only when the new mask strictly adds bits to the old one — then a cheap
          * OR into the parent is sufficient; otherwise the parent must recompute. */
1265                 u->cgroup_subtree_mask_valid &&
1266                 ((m & ~u->cgroup_subtree_mask) != 0) &&
1267                 ((~m & u->cgroup_subtree_mask) == 0);
1269         u->cgroup_subtree_mask = m;
1270         u->cgroup_subtree_mask_valid = true;
1272         if (UNIT_ISSET(u->slice)) {
1273                 Unit *s = UNIT_DEREF(u->slice);
1276                         /* There's more set now than before. We
1277                          * propagate the new mask to the parent's mask
1278                          * (not caring if it actually was valid or
1281                         s->cgroup_members_mask |= m;
1284                         /* There's less set now than before (or we
1285                          * don't know), we need to recalculate
1286                          * everything, so let's invalidate the
1287                          * parent's members mask */
1289                         s->cgroup_members_mask_valid = false;
1291                 /* And now make sure that this change also hits our
                 /* Recurse upward so grandparents see the change too. */
1293                 unit_update_cgroup_members_masks(s);
/* Walk up the slice chain until we find a realized cgroup that has all requested
 * controllers available; returns its path (owned by the unit, do not free). */
1297 const char *unit_get_realized_cgroup_path(Unit *u, CGroupMask mask) {
1299         /* Returns the realized cgroup path of the specified unit where all specified controllers are available. */
1303                 if (u->cgroup_path &&
1304                     u->cgroup_realized &&
1305                     (u->cgroup_realized_mask & mask) == mask)
1306                         return u->cgroup_path;
             /* Not realized with the needed controllers here — try the parent slice. */
1308                 u = UNIT_DEREF(u->slice);
/* cg_migrate_everywhere() callback: userdata is the Unit whose realized path we resolve. */
1314 static const char *migrate_callback(CGroupMask mask, void *userdata) {
1315         return unit_get_realized_cgroup_path(userdata, mask);
/* Builds the default cgroup path for a unit: <root>[/<slice-path>]/<escaped-unit-id>.
 * Returns a newly allocated string; caller frees. NULL on OOM. */
1318 char *unit_default_cgroup_path(Unit *u) {
1319         _cleanup_free_ char *escaped = NULL, *slice = NULL;
         /* The root slice maps onto the cgroup root itself. */
1324         if (unit_has_name(u, SPECIAL_ROOT_SLICE))
1325                 return strdup(u->manager->cgroup_root);
1327         if (UNIT_ISSET(u->slice) && !unit_has_name(UNIT_DEREF(u->slice), SPECIAL_ROOT_SLICE)) {
1328                 r = cg_slice_to_path(UNIT_DEREF(u->slice)->id, &slice);
1333         escaped = cg_escape(u->id);
1338                 return strjoin(u->manager->cgroup_root, "/", slice, "/",
1341                 return strjoin(u->manager->cgroup_root, "/", escaped);
/* Records 'path' as the unit's cgroup path and registers it in the manager's
 * cgroup->unit hashmap; no-op if the path is unchanged. */
1344 int unit_set_cgroup_path(Unit *u, const char *path) {
1345         _cleanup_free_ char *p = NULL;
1357         if (streq_ptr(u->cgroup_path, p))
         /* hashmap_put() fails with -EEXIST if another unit already owns this path. */
1361                 r = hashmap_put(u->manager->cgroup_unit, p, u);
         /* Drop the old registration/path before storing the new one. */
1366         unit_release_cgroup(u);
/* Installs an inotify watch on the unit's cgroup.events file so we learn when the
 * cgroup runs empty (unified hierarchy only). Idempotent; best-effort on ENOENT. */
1374 int unit_watch_cgroup(Unit *u) {
1375         _cleanup_free_ char *events = NULL;
1380         if (!u->cgroup_path)
         /* Already watching? */
1383         if (u->cgroup_inotify_wd >= 0)
1386         /* Only applies to the unified hierarchy */
1387         r = cg_unified_controller(SYSTEMD_CGROUP_CONTROLLER);
1389                 return log_error_errno(r, "Failed to determine whether the name=systemd hierarchy is unified: %m");
1393         /* Don't watch the root slice, it's pointless. */
1394         if (unit_has_name(u, SPECIAL_ROOT_SLICE))
1397         r = hashmap_ensure_allocated(&u->manager->cgroup_inotify_wd_unit, &trivial_hash_ops);
1401         r = cg_get_path(SYSTEMD_CGROUP_CONTROLLER, u->cgroup_path, "cgroup.events", &events);
         /* cgroup.events is rewritten (IN_MODIFY) when populated/frozen state flips. */
1405         u->cgroup_inotify_wd = inotify_add_watch(u->manager->cgroup_inotify_fd, events, IN_MODIFY);
1406         if (u->cgroup_inotify_wd < 0) {
1408                 if (errno == ENOENT) /* If the directory is already
1409                                       * gone we don't need to track
1410                                       * it, so this is not an error */
1413                 return log_unit_error_errno(u, errno, "Failed to add inotify watch descriptor for control group %s: %m", u->cgroup_path);
         /* Map watch descriptor -> unit for event dispatch in on_cgroup_inotify_event(). */
1416         r = hashmap_put(u->manager->cgroup_inotify_wd_unit, INT_TO_PTR(u->cgroup_inotify_wd), u);
1418                 return log_unit_error_errno(u, r, "Failed to add inotify watch descriptor to hash map: %m");
/* Chooses and registers the unit's default cgroup path (if it has a cgroup context). */
1423 int unit_pick_cgroup_path(Unit *u) {
1424         _cleanup_free_ char *path = NULL;
1432         if (!UNIT_HAS_CGROUP_CONTEXT(u))
1435         path = unit_default_cgroup_path(u);
1439         r = unit_set_cgroup_path(u, path);
         /* -EEXIST from the manager's hashmap means a different unit already claimed it. */
1441                 return log_unit_error_errno(u, r, "Control group %s exists already.", path);
1443                 return log_unit_error_errno(u, r, "Failed to set unit's control group path to %s: %m", path);
/* Creates the unit's cgroup in all needed hierarchies, starts watching it, enables the
 * requested controllers for children, records the realized state, and (for non-slice,
 * non-delegated units) migrates stray processes into it. */
1448 static int unit_create_cgroup(
1450                 CGroupMask target_mask,
1451                 CGroupMask enable_mask,
1459         c = unit_get_cgroup_context(u);
1463         /* Figure out our cgroup path */
1464         r = unit_pick_cgroup_path(u);
1468         /* First, create our own group */
1469         r = cg_create_everywhere(u->manager->cgroup_supported, target_mask, u->cgroup_path);
1471                 return log_unit_error_errno(u, r, "Failed to create cgroup %s: %m", u->cgroup_path);
1473         /* Start watching it */
1474         (void) unit_watch_cgroup(u);
1476         /* Enable all controllers we need */
1477         r = cg_enable_everywhere(u->manager->cgroup_supported, enable_mask, u->cgroup_path);
         /* Non-fatal: controllers may be unavailable on some hierarchies. */
1479                 log_unit_warning_errno(u, r, "Failed to enable controllers on cgroup %s, ignoring: %m", u->cgroup_path);
1481         /* Keep track that this is now realized */
1482         u->cgroup_realized = true;
1483         u->cgroup_realized_mask = target_mask;
1484         u->cgroup_enabled_mask = enable_mask;
1485         u->cgroup_bpf_state = needs_bpf ? UNIT_CGROUP_BPF_ON : UNIT_CGROUP_BPF_OFF;
1487         if (u->type != UNIT_SLICE && !unit_cgroup_delegate(u)) {
1489                 /* Then, possibly move things over, but not if
1490                  * subgroups may contain processes, which is the case
1491                  * for slice and delegation units. */
1492                 r = cg_migrate_everywhere(u->manager->cgroup_supported, u->cgroup_path, u->cgroup_path, migrate_callback, u);
                 /* NOTE(review): message reads "from to %s" — garbled; should probably say
                  * "Failed to migrate cgroup to %s". Runtime string left untouched here. */
1494                         log_unit_warning_errno(u, r, "Failed to migrate cgroup from to %s, ignoring: %m", u->cgroup_path);
/* Fallback for user managers: asks the privileged system instance over D-Bus to attach
 * 'pid' to our unit's cgroup at 'suffix_path'. Only meaningful for user managers with a
 * system bus connection and a known cgroup path. */
1500 static int unit_attach_pid_to_cgroup_via_bus(Unit *u, pid_t pid, const char *suffix_path) {
1501         _cleanup_(sd_bus_error_free) sd_bus_error error = SD_BUS_ERROR_NULL;
         /* The system instance doesn't need to ask anyone; bail out there. */
1507         if (MANAGER_IS_SYSTEM(u->manager))
1510         if (!u->manager->system_bus)
1513         if (!u->cgroup_path)
1516         /* Determine this unit's cgroup path relative to our cgroup root */
1517         pp = path_startswith(u->cgroup_path, u->manager->cgroup_root);
1521         pp = strjoina("/", pp, suffix_path);
1522         path_kill_slashes(pp);
1524         r = sd_bus_call_method(u->manager->system_bus,
1525                                "org.freedesktop.systemd1",
1526                                "/org/freedesktop/systemd1",
1527                                "org.freedesktop.systemd1.Manager",
1528                                "AttachProcessesToUnit",
1531                                NULL /* empty unit name means client's unit, i.e. us */, pp, 1, (uint32_t) pid);
1533                 return log_unit_debug_errno(u, r, "Failed to attach unit process " PID_FMT " via the bus: %s", pid, bus_error_message(&error, r));
/* Attaches every PID in 'pids' to the unit's cgroup (optionally at 'suffix_path' below
 * it). Realizes the cgroup first; falls back to the system instance via the bus on
 * permission errors; on legacy hierarchies additionally attaches each PID per controller,
 * honouring delegation. Returns 0 on success or the first error encountered. */
1538 int unit_attach_pids_to_cgroup(Unit *u, Set *pids, const char *suffix_path) {
1539         CGroupMask delegated_mask;
1547         if (!UNIT_HAS_CGROUP_CONTEXT(u))
1550         if (set_isempty(pids))
         /* Make sure the cgroup actually exists before attaching anything. */
1553         r = unit_realize_cgroup(u);
1557         if (isempty(suffix_path))
1560                 p = strjoina(u->cgroup_path, "/", suffix_path);
1562         delegated_mask = unit_get_delegate_mask(u);
1565         SET_FOREACH(pidp, pids, i) {
1566                 pid_t pid = PTR_TO_PID(pidp);
1569                 /* First, attach the PID to the main cgroup hierarchy */
1570                 q = cg_attach(SYSTEMD_CGROUP_CONTROLLER, p, pid);
1572                         log_unit_debug_errno(u, q, "Couldn't move process " PID_FMT " to requested cgroup '%s': %m", pid, p);
1574                         if (MANAGER_IS_USER(u->manager) && IN_SET(q, -EPERM, -EACCES)) {
1577                                 /* If we are in a user instance, and we can't move the process ourselves due to
1578                                  * permission problems, let's ask the system instance about it instead. Since it's more
1579                                  * privileged it might be able to move the process across the leaves of a subtree who's
1580                                  * top node is not owned by us. */
1582                                 z = unit_attach_pid_to_cgroup_via_bus(u, pid, suffix_path);
1584                                         log_unit_debug_errno(u, z, "Couldn't move process " PID_FMT " to requested cgroup '%s' via the system bus either: %m", pid, p);
1586                                         continue; /* When the bus thing worked via the bus we are fully done for this PID. */
1590                                 r = q; /* Remember first error */
         /* On the unified hierarchy a single attach above was enough; legacy needs per-controller work. */
1595                 q = cg_all_unified();
1601                 /* In the legacy hierarchy, attach the process to the request cgroup if possible, and if not to the
1602                  * innermost realized one */
1604                 for (c = 0; c < _CGROUP_CONTROLLER_MAX; c++) {
1605                         CGroupMask bit = CGROUP_CONTROLLER_TO_MASK(c);
1606                         const char *realized;
1608                         if (!(u->manager->cgroup_supported & bit))
1611                         /* If this controller is delegated and realized, honour the caller's request for the cgroup suffix. */
1612                         if (delegated_mask & u->cgroup_realized_mask & bit) {
1613                                 q = cg_attach(cgroup_controller_to_string(c), p, pid);
1615                                         continue; /* Success! */
1617                                 log_unit_debug_errno(u, q, "Failed to attach PID " PID_FMT " to requested cgroup %s in controller %s, falling back to unit's cgroup: %m",
1618                                                      pid, p, cgroup_controller_to_string(c));
1621                         /* So this controller is either not delegate or realized, or something else weird happened. In
1622                          * that case let's attach the PID at least to the closest cgroup up the tree that is
1624                         realized = unit_get_realized_cgroup_path(u, bit);
1626                                 continue; /* Not even realized in the root slice? Then let's not bother */
1628                         q = cg_attach(cgroup_controller_to_string(c), realized, pid);
1630                                 log_unit_debug_errno(u, q, "Failed to attach PID " PID_FMT " to realized cgroup %s in controller %s, ignoring: %m",
1631                                                      pid, realized, cgroup_controller_to_string(c));
/* Stamps the unit's invocation ID onto its cgroup as the "trusted.invocation_id" xattr
 * (system manager only; trusted.* xattrs need privileges). Best-effort. */
1638 static void cgroup_xattr_apply(Unit *u) {
1639         char ids[SD_ID128_STRING_MAX];
1644         if (!MANAGER_IS_SYSTEM(u->manager))
1647         if (sd_id128_is_null(u->invocation_id))
         /* 32 = length of the formatted 128-bit ID, without NUL terminator. */
1650         r = cg_set_xattr(SYSTEMD_CGROUP_CONTROLLER, u->cgroup_path,
1651                          "trusted.invocation_id",
1652                          sd_id128_to_string(u->invocation_id, ids), 32,
1655                 log_unit_debug_errno(u, r, "Failed to set invocation ID on control group %s, ignoring: %m", u->cgroup_path);
/* True if the unit's cgroup is already realized with exactly these target/enable masks
 * and a BPF state matching 'needs_bpf' — i.e. no realization work is needed. */
1658 static bool unit_has_mask_realized(
1660                 CGroupMask target_mask,
1661                 CGroupMask enable_mask,
1666         return u->cgroup_realized &&
1667                 u->cgroup_realized_mask == target_mask &&
1668                 u->cgroup_enabled_mask == enable_mask &&
1669                 ((needs_bpf && u->cgroup_bpf_state == UNIT_CGROUP_BPF_ON) ||
1670                  (!needs_bpf && u->cgroup_bpf_state == UNIT_CGROUP_BPF_OFF));
/* Queues the unit for deferred cgroup realization (idempotent). */
1673 static void unit_add_to_cgroup_realize_queue(Unit *u) {
1676         if (u->in_cgroup_realize_queue)
1679         LIST_PREPEND(cgroup_realize_queue, u->manager->cgroup_realize_queue, u);
1680         u->in_cgroup_realize_queue = true;
/* Removes the unit from the realize queue (idempotent counterpart of the above). */
1683 static void unit_remove_from_cgroup_realize_queue(Unit *u) {
1686         if (!u->in_cgroup_realize_queue)
1689         LIST_REMOVE(cgroup_realize_queue, u->manager->cgroup_realize_queue, u);
1690         u->in_cgroup_realize_queue = false;
1694 /* Check if necessary controllers and attributes for a unit are in place.
1696  * If so, do nothing.
1697  * If not, create paths, move processes over, and set attributes.
1699  * Returns 0 on success and < 0 on failure. */
1700 static int unit_realize_cgroup_now(Unit *u, ManagerState state) {
1701         CGroupMask target_mask, enable_mask;
1702         bool needs_bpf, apply_bpf;
         /* Whatever happens, this unit no longer needs deferred handling. */
1707         unit_remove_from_cgroup_realize_queue(u);
1709         target_mask = unit_get_target_mask(u);
1710         enable_mask = unit_get_enable_mask(u);
1711         needs_bpf = unit_get_needs_bpf(u);
         /* Fast path: everything already matches. */
1713         if (unit_has_mask_realized(u, target_mask, enable_mask, needs_bpf))
1716         /* Make sure we apply the BPF filters either when one is configured, or if none is configured but previously
1717          * the state was anything but off. This way, if a unit with a BPF filter applied is reconfigured to lose it
1718          * this will trickle down properly to cgroupfs. */
1719         apply_bpf = needs_bpf || u->cgroup_bpf_state != UNIT_CGROUP_BPF_OFF;
1721         /* First, realize parents */
1722         if (UNIT_ISSET(u->slice)) {
1723                 r = unit_realize_cgroup_now(UNIT_DEREF(u->slice), state);
1728         /* And then do the real work */
1729         r = unit_create_cgroup(u, target_mask, enable_mask, needs_bpf);
1733         /* Finally, apply the necessary attributes. */
1734         cgroup_context_apply(u, target_mask, apply_bpf, state);
1735         cgroup_xattr_apply(u);
/* Drains the cgroup realize queue, realizing each still-active queued unit.
 * Errors per unit are logged and otherwise ignored. */
1740 unsigned manager_dispatch_cgroup_realize_queue(Manager *m) {
1748         state = manager_state(m);
1750         while ((i = m->cgroup_realize_queue)) {
1751                 assert(i->in_cgroup_realize_queue);
1753                 if (UNIT_IS_INACTIVE_OR_FAILED(unit_active_state(i))) {
1754                         /* Maybe things changed, and the unit is not actually active anymore? */
1755                         unit_remove_from_cgroup_realize_queue(i);
         /* unit_realize_cgroup_now() dequeues the unit itself, so the loop terminates. */
1759                 r = unit_realize_cgroup_now(i, state);
1761                         log_warning_errno(r, "Failed to realize cgroups for queued unit %s, ignoring: %m", i->id);
/* Queues the unit's siblings (and siblings of all its ancestors) for deferred
 * realization, so weight-based resource distribution stays fair across a slice. */
1769 static void unit_add_siblings_to_cgroup_realize_queue(Unit *u) {
1772         /* This adds the siblings of the specified unit and the
1773          * siblings of all parent units to the cgroup queue. (But
1774          * neither the specified unit itself nor the parents.) */
         /* Walk up the slice chain, one level per iteration. */
1776         while ((slice = UNIT_DEREF(u->slice))) {
1781                 HASHMAP_FOREACH_KEY(v, m, u->dependencies[UNIT_BEFORE], i) {
1785                         /* Skip units that have a dependency on the slice
1786                          * but aren't actually in it. */
1787                         if (UNIT_DEREF(m->slice) != slice)
1790                         /* No point in doing cgroup application for units
1791                          * without active processes. */
1792                         if (UNIT_IS_INACTIVE_OR_FAILED(unit_active_state(m)))
1795                         /* If the unit doesn't need any new controllers
1796                          * and has current ones realized, it doesn't need
1798                         if (unit_has_mask_realized(m,
1799                                                    unit_get_target_mask(m),
1800                                                    unit_get_enable_mask(m),
1801                                                    unit_get_needs_bpf(m)))
1804                         unit_add_to_cgroup_realize_queue(m);
/* Public entry point: realizes this unit's cgroup synchronously (including all parents)
 * and defers realization of its siblings to the realize queue. */
1811 int unit_realize_cgroup(Unit *u) {
1814         if (!UNIT_HAS_CGROUP_CONTEXT(u))
1817         /* So, here's the deal: when realizing the cgroups for this
1818          * unit, we need to first create all parents, but there's more
1819          * actually: for the weight-based controllers we also need to
1820          * make sure that all our siblings (i.e. units that are in the
1821          * same slice as we are) have cgroups, too. Otherwise, things
1822          * would become very uneven as each of their processes would
1823          * get as much resources as all our group together. This call
1824          * will synchronously create the parent cgroups, but will
1825          * defer work on the siblings to the next event loop
1828         /* Add all sibling slices to the cgroup queue. */
1829         unit_add_siblings_to_cgroup_realize_queue(u);
1831         /* And realize this one now (and apply the values) */
1832         return unit_realize_cgroup_now(u, manager_state(u->manager));
/* Drops all in-memory cgroup bookkeeping for the unit: path, manager hashmap entry and
 * the inotify watch. Does NOT touch the cgroup on disk (see unit_prune_cgroup()). */
1835 void unit_release_cgroup(Unit *u) {
1838         /* Forgets all cgroup details for this cgroup */
1840         if (u->cgroup_path) {
1841                 (void) hashmap_remove(u->manager->cgroup_unit, u->cgroup_path);
1842                 u->cgroup_path = mfree(u->cgroup_path);
1845         if (u->cgroup_inotify_wd >= 0) {
1846                 if (inotify_rm_watch(u->manager->cgroup_inotify_fd, u->cgroup_inotify_wd) < 0)
1847                         log_unit_debug_errno(u, errno, "Failed to remove cgroup inotify watch %i for %s, ignoring", u->cgroup_inotify_wd, u->id);
1849                 (void) hashmap_remove(u->manager->cgroup_inotify_wd_unit, INT_TO_PTR(u->cgroup_inotify_wd));
             /* -1 = "no watch installed"; also what unit_watch_cgroup() checks. */
1850                 u->cgroup_inotify_wd = -1;
/* Trims (removes if empty) the unit's cgroup from disk, caches a final CPU usage sample
 * first, then forgets all in-memory realization state. */
1854 void unit_prune_cgroup(Unit *u) {
1860         /* Removes the cgroup, if empty and possible, and stops watching it. */
1862         if (!u->cgroup_path)
1865         (void) unit_get_cpu_usage(u, NULL); /* Cache the last CPU usage value before we destroy the cgroup */
         /* Never delete the root slice's own cgroup — we live in it. */
1867         is_root_slice = unit_has_name(u, SPECIAL_ROOT_SLICE);
1869         r = cg_trim_everywhere(u->manager->cgroup_supported, u->cgroup_path, !is_root_slice);
1871                 log_unit_debug_errno(u, r, "Failed to destroy cgroup %s, ignoring: %m", u->cgroup_path);
1878         unit_release_cgroup(u);
1880         u->cgroup_realized = false;
1881         u->cgroup_realized_mask = 0;
1882         u->cgroup_enabled_mask = 0;
/* Scans the unit's cgroup for a plausible main PID: a process that is our direct child.
 * Fails (ambiguous) if more than one daemonized candidate is found. */
1885 int unit_search_main_pid(Unit *u, pid_t *ret) {
1886         _cleanup_fclose_ FILE *f = NULL;
1887         pid_t pid = 0, npid, mypid;
1893         if (!u->cgroup_path)
1896         r = cg_enumerate_processes(SYSTEMD_CGROUP_CONTROLLER, u->cgroup_path, &f);
1900         mypid = getpid_cached();
1901         while (cg_read_pid(f, &npid) > 0) {
1907                 /* Ignore processes that aren't our kids */
1908                 if (get_process_ppid(npid, &ppid) >= 0 && ppid != mypid)
1912                         /* Dang, there's more than one daemonized PID
1913                         in this group, so we don't know what process
1914                         is the main process. */
/* Recursively adds every PID under 'path' (this cgroup and all subgroups) to the unit's
 * watched-PID set. Collects the first error into 'ret' but keeps going. */
1925 static int unit_watch_pids_in_path(Unit *u, const char *path) {
1926         _cleanup_closedir_ DIR *d = NULL;
1927         _cleanup_fclose_ FILE *f = NULL;
         /* First pass: processes directly in this cgroup. */
1933         r = cg_enumerate_processes(SYSTEMD_CGROUP_CONTROLLER, path, &f);
1939                 while ((r = cg_read_pid(f, &pid)) > 0) {
1940                         r = unit_watch_pid(u, pid);
1941                         if (r < 0 && ret >= 0)
1945                 if (r < 0 && ret >= 0)
         /* Second pass: recurse into child cgroups. */
1949         r = cg_enumerate_subgroups(SYSTEMD_CGROUP_CONTROLLER, path, &d);
1956                 while ((r = cg_read_subgroup(d, &fn)) > 0) {
1957                         _cleanup_free_ char *p = NULL;
1959                         p = strjoin(path, "/", fn);
1965                         r = unit_watch_pids_in_path(u, p);
1966                         if (r < 0 && ret >= 0)
1970                 if (r < 0 && ret >= 0)
/* Legacy-hierarchy compatibility: enqueue a synthetic "cgroup empty" event once the unit
 * no longer watches any PIDs, since legacy notifications are unreliable. */
1977 int unit_synthesize_cgroup_empty_event(Unit *u) {
1982         /* Enqueue a synthetic cgroup empty event if this unit doesn't watch any PIDs anymore. This is compatibility
1983          * support for non-unified systems where notifications aren't reliable, and hence need to take whatever we can
1984          * get as notification source as soon as we stopped having any useful PIDs to watch for. */
1986         if (!u->cgroup_path)
1989         r = cg_unified_controller(SYSTEMD_CGROUP_CONTROLLER);
1992         if (r > 0) /* On unified we have reliable notifications, and don't need this */
         /* Still watching PIDs? Then it isn't empty yet. */
1995         if (!set_isempty(u->pids))
1998         unit_add_to_cgroup_empty_queue(u);
/* Adds every PID in the unit's cgroup subtree to its watch set — SIGCHLD-based fallback
 * for legacy hierarchies without reliable empty notifications. */
2002 int unit_watch_all_pids(Unit *u) {
2007         /* Adds all PIDs from our cgroup to the set of PIDs we
2008          * watch. This is a fallback logic for cases where we do not
2009          * get reliable cgroup empty notifications: we try to use
2010          * SIGCHLD as replacement. */
2012         if (!u->cgroup_path)
2015         r = cg_unified_controller(SYSTEMD_CGROUP_CONTROLLER);
2018         if (r > 0) /* On unified we can use proper notifications */
2021         return unit_watch_pids_in_path(u, u->cgroup_path);
/* Defer-event handler: pops one unit off the cgroup-empty queue, re-arms the event
 * source if more are queued, and dispatches the unit's notify_cgroup_empty vtable hook. */
2024 static int on_cgroup_empty_event(sd_event_source *s, void *userdata) {
2025         Manager *m = userdata;
2032         u = m->cgroup_empty_queue;
2036         assert(u->in_cgroup_empty_queue);
2037         u->in_cgroup_empty_queue = false;
2038         LIST_REMOVE(cgroup_empty_queue, m->cgroup_empty_queue, u);
2040         if (m->cgroup_empty_queue) {
2041                 /* More stuff queued, let's make sure we remain enabled */
2042                 r = sd_event_source_set_enabled(s, SD_EVENT_ONESHOT);
2044                         log_debug_errno(r, "Failed to reenable cgroup empty event source: %m");
         /* An empty cgroup may make the unit collectable. */
2047         unit_add_to_gc_queue(u);
2049         if (UNIT_VTABLE(u)->notify_cgroup_empty)
2050                 UNIT_VTABLE(u)->notify_cgroup_empty(u);
/* Verifies the unit's cgroup is really empty, then queues it for the (lower-priority)
 * empty-event dispatcher and arms the defer event source. Idempotent. */
2055 void unit_add_to_cgroup_empty_queue(Unit *u) {
2060         /* Note that there are four different ways how cgroup empty events reach us:
2062          * 1. On the unified hierarchy we get an inotify event on the cgroup
2064          * 2. On the legacy hierarchy, when running in system mode, we get a datagram on the cgroup agent socket
2066          * 3. On the legacy hierarchy, when running in user mode, we get a D-Bus signal on the system bus
2068          * 4. On the legacy hierarchy, in service units we start watching all processes of the cgroup for SIGCHLD as
2069          *    soon as we get one SIGCHLD, to deal with unreliable cgroup notifications.
2071          * Regardless which way we got the notification, we'll verify it here, and then add it to a separate
2072          * queue. This queue will be dispatched at a lower priority than the SIGCHLD handler, so that we always use
2073          * SIGCHLD if we can get it first, and only use the cgroup empty notifications if there's no SIGCHLD pending
2074          * (which might happen if the cgroup doesn't contain processes that are our own child, which is typically the
2075          * case for scope units). */
2077         if (u->in_cgroup_empty_queue)
2080         /* Let's verify that the cgroup is really empty */
2081         if (!u->cgroup_path)
2083         r = cg_is_empty_recursive(SYSTEMD_CGROUP_CONTROLLER, u->cgroup_path);
2085                 log_unit_debug_errno(u, r, "Failed to determine whether cgroup %s is empty: %m", u->cgroup_path);
2091         LIST_PREPEND(cgroup_empty_queue, u->manager->cgroup_empty_queue, u);
2092         u->in_cgroup_empty_queue = true;
2094         /* Trigger the defer event */
2095         r = sd_event_source_set_enabled(u->manager->cgroup_empty_event_source, SD_EVENT_ONESHOT);
2097                 log_debug_errno(r, "Failed to enable cgroup empty event source: %m");
/* I/O handler for the cgroup inotify fd (unified hierarchy): drains all pending events
 * and queues an empty check for each unit whose cgroup.events file changed. */
2100 static int on_cgroup_inotify_event(sd_event_source *s, int fd, uint32_t revents, void *userdata) {
2101         Manager *m = userdata;
2108                 union inotify_event_buffer buffer;
2109                 struct inotify_event *e;
2112                 l = read(fd, &buffer, sizeof(buffer));
                 /* EINTR/EAGAIN: nothing (more) to read right now. */
2114                         if (IN_SET(errno, EINTR, EAGAIN))
2117                         return log_error_errno(errno, "Failed to read control group inotify events: %m");
2120                 FOREACH_INOTIFY_EVENT(e, buffer, l) {
2124                                 /* Queue overflow has no watch descriptor */
2127                         if (e->mask & IN_IGNORED)
2128                                 /* The watch was just removed */
2131                         u = hashmap_get(m->cgroup_inotify_wd_unit, INT_TO_PTR(e->wd));
2132                         if (!u) /* Not that inotify might deliver
2133                                  * events for a watch even after it
2134                                  * was removed, because it was queued
2135                                  * before the removal. Let's ignore
2136                                  * this here safely. */
2139                         unit_add_to_cgroup_empty_queue(u);
/* One-time cgroup setup for the manager: determine our cgroup root, detect the hierarchy
 * flavour (unified vs. legacy), install the empty-notification machinery (inotify or
 * release agent), create/attach our own scope group, pin the cgroup fs, and probe which
 * controllers are supported. The "#if 0 /// ..." sections mark systemd-only code that
 * elogind replaces with its own variant below each block. */
2145 int manager_setup_cgroup(Manager *m) {
2146         _cleanup_free_ char *path = NULL;
2147         const char *scope_path;
2150 #if 0 /// UNNEEDED by elogind
2156         /* 1. Determine hierarchy */
2157         m->cgroup_root = mfree(m->cgroup_root);
2158 #if 0 /// elogind is not init and must therefore search for PID 1 instead of self.
2159         r = cg_pid_get_path(SYSTEMD_CGROUP_CONTROLLER, 0, &m->cgroup_root);
2161         r = cg_pid_get_path(SYSTEMD_CGROUP_CONTROLLER, 1, &m->cgroup_root);
2164                 return log_error_errno(r, "Cannot determine cgroup we are running in: %m");
2166 #if 0 /// elogind does not support systemd scopes and slices
2167         /* Chop off the init scope, if we are already located in it */
2168         e = endswith(m->cgroup_root, "/" SPECIAL_INIT_SCOPE);
2170         /* LEGACY: Also chop off the system slice if we are in
2171          * it. This is to support live upgrades from older systemd
2172          * versions where PID 1 was moved there. Also see
2173          * cg_get_root_path(). */
2174         if (!e && MANAGER_IS_SYSTEM(m)) {
2175                 e = endswith(m->cgroup_root, "/" SPECIAL_SYSTEM_SLICE);
2177                         e = endswith(m->cgroup_root, "/system"); /* even more legacy */
2183         log_debug_elogind("Cgroup Controller \"%s\" -> root \"%s\"",
2184                           SYSTEMD_CGROUP_CONTROLLER, m->cgroup_root);
2185         /* And make sure to store away the root value without trailing slash, even for the root dir, so that we can
2186          * easily prepend it everywhere. */
2187         delete_trailing_chars(m->cgroup_root, "/");
         /* 2. Resolve the filesystem mount point of our hierarchy. */
2190         r = cg_get_path(SYSTEMD_CGROUP_CONTROLLER, m->cgroup_root, NULL, &path);
2192                 return log_error_errno(r, "Cannot find cgroup mount point: %m");
         /* Re-probe unified/legacy state and log which layout we ended up with. */
2194         r = cg_unified_flush();
2196                 return log_error_errno(r, "Couldn't determine if we are running in the unified hierarchy: %m");
2198         all_unified = cg_all_unified();
2199         if (all_unified < 0)
2200                 return log_error_errno(all_unified, "Couldn't determine whether we are in all unified mode: %m");
2201         if (all_unified > 0)
2202                 log_debug("Unified cgroup hierarchy is located at %s.", path);
2204                 r = cg_unified_controller(SYSTEMD_CGROUP_CONTROLLER);
2206                         return log_error_errno(r, "Failed to determine whether systemd's own controller is in unified mode: %m");
2208                         log_debug("Unified cgroup hierarchy is located at %s. Controllers are on legacy hierarchies.", path);
2210                         log_debug("Using cgroup controller " SYSTEMD_CGROUP_CONTROLLER_LEGACY ". File system hierarchy is at %s.", path);
2213 #if 0 /// elogind is not init, and does not install the agent here.
2214         /* 3. Allocate cgroup empty defer event source */
2215         m->cgroup_empty_event_source = sd_event_source_unref(m->cgroup_empty_event_source);
2216         r = sd_event_add_defer(m->event, &m->cgroup_empty_event_source, on_cgroup_empty_event, m);
2218                 return log_error_errno(r, "Failed to create cgroup empty event source: %m");
         /* Lower priority than SIGCHLD handling — see unit_add_to_cgroup_empty_queue(). */
2220         r = sd_event_source_set_priority(m->cgroup_empty_event_source, SD_EVENT_PRIORITY_NORMAL-5);
2222                 return log_error_errno(r, "Failed to set priority of cgroup empty event source: %m");
2224         r = sd_event_source_set_enabled(m->cgroup_empty_event_source, SD_EVENT_OFF);
2226                 return log_error_errno(r, "Failed to disable cgroup empty event source: %m");
2228         (void) sd_event_source_set_description(m->cgroup_empty_event_source, "cgroup-empty");
2230         /* 4. Install notifier inotify object, or agent */
2231         if (cg_unified_controller(SYSTEMD_CGROUP_CONTROLLER) > 0) {
2233                 /* In the unified hierarchy we can get cgroup empty notifications via inotify. */
2235                 m->cgroup_inotify_event_source = sd_event_source_unref(m->cgroup_inotify_event_source);
2236                 safe_close(m->cgroup_inotify_fd);
2238                 m->cgroup_inotify_fd = inotify_init1(IN_NONBLOCK|IN_CLOEXEC);
2239                 if (m->cgroup_inotify_fd < 0)
2240                         return log_error_errno(errno, "Failed to create control group inotify object: %m");
2242                 r = sd_event_add_io(m->event, &m->cgroup_inotify_event_source, m->cgroup_inotify_fd, EPOLLIN, on_cgroup_inotify_event, m);
2244                         return log_error_errno(r, "Failed to watch control group inotify object: %m");
2246                 /* Process cgroup empty notifications early, but after service notifications and SIGCHLD. Also
2247                  * see handling of cgroup agent notifications, for the classic cgroup hierarchy support. */
2248                 r = sd_event_source_set_priority(m->cgroup_inotify_event_source, SD_EVENT_PRIORITY_NORMAL-4);
2250                         return log_error_errno(r, "Failed to set priority of inotify event source: %m");
2252                 (void) sd_event_source_set_description(m->cgroup_inotify_event_source, "cgroup-inotify");
2254         } else if (MANAGER_IS_SYSTEM(m) && m->test_run_flags == 0) {
2256                 /* On the legacy hierarchy we only get notifications via cgroup agents. (Which isn't really reliable,
2257                  * since it does not generate events when control groups with children run empty. */
2259                 r = cg_install_release_agent(SYSTEMD_CGROUP_CONTROLLER, SYSTEMD_CGROUP_AGENT_PATH);
2261                         log_warning_errno(r, "Failed to install release agent, ignoring: %m");
2263                         log_debug("Installed release agent.");
2265                         log_debug("Release agent already installed.");
2268         /* 5. Make sure we are in the special "init.scope" unit in the root slice. */
2269         scope_path = strjoina(m->cgroup_root, "/" SPECIAL_INIT_SCOPE);
2270         r = cg_create_and_attach(SYSTEMD_CGROUP_CONTROLLER, scope_path, 0);
         /* elogind replacement for step 5: pick/create its own controller group instead of init.scope. */
2273         /* This method is in core, and normally called by systemd
2274          * being init. As elogind is never init, we can not install
2275          * our agent here. We do so when mounting our cgroup file
2276          * system, so only if elogind is its own tiny controller.
2277          * Further, elogind is not meant to run in systemd init scope. */
2278         if (MANAGER_IS_SYSTEM(m))
2279                 // we are our own cgroup controller
2280                 scope_path = strjoina("");
2281         else if (streq(m->cgroup_root, "/elogind"))
2282                 // root already is our cgroup
2283                 scope_path = strjoina(m->cgroup_root);
2285                 // we have to create our own group
2286                 scope_path = strjoina(m->cgroup_root, "/elogind");
2287         r = cg_create_and_attach(SYSTEMD_CGROUP_CONTROLLER, scope_path, 0);
2290                 return log_error_errno(r, "Failed to create %s control group: %m", scope_path);
2291         log_debug_elogind("Created control group \"%s\"", scope_path);
2293 #if 0 /// elogind is not a "sub-controller" like systemd, so migration is not needed.
2294         /* Also, move all other userspace processes remaining in the root cgroup into that scope. */
2295         r = cg_migrate(SYSTEMD_CGROUP_CONTROLLER, m->cgroup_root, SYSTEMD_CGROUP_CONTROLLER, scope_path, 0);
2297                 log_warning_errno(r, "Couldn't move remaining userspace processes, ignoring: %m");
2300         /* 6. And pin it, so that it cannot be unmounted */
2301         safe_close(m->pin_cgroupfs_fd);
2302         m->pin_cgroupfs_fd = open(path, O_RDONLY|O_CLOEXEC|O_DIRECTORY|O_NOCTTY|O_NONBLOCK);
2303         if (m->pin_cgroupfs_fd < 0)
2304                 return log_error_errno(errno, "Failed to open pin file: %m");
2306         /* 7. Always enable hierarchical support if it exists... */
2307         if (!all_unified && m->test_run_flags == 0)
2308                 (void) cg_set_attribute("memory", "/", "memory.use_hierarchy", "1");
2310         /* 8. Figure out which controllers are supported, and log about it */
2311         r = cg_mask_supported(&m->cgroup_supported);
2313                 return log_error_errno(r, "Failed to determine supported controllers: %m");
2314         for (c = 0; c < _CGROUP_CONTROLLER_MAX; c++)
2315                 log_debug("Controller '%s' supported: %s", cgroup_controller_to_string(c), yes_no(m->cgroup_supported & CGROUP_CONTROLLER_TO_MASK(c)));
/* Tears down the manager's cgroup state: optionally trims our root group (we can't delete
 * it — we're inside it), releases event sources, maps and fds, and frees the root path. */
2320 void manager_shutdown_cgroup(Manager *m, bool delete) {
2323         /* We can't really delete the group, since we are in it. But
2325         if (delete && m->cgroup_root)
2326                 (void) cg_trim(SYSTEMD_CGROUP_CONTROLLER, m->cgroup_root, false);
2328 #if 0 /// elogind is not init
2329         m->cgroup_empty_event_source = sd_event_source_unref(m->cgroup_empty_event_source);
2331         m->cgroup_inotify_wd_unit = hashmap_free(m->cgroup_inotify_wd_unit);
2333         m->cgroup_inotify_event_source = sd_event_source_unref(m->cgroup_inotify_event_source);
2334         m->cgroup_inotify_fd = safe_close(m->cgroup_inotify_fd);
         /* Release the pin fd so the cgroup fs can be unmounted again. */
2337         m->pin_cgroupfs_fd = safe_close(m->pin_cgroupfs_fd);
2339         m->cgroup_root = mfree(m->cgroup_root);
2342 #if 0 /// UNNEEDED by elogind
/* Maps a cgroup path to its owning unit: exact hashmap hit first, then walk up the path
 * one component at a time; the root slice is the final fallback. */
2343 Unit* manager_get_unit_by_cgroup(Manager *m, const char *cgroup) {
2350         u = hashmap_get(m->cgroup_unit, cgroup);
         /* Mutable stack copy so we can truncate at each '/'. */
2354         p = strdupa(cgroup);
2358                 e = strrchr(p, '/');
2360                         return hashmap_get(m->cgroup_unit, SPECIAL_ROOT_SLICE);
2364                 u = hashmap_get(m->cgroup_unit, p);
/* Resolves a PID to a unit via its cgroup membership; NULL on any failure. */
2370 Unit *manager_get_unit_by_pid_cgroup(Manager *m, pid_t pid) {
2371         _cleanup_free_ char *cgroup = NULL;
2375         if (!pid_is_valid(pid))
2378         if (cg_pid_get_path(SYSTEMD_CGROUP_CONTROLLER, pid, &cgroup) < 0)
2381         return manager_get_unit_by_cgroup(m, cgroup);
/* Best-effort PID-to-unit lookup: our own PID -> init.scope; otherwise cgroup membership,
 * then the watch_pids map (positive and negated key variants). */
2384 Unit *manager_get_unit_by_pid(Manager *m, pid_t pid) {
2389         /* Note that a process might be owned by multiple units, we return only one here, which is good enough for most
2390          * cases, though not strictly correct. We prefer the one reported by cgroup membership, as that's the most
2391          * relevant one as children of the process will be assigned to that one, too, before all else. */
2393         if (!pid_is_valid(pid))
2396         if (pid == getpid_cached())
2397                 return hashmap_get(m->units, SPECIAL_INIT_SCOPE);
2399         u = manager_get_unit_by_pid_cgroup(m, pid);
2403         u = hashmap_get(m->watch_pids, PID_TO_PTR(pid));
2407         array = hashmap_get(m->watch_pids, PID_TO_PTR(-pid));
2415 #if 0 /// elogind must substitute this with its own variant
/* systemd variant: cgroup-agent / --system notification -> queue the owning unit's empty check. */
2416 int manager_notify_cgroup_empty(Manager *m, const char *cgroup) {
2422         /* Called on the legacy hierarchy whenever we get an explicit cgroup notification from the cgroup agent process
2423          * or from the --system instance */
2425         log_debug("Got cgroup empty notification for: %s", cgroup);
2427         u = manager_get_unit_by_cgroup(m, cgroup);
2431         unit_add_to_cgroup_empty_queue(u);
/* elogind variant: a cgroup-empty notification means a login session's
 * processes are all gone — look the session up by cgroup name and
 * finalize it. Logs a warning when no matching session exists. */
2435 int manager_notify_cgroup_empty(Manager *m, const char *cgroup) {
2441 log_debug("Got cgroup empty notification for: %s", cgroup);
/* elogind keys its session hashmap by cgroup name. */
2443 s = hashmap_get(m->sessions, cgroup);
2446 session_finalize(s);
2449 log_warning("Session not found: %s", cgroup);
2454 #if 0 /// UNNEEDED by elogind
/* Fetch the unit's current memory usage in bytes into *ret.
 * Guards: memory accounting must be enabled, the unit must have a
 * cgroup path, and the memory controller must be realized for it.
 * Reads "memory.current" on the unified (v2) hierarchy, otherwise the
 * legacy "memory.usage_in_bytes" attribute.
 * NOTE(review): error-return lines between the guards are elided in
 * this extract. */
2455 int unit_get_memory_current(Unit *u, uint64_t *ret) {
2456 _cleanup_free_ char *v = NULL;
2462 if (!UNIT_CGROUP_BOOL(u, memory_accounting))
2465 if (!u->cgroup_path)
2468 if ((u->cgroup_realized_mask & CGROUP_MASK_MEMORY) == 0)
/* Pick the attribute name according to the hierarchy in use. */
2471 r = cg_all_unified();
2475 r = cg_get_attribute("memory", u->cgroup_path, "memory.current", &v);
2477 r = cg_get_attribute("memory", u->cgroup_path, "memory.usage_in_bytes", &v);
/* The attribute value is a decimal byte count. */
2483 return safe_atou64(v, ret);
/* Fetch the unit's current task (PID) count into *ret.
 * Guards mirror unit_get_memory_current(): tasks accounting enabled,
 * cgroup path present, pids controller realized. The root cgroup is
 * special-cased via /proc since it exposes no pids.current.
 * NOTE(review): error-return lines between guards are elided in this
 * extract. */
2486 int unit_get_tasks_current(Unit *u, uint64_t *ret) {
2487 _cleanup_free_ char *v = NULL;
2493 if (!UNIT_CGROUP_BOOL(u, tasks_accounting))
2496 if (!u->cgroup_path)
2499 if ((u->cgroup_realized_mask & CGROUP_MASK_PIDS) == 0)
2502 /* The root cgroup doesn't expose this information, let's get it from /proc instead */
2503 if (unit_has_root_cgroup(u))
2504 return procfs_tasks_get_current(ret);
2506 r = cg_get_attribute("pids", u->cgroup_path, "pids.current", &v);
/* pids.current is a plain decimal count. */
2512 return safe_atou64(v, ret);
/* Read the unit's raw cumulative CPU time into *ret (nanoseconds).
 * Unified (v2) hierarchy: parse the "usage_usec" key from cpu.stat,
 * which reports microseconds, and scale to nanoseconds.
 * Legacy hierarchy: read cpuacct.usage, which is already nanoseconds.
 * Each path checks that the corresponding controller is realized.
 * NOTE(review): branch structure and error returns are partially
 * elided in this extract. */
2515 static int unit_get_cpu_usage_raw(Unit *u, nsec_t *ret) {
2516 _cleanup_free_ char *v = NULL;
2523 if (!u->cgroup_path)
/* Decide between the v2 and legacy attribute layout. */
2526 r = cg_all_unified();
2530 const char *keys[] = { "usage_usec", NULL };
2531 _cleanup_free_ char *val = NULL;
2534 if ((u->cgroup_realized_mask & CGROUP_MASK_CPU) == 0)
2537 r = cg_get_keyed_attribute("cpu", u->cgroup_path, "cpu.stat", keys, &val);
2541 r = safe_atou64(val, &us);
/* cpu.stat reports microseconds; convert to nanoseconds. */
2545 ns = us * NSEC_PER_USEC;
2547 if ((u->cgroup_realized_mask & CGROUP_MASK_CPUACCT) == 0)
/* cpuacct.usage is already in nanoseconds. */
2550 r = cg_get_attribute("cpuacct", u->cgroup_path, "cpuacct.usage", &v);
2556 r = safe_atou64(v, &ns);
/* Return the unit's CPU usage relative to its start (raw counter minus
 * cpu_usage_base), caching the value in cpu_usage_last so it can still
 * be reported after the cgroup is gone (-ENODATA path). */
2565 int unit_get_cpu_usage(Unit *u, nsec_t *ret) {
2571 /* Retrieve the current CPU usage counter. This will subtract the CPU counter taken when the unit was
2572 * started. If the cgroup has been removed already, returns the last cached value. To cache the value, simply
2573 * call this function with a NULL return value. */
2575 if (!UNIT_CGROUP_BOOL(u, cpu_accounting))
2578 r = unit_get_cpu_usage_raw(u, &ns);
/* Cgroup vanished but we have a cached sample: serve that instead. */
2579 if (r == -ENODATA && u->cpu_usage_last != NSEC_INFINITY) {
2580 /* If we can't get the CPU usage anymore (because the cgroup was already removed, for example), use our
2584 *ret = u->cpu_usage_last;
/* Subtract the baseline taken at unit start; clamp at 0 (else branch
 * elided in this extract). */
2590 if (ns > u->cpu_usage_base)
2591 ns -= u->cpu_usage_base;
/* Remember the value for the -ENODATA fallback above. */
2595 u->cpu_usage_last = ns;
/* Read one IP accounting counter (ingress/egress × bytes/packets) for
 * the unit from its BPF firewall map, adding any counters carried over
 * from before a daemon reload (ip_accounting_extra). */
2602 int unit_get_ip_accounting(
2604 CGroupIPAccountingMetric metric,
2611 assert(metric >= 0);
2612 assert(metric < _CGROUP_IP_ACCOUNTING_METRIC_MAX);
2615 if (!UNIT_CGROUP_BOOL(u, ip_accounting))
/* Ingress metrics live in one BPF map, egress metrics in the other. */
2618 fd = IN_SET(metric, CGROUP_IP_INGRESS_BYTES, CGROUP_IP_INGRESS_PACKETS) ?
2619 u->ip_accounting_ingress_map_fd :
2620 u->ip_accounting_egress_map_fd;
/* Byte metrics read the first accounting slot, packet metrics the
 * second. */
2624 if (IN_SET(metric, CGROUP_IP_INGRESS_BYTES, CGROUP_IP_EGRESS_BYTES))
2625 r = bpf_firewall_read_accounting(fd, &value, NULL);
2627 r = bpf_firewall_read_accounting(fd, NULL, &value);
2631 /* Add in additional metrics from a previous runtime. Note that when reexecing/reloading the daemon we compile
2632 * all BPF programs and maps anew, but serialize the old counters. When deserializing we store them in the
2633 * ip_accounting_extra[] field, and add them in here transparently. */
2635 *ret = value + u->ip_accounting_extra[metric];
/* Reset CPU accounting: invalidate the cached last sample and rebase
 * the usage counter to the current raw reading (0 when the raw read
 * fails), so subsequent unit_get_cpu_usage() calls start from zero. */
2640 int unit_reset_cpu_accounting(Unit *u) {
/* NSEC_INFINITY marks "no cached sample" for the -ENODATA fallback. */
2646 u->cpu_usage_last = NSEC_INFINITY;
2648 r = unit_get_cpu_usage_raw(u, &ns);
/* Raw counter unavailable: treat the baseline as zero. */
2650 u->cpu_usage_base = 0;
2654 u->cpu_usage_base = ns;
/* Reset IP accounting: zero both BPF firewall map counters (when the
 * map fds are open) and the carried-over extras. Returns the first
 * error encountered — the ingress error takes precedence. */
2658 int unit_reset_ip_accounting(Unit *u) {
2663 if (u->ip_accounting_ingress_map_fd >= 0)
2664 r = bpf_firewall_reset_accounting(u->ip_accounting_ingress_map_fd);
2666 if (u->ip_accounting_egress_map_fd >= 0)
2667 q = bpf_firewall_reset_accounting(u->ip_accounting_egress_map_fd);
/* Drop counters deserialized from a previous runtime as well. */
2669 zero(u->ip_accounting_extra);
2671 return r < 0 ? r : q;
/* Mark the given controller mask as no longer realized for the unit
 * and queue it for cgroup re-realization. Controllers that are kept in
 * sync across the v1/v2 hierarchies (IO/BLKIO, CPU/CPUACCT) are always
 * invalidated as a pair. No-op for units without cgroup context or
 * when none of the requested bits are currently realized. */
2674 void unit_invalidate_cgroup(Unit *u, CGroupMask m) {
2677 if (!UNIT_HAS_CGROUP_CONTEXT(u))
2683 /* always invalidate compat pairs together */
2684 if (m & (CGROUP_MASK_IO | CGROUP_MASK_BLKIO))
2685 m |= CGROUP_MASK_IO | CGROUP_MASK_BLKIO;
2687 if (m & (CGROUP_MASK_CPU | CGROUP_MASK_CPUACCT))
2688 m |= CGROUP_MASK_CPU | CGROUP_MASK_CPUACCT;
2690 if ((u->cgroup_realized_mask & m) == 0) /* NOP? */
/* Clear the bits and let the realize queue redo the cgroup setup. */
2693 u->cgroup_realized_mask &= ~m;
2694 unit_add_to_cgroup_realize_queue(u);
/* Mark the unit's BPF (IP firewall) programs as needing recompilation
 * and queue it for realization. For slices this recurses into member
 * units, since a child's compiled IP access list includes its slice's. */
2697 void unit_invalidate_cgroup_bpf(Unit *u) {
2700 if (!UNIT_HAS_CGROUP_CONTEXT(u))
2703 if (u->cgroup_bpf_state == UNIT_CGROUP_BPF_INVALIDATED) /* NOP? */
2706 u->cgroup_bpf_state = UNIT_CGROUP_BPF_INVALIDATED;
2707 unit_add_to_cgroup_realize_queue(u);
2709 /* If we are a slice unit, we also need to put compile a new BPF program for all our children, as the IP access
2710 * list of our children includes our own. */
2711 if (u->type == UNIT_SLICE) {
/* Slice members are tracked via the UNIT_BEFORE dependency set;
 * filter to those whose parent slice is actually this unit. */
2716 HASHMAP_FOREACH_KEY(v, member, u->dependencies[UNIT_BEFORE], i) {
2720 if (UNIT_DEREF(member->slice) != u)
2723 unit_invalidate_cgroup_bpf(member);
/* Whether cgroup control should be delegated to this unit's payload:
 * the unit type must support delegation (vtable flag); the final
 * decision presumably also consults the cgroup context fetched below
 * (the tail of this function is elided in this extract). */
2728 bool unit_cgroup_delegate(Unit *u) {
2733 if (!UNIT_VTABLE(u)->can_delegate)
2736 c = unit_get_cgroup_context(u);
/* Re-realize resource settings for all units that use Startup*= values:
 * invalidate the CPU/IO/BLKIO controllers of every unit in the
 * startup_units set (typically called when leaving the startup phase). */
2743 void manager_invalidate_startup_units(Manager *m) {
2749 SET_FOREACH(u, m->startup_units, i)
2750 unit_invalidate_cgroup(u, CGROUP_MASK_CPU|CGROUP_MASK_IO|CGROUP_MASK_BLKIO);
/* String names for CGroupDevicePolicy values, used by the generated
 * cgroup_device_policy_{to,from}_string() helpers below. */
2753 static const char* const cgroup_device_policy_table[_CGROUP_DEVICE_POLICY_MAX] = {
2754 [CGROUP_AUTO] = "auto",
2755 [CGROUP_CLOSED] = "closed",
2756 [CGROUP_STRICT] = "strict",
/* Generates the to_string/from_string pair for the table above. */
2759 DEFINE_STRING_TABLE_LOOKUP(cgroup_device_policy, CGroupDevicePolicy);