/*-*- Mode: C; c-basic-offset: 8; indent-tabs-mode: nil -*-*/

/***
  This file is part of systemd.

  Copyright 2013 Lennart Poettering

  systemd is free software; you can redistribute it and/or modify it
  under the terms of the GNU Lesser General Public License as published by
  the Free Software Foundation; either version 2.1 of the License, or
  (at your option) any later version.

  systemd is distributed in the hope that it will be useful, but
  WITHOUT ANY WARRANTY; without even the implied warranty of
  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
  Lesser General Public License for more details.

  You should have received a copy of the GNU Lesser General Public License
  along with systemd; If not, see <http://www.gnu.org/licenses/>.
***/

#include "alloc-util.h"
#include "cgroup-util.h"
#include "parse-util.h"
#include "path-util.h"
#include "process-util.h"
//#include "special.h"
#include "string-table.h"
#include "string-util.h"
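
/* The period over which CFS CPU quota is applied: a per-second quota
 * (CPUQuotaPerSecUSec) is scaled down to this 100ms window before being
 * written to cpu.cfs_quota_us in cgroup_context_apply() below. */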
#define CGROUP_CPU_QUOTA_PERIOD_USEC ((usec_t) 100 * USEC_PER_MSEC)

#if 0 /// UNNEEDED by elogind
void cgroup_context_init(CGroupContext *c) {

        /* Initialize everything to the kernel defaults, assuming the
         * structure is preinitialized to 0 */

        c->cpu_shares = CGROUP_CPU_SHARES_INVALID;
        c->startup_cpu_shares = CGROUP_CPU_SHARES_INVALID;
        c->cpu_quota_per_sec_usec = USEC_INFINITY;

        c->memory_limit = (uint64_t) -1;

        c->blockio_weight = CGROUP_BLKIO_WEIGHT_INVALID;
        c->startup_blockio_weight = CGROUP_BLKIO_WEIGHT_INVALID;

        c->tasks_max = (uint64_t) -1;

        c->netclass_type = CGROUP_NETCLASS_TYPE_NONE;
}
#endif // 0

void cgroup_context_free_device_allow(CGroupContext *c, CGroupDeviceAllow *a) {
        LIST_REMOVE(device_allow, c->device_allow, a);
        free(a->path);
        free(a);
}

void cgroup_context_free_blockio_device_weight(CGroupContext *c, CGroupBlockIODeviceWeight *w) {
        LIST_REMOVE(device_weights, c->blockio_device_weights, w);
        free(w->path);
        free(w);
}

void cgroup_context_free_blockio_device_bandwidth(CGroupContext *c, CGroupBlockIODeviceBandwidth *b) {
        LIST_REMOVE(device_bandwidths, c->blockio_device_bandwidths, b);
        free(b->path);
        free(b);
}

void cgroup_context_done(CGroupContext *c) {
        while (c->blockio_device_weights)
                cgroup_context_free_blockio_device_weight(c, c->blockio_device_weights);

        while (c->blockio_device_bandwidths)
                cgroup_context_free_blockio_device_bandwidth(c, c->blockio_device_bandwidths);

        while (c->device_allow)
                cgroup_context_free_device_allow(c, c->device_allow);
}

void cgroup_context_dump(CGroupContext *c, FILE* f, const char *prefix) {
        CGroupBlockIODeviceBandwidth *b;
        CGroupBlockIODeviceWeight *w;
        CGroupDeviceAllow *a;
        char u[FORMAT_TIMESPAN_MAX];

        prefix = strempty(prefix);

113 "%sCPUAccounting=%s\n"
114 "%sBlockIOAccounting=%s\n"
115 "%sMemoryAccounting=%s\n"
116 "%sTasksAccounting=%s\n"
117 "%sCPUShares=%" PRIu64 "\n"
118 "%sStartupCPUShares=%" PRIu64 "\n"
119 "%sCPUQuotaPerSecSec=%s\n"
120 "%sBlockIOWeight=%" PRIu64 "\n"
121 "%sStartupBlockIOWeight=%" PRIu64 "\n"
122 "%sMemoryLimit=%" PRIu64 "\n"
123 "%sTasksMax=%" PRIu64 "\n"
124 "%sDevicePolicy=%s\n"
126 prefix, yes_no(c->cpu_accounting),
127 prefix, yes_no(c->blockio_accounting),
128 prefix, yes_no(c->memory_accounting),
129 prefix, yes_no(c->tasks_accounting),
130 prefix, c->cpu_shares,
131 prefix, c->startup_cpu_shares,
132 prefix, format_timespan(u, sizeof(u), c->cpu_quota_per_sec_usec, 1),
133 prefix, c->blockio_weight,
134 prefix, c->startup_blockio_weight,
135 prefix, c->memory_limit,
136 prefix, c->tasks_max,
137 prefix, cgroup_device_policy_to_string(c->device_policy),
138 prefix, yes_no(c->delegate));
        LIST_FOREACH(device_allow, a, c->device_allow)
                fprintf(f,
                        "%sDeviceAllow=%s %s%s%s\n",
                        prefix,
                        a->path,
                        a->r ? "r" : "", a->w ? "w" : "", a->m ? "m" : "");

        LIST_FOREACH(device_weights, w, c->blockio_device_weights)
                fprintf(f,
                        "%sBlockIODeviceWeight=%s %" PRIu64 "\n",
                        prefix,
                        w->path,
                        w->weight);

        LIST_FOREACH(device_bandwidths, b, c->blockio_device_bandwidths) {
                char buf[FORMAT_BYTES_MAX];

                fprintf(f,
                        "%s%s=%s %s\n",
                        prefix,
                        b->read ? "BlockIOReadBandwidth" : "BlockIOWriteBandwidth",
                        b->path,
                        format_bytes(buf, sizeof(buf), b->bandwidth));
        }
}

static int lookup_blkio_device(const char *p, dev_t *dev) {
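        /* Resolves an arbitrary path to the block device backing it: device
         * nodes are used as-is, while for regular files the device the file
         * system lives on is looked up via st_dev and then reduced to the
         * whole disk, since blkio attributes can only be set on top-level
         * devices. */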
        struct stat st;

        if (stat(p, &st) < 0)
                return log_warning_errno(errno, "Couldn't stat device %s: %m", p);

        if (S_ISBLK(st.st_mode))
                *dev = st.st_rdev;
        else if (major(st.st_dev) != 0) {
                /* If this is not a device node then find the block
                 * device this file is stored on */
                *dev = st.st_dev;

                /* If this is a partition, try to get the originating
                 * block device */
                block_get_whole_disk(*dev, dev);
        } else {
                log_warning("%s is not a block device and file system block device cannot be determined or is not local.", p);
                return -ENODEV;
        }

        return 0;
}

static int whitelist_device(const char *path, const char *node, const char *acc) {
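        /* Builds a device access rule of the form "<type> <major>:<minor> <acc>"
         * (e.g. "c 1:3 rwm" for /dev/null) and writes it to the devices.allow
         * attribute of the given cgroup. */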
        char buf[2+DECIMAL_STR_MAX(dev_t)*2+2+4];
        struct stat st;
        int r;

        if (stat(node, &st) < 0) {
                log_warning("Couldn't stat device %s", node);
                return -errno;
        }

        if (!S_ISCHR(st.st_mode) && !S_ISBLK(st.st_mode)) {
                log_warning("%s is not a device.", node);
                return -ENODEV;
        }

        sprintf(buf,
                "%c %u:%u %s",
                S_ISCHR(st.st_mode) ? 'c' : 'b',
                major(st.st_rdev), minor(st.st_rdev),
                acc);

        r = cg_set_attribute("devices", path, "devices.allow", buf);
        if (r < 0)
                log_full_errno(IN_SET(r, -ENOENT, -EROFS, -EINVAL, -EACCES) ? LOG_DEBUG : LOG_WARNING, r,
                               "Failed to set devices.allow on %s: %m", path);

        return r;
}

static int whitelist_major(const char *path, const char *name, char type, const char *acc) {
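        /* Whitelists all devices sharing a major number: scans /proc/devices
         * for entries matching "name" (fnmatch() patterns are allowed) and
         * writes a "<type> <major>:* <acc>" rule for each match. */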
        _cleanup_fclose_ FILE *f = NULL;
        char line[LINE_MAX];
        bool good = false;
        int r;

        assert(type == 'b' || type == 'c');

        f = fopen("/proc/devices", "re");
        if (!f)
                return log_warning_errno(errno, "Cannot open /proc/devices to resolve %s (%c): %m", name, type);

        FOREACH_LINE(line, f, goto fail) {
                char buf[2+DECIMAL_STR_MAX(unsigned)+3+4], *p, *w;
                unsigned maj;

                truncate_nl(line);

                if (type == 'c' && streq(line, "Character devices:")) {
                        good = true;
                        continue;
                }

                if (type == 'b' && streq(line, "Block devices:")) {
                        good = true;
                        continue;
                }

                if (isempty(line)) {
                        good = false;
                        continue;
                }

                if (!good)
                        continue;

                p = strstrip(line);

                w = strpbrk(p, WHITESPACE);
                if (!w)
                        continue;
                *w = 0;

                r = safe_atou(p, &maj);
                if (r < 0)
                        continue;

                w++;
                w += strspn(w, WHITESPACE);

                if (fnmatch(name, w, 0) != 0)
                        continue;

                sprintf(buf,
                        "%c %u:* %s",
                        type,
                        maj,
                        acc);

                r = cg_set_attribute("devices", path, "devices.allow", buf);
                if (r < 0)
                        log_full_errno(IN_SET(r, -ENOENT, -EROFS, -EINVAL, -EACCES) ? LOG_DEBUG : LOG_WARNING, r,
                                       "Failed to set devices.allow on %s: %m", path);
        }

        return 0;

fail:
        log_warning_errno(errno, "Failed to read /proc/devices: %m");
        return -errno;
}

void cgroup_context_apply(CGroupContext *c, CGroupMask mask, const char *path, uint32_t netclass, ManagerState state) {
        bool is_root;
        int r;

        /* Some cgroup attributes are not supported on the root cgroup,
         * hence silently ignore */
        is_root = isempty(path) || path_equal(path, "/");

        /* Make sure we don't try to display messages with an empty path. */
        if (isempty(path))
                path = "/";

        /* We generally ignore errors caused by read-only mounted
         * cgroup trees (assuming we are running in a container then),
         * and missing cgroups, i.e. EROFS and ENOENT. */

        if ((mask & CGROUP_MASK_CPU) && !is_root) {
                char buf[MAX(DECIMAL_STR_MAX(uint64_t), DECIMAL_STR_MAX(usec_t)) + 1];

                sprintf(buf, "%" PRIu64 "\n",
                        IN_SET(state, MANAGER_STARTING, MANAGER_INITIALIZING) && c->startup_cpu_shares != CGROUP_CPU_SHARES_INVALID ? c->startup_cpu_shares :
                        c->cpu_shares != CGROUP_CPU_SHARES_INVALID ? c->cpu_shares : CGROUP_CPU_SHARES_DEFAULT);
                r = cg_set_attribute("cpu", path, "cpu.shares", buf);
                if (r < 0)
                        log_full_errno(IN_SET(r, -ENOENT, -EROFS, -EACCES) ? LOG_DEBUG : LOG_WARNING, r,
                                       "Failed to set cpu.shares on %s: %m", path);

                sprintf(buf, USEC_FMT "\n", CGROUP_CPU_QUOTA_PERIOD_USEC);
                r = cg_set_attribute("cpu", path, "cpu.cfs_period_us", buf);
                if (r < 0)
                        log_full_errno(IN_SET(r, -ENOENT, -EROFS, -EACCES) ? LOG_DEBUG : LOG_WARNING, r,
                                       "Failed to set cpu.cfs_period_us on %s: %m", path);
                if (c->cpu_quota_per_sec_usec != USEC_INFINITY) {
                        sprintf(buf, USEC_FMT "\n", c->cpu_quota_per_sec_usec * CGROUP_CPU_QUOTA_PERIOD_USEC / USEC_PER_SEC);
                        r = cg_set_attribute("cpu", path, "cpu.cfs_quota_us", buf);
                } else
                        r = cg_set_attribute("cpu", path, "cpu.cfs_quota_us", "-1");
                if (r < 0)
                        log_full_errno(IN_SET(r, -ENOENT, -EROFS, -EACCES) ? LOG_DEBUG : LOG_WARNING, r,
                                       "Failed to set cpu.cfs_quota_us on %s: %m", path);
        }

351 if (mask & CGROUP_MASK_BLKIO) {
352 char buf[MAX(DECIMAL_STR_MAX(uint64_t)+1,
353 DECIMAL_STR_MAX(dev_t)*2+2+DECIMAL_STR_MAX(uint64_t)+1)];
354 CGroupBlockIODeviceWeight *w;
355 CGroupBlockIODeviceBandwidth *b;
358 sprintf(buf, "%" PRIu64 "\n",
359 IN_SET(state, MANAGER_STARTING, MANAGER_INITIALIZING) && c->startup_blockio_weight != CGROUP_BLKIO_WEIGHT_INVALID ? c->startup_blockio_weight :
360 c->blockio_weight != CGROUP_BLKIO_WEIGHT_INVALID ? c->blockio_weight : CGROUP_BLKIO_WEIGHT_DEFAULT);
361 r = cg_set_attribute("blkio", path, "blkio.weight", buf);
363 log_full_errno(IN_SET(r, -ENOENT, -EROFS, -EACCES) ? LOG_DEBUG : LOG_WARNING, r,
364 "Failed to set blkio.weight on %s: %m", path);
366 /* FIXME: no way to reset this list */
367 LIST_FOREACH(device_weights, w, c->blockio_device_weights) {
370 r = lookup_blkio_device(w->path, &dev);
374 sprintf(buf, "%u:%u %" PRIu64 "\n", major(dev), minor(dev), w->weight);
375 r = cg_set_attribute("blkio", path, "blkio.weight_device", buf);
377 log_full_errno(IN_SET(r, -ENOENT, -EROFS, -EACCES) ? LOG_DEBUG : LOG_WARNING, r,
378 "Failed to set blkio.weight_device on %s: %m", path);
382 /* FIXME: no way to reset this list */
383 LIST_FOREACH(device_bandwidths, b, c->blockio_device_bandwidths) {
387 r = lookup_blkio_device(b->path, &dev);
391 a = b->read ? "blkio.throttle.read_bps_device" : "blkio.throttle.write_bps_device";
393 sprintf(buf, "%u:%u %" PRIu64 "\n", major(dev), minor(dev), b->bandwidth);
394 r = cg_set_attribute("blkio", path, a, buf);
396 log_full_errno(IN_SET(r, -ENOENT, -EROFS, -EACCES) ? LOG_DEBUG : LOG_WARNING, r,
397 "Failed to set %s on %s: %m", a, path);
401 if ((mask & CGROUP_MASK_MEMORY) && !is_root) {
402 if (c->memory_limit != (uint64_t) -1) {
403 char buf[DECIMAL_STR_MAX(uint64_t) + 1];
405 sprintf(buf, "%" PRIu64 "\n", c->memory_limit);
407 if (cg_unified() <= 0)
408 r = cg_set_attribute("memory", path, "memory.limit_in_bytes", buf);
410 r = cg_set_attribute("memory", path, "memory.max", buf);
413 if (cg_unified() <= 0)
414 r = cg_set_attribute("memory", path, "memory.limit_in_bytes", "-1");
416 r = cg_set_attribute("memory", path, "memory.max", "max");
420 log_full_errno(IN_SET(r, -ENOENT, -EROFS, -EACCES) ? LOG_DEBUG : LOG_WARNING, r,
421 "Failed to set memory.limit_in_bytes/memory.max on %s: %m", path);
424 if ((mask & CGROUP_MASK_DEVICES) && !is_root) {
425 CGroupDeviceAllow *a;
427 /* Changing the devices list of a populated cgroup
428 * might result in EINVAL, hence ignore EINVAL
431 if (c->device_allow || c->device_policy != CGROUP_AUTO)
432 r = cg_set_attribute("devices", path, "devices.deny", "a");
434 r = cg_set_attribute("devices", path, "devices.allow", "a");
436 log_full_errno(IN_SET(r, -ENOENT, -EROFS, -EINVAL, -EACCES) ? LOG_DEBUG : LOG_WARNING, r,
437 "Failed to reset devices.list on %s: %m", path);
                if (c->device_policy == CGROUP_CLOSED ||
                    (c->device_policy == CGROUP_AUTO && c->device_allow)) {
                        static const char auto_devices[] =
                                "/dev/null\0" "rwm\0"
                                "/dev/zero\0" "rwm\0"
                                "/dev/full\0" "rwm\0"
                                "/dev/random\0" "rwm\0"
                                "/dev/urandom\0" "rwm\0"
                                "/dev/tty\0" "rwm\0"
                                "/dev/pts/ptmx\0" "rw\0"; /* /dev/pts/ptmx may not be duplicated, but accessed */

                        const char *x, *y;

                        NULSTR_FOREACH_PAIR(x, y, auto_devices)
                                whitelist_device(path, x, y);

                        whitelist_major(path, "pts", 'c', "rw");
                        whitelist_major(path, "kdbus", 'c', "rw");
                        whitelist_major(path, "kdbus/*", 'c', "rw");
                }

                LIST_FOREACH(device_allow, a, c->device_allow) {
                        char acc[4];
                        unsigned k = 0;

                        if (a->r)
                                acc[k++] = 'r';
                        if (a->w)
                                acc[k++] = 'w';
                        if (a->m)
                                acc[k++] = 'm';

                        if (k == 0)
                                continue;

                        acc[k++] = 0;

                        if (startswith(a->path, "/dev/"))
                                whitelist_device(path, a->path, acc);
                        else if (startswith(a->path, "block-"))
                                whitelist_major(path, a->path + 6, 'b', acc);
                        else if (startswith(a->path, "char-"))
                                whitelist_major(path, a->path + 5, 'c', acc);
                        else
                                log_debug("Ignoring device %s while writing cgroup attribute.", a->path);
                }
        }

487 if ((mask & CGROUP_MASK_PIDS) && !is_root) {
489 if (c->tasks_max != (uint64_t) -1) {
490 char buf[DECIMAL_STR_MAX(uint64_t) + 2];
492 sprintf(buf, "%" PRIu64 "\n", c->tasks_max);
493 r = cg_set_attribute("pids", path, "pids.max", buf);
495 r = cg_set_attribute("pids", path, "pids.max", "max");
498 log_full_errno(IN_SET(r, -ENOENT, -EROFS, -EACCES) ? LOG_DEBUG : LOG_WARNING, r,
499 "Failed to set pids.max on %s: %m", path);
502 if (mask & CGROUP_MASK_NET_CLS) {
503 char buf[DECIMAL_STR_MAX(uint32_t)];
505 sprintf(buf, "%" PRIu32, netclass);
507 r = cg_set_attribute("net_cls", path, "net_cls.classid", buf);
509 log_full_errno(IN_SET(r, -ENOENT, -EROFS, -EACCES) ? LOG_DEBUG : LOG_WARNING, r,
510 "Failed to set net_cls.classid on %s: %m", path);
CGroupMask cgroup_context_get_mask(CGroupContext *c) {
        CGroupMask mask = 0;

        /* Figure out which controllers we need */

        if (c->cpu_accounting ||
            c->cpu_shares != CGROUP_CPU_SHARES_INVALID ||
            c->startup_cpu_shares != CGROUP_CPU_SHARES_INVALID ||
            c->cpu_quota_per_sec_usec != USEC_INFINITY)
                mask |= CGROUP_MASK_CPUACCT | CGROUP_MASK_CPU;

        if (c->blockio_accounting ||
            c->blockio_weight != CGROUP_BLKIO_WEIGHT_INVALID ||
            c->startup_blockio_weight != CGROUP_BLKIO_WEIGHT_INVALID ||
            c->blockio_device_weights ||
            c->blockio_device_bandwidths)
                mask |= CGROUP_MASK_BLKIO;

        if (c->memory_accounting ||
            c->memory_limit != (uint64_t) -1)
                mask |= CGROUP_MASK_MEMORY;

        if (c->device_allow ||
            c->device_policy != CGROUP_AUTO)
                mask |= CGROUP_MASK_DEVICES;

        if (c->tasks_accounting ||
            c->tasks_max != (uint64_t) -1)
                mask |= CGROUP_MASK_PIDS;

        if (c->netclass_type != CGROUP_NETCLASS_TYPE_NONE)
                mask |= CGROUP_MASK_NET_CLS;

        return mask;
}

CGroupMask unit_get_own_mask(Unit *u) {
        CGroupContext *c;

        /* Returns the mask of controllers the unit needs for itself */

        c = unit_get_cgroup_context(u);
        if (!c)
                return 0;

        /* If delegation is turned on, then turn on all cgroups,
         * unless we are on the legacy hierarchy and the process we
         * fork into it is known to drop privileges, and hence
         * shouldn't get access to the controllers.
         *
         * Note that on the unified hierarchy it is safe to delegate
         * controllers to unprivileged services. */

        if (c->delegate) {
                ExecContext *e;

                e = unit_get_exec_context(u);
                if (!e ||
                    exec_context_maintains_privileges(e) ||
                    cg_unified() > 0)
                        return _CGROUP_MASK_ALL;
        }

        return cgroup_context_get_mask(c);
}

CGroupMask unit_get_members_mask(Unit *u) {

        /* Returns the mask of controllers all of the unit's children
         * require, i.e. the combined mask of all its member units. */

        if (u->cgroup_members_mask_valid)
                return u->cgroup_members_mask;

        u->cgroup_members_mask = 0;

        if (u->type == UNIT_SLICE) {
                Unit *member;
                Iterator i;

                SET_FOREACH(member, u->dependencies[UNIT_BEFORE], i) {

                        if (member == u)
                                continue;

                        if (UNIT_DEREF(member->slice) != u)
                                continue;

                        u->cgroup_members_mask |=
                                unit_get_own_mask(member) |
                                unit_get_members_mask(member);
                }
        }

        u->cgroup_members_mask_valid = true;
        return u->cgroup_members_mask;
}

CGroupMask unit_get_siblings_mask(Unit *u) {

        /* Returns the mask of controllers all of the unit's siblings
         * require, i.e. the members mask of the unit's parent slice
         * if there is one. */

        if (UNIT_ISSET(u->slice))
                return unit_get_members_mask(UNIT_DEREF(u->slice));

        return unit_get_own_mask(u) | unit_get_members_mask(u);
}

CGroupMask unit_get_subtree_mask(Unit *u) {

        /* Returns the mask of this subtree, meaning of the group
         * itself and its children. */

        return unit_get_own_mask(u) | unit_get_members_mask(u);
}

CGroupMask unit_get_target_mask(Unit *u) {
        CGroupMask mask;

        /* This returns the cgroup mask of all controllers to enable
         * for a specific cgroup, i.e. everything it needs itself,
         * plus all that its children need, plus all that its siblings
         * need. This is primarily useful on the legacy cgroup
         * hierarchy, where we need to duplicate each cgroup in each
         * hierarchy that shall be enabled for it. */

        mask = unit_get_own_mask(u) | unit_get_members_mask(u) | unit_get_siblings_mask(u);
        mask &= u->manager->cgroup_supported;

        return mask;
}

CGroupMask unit_get_enable_mask(Unit *u) {
        CGroupMask mask;

        /* This returns the cgroup mask of all controllers to enable
         * for the children of a specific cgroup. This is primarily
         * useful for the unified cgroup hierarchy, where each cgroup
         * controls which controllers are enabled for its children. */

        mask = unit_get_members_mask(u);
        mask &= u->manager->cgroup_supported;

        return mask;
}

/* Recurse from a unit up through its containing slices, propagating
 * mask bits upward. A unit is also a member of itself. */
void unit_update_cgroup_members_masks(Unit *u) {
        CGroupMask m;
        bool more;

        /* Calculate subtree mask */
        m = unit_get_subtree_mask(u);

        /* See if anything changed from the previous invocation. If
         * not, we're done. */
        if (u->cgroup_subtree_mask_valid && m == u->cgroup_subtree_mask)
                return;

        more =
                u->cgroup_subtree_mask_valid &&
                ((m & ~u->cgroup_subtree_mask) != 0) &&
                ((~m & u->cgroup_subtree_mask) == 0);

        u->cgroup_subtree_mask = m;
        u->cgroup_subtree_mask_valid = true;

        if (UNIT_ISSET(u->slice)) {
                Unit *s = UNIT_DEREF(u->slice);

                if (more)
                        /* There's more set now than before. We
                         * propagate the new mask to the parent's mask
                         * (not caring if it actually was valid or
                         * not) */
                        s->cgroup_members_mask |= m;
                else
                        /* There's less set now than before (or we
                         * don't know), we need to recalculate
                         * everything, so let's invalidate the
                         * parent's members mask */
                        s->cgroup_members_mask_valid = false;

                /* And now make sure that this change also hits our
                 * grandparents */
                unit_update_cgroup_members_masks(s);
        }
}

static const char *migrate_callback(CGroupMask mask, void *userdata) {
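        /* Walks up the slice chain and returns the path of the nearest
         * realized cgroup that has all controllers of the requested mask
         * enabled, as the target to migrate stray processes to. */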
720 if (u->cgroup_path &&
721 u->cgroup_realized &&
722 (u->cgroup_realized_mask & mask) == mask)
723 return u->cgroup_path;
725 u = UNIT_DEREF(u->slice);
731 char *unit_default_cgroup_path(Unit *u) {
732 _cleanup_free_ char *escaped = NULL, *slice = NULL;
737 if (unit_has_name(u, SPECIAL_ROOT_SLICE))
738 return strdup(u->manager->cgroup_root);
740 if (UNIT_ISSET(u->slice) && !unit_has_name(UNIT_DEREF(u->slice), SPECIAL_ROOT_SLICE)) {
741 r = cg_slice_to_path(UNIT_DEREF(u->slice)->id, &slice);
746 escaped = cg_escape(u->id);
751 return strjoin(u->manager->cgroup_root, "/", slice, "/", escaped, NULL);
753 return strjoin(u->manager->cgroup_root, "/", escaped, NULL);
int unit_set_cgroup_path(Unit *u, const char *path) {
        _cleanup_free_ char *p = NULL;
        int r;

        if (path) {
                p = strdup(path);
                if (!p)
                        return -ENOMEM;
        }

        if (streq_ptr(u->cgroup_path, p))
                return 0;

        if (p) {
                r = hashmap_put(u->manager->cgroup_unit, p, u);
                if (r < 0)
                        return r;
        }

        unit_release_cgroup(u);

        u->cgroup_path = p;
        p = NULL;

        return 0;
}

int unit_watch_cgroup(Unit *u) {
        _cleanup_free_ char *populated = NULL;
        int r;

        if (!u->cgroup_path)
                return 0;

        if (u->cgroup_inotify_wd >= 0)
                return 0;

        /* Only applies to the unified hierarchy */
        r = cg_unified();
        if (r < 0)
                return log_unit_error_errno(u, r, "Failed to detect whether the unified hierarchy is used: %m");
        if (r == 0)
                return 0;

        /* Don't watch the root slice, it's pointless. */
        if (unit_has_name(u, SPECIAL_ROOT_SLICE))
                return 0;

        r = hashmap_ensure_allocated(&u->manager->cgroup_inotify_wd_unit, &trivial_hash_ops);
        if (r < 0)
                return log_oom();

        r = cg_get_path(SYSTEMD_CGROUP_CONTROLLER, u->cgroup_path, "cgroup.populated", &populated);
        if (r < 0)
                return r;

        u->cgroup_inotify_wd = inotify_add_watch(u->manager->cgroup_inotify_fd, populated, IN_MODIFY);
        if (u->cgroup_inotify_wd < 0) {

                if (errno == ENOENT) /* If the directory is already
                                      * gone we don't need to track
                                      * it, so this is not an error */
                        return 0;

                return log_unit_error_errno(u, errno, "Failed to add inotify watch descriptor for control group %s: %m", u->cgroup_path);
        }

        r = hashmap_put(u->manager->cgroup_inotify_wd_unit, INT_TO_PTR(u->cgroup_inotify_wd), u);
        if (r < 0)
                return log_unit_error_errno(u, r, "Failed to add inotify watch descriptor to hash map: %m");

        return 0;
}

static int unit_create_cgroup(
                Unit *u,
                CGroupMask target_mask,
                CGroupMask enable_mask) {

        CGroupContext *c;
        int r;

        c = unit_get_cgroup_context(u);
        assert(c);

        if (!u->cgroup_path) {
                _cleanup_free_ char *path = NULL;

                path = unit_default_cgroup_path(u);
                if (!path)
                        return log_oom();

                r = unit_set_cgroup_path(u, path);
                if (r == -EEXIST)
                        return log_unit_error_errno(u, r, "Control group %s exists already.", path);
                if (r < 0)
                        return log_unit_error_errno(u, r, "Failed to set unit's control group path to %s: %m", path);
        }

        /* First, create our own group */
        r = cg_create_everywhere(u->manager->cgroup_supported, target_mask, u->cgroup_path);
        if (r < 0)
                return log_unit_error_errno(u, r, "Failed to create cgroup %s: %m", u->cgroup_path);

        /* Start watching it */
        (void) unit_watch_cgroup(u);

        /* Enable all controllers we need */
        r = cg_enable_everywhere(u->manager->cgroup_supported, enable_mask, u->cgroup_path);
        if (r < 0)
                log_unit_warning_errno(u, r, "Failed to enable controllers on cgroup %s, ignoring: %m", u->cgroup_path);

        /* Keep track that this is now realized */
        u->cgroup_realized = true;
        u->cgroup_realized_mask = target_mask;

        if (u->type != UNIT_SLICE && !c->delegate) {

                /* Then, possibly move things over, but not if
                 * subgroups may contain processes, which is the case
                 * for slice and delegation units. */
                r = cg_migrate_everywhere(u->manager->cgroup_supported, u->cgroup_path, u->cgroup_path, migrate_callback, u);
                if (r < 0)
                        log_unit_warning_errno(u, r, "Failed to migrate cgroup to %s, ignoring: %m", u->cgroup_path);
        }

        return 0;
}

int unit_attach_pids_to_cgroup(Unit *u) {
        int r;

        r = unit_realize_cgroup(u);
        if (r < 0)
                return r;

        r = cg_attach_many_everywhere(u->manager->cgroup_supported, u->cgroup_path, u->pids, migrate_callback, u);
        if (r < 0)
                return r;

        return 0;
}

static bool unit_has_mask_realized(Unit *u, CGroupMask target_mask) {

        return u->cgroup_realized && u->cgroup_realized_mask == target_mask;
}

static int unit_find_free_netclass_cgroup(Unit *u, uint32_t *ret) {
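        /* Hands out a free dynamic netclass ID: the search starts right after
         * the last ID that was assigned and wraps around to
         * CGROUP_NETCLASS_FIXED_MAX (the first non-reserved ID) until an
         * unused slot is found in the registry. */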
        uint32_t start, i;
        Manager *m;

        m = u->manager;

        i = start = m->cgroup_netclass_registry_last;

        do {
                i++;

                if (!hashmap_get(m->cgroup_netclass_registry, UINT_TO_PTR(i))) {
                        m->cgroup_netclass_registry_last = i;
                        *ret = i;
                        return 0;
                }

                if (i == UINT32_MAX)
                        i = CGROUP_NETCLASS_FIXED_MAX;

        } while (i != start);

        return -ENOBUFS;
}

int unit_add_to_netclass_cgroup(Unit *u) {
        CGroupContext *cc;
        Unit *first;
        void *key;
        int r;

        cc = unit_get_cgroup_context(u);
        if (!cc)
                return 0;

        switch (cc->netclass_type) {
        case CGROUP_NETCLASS_TYPE_NONE:
                return 0;

        case CGROUP_NETCLASS_TYPE_FIXED:
                u->cgroup_netclass_id = cc->netclass_id;
                break;

        case CGROUP_NETCLASS_TYPE_AUTO:
                /* Allocate a new ID in case it was requested and not done yet */
                if (u->cgroup_netclass_id == 0) {
                        r = unit_find_free_netclass_cgroup(u, &u->cgroup_netclass_id);
                        if (r < 0)
                                return r;

                        log_debug("Dynamically assigned netclass cgroup id %" PRIu32 " to %s", u->cgroup_netclass_id, u->id);
                }

                break;
        }

        r = hashmap_ensure_allocated(&u->manager->cgroup_netclass_registry, &trivial_hash_ops);
        if (r < 0)
                return r;

        key = UINT32_TO_PTR(u->cgroup_netclass_id);
        first = hashmap_get(u->manager->cgroup_netclass_registry, key);

        if (first) {
                LIST_PREPEND(cgroup_netclass, first, u);
                return hashmap_replace(u->manager->cgroup_netclass_registry, key, u);
        }

        return hashmap_put(u->manager->cgroup_netclass_registry, key, u);
}

int unit_remove_from_netclass_cgroup(Unit *u) {
        Unit *head;
        void *key;

        key = UINT32_TO_PTR(u->cgroup_netclass_id);

        LIST_FIND_HEAD(cgroup_netclass, u, head);
        LIST_REMOVE(cgroup_netclass, head, u);

        if (head)
                return hashmap_replace(u->manager->cgroup_netclass_registry, key, head);

        hashmap_remove(u->manager->cgroup_netclass_registry, key);

        return 0;
}

/* Check if necessary controllers and attributes for a unit are in place.
 *
 * If so, do nothing.
 * If not, create paths, move processes over, and set attributes.
 *
 * Returns 0 on success and < 0 on failure. */
static int unit_realize_cgroup_now(Unit *u, ManagerState state) {
        CGroupMask target_mask, enable_mask;
        int r;

        if (u->in_cgroup_queue) {
                LIST_REMOVE(cgroup_queue, u->manager->cgroup_queue, u);
                u->in_cgroup_queue = false;
        }

        target_mask = unit_get_target_mask(u);
        if (unit_has_mask_realized(u, target_mask))
                return 0;

        /* First, realize parents */
        if (UNIT_ISSET(u->slice)) {
                r = unit_realize_cgroup_now(UNIT_DEREF(u->slice), state);
                if (r < 0)
                        return r;
        }

        /* And then do the real work */
        enable_mask = unit_get_enable_mask(u);
        r = unit_create_cgroup(u, target_mask, enable_mask);
        if (r < 0)
                return r;

        /* Finally, apply the necessary attributes. */
        cgroup_context_apply(unit_get_cgroup_context(u), target_mask, u->cgroup_path, u->cgroup_netclass_id, state);

        return 0;
}

static void unit_add_to_cgroup_queue(Unit *u) {

        if (u->in_cgroup_queue)
                return;

        LIST_PREPEND(cgroup_queue, u->manager->cgroup_queue, u);
        u->in_cgroup_queue = true;
}

unsigned manager_dispatch_cgroup_queue(Manager *m) {
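        /* Drains the queue of units whose cgroups need to be (re)realized,
         * realizing each one for the current manager state; returns how many
         * units were processed. */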
        ManagerState state;
        unsigned n = 0;
        Unit *i;
        int r;

        state = manager_state(m);

        while ((i = m->cgroup_queue)) {
                assert(i->in_cgroup_queue);

                r = unit_realize_cgroup_now(i, state);
                if (r < 0)
                        log_warning_errno(r, "Failed to realize cgroups for queued unit %s, ignoring: %m", i->id);

                n++;
        }

        return n;
}

static void unit_queue_siblings(Unit *u) {
        Unit *slice;

        /* This adds the siblings of the specified unit and the
         * siblings of all parent units to the cgroup queue. (But
         * neither the specified unit itself nor the parents.) */

        while ((slice = UNIT_DEREF(u->slice))) {
                Iterator i;
                Unit *m;

                SET_FOREACH(m, slice->dependencies[UNIT_BEFORE], i) {
                        if (m == u)
                                continue;

                        /* Skip units that have a dependency on the slice
                         * but aren't actually in it. */
                        if (UNIT_DEREF(m->slice) != slice)
                                continue;

                        /* No point in doing cgroup application for units
                         * without active processes. */
                        if (UNIT_IS_INACTIVE_OR_FAILED(unit_active_state(m)))
                                continue;

                        /* If the unit doesn't need any new controllers
                         * and has current ones realized, it doesn't need
                         * any changes. */
                        if (unit_has_mask_realized(m, unit_get_target_mask(m)))
                                continue;

                        unit_add_to_cgroup_queue(m);
                }

                u = slice;
        }
}

int unit_realize_cgroup(Unit *u) {

        if (!UNIT_HAS_CGROUP_CONTEXT(u))
                return 0;

        /* So, here's the deal: when realizing the cgroups for this
         * unit, we need to first create all parents, but there's more
         * actually: for the weight-based controllers we also need to
         * make sure that all our siblings (i.e. units that are in the
         * same slice as we are) have cgroups, too. Otherwise, things
         * would become very uneven as each of their processes would
         * get as much resources as all our group together. This call
         * will synchronously create the parent cgroups, but will
         * defer work on the siblings to the next event loop
         * iteration. */

        /* Add all sibling slices to the cgroup queue. */
        unit_queue_siblings(u);

        /* And realize this one now (and apply the values) */
        return unit_realize_cgroup_now(u, manager_state(u->manager));
}

void unit_release_cgroup(Unit *u) {

        /* Forgets all cgroup details for this cgroup */

        if (u->cgroup_path) {
                (void) hashmap_remove(u->manager->cgroup_unit, u->cgroup_path);
                u->cgroup_path = mfree(u->cgroup_path);
        }

        if (u->cgroup_inotify_wd >= 0) {
                if (inotify_rm_watch(u->manager->cgroup_inotify_fd, u->cgroup_inotify_wd) < 0)
                        log_unit_debug_errno(u, errno, "Failed to remove cgroup inotify watch %i for %s, ignoring", u->cgroup_inotify_wd, u->id);

                (void) hashmap_remove(u->manager->cgroup_inotify_wd_unit, INT_TO_PTR(u->cgroup_inotify_wd));
                u->cgroup_inotify_wd = -1;
        }
}

void unit_prune_cgroup(Unit *u) {
        bool is_root_slice;
        int r;

        /* Removes the cgroup, if empty and possible, and stops watching it. */

        if (!u->cgroup_path)
                return;

        is_root_slice = unit_has_name(u, SPECIAL_ROOT_SLICE);

        r = cg_trim_everywhere(u->manager->cgroup_supported, u->cgroup_path, !is_root_slice);
        if (r < 0) {
                log_debug_errno(r, "Failed to destroy cgroup %s, ignoring: %m", u->cgroup_path);
                return;
        }

        if (is_root_slice)
                return;

        unit_release_cgroup(u);

        u->cgroup_realized = false;
        u->cgroup_realized_mask = 0;
}

int unit_search_main_pid(Unit *u, pid_t *ret) {
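        /* Guesses the main PID of this unit by enumerating the processes in
         * its cgroup: only re-parented (daemonized) processes qualify, and
         * the result is only trusted if exactly one candidate is found. */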
        _cleanup_fclose_ FILE *f = NULL;
        pid_t pid = 0, npid, mypid;
        int r;

        if (!u->cgroup_path)
                return -ENXIO;

        r = cg_enumerate_processes(SYSTEMD_CGROUP_CONTROLLER, u->cgroup_path, &f);
        if (r < 0)
                return r;

        mypid = getpid();
        while (cg_read_pid(f, &npid) > 0) {
                pid_t ppid;

                if (npid == pid)
                        continue;

                /* Ignore processes that aren't our kids */
                if (get_process_ppid(npid, &ppid) >= 0 && ppid != mypid)
                        continue;

                if (pid != 0)
                        /* Dang, there's more than one daemonized PID
                         * in this group, so we don't know what process
                         * is the main process. */
                        return -ENODATA;

                pid = npid;
        }

        *ret = pid;
        return 0;
}

static int unit_watch_pids_in_path(Unit *u, const char *path) {
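        /* Recursively adds every PID found in the given cgroup and all of its
         * subgroups to the set of PIDs this unit watches; the first error is
         * recorded, but the enumeration keeps going. */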
        _cleanup_closedir_ DIR *d = NULL;
        _cleanup_fclose_ FILE *f = NULL;
        int ret = 0, r;

        r = cg_enumerate_processes(SYSTEMD_CGROUP_CONTROLLER, path, &f);
        if (r < 0)
                ret = r;
        else {
                pid_t pid;

                while ((r = cg_read_pid(f, &pid)) > 0) {
                        r = unit_watch_pid(u, pid);
                        if (r < 0 && ret >= 0)
                                ret = r;
                }

                if (r < 0 && ret >= 0)
                        ret = r;
        }

        r = cg_enumerate_subgroups(SYSTEMD_CGROUP_CONTROLLER, path, &d);
        if (r < 0) {
                if (ret >= 0)
                        ret = r;
        } else {
                char *fn;

                while ((r = cg_read_subgroup(d, &fn)) > 0) {
                        _cleanup_free_ char *p = NULL;

                        p = strjoin(path, "/", fn, NULL);
                        free(fn);

                        if (!p)
                                return -ENOMEM;

                        r = unit_watch_pids_in_path(u, p);
                        if (r < 0 && ret >= 0)
                                ret = r;
                }

                if (r < 0 && ret >= 0)
                        ret = r;
        }

        return ret;
}

int unit_watch_all_pids(Unit *u) {

        /* Adds all PIDs from our cgroup to the set of PIDs we
         * watch. This is a fallback logic for cases where we do not
         * get reliable cgroup empty notifications: we try to use
         * SIGCHLD as replacement. */

        if (!u->cgroup_path)
                return -ENXIO;

        if (cg_unified() > 0) /* On unified we can use proper notifications */
                return 0;

        return unit_watch_pids_in_path(u, u->cgroup_path);
}

int unit_notify_cgroup_empty(Unit *u) {
        int r;

        if (!u->cgroup_path)
                return 0;

        r = cg_is_empty_recursive(SYSTEMD_CGROUP_CONTROLLER, u->cgroup_path);
        if (r <= 0)
                return r;

        unit_add_to_gc_queue(u);

        if (UNIT_VTABLE(u)->notify_cgroup_empty)
                UNIT_VTABLE(u)->notify_cgroup_empty(u);

        return 0;
}

static int on_cgroup_inotify_event(sd_event_source *s, int fd, uint32_t revents, void *userdata) {
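        /* Called whenever the cgroup inotify fd is readable: drains all
         * queued events and, for each watched cgroup.populated file that
         * changed, checks whether the owning unit's cgroup has run empty. */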
        Manager *m = userdata;

        for (;;) {
                union inotify_event_buffer buffer;
                struct inotify_event *e;
                ssize_t l;

                l = read(fd, &buffer, sizeof(buffer));
                if (l < 0) {
                        if (errno == EINTR || errno == EAGAIN)
                                return 0;

                        return log_error_errno(errno, "Failed to read control group inotify events: %m");
                }

                FOREACH_INOTIFY_EVENT(e, buffer, l) {
                        Unit *u;

                        if (e->wd < 0)
                                /* Queue overflow has no watch descriptor */
                                continue;

                        if (e->mask & IN_IGNORED)
                                /* The watch was just removed */
                                continue;

                        u = hashmap_get(m->cgroup_inotify_wd_unit, INT_TO_PTR(e->wd));
                        if (!u) /* Note that inotify might deliver
                                 * events for a watch even after it
                                 * was removed, because it was queued
                                 * before the removal. Let's ignore
                                 * this here safely. */
                                continue;

                        (void) unit_notify_cgroup_empty(u);
                }
        }
}

int manager_setup_cgroup(Manager *m) {
        _cleanup_free_ char *path = NULL;
        CGroupController c;
        int r, unified;
        char *e;

        /* 1. Determine hierarchy */
        m->cgroup_root = mfree(m->cgroup_root);
        r = cg_pid_get_path(SYSTEMD_CGROUP_CONTROLLER, 0, &m->cgroup_root);
        if (r < 0)
                return log_error_errno(r, "Cannot determine cgroup we are running in: %m");

#if 0 /// elogind does not support systemd scopes and slices
        /* Chop off the init scope, if we are already located in it */
        e = endswith(m->cgroup_root, "/" SPECIAL_INIT_SCOPE);

        /* LEGACY: Also chop off the system slice if we are in
         * it. This is to support live upgrades from older systemd
         * versions where PID 1 was moved there. Also see
         * cg_get_root_path(). */
        if (!e && m->running_as == MANAGER_SYSTEM) {
                e = endswith(m->cgroup_root, "/" SPECIAL_SYSTEM_SLICE);
                if (!e)
                        e = endswith(m->cgroup_root, "/system"); /* even more legacy */
        }

        if (e)
                *e = 0;
#endif // 0

        /* And make sure to store away the root value without trailing
         * slash, even for the root dir, so that we can easily prepend
         * it everywhere. */
        while ((e = endswith(m->cgroup_root, "/")))
                *e = 0;
        log_debug_elogind("Cgroup Controller \"%s\" -> root \"%s\"",
                          SYSTEMD_CGROUP_CONTROLLER, m->cgroup_root);

        /* 2. Show data */
        r = cg_get_path(SYSTEMD_CGROUP_CONTROLLER, m->cgroup_root, NULL, &path);
        if (r < 0)
                return log_error_errno(r, "Cannot find cgroup mount point: %m");

        unified = cg_unified();
        if (unified < 0)
                return log_error_errno(unified, "Couldn't determine if we are running in the unified hierarchy: %m");
        if (unified > 0)
                log_debug("Unified cgroup hierarchy is located at %s.", path);
        else
                log_debug("Using cgroup controller " SYSTEMD_CGROUP_CONTROLLER ". File system hierarchy is at %s.", path);

        const char *scope_path;

        /* 3. Install agent */
        if (unified) {

                /* In the unified hierarchy we can get
                 * cgroup empty notifications via inotify. */

#if 0 /// elogind does not support the unified hierarchy, yet.
                m->cgroup_inotify_event_source = sd_event_source_unref(m->cgroup_inotify_event_source);
                safe_close(m->cgroup_inotify_fd);

                m->cgroup_inotify_fd = inotify_init1(IN_NONBLOCK|IN_CLOEXEC);
                if (m->cgroup_inotify_fd < 0)
                        return log_error_errno(errno, "Failed to create control group inotify object: %m");

                r = sd_event_add_io(m->event, &m->cgroup_inotify_event_source, m->cgroup_inotify_fd, EPOLLIN, on_cgroup_inotify_event, m);
                if (r < 0)
                        return log_error_errno(r, "Failed to watch control group inotify object: %m");

                r = sd_event_source_set_priority(m->cgroup_inotify_event_source, SD_EVENT_PRIORITY_IDLE - 5);
                if (r < 0)
                        return log_error_errno(r, "Failed to set priority of inotify event source: %m");

                (void) sd_event_source_set_description(m->cgroup_inotify_event_source, "cgroup-inotify");
#else
                return log_error_errno(EOPNOTSUPP, "Unified cgroup hierarchy not supported: %m");
#endif // 0

        } else if (m->running_as == MANAGER_SYSTEM) {

                /* On the legacy hierarchy we only get
                 * notifications via cgroup agents. (Which
                 * isn't really reliable, since it does not
                 * generate events when control groups with
                 * children run empty.) */

                r = cg_install_release_agent(SYSTEMD_CGROUP_CONTROLLER, ELOGIND_CGROUP_AGENT_PATH);
                if (r < 0)
                        log_warning_errno(r, "Failed to install release agent, ignoring: %m");
                else if (r > 0)
                        log_debug("Installed release agent.");
                else
                        log_debug("Release agent already installed.");
        }

#if 0 /// elogind is not meant to run in systemd init scope
        /* 4. Make sure we are in the special "init.scope" unit in the root slice. */
        scope_path = strjoina(m->cgroup_root, "/" SPECIAL_INIT_SCOPE);
        r = cg_create_and_attach(SYSTEMD_CGROUP_CONTROLLER, scope_path, 0);
#else
        if (streq(SYSTEMD_CGROUP_CONTROLLER, "name=elogind"))
                // we are our own cgroup controller
                scope_path = strjoina("");
        else if (streq(m->cgroup_root, "/elogind"))
                // root already is our cgroup
                scope_path = strjoina(m->cgroup_root);
        else
                // we have to create our own group
                scope_path = strjoina(m->cgroup_root, "/elogind");
        r = cg_create_and_attach(SYSTEMD_CGROUP_CONTROLLER, scope_path, 0);
#endif // 0
        if (r < 0)
                return log_error_errno(r, "Failed to create %s control group: %m", scope_path);
        log_debug_elogind("Created control group \"%s\"", scope_path);

        /* Also, move all other userspace processes remaining
         * in the root cgroup into that scope. */
        if (!streq(m->cgroup_root, scope_path)) {
                r = cg_migrate(SYSTEMD_CGROUP_CONTROLLER, m->cgroup_root, SYSTEMD_CGROUP_CONTROLLER, scope_path, false);
                if (r < 0)
                        log_warning_errno(r, "Couldn't move remaining userspace processes, ignoring: %m");
        }

        /* 5. And pin it, so that it cannot be unmounted */
        safe_close(m->pin_cgroupfs_fd);
        m->pin_cgroupfs_fd = open(path, O_RDONLY|O_CLOEXEC|O_DIRECTORY|O_NOCTTY|O_NONBLOCK);
        if (m->pin_cgroupfs_fd < 0)
                return log_error_errno(errno, "Failed to open pin file: %m");

        /* 6. Always enable hierarchical support if it exists... */
        if (!unified)
                (void) cg_set_attribute("memory", "/", "memory.use_hierarchy", "1");

        /* 7. Figure out which controllers are supported */
        r = cg_mask_supported(&m->cgroup_supported);
        if (r < 0)
                return log_error_errno(r, "Failed to determine supported controllers: %m");

        for (c = 0; c < _CGROUP_CONTROLLER_MAX; c++)
                log_debug("Controller '%s' supported: %s", cgroup_controller_to_string(c), yes_no(m->cgroup_supported & CGROUP_CONTROLLER_TO_MASK(c)));

        return 0;
}

void manager_shutdown_cgroup(Manager *m, bool delete) {

        /* We can't really delete the group, since we are in it. But
         * let's trim it. */
        if (delete && m->cgroup_root)
                (void) cg_trim(SYSTEMD_CGROUP_CONTROLLER, m->cgroup_root, false);

#if 0 /// elogind does not support the unified hierarchy, yet.
        m->cgroup_inotify_wd_unit = hashmap_free(m->cgroup_inotify_wd_unit);

        m->cgroup_inotify_event_source = sd_event_source_unref(m->cgroup_inotify_event_source);
        m->cgroup_inotify_fd = safe_close(m->cgroup_inotify_fd);
#endif // 0

        m->pin_cgroupfs_fd = safe_close(m->pin_cgroupfs_fd);

        m->cgroup_root = mfree(m->cgroup_root);
}

#if 0 /// UNNEEDED by elogind
Unit* manager_get_unit_by_cgroup(Manager *m, const char *cgroup) {
        char *p;
        Unit *u;

        u = hashmap_get(m->cgroup_unit, cgroup);
        if (u)
                return u;

        p = strdupa(cgroup);
        for (;;) {
                char *e;

                e = strrchr(p, '/');
                if (!e || e == p)
                        return hashmap_get(m->cgroup_unit, SPECIAL_ROOT_SLICE);

                *e = 0;

                u = hashmap_get(m->cgroup_unit, p);
                if (u)
                        return u;
        }
}

Unit *manager_get_unit_by_pid_cgroup(Manager *m, pid_t pid) {
        _cleanup_free_ char *cgroup = NULL;
        int r;

        if (pid <= 0)
                return NULL;

        r = cg_pid_get_path(SYSTEMD_CGROUP_CONTROLLER, pid, &cgroup);
        if (r < 0)
                return NULL;

        return manager_get_unit_by_cgroup(m, cgroup);
}

Unit *manager_get_unit_by_pid(Manager *m, pid_t pid) {
        Unit *u;

        if (pid <= 0)
                return NULL;

        if (pid == 1)
                return hashmap_get(m->units, SPECIAL_INIT_SCOPE);

        u = hashmap_get(m->watch_pids1, PID_TO_PTR(pid));
        if (u)
                return u;

        u = hashmap_get(m->watch_pids2, PID_TO_PTR(pid));
        if (u)
                return u;

        return manager_get_unit_by_pid_cgroup(m, pid);
}

int manager_notify_cgroup_empty(Manager *m, const char *cgroup) {
        Unit *u;

        u = manager_get_unit_by_cgroup(m, cgroup);
        if (!u)
                return 0;

        return unit_notify_cgroup_empty(u);
}
#endif // 0

int unit_get_memory_current(Unit *u, uint64_t *ret) {
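        /* Reads the unit's current memory usage, taking into account that the
         * attribute is called "memory.usage_in_bytes" on the legacy hierarchy
         * but "memory.current" on the unified one. */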
        _cleanup_free_ char *v = NULL;
        int r;

        if (!u->cgroup_path)
                return -ENODATA;

        if ((u->cgroup_realized_mask & CGROUP_MASK_MEMORY) == 0)
                return -ENODATA;

        if (cg_unified() <= 0)
                r = cg_get_attribute("memory", u->cgroup_path, "memory.usage_in_bytes", &v);
        else
                r = cg_get_attribute("memory", u->cgroup_path, "memory.current", &v);
        if (r == -ENOENT)
                return -ENODATA;
        if (r < 0)
                return r;

        return safe_atou64(v, ret);
}

int unit_get_tasks_current(Unit *u, uint64_t *ret) {
        _cleanup_free_ char *v = NULL;
        int r;

        if (!u->cgroup_path)
                return -ENODATA;

        if ((u->cgroup_realized_mask & CGROUP_MASK_PIDS) == 0)
                return -ENODATA;

        r = cg_get_attribute("pids", u->cgroup_path, "pids.current", &v);
        if (r == -ENOENT)
                return -ENODATA;
        if (r < 0)
                return r;

        return safe_atou64(v, ret);
}

static int unit_get_cpu_usage_raw(Unit *u, nsec_t *ret) {
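        /* Reads the raw, monotonically increasing CPU time of the cgroup in
         * nanoseconds from cpuacct.usage; unit_get_cpu_usage() subtracts
         * cpuacct_usage_base from it, so that usage can be "reset" without
         * any help from the kernel. */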
        _cleanup_free_ char *v = NULL;
        uint64_t ns;
        int r;

        if (!u->cgroup_path)
                return -ENODATA;

        if ((u->cgroup_realized_mask & CGROUP_MASK_CPUACCT) == 0)
                return -ENODATA;

        r = cg_get_attribute("cpuacct", u->cgroup_path, "cpuacct.usage", &v);
        if (r == -ENOENT)
                return -ENODATA;
        if (r < 0)
                return r;

        r = safe_atou64(v, &ns);
        if (r < 0)
                return r;

        *ret = ns;
        return 0;
}

int unit_get_cpu_usage(Unit *u, nsec_t *ret) {
        nsec_t ns;
        int r;

        r = unit_get_cpu_usage_raw(u, &ns);
        if (r < 0)
                return r;

        if (ns > u->cpuacct_usage_base)
                ns -= u->cpuacct_usage_base;
        else
                ns = 0;

        *ret = ns;
        return 0;
}

int unit_reset_cpu_usage(Unit *u) {
        nsec_t ns;
        int r;

        r = unit_get_cpu_usage_raw(u, &ns);
        if (r < 0) {
                u->cpuacct_usage_base = 0;
                return r;
        }

        u->cpuacct_usage_base = ns;
        return 0;
}

bool unit_cgroup_delegate(Unit *u) {
        CGroupContext *c;

        c = unit_get_cgroup_context(u);
        if (!c)
                return false;

        return c->delegate;
}

void unit_invalidate_cgroup(Unit *u, CGroupMask m) {
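        /* Marks the given controllers as no longer realized for this unit, so
         * that the next run of the cgroup queue reapplies their attributes. */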
        if (!UNIT_HAS_CGROUP_CONTEXT(u))
                return;

        if (m == 0)
                return;

        if ((u->cgroup_realized_mask & m) == 0)
                return;

        u->cgroup_realized_mask &= ~m;
        unit_add_to_cgroup_queue(u);
}

void manager_invalidate_startup_units(Manager *m) {
        Iterator i;
        Unit *u;

        SET_FOREACH(u, m->startup_units, i)
                unit_invalidate_cgroup(u, CGROUP_MASK_CPU|CGROUP_MASK_BLKIO);
}

static const char* const cgroup_device_policy_table[_CGROUP_DEVICE_POLICY_MAX] = {
        [CGROUP_AUTO] = "auto",
        [CGROUP_CLOSED] = "closed",
        [CGROUP_STRICT] = "strict",
};

DEFINE_STRING_TABLE_LOOKUP(cgroup_device_policy, CGroupDevicePolicy);