/*-*- Mode: C; c-basic-offset: 8; indent-tabs-mode: nil -*-*/

/***
  This file is part of systemd.

  Copyright 2013 Lennart Poettering

  systemd is free software; you can redistribute it and/or modify it
  under the terms of the GNU Lesser General Public License as published by
  the Free Software Foundation; either version 2.1 of the License, or
  (at your option) any later version.

  systemd is distributed in the hope that it will be useful, but
  WITHOUT ANY WARRANTY; without even the implied warranty of
  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
  Lesser General Public License for more details.

  You should have received a copy of the GNU Lesser General Public License
  along with systemd; If not, see <http://www.gnu.org/licenses/>.
***/
#include "process-util.h"
#include "path-util.h"
// #include "special.h"
#include "cgroup-util.h"

#define CGROUP_CPU_QUOTA_PERIOD_USEC ((usec_t) 100 * USEC_PER_MSEC)
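/* The CFS period is fixed at 100ms here, matching the kernel's default
 * cpu.cfs_period_us. Quotas configured per second of wall-clock time are
 * scaled down to this period before being written out, see
 * cgroup_context_apply() below. */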
void cgroup_context_init(CGroupContext *c) {
        assert(c);

        /* Initialize everything to the kernel defaults, assuming the
         * structure is preinitialized to 0 */

        c->cpu_shares = (unsigned long) -1;
        c->startup_cpu_shares = (unsigned long) -1;
        c->memory_limit = (uint64_t) -1;
        c->blockio_weight = (unsigned long) -1;
        c->startup_blockio_weight = (unsigned long) -1;

        c->cpu_quota_per_sec_usec = USEC_INFINITY;
}
void cgroup_context_free_device_allow(CGroupContext *c, CGroupDeviceAllow *a) {
        assert(c);
        assert(a);

        LIST_REMOVE(device_allow, c->device_allow, a);
        free(a->path);
        free(a);
}

void cgroup_context_free_blockio_device_weight(CGroupContext *c, CGroupBlockIODeviceWeight *w) {
        assert(c);
        assert(w);

        LIST_REMOVE(device_weights, c->blockio_device_weights, w);
        free(w->path);
        free(w);
}

void cgroup_context_free_blockio_device_bandwidth(CGroupContext *c, CGroupBlockIODeviceBandwidth *b) {
        assert(c);
        assert(b);

        LIST_REMOVE(device_bandwidths, c->blockio_device_bandwidths, b);
        free(b->path);
        free(b);
}
void cgroup_context_done(CGroupContext *c) {
        assert(c);

        while (c->blockio_device_weights)
                cgroup_context_free_blockio_device_weight(c, c->blockio_device_weights);

        while (c->blockio_device_bandwidths)
                cgroup_context_free_blockio_device_bandwidth(c, c->blockio_device_bandwidths);

        while (c->device_allow)
                cgroup_context_free_device_allow(c, c->device_allow);
}
/// UNNEEDED by elogind
void cgroup_context_dump(CGroupContext *c, FILE* f, const char *prefix) {
        CGroupBlockIODeviceBandwidth *b;
        CGroupBlockIODeviceWeight *w;
        CGroupDeviceAllow *a;
        char u[FORMAT_TIMESPAN_MAX];

        assert(c);
        assert(f);

        prefix = strempty(prefix);
102 "%sCPUAccounting=%s\n"
103 "%sBlockIOAccounting=%s\n"
104 "%sMemoryAccounting=%s\n"
106 "%sStartupCPUShares=%lu\n"
107 "%sCPUQuotaPerSecSec=%s\n"
108 "%sBlockIOWeight=%lu\n"
109 "%sStartupBlockIOWeight=%lu\n"
110 "%sMemoryLimit=%" PRIu64 "\n"
111 "%sDevicePolicy=%s\n"
113 prefix, yes_no(c->cpu_accounting),
114 prefix, yes_no(c->blockio_accounting),
115 prefix, yes_no(c->memory_accounting),
116 prefix, c->cpu_shares,
117 prefix, c->startup_cpu_shares,
118 prefix, format_timespan(u, sizeof(u), c->cpu_quota_per_sec_usec, 1),
119 prefix, c->blockio_weight,
120 prefix, c->startup_blockio_weight,
121 prefix, c->memory_limit,
122 prefix, cgroup_device_policy_to_string(c->device_policy),
123 prefix, yes_no(c->delegate));
        LIST_FOREACH(device_allow, a, c->device_allow)
                fprintf(f,
                        "%sDeviceAllow=%s %s%s%s\n",
                        prefix,
                        a->path,
                        a->r ? "r" : "", a->w ? "w" : "", a->m ? "m" : "");

        LIST_FOREACH(device_weights, w, c->blockio_device_weights)
                fprintf(f,
                        "%sBlockIODeviceWeight=%s %lu\n",
                        prefix,
                        w->path,
                        w->weight);

        LIST_FOREACH(device_bandwidths, b, c->blockio_device_bandwidths) {
                char buf[FORMAT_BYTES_MAX];

                fprintf(f,
                        "%s%s=%s %s\n",
                        prefix,
                        b->read ? "BlockIOReadBandwidth" : "BlockIOWriteBandwidth",
                        b->path,
                        format_bytes(buf, sizeof(buf), b->bandwidth));
        }
}
static int lookup_blkio_device(const char *p, dev_t *dev) {
        struct stat st;

        assert(p);
        assert(dev);

        if (stat(p, &st) < 0)
                return log_warning_errno(errno, "Couldn't stat device %s: %m", p);

        if (S_ISBLK(st.st_mode))
                *dev = st.st_rdev;
        else if (major(st.st_dev) != 0) {
                /* If this is not a device node then find the block
                 * device this file is stored on */
                *dev = st.st_dev;

                /* If this is a partition, try to get the originating
                 * block device */
                block_get_whole_disk(*dev, dev);
        } else {
                log_warning("%s is not a block device and file system block device cannot be determined or is not local.", p);
                return -ENODEV;
        }

        return 0;
}
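/* Usage sketch (path hypothetical): resolve the disk backing a file so its
 * major:minor pair can be written into blkio attributes:
 *
 *     dev_t dev;
 *     if (lookup_blkio_device("/var/log/journal", &dev) >= 0)
 *             log_debug("backing device is %u:%u", major(dev), minor(dev));
 */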
static int whitelist_device(const char *path, const char *node, const char *acc) {
        /* Sized for "%c %u:%u %s": type char plus space, major:minor,
         * another space, up to "rwm", and the trailing NUL */
        char buf[2+DECIMAL_STR_MAX(dev_t)*2+2+4];
        struct stat st;
        int r;

        assert(path);
        assert(acc);

        if (stat(node, &st) < 0) {
                log_warning("Couldn't stat device %s", node);
                return -errno;
        }

        if (!S_ISCHR(st.st_mode) && !S_ISBLK(st.st_mode)) {
                log_warning("%s is not a device.", node);
                return -ENODEV;
        }

        sprintf(buf,
                "%c %u:%u %s",
                S_ISCHR(st.st_mode) ? 'c' : 'b',
                major(st.st_rdev), minor(st.st_rdev),
                acc);

        r = cg_set_attribute("devices", path, "devices.allow", buf);
        if (r < 0)
                log_full_errno(IN_SET(r, -ENOENT, -EROFS, -EINVAL) ? LOG_DEBUG : LOG_WARNING, r,
                               "Failed to set devices.allow on %s: %m", path);

        return r;
}
static int whitelist_major(const char *path, const char *name, char type, const char *acc) {
        _cleanup_fclose_ FILE *f = NULL;
        char line[LINE_MAX];
        bool good = false;
        int r;

        assert(path);
        assert(acc);
        assert(type == 'b' || type == 'c');

        f = fopen("/proc/devices", "re");
        if (!f)
                return log_warning_errno(errno, "Cannot open /proc/devices to resolve %s (%c): %m", name, type);

        FOREACH_LINE(line, f, goto fail) {
                char buf[2+DECIMAL_STR_MAX(unsigned)+3+4], *p, *w;
                unsigned maj;

                truncate_nl(line);

                if (type == 'c' && streq(line, "Character devices:")) {
                        good = true;
                        continue;
                }

                if (type == 'b' && streq(line, "Block devices:")) {
                        good = true;
                        continue;
                }

                if (isempty(line)) {
                        good = false;
                        continue;
                }

                if (!good)
                        continue;

                p = strstrip(line);

                w = strpbrk(p, WHITESPACE);
                if (!w)
                        continue;
                *w = 0;

                r = safe_atou(p, &maj);
                if (r < 0)
                        continue;
                if (maj <= 0)
                        continue;

                w++;
                w += strspn(w, WHITESPACE);

                if (fnmatch(name, w, 0) != 0)
                        continue;

                sprintf(buf,
                        "%c %u:* %s",
                        type,
                        maj,
                        acc);

                r = cg_set_attribute("devices", path, "devices.allow", buf);
                if (r < 0)
                        log_full_errno(IN_SET(r, -ENOENT, -EROFS, -EINVAL) ? LOG_DEBUG : LOG_WARNING, r,
                                       "Failed to set devices.allow on %s: %m", path);
        }

        return 0;

fail:
        log_warning_errno(errno, "Failed to read /proc/devices: %m");
        return -errno;
}
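/* For reference, /proc/devices lists majors by name, roughly like this
 * (entries vary by kernel and loaded drivers):
 *
 *     Character devices:
 *       1 mem
 *     136 pts
 *     Block devices:
 *       8 sd
 *
 * so whitelist_major(path, "pts", 'c', "rw") above would emit
 * "c 136:* rw" into devices.allow. */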
void cgroup_context_apply(CGroupContext *c, CGroupMask mask, const char *path, ManagerState state) {
        bool is_root;
        int r;

        assert(c);
        assert(path);

        if (mask == 0)
                return;

        /* Some cgroup attributes are not supported on the root cgroup,
         * hence silently ignore */
        is_root = isempty(path) || path_equal(path, "/");
        if (is_root)
                /* Make sure we don't try to display messages with an empty path. */
                path = "/";

        /* We generally ignore errors caused by read-only mounted
         * cgroup trees (assuming we are running in a container then),
         * and missing cgroups, i.e. EROFS and ENOENT. */
        if ((mask & CGROUP_MASK_CPU) && !is_root) {
                char buf[MAX(DECIMAL_STR_MAX(unsigned long), DECIMAL_STR_MAX(usec_t)) + 1];

                sprintf(buf, "%lu\n",
                        IN_SET(state, MANAGER_STARTING, MANAGER_INITIALIZING) && c->startup_cpu_shares != (unsigned long) -1 ? c->startup_cpu_shares :
                        c->cpu_shares != (unsigned long) -1 ? c->cpu_shares : 1024);
                r = cg_set_attribute("cpu", path, "cpu.shares", buf);
                if (r < 0)
                        log_full_errno(IN_SET(r, -ENOENT, -EROFS) ? LOG_DEBUG : LOG_WARNING, r,
                                       "Failed to set cpu.shares on %s: %m", path);

                sprintf(buf, USEC_FMT "\n", CGROUP_CPU_QUOTA_PERIOD_USEC);
                r = cg_set_attribute("cpu", path, "cpu.cfs_period_us", buf);
                if (r < 0)
                        log_full_errno(IN_SET(r, -ENOENT, -EROFS) ? LOG_DEBUG : LOG_WARNING, r,
                                       "Failed to set cpu.cfs_period_us on %s: %m", path);

                if (c->cpu_quota_per_sec_usec != USEC_INFINITY) {
                        sprintf(buf, USEC_FMT "\n", c->cpu_quota_per_sec_usec * CGROUP_CPU_QUOTA_PERIOD_USEC / USEC_PER_SEC);
                        r = cg_set_attribute("cpu", path, "cpu.cfs_quota_us", buf);
                } else
                        r = cg_set_attribute("cpu", path, "cpu.cfs_quota_us", "-1");
                if (r < 0)
                        log_full_errno(IN_SET(r, -ENOENT, -EROFS) ? LOG_DEBUG : LOG_WARNING, r,
                                       "Failed to set cpu.cfs_quota_us on %s: %m", path);
        }
        if (mask & CGROUP_MASK_BLKIO) {
                char buf[MAX3(DECIMAL_STR_MAX(unsigned long)+1,
                              DECIMAL_STR_MAX(dev_t)*2+2+DECIMAL_STR_MAX(unsigned long)*1,
                              DECIMAL_STR_MAX(dev_t)*2+2+DECIMAL_STR_MAX(uint64_t)+1)];
                CGroupBlockIODeviceWeight *w;
                CGroupBlockIODeviceBandwidth *b;

                sprintf(buf, "%lu\n", IN_SET(state, MANAGER_STARTING, MANAGER_INITIALIZING) && c->startup_blockio_weight != (unsigned long) -1 ? c->startup_blockio_weight :
                        c->blockio_weight != (unsigned long) -1 ? c->blockio_weight : 1000);
                r = cg_set_attribute("blkio", path, "blkio.weight", buf);
                if (r < 0)
                        log_full_errno(IN_SET(r, -ENOENT, -EROFS) ? LOG_DEBUG : LOG_WARNING, r,
                                       "Failed to set blkio.weight on %s: %m", path);

                /* FIXME: no way to reset this list */
                LIST_FOREACH(device_weights, w, c->blockio_device_weights) {
                        dev_t dev;

                        r = lookup_blkio_device(w->path, &dev);
                        if (r < 0)
                                continue;

                        sprintf(buf, "%u:%u %lu", major(dev), minor(dev), w->weight);
                        r = cg_set_attribute("blkio", path, "blkio.weight_device", buf);
                        if (r < 0)
                                log_full_errno(IN_SET(r, -ENOENT, -EROFS) ? LOG_DEBUG : LOG_WARNING, r,
                                               "Failed to set blkio.weight_device on %s: %m", path);
                }

                /* FIXME: no way to reset this list */
                LIST_FOREACH(device_bandwidths, b, c->blockio_device_bandwidths) {
                        const char *a;
                        dev_t dev;

                        r = lookup_blkio_device(b->path, &dev);
                        if (r < 0)
                                continue;

                        a = b->read ? "blkio.throttle.read_bps_device" : "blkio.throttle.write_bps_device";

                        sprintf(buf, "%u:%u %" PRIu64 "\n", major(dev), minor(dev), b->bandwidth);
                        r = cg_set_attribute("blkio", path, a, buf);
                        if (r < 0)
                                log_full_errno(IN_SET(r, -ENOENT, -EROFS) ? LOG_DEBUG : LOG_WARNING, r,
                                               "Failed to set %s on %s: %m", a, path);
                }
        }
        if ((mask & CGROUP_MASK_MEMORY) && !is_root) {
                if (c->memory_limit != (uint64_t) -1) {
                        char buf[DECIMAL_STR_MAX(uint64_t) + 1];

                        sprintf(buf, "%" PRIu64 "\n", c->memory_limit);

                        if (cg_unified() <= 0)
                                r = cg_set_attribute("memory", path, "memory.limit_in_bytes", buf);
                        else
                                r = cg_set_attribute("memory", path, "memory.max", buf);
                } else {
                        if (cg_unified() <= 0)
                                r = cg_set_attribute("memory", path, "memory.limit_in_bytes", "-1");
                        else
                                r = cg_set_attribute("memory", path, "memory.max", "max");
                }

                if (r < 0)
                        log_full_errno(IN_SET(r, -ENOENT, -EROFS) ? LOG_DEBUG : LOG_WARNING, r,
                                       "Failed to set memory.limit_in_bytes/memory.max on %s: %m", path);
        }
        if ((mask & CGROUP_MASK_DEVICE) && !is_root) {
                CGroupDeviceAllow *a;

                /* Changing the devices list of a populated cgroup
                 * might result in EINVAL, hence ignore EINVAL
                 * here. */

                if (c->device_allow || c->device_policy != CGROUP_AUTO)
                        r = cg_set_attribute("devices", path, "devices.deny", "a");
                else
                        r = cg_set_attribute("devices", path, "devices.allow", "a");
                if (r < 0)
                        log_full_errno(IN_SET(r, -ENOENT, -EROFS, -EINVAL) ? LOG_DEBUG : LOG_WARNING, r,
                                       "Failed to reset devices.list on %s: %m", path);

                if (c->device_policy == CGROUP_CLOSED ||
                    (c->device_policy == CGROUP_AUTO && c->device_allow)) {
                        static const char auto_devices[] =
                                "/dev/null\0" "rwm\0"
                                "/dev/zero\0" "rwm\0"
                                "/dev/full\0" "rwm\0"
                                "/dev/random\0" "rwm\0"
                                "/dev/urandom\0" "rwm\0"
                                "/dev/tty\0" "rwm\0"
                                "/dev/pts/ptmx\0" "rw\0"; /* /dev/pts/ptmx may not be duplicated, but accessed */

                        const char *x, *y;

                        NULSTR_FOREACH_PAIR(x, y, auto_devices)
                                whitelist_device(path, x, y);

                        whitelist_major(path, "pts", 'c', "rw");
                        whitelist_major(path, "kdbus", 'c', "rw");
                        whitelist_major(path, "kdbus/*", 'c', "rw");
                }

                LIST_FOREACH(device_allow, a, c->device_allow) {
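                        /* NB: the construction of "acc" was elided in this
                         * copy; it presumably collects the requested access
                         * modes into a short string, roughly like this: */
                        char acc[4];
                        unsigned k = 0;

                        if (a->r)
                                acc[k++] = 'r';
                        if (a->w)
                                acc[k++] = 'w';
                        if (a->m)
                                acc[k++] = 'm';

                        if (k == 0)
                                continue;

                        acc[k++] = 0;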
                        if (startswith(a->path, "/dev/"))
                                whitelist_device(path, a->path, acc);
                        else if (startswith(a->path, "block-"))
                                whitelist_major(path, a->path + 6, 'b', acc);
                        else if (startswith(a->path, "char-"))
                                whitelist_major(path, a->path + 5, 'c', acc);
                        else
                                log_debug("Ignoring device %s while writing cgroup attribute.", a->path);
                }
        }
}
CGroupMask cgroup_context_get_mask(CGroupContext *c) {
        CGroupMask mask = 0;

        /* Figure out which controllers we need */

        if (c->cpu_accounting ||
            c->cpu_shares != (unsigned long) -1 ||
            c->startup_cpu_shares != (unsigned long) -1 ||
            c->cpu_quota_per_sec_usec != USEC_INFINITY)
                mask |= CGROUP_MASK_CPUACCT | CGROUP_MASK_CPU;

        if (c->blockio_accounting ||
            c->blockio_weight != (unsigned long) -1 ||
            c->startup_blockio_weight != (unsigned long) -1 ||
            c->blockio_device_weights ||
            c->blockio_device_bandwidths)
                mask |= CGROUP_MASK_BLKIO;

        if (c->memory_accounting ||
            c->memory_limit != (uint64_t) -1)
                mask |= CGROUP_MASK_MEMORY;

        if (c->device_allow ||
            c->device_policy != CGROUP_AUTO)
                mask |= CGROUP_MASK_DEVICE;

        return mask;
}
/// UNNEEDED by elogind
CGroupMask unit_get_own_mask(Unit *u) {
        CGroupContext *c;

        /* Returns the mask of controllers the unit needs for itself */

        c = unit_get_cgroup_context(u);
        if (!c)
                return 0;

        /* If delegation is turned on, then turn on all cgroups,
         * unless we are on the legacy hierarchy and the process we
         * fork into it is known to drop privileges, and hence
         * shouldn't get access to the controllers.
         *
         * Note that on the unified hierarchy it is safe to delegate
         * controllers to unprivileged services. */

        if (c->delegate) {
                ExecContext *e;

                e = unit_get_exec_context(u);
                if (!e ||
                    exec_context_maintains_privileges(e) ||
                    cg_unified() > 0)
                        return _CGROUP_MASK_ALL;
        }

        return cgroup_context_get_mask(c);
}
CGroupMask unit_get_members_mask(Unit *u) {
        assert(u);

        /* Returns the mask of controllers all of the unit's children
         * require. */

        if (u->cgroup_members_mask_valid)
                return u->cgroup_members_mask;

        u->cgroup_members_mask = 0;

        if (u->type == UNIT_SLICE) {
                Unit *member;
                Iterator i;

                SET_FOREACH(member, u->dependencies[UNIT_BEFORE], i) {

                        if (member == u)
                                continue;

                        if (UNIT_DEREF(member->slice) != u)
                                continue;

                        u->cgroup_members_mask |=
                                unit_get_own_mask(member) |
                                unit_get_members_mask(member);
                }
        }

        u->cgroup_members_mask_valid = true;
        return u->cgroup_members_mask;
}
CGroupMask unit_get_siblings_mask(Unit *u) {
        assert(u);

        /* Returns the mask of controllers all of the unit's siblings
         * require, i.e. the members mask of the unit's parent slice
         * if there is one. */

        if (UNIT_ISSET(u->slice))
                return unit_get_members_mask(UNIT_DEREF(u->slice));

        return unit_get_own_mask(u) | unit_get_members_mask(u);
}

CGroupMask unit_get_subtree_mask(Unit *u) {

        /* Returns the mask of this subtree, meaning of the group
         * itself and its children. */

        return unit_get_own_mask(u) | unit_get_members_mask(u);
}
CGroupMask unit_get_target_mask(Unit *u) {
        CGroupMask mask;

        /* This returns the cgroup mask of all controllers to enable
         * for a specific cgroup, i.e. everything it needs itself,
         * plus all that its children need, plus all that its siblings
         * need. This is primarily useful on the legacy cgroup
         * hierarchy, where we need to duplicate each cgroup in each
         * hierarchy that shall be enabled for it. */

        mask = unit_get_own_mask(u) | unit_get_members_mask(u) | unit_get_siblings_mask(u);
        mask &= u->manager->cgroup_supported;

        return mask;
}

CGroupMask unit_get_enable_mask(Unit *u) {
        CGroupMask mask;

        /* This returns the cgroup mask of all controllers to enable
         * for the children of a specific cgroup. This is primarily
         * useful for the unified cgroup hierarchy, where each cgroup
         * controls which controllers are enabled for its children. */

        mask = unit_get_members_mask(u);
        mask &= u->manager->cgroup_supported;

        return mask;
}
/* Recurse from a unit up through its containing slices, propagating
 * mask bits upward. A unit is also member of itself. */
void unit_update_cgroup_members_masks(Unit *u) {
        CGroupMask m;
        bool more;

        assert(u);

        /* Calculate subtree mask */
        m = unit_get_subtree_mask(u);

        /* See if anything changed from the previous invocation. If
         * not, we're done. */
        if (u->cgroup_subtree_mask_valid && m == u->cgroup_subtree_mask)
                return;

        more =
                u->cgroup_subtree_mask_valid &&
                ((m & ~u->cgroup_subtree_mask) != 0) &&
                ((~m & u->cgroup_subtree_mask) == 0);

        u->cgroup_subtree_mask = m;
        u->cgroup_subtree_mask_valid = true;

        if (UNIT_ISSET(u->slice)) {
                Unit *s = UNIT_DEREF(u->slice);

                if (more)
                        /* There's more set now than before. We
                         * propagate the new mask to the parent's mask
                         * (not caring if it actually was valid or
                         * not). */
                        s->cgroup_members_mask |= m;
                else
                        /* There's less set now than before (or we
                         * don't know), we need to recalculate
                         * everything, so let's invalidate the
                         * parent's members mask */
                        s->cgroup_members_mask_valid = false;

                /* And now make sure that this change also hits our
                 * grandparents */
                unit_update_cgroup_members_masks(s);
        }
}
static const char *migrate_callback(CGroupMask mask, void *userdata) {
        Unit *u = userdata;

        assert(mask != 0);
        assert(u);

        while (u) {
                if (u->cgroup_path &&
                    u->cgroup_realized &&
                    (u->cgroup_realized_mask & mask) == mask)
                        return u->cgroup_path;

                u = UNIT_DEREF(u->slice);
        }

        return NULL;
}
char *unit_default_cgroup_path(Unit *u) {
        _cleanup_free_ char *escaped = NULL, *slice = NULL;
        int r;

        assert(u);

        if (unit_has_name(u, SPECIAL_ROOT_SLICE))
                return strdup(u->manager->cgroup_root);

        if (UNIT_ISSET(u->slice) && !unit_has_name(UNIT_DEREF(u->slice), SPECIAL_ROOT_SLICE)) {
                r = cg_slice_to_path(UNIT_DEREF(u->slice)->id, &slice);
                if (r < 0)
                        return NULL;
        }

        escaped = cg_escape(u->id);
        if (!escaped)
                return NULL;

        if (slice)
                return strjoin(u->manager->cgroup_root, "/", slice, "/", escaped, NULL);
        else
                return strjoin(u->manager->cgroup_root, "/", escaped, NULL);
}
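/* Illustrative result (unit names hypothetical): for "foo.service" placed in
 * "system-getty.slice" this returns something like
 * "<cgroup_root>/system.slice/system-getty.slice/foo.service", since
 * cg_slice_to_path() expands nested slice names into nested directories and
 * cg_escape() shields names that would clash with kernel-reserved files. */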
int unit_set_cgroup_path(Unit *u, const char *path) {
        _cleanup_free_ char *p = NULL;
        int r;

        assert(u);

        if (path) {
                p = strdup(path);
                if (!p)
                        return -ENOMEM;
        }

        if (streq_ptr(u->cgroup_path, p))
                return 0;

        if (p) {
                r = hashmap_put(u->manager->cgroup_unit, p, u);
                if (r < 0)
                        return r;
        }

        unit_release_cgroup(u);

        u->cgroup_path = p;
        p = NULL;

        return 0;
}
int unit_watch_cgroup(Unit *u) {
        _cleanup_free_ char *populated = NULL;
        int r;

        assert(u);

        if (u->cgroup_inotify_wd >= 0)
                return 0;

        /* Only applies to the unified hierarchy */
        r = cg_unified();
        if (r < 0)
                return log_unit_error_errno(u, r, "Failed to detect whether the unified hierarchy is used: %m");
        if (r == 0)
                return 0;

        /* Don't watch the root slice, it's pointless. */
        if (unit_has_name(u, SPECIAL_ROOT_SLICE))
                return 0;

        r = hashmap_ensure_allocated(&u->manager->cgroup_inotify_wd_unit, &trivial_hash_ops);
        if (r < 0)
                return r;

        r = cg_get_path(ELOGIND_CGROUP_CONTROLLER, u->cgroup_path, "cgroup.populated", &populated);
        if (r < 0)
                return r;

        u->cgroup_inotify_wd = inotify_add_watch(u->manager->cgroup_inotify_fd, populated, IN_MODIFY);
        if (u->cgroup_inotify_wd < 0) {

                if (errno == ENOENT) /* If the directory is already
                                      * gone we don't need to track
                                      * it, so this is not an error */
                        return 0;

                return log_unit_error_errno(u, errno, "Failed to add inotify watch descriptor for control group %s: %m", u->cgroup_path);
        }

        r = hashmap_put(u->manager->cgroup_inotify_wd_unit, INT_TO_PTR(u->cgroup_inotify_wd), u);
        if (r < 0)
                return log_unit_error_errno(u, r, "Failed to add inotify watch descriptor to hash map: %m");

        return 0;
}
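/* Note: this relies on the per-cgroup "cgroup.populated" attribute exposed
 * by early unified-hierarchy kernels; later kernels folded it into the
 * "populated" field of "cgroup.events", so this watch is tied to the older
 * interface. */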
static int unit_create_cgroup(
                Unit *u,
                CGroupMask target_mask,
                CGroupMask enable_mask) {

        CGroupContext *c;
        int r;

        assert(u);

        c = unit_get_cgroup_context(u);
        if (!c)
                return 0;

        if (!u->cgroup_path) {
                _cleanup_free_ char *path = NULL;

                path = unit_default_cgroup_path(u);
                if (!path)
                        return log_oom();

                r = unit_set_cgroup_path(u, path);
                if (r == -EEXIST)
                        return log_unit_error_errno(u, r, "Control group %s exists already.", path);
                if (r < 0)
                        return log_unit_error_errno(u, r, "Failed to set unit's control group path to %s: %m", path);
        }

        /* First, create our own group */
        r = cg_create_everywhere(u->manager->cgroup_supported, target_mask, u->cgroup_path);
        if (r < 0)
                return log_unit_error_errno(u, r, "Failed to create cgroup %s: %m", u->cgroup_path);

        /* Start watching it */
        (void) unit_watch_cgroup(u);

        /* Enable all controllers we need */
        r = cg_enable_everywhere(u->manager->cgroup_supported, enable_mask, u->cgroup_path);
        if (r < 0)
                log_unit_warning_errno(u, r, "Failed to enable controllers on cgroup %s, ignoring: %m", u->cgroup_path);

        /* Keep track that this is now realized */
        u->cgroup_realized = true;
        u->cgroup_realized_mask = target_mask;

        if (u->type != UNIT_SLICE && !c->delegate) {

                /* Then, possibly move things over, but not if
                 * subgroups may contain processes, which is the case
                 * for slice and delegation units. */
                r = cg_migrate_everywhere(u->manager->cgroup_supported, u->cgroup_path, u->cgroup_path, migrate_callback, u);
                if (r < 0)
                        log_unit_warning_errno(u, r, "Failed to migrate cgroup to %s, ignoring: %m", u->cgroup_path);
        }

        return 0;
}
int unit_attach_pids_to_cgroup(Unit *u) {
        int r;

        assert(u);

        r = unit_realize_cgroup(u);
        if (r < 0)
                return r;

        r = cg_attach_many_everywhere(u->manager->cgroup_supported, u->cgroup_path, u->pids, migrate_callback, u);
        if (r < 0)
                return r;

        return 0;
}

static bool unit_has_mask_realized(Unit *u, CGroupMask target_mask) {
        assert(u);

        return u->cgroup_realized && u->cgroup_realized_mask == target_mask;
}
/* Check if necessary controllers and attributes for a unit are in place.
 *
 * If not, create paths, move processes over, and set attributes.
 *
 * Returns 0 on success and < 0 on failure. */
static int unit_realize_cgroup_now(Unit *u, ManagerState state) {
        CGroupMask target_mask, enable_mask;
        int r;

        assert(u);

        if (u->in_cgroup_queue) {
                LIST_REMOVE(cgroup_queue, u->manager->cgroup_queue, u);
                u->in_cgroup_queue = false;
        }

        target_mask = unit_get_target_mask(u);
        if (unit_has_mask_realized(u, target_mask))
                return 0;

        /* First, realize parents */
        if (UNIT_ISSET(u->slice)) {
                r = unit_realize_cgroup_now(UNIT_DEREF(u->slice), state);
                if (r < 0)
                        return r;
        }

        /* And then do the real work */
        enable_mask = unit_get_enable_mask(u);
        r = unit_create_cgroup(u, target_mask, enable_mask);
        if (r < 0)
                return r;

        /* Finally, apply the necessary attributes. */
        cgroup_context_apply(unit_get_cgroup_context(u), target_mask, u->cgroup_path, state);

        return 0;
}
static void unit_add_to_cgroup_queue(Unit *u) {

        if (u->in_cgroup_queue)
                return;

        LIST_PREPEND(cgroup_queue, u->manager->cgroup_queue, u);
        u->in_cgroup_queue = true;
}

unsigned manager_dispatch_cgroup_queue(Manager *m) {
        ManagerState state;
        unsigned n = 0;
        Unit *i;
        int r;

        state = manager_state(m);

        while ((i = m->cgroup_queue)) {
                assert(i->in_cgroup_queue);

                r = unit_realize_cgroup_now(i, state);
                if (r < 0)
                        log_warning_errno(r, "Failed to realize cgroups for queued unit %s, ignoring: %m", i->id);

                n++;
        }

        return n;
}
static void unit_queue_siblings(Unit *u) {
        Unit *slice;

        /* This adds the siblings of the specified unit and the
         * siblings of all parent units to the cgroup queue. (But
         * neither the specified unit itself nor the parents.) */

        while ((slice = UNIT_DEREF(u->slice))) {
                Iterator i;
                Unit *m;

                SET_FOREACH(m, slice->dependencies[UNIT_BEFORE], i) {
                        if (m == u)
                                continue;

                        /* Skip units that have a dependency on the slice
                         * but aren't actually in it. */
                        if (UNIT_DEREF(m->slice) != slice)
                                continue;

                        /* No point in doing cgroup application for units
                         * without active processes. */
                        if (UNIT_IS_INACTIVE_OR_FAILED(unit_active_state(m)))
                                continue;

                        /* If the unit doesn't need any new controllers
                         * and has current ones realized, it doesn't need
                         * any changes. */
                        if (unit_has_mask_realized(m, unit_get_target_mask(m)))
                                continue;

                        unit_add_to_cgroup_queue(m);
                }

                u = slice;
        }
}
int unit_realize_cgroup(Unit *u) {
        assert(u);

        if (!UNIT_HAS_CGROUP_CONTEXT(u))
                return 0;

        /* So, here's the deal: when realizing the cgroups for this
         * unit, we need to first create all parents, but there's more
         * actually: for the weight-based controllers we also need to
         * make sure that all our siblings (i.e. units that are in the
         * same slice as we are) have cgroups, too. Otherwise, things
         * would become very uneven as each of their processes would
         * get as much resources as all our group together. This call
         * will synchronously create the parent cgroups, but will
         * defer work on the siblings to the next event loop
         * iteration. */

        /* Add all sibling slices to the cgroup queue. */
        unit_queue_siblings(u);

        /* And realize this one now (and apply the values) */
        return unit_realize_cgroup_now(u, manager_state(u->manager));
}
void unit_release_cgroup(Unit *u) {
        assert(u);

        /* Forgets all cgroup details for this cgroup */

        if (u->cgroup_path) {
                (void) hashmap_remove(u->manager->cgroup_unit, u->cgroup_path);
                u->cgroup_path = mfree(u->cgroup_path);
        }

        if (u->cgroup_inotify_wd >= 0) {
                if (inotify_rm_watch(u->manager->cgroup_inotify_fd, u->cgroup_inotify_wd) < 0)
                        log_unit_debug_errno(u, errno, "Failed to remove cgroup inotify watch %i for %s, ignoring", u->cgroup_inotify_wd, u->id);

                (void) hashmap_remove(u->manager->cgroup_inotify_wd_unit, INT_TO_PTR(u->cgroup_inotify_wd));
                u->cgroup_inotify_wd = -1;
        }
}
void unit_prune_cgroup(Unit *u) {
        bool is_root_slice;
        int r;

        assert(u);

        /* Removes the cgroup, if empty and possible, and stops watching it. */

        if (!u->cgroup_path)
                return;

        is_root_slice = unit_has_name(u, SPECIAL_ROOT_SLICE);

        r = cg_trim_everywhere(u->manager->cgroup_supported, u->cgroup_path, !is_root_slice);
        if (r < 0) {
                log_debug_errno(r, "Failed to destroy cgroup %s, ignoring: %m", u->cgroup_path);
                return;
        }

        if (is_root_slice)
                return;

        unit_release_cgroup(u);

        u->cgroup_realized = false;
        u->cgroup_realized_mask = 0;
}
int unit_search_main_pid(Unit *u, pid_t *ret) {
        _cleanup_fclose_ FILE *f = NULL;
        pid_t pid = 0, npid, mypid;
        int r;

        assert(u);
        assert(ret);

        if (!u->cgroup_path)
                return -ENXIO;

        r = cg_enumerate_processes(ELOGIND_CGROUP_CONTROLLER, u->cgroup_path, &f);
        if (r < 0)
                return r;

        mypid = getpid();
        while (cg_read_pid(f, &npid) > 0) {
                pid_t ppid;

                if (npid == pid)
                        continue;

                /* Ignore processes that aren't our kids */
                if (get_parent_of_pid(npid, &ppid) >= 0 && ppid != mypid)
                        continue;

                if (pid != 0)
                        /* Dang, there's more than one daemonized PID
                        in this group, so we don't know what process
                        is the main process. */
                        return -ENODATA;

                pid = npid;
        }

        *ret = pid;
        return 0;
}
static int unit_watch_pids_in_path(Unit *u, const char *path) {
        _cleanup_closedir_ DIR *d = NULL;
        _cleanup_fclose_ FILE *f = NULL;
        int ret = 0, r;

        assert(u);
        assert(path);

        r = cg_enumerate_processes(ELOGIND_CGROUP_CONTROLLER, path, &f);
        if (r < 0)
                ret = r;
        else {
                pid_t pid;

                while ((r = cg_read_pid(f, &pid)) > 0) {
                        r = unit_watch_pid(u, pid);
                        if (r < 0 && ret >= 0)
                                ret = r;
                }

                if (r < 0 && ret >= 0)
                        ret = r;
        }

        r = cg_enumerate_subgroups(ELOGIND_CGROUP_CONTROLLER, path, &d);
        if (r < 0) {
                if (ret >= 0)
                        ret = r;
        } else {
                char *fn;

                while ((r = cg_read_subgroup(d, &fn)) > 0) {
                        _cleanup_free_ char *p = NULL;

                        p = strjoin(path, "/", fn, NULL);
                        free(fn);

                        if (!p)
                                return -ENOMEM;

                        r = unit_watch_pids_in_path(u, p);
                        if (r < 0 && ret >= 0)
                                ret = r;
                }

                if (r < 0 && ret >= 0)
                        ret = r;
        }

        return ret;
}
int unit_watch_all_pids(Unit *u) {
        assert(u);

        /* Adds all PIDs from our cgroup to the set of PIDs we
         * watch. This is a fallback logic for cases where we do not
         * get reliable cgroup empty notifications: we try to use
         * SIGCHLD as replacement. */

        if (!u->cgroup_path)
                return -ENOENT;

        if (cg_unified() > 0) /* On unified we can use proper notifications */
                return 0;

        return unit_watch_pids_in_path(u, u->cgroup_path);
}
int unit_notify_cgroup_empty(Unit *u) {
        int r;

        assert(u);

        if (!u->cgroup_path)
                return 0;

        r = cg_is_empty_recursive(ELOGIND_CGROUP_CONTROLLER, u->cgroup_path);
        if (r <= 0)
                return r;

        unit_add_to_gc_queue(u);

        if (UNIT_VTABLE(u)->notify_cgroup_empty)
                UNIT_VTABLE(u)->notify_cgroup_empty(u);

        return 0;
}
static int on_cgroup_inotify_event(sd_event_source *s, int fd, uint32_t revents, void *userdata) {
        Manager *m = userdata;

        assert(s);
        assert(fd >= 0);
        assert(m);

        for (;;) {
                union inotify_event_buffer buffer;
                struct inotify_event *e;
                ssize_t l;

                l = read(fd, &buffer, sizeof(buffer));
                if (l < 0) {
                        if (errno == EINTR || errno == EAGAIN)
                                return 0;

                        return log_error_errno(errno, "Failed to read control group inotify events: %m");
                }

                FOREACH_INOTIFY_EVENT(e, buffer, l) {
                        Unit *u;

                        if (e->wd < 0)
                                /* Queue overflow has no watch descriptor */
                                continue;

                        if (e->mask & IN_IGNORED)
                                /* The watch was just removed */
                                continue;

                        u = hashmap_get(m->cgroup_inotify_wd_unit, INT_TO_PTR(e->wd));
                        if (!u) /* Note that inotify might deliver
                                 * events for a watch even after it
                                 * was removed, because it was queued
                                 * before the removal. Let's ignore
                                 * this here safely. */
                                continue;

                        (void) unit_notify_cgroup_empty(u);
                }
        }
}
int manager_setup_cgroup(Manager *m) {
        _cleanup_free_ char *path = NULL;
        CGroupController c;
        int r, unified;
        char *e;

        assert(m);

        /* 1. Determine hierarchy */
        m->cgroup_root = mfree(m->cgroup_root);
        r = cg_pid_get_path(ELOGIND_CGROUP_CONTROLLER, 0, &m->cgroup_root);
        if (r < 0)
                return log_error_errno(r, "Cannot determine cgroup we are running in: %m");

#if 0 /// elogind does not support systemd scopes and slices
        /* Chop off the init scope, if we are already located in it */
        e = endswith(m->cgroup_root, "/" SPECIAL_INIT_SCOPE);

        /* LEGACY: Also chop off the system slice if we are in
         * it. This is to support live upgrades from older systemd
         * versions where PID 1 was moved there. Also see
         * cg_get_root_path(). */
        if (!e && m->running_as == MANAGER_SYSTEM) {
                e = endswith(m->cgroup_root, "/" SPECIAL_SYSTEM_SLICE);
                if (!e)
                        e = endswith(m->cgroup_root, "/system"); /* even more legacy */
        }
        if (e)
                *e = 0;
#endif // 0

        /* And make sure to store away the root value without trailing
         * slash, even for the root dir, so that we can easily prepend
         * it everywhere. */
        while ((e = endswith(m->cgroup_root, "/")))
                *e = 0;

        /* 2. Show data */
        r = cg_get_path(ELOGIND_CGROUP_CONTROLLER, m->cgroup_root, NULL, &path);
        if (r < 0)
                return log_error_errno(r, "Cannot find cgroup mount point: %m");

        unified = cg_unified();
        if (unified < 0)
                return log_error_errno(unified, "Couldn't determine if we are running in the unified hierarchy: %m");
        if (unified > 0)
                log_debug("Unified cgroup hierarchy is located at %s.", path);
        else
                log_debug("Using cgroup controller " ELOGIND_CGROUP_CONTROLLER ". File system hierarchy is at %s.", path);
        const char *scope_path;

        /* 3. Install agent */
        if (unified) {

                /* In the unified hierarchy we can get
                 * cgroup empty notifications via inotify. */

#if 0 /// elogind does not support the unified hierarchy, yet.
                m->cgroup_inotify_event_source = sd_event_source_unref(m->cgroup_inotify_event_source);
                safe_close(m->cgroup_inotify_fd);

                m->cgroup_inotify_fd = inotify_init1(IN_NONBLOCK|IN_CLOEXEC);
                if (m->cgroup_inotify_fd < 0)
                        return log_error_errno(errno, "Failed to create control group inotify object: %m");

                r = sd_event_add_io(m->event, &m->cgroup_inotify_event_source, m->cgroup_inotify_fd, EPOLLIN, on_cgroup_inotify_event, m);
                if (r < 0)
                        return log_error_errno(r, "Failed to watch control group inotify object: %m");

                r = sd_event_source_set_priority(m->cgroup_inotify_event_source, SD_EVENT_PRIORITY_IDLE - 5);
                if (r < 0)
                        return log_error_errno(r, "Failed to set priority of inotify event source: %m");

                (void) sd_event_source_set_description(m->cgroup_inotify_event_source, "cgroup-inotify");
#else
                return log_error_errno(EOPNOTSUPP, "Unified cgroup hierarchy not supported: %m");
#endif // 0
        } else if (m->running_as == MANAGER_SYSTEM) {

                /* On the legacy hierarchy we only get
                 * notifications via cgroup agents. (Which
                 * isn't really reliable, since it does not
                 * generate events when control groups with
                 * children run empty.) */

                r = cg_install_release_agent(ELOGIND_CGROUP_CONTROLLER, ELOGIND_CGROUP_AGENT_PATH);
                if (r < 0)
                        log_warning_errno(r, "Failed to install release agent, ignoring: %m");
                else if (r > 0)
                        log_debug("Installed release agent.");
                else
                        log_debug("Release agent already installed.");
        }
#if 0 /// elogind is not meant to run in systemd init scope
        /* 4. Make sure we are in the special "init.scope" unit in the root slice. */
        scope_path = strjoina(m->cgroup_root, "/" SPECIAL_INIT_SCOPE);
        r = cg_create_and_attach(ELOGIND_CGROUP_CONTROLLER, scope_path, 0);
#else
        scope_path = strjoina(m->cgroup_root, "/elogind");
        r = cg_create_and_attach(ELOGIND_CGROUP_CONTROLLER, scope_path, 0);
#endif // 0
        if (r < 0)
                return log_error_errno(r, "Failed to create %s control group: %m", scope_path);

        /* also, move all other userspace processes remaining
         * in the root cgroup into that scope. */
        r = cg_migrate(ELOGIND_CGROUP_CONTROLLER, m->cgroup_root, ELOGIND_CGROUP_CONTROLLER, scope_path, false);
        if (r < 0)
                log_warning_errno(r, "Couldn't move remaining userspace processes, ignoring: %m");
        /* 5. And pin it, so that it cannot be unmounted */
        safe_close(m->pin_cgroupfs_fd);
        m->pin_cgroupfs_fd = open(path, O_RDONLY|O_CLOEXEC|O_DIRECTORY|O_NOCTTY|O_NONBLOCK);
        if (m->pin_cgroupfs_fd < 0)
                return log_error_errno(errno, "Failed to open pin file: %m");

        /* 6. Always enable hierarchical support if it exists... */
        if (!unified)
                (void) cg_set_attribute("memory", "/", "memory.use_hierarchy", "1");

        /* 7. Figure out which controllers are supported */
        r = cg_mask_supported(&m->cgroup_supported);
        if (r < 0)
                return log_error_errno(r, "Failed to determine supported controllers: %m");

        for (c = 0; c < _CGROUP_CONTROLLER_MAX; c++)
                log_debug("Controller '%s' supported: %s", cgroup_controller_to_string(c), yes_no(m->cgroup_supported & CGROUP_CONTROLLER_TO_MASK(c)));

        return 0;
}
void manager_shutdown_cgroup(Manager *m, bool delete) {
        assert(m);

        /* We can't really delete the group, since we are in it. But
         * let's trim it. */
        if (delete && m->cgroup_root)
                (void) cg_trim(ELOGIND_CGROUP_CONTROLLER, m->cgroup_root, false);

#if 0 /// elogind does not support the unified hierarchy, yet.
        m->cgroup_inotify_wd_unit = hashmap_free(m->cgroup_inotify_wd_unit);

        m->cgroup_inotify_event_source = sd_event_source_unref(m->cgroup_inotify_event_source);
        m->cgroup_inotify_fd = safe_close(m->cgroup_inotify_fd);
#endif // 0

        m->pin_cgroupfs_fd = safe_close(m->pin_cgroupfs_fd);

        m->cgroup_root = mfree(m->cgroup_root);
}
/// UNNEEDED by elogind
Unit* manager_get_unit_by_cgroup(Manager *m, const char *cgroup) {
        char *p;
        Unit *u;

        assert(m);
        assert(cgroup);

        u = hashmap_get(m->cgroup_unit, cgroup);
        if (u)
                return u;

        p = strdupa(cgroup);
        for (;;) {
                char *e;

                e = strrchr(p, '/');
                if (!e || e == p)
                        return hashmap_get(m->cgroup_unit, SPECIAL_ROOT_SLICE);

                *e = 0;

                u = hashmap_get(m->cgroup_unit, p);
                if (u)
                        return u;
        }
}
Unit *manager_get_unit_by_pid_cgroup(Manager *m, pid_t pid) {
        _cleanup_free_ char *cgroup = NULL;
        int r;

        assert(m);

        if (pid <= 0)
                return NULL;

        r = cg_pid_get_path(ELOGIND_CGROUP_CONTROLLER, pid, &cgroup);
        if (r < 0)
                return NULL;

        return manager_get_unit_by_cgroup(m, cgroup);
}

Unit *manager_get_unit_by_pid(Manager *m, pid_t pid) {
        Unit *u;

        assert(m);

        if (pid <= 0)
                return NULL;

        if (pid == 1)
                return hashmap_get(m->units, SPECIAL_INIT_SCOPE);

        u = hashmap_get(m->watch_pids1, PID_TO_PTR(pid));
        if (u)
                return u;

        u = hashmap_get(m->watch_pids2, PID_TO_PTR(pid));
        if (u)
                return u;

        return manager_get_unit_by_pid_cgroup(m, pid);
}
int manager_notify_cgroup_empty(Manager *m, const char *cgroup) {
        Unit *u;

        assert(m);
        assert(cgroup);

        u = manager_get_unit_by_cgroup(m, cgroup);
        if (!u)
                return 0;

        return unit_notify_cgroup_empty(u);
}
int unit_get_memory_current(Unit *u, uint64_t *ret) {
        _cleanup_free_ char *v = NULL;
        int r;

        assert(u);
        assert(ret);

        if (!u->cgroup_path)
                return -ENODATA;

        if ((u->cgroup_realized_mask & CGROUP_MASK_MEMORY) == 0)
                return -ENODATA;

        if (cg_unified() <= 0)
                r = cg_get_attribute("memory", u->cgroup_path, "memory.usage_in_bytes", &v);
        else
                r = cg_get_attribute("memory", u->cgroup_path, "memory.current", &v);
        if (r == -ENOENT)
                return -ENODATA;
        if (r < 0)
                return r;

        return safe_atou64(v, ret);
}
static int unit_get_cpu_usage_raw(Unit *u, nsec_t *ret) {
        _cleanup_free_ char *v = NULL;
        uint64_t ns;
        int r;

        assert(u);
        assert(ret);

        if (!u->cgroup_path)
                return -ENODATA;

        if ((u->cgroup_realized_mask & CGROUP_MASK_CPUACCT) == 0)
                return -ENODATA;

        r = cg_get_attribute("cpuacct", u->cgroup_path, "cpuacct.usage", &v);
        if (r == -ENOENT)
                return -ENODATA;
        if (r < 0)
                return r;

        r = safe_atou64(v, &ns);
        if (r < 0)
                return r;

        *ret = ns;
        return 0;
}

int unit_get_cpu_usage(Unit *u, nsec_t *ret) {
        nsec_t ns;
        int r;

        r = unit_get_cpu_usage_raw(u, &ns);
        if (r < 0)
                return r;

        if (ns > u->cpuacct_usage_base)
                ns -= u->cpuacct_usage_base;
        else
                ns = 0;

        *ret = ns;
        return 0;
}
int unit_reset_cpu_usage(Unit *u) {
        nsec_t ns;
        int r;

        assert(u);

        r = unit_get_cpu_usage_raw(u, &ns);
        if (r < 0) {
                u->cpuacct_usage_base = 0;
                return r;
        }

        u->cpuacct_usage_base = ns;
        return 0;
}

bool unit_cgroup_delegate(Unit *u) {
        CGroupContext *c;

        assert(u);

        c = unit_get_cgroup_context(u);
        if (!c)
                return false;

        return c->delegate;
}
static const char* const cgroup_device_policy_table[_CGROUP_DEVICE_POLICY_MAX] = {
        [CGROUP_AUTO] = "auto",
        [CGROUP_CLOSED] = "closed",
        [CGROUP_STRICT] = "strict",
};

DEFINE_STRING_TABLE_LOOKUP(cgroup_device_policy, CGroupDevicePolicy);