/*-*- Mode: C; c-basic-offset: 8; indent-tabs-mode: nil -*-*/

/***
  This file is part of systemd.

  Copyright 2013 Lennart Poettering

  systemd is free software; you can redistribute it and/or modify it
  under the terms of the GNU Lesser General Public License as published by
  the Free Software Foundation; either version 2.1 of the License, or
  (at your option) any later version.

  systemd is distributed in the hope that it will be useful, but
  WITHOUT ANY WARRANTY; without even the implied warranty of
  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
  Lesser General Public License for more details.

  You should have received a copy of the GNU Lesser General Public License
  along with systemd; If not, see <http://www.gnu.org/licenses/>.
***/

#include <fcntl.h>
#include <fnmatch.h>

#include "alloc-util.h"
#include "cgroup-util.h"
#include "cgroup.h"
#include "fd-util.h"
#include "path-util.h"
#include "process-util.h"
//#include "special.h"
#include "string-table.h"
#include "string-util.h"

#define CGROUP_CPU_QUOTA_PERIOD_USEC ((usec_t) 100 * USEC_PER_MSEC)
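
/* Example: with this fixed 100ms period, CPUQuota=20% is stored as
 * cpu_quota_per_sec_usec == 200ms, and cgroup_context_apply() below
 * writes cpu.cfs_quota_us = 200000 * 100000 / 1000000 = 20000, i.e.
 * 20ms of CPU time per 100ms period. */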

// UNNEEDED by elogind

void cgroup_context_init(CGroupContext *c) {
        assert(c);

        /* Initialize everything to the kernel defaults, assuming the
         * structure is preinitialized to 0 */

        c->cpu_shares = CGROUP_CPU_SHARES_INVALID;
        c->startup_cpu_shares = CGROUP_CPU_SHARES_INVALID;
        c->cpu_quota_per_sec_usec = USEC_INFINITY;

        c->memory_limit = (uint64_t) -1;

        c->blockio_weight = CGROUP_BLKIO_WEIGHT_INVALID;
        c->startup_blockio_weight = CGROUP_BLKIO_WEIGHT_INVALID;

        c->tasks_max = (uint64_t) -1;

        c->netclass_type = CGROUP_NETCLASS_TYPE_NONE;
}

void cgroup_context_free_device_allow(CGroupContext *c, CGroupDeviceAllow *a) {
        assert(c);
        assert(a);

        LIST_REMOVE(device_allow, c->device_allow, a);
        free(a->path);
        free(a);
}

void cgroup_context_free_blockio_device_weight(CGroupContext *c, CGroupBlockIODeviceWeight *w) {
        assert(c);
        assert(w);

        LIST_REMOVE(device_weights, c->blockio_device_weights, w);
        free(w->path);
        free(w);
}

void cgroup_context_free_blockio_device_bandwidth(CGroupContext *c, CGroupBlockIODeviceBandwidth *b) {
        assert(c);
        assert(b);

        LIST_REMOVE(device_bandwidths, c->blockio_device_bandwidths, b);
        free(b->path);
        free(b);
}

void cgroup_context_done(CGroupContext *c) {
        assert(c);

        while (c->blockio_device_weights)
                cgroup_context_free_blockio_device_weight(c, c->blockio_device_weights);

        while (c->blockio_device_bandwidths)
                cgroup_context_free_blockio_device_bandwidth(c, c->blockio_device_bandwidths);

        while (c->device_allow)
                cgroup_context_free_device_allow(c, c->device_allow);
}

void cgroup_context_dump(CGroupContext *c, FILE* f, const char *prefix) {
        CGroupBlockIODeviceBandwidth *b;
        CGroupBlockIODeviceWeight *w;
        CGroupDeviceAllow *a;
        char u[FORMAT_TIMESPAN_MAX];

        assert(c);
        assert(f);

        prefix = strempty(prefix);

        fprintf(f,
                "%sCPUAccounting=%s\n"
                "%sBlockIOAccounting=%s\n"
                "%sMemoryAccounting=%s\n"
                "%sTasksAccounting=%s\n"
                "%sCPUShares=%" PRIu64 "\n"
                "%sStartupCPUShares=%" PRIu64 "\n"
                "%sCPUQuotaPerSecSec=%s\n"
                "%sBlockIOWeight=%" PRIu64 "\n"
                "%sStartupBlockIOWeight=%" PRIu64 "\n"
                "%sMemoryLimit=%" PRIu64 "\n"
                "%sTasksMax=%" PRIu64 "\n"
                "%sDevicePolicy=%s\n"
                "%sDelegate=%s\n",
                prefix, yes_no(c->cpu_accounting),
                prefix, yes_no(c->blockio_accounting),
                prefix, yes_no(c->memory_accounting),
                prefix, yes_no(c->tasks_accounting),
                prefix, c->cpu_shares,
                prefix, c->startup_cpu_shares,
                prefix, format_timespan(u, sizeof(u), c->cpu_quota_per_sec_usec, 1),
                prefix, c->blockio_weight,
                prefix, c->startup_blockio_weight,
                prefix, c->memory_limit,
                prefix, c->tasks_max,
                prefix, cgroup_device_policy_to_string(c->device_policy),
                prefix, yes_no(c->delegate));

        LIST_FOREACH(device_allow, a, c->device_allow)
                fprintf(f,
                        "%sDeviceAllow=%s %s%s%s\n",
                        prefix,
                        a->path,
                        a->r ? "r" : "", a->w ? "w" : "", a->m ? "m" : "");

        LIST_FOREACH(device_weights, w, c->blockio_device_weights)
                fprintf(f,
                        "%sBlockIODeviceWeight=%s %" PRIu64 "\n",
                        prefix,
                        w->path,
                        w->weight);

        LIST_FOREACH(device_bandwidths, b, c->blockio_device_bandwidths) {
                char buf[FORMAT_BYTES_MAX];

                fprintf(f,
                        "%s%s=%s %s\n",
                        prefix,
                        b->read ? "BlockIOReadBandwidth" : "BlockIOWriteBandwidth",
                        b->path,
                        format_bytes(buf, sizeof(buf), b->bandwidth));
        }
}

static int lookup_blkio_device(const char *p, dev_t *dev) {
        struct stat st;
        int r;

        assert(p);
        assert(dev);

        r = stat(p, &st);
        if (r < 0)
                return log_warning_errno(errno, "Couldn't stat device %s: %m", p);

        if (S_ISBLK(st.st_mode))
                *dev = st.st_rdev;
        else if (major(st.st_dev) != 0) {
                /* If this is not a device node then find the block
                 * device this file is stored on */
                *dev = st.st_dev;

                /* If this is a partition, try to get the originating
                 * block device */
                block_get_whole_disk(*dev, dev);
        } else {
                log_warning("%s is not a block device and file system block device cannot be determined or is not local.", p);
                return -ENODEV;
        }

        return 0;
}
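
/* Example: called on a regular file that lives on /dev/sda3, st_dev
 * identifies the sda3 partition; block_get_whole_disk() then reduces
 * that to the whole disk, so weights/bandwidths apply to sda. */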

static int whitelist_device(const char *path, const char *node, const char *acc) {
        char buf[2+DECIMAL_STR_MAX(dev_t)*2+2+4];
        struct stat st;
        int r;

        assert(path);
        assert(acc);

        if (stat(node, &st) < 0) {
                log_warning("Couldn't stat device %s", node);
                return -errno;
        }

        if (!S_ISCHR(st.st_mode) && !S_ISBLK(st.st_mode)) {
                log_warning("%s is not a device.", node);
                return -ENODEV;
        }

        sprintf(buf,
                "%c %u:%u %s",
                S_ISCHR(st.st_mode) ? 'c' : 'b',
                major(st.st_rdev), minor(st.st_rdev),
                acc);

        r = cg_set_attribute("devices", path, "devices.allow", buf);
        if (r < 0)
                log_full_errno(IN_SET(r, -ENOENT, -EROFS, -EINVAL) ? LOG_DEBUG : LOG_WARNING, r,
                               "Failed to set devices.allow on %s: %m", path);

        return r;
}

static int whitelist_major(const char *path, const char *name, char type, const char *acc) {
        _cleanup_fclose_ FILE *f = NULL;
        char line[LINE_MAX];
        bool good = false;
        int r;

        assert(path);
        assert(acc);
        assert(type == 'b' || type == 'c');

        f = fopen("/proc/devices", "re");
        if (!f)
                return log_warning_errno(errno, "Cannot open /proc/devices to resolve %s (%c): %m", name, type);

        FOREACH_LINE(line, f, goto fail) {
                char buf[2+DECIMAL_STR_MAX(unsigned)+3+4], *p, *w;
                unsigned maj;

                truncate_nl(line);

                if (type == 'c' && streq(line, "Character devices:")) {
                        good = true;
                        continue;
                }

                if (type == 'b' && streq(line, "Block devices:")) {
                        good = true;
                        continue;
                }

                if (isempty(line)) {
                        good = false;
                        continue;
                }

                if (!good)
                        continue;

                p = strstrip(line);

                w = strpbrk(p, WHITESPACE);
                if (!w)
                        continue;
                *w = 0;

                r = safe_atou(p, &maj);
                if (r < 0)
                        continue;
                if (maj <= 0)
                        continue;

                w++;
                w += strspn(w, WHITESPACE);

                if (fnmatch(name, w, 0) != 0)
                        continue;

                sprintf(buf,
                        "%c %u:* %s",
                        type,
                        maj,
                        acc);

                r = cg_set_attribute("devices", path, "devices.allow", buf);
                if (r < 0)
                        log_full_errno(IN_SET(r, -ENOENT, -EROFS, -EINVAL) ? LOG_DEBUG : LOG_WARNING, r,
                                       "Failed to set devices.allow on %s: %m", path);
        }

        return 0;

fail:
        log_warning_errno(errno, "Failed to read /proc/devices: %m");
        return -errno;
}
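
/* Example: whitelist_major(path, "pts", 'c', "rw") finds the line
 * "136 pts" (on a typical kernel) in the "Character devices:" section
 * of /proc/devices and writes "c 136:* rw" to devices.allow, covering
 * all pseudo-terminal minors at once. */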

void cgroup_context_apply(CGroupContext *c, CGroupMask mask, const char *path, uint32_t netclass, ManagerState state) {
        bool is_root;
        int r;

        assert(c);
        assert(path);

        if (mask == 0)
                return;

        /* Some cgroup attributes are not supported on the root cgroup,
         * hence silently ignore */
        is_root = isempty(path) || path_equal(path, "/");
        if (is_root)
                /* Make sure we don't try to display messages with an empty path. */
                path = "/";

        /* We generally ignore errors caused by read-only mounted
         * cgroup trees (assuming we are running in a container then),
         * and missing cgroups, i.e. EROFS and ENOENT. */

        if ((mask & CGROUP_MASK_CPU) && !is_root) {
                char buf[MAX(DECIMAL_STR_MAX(uint64_t), DECIMAL_STR_MAX(usec_t)) + 1];

                sprintf(buf, "%" PRIu64 "\n",
                        IN_SET(state, MANAGER_STARTING, MANAGER_INITIALIZING) && c->startup_cpu_shares != CGROUP_CPU_SHARES_INVALID ? c->startup_cpu_shares :
                        c->cpu_shares != CGROUP_CPU_SHARES_INVALID ? c->cpu_shares : CGROUP_CPU_SHARES_DEFAULT);
                r = cg_set_attribute("cpu", path, "cpu.shares", buf);
                if (r < 0)
                        log_full_errno(IN_SET(r, -ENOENT, -EROFS) ? LOG_DEBUG : LOG_WARNING, r,
                                       "Failed to set cpu.shares on %s: %m", path);

                sprintf(buf, USEC_FMT "\n", CGROUP_CPU_QUOTA_PERIOD_USEC);
                r = cg_set_attribute("cpu", path, "cpu.cfs_period_us", buf);
                if (r < 0)
                        log_full_errno(IN_SET(r, -ENOENT, -EROFS) ? LOG_DEBUG : LOG_WARNING, r,
                                       "Failed to set cpu.cfs_period_us on %s: %m", path);

                if (c->cpu_quota_per_sec_usec != USEC_INFINITY) {
                        sprintf(buf, USEC_FMT "\n", c->cpu_quota_per_sec_usec * CGROUP_CPU_QUOTA_PERIOD_USEC / USEC_PER_SEC);
                        r = cg_set_attribute("cpu", path, "cpu.cfs_quota_us", buf);
                } else
                        r = cg_set_attribute("cpu", path, "cpu.cfs_quota_us", "-1");
                if (r < 0)
                        log_full_errno(IN_SET(r, -ENOENT, -EROFS) ? LOG_DEBUG : LOG_WARNING, r,
                                       "Failed to set cpu.cfs_quota_us on %s: %m", path);
        }

        if (mask & CGROUP_MASK_BLKIO) {
                char buf[MAX(DECIMAL_STR_MAX(uint64_t)+1,
                             DECIMAL_STR_MAX(dev_t)*2+2+DECIMAL_STR_MAX(uint64_t)+1)];
                CGroupBlockIODeviceWeight *w;
                CGroupBlockIODeviceBandwidth *b;

                if (!is_root) {
                        sprintf(buf, "%" PRIu64 "\n",
                                IN_SET(state, MANAGER_STARTING, MANAGER_INITIALIZING) && c->startup_blockio_weight != CGROUP_BLKIO_WEIGHT_INVALID ? c->startup_blockio_weight :
                                c->blockio_weight != CGROUP_BLKIO_WEIGHT_INVALID ? c->blockio_weight : CGROUP_BLKIO_WEIGHT_DEFAULT);
                        r = cg_set_attribute("blkio", path, "blkio.weight", buf);
                        if (r < 0)
                                log_full_errno(IN_SET(r, -ENOENT, -EROFS) ? LOG_DEBUG : LOG_WARNING, r,
                                               "Failed to set blkio.weight on %s: %m", path);

                        /* FIXME: no way to reset this list */
                        LIST_FOREACH(device_weights, w, c->blockio_device_weights) {
                                dev_t dev;

                                r = lookup_blkio_device(w->path, &dev);
                                if (r < 0)
                                        continue;

                                sprintf(buf, "%u:%u %" PRIu64 "\n", major(dev), minor(dev), w->weight);
                                r = cg_set_attribute("blkio", path, "blkio.weight_device", buf);
                                if (r < 0)
                                        log_full_errno(IN_SET(r, -ENOENT, -EROFS) ? LOG_DEBUG : LOG_WARNING, r,
                                                       "Failed to set blkio.weight_device on %s: %m", path);
                        }
                }

                /* FIXME: no way to reset this list */
                LIST_FOREACH(device_bandwidths, b, c->blockio_device_bandwidths) {
                        const char *a;
                        dev_t dev;

                        r = lookup_blkio_device(b->path, &dev);
                        if (r < 0)
                                continue;

                        a = b->read ? "blkio.throttle.read_bps_device" : "blkio.throttle.write_bps_device";

                        sprintf(buf, "%u:%u %" PRIu64 "\n", major(dev), minor(dev), b->bandwidth);
                        r = cg_set_attribute("blkio", path, a, buf);
                        if (r < 0)
                                log_full_errno(IN_SET(r, -ENOENT, -EROFS) ? LOG_DEBUG : LOG_WARNING, r,
                                               "Failed to set %s on %s: %m", a, path);
                }
        }

        if ((mask & CGROUP_MASK_MEMORY) && !is_root) {
                if (c->memory_limit != (uint64_t) -1) {
                        char buf[DECIMAL_STR_MAX(uint64_t) + 1];

                        sprintf(buf, "%" PRIu64 "\n", c->memory_limit);

                        if (cg_unified() <= 0)
                                r = cg_set_attribute("memory", path, "memory.limit_in_bytes", buf);
                        else
                                r = cg_set_attribute("memory", path, "memory.max", buf);
                } else {
                        if (cg_unified() <= 0)
                                r = cg_set_attribute("memory", path, "memory.limit_in_bytes", "-1");
                        else
                                r = cg_set_attribute("memory", path, "memory.max", "max");
                }

                if (r < 0)
                        log_full_errno(IN_SET(r, -ENOENT, -EROFS) ? LOG_DEBUG : LOG_WARNING, r,
                                       "Failed to set memory.limit_in_bytes/memory.max on %s: %m", path);
        }

        if ((mask & CGROUP_MASK_DEVICES) && !is_root) {
                CGroupDeviceAllow *a;

                /* Changing the devices list of a populated cgroup
                 * might result in EINVAL, hence ignore EINVAL
                 * here. */

                if (c->device_allow || c->device_policy != CGROUP_AUTO)
                        r = cg_set_attribute("devices", path, "devices.deny", "a");
                else
                        r = cg_set_attribute("devices", path, "devices.allow", "a");
                if (r < 0)
                        log_full_errno(IN_SET(r, -ENOENT, -EROFS, -EINVAL) ? LOG_DEBUG : LOG_WARNING, r,
                                       "Failed to reset devices.list on %s: %m", path);

                if (c->device_policy == CGROUP_CLOSED ||
                    (c->device_policy == CGROUP_AUTO && c->device_allow)) {
                        static const char auto_devices[] =
                                "/dev/null\0" "rwm\0"
                                "/dev/zero\0" "rwm\0"
                                "/dev/full\0" "rwm\0"
                                "/dev/random\0" "rwm\0"
                                "/dev/urandom\0" "rwm\0"
                                "/dev/tty\0" "rwm\0"
                                "/dev/pts/ptmx\0" "rw\0"; /* /dev/pts/ptmx may not be duplicated, but accessed */

                        const char *x, *y;

                        NULSTR_FOREACH_PAIR(x, y, auto_devices)
                                whitelist_device(path, x, y);

                        whitelist_major(path, "pts", 'c', "rw");
                        whitelist_major(path, "kdbus", 'c', "rw");
                        whitelist_major(path, "kdbus/*", 'c', "rw");
                }

                LIST_FOREACH(device_allow, a, c->device_allow) {
                        char acc[4];
                        unsigned k = 0;

                        if (a->r)
                                acc[k++] = 'r';
                        if (a->w)
                                acc[k++] = 'w';
                        if (a->m)
                                acc[k++] = 'm';

                        if (k == 0)
                                continue;

                        acc[k++] = 0;

                        if (startswith(a->path, "/dev/"))
                                whitelist_device(path, a->path, acc);
                        else if (startswith(a->path, "block-"))
                                whitelist_major(path, a->path + 6, 'b', acc);
                        else if (startswith(a->path, "char-"))
                                whitelist_major(path, a->path + 5, 'c', acc);
                        else
                                log_debug("Ignoring device %s while writing cgroup attribute.", a->path);
                }
        }

        if ((mask & CGROUP_MASK_PIDS) && !is_root) {

                if (c->tasks_max != (uint64_t) -1) {
                        char buf[DECIMAL_STR_MAX(uint64_t) + 2];

                        sprintf(buf, "%" PRIu64 "\n", c->tasks_max);
                        r = cg_set_attribute("pids", path, "pids.max", buf);
                } else
                        r = cg_set_attribute("pids", path, "pids.max", "max");

                if (r < 0)
                        log_full_errno(IN_SET(r, -ENOENT, -EROFS) ? LOG_DEBUG : LOG_WARNING, r,
                                       "Failed to set pids.max on %s: %m", path);
        }

        if (mask & CGROUP_MASK_NET_CLS) {
                char buf[DECIMAL_STR_MAX(uint32_t)];

                sprintf(buf, "%" PRIu32, netclass);

                r = cg_set_attribute("net_cls", path, "net_cls.classid", buf);
                if (r < 0)
                        log_full_errno(IN_SET(r, -ENOENT, -EROFS) ? LOG_DEBUG : LOG_WARNING, r,
                                       "Failed to set net_cls.classid on %s: %m", path);
        }
}

CGroupMask cgroup_context_get_mask(CGroupContext *c) {
        CGroupMask mask = 0;

        /* Figure out which controllers we need */

        if (c->cpu_accounting ||
            c->cpu_shares != CGROUP_CPU_SHARES_INVALID ||
            c->startup_cpu_shares != CGROUP_CPU_SHARES_INVALID ||
            c->cpu_quota_per_sec_usec != USEC_INFINITY)
                mask |= CGROUP_MASK_CPUACCT | CGROUP_MASK_CPU;

        if (c->blockio_accounting ||
            c->blockio_weight != CGROUP_BLKIO_WEIGHT_INVALID ||
            c->startup_blockio_weight != CGROUP_BLKIO_WEIGHT_INVALID ||
            c->blockio_device_weights ||
            c->blockio_device_bandwidths)
                mask |= CGROUP_MASK_BLKIO;

        if (c->memory_accounting ||
            c->memory_limit != (uint64_t) -1)
                mask |= CGROUP_MASK_MEMORY;

        if (c->device_allow ||
            c->device_policy != CGROUP_AUTO)
                mask |= CGROUP_MASK_DEVICES;

        if (c->tasks_accounting ||
            c->tasks_max != (uint64_t) -1)
                mask |= CGROUP_MASK_PIDS;

        if (c->netclass_type != CGROUP_NETCLASS_TYPE_NONE)
                mask |= CGROUP_MASK_NET_CLS;

        return mask;
}

CGroupMask unit_get_own_mask(Unit *u) {
        CGroupContext *c;

        /* Returns the mask of controllers the unit needs for itself */

        c = unit_get_cgroup_context(u);
        if (!c)
                return 0;

        /* If delegation is turned on, then turn on all cgroups,
         * unless we are on the legacy hierarchy and the process we
         * fork into it is known to drop privileges, and hence
         * shouldn't get access to the controllers.
         *
         * Note that on the unified hierarchy it is safe to delegate
         * controllers to unprivileged services. */

        if (c->delegate) {
                ExecContext *e;

                e = unit_get_exec_context(u);
                if (!e ||
                    exec_context_maintains_privileges(e) ||
                    cg_unified() > 0)
                        return _CGROUP_MASK_ALL;
        }

        return cgroup_context_get_mask(c);
}

CGroupMask unit_get_members_mask(Unit *u) {
        assert(u);

        /* Returns the mask of controllers all of the unit's children
         * require, merged */

        if (u->cgroup_members_mask_valid)
                return u->cgroup_members_mask;

        u->cgroup_members_mask = 0;

        if (u->type == UNIT_SLICE) {
                Unit *member;
                Iterator i;

                SET_FOREACH(member, u->dependencies[UNIT_BEFORE], i) {

                        if (member == u)
                                continue;

                        if (UNIT_DEREF(member->slice) != u)
                                continue;

                        u->cgroup_members_mask |=
                                unit_get_own_mask(member) |
                                unit_get_members_mask(member);
                }
        }

        u->cgroup_members_mask_valid = true;
        return u->cgroup_members_mask;
}

CGroupMask unit_get_siblings_mask(Unit *u) {
        assert(u);

        /* Returns the mask of controllers all of the unit's siblings
         * require, i.e. the members mask of the unit's parent slice
         * if there is one. */

        if (UNIT_ISSET(u->slice))
                return unit_get_members_mask(UNIT_DEREF(u->slice));

        return unit_get_own_mask(u) | unit_get_members_mask(u);
}

CGroupMask unit_get_subtree_mask(Unit *u) {

        /* Returns the mask of this subtree, meaning of the group
         * itself and its children. */

        return unit_get_own_mask(u) | unit_get_members_mask(u);
}

CGroupMask unit_get_target_mask(Unit *u) {
        CGroupMask mask;

        /* This returns the cgroup mask of all controllers to enable
         * for a specific cgroup, i.e. everything it needs itself,
         * plus all that its children need, plus all that its siblings
         * need. This is primarily useful on the legacy cgroup
         * hierarchy, where we need to duplicate each cgroup in each
         * hierarchy that shall be enabled for it. */

        mask = unit_get_own_mask(u) | unit_get_members_mask(u) | unit_get_siblings_mask(u);
        mask &= u->manager->cgroup_supported;

        return mask;
}

CGroupMask unit_get_enable_mask(Unit *u) {
        CGroupMask mask;

        /* This returns the cgroup mask of all controllers to enable
         * for the children of a specific cgroup. This is primarily
         * useful for the unified cgroup hierarchy, where each cgroup
         * controls which controllers are enabled for its children. */

        mask = unit_get_members_mask(u);
        mask &= u->manager->cgroup_supported;

        return mask;
}

/* Recurse from a unit up through its containing slices, propagating
 * mask bits upward. A unit is also a member of itself. */
void unit_update_cgroup_members_masks(Unit *u) {
        CGroupMask m;
        bool more;

        assert(u);

        /* Calculate subtree mask */
        m = unit_get_subtree_mask(u);

        /* See if anything changed from the previous invocation. If
         * not, we're done. */
        if (u->cgroup_subtree_mask_valid && m == u->cgroup_subtree_mask)
                return;

        more =
                u->cgroup_subtree_mask_valid &&
                ((m & ~u->cgroup_subtree_mask) != 0) &&
                ((~m & u->cgroup_subtree_mask) == 0);

        u->cgroup_subtree_mask = m;
        u->cgroup_subtree_mask_valid = true;

        if (UNIT_ISSET(u->slice)) {
                Unit *s = UNIT_DEREF(u->slice);

                if (more)
                        /* There's more set now than before. We
                         * propagate the new mask to the parent's mask
                         * (not caring if it actually was valid or
                         * not) */
                        s->cgroup_members_mask |= m;
                else
                        /* There's less set now than before (or we
                         * don't know), we need to recalculate
                         * everything, so let's invalidate the
                         * parent's members mask */
                        s->cgroup_members_mask_valid = false;

                /* And now make sure that this change also hits our
                 * grandparents */
                unit_update_cgroup_members_masks(s);
        }
}
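
/* Example of the "more" test above: going from CPU to CPU|MEMORY sets a
 * new bit and clears none, so the mask can simply be OR-ed into the parent;
 * going from CPU|MEMORY to CPU clears a bit, so the parent's members mask
 * must be invalidated and recalculated from scratch. */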

static const char *migrate_callback(CGroupMask mask, void *userdata) {
        Unit *u = userdata;

        assert(mask != 0);
        assert(u);

        while (u) {
                if (u->cgroup_path &&
                    u->cgroup_realized &&
                    (u->cgroup_realized_mask & mask) == mask)
                        return u->cgroup_path;

                u = UNIT_DEREF(u->slice);
        }

        return NULL;
}

char *unit_default_cgroup_path(Unit *u) {
        _cleanup_free_ char *escaped = NULL, *slice = NULL;
        int r;

        assert(u);

        if (unit_has_name(u, SPECIAL_ROOT_SLICE))
                return strdup(u->manager->cgroup_root);

        if (UNIT_ISSET(u->slice) && !unit_has_name(UNIT_DEREF(u->slice), SPECIAL_ROOT_SLICE)) {
                r = cg_slice_to_path(UNIT_DEREF(u->slice)->id, &slice);
                if (r < 0)
                        return NULL;
        }

        escaped = cg_escape(u->id);
        if (!escaped)
                return NULL;

        if (slice)
                return strjoin(u->manager->cgroup_root, "/", slice, "/", escaped, NULL);

        return strjoin(u->manager->cgroup_root, "/", escaped, NULL);
}
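
/* Example: for u->id "foo.service" in "bar.slice" with an empty
 * cgroup_root, cg_slice_to_path() yields "bar.slice" and the result is
 * "/bar.slice/foo.service"; characters that are unsafe in cgroup names
 * are escaped by cg_escape() first. */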

int unit_set_cgroup_path(Unit *u, const char *path) {
        _cleanup_free_ char *p = NULL;
        int r;

        assert(u);

        if (path) {
                p = strdup(path);
                if (!p)
                        return -ENOMEM;
        }

        if (streq_ptr(u->cgroup_path, p))
                return 0;

        if (p) {
                r = hashmap_put(u->manager->cgroup_unit, p, u);
                if (r < 0)
                        return r;
        }

        unit_release_cgroup(u);

        u->cgroup_path = p;
        p = NULL;

        return 0;
}

int unit_watch_cgroup(Unit *u) {
        _cleanup_free_ char *populated = NULL;
        int r;

        assert(u);

        if (!u->cgroup_path)
                return 0;

        if (u->cgroup_inotify_wd >= 0)
                return 0;

        /* Only applies to the unified hierarchy */
        r = cg_unified();
        if (r < 0)
                return log_unit_error_errno(u, r, "Failed to detect whether the unified hierarchy is used: %m");
        if (r == 0)
                return 0;

        /* Don't watch the root slice, it's pointless. */
        if (unit_has_name(u, SPECIAL_ROOT_SLICE))
                return 0;

        r = hashmap_ensure_allocated(&u->manager->cgroup_inotify_wd_unit, &trivial_hash_ops);
        if (r < 0)
                return log_oom();

        r = cg_get_path(SYSTEMD_CGROUP_CONTROLLER, u->cgroup_path, "cgroup.populated", &populated);
        if (r < 0)
                return log_oom();

        u->cgroup_inotify_wd = inotify_add_watch(u->manager->cgroup_inotify_fd, populated, IN_MODIFY);
        if (u->cgroup_inotify_wd < 0) {

                if (errno == ENOENT) /* If the directory is already
                                      * gone we don't need to track
                                      * it, so this is not an error */
                        return 0;

                return log_unit_error_errno(u, errno, "Failed to add inotify watch descriptor for control group %s: %m", u->cgroup_path);
        }

        r = hashmap_put(u->manager->cgroup_inotify_wd_unit, INT_TO_PTR(u->cgroup_inotify_wd), u);
        if (r < 0)
                return log_unit_error_errno(u, r, "Failed to add inotify watch descriptor to hash map: %m");

        return 0;
}

static int unit_create_cgroup(
                Unit *u,
                CGroupMask target_mask,
                CGroupMask enable_mask) {

        CGroupContext *c;
        int r;

        assert(u);

        c = unit_get_cgroup_context(u);
        if (!c)
                return 0;

        if (!u->cgroup_path) {
                _cleanup_free_ char *path = NULL;

                path = unit_default_cgroup_path(u);
                if (!path)
                        return log_oom();

                r = unit_set_cgroup_path(u, path);
                if (r == -EEXIST)
                        return log_unit_error_errno(u, r, "Control group %s exists already.", path);
                if (r < 0)
                        return log_unit_error_errno(u, r, "Failed to set unit's control group path to %s: %m", path);
        }

        /* First, create our own group */
        r = cg_create_everywhere(u->manager->cgroup_supported, target_mask, u->cgroup_path);
        if (r < 0)
                return log_unit_error_errno(u, r, "Failed to create cgroup %s: %m", u->cgroup_path);

        /* Start watching it */
        (void) unit_watch_cgroup(u);

        /* Enable all controllers we need */
        r = cg_enable_everywhere(u->manager->cgroup_supported, enable_mask, u->cgroup_path);
        if (r < 0)
                log_unit_warning_errno(u, r, "Failed to enable controllers on cgroup %s, ignoring: %m", u->cgroup_path);

        /* Keep track that this is now realized */
        u->cgroup_realized = true;
        u->cgroup_realized_mask = target_mask;

        if (u->type != UNIT_SLICE && !c->delegate) {

                /* Then, possibly move things over, but not if
                 * subgroups may contain processes, which is the case
                 * for slice and delegation units. */
                r = cg_migrate_everywhere(u->manager->cgroup_supported, u->cgroup_path, u->cgroup_path, migrate_callback, u);
                if (r < 0)
                        log_unit_warning_errno(u, r, "Failed to migrate cgroup to %s, ignoring: %m", u->cgroup_path);
        }

        return 0;
}

int unit_attach_pids_to_cgroup(Unit *u) {
        int r;

        assert(u);

        r = unit_realize_cgroup(u);
        if (r < 0)
                return r;

        r = cg_attach_many_everywhere(u->manager->cgroup_supported, u->cgroup_path, u->pids, migrate_callback, u);
        if (r < 0)
                return r;

        return 0;
}

static bool unit_has_mask_realized(Unit *u, CGroupMask target_mask) {
        assert(u);

        return u->cgroup_realized && u->cgroup_realized_mask == target_mask;
}

static int unit_find_free_netclass_cgroup(Unit *u, uint32_t *ret) {
        uint32_t start, i;
        Manager *m;

        assert(u);
        assert(ret);

        m = u->manager;

        i = start = m->cgroup_netclass_registry_last;

        do {
                i++;

                if (!hashmap_get(m->cgroup_netclass_registry, UINT_TO_PTR(i))) {
                        m->cgroup_netclass_registry_last = i;
                        *ret = i;
                        return 0;
                }

                if (i == UINT32_MAX)
                        i = CGROUP_NETCLASS_FIXED_MAX;

        } while (i != start);

        return -ENOBUFS;
}
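
/* The search above wraps at UINT32_MAX back to CGROUP_NETCLASS_FIXED_MAX,
 * so dynamically allocated class IDs never collide with the lower range
 * reserved for fixed, user-specified IDs; -ENOBUFS signals that every
 * dynamic ID is already taken. */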

int unit_add_to_netclass_cgroup(Unit *u) {
        CGroupContext *cc;
        Unit *first;
        void *key;
        int r;

        assert(u);

        cc = unit_get_cgroup_context(u);
        if (!cc)
                return 0;

        switch (cc->netclass_type) {
        case CGROUP_NETCLASS_TYPE_NONE:
                return 0;

        case CGROUP_NETCLASS_TYPE_FIXED:
                u->cgroup_netclass_id = cc->netclass_id;
                break;

        case CGROUP_NETCLASS_TYPE_AUTO:
                /* Allocate a new ID in case it was requested and not done yet */
                if (u->cgroup_netclass_id == 0) {
                        r = unit_find_free_netclass_cgroup(u, &u->cgroup_netclass_id);
                        if (r < 0)
                                return r;

                        log_debug("Dynamically assigned netclass cgroup id %" PRIu32 " to %s", u->cgroup_netclass_id, u->id);
                }

                break;
        }

        r = hashmap_ensure_allocated(&u->manager->cgroup_netclass_registry, &trivial_hash_ops);
        if (r < 0)
                return r;

        key = UINT32_TO_PTR(u->cgroup_netclass_id);
        first = hashmap_get(u->manager->cgroup_netclass_registry, key);

        if (first) {
                LIST_PREPEND(cgroup_netclass, first, u);
                return hashmap_replace(u->manager->cgroup_netclass_registry, key, u);
        }

        return hashmap_put(u->manager->cgroup_netclass_registry, key, u);
}

int unit_remove_from_netclass_cgroup(Unit *u) {
        Unit *head;
        void *key;

        assert(u);

        key = UINT32_TO_PTR(u->cgroup_netclass_id);

        LIST_FIND_HEAD(cgroup_netclass, u, head);
        LIST_REMOVE(cgroup_netclass, head, u);

        if (head)
                return hashmap_replace(u->manager->cgroup_netclass_registry, key, head);

        hashmap_remove(u->manager->cgroup_netclass_registry, key);

        return 0;
}

/* Check if necessary controllers and attributes for a unit are in place.
 *
 * If so, do nothing.
 * If not, create paths, move processes over, and set attributes.
 *
 * Returns 0 on success and < 0 on failure. */
static int unit_realize_cgroup_now(Unit *u, ManagerState state) {
        CGroupMask target_mask, enable_mask;
        int r;

        assert(u);

        if (u->in_cgroup_queue) {
                LIST_REMOVE(cgroup_queue, u->manager->cgroup_queue, u);
                u->in_cgroup_queue = false;
        }

        target_mask = unit_get_target_mask(u);
        if (unit_has_mask_realized(u, target_mask))
                return 0;

        /* First, realize parents */
        if (UNIT_ISSET(u->slice)) {
                r = unit_realize_cgroup_now(UNIT_DEREF(u->slice), state);
                if (r < 0)
                        return r;
        }

        /* And then do the real work */
        enable_mask = unit_get_enable_mask(u);
        r = unit_create_cgroup(u, target_mask, enable_mask);
        if (r < 0)
                return r;

        /* Finally, apply the necessary attributes. */
        cgroup_context_apply(unit_get_cgroup_context(u), target_mask, u->cgroup_path, u->cgroup_netclass_id, state);

        return 0;
}

static void unit_add_to_cgroup_queue(Unit *u) {

        if (u->in_cgroup_queue)
                return;

        LIST_PREPEND(cgroup_queue, u->manager->cgroup_queue, u);
        u->in_cgroup_queue = true;
}

unsigned manager_dispatch_cgroup_queue(Manager *m) {
        ManagerState state;
        unsigned n = 0;
        Unit *i;
        int r;

        state = manager_state(m);

        while ((i = m->cgroup_queue)) {
                assert(i->in_cgroup_queue);

                r = unit_realize_cgroup_now(i, state);
                if (r < 0)
                        log_warning_errno(r, "Failed to realize cgroups for queued unit %s, ignoring: %m", i->id);

                n++;
        }

        return n;
}

static void unit_queue_siblings(Unit *u) {
        Unit *slice;

        /* This adds the siblings of the specified unit and the
         * siblings of all parent units to the cgroup queue. (But
         * neither the specified unit itself nor the parents.) */

        while ((slice = UNIT_DEREF(u->slice))) {
                Iterator i;
                Unit *m;

                SET_FOREACH(m, slice->dependencies[UNIT_BEFORE], i) {
                        if (m == u)
                                continue;

                        /* Skip units that have a dependency on the slice
                         * but aren't actually in it. */
                        if (UNIT_DEREF(m->slice) != slice)
                                continue;

                        /* No point in doing cgroup application for units
                         * without active processes. */
                        if (UNIT_IS_INACTIVE_OR_FAILED(unit_active_state(m)))
                                continue;

                        /* If the unit doesn't need any new controllers
                         * and has current ones realized, it doesn't need
                         * any changes. */
                        if (unit_has_mask_realized(m, unit_get_target_mask(m)))
                                continue;

                        unit_add_to_cgroup_queue(m);
                }

                u = slice;
        }
}

int unit_realize_cgroup(Unit *u) {
        assert(u);

        if (!UNIT_HAS_CGROUP_CONTEXT(u))
                return 0;

        /* So, here's the deal: when realizing the cgroups for this
         * unit, we need to first create all parents, but there's more
         * actually: for the weight-based controllers we also need to
         * make sure that all our siblings (i.e. units that are in the
         * same slice as we are) have cgroups, too. Otherwise, things
         * would become very uneven as each of their processes would
         * get as much resources as all our group together. This call
         * will synchronously create the parent cgroups, but will
         * defer work on the siblings to the next event loop
         * iteration. */

        /* Add all sibling slices to the cgroup queue. */
        unit_queue_siblings(u);

        /* And realize this one now (and apply the values) */
        return unit_realize_cgroup_now(u, manager_state(u->manager));
}

void unit_release_cgroup(Unit *u) {
        assert(u);

        /* Forgets all cgroup details for this cgroup */

        if (u->cgroup_path) {
                (void) hashmap_remove(u->manager->cgroup_unit, u->cgroup_path);
                u->cgroup_path = mfree(u->cgroup_path);
        }

        if (u->cgroup_inotify_wd >= 0) {
                if (inotify_rm_watch(u->manager->cgroup_inotify_fd, u->cgroup_inotify_wd) < 0)
                        log_unit_debug_errno(u, errno, "Failed to remove cgroup inotify watch %i for %s, ignoring: %m", u->cgroup_inotify_wd, u->id);

                (void) hashmap_remove(u->manager->cgroup_inotify_wd_unit, INT_TO_PTR(u->cgroup_inotify_wd));
                u->cgroup_inotify_wd = -1;
        }
}

void unit_prune_cgroup(Unit *u) {
        bool is_root_slice;
        int r;

        assert(u);

        /* Removes the cgroup, if empty and possible, and stops watching it. */

        if (!u->cgroup_path)
                return;

        is_root_slice = unit_has_name(u, SPECIAL_ROOT_SLICE);

        r = cg_trim_everywhere(u->manager->cgroup_supported, u->cgroup_path, !is_root_slice);
        if (r < 0) {
                log_debug_errno(r, "Failed to destroy cgroup %s, ignoring: %m", u->cgroup_path);
                return;
        }

        if (is_root_slice)
                return;

        unit_release_cgroup(u);

        u->cgroup_realized = false;
        u->cgroup_realized_mask = 0;
}

int unit_search_main_pid(Unit *u, pid_t *ret) {
        _cleanup_fclose_ FILE *f = NULL;
        pid_t pid = 0, npid, mypid;
        int r;

        assert(u);
        assert(ret);

        if (!u->cgroup_path)
                return -ENXIO;

        r = cg_enumerate_processes(SYSTEMD_CGROUP_CONTROLLER, u->cgroup_path, &f);
        if (r < 0)
                return r;

        mypid = getpid();
        while (cg_read_pid(f, &npid) > 0) {
                pid_t ppid;

                if (npid == pid)
                        continue;

                /* Ignore processes that aren't our kids */
                if (get_parent_of_pid(npid, &ppid) >= 0 && ppid != mypid)
                        continue;

                if (pid != 0)
                        /* Dang, there's more than one daemonized PID
                         * in this group, so we don't know what process
                         * is the main process. */
                        return -ENODATA;

                pid = npid;
        }

        *ret = pid;
        return 0;
}

static int unit_watch_pids_in_path(Unit *u, const char *path) {
        _cleanup_closedir_ DIR *d = NULL;
        _cleanup_fclose_ FILE *f = NULL;
        int ret = 0, r;

        assert(u);
        assert(path);

        r = cg_enumerate_processes(SYSTEMD_CGROUP_CONTROLLER, path, &f);
        if (r < 0)
                ret = r;
        else {
                pid_t pid;

                while ((r = cg_read_pid(f, &pid)) > 0) {
                        r = unit_watch_pid(u, pid);
                        if (r < 0 && ret >= 0)
                                ret = r;
                }

                if (r < 0 && ret >= 0)
                        ret = r;
        }

        r = cg_enumerate_subgroups(SYSTEMD_CGROUP_CONTROLLER, path, &d);
        if (r < 0) {
                if (ret >= 0)
                        ret = r;
        } else {
                char *fn;

                while ((r = cg_read_subgroup(d, &fn)) > 0) {
                        _cleanup_free_ char *p = NULL;

                        p = strjoin(path, "/", fn, NULL);
                        free(fn);

                        if (!p)
                                return -ENOMEM;

                        r = unit_watch_pids_in_path(u, p);
                        if (r < 0 && ret >= 0)
                                ret = r;
                }

                if (r < 0 && ret >= 0)
                        ret = r;
        }

        return ret;
}

int unit_watch_all_pids(Unit *u) {
        assert(u);

        /* Adds all PIDs from our cgroup to the set of PIDs we
         * watch. This is a fallback logic for cases where we do not
         * get reliable cgroup empty notifications: we try to use
         * SIGCHLD as replacement. */

        if (!u->cgroup_path)
                return -ENOENT;

        if (cg_unified() > 0) /* On unified we can use proper notifications */
                return 0;

        return unit_watch_pids_in_path(u, u->cgroup_path);
}

int unit_notify_cgroup_empty(Unit *u) {
        int r;

        assert(u);

        if (!u->cgroup_path)
                return 0;

        r = cg_is_empty_recursive(SYSTEMD_CGROUP_CONTROLLER, u->cgroup_path);
        if (r <= 0)
                return r;

        unit_add_to_gc_queue(u);

        if (UNIT_VTABLE(u)->notify_cgroup_empty)
                UNIT_VTABLE(u)->notify_cgroup_empty(u);

        return 0;
}

static int on_cgroup_inotify_event(sd_event_source *s, int fd, uint32_t revents, void *userdata) {
        Manager *m = userdata;

        assert(s);
        assert(fd >= 0);
        assert(m);

        for (;;) {
                union inotify_event_buffer buffer;
                struct inotify_event *e;
                ssize_t l;

                l = read(fd, &buffer, sizeof(buffer));
                if (l < 0) {
                        if (errno == EINTR || errno == EAGAIN)
                                return 0;

                        return log_error_errno(errno, "Failed to read control group inotify events: %m");
                }

                FOREACH_INOTIFY_EVENT(e, buffer, l) {
                        Unit *u;

                        if (e->wd < 0)
                                /* Queue overflow has no watch descriptor */
                                continue;

                        if (e->mask & IN_IGNORED)
                                /* The watch was just removed */
                                continue;

                        u = hashmap_get(m->cgroup_inotify_wd_unit, INT_TO_PTR(e->wd));
                        if (!u) /* Note that inotify might deliver
                                 * events for a watch even after it
                                 * was removed, because it was queued
                                 * before the removal. Let's ignore
                                 * this here safely. */
                                continue;

                        (void) unit_notify_cgroup_empty(u);
                }
        }
}

int manager_setup_cgroup(Manager *m) {
        _cleanup_free_ char *path = NULL;
        CGroupController c;
        int r, unified;
        char *e;

        assert(m);

        /* 1. Determine hierarchy */
        m->cgroup_root = mfree(m->cgroup_root);
        r = cg_pid_get_path(SYSTEMD_CGROUP_CONTROLLER, 0, &m->cgroup_root);
        if (r < 0)
                return log_error_errno(r, "Cannot determine cgroup we are running in: %m");

#if 0 /// elogind does not support systemd scopes and slices
        /* Chop off the init scope, if we are already located in it */
        e = endswith(m->cgroup_root, "/" SPECIAL_INIT_SCOPE);

        /* LEGACY: Also chop off the system slice if we are in
         * it. This is to support live upgrades from older systemd
         * versions where PID 1 was moved there. Also see
         * cg_get_root_path(). */
        if (!e && m->running_as == MANAGER_SYSTEM) {
                e = endswith(m->cgroup_root, "/" SPECIAL_SYSTEM_SLICE);
                if (!e)
                        e = endswith(m->cgroup_root, "/system"); /* even more legacy */
        }
        if (e)
                *e = 0;
#endif // 0

        /* And make sure to store away the root value without trailing
         * slash, even for the root dir, so that we can easily prepend
         * it everywhere. */
        while ((e = endswith(m->cgroup_root, "/")))
                *e = 0;
        log_debug_elogind("Cgroup Controller \"%s\" -> root \"%s\"",
                          SYSTEMD_CGROUP_CONTROLLER, m->cgroup_root);

        /* 2. Show data */
        r = cg_get_path(SYSTEMD_CGROUP_CONTROLLER, m->cgroup_root, NULL, &path);
        if (r < 0)
                return log_error_errno(r, "Cannot find cgroup mount point: %m");

        unified = cg_unified();
        if (unified < 0)
                return log_error_errno(unified, "Couldn't determine if we are running in the unified hierarchy: %m");
        if (unified > 0)
                log_debug("Unified cgroup hierarchy is located at %s.", path);
        else
                log_debug("Using cgroup controller " SYSTEMD_CGROUP_CONTROLLER ". File system hierarchy is at %s.", path);

        if (!m->test_run) {
                const char *scope_path;

                /* 3. Install agent */
                if (unified) {

                        /* In the unified hierarchy we can get
                         * cgroup empty notifications via inotify. */

#if 0 /// elogind does not support the unified hierarchy, yet.
                        m->cgroup_inotify_event_source = sd_event_source_unref(m->cgroup_inotify_event_source);
                        m->cgroup_inotify_fd = safe_close(m->cgroup_inotify_fd);

                        m->cgroup_inotify_fd = inotify_init1(IN_NONBLOCK|IN_CLOEXEC);
                        if (m->cgroup_inotify_fd < 0)
                                return log_error_errno(errno, "Failed to create control group inotify object: %m");

                        r = sd_event_add_io(m->event, &m->cgroup_inotify_event_source, m->cgroup_inotify_fd, EPOLLIN, on_cgroup_inotify_event, m);
                        if (r < 0)
                                return log_error_errno(r, "Failed to watch control group inotify object: %m");

                        r = sd_event_source_set_priority(m->cgroup_inotify_event_source, SD_EVENT_PRIORITY_IDLE - 5);
                        if (r < 0)
                                return log_error_errno(r, "Failed to set priority of inotify event source: %m");

                        (void) sd_event_source_set_description(m->cgroup_inotify_event_source, "cgroup-inotify");
#else
                        return log_error_errno(EOPNOTSUPP, "Unified cgroup hierarchy not supported: %m");
#endif // 0
                } else if (m->running_as == MANAGER_SYSTEM) {

                        /* On the legacy hierarchy we only get
                         * notifications via cgroup agents. (Which
                         * isn't really reliable, since it does not
                         * generate events when control groups with
                         * children run empty.) */

                        r = cg_install_release_agent(SYSTEMD_CGROUP_CONTROLLER, ELOGIND_CGROUP_AGENT_PATH);
                        if (r < 0)
                                log_warning_errno(r, "Failed to install release agent, ignoring: %m");
                        else if (r > 0)
                                log_debug("Installed release agent.");
                        else
                                log_debug("Release agent already installed.");
                }

#if 0 /// elogind is not meant to run in systemd init scope
                /* 4. Make sure we are in the special "init.scope" unit in the root slice. */
                scope_path = strjoina(m->cgroup_root, "/" SPECIAL_INIT_SCOPE);
                r = cg_create_and_attach(SYSTEMD_CGROUP_CONTROLLER, scope_path, 0);
#else
                if (streq(SYSTEMD_CGROUP_CONTROLLER, "name=elogind"))
                        // we are our own cgroup controller
                        scope_path = strjoina("");
                else if (streq(m->cgroup_root, "/elogind"))
                        // root already is our cgroup
                        scope_path = strjoina(m->cgroup_root);
                else
                        // we have to create our own group
                        scope_path = strjoina(m->cgroup_root, "/elogind");
                r = cg_create_and_attach(SYSTEMD_CGROUP_CONTROLLER, scope_path, 0);
#endif // 0
                if (r < 0)
                        return log_error_errno(r, "Failed to create %s control group: %m", scope_path);
                log_debug_elogind("Created control group \"%s\"", scope_path);

                /* also, move all other userspace processes remaining
                 * in the root cgroup into that scope. */
                if (!streq(m->cgroup_root, scope_path)) {
                        r = cg_migrate(SYSTEMD_CGROUP_CONTROLLER, m->cgroup_root, SYSTEMD_CGROUP_CONTROLLER, scope_path, false);
                        if (r < 0)
                                log_warning_errno(r, "Couldn't move remaining userspace processes, ignoring: %m");
                }

                /* 5. And pin it, so that it cannot be unmounted */
                safe_close(m->pin_cgroupfs_fd);
                m->pin_cgroupfs_fd = open(path, O_RDONLY|O_CLOEXEC|O_DIRECTORY|O_NOCTTY|O_NONBLOCK);
                if (m->pin_cgroupfs_fd < 0)
                        return log_error_errno(errno, "Failed to open pin file: %m");

                /* 6. Always enable hierarchical support if it exists... */
                if (!unified)
                        (void) cg_set_attribute("memory", "/", "memory.use_hierarchy", "1");
        }

        /* 7. Figure out which controllers are supported */
        r = cg_mask_supported(&m->cgroup_supported);
        if (r < 0)
                return log_error_errno(r, "Failed to determine supported controllers: %m");

        for (c = 0; c < _CGROUP_CONTROLLER_MAX; c++)
                log_debug("Controller '%s' supported: %s", cgroup_controller_to_string(c), yes_no(m->cgroup_supported & CGROUP_CONTROLLER_TO_MASK(c)));

        return 0;
}

void manager_shutdown_cgroup(Manager *m, bool delete) {
        assert(m);

        /* We can't really delete the group, since we are in it. But
         * let's trim it. */
        if (delete && m->cgroup_root)
                (void) cg_trim(SYSTEMD_CGROUP_CONTROLLER, m->cgroup_root, false);

#if 0 /// elogind does not support the unified hierarchy, yet.
        m->cgroup_inotify_wd_unit = hashmap_free(m->cgroup_inotify_wd_unit);

        m->cgroup_inotify_event_source = sd_event_source_unref(m->cgroup_inotify_event_source);
        m->cgroup_inotify_fd = safe_close(m->cgroup_inotify_fd);
#endif // 0

        m->pin_cgroupfs_fd = safe_close(m->pin_cgroupfs_fd);

        m->cgroup_root = mfree(m->cgroup_root);
}

/// UNNEEDED by elogind

Unit* manager_get_unit_by_cgroup(Manager *m, const char *cgroup) {
        char *p;
        Unit *u;

        assert(m);
        assert(cgroup);

        u = hashmap_get(m->cgroup_unit, cgroup);
        if (u)
                return u;

        p = strdupa(cgroup);
        for (;;) {
                char *e;

                e = strrchr(p, '/');
                if (!e || e == p)
                        return hashmap_get(m->cgroup_unit, SPECIAL_ROOT_SLICE);

                *e = 0;

                u = hashmap_get(m->cgroup_unit, p);
                if (u)
                        return u;
        }
}
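
/* Example: for "/system.slice/foo.service/helper" the loop above tries the
 * full path first, then "/system.slice/foo.service", then "/system.slice",
 * and finally falls back to the root slice, so processes in sub-cgroups are
 * still attributed to the unit owning the closest parent cgroup. */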

Unit *manager_get_unit_by_pid_cgroup(Manager *m, pid_t pid) {
        _cleanup_free_ char *cgroup = NULL;
        int r;

        assert(m);

        if (pid <= 0)
                return NULL;

        r = cg_pid_get_path(SYSTEMD_CGROUP_CONTROLLER, pid, &cgroup);
        if (r < 0)
                return NULL;

        return manager_get_unit_by_cgroup(m, cgroup);
}

Unit *manager_get_unit_by_pid(Manager *m, pid_t pid) {
        Unit *u;

        assert(m);

        if (pid <= 0)
                return NULL;

        if (pid == 1)
                return hashmap_get(m->units, SPECIAL_INIT_SCOPE);

        u = hashmap_get(m->watch_pids1, PID_TO_PTR(pid));
        if (u)
                return u;

        u = hashmap_get(m->watch_pids2, PID_TO_PTR(pid));
        if (u)
                return u;

        return manager_get_unit_by_pid_cgroup(m, pid);
}

int manager_notify_cgroup_empty(Manager *m, const char *cgroup) {
        Unit *u;

        assert(m);
        assert(cgroup);

        u = manager_get_unit_by_cgroup(m, cgroup);
        if (!u)
                return 0;

        return unit_notify_cgroup_empty(u);
}

int unit_get_memory_current(Unit *u, uint64_t *ret) {
        _cleanup_free_ char *v = NULL;
        int r;

        assert(u);
        assert(ret);

        if (!u->cgroup_path)
                return -ENODATA;

        if ((u->cgroup_realized_mask & CGROUP_MASK_MEMORY) == 0)
                return -ENODATA;

        if (cg_unified() <= 0)
                r = cg_get_attribute("memory", u->cgroup_path, "memory.usage_in_bytes", &v);
        else
                r = cg_get_attribute("memory", u->cgroup_path, "memory.current", &v);
        if (r == -ENOENT)
                return -ENODATA;
        if (r < 0)
                return r;

        return safe_atou64(v, ret);
}

int unit_get_tasks_current(Unit *u, uint64_t *ret) {
        _cleanup_free_ char *v = NULL;
        int r;

        assert(u);
        assert(ret);

        if (!u->cgroup_path)
                return -ENODATA;

        if ((u->cgroup_realized_mask & CGROUP_MASK_PIDS) == 0)
                return -ENODATA;

        r = cg_get_attribute("pids", u->cgroup_path, "pids.current", &v);
        if (r == -ENOENT)
                return -ENODATA;
        if (r < 0)
                return r;

        return safe_atou64(v, ret);
}

static int unit_get_cpu_usage_raw(Unit *u, nsec_t *ret) {
        _cleanup_free_ char *v = NULL;
        uint64_t ns;
        int r;

        assert(u);
        assert(ret);

        if (!u->cgroup_path)
                return -ENODATA;

        if ((u->cgroup_realized_mask & CGROUP_MASK_CPUACCT) == 0)
                return -ENODATA;

        r = cg_get_attribute("cpuacct", u->cgroup_path, "cpuacct.usage", &v);
        if (r == -ENOENT)
                return -ENODATA;
        if (r < 0)
                return r;

        r = safe_atou64(v, &ns);
        if (r < 0)
                return r;

        *ret = ns;
        return 0;
}

int unit_get_cpu_usage(Unit *u, nsec_t *ret) {
        nsec_t ns;
        int r;

        r = unit_get_cpu_usage_raw(u, &ns);
        if (r < 0)
                return r;

        if (ns > u->cpuacct_usage_base)
                ns -= u->cpuacct_usage_base;
        else
                ns = 0;

        *ret = ns;
        return 0;
}

int unit_reset_cpu_usage(Unit *u) {
        nsec_t ns;
        int r;

        assert(u);

        r = unit_get_cpu_usage_raw(u, &ns);
        if (r < 0) {
                u->cpuacct_usage_base = 0;
                return r;
        }

        u->cpuacct_usage_base = ns;
        return 0;
}
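
/* unit_reset_cpu_usage() stores the current raw counter as the new base,
 * so a subsequent unit_get_cpu_usage() reports e.g. 5s - 5s = 0 right
 * after the reset and only the time consumed since then afterwards. */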

bool unit_cgroup_delegate(Unit *u) {
        CGroupContext *c;

        assert(u);

        c = unit_get_cgroup_context(u);
        if (!c)
                return false;

        return c->delegate;
}

void unit_invalidate_cgroup(Unit *u, CGroupMask m) {
        assert(u);

        if (!UNIT_HAS_CGROUP_CONTEXT(u))
                return;

        if (m == 0)
                return;

        if ((u->cgroup_realized_mask & m) == 0)
                return;

        u->cgroup_realized_mask &= ~m;
        unit_add_to_cgroup_queue(u);
}

void manager_invalidate_startup_units(Manager *m) {
        Iterator i;
        Unit *u;

        assert(m);

        SET_FOREACH(u, m->startup_units, i)
                unit_invalidate_cgroup(u, CGROUP_MASK_CPU|CGROUP_MASK_BLKIO);
}

static const char* const cgroup_device_policy_table[_CGROUP_DEVICE_POLICY_MAX] = {
        [CGROUP_AUTO] = "auto",
        [CGROUP_CLOSED] = "closed",
        [CGROUP_STRICT] = "strict",
};

DEFINE_STRING_TABLE_LOOKUP(cgroup_device_policy, CGroupDevicePolicy);