1 /*-*- Mode: C; c-basic-offset: 8; indent-tabs-mode: nil -*-*/
4 This file is part of systemd.
6 Copyright 2013 Lennart Poettering
8 systemd is free software; you can redistribute it and/or modify it
9 under the terms of the GNU Lesser General Public License as published by
10 the Free Software Foundation; either version 2.1 of the License, or
11 (at your option) any later version.
13 systemd is distributed in the hope that it will be useful, but
14 WITHOUT ANY WARRANTY; without even the implied warranty of
15 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
16 Lesser General Public License for more details.
18 You should have received a copy of the GNU Lesser General Public License
19 along with systemd; If not, see <http://www.gnu.org/licenses/>.
24 #include "path-util.h"
26 #include "cgroup-util.h"
/* Set the fields of c to their default values. Per the in-body comment the
 * caller is expected to pass a structure that is already zeroed. */
29 void cgroup_context_init(CGroupContext *c) {
32 /* Initialize everything to the kernel defaults, assuming the
33 * structure is preinitialized to 0 */
/* (uint64_t) -1 marks "no limit": cgroup_context_apply() writes "-1" to
 * memory.limit_in_bytes for this value. */
36 c->memory_limit = (uint64_t) -1;
/* 1000 is the value cgroup_context_get_mask() treats as "unchanged". */
37 c->blockio_weight = 1000;
/* Unlink entry a from the device_allow list of c. */
40 void cgroup_context_free_device_allow(CGroupContext *c, CGroupDeviceAllow *a) {
44 LIST_REMOVE(device_allow, c->device_allow, a);
/* Unlink entry w from the blockio_device_weights list of c. */
49 void cgroup_context_free_blockio_device_weight(CGroupContext *c, CGroupBlockIODeviceWeight *w) {
53 LIST_REMOVE(device_weights, c->blockio_device_weights, w);
/* Unlink entry b from the blockio_device_bandwidths list of c. */
58 void cgroup_context_free_blockio_device_bandwidth(CGroupContext *c, CGroupBlockIODeviceBandwidth *b) {
62 LIST_REMOVE(device_bandwidths, c->blockio_device_bandwidths, b);
/* Release all list members of c. Each loop hands the current list head to
 * the matching cgroup_context_free_*() helper, which unlinks it, and repeats
 * until the list is empty. */
67 void cgroup_context_done(CGroupContext *c) {
70 while (c->blockio_device_weights)
71 cgroup_context_free_blockio_device_weight(c, c->blockio_device_weights);
73 while (c->blockio_device_bandwidths)
74 cgroup_context_free_blockio_device_bandwidth(c, c->blockio_device_bandwidths);
76 while (c->device_allow)
77 cgroup_context_free_device_allow(c, c->device_allow);
/* Write a human-readable dump of the settings in c to f, one "Key=value"
 * line per setting. Every line starts with prefix, which is first passed
 * through strempty() (presumably mapping NULL to ""; confirm). The scalar
 * settings come first, followed by one line per entry of the device_allow,
 * blockio_device_weights and blockio_device_bandwidths lists. */
80 void cgroup_context_dump(CGroupContext *c, FILE* f, const char *prefix) {
81 CGroupBlockIODeviceBandwidth *b;
82 CGroupBlockIODeviceWeight *w;
88 prefix = strempty(prefix);
91 "%sCPUAccounting=%s\n"
92 "%sBlockIOAccounting=%s\n"
93 "%sMemoryAccounting=%s\n"
95 "%sBlockIOWeight=%lu\n"
96 "%sMemoryLimit=%" PRIu64 "\n"
97 "%sDevicePolicy=%s\n",
98 prefix, yes_no(c->cpu_accounting),
99 prefix, yes_no(c->blockio_accounting),
100 prefix, yes_no(c->memory_accounting),
101 prefix, c->cpu_shares,
102 prefix, c->blockio_weight,
103 prefix, c->memory_limit,
104 prefix, cgroup_device_policy_to_string(c->device_policy));
106 LIST_FOREACH(device_allow, a, c->device_allow)
108 "%sDeviceAllow=%s %s%s%s\n",
111 a->r ? "r" : "", a->w ? "w" : "", a->m ? "m" : "");
113 LIST_FOREACH(device_weights, w, c->blockio_device_weights)
/* Each entry must end its own line, like every other setting in the dump;
 * without the newline consecutive entries run together. */
115 "%sBlockIODeviceWeight=%s %lu\n",
120 LIST_FOREACH(device_bandwidths, b, c->blockio_device_bandwidths) {
121 char buf[FORMAT_BYTES_MAX];
/* The key name depends on the direction the limit applies to. */
126 b->read ? "BlockIOReadBandwidth" : "BlockIOWriteBandwidth",
128 format_bytes(buf, sizeof(buf), b->bandwidth));
/* Resolve path p to a block device number, passed back through dev.
 * A block device node is accepted as is. For any other file that lives on a
 * device-backed file system (major(st.st_dev) != 0) the backing device is
 * used, and block_get_whole_disk() is asked for the originating disk.
 * A warning is logged when p cannot be stat'ed or when no block device can
 * be determined for it. */
132 static int lookup_blkio_device(const char *p, dev_t *dev) {
141 log_warning("Couldn't stat device %s: %m", p);
145 if (S_ISBLK(st.st_mode))
147 else if (major(st.st_dev) != 0) {
148 /* If this is not a device node then find the block
149 * device this file is stored on */
152 /* If this is a partition, try to get the originating
154 block_get_whole_disk(*dev, dev);
156 log_warning("%s is not a block device and file system block device cannot be determined or is not local.", p);
/* Allow access to the device node "node" for the "devices" cgroup at path.
 * The node is stat()ed and rejected with a warning unless it is a character
 * or block device. An entry built from the device type ('c' or 'b') and the
 * node's major/minor numbers (presumably followed by the access string acc;
 * the format line is not shown here) is then written to devices.allow.
 * Failures are logged as warnings. */
163 static int whitelist_device(const char *path, const char *node, const char *acc) {
164 char buf[2+DECIMAL_STR_MAX(dev_t)*2+2+4];
171 if (stat(node, &st) < 0) {
/* Include the errno text via %m, as the other stat()/fopen() failure
 * messages in this file do, so the log says why the call failed. */
172 log_warning("Couldn't stat device %s: %m", node);
176 if (!S_ISCHR(st.st_mode) && !S_ISBLK(st.st_mode)) {
177 log_warning("%s is not a device.", node);
183 S_ISCHR(st.st_mode) ? 'c' : 'b',
184 major(st.st_rdev), minor(st.st_rdev),
187 r = cg_set_attribute("devices", path, "devices.allow", buf);
189 log_warning("Failed to set devices.allow on %s: %s", path, strerror(-r));
/* Allow access to devices by driver name for the "devices" cgroup at path.
 * /proc/devices is read line by line; type ('c' or 'b') selects whether the
 * "Character devices:" or the "Block devices:" section is considered. Within
 * a line the leading major number is parsed with safe_atou() and the text
 * after the following whitespace is taken as the driver name. Entries are
 * written to devices.allow. How lines are compared against name is not
 * visible here (TODO confirm). Failures to open or read /proc/devices and to
 * write the attribute are logged as warnings. */
194 static int whitelist_major(const char *path, const char *name, char type, const char *acc) {
195 _cleanup_fclose_ FILE *f = NULL;
202 assert(type == 'b' || type == 'c');
204 f = fopen("/proc/devices", "re");
206 log_warning("Cannot open /proc/devices to resolve %s (%c): %m", name, type);
210 FOREACH_LINE(line, f, goto fail) {
211 char buf[2+DECIMAL_STR_MAX(unsigned)+3+4], *p, *w;
/* Section headers decide which lines belong to the requested type. */
216 if (type == 'c' && streq(line, "Character devices:")) {
221 if (type == 'b' && streq(line, "Block devices:")) {
/* Split "<major> <name>": w ends up at the first whitespace after the
 * number and is then advanced past the whitespace run. */
236 w = strpbrk(p, WHITESPACE);
241 r = safe_atou(p, &maj);
248 w += strspn(w, WHITESPACE);
258 r = cg_set_attribute("devices", path, "devices.allow", buf);
260 log_warning("Failed to set devices.allow on %s: %s", path, strerror(-r));
266 log_warning("Failed to read /proc/devices: %m");
/* Write the settings in c to the cgroup at path, for each controller
 * selected in mask:
 *   - CGROUP_CPU:    cpu.shares (skipped for the root cgroup)
 *   - CGROUP_BLKIO:  blkio.weight, blkio.weight_device and the
 *                    blkio.throttle.{read,write}_bps_device entries
 *   - CGROUP_MEMORY: memory.limit_in_bytes ("-1" when no limit is set)
 *   - CGROUP_DEVICE: devices.deny / devices.allow (skipped for the root
 *                    cgroup)
 * Failures to write an attribute are logged; nothing is returned. */
270 void cgroup_context_apply(CGroupContext *c, CGroupControllerMask mask, const char *path) {
280 /* Some cgroup attributes are not supported on the root cgroup,
281 * hence silently ignore */
282 is_root = isempty(path) || path_equal(path, "/");
284 if ((mask & CGROUP_CPU) && !is_root) {
285 char buf[DECIMAL_STR_MAX(unsigned long) + 1];
287 sprintf(buf, "%lu\n", c->cpu_shares);
288 r = cg_set_attribute("cpu", path, "cpu.shares", buf);
290 log_warning("Failed to set cpu.shares on %s: %s", path, strerror(-r));
293 if (mask & CGROUP_BLKIO) {
/* One buffer, sized for the largest of the three strings formatted into it
 * below. NOTE(review): the second term multiplies by 1 where the other two
 * add 1; confirm whether "+1" was intended. */
294 char buf[MAX3(DECIMAL_STR_MAX(unsigned long)+1,
295 DECIMAL_STR_MAX(dev_t)*2+2+DECIMAL_STR_MAX(unsigned long)*1,
296 DECIMAL_STR_MAX(dev_t)*2+2+DECIMAL_STR_MAX(uint64_t)+1)];
297 CGroupBlockIODeviceWeight *w;
298 CGroupBlockIODeviceBandwidth *b;
301 sprintf(buf, "%lu\n", c->blockio_weight);
302 r = cg_set_attribute("blkio", path, "blkio.weight", buf);
304 log_warning("Failed to set blkio.weight on %s: %s", path, strerror(-r));
306 /* FIXME: no way to reset this list */
307 LIST_FOREACH(device_weights, w, c->blockio_device_weights) {
/* Per-device weights are keyed by "major:minor" of the resolved device. */
310 r = lookup_blkio_device(w->path, &dev);
314 sprintf(buf, "%u:%u %lu", major(dev), minor(dev), w->weight);
315 r = cg_set_attribute("blkio", path, "blkio.weight_device", buf);
317 log_error("Failed to set blkio.weight_device on %s: %s", path, strerror(-r));
321 /* FIXME: no way to reset this list */
322 LIST_FOREACH(device_bandwidths, b, c->blockio_device_bandwidths) {
326 r = lookup_blkio_device(b->path, &dev);
/* Read and write limits live in separate attributes. */
330 a = b->read ? "blkio.throttle.read_bps_device" : "blkio.throttle.write_bps_device";
332 sprintf(buf, "%u:%u %" PRIu64 "\n", major(dev), minor(dev), b->bandwidth);
333 r = cg_set_attribute("blkio", path, a, buf);
335 log_error("Failed to set %s on %s: %s", a, path, strerror(-r));
339 if (mask & CGROUP_MEMORY) {
/* (uint64_t) -1 means "no limit" and is written as "-1". */
340 if (c->memory_limit != (uint64_t) -1) {
341 char buf[DECIMAL_STR_MAX(uint64_t) + 1];
343 sprintf(buf, "%" PRIu64 "\n", c->memory_limit);
344 r = cg_set_attribute("memory", path, "memory.limit_in_bytes", buf);
346 r = cg_set_attribute("memory", path, "memory.limit_in_bytes", "-1");
349 log_error("Failed to set memory.limit_in_bytes on %s: %s", path, strerror(-r));
352 if ((mask & CGROUP_DEVICE) && !is_root) {
353 CGroupDeviceAllow *a;
/* Reset first: deny all devices when an allow list or a non-AUTO policy is
 * configured, otherwise allow all. */
355 if (c->device_allow || c->device_policy != CGROUP_AUTO)
356 r = cg_set_attribute("devices", path, "devices.deny", "a");
358 r = cg_set_attribute("devices", path, "devices.allow", "a");
360 log_warning("Failed to reset devices.list on %s: %s", path, strerror(-r));
/* CLOSED, or AUTO with an explicit allow list, additionally allows a
 * built-in set of device nodes, stored as NUL-separated path/access pairs. */
362 if (c->device_policy == CGROUP_CLOSED ||
363 (c->device_policy == CGROUP_AUTO && c->device_allow)) {
364 static const char auto_devices[] =
368 "/dev/random\0" "rw\0"
369 "/dev/urandom\0" "rw\0";
373 NULSTR_FOREACH_PAIR(x, y, auto_devices)
374 whitelist_device(path, x, y);
377 LIST_FOREACH(device_allow, a, c->device_allow) {
/* Entries are either device node paths under /dev/, or "block-<name>" /
 * "char-<name>" selectors resolved by whitelist_major(); anything else is
 * ignored with a debug message. */
393 if (startswith(a->path, "/dev/"))
394 whitelist_device(path, a->path, acc);
395 else if (startswith(a->path, "block-"))
396 whitelist_major(path, a->path + 6, 'b', acc);
397 else if (startswith(a->path, "char-"))
398 whitelist_major(path, a->path + 5, 'c', acc);
400 log_debug("Ignoring device %s while writing cgroup attribute.", a->path);
/* Compute which controllers the settings in c require. A controller's bit is
 * set when its accounting flag is on or one of its settings differs from the
 * default (cpu_shares 1024, blockio_weight 1000, memory_limit (uint64_t) -1,
 * device_policy CGROUP_AUTO with an empty device_allow list, no per-device
 * blkio entries). */
405 CGroupControllerMask cgroup_context_get_mask(CGroupContext *c) {
406 CGroupControllerMask mask = 0;
408 /* Figure out which controllers we need */
410 if (c->cpu_accounting || c->cpu_shares != 1024)
411 mask |= CGROUP_CPUACCT | CGROUP_CPU;
413 if (c->blockio_accounting ||
414 c->blockio_weight != 1000 ||
415 c->blockio_device_weights ||
416 c->blockio_device_bandwidths)
417 mask |= CGROUP_BLKIO;
419 if (c->memory_accounting ||
420 c->memory_limit != (uint64_t) -1)
421 mask |= CGROUP_MEMORY;
423 if (c->device_allow || c->device_policy != CGROUP_AUTO)
424 mask |= CGROUP_DEVICE;
/* Return the controller mask required by u's own cgroup context, as obtained
 * from unit_get_cgroup_context(). */
429 CGroupControllerMask unit_get_cgroup_mask(Unit *u) {
432 c = unit_get_cgroup_context(u);
436 return cgroup_context_get_mask(c);
/* Return the union of the controller masks needed by the members of u,
 * caching the result in u->cgroup_members_mask. Only slice units have
 * members: for those, the units in u->dependencies[UNIT_BEFORE] whose slice
 * is u contribute their own mask plus, recursively, their members' mask. */
439 CGroupControllerMask unit_get_members_mask(Unit *u) {
/* Serve from the cache while it is valid. */
442 if (u->cgroup_members_mask_valid)
443 return u->cgroup_members_mask;
445 u->cgroup_members_mask = 0;
447 if (u->type == UNIT_SLICE) {
451 SET_FOREACH(member, u->dependencies[UNIT_BEFORE], i) {
/* A "before" dependency alone does not make a unit a member. */
456 if (UNIT_DEREF(member->slice) != u)
459 u->cgroup_members_mask |=
460 unit_get_cgroup_mask(member) |
461 unit_get_members_mask(member);
465 u->cgroup_members_mask_valid = true;
466 return u->cgroup_members_mask;
/* Return the controllers needed by the siblings of u, restricted to
 * CGROUP_CPU, CGROUP_BLKIO and CGROUP_CPUACCT. When u has a slice the mask is
 * the slice's members mask; the other assignment shown uses u's own mask plus
 * its members mask (presumably the no-slice case; the branch keyword is not
 * visible here). */
469 CGroupControllerMask unit_get_siblings_mask(Unit *u) {
470 CGroupControllerMask m;
474 if (UNIT_ISSET(u->slice))
475 m = unit_get_members_mask(UNIT_DEREF(u->slice));
477 m = unit_get_cgroup_mask(u) | unit_get_members_mask(u);
479 /* Sibling propagation is only relevant for weight-based
480 * controllers, so let's mask out everything else */
481 return m & (CGROUP_CPU|CGROUP_BLKIO|CGROUP_CPUACCT);
/* Compute the set of controllers that should be realized for u: the union of
 * its own, its members' and its siblings' masks, limited to the controllers
 * recorded in u->manager->cgroup_supported. */
484 CGroupControllerMask unit_get_target_mask(Unit *u) {
485 CGroupControllerMask mask;
487 mask = unit_get_cgroup_mask(u) | unit_get_members_mask(u) | unit_get_siblings_mask(u);
488 mask &= u->manager->cgroup_supported;
/* Recompute u's subtree mask (own mask | members mask), store it in
 * u->cgroup_subtree_mask, and push the change to the containing slice: the
 * slice's cached members mask is either extended in place with the new bits
 * or invalidated, and the function then recurses into the slice. Nothing is
 * done when the stored subtree mask is valid and unchanged. */
493 /* Recurse from a unit up through its containing slices, propagating
494 * mask bits upward. A unit is also member of itself. */
495 void unit_update_cgroup_members_masks(Unit *u) {
496 CGroupControllerMask m;
501 /* Calculate subtree mask */
502 m = unit_get_cgroup_mask(u) | unit_get_members_mask(u);
504 /* See if anything changed from the previous invocation. If
505 * not, we're done. */
506 if (u->cgroup_subtree_mask_valid && m == u->cgroup_subtree_mask)
510 u->cgroup_subtree_mask_valid &&
511 ((m & ~u->cgroup_subtree_mask) != 0) &&
512 ((~m & u->cgroup_subtree_mask) == 0);
514 u->cgroup_subtree_mask = m;
515 u->cgroup_subtree_mask_valid = true;
517 if (UNIT_ISSET(u->slice)) {
518 Unit *s = UNIT_DEREF(u->slice);
521 /* There's more set now than before. We
522 * propagate the new mask to the parent's mask
523 * (not caring if it actually was valid or
526 s->cgroup_members_mask |= m;
529 /* There's less set now than before (or we
530 * don't know), we need to recalculate
531 * everything, so let's invalidate the
532 * parent's members mask */
534 s->cgroup_members_mask_valid = false;
536 /* And now make sure that this change also hits our
538 unit_update_cgroup_members_masks(s);
/* Callback handed to cg_migrate_everywhere(). Returns the cgroup_path of a
 * unit that has a path, is realized, and whose realized mask covers every
 * bit of mask. When the current unit does not qualify the search moves on to
 * its slice. The starting unit presumably comes from userdata (that
 * assignment is not visible here; TODO confirm). */
542 static const char *migrate_callback(CGroupControllerMask mask, void *userdata) {
549 if (u->cgroup_path &&
550 u->cgroup_realized &&
551 (u->cgroup_realized_mask & mask) == mask)
552 return u->cgroup_path;
554 u = UNIT_DEREF(u->slice);
/* Create the cgroup of u for the controllers in mask.
 * The default cgroup path of the unit is registered in the manager's
 * cgroup_unit hashmap and stored in u->cgroup_path, the group is created
 * with cg_create_everywhere(), the unit is marked as realized with mask, and
 * finally cg_migrate_everywhere() is called with migrate_callback.
 * Failures of the hashmap insertion and of the creation are logged as
 * errors; a failed migration is only logged as a warning. */
560 static int unit_create_cgroups(Unit *u, CGroupControllerMask mask) {
561 _cleanup_free_ char *path = NULL;
566 path = unit_default_cgroup_path(u);
570 r = hashmap_put(u->manager->cgroup_unit, path, u);
572 log_error(r == -EEXIST ? "cgroup %s exists already: %s" : "hashmap_put failed for %s: %s", path, strerror(-r));
576 u->cgroup_path = path;
580 /* First, create our own group */
581 r = cg_create_everywhere(u->manager->cgroup_supported, mask, u->cgroup_path);
583 log_error("Failed to create cgroup %s: %s", u->cgroup_path, strerror(-r));
587 /* Keep track that this is now realized */
588 u->cgroup_realized = true;
589 u->cgroup_realized_mask = mask;
591 /* Then, possibly move things over */
592 r = cg_migrate_everywhere(u->manager->cgroup_supported, u->cgroup_path, u->cgroup_path, migrate_callback, u);
/* Only the destination path is reported, so the message names just that. */
594 log_warning("Failed to migrate cgroup to %s: %s", u->cgroup_path, strerror(-r));
/* True when u's cgroup is realized and was realized with exactly mask
 * (an exact match, not a superset). */
599 static bool unit_has_mask_realized(Unit *u, CGroupControllerMask mask) {
602 return u->cgroup_realized && u->cgroup_realized_mask == mask;
605 /* Check if necessary controllers and attributes for a unit are in place.
608 * If not, create paths, move processes over, and set attributes.
610 * Returns 0 on success and < 0 on failure. */
611 static int unit_realize_cgroup_now(Unit *u) {
612 CGroupControllerMask mask;
/* The unit is handled right here, so take it off the deferred queue. */
617 if (u->in_cgroup_queue) {
618 LIST_REMOVE(cgroup_queue, u->manager->cgroup_queue, u);
619 u->in_cgroup_queue = false;
622 mask = unit_get_target_mask(u);
/* Checks whether the cgroup is already realized with exactly this mask. */
624 if (unit_has_mask_realized(u, mask))
627 /* First, realize parents */
628 if (UNIT_ISSET(u->slice)) {
629 r = unit_realize_cgroup_now(UNIT_DEREF(u->slice));
634 /* And then do the real work */
635 r = unit_create_cgroups(u, mask);
639 /* Finally, apply the necessary attributes. */
640 cgroup_context_apply(unit_get_cgroup_context(u), mask, u->cgroup_path);
/* Put u at the head of the manager's cgroup queue and flag it as queued.
 * The in_cgroup_queue flag is checked first so that a unit is presumably not
 * queued twice (the statement under the check is not visible here). */
645 static void unit_add_to_cgroup_queue(Unit *u) {
647 if (u->in_cgroup_queue)
650 LIST_PREPEND(cgroup_queue, u->manager->cgroup_queue, u);
651 u->in_cgroup_queue = true;
/* Drain the manager's cgroup queue: while the queue has a head, realize that
 * unit's cgroup with unit_realize_cgroup_now() (which removes the unit from
 * the queue). A failure for one unit is logged as a warning. */
654 unsigned manager_dispatch_cgroup_queue(Manager *m) {
659 while ((i = m->cgroup_queue)) {
660 assert(i->in_cgroup_queue);
662 r = unit_realize_cgroup_now(i);
664 log_warning("Failed to realize cgroups for queued unit %s: %s", i->id, strerror(-r));
/* Queue the siblings of u, and the siblings of each of its ancestor slices,
 * for deferred cgroup realization via unit_add_to_cgroup_queue(). The checks
 * below test for three cases: units not actually in the slice, inactive or
 * failed units, and units whose target mask is already realized. */
672 static void unit_queue_siblings(Unit *u) {
675 /* This adds the siblings of the specified unit and the
676 * siblings of all parent units to the cgroup queue. (But
677 * neither the specified unit itself nor the parents.) */
679 while ((slice = UNIT_DEREF(u->slice))) {
683 SET_FOREACH(m, slice->dependencies[UNIT_BEFORE], i) {
687 /* Skip units that have a dependency on the slice
688 * but aren't actually in it. */
689 if (UNIT_DEREF(m->slice) != slice)
692 /* No point in doing cgroup application for units
693 * without active processes. */
694 if (UNIT_IS_INACTIVE_OR_FAILED(unit_active_state(m)))
697 /* If the unit doesn't need any new controllers
698 * and has current ones realized, it doesn't need
700 if (unit_has_mask_realized(m, unit_get_target_mask(m)))
703 unit_add_to_cgroup_queue(m);
/* Realize the cgroup of u: its siblings are put on the cgroup queue for
 * later processing, then u itself (and, through unit_realize_cgroup_now(),
 * its parent slices) is realized synchronously. Returns the result of
 * unit_realize_cgroup_now(). */
710 int unit_realize_cgroup(Unit *u) {
715 c = unit_get_cgroup_context(u);
719 /* So, here's the deal: when realizing the cgroups for this
720 * unit, we need to first create all parents, but there's more
721 * actually: for the weight-based controllers we also need to
722 * make sure that all our siblings (i.e. units that are in the
723 * same slice as we are) have cgroups, too. Otherwise, things
724 * would become very uneven as each of their processes would
725 * get as much resources as all our group together. This call
726 * will synchronously create the parent cgroups, but will
727 * defer work on the siblings to the next event loop
730 /* Add all sibling slices to the cgroup queue. */
731 unit_queue_siblings(u);
733 /* And realize this one now (and apply the values) */
734 return unit_realize_cgroup_now(u);
/* Tear down the cgroup of u: trims it in all supported hierarchies, removes
 * the path from the manager's cgroup_unit hashmap, frees the stored path and
 * resets the realized state. A trim failure is only logged at debug level. */
737 void unit_destroy_cgroup(Unit *u) {
/* The last argument is false only for the unit named SPECIAL_ROOT_SLICE. */
745 r = cg_trim_everywhere(u->manager->cgroup_supported, u->cgroup_path, !unit_has_name(u, SPECIAL_ROOT_SLICE));
747 log_debug("Failed to destroy cgroup %s: %s", u->cgroup_path, strerror(-r));
749 hashmap_remove(u->manager->cgroup_unit, u->cgroup_path);
751 free(u->cgroup_path);
752 u->cgroup_path = NULL;
753 u->cgroup_realized = false;
754 u->cgroup_realized_mask = 0;
/* Look for a candidate main PID among the processes in u's cgroup. The
 * processes are enumerated with cg_enumerate_processes()/cg_read_pid().
 * Processes whose parent could be determined and is not mypid are ignored.
 * Per the in-body comment, finding more than one remaining candidate means
 * the main process cannot be determined. */
758 pid_t unit_search_main_pid(Unit *u) {
759 _cleanup_fclose_ FILE *f = NULL;
760 pid_t pid = 0, npid, mypid;
767 if (cg_enumerate_processes(SYSTEMD_CGROUP_CONTROLLER, u->cgroup_path, &f) < 0)
771 while (cg_read_pid(f, &npid) > 0) {
777 /* Ignore processes that aren't our kids */
778 if (get_parent_of_pid(npid, &ppid) >= 0 && ppid != mypid)
782 /* Dang, there's more than one daemonized PID
783 in this group, so we don't know what process
784 is the main process. */
/* Set up the manager's own cgroup state. Following the numbered steps in the
 * body: check that /sys/fs/cgroup/systemd is a mount point, determine the
 * cgroup the manager runs in and store it in m->cgroup_root (cutting off a
 * legacy system slice suffix, and storing "" for the root), install the
 * release agent for system instances, create and join the root cgroup, pin
 * the hierarchy by keeping a directory fd open in m->pin_cgroupfs_fd, record
 * the supported controllers in m->cgroup_supported, and turn on
 * memory.use_hierarchy. */
795 int manager_setup_cgroup(Manager *m) {
796 _cleanup_free_ char *path = NULL;
802 /* 0. Be nice to Ingo Molnar #628004 */
803 if (path_is_mount_point("/sys/fs/cgroup/systemd", false) <= 0) {
804 log_warning("No control group support available, not creating root group.");
808 /* 1. Determine hierarchy */
809 free(m->cgroup_root);
810 m->cgroup_root = NULL;
812 r = cg_pid_get_path(SYSTEMD_CGROUP_CONTROLLER, 0, &m->cgroup_root);
814 log_error("Cannot determine cgroup we are running in: %s", strerror(-r));
818 /* LEGACY: Already in /system.slice? If so, let's cut this
819 * off. This is to support live upgrades from older systemd
820 * versions where PID 1 was moved there. */
821 if (m->running_as == SYSTEMD_SYSTEM) {
822 e = endswith(m->cgroup_root, "/" SPECIAL_SYSTEM_SLICE);
824 e = endswith(m->cgroup_root, "/system");
829 /* And make sure to store away the root value without trailing
830 * slash, even for the root dir, so that we can easily prepend
832 if (streq(m->cgroup_root, "/"))
833 m->cgroup_root[0] = 0;
836 r = cg_get_path(SYSTEMD_CGROUP_CONTROLLER, m->cgroup_root, NULL, &path);
838 log_error("Cannot find cgroup mount point: %s", strerror(-r));
842 log_debug("Using cgroup controller " SYSTEMD_CGROUP_CONTROLLER ". File system hierarchy is at %s.", path);
844 /* 3. Install agent */
845 if (m->running_as == SYSTEMD_SYSTEM) {
846 r = cg_install_release_agent(SYSTEMD_CGROUP_CONTROLLER, SYSTEMD_CGROUP_AGENT_PATH);
848 log_warning("Failed to install release agent, ignoring: %s", strerror(-r));
850 log_debug("Installed release agent.");
852 log_debug("Release agent already installed.");
855 /* 4. Make sure we are in the root cgroup */
856 r = cg_create_and_attach(SYSTEMD_CGROUP_CONTROLLER, m->cgroup_root, 0);
858 log_error("Failed to create root cgroup hierarchy: %s", strerror(-r));
862 /* 5. And pin it, so that it cannot be unmounted */
863 if (m->pin_cgroupfs_fd >= 0)
864 close_nointr_nofail(m->pin_cgroupfs_fd);
866 m->pin_cgroupfs_fd = open(path, O_RDONLY|O_CLOEXEC|O_DIRECTORY|O_NOCTTY|O_NONBLOCK);
868 log_error("Failed to open pin file: %m");
872 /* 6. Figure out which controllers are supported */
873 m->cgroup_supported = cg_mask_supported();
875 /* 7. Always enable hierarchical support if it exists... */
876 cg_set_attribute("memory", "/", "memory.use_hierarchy", "1");
/* Undo manager_setup_cgroup(): when delete is set and a root cgroup is
 * known, trim it with cg_trim(); close the fd that pins the cgroup file
 * system and reset it to -1; free m->cgroup_root and reset it to NULL. */
881 void manager_shutdown_cgroup(Manager *m, bool delete) {
884 /* We can't really delete the group, since we are in it. But
886 if (delete && m->cgroup_root)
887 cg_trim(SYSTEMD_CGROUP_CONTROLLER, m->cgroup_root, false);
889 if (m->pin_cgroupfs_fd >= 0) {
890 close_nointr_nofail(m->pin_cgroupfs_fd);
891 m->pin_cgroupfs_fd = -1;
894 free(m->cgroup_root);
895 m->cgroup_root = NULL;
/* Look up the unit that owns cgroup in the manager's cgroup_unit hashmap.
 * The path itself is tried first; a further lookup uses p, presumably a
 * path derived from cgroup (its derivation is not visible here; TODO
 * confirm). */
898 Unit* manager_get_unit_by_cgroup(Manager *m, const char *cgroup) {
905 u = hashmap_get(m->cgroup_unit, cgroup);
919 u = hashmap_get(m->cgroup_unit, p);
/* Find the unit a process belongs to: the cgroup path of pid in the systemd
 * hierarchy is resolved with cg_pid_get_path() and then mapped to a unit
 * with manager_get_unit_by_cgroup(). */
925 Unit *manager_get_unit_by_pid(Manager *m, pid_t pid) {
926 _cleanup_free_ char *cgroup = NULL;
934 r = cg_pid_get_path(SYSTEMD_CGROUP_CONTROLLER, pid, &cgroup);
938 return manager_get_unit_by_cgroup(m, cgroup);
/* Handle a notification about cgroup having run empty. The owning unit is
 * looked up with manager_get_unit_by_cgroup(), its cgroup_path is checked
 * with cg_is_empty_recursive(), the unit type's notify_cgroup_empty hook is
 * invoked when the vtable provides one, and the unit is added to the GC
 * queue. The conditions guarding these steps are not all visible here. */
941 int manager_notify_cgroup_empty(Manager *m, const char *cgroup) {
948 u = manager_get_unit_by_cgroup(m, cgroup);
950 r = cg_is_empty_recursive(SYSTEMD_CGROUP_CONTROLLER, u->cgroup_path, true);
952 if (UNIT_VTABLE(u)->notify_cgroup_empty)
953 UNIT_VTABLE(u)->notify_cgroup_empty(u);
955 unit_add_to_gc_queue(u);
/* String names of the CGroupDevicePolicy values, indexed by enum value.
 * DEFINE_STRING_TABLE_LOOKUP presumably generates the conversion helpers for
 * this table; cgroup_device_policy_to_string() is used by
 * cgroup_context_dump(). */
962 static const char* const cgroup_device_policy_table[_CGROUP_DEVICE_POLICY_MAX] = {
963 [CGROUP_AUTO] = "auto",
964 [CGROUP_CLOSED] = "closed",
965 [CGROUP_STRICT] = "strict",
968 DEFINE_STRING_TABLE_LOOKUP(cgroup_device_policy, CGroupDevicePolicy);