1 /*-*- Mode: C; c-basic-offset: 8; indent-tabs-mode: nil -*-*/
4 This file is part of systemd.
6 Copyright 2013 Lennart Poettering
8 systemd is free software; you can redistribute it and/or modify it
9 under the terms of the GNU Lesser General Public License as published by
10 the Free Software Foundation; either version 2.1 of the License, or
11 (at your option) any later version.
13 systemd is distributed in the hope that it will be useful, but
14 WITHOUT ANY WARRANTY; without even the implied warranty of
15 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
16 Lesser General Public License for more details.
18 You should have received a copy of the GNU Lesser General Public License
19 along with systemd; If not, see <http://www.gnu.org/licenses/>.
24 #include "path-util.h"
26 #include "cgroup-util.h"
/* Set the non-zero defaults of a CGroupContext: memory_limit becomes
 * (uint64_t) -1 and blockio_weight becomes 1000. These are the same values
 * cgroup_context_get_mask() compares against when deciding whether the
 * memory and blkio controllers are needed.
 * NOTE(review): no assignment to cpu_shares is visible in this excerpt,
 * although cgroup_context_get_mask() compares it against 1024 -- confirm
 * that it is initialized here as well. */
29 void cgroup_context_init(CGroupContext *c) {
32 /* Initialize everything to the kernel defaults, assuming the
33 * structure is preinitialized to 0 */
36 c->memory_limit = (uint64_t) -1;
37 c->blockio_weight = 1000;
/* Unlink entry a from the device_allow list of context c.
 * NOTE(review): the rest of the body is not visible in this excerpt;
 * presumably it also releases a and the memory it owns -- confirm. */
40 void cgroup_context_free_device_allow(CGroupContext *c, CGroupDeviceAllow *a) {
44 LIST_REMOVE(device_allow, c->device_allow, a);
/* Unlink entry w from the blockio_device_weights list of context c.
 * NOTE(review): the rest of the body is not visible in this excerpt;
 * presumably it also releases w and the memory it owns -- confirm. */
49 void cgroup_context_free_blockio_device_weight(CGroupContext *c, CGroupBlockIODeviceWeight *w) {
53 LIST_REMOVE(device_weights, c->blockio_device_weights, w);
/* Unlink entry b from the blockio_device_bandwidths list of context c.
 * NOTE(review): the rest of the body is not visible in this excerpt;
 * presumably it also releases b and the memory it owns -- confirm. */
58 void cgroup_context_free_blockio_device_bandwidth(CGroupContext *c, CGroupBlockIODeviceBandwidth *b) {
62 LIST_REMOVE(device_bandwidths, c->blockio_device_bandwidths, b);
/* Empty the three lists hanging off a CGroupContext. Each loop hands the
 * current list head to the matching cgroup_context_free_*() helper, which
 * unlinks it, and repeats until the list pointer is NULL. */
67 void cgroup_context_done(CGroupContext *c) {
70 while (c->blockio_device_weights)
71 cgroup_context_free_blockio_device_weight(c, c->blockio_device_weights);
73 while (c->blockio_device_bandwidths)
74 cgroup_context_free_blockio_device_bandwidth(c, c->blockio_device_bandwidths);
76 while (c->device_allow)
77 cgroup_context_free_device_allow(c, c->device_allow);
/* Write the settings of CGroupContext c to stream f as "Key=value" records,
 * each one starting with prefix (passed through strempty() first).
 *
 * Scalar settings come first (accounting switches, CPU shares, block I/O
 * weight, memory limit, device policy), followed by one record per entry of
 * the device_allow, blockio_device_weights and blockio_device_bandwidths
 * lists. Bandwidth values are rendered with format_bytes().
 *
 * Every record visible here is newline-terminated, including each
 * BlockIODeviceWeight entry, so consecutive records never run together. */
80 void cgroup_context_dump(CGroupContext *c, FILE* f, const char *prefix) {
81 CGroupBlockIODeviceBandwidth *b;
82 CGroupBlockIODeviceWeight *w;
88 prefix = strempty(prefix);
91 "%sCPUAccounting=%s\n"
92 "%sBlockIOAccounting=%s\n"
93 "%sMemoryAccounting=%s\n"
95 "%sBlockIOWeight=%lu\n"
96 "%sMemoryLimit=%" PRIu64 "\n"
97 "%sDevicePolicy=%s\n",
98 prefix, yes_no(c->cpu_accounting),
99 prefix, yes_no(c->blockio_accounting),
100 prefix, yes_no(c->memory_accounting),
101 prefix, c->cpu_shares,
102 prefix, c->blockio_weight,
103 prefix, c->memory_limit,
104 prefix, cgroup_device_policy_to_string(c->device_policy));
106 LIST_FOREACH(device_allow, a, c->device_allow)
108 "%sDeviceAllow=%s %s%s%s\n",
111 a->r ? "r" : "", a->w ? "w" : "", a->m ? "m" : "");
113 LIST_FOREACH(device_weights, w, c->blockio_device_weights)
115 "%sBlockIODeviceWeight=%s %lu\n",
120 LIST_FOREACH(device_bandwidths, b, c->blockio_device_bandwidths) {
121 char buf[FORMAT_BYTES_MAX];
126 b->read ? "BlockIOReadBandwidth" : "BlockIOWriteBandwidth",
128 format_bytes(buf, sizeof(buf), b->bandwidth));
/* Resolve path p to the block device number the blkio controller should be
 * configured for, storing it in *dev.
 *
 * As far as this excerpt shows: a stat failure on p is logged (with errno
 * text via %m); a block device node is accepted directly; for any other file
 * whose st_dev has a non-zero major, the device backing its file system is
 * used and block_get_whole_disk() is then asked to map a partition to its
 * whole disk; anything else is logged as "not a block device".
 * NOTE(review): the assignments to *dev and the return values are not
 * visible in this excerpt -- confirm against the full file. */
132 static int lookup_blkio_device(const char *p, dev_t *dev) {
141 log_warning("Couldn't stat device %s: %m", p);
145 if (S_ISBLK(st.st_mode))
147 else if (major(st.st_dev) != 0) {
148 /* If this is not a device node then find the block
149 * device this file is stored on */
152 /* If this is a partition, try to get the originating
154 block_get_whole_disk(*dev, dev);
156 log_warning("%s is not a block device and file system block device cannot be determined or is not local.", p);
/* Grant the cgroup at path access to the single device node named by node,
 * by writing a rule to the "devices.allow" attribute of the "devices"
 * controller.
 *
 * node is stat()ed and must be a character or block device. The rule is
 * built from the type letter ('c' or 'b') and the major/minor of st_rdev;
 * acc is the access string supplied by the caller (presumably appended to
 * the rule -- the remaining format arguments are not visible here).
 *
 * Failures are logged as warnings. The stat() failure message includes the
 * errno text (%m), matching the equivalent message in
 * lookup_blkio_device(). */
163 static int whitelist_device(const char *path, const char *node, const char *acc) {
164 char buf[2+DECIMAL_STR_MAX(dev_t)*2+2+4];
171 if (stat(node, &st) < 0) {
172 log_warning("Couldn't stat device %s: %m", node);
176 if (!S_ISCHR(st.st_mode) && !S_ISBLK(st.st_mode)) {
177 log_warning("%s is not a device.", node);
183 S_ISCHR(st.st_mode) ? 'c' : 'b',
184 major(st.st_rdev), minor(st.st_rdev),
187 r = cg_set_attribute("devices", path, "devices.allow", buf);
189 log_warning("Failed to set devices.allow on %s: %s", path, strerror(-r));
/* Grant the cgroup at path access to devices selected by driver name rather
 * than by device node. type must be 'b' or 'c' (asserted).
 *
 * /proc/devices is read line by line. The "Character devices:" and
 * "Block devices:" header lines are compared against type. On a device
 * line, the text up to the first whitespace is parsed as the major number
 * with safe_atou() and the text after the whitespace run is the driver
 * name. A rule held in buf is then written to "devices.allow".
 *
 * Open, read and write failures are logged as warnings.
 * NOTE(review): the comparison of the parsed driver name with the name
 * argument, the construction of buf and the return values are not visible
 * in this excerpt -- confirm against the full file. */
194 static int whitelist_major(const char *path, const char *name, char type, const char *acc) {
195 _cleanup_fclose_ FILE *f = NULL;
202 assert(type == 'b' || type == 'c');
204 f = fopen("/proc/devices", "re");
206 log_warning("Cannot open /proc/devices to resolve %s (%c): %m", name, type);
210 FOREACH_LINE(line, f, goto fail) {
211 char buf[2+DECIMAL_STR_MAX(unsigned)+3+4], *p, *w;
216 if (type == 'c' && streq(line, "Character devices:")) {
221 if (type == 'b' && streq(line, "Block devices:")) {
236 w = strpbrk(p, WHITESPACE);
241 r = safe_atou(p, &maj);
248 w += strspn(w, WHITESPACE);
258 r = cg_set_attribute("devices", path, "devices.allow", buf);
260 log_warning("Failed to set devices.allow on %s: %s", path, strerror(-r));
266 log_warning("Failed to read /proc/devices: %m");
/* Write the settings stored in c to the control group at path, one section
 * per controller bit set in mask:
 *   CGROUP_CPU    -> cpu.shares
 *   CGROUP_BLKIO  -> blkio.weight, blkio.weight_device,
 *                    blkio.throttle.read_bps_device / write_bps_device
 *   CGROUP_MEMORY -> memory.limit_in_bytes
 *   CGROUP_DEVICE -> devices.deny / devices.allow
 * The function returns nothing; a failing cg_set_attribute() call is only
 * logged. */
270 void cgroup_context_apply(CGroupContext *c, CGroupControllerMask mask, const char *path) {
/* CPU controller: the shares value is written as a decimal line. */
279 if (mask & CGROUP_CPU) {
280 char buf[DECIMAL_STR_MAX(unsigned long) + 1];
282 sprintf(buf, "%lu\n", c->cpu_shares);
283 r = cg_set_attribute("cpu", path, "cpu.shares", buf);
285 log_warning("Failed to set cpu.shares on %s: %s", path, strerror(-r));
/* Block I/O controller: buf is sized for the largest of the three strings
 * formatted below (plain weight, "maj:min weight", "maj:min bandwidth"). */
288 if (mask & CGROUP_BLKIO) {
289 char buf[MAX3(DECIMAL_STR_MAX(unsigned long)+1,
290 DECIMAL_STR_MAX(dev_t)*2+2+DECIMAL_STR_MAX(unsigned long)*1,
291 DECIMAL_STR_MAX(dev_t)*2+2+DECIMAL_STR_MAX(uint64_t)+1)];
292 CGroupBlockIODeviceWeight *w;
293 CGroupBlockIODeviceBandwidth *b;
295 sprintf(buf, "%lu\n", c->blockio_weight);
296 r = cg_set_attribute("blkio", path, "blkio.weight", buf);
298 log_warning("Failed to set blkio.weight on %s: %s", path, strerror(-r));
300 /* FIXME: no way to reset this list */
/* Per-device weights: each configured path is resolved to a device number
 * with lookup_blkio_device() and written as "major:minor weight". */
301 LIST_FOREACH(device_weights, w, c->blockio_device_weights) {
304 r = lookup_blkio_device(w->path, &dev);
308 sprintf(buf, "%u:%u %lu", major(dev), minor(dev), w->weight);
309 r = cg_set_attribute("blkio", path, "blkio.weight_device", buf);
311 log_error("Failed to set blkio.weight_device on %s: %s", path, strerror(-r));
314 /* FIXME: no way to reset this list */
/* Per-device bandwidth limits: b->read selects the read or the write
 * throttle attribute. */
315 LIST_FOREACH(device_bandwidths, b, c->blockio_device_bandwidths) {
319 r = lookup_blkio_device(b->path, &dev);
323 a = b->read ? "blkio.throttle.read_bps_device" : "blkio.throttle.write_bps_device";
325 sprintf(buf, "%u:%u %" PRIu64 "\n", major(dev), minor(dev), b->bandwidth);
326 r = cg_set_attribute("blkio", path, a, buf);
328 log_error("Failed to set %s on %s: %s", a, path, strerror(-r));
/* Memory controller: a memory_limit of (uint64_t) -1 means no limit is
 * configured, in which case "-1" is written instead of a number. */
332 if (mask & CGROUP_MEMORY) {
333 if (c->memory_limit != (uint64_t) -1) {
334 char buf[DECIMAL_STR_MAX(uint64_t) + 1];
336 sprintf(buf, "%" PRIu64 "\n", c->memory_limit);
337 r = cg_set_attribute("memory", path, "memory.limit_in_bytes", buf);
339 r = cg_set_attribute("memory", path, "memory.limit_in_bytes", "-1");
342 log_error("Failed to set memory.limit_in_bytes on %s: %s", path, strerror(-r));
/* Devices controller: first reset the rule set. With an explicit allow list
 * or a non-auto policy everything is denied ("a" to devices.deny) and then
 * selectively re-allowed below; otherwise everything is allowed. */
345 if (mask & CGROUP_DEVICE) {
346 CGroupDeviceAllow *a;
348 if (c->device_allow || c->device_policy != CGROUP_AUTO)
349 r = cg_set_attribute("devices", path, "devices.deny", "a");
351 r = cg_set_attribute("devices", path, "devices.allow", "a");
353 log_error("Failed to reset devices.list on %s: %s", path, strerror(-r));
/* "closed" policy, or "auto" policy combined with an allow list: a built-in
 * table of device nodes (path / access string pairs) is whitelisted. */
355 if (c->device_policy == CGROUP_CLOSED ||
356 (c->device_policy == CGROUP_AUTO && c->device_allow)) {
357 static const char auto_devices[] =
361 "/dev/random\0" "rw\0"
362 "/dev/urandom\0" "rw\0";
366 NULSTR_FOREACH_PAIR(x, y, auto_devices)
367 whitelist_device(path, x, y);
/* Explicit allow list: "/dev/..." entries name a device node, while
 * "block-<name>" and "char-<name>" entries are resolved by
 * whitelist_major(). Anything else is skipped with a debug message. */
370 LIST_FOREACH(device_allow, a, c->device_allow) {
386 if (startswith(a->path, "/dev/"))
387 whitelist_device(path, a->path, acc);
388 else if (startswith(a->path, "block-"))
389 whitelist_major(path, a->path + 6, 'b', acc);
390 else if (startswith(a->path, "char-"))
391 whitelist_major(path, a->path + 5, 'c', acc);
393 log_debug("Ignoring device %s while writing cgroup attribute.", a->path);
/* Compute which controllers a CGroupContext needs, as a bit mask:
 *   - CGROUP_CPUACCT | CGROUP_CPU if CPU accounting is on or cpu_shares
 *     differs from 1024;
 *   - CGROUP_BLKIO if block I/O accounting is on, blockio_weight differs
 *     from 1000, or any per-device weight or bandwidth entry exists;
 *   - CGROUP_MEMORY if memory accounting is on or memory_limit is not
 *     (uint64_t) -1;
 *   - CGROUP_DEVICE if a device allow list exists or the device policy is
 *     not CGROUP_AUTO. */
398 CGroupControllerMask cgroup_context_get_mask(CGroupContext *c) {
399 CGroupControllerMask mask = 0;
401 /* Figure out which controllers we need */
403 if (c->cpu_accounting || c->cpu_shares != 1024)
404 mask |= CGROUP_CPUACCT | CGROUP_CPU;
406 if (c->blockio_accounting ||
407 c->blockio_weight != 1000 ||
408 c->blockio_device_weights ||
409 c->blockio_device_bandwidths)
410 mask |= CGROUP_BLKIO;
412 if (c->memory_accounting ||
413 c->memory_limit != (uint64_t) -1)
414 mask |= CGROUP_MEMORY;
416 if (c->device_allow || c->device_policy != CGROUP_AUTO)
417 mask |= CGROUP_DEVICE;
/* Return the controller mask required by the unit's own cgroup settings:
 * the unit's CGroupContext is fetched with unit_get_cgroup_context() and
 * passed to cgroup_context_get_mask().
 * NOTE(review): what is returned for a unit without a cgroup context is
 * not visible in this excerpt. */
422 CGroupControllerMask unit_get_cgroup_mask(Unit *u) {
425 c = unit_get_cgroup_context(u);
429 return cgroup_context_get_mask(c);
/* Return the union of the controller masks needed by the units contained in
 * u. The result is cached in u->cgroup_members_mask and returned directly
 * while u->cgroup_members_mask_valid is set.
 *
 * On a cache miss the mask starts at 0. For a slice unit, each unit in
 * dependencies[UNIT_BEFORE] is examined, and those whose slice reference
 * points back at u contribute their own mask plus, recursively, their
 * members mask. For any other unit type the mask stays 0. */
432 CGroupControllerMask unit_get_members_mask(Unit *u) {
435 if (u->cgroup_members_mask_valid)
436 return u->cgroup_members_mask;
438 u->cgroup_members_mask = 0;
440 if (u->type == UNIT_SLICE) {
444 SET_FOREACH(member, u->dependencies[UNIT_BEFORE], i) {
449 if (UNIT_DEREF(member->slice) != u)
452 u->cgroup_members_mask |=
453 unit_get_cgroup_mask(member) |
454 unit_get_members_mask(member);
458 u->cgroup_members_mask_valid = true;
459 return u->cgroup_members_mask;
/* Return the controller mask contributed by the units sharing a slice with
 * u. If u is assigned to a slice, that is the slice's members mask;
 * otherwise u's own mask combined with its members mask is used. Only the
 * CGROUP_CPU, CGROUP_BLKIO and CGROUP_CPUACCT bits are kept in the result. */
462 CGroupControllerMask unit_get_siblings_mask(Unit *u) {
463 CGroupControllerMask m;
467 if (UNIT_ISSET(u->slice))
468 m = unit_get_members_mask(UNIT_DEREF(u->slice));
470 m = unit_get_cgroup_mask(u) | unit_get_members_mask(u);
472 /* Sibling propagation is only relevant for weight-based
473 * controllers, so let's mask out everything else */
474 return m & (CGROUP_CPU|CGROUP_BLKIO|CGROUP_CPUACCT);
/* Compute the controller mask the unit's cgroup should be realized with:
 * the union of the unit's own mask, its members mask and its siblings mask,
 * restricted to the controllers in u->manager->cgroup_supported. */
477 CGroupControllerMask unit_get_target_mask(Unit *u) {
478 CGroupControllerMask mask;
480 mask = unit_get_cgroup_mask(u) | unit_get_members_mask(u) | unit_get_siblings_mask(u);
481 mask &= u->manager->cgroup_supported;
/* Recompute the subtree mask of u (own mask | members mask) and propagate
 * the result up the slice chain.
 *
 * The new mask is compared with the cached u->cgroup_subtree_mask. The
 * three-part condition below is true only when the cache was valid, at
 * least one bit was added and no bit was removed. After the cache is
 * refreshed, a unit that sits in a slice updates that slice's members mask:
 * either the new bits are OR-ed in, or the slice's cached members mask is
 * invalidated so that it gets recalculated. The function then recurses
 * into the slice.
 * NOTE(review): the early exit for the unchanged case and the variable
 * receiving the three-part condition are not visible in this excerpt. */
486 /* Recurse from a unit up through its containing slices, propagating
487 * mask bits upward. A unit is also member of itself. */
488 void unit_update_cgroup_members_masks(Unit *u) {
489 CGroupControllerMask m;
494 /* Calculate subtree mask */
495 m = unit_get_cgroup_mask(u) | unit_get_members_mask(u);
497 /* See if anything changed from the previous invocation. If
498 * not, we're done. */
499 if (u->cgroup_subtree_mask_valid && m == u->cgroup_subtree_mask)
503 u->cgroup_subtree_mask_valid &&
504 ((m & ~u->cgroup_subtree_mask) != 0) &&
505 ((~m & u->cgroup_subtree_mask) == 0);
507 u->cgroup_subtree_mask = m;
508 u->cgroup_subtree_mask_valid = true;
510 if (UNIT_ISSET(u->slice)) {
511 Unit *s = UNIT_DEREF(u->slice);
514 /* There's more set now than before. We
515 * propagate the new mask to the parent's mask
516 * (not caring if it actually was valid or
519 s->cgroup_members_mask |= m;
522 /* There's less set now than before (or we
523 * don't know), we need to recalculate
524 * everything, so let's invalidate the
525 * parent's members mask */
527 s->cgroup_members_mask_valid = false;
529 /* And now make sure that this change also hits our
531 unit_update_cgroup_members_masks(s);
/* Callback passed to cg_migrate_everywhere() by unit_create_cgroups().
 * For the controller set in mask it returns the cgroup_path of a unit that
 * has a path, is marked realized, and whose realized mask contains every
 * bit of mask. If the current unit does not qualify, the search moves on
 * to the unit's slice.
 * NOTE(review): the loop construct, the way u is obtained (presumably from
 * userdata) and the fallback return value are not visible in this
 * excerpt. */
535 static const char *migrate_callback(CGroupControllerMask mask, void *userdata) {
542 if (u->cgroup_path &&
543 u->cgroup_realized &&
544 (u->cgroup_realized_mask & mask) == mask)
545 return u->cgroup_path;
547 u = UNIT_DEREF(u->slice);
/* Create the control group of unit u in the hierarchies selected by mask and
 * move its processes there.
 *
 * Steps visible here:
 *   1. Obtain the unit's default cgroup path and register it in the
 *      manager's cgroup_unit hashmap (a clash with an existing entry is
 *      reported as "exists already"), then store it in u->cgroup_path.
 *   2. Create the group with cg_create_everywhere().
 *   3. Record the unit as realized with exactly this mask.
 *   4. Migrate with cg_migrate_everywhere(), using migrate_callback() to
 *      pick the source group per controller. A failure here is logged as
 *      a warning.
 * NOTE(review): the conditions guarding step 1 and the return values are
 * not visible in this excerpt. */
553 static int unit_create_cgroups(Unit *u, CGroupControllerMask mask) {
554 _cleanup_free_ char *path = NULL;
559 path = unit_default_cgroup_path(u);
563 r = hashmap_put(u->manager->cgroup_unit, path, u);
565 log_error(r == -EEXIST ? "cgroup %s exists already: %s" : "hashmap_put failed for %s: %s", path, strerror(-r));
569 u->cgroup_path = path;
573 /* First, create our own group */
574 r = cg_create_everywhere(u->manager->cgroup_supported, mask, u->cgroup_path);
576 log_error("Failed to create cgroup %s: %s", u->cgroup_path, strerror(-r));
580 /* Keep track that this is now realized */
581 u->cgroup_realized = true;
582 u->cgroup_realized_mask = mask;
584 /* Then, possibly move things over */
585 r = cg_migrate_everywhere(u->manager->cgroup_supported, u->cgroup_path, u->cgroup_path, migrate_callback, u);
587 log_warning("Failed to migrate cgroup to %s: %s", u->cgroup_path, strerror(-r));
/* Return true when the unit's cgroup is marked realized and was realized
 * with exactly the given controller mask (equality, not a superset test). */
592 static bool unit_has_mask_realized(Unit *u, CGroupControllerMask mask) {
595 return u->cgroup_realized && u->cgroup_realized_mask == mask;
/* Synchronously realize the cgroup of u.
 *
 * The unit is first taken off the manager's cgroup queue if it is queued.
 * Its target mask is then computed and checked with
 * unit_has_mask_realized(). The visible steps that follow are, in order:
 * realize the containing slice (recursively), create the unit's own groups
 * with unit_create_cgroups(), and apply the unit's CGroupContext with
 * cgroup_context_apply().
 * NOTE(review): the early return for the already-realized case and the
 * error checks after each step are not visible in this excerpt. */
598 /* Check if necessary controllers and attributes for a unit are in place.
601 * If not, create paths, move processes over, and set attributes.
603 * Returns 0 on success and < 0 on failure. */
604 static int unit_realize_cgroup_now(Unit *u) {
605 CGroupControllerMask mask;
610 if (u->in_cgroup_queue) {
611 LIST_REMOVE(cgroup_queue, u->manager->cgroup_queue, u);
612 u->in_cgroup_queue = false;
615 mask = unit_get_target_mask(u);
617 if (unit_has_mask_realized(u, mask))
620 /* First, realize parents */
621 if (UNIT_ISSET(u->slice)) {
622 r = unit_realize_cgroup_now(UNIT_DEREF(u->slice));
627 /* And then do the real work */
628 r = unit_create_cgroups(u, mask);
632 /* Finally, apply the necessary attributes. */
633 cgroup_context_apply(unit_get_cgroup_context(u), mask, u->cgroup_path);
/* Put u at the head of the manager's cgroup queue and mark it as queued.
 * The in_cgroup_queue flag is tested first so that a unit is not linked
 * into the list twice. */
638 static void unit_add_to_cgroup_queue(Unit *u) {
640 if (u->in_cgroup_queue)
643 LIST_PREPEND(cgroup_queue, u->manager->cgroup_queue, u);
644 u->in_cgroup_queue = true;
/* Process the manager's cgroup queue: while the queue has a head, realize
 * that unit with unit_realize_cgroup_now(), which itself unlinks the unit
 * from the queue. A failure for one unit is logged as a warning.
 * NOTE(review): what the unsigned return value counts is not visible in
 * this excerpt. */
647 unsigned manager_dispatch_cgroup_queue(Manager *m) {
652 while ((i = m->cgroup_queue)) {
653 assert(i->in_cgroup_queue);
655 r = unit_realize_cgroup_now(i);
657 log_warning("Failed to realize cgroups for queued unit %s: %s", i->id, strerror(-r));
/* Queue the siblings of u for deferred cgroup realization.
 *
 * For the slice that u belongs to, every unit in that slice's
 * dependencies[UNIT_BEFORE] set is considered. A candidate is filtered
 * out if its slice reference does not point at this slice, if it is
 * inactive or failed, or if it is already realized with its target mask.
 * The remaining units are added to the cgroup queue. Per the comment
 * below, the same is done for each parent slice further up.
 * NOTE(review): the step that advances to the next slice, and the exclusion
 * of u itself, are not visible in this excerpt. */
665 static void unit_queue_siblings(Unit *u) {
668 /* This adds the siblings of the specified unit and the
669 * siblings of all parent units to the cgroup queue. (But
670 * neither the specified unit itself nor the parents.) */
672 while ((slice = UNIT_DEREF(u->slice))) {
676 SET_FOREACH(m, slice->dependencies[UNIT_BEFORE], i) {
680 /* Skip units that have a dependency on the slice
681 * but aren't actually in it. */
682 if (UNIT_DEREF(m->slice) != slice)
685 /* No point in doing cgroup application for units
686 * without active processes. */
687 if (UNIT_IS_INACTIVE_OR_FAILED(unit_active_state(m)))
690 /* If the unit doesn't need any new controllers
691 * and has current ones realized, it doesn't need
693 if (unit_has_mask_realized(m, unit_get_target_mask(m)))
696 unit_add_to_cgroup_queue(m);
/* Public entry point for realizing the cgroup of u. The unit's siblings
 * are queued for deferred realization with unit_queue_siblings(), then the
 * unit itself (and, through recursion, its parent slices) is realized
 * immediately. Returns the result of unit_realize_cgroup_now().
 * NOTE(review): the handling of a unit without a cgroup context (the value
 * fetched into c) is not visible in this excerpt. */
703 int unit_realize_cgroup(Unit *u) {
708 c = unit_get_cgroup_context(u);
712 /* So, here's the deal: when realizing the cgroups for this
713 * unit, we need to first create all parents, but there's more
714 * actually: for the weight-based controllers we also need to
715 * make sure that all our siblings (i.e. units that are in the
716 * same slice as we are) have cgroups, too. Otherwise, things
717 * would become very uneven as each of their processes would
718 * get as much resources as all our group together. This call
719 * will synchronously create the parent cgroups, but will
720 * defer work on the siblings to the next event loop
723 /* Add all sibling slices to the cgroup queue. */
724 unit_queue_siblings(u);
726 /* And realize this one now (and apply the values) */
727 return unit_realize_cgroup_now(u);
/* Tear down the cgroup of u and forget about it.
 *
 * The group is trimmed in the supported hierarchies with
 * cg_trim_everywhere(). Its last argument is false only for the unit named
 * SPECIAL_ROOT_SLICE (presumably so that the root group itself is kept --
 * confirm against cg_trim_everywhere()). A trim failure is logged at debug
 * level only. The path is then dropped from the manager's cgroup_unit
 * hashmap and freed, and the realized flag and mask are reset.
 * NOTE(review): the guard for a unit without cgroup_path is not visible in
 * this excerpt. */
730 void unit_destroy_cgroup(Unit *u) {
738 r = cg_trim_everywhere(u->manager->cgroup_supported, u->cgroup_path, !unit_has_name(u, SPECIAL_ROOT_SLICE));
740 log_debug("Failed to destroy cgroup %s: %s", u->cgroup_path, strerror(-r));
742 hashmap_remove(u->manager->cgroup_unit, u->cgroup_path);
744 free(u->cgroup_path);
745 u->cgroup_path = NULL;
746 u->cgroup_realized = false;
747 u->cgroup_realized_mask = 0;
/* Look for the main process of u among the processes in its cgroup.
 *
 * The PIDs in u->cgroup_path (SYSTEMD_CGROUP_CONTROLLER hierarchy) are
 * enumerated. A process whose parent could be determined and is not mypid
 * is ignored. Per the comment below, finding more than one remaining
 * candidate means the main process cannot be identified.
 * NOTE(review): how mypid is obtained and what is returned in each case are
 * not visible in this excerpt; pid starts out as 0. */
751 pid_t unit_search_main_pid(Unit *u) {
752 _cleanup_fclose_ FILE *f = NULL;
753 pid_t pid = 0, npid, mypid;
760 if (cg_enumerate_processes(SYSTEMD_CGROUP_CONTROLLER, u->cgroup_path, &f) < 0)
764 while (cg_read_pid(f, &npid) > 0) {
770 /* Ignore processes that aren't our kids */
771 if (get_parent_of_pid(npid, &ppid) >= 0 && ppid != mypid)
775 /* Dang, there's more than one daemonized PID
776 in this group, so we don't know what process
777 is the main process. */
/* Set up the manager's own place in the cgroup tree.
 *
 * Steps, following the numbered comments in the body:
 *   0. Skip everything (with a warning) if /sys/fs/cgroup/systemd is not a
 *      mount point.
 *   1. Determine the cgroup this process runs in and store it in
 *      m->cgroup_root. A legacy trailing system-slice component is looked
 *      for, and a root of "/" is stored as the empty string.
 *   Next, resolve the file system path of that group and log it.
 *   3. For the system instance, install the release agent.
 *   4. Create the root group and attach this process to it.
 *   5. Open the group's directory and keep the fd in m->pin_cgroupfs_fd,
 *      closing any previously held fd first.
 *   6. Record the supported controllers in m->cgroup_supported.
 *   7. Write "1" to memory.use_hierarchy of the root memory cgroup. The
 *      result of that write is not checked.
 * NOTE(review): the return values and several error checks are not visible
 * in this excerpt. */
788 int manager_setup_cgroup(Manager *m) {
789 _cleanup_free_ char *path = NULL;
795 /* 0. Be nice to Ingo Molnar #628004 */
796 if (path_is_mount_point("/sys/fs/cgroup/systemd", false) <= 0) {
797 log_warning("No control group support available, not creating root group.");
801 /* 1. Determine hierarchy */
802 free(m->cgroup_root);
803 m->cgroup_root = NULL;
805 r = cg_pid_get_path(SYSTEMD_CGROUP_CONTROLLER, 0, &m->cgroup_root);
807 log_error("Cannot determine cgroup we are running in: %s", strerror(-r));
811 /* LEGACY: Already in /system.slice? If so, let's cut this
812 * off. This is to support live upgrades from older systemd
813 * versions where PID 1 was moved there. */
814 if (m->running_as == SYSTEMD_SYSTEM) {
815 e = endswith(m->cgroup_root, "/" SPECIAL_SYSTEM_SLICE);
817 e = endswith(m->cgroup_root, "/system");
822 /* And make sure to store away the root value without trailing
823 * slash, even for the root dir, so that we can easily prepend
825 if (streq(m->cgroup_root, "/"))
826 m->cgroup_root[0] = 0;
829 r = cg_get_path(SYSTEMD_CGROUP_CONTROLLER, m->cgroup_root, NULL, &path);
831 log_error("Cannot find cgroup mount point: %s", strerror(-r));
835 log_debug("Using cgroup controller " SYSTEMD_CGROUP_CONTROLLER ". File system hierarchy is at %s.", path);
837 /* 3. Install agent */
838 if (m->running_as == SYSTEMD_SYSTEM) {
839 r = cg_install_release_agent(SYSTEMD_CGROUP_CONTROLLER, SYSTEMD_CGROUP_AGENT_PATH);
841 log_warning("Failed to install release agent, ignoring: %s", strerror(-r));
843 log_debug("Installed release agent.");
845 log_debug("Release agent already installed.");
848 /* 4. Make sure we are in the root cgroup */
849 r = cg_create_and_attach(SYSTEMD_CGROUP_CONTROLLER, m->cgroup_root, 0);
851 log_error("Failed to create root cgroup hierarchy: %s", strerror(-r));
855 /* 5. And pin it, so that it cannot be unmounted */
856 if (m->pin_cgroupfs_fd >= 0)
857 close_nointr_nofail(m->pin_cgroupfs_fd);
859 m->pin_cgroupfs_fd = open(path, O_RDONLY|O_CLOEXEC|O_DIRECTORY|O_NOCTTY|O_NONBLOCK);
861 log_error("Failed to open pin file: %m");
865 /* 6. Figure out which controllers are supported */
866 m->cgroup_supported = cg_mask_supported();
868 /* 7. Always enable hierarchical support if it exists... */
869 cg_set_attribute("memory", "/", "memory.use_hierarchy", "1");
/* Release the manager's cgroup state. When delete is true and a root group
 * is known, the group is trimmed with cg_trim() (last argument false). The
 * pinned cgroupfs fd is closed and reset to -1, and m->cgroup_root is freed
 * and cleared. */
874 void manager_shutdown_cgroup(Manager *m, bool delete) {
877 /* We can't really delete the group, since we are in it. But
879 if (delete && m->cgroup_root)
880 cg_trim(SYSTEMD_CGROUP_CONTROLLER, m->cgroup_root, false);
882 if (m->pin_cgroupfs_fd >= 0) {
883 close_nointr_nofail(m->pin_cgroupfs_fd);
884 m->pin_cgroupfs_fd = -1;
887 free(m->cgroup_root);
888 m->cgroup_root = NULL;
/* Find the unit that owns a cgroup path, using the manager's cgroup_unit
 * hashmap. The path is looked up as given first; a further lookup is done
 * with a derived path p.
 * NOTE(review): how p is derived from cgroup (presumably by walking up to
 * parent groups) and the return statements are not visible in this
 * excerpt. */
891 Unit* manager_get_unit_by_cgroup(Manager *m, const char *cgroup) {
898 u = hashmap_get(m->cgroup_unit, cgroup);
912 u = hashmap_get(m->cgroup_unit, p);
/* Find the unit a process belongs to: the process's cgroup path in the
 * SYSTEMD_CGROUP_CONTROLLER hierarchy is obtained with cg_pid_get_path()
 * and resolved with manager_get_unit_by_cgroup(). The temporary path string
 * is released automatically (_cleanup_free_).
 * NOTE(review): the handling of a cg_pid_get_path() failure is not visible
 * in this excerpt. */
918 Unit *manager_get_unit_by_pid(Manager *m, pid_t pid) {
919 _cleanup_free_ char *cgroup = NULL;
927 r = cg_pid_get_path(SYSTEMD_CGROUP_CONTROLLER, pid, &cgroup);
931 return manager_get_unit_by_cgroup(m, cgroup);
/* React to a notification that the given cgroup has run empty. The owning
 * unit is looked up, its own cgroup_path is checked with
 * cg_is_empty_recursive(), the unit type's notify_cgroup_empty hook is
 * called if the vtable provides one, and the unit is added to the GC queue.
 * NOTE(review): the conditions guarding these steps (unit found, group
 * really empty) and the return values are not visible in this excerpt. */
934 int manager_notify_cgroup_empty(Manager *m, const char *cgroup) {
941 u = manager_get_unit_by_cgroup(m, cgroup);
943 r = cg_is_empty_recursive(SYSTEMD_CGROUP_CONTROLLER, u->cgroup_path, true);
945 if (UNIT_VTABLE(u)->notify_cgroup_empty)
946 UNIT_VTABLE(u)->notify_cgroup_empty(u);
948 unit_add_to_gc_queue(u);
/* String names of the CGroupDevicePolicy values, indexed by enum value.
 * DEFINE_STRING_TABLE_LOOKUP generates the lookup helpers over this table;
 * cgroup_device_policy_to_string() is the one used by
 * cgroup_context_dump() (presumably a from-string counterpart is generated
 * too -- confirm against the macro definition). */
955 static const char* const cgroup_device_policy_table[_CGROUP_DEVICE_POLICY_MAX] = {
956 [CGROUP_AUTO] = "auto",
957 [CGROUP_CLOSED] = "closed",
958 [CGROUP_STRICT] = "strict",
961 DEFINE_STRING_TABLE_LOOKUP(cgroup_device_policy, CGroupDevicePolicy);