1 /*-*- Mode: C; c-basic-offset: 8; indent-tabs-mode: nil -*-*/
4 This file is part of systemd.
6 Copyright 2013 Lennart Poettering
8 systemd is free software; you can redistribute it and/or modify it
9 under the terms of the GNU Lesser General Public License as published by
10 the Free Software Foundation; either version 2.1 of the License, or
11 (at your option) any later version.
13 systemd is distributed in the hope that it will be useful, but
14 WITHOUT ANY WARRANTY; without even the implied warranty of
15 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
16 Lesser General Public License for more details.
18 You should have received a copy of the GNU Lesser General Public License
19 along with systemd; If not, see <http://www.gnu.org/licenses/>.
24 #include "path-util.h"
26 #include "cgroup-util.h"
/* Puts a CGroupContext into its default state. The lines visible here set
 * memory_limit to (uint64_t) -1 (the value cgroup_context_apply() writes
 * out as "-1") and blockio_weight to 1000 (the value
 * cgroup_context_get_mask() compares against).
 *
 * NOTE(review): the embedded line numbers have gaps, so this listing does
 * not show the whole body; other fields may be initialised on the lines
 * that are missing. */
29 void cgroup_context_init(CGroupContext *c) {
32 /* Initialize everything to the kernel defaults, assuming the
33 * structure is preinitialized to 0 */
36 c->memory_limit = (uint64_t) -1;
37 c->blockio_weight = 1000;
/* Detaches entry a from the device_allow list of c.
 *
 * NOTE(review): only the LIST_REMOVE() step is visible in this listing;
 * releasing the entry itself presumably happens on lines that are not
 * shown - confirm against the full source. */
40 void cgroup_context_free_device_allow(CGroupContext *c, CGroupDeviceAllow *a) {
44 LIST_REMOVE(device_allow, c->device_allow, a);
/* Detaches entry w from the blockio_device_weights list of c.
 *
 * NOTE(review): only the LIST_REMOVE() step is visible in this listing;
 * releasing the entry itself presumably happens on lines that are not
 * shown - confirm against the full source. */
49 void cgroup_context_free_blockio_device_weight(CGroupContext *c, CGroupBlockIODeviceWeight *w) {
53 LIST_REMOVE(device_weights, c->blockio_device_weights, w);
/* Detaches entry b from the blockio_device_bandwidths list of c.
 *
 * NOTE(review): only the LIST_REMOVE() step is visible in this listing;
 * releasing the entry itself presumably happens on lines that are not
 * shown - confirm against the full source. */
58 void cgroup_context_free_blockio_device_bandwidth(CGroupContext *c, CGroupBlockIODeviceBandwidth *b) {
62 LIST_REMOVE(device_bandwidths, c->blockio_device_bandwidths, b);
/* Tears down the list members of c. As long as a list head is non-NULL it
 * is handed to the matching cgroup_context_free_*() helper, which unlinks
 * it from its list, so each loop ends once the list is empty. The block-IO
 * device weights are drained first, then the block-IO device bandwidths,
 * then the device_allow entries. */
67 void cgroup_context_done(CGroupContext *c) {
70 while (c->blockio_device_weights)
71 cgroup_context_free_blockio_device_weight(c, c->blockio_device_weights);
73 while (c->blockio_device_bandwidths)
74 cgroup_context_free_blockio_device_bandwidth(c, c->blockio_device_bandwidths);
76 while (c->device_allow)
77 cgroup_context_free_device_allow(c, c->device_allow);
/* Formats the settings of c as "<prefix>Key=value" lines: the three
 * accounting switches (as yes/no), the CPU shares and block-IO weight, the
 * memory limit, the device policy name, and then one line for every
 * device_allow entry, every per-device block-IO weight and every per-device
 * block-IO bandwidth limit.
 *
 * prefix is run through strempty() first, so the value used for formatting
 * is whatever strempty() returns for it.
 *
 * NOTE(review): the lines that open the output calls are missing from this
 * listing; the text presumably goes to f. Also note that the
 * "%sBlockIODeviceWeight=%s %lu" format below carries no trailing "\n",
 * unlike the other formats in this function - confirm whether that is
 * intended. */
80 void cgroup_context_dump(CGroupContext *c, FILE* f, const char *prefix) {
81 CGroupBlockIODeviceBandwidth *b;
82 CGroupBlockIODeviceWeight *w;
88 prefix = strempty(prefix);
91 "%sCPUAccounting=%s\n"
92 "%sBlockIOAccounting=%s\n"
93 "%sMemoryAccounting=%s\n"
95 "%sBlockIOWeight=%lu\n"
96 "%sMemoryLimit=%" PRIu64 "\n"
97 "%sDevicePolicy=%s\n",
98 prefix, yes_no(c->cpu_accounting),
99 prefix, yes_no(c->blockio_accounting),
100 prefix, yes_no(c->memory_accounting),
101 prefix, c->cpu_shares,
102 prefix, c->blockio_weight,
103 prefix, c->memory_limit,
104 prefix, cgroup_device_policy_to_string(c->device_policy));
/* One line per allowed device; the access flags r, w and m are printed as
 * the letters "r", "w" and "m" when set. */
106 LIST_FOREACH(device_allow, a, c->device_allow)
108 "%sDeviceAllow=%s %s%s%s\n",
111 a->r ? "r" : "", a->w ? "w" : "", a->m ? "m" : "");
113 LIST_FOREACH(device_weights, w, c->blockio_device_weights)
115 "%sBlockIODeviceWeight=%s %lu",
/* Each bandwidth entry is labelled as a read or a write limit depending on
 * b->read, and its value is rendered with format_bytes(). */
120 LIST_FOREACH(device_bandwidths, b, c->blockio_device_bandwidths) {
121 char buf[FORMAT_BYTES_MAX];
126 b->read ? "BlockIOReadBandwidth" : "BlockIOWriteBandwidth",
128 format_bytes(buf, sizeof(buf), b->bandwidth));
/* Resolves the path p to a block device number that is passed back through
 * dev. From the lines visible here:
 *  - a failing stat is reported with a warning that includes the errno text
 *    (%m);
 *  - a block device node (S_ISBLK) is one accepted case;
 *  - otherwise, when the file lives on a file system whose st_dev has a
 *    non-zero major number, the device backing that file system is used and
 *    block_get_whole_disk() is called with dev as both input and output
 *    (per the inline comment, to handle partitions);
 *  - anything else is reported with a warning.
 *
 * NOTE(review): the assignments to *dev, the return statements and the end
 * of the last inline comment are on lines missing from this listing, so the
 * exact return values cannot be documented from here. */
132 static int lookup_blkio_device(const char *p, dev_t *dev) {
141 log_warning("Couldn't stat device %s: %m", p);
145 if (S_ISBLK(st.st_mode))
147 else if (major(st.st_dev) != 0) {
148 /* If this is not a device node then find the block
149 * device this file is stored on */
152 /* If this is a partition, try to get the originating
154 block_get_whole_disk(*dev, dev);
156 log_warning("%s is not a block device and file system block device cannot be determined or is not local.", p);
/* Grants the cgroup at path (in the "devices" hierarchy) access to the
 * device node named by node. The node is stat()ed, files that are neither
 * character nor block devices are rejected with a warning, and an entry
 * built from the device type ('c' or 'b') and the major/minor numbers of
 * st_rdev is written to the "devices.allow" attribute. A failure to write
 * the attribute is logged as a warning.
 *
 * NOTE(review): the line that formats buf is only partly visible, so how
 * acc is appended cannot be confirmed from this listing; the return
 * statements are missing as well. Unlike lookup_blkio_device(), the stat
 * warning here does not include the errno text (%m). */
163 static int whitelist_device(const char *path, const char *node, const char *acc) {
164 char buf[2+DECIMAL_STR_MAX(dev_t)*2+2+4];
171 if (stat(node, &st) < 0) {
172 log_warning("Couldn't stat device %s", node);
176 if (!S_ISCHR(st.st_mode) && !S_ISBLK(st.st_mode)) {
177 log_warning("%s is not a device.", node);
183 S_ISCHR(st.st_mode) ? 'c' : 'b',
184 major(st.st_rdev), minor(st.st_rdev),
187 r = cg_set_attribute("devices", path, "devices.allow", buf);
189 log_warning("Failed to set devices.allow on %s: %s", path, strerror(-r));
/* Writes the settings stored in c to the cgroup at path, one controller at
 * a time and only for the controllers selected in mask:
 *  - CGROUP_CPU:    cpu.shares
 *  - CGROUP_BLKIO:  blkio.weight, then blkio.weight_device for every
 *                   per-device weight and the read or write bps throttle
 *                   attribute for every per-device bandwidth entry
 *  - CGROUP_MEMORY: memory.limit_in_bytes (the configured value, or "-1")
 *  - CGROUP_DEVICE: resets the device list, then whitelists devices
 *
 * The function returns void: failures of individual writes are only logged
 * and are not reported to the caller.
 *
 * NOTE(review): the "if (r < 0)" and "else" lines are missing from this
 * listing, so which log call belongs to which branch is inferred from the
 * surrounding statements. The blkio.weight_device value is formatted
 * without a trailing "\n", unlike the other attribute values written
 * here - confirm whether that is intended. */
194 void cgroup_context_apply(CGroupContext *c, CGroupControllerMask mask, const char *path) {
203 if (mask & CGROUP_CPU) {
204 char buf[DECIMAL_STR_MAX(unsigned long) + 1];
206 sprintf(buf, "%lu\n", c->cpu_shares);
207 r = cg_set_attribute("cpu", path, "cpu.shares", buf);
209 log_warning("Failed to set cpu.shares on %s: %s", path, strerror(-r));
212 if (mask & CGROUP_BLKIO) {
/* One buffer is shared by the three values formatted below, so it is sized
 * for the largest of them. */
213 char buf[MAX3(DECIMAL_STR_MAX(unsigned long)+1,
214 DECIMAL_STR_MAX(dev_t)*2+2+DECIMAL_STR_MAX(unsigned long)*1,
215 DECIMAL_STR_MAX(dev_t)*2+2+DECIMAL_STR_MAX(uint64_t)+1)];
216 CGroupBlockIODeviceWeight *w;
217 CGroupBlockIODeviceBandwidth *b;
219 sprintf(buf, "%lu\n", c->blockio_weight);
220 r = cg_set_attribute("blkio", path, "blkio.weight", buf);
222 log_warning("Failed to set blkio.weight on %s: %s", path, strerror(-r));
224 /* FIXME: no way to reset this list */
225 LIST_FOREACH(device_weights, w, c->blockio_device_weights) {
/* The configured path is translated to a device number first; the kernel
 * attribute is written as "major:minor weight". */
228 r = lookup_blkio_device(w->path, &dev);
232 sprintf(buf, "%u:%u %lu", major(dev), minor(dev), w->weight);
233 r = cg_set_attribute("blkio", path, "blkio.weight_device", buf);
235 log_error("Failed to set blkio.weight_device on %s: %s", path, strerror(-r));
238 /* FIXME: no way to reset this list */
239 LIST_FOREACH(device_bandwidths, b, c->blockio_device_bandwidths) {
243 r = lookup_blkio_device(b->path, &dev);
/* b->read selects between the read and the write throttle attribute. */
247 a = b->read ? "blkio.throttle.read_bps_device" : "blkio.throttle.write_bps_device";
249 sprintf(buf, "%u:%u %" PRIu64 "\n", major(dev), minor(dev), b->bandwidth);
250 r = cg_set_attribute("blkio", path, a, buf);
252 log_error("Failed to set %s on %s: %s", a, path, strerror(-r));
256 if (mask & CGROUP_MEMORY) {
/* (uint64_t) -1 is the "no limit configured" marker set by
 * cgroup_context_init(); in that case the literal "-1" is written. */
257 if (c->memory_limit != (uint64_t) -1) {
258 char buf[DECIMAL_STR_MAX(uint64_t) + 1];
260 sprintf(buf, "%" PRIu64 "\n", c->memory_limit);
261 r = cg_set_attribute("memory", path, "memory.limit_in_bytes", buf);
263 r = cg_set_attribute("memory", path, "memory.limit_in_bytes", "-1");
266 log_error("Failed to set memory.limit_in_bytes on %s: %s", path, strerror(-r));
269 if (mask & CGROUP_DEVICE) {
270 CGroupDeviceAllow *a;
/* Reset step: with an explicit allow list or a non-AUTO policy everything
 * is denied first ("a" to devices.deny); the other write puts "a" into
 * devices.allow. */
272 if (c->device_allow || c->device_policy != CGROUP_AUTO)
273 r = cg_set_attribute("devices", path, "devices.deny", "a");
275 r = cg_set_attribute("devices", path, "devices.allow", "a");
277 log_error("Failed to reset devices.list on %s: %s", path, strerror(-r));
/* For CLOSED, and for AUTO with an explicit allow list, a built-in table of
 * (device node, access string) pairs is whitelisted as well. */
279 if (c->device_policy == CGROUP_CLOSED ||
280 (c->device_policy == CGROUP_AUTO && c->device_allow)) {
281 static const char auto_devices[] =
285 "/dev/random\0" "rw\0"
286 "/dev/urandom\0" "rw\0";
290 NULSTR_FOREACH_PAIR(x, y, auto_devices)
291 whitelist_device(path, x, y);
/* Finally every explicitly configured entry is whitelisted. */
294 LIST_FOREACH(device_allow, a, c->device_allow) {
309 whitelist_device(path, a->path, acc);
/* Works out which controllers the settings in c call for, starting from an
 * empty mask:
 *  - CGROUP_CPUACCT and CGROUP_CPU when CPU accounting is on or cpu_shares
 *    is not 1024;
 *  - CGROUP_BLKIO when block-IO accounting is on, blockio_weight is not
 *    1000, or any per-device weight or bandwidth entry exists;
 *  - CGROUP_MEMORY when memory accounting is on or memory_limit is not
 *    (uint64_t) -1;
 *  - CGROUP_DEVICE when there is a device_allow entry or the device policy
 *    is not CGROUP_AUTO.
 *
 * NOTE(review): the return statement is on a line missing from this
 * listing; the accumulated mask is presumably what is returned. */
314 CGroupControllerMask cgroup_context_get_mask(CGroupContext *c) {
315 CGroupControllerMask mask = 0;
317 /* Figure out which controllers we need */
319 if (c->cpu_accounting || c->cpu_shares != 1024)
320 mask |= CGROUP_CPUACCT | CGROUP_CPU;
322 if (c->blockio_accounting ||
323 c->blockio_weight != 1000 ||
324 c->blockio_device_weights ||
325 c->blockio_device_bandwidths)
326 mask |= CGROUP_BLKIO;
328 if (c->memory_accounting ||
329 c->memory_limit != (uint64_t) -1)
330 mask |= CGROUP_MEMORY;
332 if (c->device_allow || c->device_policy != CGROUP_AUTO)
333 mask |= CGROUP_DEVICE;
/* Returns the controller mask needed by the unit's own cgroup settings: the
 * unit's CGroupContext is fetched with unit_get_cgroup_context() and passed
 * to cgroup_context_get_mask().
 *
 * NOTE(review): lines between the two visible statements are missing from
 * this listing; presumably they handle units without a cgroup context -
 * confirm against the full source. */
338 CGroupControllerMask unit_get_cgroup_mask(Unit *u) {
341 c = unit_get_cgroup_context(u);
345 return cgroup_context_get_mask(c);
/* Returns the combined controller mask of everything contained in u. The
 * result is cached in u->cgroup_members_mask and reused while
 * u->cgroup_members_mask_valid is set.
 *
 * When the cache is not valid the mask is rebuilt from zero. Only slice
 * units collect anything: the function walks u->dependencies[UNIT_BEFORE],
 * compares each member's slice against u, and ORs in both the member's own
 * mask and, recursively, the member's members mask. The cache is then
 * marked valid. */
348 CGroupControllerMask unit_get_members_mask(Unit *u) {
/* Fast path: cached result from an earlier call. */
351 if (u->cgroup_members_mask_valid)
352 return u->cgroup_members_mask;
354 u->cgroup_members_mask = 0;
356 if (u->type == UNIT_SLICE) {
360 SET_FOREACH(member, u->dependencies[UNIT_BEFORE], i) {
/* NOTE(review): the statement guarded by this comparison is not visible in
 * this listing; presumably units that are ordered after u but do not have
 * u as their slice are skipped. */
365 if (UNIT_DEREF(member->slice) != u)
368 u->cgroup_members_mask |=
369 unit_get_cgroup_mask(member) |
370 unit_get_members_mask(member);
374 u->cgroup_members_mask_valid = true;
375 return u->cgroup_members_mask;
/* Returns the controller mask that u has to take over from its siblings.
 * When u has a slice, the starting point is the members mask of that slice;
 * the other visible assignment uses u's own mask combined with its members
 * mask. The result is then reduced to CGROUP_CPU, CGROUP_BLKIO and
 * CGROUP_CPUACCT.
 *
 * NOTE(review): the "else" between the two assignments is on a line missing
 * from this listing. */
378 CGroupControllerMask unit_get_siblings_mask(Unit *u) {
379 CGroupControllerMask m;
383 if (UNIT_ISSET(u->slice))
384 m = unit_get_members_mask(UNIT_DEREF(u->slice));
386 m = unit_get_cgroup_mask(u) | unit_get_members_mask(u);
388 /* Sibling propagation is only relevant for weight-based
389 * controllers, so let's mask out everything else */
390 return m & (CGROUP_CPU|CGROUP_BLKIO|CGROUP_CPUACCT);
/* Computes the controller mask that should be realized for u: the union of
 * the unit's own mask, its members mask and its siblings mask, restricted
 * to the controllers the manager reports as supported
 * (u->manager->cgroup_supported).
 *
 * NOTE(review): the return statement is on a line missing from this
 * listing; mask is presumably what is returned. */
393 CGroupControllerMask unit_get_target_mask(Unit *u) {
394 CGroupControllerMask mask;
396 mask = unit_get_cgroup_mask(u) | unit_get_members_mask(u) | unit_get_siblings_mask(u);
397 mask &= u->manager->cgroup_supported;
402 /* Recurse from a unit up through its containing slices, propagating
403 * mask bits upward. A unit is also member of itself.
 *
 * The subtree mask of u (its own mask combined with its members mask) is
 * recomputed and compared with the cached u->cgroup_subtree_mask. The cache
 * is then updated and, when u has a slice, the slice's members mask is
 * either extended with the new bits or marked invalid so that it is
 * recomputed later; afterwards the function calls itself on the slice.
 *
 * The three-part condition below is true only when a valid previous mask
 * exists, the new mask has bits the old one lacked, and no old bit was
 * dropped.
 *
 * NOTE(review): the early return, the variable that receives the three-part
 * condition and the if/else around the two slice updates are on lines
 * missing from this listing; the pairing of branches described above
 * follows the inline comments. */
404 void unit_update_cgroup_members_masks(Unit *u) {
405 CGroupControllerMask m;
410 /* Calculate subtree mask */
411 m = unit_get_cgroup_mask(u) | unit_get_members_mask(u);
413 /* See if anything changed from the previous invocation. If
414 * not, we're done. */
415 if (u->cgroup_subtree_mask_valid && m == u->cgroup_subtree_mask)
419 u->cgroup_subtree_mask_valid &&
420 ((m & ~u->cgroup_subtree_mask) != 0) &&
421 ((~m & u->cgroup_subtree_mask) == 0);
423 u->cgroup_subtree_mask = m;
424 u->cgroup_subtree_mask_valid = true;
426 if (UNIT_ISSET(u->slice)) {
427 Unit *s = UNIT_DEREF(u->slice);
430 /* There's more set now than before. We
431 * propagate the new mask to the parent's mask
432 * (not caring if it actually was valid or
435 s->cgroup_members_mask |= m;
438 /* There's less set now than before (or we
439 * don't know), we need to recalculate
440 * everything, so let's invalidate the
441 * parent's members mask */
443 s->cgroup_members_mask_valid = false;
445 /* And now make sure that this change also hits our
447 unit_update_cgroup_members_masks(s);
/* Callback handed to cg_migrate_everywhere() by unit_create_cgroups().
 * For the controller set given in mask it returns the cgroup path of a unit
 * that has a cgroup path, is marked realized, and has every bit of mask in
 * its realized mask. If the current unit does not qualify, the search moves
 * on to the unit's slice.
 *
 * NOTE(review): the loop construct, the derivation of u from userdata and
 * the fallback return are on lines missing from this listing. */
451 static const char *migrate_callback(CGroupControllerMask mask, void *userdata) {
458 if (u->cgroup_path &&
459 u->cgroup_realized &&
460 (u->cgroup_realized_mask & mask) == mask)
461 return u->cgroup_path;
463 u = UNIT_DEREF(u->slice);
/* Creates the cgroup of u for the controllers in mask. Visible steps:
 *  1. obtain the default cgroup path of the unit and register it in the
 *     manager's cgroup_unit hashmap (a failure is logged, with a dedicated
 *     message for -EEXIST), then store the path in u->cgroup_path;
 *  2. create the group with cg_create_everywhere();
 *  3. record that the unit is realized, and with which mask;
 *  4. call cg_migrate_everywhere() with u->cgroup_path as both path
 *     arguments and migrate_callback() as the callback; a failure here is
 *     only logged as a warning.
 *
 * NOTE(review): path is declared _cleanup_free_ and is assigned to
 * u->cgroup_path, so ownership has to be handed over (path reset to NULL)
 * to avoid freeing the stored string; that line, the guard around step 1
 * and the return statements are not visible in this listing - confirm
 * against the full source. The migrate warning text reads "from to %s",
 * which looks like a leftover word. */
469 static int unit_create_cgroups(Unit *u, CGroupControllerMask mask) {
470 _cleanup_free_ char *path = NULL;
475 path = unit_default_cgroup_path(u);
479 r = hashmap_put(u->manager->cgroup_unit, path, u);
481 log_error(r == -EEXIST ? "cgroup %s exists already: %s" : "hashmap_put failed for %s: %s", path, strerror(-r));
485 u->cgroup_path = path;
489 /* First, create our own group */
490 r = cg_create_everywhere(u->manager->cgroup_supported, mask, u->cgroup_path);
492 log_error("Failed to create cgroup %s: %s", u->cgroup_path, strerror(-r));
496 /* Keep track that this is now realized */
497 u->cgroup_realized = true;
498 u->cgroup_realized_mask = mask;
500 /* Then, possibly move things over */
501 r = cg_migrate_everywhere(u->manager->cgroup_supported, u->cgroup_path, u->cgroup_path, migrate_callback, u);
503 log_warning("Failed to migrate cgroup from to %s: %s", u->cgroup_path, strerror(-r));
/* Tells whether the cgroup of u is already realized with exactly the given
 * controller mask: the unit must be marked realized and its realized mask
 * must equal mask (a superset does not count). */
508 static bool unit_has_mask_realized(Unit *u, CGroupControllerMask mask) {
511 return u->cgroup_realized && u->cgroup_realized_mask == mask;
514 /* Check if necessary controllers and attributes for a unit are in place.
517 * If not, create paths, move processes over, and set attributes.
519 * Returns 0 on success and < 0 on failure.
 *
 * Order of work as visible here: the unit is first taken off the manager's
 * cgroup queue if it is on it; the target mask is computed and compared
 * with what is already realized; the slice of the unit, if any, is realized
 * by a recursive call before the unit itself; then the cgroup is created
 * with unit_create_cgroups() and the unit's settings are written with
 * cgroup_context_apply().
 *
 * NOTE(review): the return statements and the checks of r are on lines
 * missing from this listing. */
520 static int unit_realize_cgroup_now(Unit *u) {
521 CGroupControllerMask mask;
526 if (u->in_cgroup_queue) {
527 LIST_REMOVE(cgroup_queue, u->manager->cgroup_queue, u);
528 u->in_cgroup_queue = false;
531 mask = unit_get_target_mask(u);
533 if (unit_has_mask_realized(u, mask))
536 /* First, realize parents */
537 if (UNIT_ISSET(u->slice)) {
538 r = unit_realize_cgroup_now(UNIT_DEREF(u->slice));
543 /* And then do the real work */
544 r = unit_create_cgroups(u, mask);
548 /* Finally, apply the necessary attributes. */
549 cgroup_context_apply(unit_get_cgroup_context(u), mask, u->cgroup_path);
/* Puts u at the head of the manager's cgroup queue and flags it as queued.
 * The in_cgroup_queue flag is checked first so that a unit is not linked
 * into the list twice.
 *
 * NOTE(review): the statement guarded by the check (presumably an early
 * return) is on a line missing from this listing. */
554 static void unit_add_to_cgroup_queue(Unit *u) {
556 if (u->in_cgroup_queue)
559 LIST_PREPEND(cgroup_queue, u->manager->cgroup_queue, u);
560 u->in_cgroup_queue = true;
/* Works through the manager's cgroup queue: as long as the queue has a
 * head, that unit is realized with unit_realize_cgroup_now(), which also
 * takes it off the queue. A failure for one unit is logged as a warning
 * that names the unit.
 *
 * NOTE(review): the counter behind the unsigned return value and the return
 * statement are on lines missing from this listing. */
563 unsigned manager_dispatch_cgroup_queue(Manager *m) {
568 while ((i = m->cgroup_queue)) {
569 assert(i->in_cgroup_queue);
571 r = unit_realize_cgroup_now(i);
573 log_warning("Failed to realize cgroups for queued unit %s: %s", i->id, strerror(-r));
/* Queues sibling units for cgroup realization. Starting at u, the function
 * follows the chain of slices; for each slice it walks
 * slice->dependencies[UNIT_BEFORE] and runs three checks on every entry m
 * (is the slice really the entry's slice, is the entry inactive or failed,
 * is its target mask already realized) before calling
 * unit_add_to_cgroup_queue() on it.
 *
 * NOTE(review): the statements guarded by the three checks (presumably
 * "continue"), the step that moves u up to the slice and the end of the
 * last inline comment are on lines missing from this listing. */
581 static void unit_queue_siblings(Unit *u) {
584 /* This adds the siblings of the specified unit and the
585 * siblings of all parent units to the cgroup queue. (But
586 * neither the specified unit itself nor the parents.) */
588 while ((slice = UNIT_DEREF(u->slice))) {
592 SET_FOREACH(m, slice->dependencies[UNIT_BEFORE], i) {
596 /* Skip units that have a dependency on the slice
597 * but aren't actually in it. */
598 if (UNIT_DEREF(m->slice) != slice)
601 /* No point in doing cgroup application for units
602 * without active processes. */
603 if (UNIT_IS_INACTIVE_OR_FAILED(unit_active_state(m)))
606 /* If the unit doesn't need any new controllers
607 * and has current ones realized, it doesn't need
609 if (unit_has_mask_realized(m, unit_get_target_mask(m)))
612 unit_add_to_cgroup_queue(m);
/* Entry point for realizing the cgroup of u. The unit's siblings are put on
 * the cgroup queue with unit_queue_siblings(), and u itself is then
 * realized immediately; the return value is that of
 * unit_realize_cgroup_now().
 *
 * NOTE(review): the lines following the unit_get_cgroup_context() call are
 * missing from this listing; presumably units without a cgroup context are
 * handled there - confirm against the full source. */
619 int unit_realize_cgroup(Unit *u) {
624 c = unit_get_cgroup_context(u);
628 /* So, here's the deal: when realizing the cgroups for this
629 * unit, we need to first create all parents, but there's more
630 * actually: for the weight-based controllers we also need to
631 * make sure that all our siblings (i.e. units that are in the
632 * same slice as we are) have cgroups, too. Otherwise things
633 * would become very uneven as each of their processes would
634 * get as much resources as all our group together. This call
635 * will synchronously create the parent cgroups, but will
636 * defer work on the siblings to the next event loop
639 /* Add all sibling slices to the cgroup queue. */
640 unit_queue_siblings(u);
642 /* And realize this one now (and apply the values) */
643 return unit_realize_cgroup_now(u);
/* Removes the cgroup of u and forgets about it. cg_trim_everywhere() is
 * called for the unit's cgroup path; its last argument is false only for
 * the unit named SPECIAL_ROOT_SLICE. A failure is logged at debug level and
 * does not stop the cleanup: the path is removed from the manager's
 * cgroup_unit hashmap and freed, and the realized flag and realized mask
 * are reset.
 *
 * NOTE(review): lines before the trim call are missing from this listing;
 * presumably they return early when the unit has no cgroup path. */
646 void unit_destroy_cgroup(Unit *u) {
654 r = cg_trim_everywhere(u->manager->cgroup_supported, u->cgroup_path, !unit_has_name(u, SPECIAL_ROOT_SLICE));
656 log_debug("Failed to destroy cgroup %s: %s", u->cgroup_path, strerror(-r));
658 hashmap_remove(u->manager->cgroup_unit, u->cgroup_path);
660 free(u->cgroup_path);
661 u->cgroup_path = NULL;
662 u->cgroup_realized = false;
663 u->cgroup_realized_mask = 0;
/* Looks for the main process of u among the processes listed in the unit's
 * cgroup (SYSTEMD_CGROUP_CONTROLLER hierarchy). The PIDs are read one at a
 * time with cg_read_pid(); a process whose parent could be determined and
 * is not mypid is ignored. According to the inline comment, finding more
 * than one remaining candidate means the main process cannot be identified.
 * pid starts out as 0.
 *
 * NOTE(review): the assignment of mypid, the handling of candidates and the
 * return statements are on lines missing from this listing, so the exact
 * return value in each case cannot be documented from here. */
667 pid_t unit_search_main_pid(Unit *u) {
668 _cleanup_fclose_ FILE *f = NULL;
669 pid_t pid = 0, npid, mypid;
676 if (cg_enumerate_processes(SYSTEMD_CGROUP_CONTROLLER, u->cgroup_path, &f) < 0)
680 while (cg_read_pid(f, &npid) > 0) {
686 /* Ignore processes that aren't our kids */
687 if (get_parent_of_pid(npid, &ppid) >= 0 && ppid != mypid)
691 /* Dang, there's more than one daemonized PID
692 in this group, so we don't know what process
693 is the main process. */
/* Sets up the cgroup state of the manager m. Steps visible in this listing:
 *  0. warn when /sys/fs/cgroup/systemd is not a mount point;
 *  1. drop any previous m->cgroup_root and query the cgroup path for pid 0
 *     (presumably the calling process) into it; look for a legacy suffix
 *     ("/" SPECIAL_SYSTEM_SLICE, or "/system") at its end; store the root
 *     "/" as an empty string;
 *  -. resolve the file system path of that cgroup and log it;
 *  3. for the system manager, install the release agent;
 *  4. create the root cgroup and attach to it;
 *  5. keep a directory file descriptor open on the hierarchy in
 *     m->pin_cgroupfs_fd, closing a previously held one first;
 *  6. record the supported controllers in m->cgroup_supported;
 *  7. write "1" to memory.use_hierarchy in the root of the memory
 *     hierarchy; the result of that write is not checked.
 *
 * NOTE(review): the checks of r, the return statements, the code that cuts
 * off the legacy suffix and the end of one inline comment are on lines
 * missing from this listing. */
704 int manager_setup_cgroup(Manager *m) {
705 _cleanup_free_ char *path = NULL;
711 /* 0. Be nice to Ingo Molnar #628004 */
712 if (path_is_mount_point("/sys/fs/cgroup/systemd", false) <= 0) {
713 log_warning("No control group support available, not creating root group.");
717 /* 1. Determine hierarchy */
718 free(m->cgroup_root);
719 m->cgroup_root = NULL;
721 r = cg_pid_get_path(SYSTEMD_CGROUP_CONTROLLER, 0, &m->cgroup_root);
723 log_error("Cannot determine cgroup we are running in: %s", strerror(-r));
727 /* LEGACY: Already in /system.slice? If so, let's cut this
728 * off. This is to support live upgrades from older systemd
729 * versions where PID 1 was moved there. */
730 if (m->running_as == SYSTEMD_SYSTEM) {
731 e = endswith(m->cgroup_root, "/" SPECIAL_SYSTEM_SLICE);
733 e = endswith(m->cgroup_root, "/system");
738 /* And make sure to store away the root value without trailing
739 * slash, even for the root dir, so that we can easily prepend
741 if (streq(m->cgroup_root, "/"))
742 m->cgroup_root[0] = 0;
745 r = cg_get_path(SYSTEMD_CGROUP_CONTROLLER, m->cgroup_root, NULL, &path);
747 log_error("Cannot find cgroup mount point: %s", strerror(-r));
751 log_debug("Using cgroup controller " SYSTEMD_CGROUP_CONTROLLER ". File system hierarchy is at %s.", path);
753 /* 3. Install agent */
754 if (m->running_as == SYSTEMD_SYSTEM) {
755 r = cg_install_release_agent(SYSTEMD_CGROUP_CONTROLLER, SYSTEMD_CGROUP_AGENT_PATH);
757 log_warning("Failed to install release agent, ignoring: %s", strerror(-r));
759 log_debug("Installed release agent.");
761 log_debug("Release agent already installed.");
764 /* 4. Make sure we are in the root cgroup */
765 r = cg_create_and_attach(SYSTEMD_CGROUP_CONTROLLER, m->cgroup_root, 0);
767 log_error("Failed to create root cgroup hierarchy: %s", strerror(-r));
771 /* 5. And pin it, so that it cannot be unmounted */
772 if (m->pin_cgroupfs_fd >= 0)
773 close_nointr_nofail(m->pin_cgroupfs_fd);
775 m->pin_cgroupfs_fd = open(path, O_RDONLY|O_CLOEXEC|O_DIRECTORY|O_NOCTTY|O_NONBLOCK);
777 log_error("Failed to open pin file: %m");
781 /* 6. Figure out which controllers are supported */
782 m->cgroup_supported = cg_mask_supported();
784 /* 7. Always enable hierarchical support if it exists... */
785 cg_set_attribute("memory", "/", "memory.use_hierarchy", "1");
/* Releases the cgroup state held by the manager m. When delete is true and
 * a cgroup root is known, the root cgroup is trimmed with cg_trim() (last
 * argument false). The pinning file descriptor is closed and reset to -1 if
 * it is open, and m->cgroup_root is freed and cleared. */
790 void manager_shutdown_cgroup(Manager *m, bool delete) {
793 /* We can't really delete the group, since we are in it. But
795 if (delete && m->cgroup_root)
796 cg_trim(SYSTEMD_CGROUP_CONTROLLER, m->cgroup_root, false);
798 if (m->pin_cgroupfs_fd >= 0) {
799 close_nointr_nofail(m->pin_cgroupfs_fd);
800 m->pin_cgroupfs_fd = -1;
803 free(m->cgroup_root);
804 m->cgroup_root = NULL;
/* Maps a cgroup path to the unit registered for it in m->cgroup_unit. The
 * path is looked up as given first; a second lookup uses a path p that is
 * derived on lines missing from this listing.
 *
 * NOTE(review): how p is built (presumably by shortening the path one
 * component at a time), the loop around the second lookup and the return
 * statements are not visible here - confirm against the full source. */
807 Unit* manager_get_unit_by_cgroup(Manager *m, const char *cgroup) {
814 u = hashmap_get(m->cgroup_unit, cgroup);
828 u = hashmap_get(m->cgroup_unit, p);
/* Finds the unit a process belongs to: the cgroup path of pid in the
 * SYSTEMD_CGROUP_CONTROLLER hierarchy is queried and handed to
 * manager_get_unit_by_cgroup(). The path string is released automatically
 * when the function returns (_cleanup_free_).
 *
 * NOTE(review): the handling of a failing cg_pid_get_path() is on lines
 * missing from this listing. */
834 Unit *manager_get_unit_by_pid(Manager *m, pid_t pid) {
835 _cleanup_free_ char *cgroup = NULL;
843 r = cg_pid_get_path(SYSTEMD_CGROUP_CONTROLLER, pid, &cgroup);
847 return manager_get_unit_by_cgroup(m, cgroup);
/* Handles the notification that the given cgroup has run empty. The owning
 * unit is looked up with manager_get_unit_by_cgroup(), its cgroup is
 * checked with cg_is_empty_recursive(), the unit type's notify_cgroup_empty
 * hook is called when the vtable provides one, and the unit is added to the
 * garbage-collection queue.
 *
 * NOTE(review): the conditions that connect these steps (unit found, cgroup
 * actually empty) and the return statements are on lines missing from this
 * listing. */
850 int manager_notify_cgroup_empty(Manager *m, const char *cgroup) {
857 u = manager_get_unit_by_cgroup(m, cgroup);
859 r = cg_is_empty_recursive(SYSTEMD_CGROUP_CONTROLLER, u->cgroup_path, true);
861 if (UNIT_VTABLE(u)->notify_cgroup_empty)
862 UNIT_VTABLE(u)->notify_cgroup_empty(u);
864 unit_add_to_gc_queue(u);
/* Names of the CGroupDevicePolicy values, indexed by the enum value. The
 * DEFINE_STRING_TABLE_LOOKUP() line below presumably generates the lookup
 * helpers over this table, including the
 * cgroup_device_policy_to_string() call used by cgroup_context_dump() -
 * the macro itself is defined outside this file. */
871 static const char* const cgroup_device_policy_table[_CGROUP_DEVICE_POLICY_MAX] = {
872 [CGROUP_AUTO] = "auto",
873 [CGROUP_CLOSED] = "closed",
874 [CGROUP_STRICT] = "strict",
877 DEFINE_STRING_TABLE_LOOKUP(cgroup_device_policy, CGroupDevicePolicy);