1 /*-*- Mode: C; c-basic-offset: 8; indent-tabs-mode: nil -*-*/
4 This file is part of systemd.
6 Copyright 2013 Lennart Poettering
8 systemd is free software; you can redistribute it and/or modify it
9 under the terms of the GNU Lesser General Public License as published by
10 the Free Software Foundation; either version 2.1 of the License, or
11 (at your option) any later version.
13 systemd is distributed in the hope that it will be useful, but
14 WITHOUT ANY WARRANTY; without even the implied warranty of
15 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
16 Lesser General Public License for more details.
18 You should have received a copy of the GNU Lesser General Public License
19 along with systemd; If not, see <http://www.gnu.org/licenses/>.
25 #include "path-util.h"
27 #include "cgroup-util.h"
/* Fill in the non-zero defaults of a CGroupContext. In this view the
 * function assigns memory_limit = (uint64_t) -1 and blockio_weight = 1000,
 * the same values cgroup_context_get_mask() compares against.
 * NOTE(review): the embedded numbering skips lines here, so additional
 * statements (and the closing brace) are not visible -- confirm. */
30 void cgroup_context_init(CGroupContext *c) {
33 /* Initialize everything to the kernel defaults, assuming the
34 * structure is preinitialized to 0 */
37 c->memory_limit = (uint64_t) -1;
38 c->blockio_weight = 1000;
/* Unlink entry a from the device_allow list of context c.
 * NOTE(review): the name suggests a is also freed, but that code is not
 * visible in this view -- confirm. */
41 void cgroup_context_free_device_allow(CGroupContext *c, CGroupDeviceAllow *a) {
45 LIST_REMOVE(device_allow, c->device_allow, a);
/* Unlink entry w from the blockio_device_weights list of context c.
 * NOTE(review): the name suggests w is also freed, but that code is not
 * visible in this view -- confirm. */
50 void cgroup_context_free_blockio_device_weight(CGroupContext *c, CGroupBlockIODeviceWeight *w) {
54 LIST_REMOVE(device_weights, c->blockio_device_weights, w);
/* Unlink entry b from the blockio_device_bandwidths list of context c.
 * NOTE(review): the name suggests b is also freed, but that code is not
 * visible in this view -- confirm. */
59 void cgroup_context_free_blockio_device_bandwidth(CGroupContext *c, CGroupBlockIODeviceBandwidth *b) {
63 LIST_REMOVE(device_bandwidths, c->blockio_device_bandwidths, b);
/* Empty the three per-context lists (block IO device weights, block IO
 * device bandwidths, device allow entries). Each loop hands the current
 * list head to the matching cgroup_context_free_*() helper, which unlinks
 * it, so the loop ends once the list head is NULL. */
68 void cgroup_context_done(CGroupContext *c) {
71 while (c->blockio_device_weights)
72 cgroup_context_free_blockio_device_weight(c, c->blockio_device_weights);
74 while (c->blockio_device_bandwidths)
75 cgroup_context_free_blockio_device_bandwidth(c, c->blockio_device_bandwidths);
77 while (c->device_allow)
78 cgroup_context_free_device_allow(c, c->device_allow);
/* Produce a "Key=value" dump of context c, every line starting with prefix
 * (passed through strempty() first). The scalar settings come first,
 * followed by one line per device_allow entry, per block IO device weight
 * and per block IO device bandwidth (bandwidths are rendered through
 * format_bytes()).
 * NOTE(review): the calls that actually write to f are not visible in this
 * view; presumably fprintf(f, ...) -- confirm. */
81 void cgroup_context_dump(CGroupContext *c, FILE* f, const char *prefix) {
82 CGroupBlockIODeviceBandwidth *b;
83 CGroupBlockIODeviceWeight *w;
89 prefix = strempty(prefix);
92 "%sCPUAccounting=%s\n"
93 "%sBlockIOAccounting=%s\n"
94 "%sMemoryAccounting=%s\n"
96 "%sBlockIOWeight=%lu\n"
97 "%sMemoryLimit=%" PRIu64 "\n"
98 "%sDevicePolicy=%s\n",
99 prefix, yes_no(c->cpu_accounting),
100 prefix, yes_no(c->blockio_accounting),
101 prefix, yes_no(c->memory_accounting),
102 prefix, c->cpu_shares,
103 prefix, c->blockio_weight,
104 prefix, c->memory_limit,
105 prefix, cgroup_device_policy_to_string(c->device_policy));
107 LIST_FOREACH(device_allow, a, c->device_allow)
109 "%sDeviceAllow=%s %s%s%s\n",
112 a->r ? "r" : "", a->w ? "w" : "", a->m ? "m" : "");
114 LIST_FOREACH(device_weights, w, c->blockio_device_weights)
/* NOTE(review): unlike the other format strings in this function, the one
 * below has no trailing "\n", so consecutive entries would run together on
 * one output line -- looks like an oversight, confirm. */
116 "%sBlockIODeviceWeight=%s %lu",
121 LIST_FOREACH(device_bandwidths, b, c->blockio_device_bandwidths) {
122 char buf[FORMAT_BYTES_MAX];
127 b->read ? "BlockIOReadBandwidth" : "BlockIOWriteBandwidth",
129 format_bytes(buf, sizeof(buf), b->bandwidth));
/* Resolve path p to a block device number, returned through *dev.
 * Visible cases: p is itself a block device node (S_ISBLK); otherwise, if
 * the file lives on a device with a non-zero major, that backing device is
 * used and block_get_whole_disk() is then applied to *dev; otherwise a
 * warning is logged. A failed stat is logged with errno (%m).
 * NOTE(review): the assignments to *dev and the return statements are not
 * visible in this view. */
133 static int lookup_blkio_device(const char *p, dev_t *dev) {
142 log_warning("Couldn't stat device %s: %m", p);
146 if (S_ISBLK(st.st_mode))
148 else if (major(st.st_dev) != 0) {
149 /* If this is not a device node then find the block
150 * device this file is stored on */
153 /* If this is a partition, try to get the originating
155 block_get_whole_disk(*dev, dev);
157 log_warning("%s is not a block device and file system block device cannot be determined or is not local.", p);
/* Allow access to a single device node in the "devices" controller of the
 * cgroup at path. The node is stat()ed and must be a character or block
 * device; its type letter ('c' or 'b') and the major/minor of st_rdev are
 * formatted (the formatting call itself is not visible here, presumably
 * into buf together with acc) and written to devices.allow.
 * Callers in this file pass access strings such as "rwm" or "rw" as acc.
 * NOTE(review): the stat() failure warning below does not include ": %m",
 * unlike the equivalent message in lookup_blkio_device(), so the errno
 * reason is not logged -- confirm whether that is intended. */
164 static int whitelist_device(const char *path, const char *node, const char *acc) {
165 char buf[2+DECIMAL_STR_MAX(dev_t)*2+2+4];
172 if (stat(node, &st) < 0) {
173 log_warning("Couldn't stat device %s", node);
177 if (!S_ISCHR(st.st_mode) && !S_ISBLK(st.st_mode)) {
178 log_warning("%s is not a device.", node);
184 S_ISCHR(st.st_mode) ? 'c' : 'b',
185 major(st.st_rdev), minor(st.st_rdev),
188 r = cg_set_attribute("devices", path, "devices.allow", buf);
190 log_warning("Failed to set devices.allow on %s: %s", path, strerror(-r));
/* Allow access to every device major whose driver name matches name, for
 * the cgroup at path. type selects the section of /proc/devices to use and
 * must be 'b' or 'c' (asserted).
 *
 * The file is read line by line; the "Character devices:" and
 * "Block devices:" headings are recognised to tell which section a line
 * belongs to. For an entry line the leading number is parsed with
 * safe_atou(), the text after the following whitespace is matched against
 * name with fnmatch() (so name may be a glob pattern, e.g. "kdbus/" plus a
 * wildcard as used by the caller), and matching entries are written to
 * devices.allow. A read error jumps to the fail label, which logs it.
 * NOTE(review): the construction of buf and the return values are not
 * visible in this view. */
195 static int whitelist_major(const char *path, const char *name, char type, const char *acc) {
196 _cleanup_fclose_ FILE *f = NULL;
203 assert(type == 'b' || type == 'c');
205 f = fopen("/proc/devices", "re");
207 log_warning("Cannot open /proc/devices to resolve %s (%c): %m", name, type);
211 FOREACH_LINE(line, f, goto fail) {
212 char buf[2+DECIMAL_STR_MAX(unsigned)+3+4], *p, *w;
217 if (type == 'c' && streq(line, "Character devices:")) {
222 if (type == 'b' && streq(line, "Block devices:")) {
237 w = strpbrk(p, WHITESPACE);
242 r = safe_atou(p, &maj);
249 w += strspn(w, WHITESPACE);
251 if (fnmatch(name, w, 0) != 0)
260 r = cg_set_attribute("devices", path, "devices.allow", buf);
262 log_warning("Failed to set devices.allow on %s: %s", path, strerror(-r));
268 log_warning("Failed to read /proc/devices: %m");
/* Write the settings of context c into the cgroup at path, for each
 * controller whose bit is set in mask:
 *
 *   CGROUP_CPU    -> cpu.shares (skipped for the root cgroup)
 *   CGROUP_BLKIO  -> blkio.weight, plus one blkio.weight_device write per
 *                    device weight and one blkio.throttle.read_bps_device
 *                    or blkio.throttle.write_bps_device write per bandwidth
 *   CGROUP_MEMORY -> memory.limit_in_bytes ("-1" is written on the path
 *                    that does not format c->memory_limit)
 *   CGROUP_DEVICE -> devices.deny / devices.allow (skipped for the root
 *                    cgroup), followed by the whitelist entries
 *
 * The function returns void; failures of individual writes are only
 * logged. NOTE(review): several condition and else lines are not visible
 * in this view, so the exact branch structure should be confirmed. */
272 void cgroup_context_apply(CGroupContext *c, CGroupControllerMask mask, const char *path) {
282 /* Some cgroup attributes are not supported on the root cgroup,
283 * hence silently ignore */
284 is_root = isempty(path) || path_equal(path, "/");
286 if ((mask & CGROUP_CPU) && !is_root) {
287 char buf[DECIMAL_STR_MAX(unsigned long) + 1];
289 sprintf(buf, "%lu\n", c->cpu_shares);
290 r = cg_set_attribute("cpu", path, "cpu.shares", buf);
292 log_warning("Failed to set cpu.shares on %s: %s", path, strerror(-r));
295 if (mask & CGROUP_BLKIO) {
/* One buffer sized for the largest of the three strings formatted below:
 * the plain weight, "major:minor weight" and "major:minor bandwidth". */
296 char buf[MAX3(DECIMAL_STR_MAX(unsigned long)+1,
297 DECIMAL_STR_MAX(dev_t)*2+2+DECIMAL_STR_MAX(unsigned long)*1,
298 DECIMAL_STR_MAX(dev_t)*2+2+DECIMAL_STR_MAX(uint64_t)+1)];
299 CGroupBlockIODeviceWeight *w;
300 CGroupBlockIODeviceBandwidth *b;
303 sprintf(buf, "%lu\n", c->blockio_weight);
304 r = cg_set_attribute("blkio", path, "blkio.weight", buf);
306 log_warning("Failed to set blkio.weight on %s: %s", path, strerror(-r));
308 /* FIXME: no way to reset this list */
309 LIST_FOREACH(device_weights, w, c->blockio_device_weights) {
312 r = lookup_blkio_device(w->path, &dev);
/* NOTE(review): this is the only attribute string in the function
 * formatted without a trailing "\n"; the matching size term above also
 * has no "+1". Confirm whether the omission is deliberate. */
316 sprintf(buf, "%u:%u %lu", major(dev), minor(dev), w->weight);
317 r = cg_set_attribute("blkio", path, "blkio.weight_device", buf);
319 log_error("Failed to set blkio.weight_device on %s: %s", path, strerror(-r));
323 /* FIXME: no way to reset this list */
324 LIST_FOREACH(device_bandwidths, b, c->blockio_device_bandwidths) {
328 r = lookup_blkio_device(b->path, &dev);
/* The entry's read flag picks the throttle attribute to write. */
332 a = b->read ? "blkio.throttle.read_bps_device" : "blkio.throttle.write_bps_device";
334 sprintf(buf, "%u:%u %" PRIu64 "\n", major(dev), minor(dev), b->bandwidth);
335 r = cg_set_attribute("blkio", path, a, buf);
337 log_error("Failed to set %s on %s: %s", a, path, strerror(-r));
341 if (mask & CGROUP_MEMORY) {
/* (uint64_t) -1 is the value cgroup_context_init() assigns by default. */
342 if (c->memory_limit != (uint64_t) -1) {
343 char buf[DECIMAL_STR_MAX(uint64_t) + 1];
345 sprintf(buf, "%" PRIu64 "\n", c->memory_limit);
346 r = cg_set_attribute("memory", path, "memory.limit_in_bytes", buf);
348 r = cg_set_attribute("memory", path, "memory.limit_in_bytes", "-1");
351 log_error("Failed to set memory.limit_in_bytes on %s: %s", path, strerror(-r));
354 if ((mask & CGROUP_DEVICE) && !is_root) {
355 CGroupDeviceAllow *a;
/* With an explicit allow list or a non-auto policy, "a" is written to
 * devices.deny; the devices.allow "a" write below is presumably the else
 * branch (the else line is not visible here). */
357 if (c->device_allow || c->device_policy != CGROUP_AUTO)
358 r = cg_set_attribute("devices", path, "devices.deny", "a");
360 r = cg_set_attribute("devices", path, "devices.allow", "a");
362 log_warning("Failed to reset devices.list on %s: %s", path, strerror(-r));
364 if (c->device_policy == CGROUP_CLOSED ||
365 (c->device_policy == CGROUP_AUTO && c->device_allow)) {
/* NUL-separated (device node, access string) pairs, walked with
 * NULSTR_FOREACH_PAIR below. */
366 static const char auto_devices[] =
367 "/dev/null\0" "rwm\0"
368 "/dev/zero\0" "rwm\0"
369 "/dev/full\0" "rwm\0"
370 "/dev/random\0" "rwm\0"
371 "/dev/urandom\0" "rwm\0"
373 "/dev/pts/ptmx\0" "rw\0"; /* /dev/pts/ptmx may not be duplicated, but accessed */
377 NULSTR_FOREACH_PAIR(x, y, auto_devices)
378 whitelist_device(path, x, y);
380 whitelist_major(path, "pts", 'c', "rw");
381 whitelist_major(path, "kdbus", 'c', "rw");
382 whitelist_major(path, "kdbus/*", 'c', "rw");
385 LIST_FOREACH(device_allow, a, c->device_allow) {
/* Dispatch on the entry's path: a "/dev/" path is a single node, while
 * "block-" and "char-" prefixes name a driver whose majors are resolved
 * by whitelist_major() (the prefix is skipped via +6 / +5). */
401 if (startswith(a->path, "/dev/"))
402 whitelist_device(path, a->path, acc);
403 else if (startswith(a->path, "block-"))
404 whitelist_major(path, a->path + 6, 'b', acc);
405 else if (startswith(a->path, "char-"))
406 whitelist_major(path, a->path + 5, 'c', acc);
408 log_debug("Ignoring device %s while writing cgroup attribute.", a->path);
/* Derive the set of controllers a context needs. A controller bit is set
 * when accounting is requested for it or when one of its settings differs
 * from the value compared against here (cpu_shares 1024, blockio_weight
 * 1000, memory_limit (uint64_t) -1, device_policy CGROUP_AUTO) or a
 * related list is non-empty. The CPU case sets both CGROUP_CPUACCT and
 * CGROUP_CPU. NOTE(review): the return statement is not visible in this
 * view; presumably mask is returned. */
413 CGroupControllerMask cgroup_context_get_mask(CGroupContext *c) {
414 CGroupControllerMask mask = 0;
416 /* Figure out which controllers we need */
418 if (c->cpu_accounting || c->cpu_shares != 1024)
419 mask |= CGROUP_CPUACCT | CGROUP_CPU;
421 if (c->blockio_accounting ||
422 c->blockio_weight != 1000 ||
423 c->blockio_device_weights ||
424 c->blockio_device_bandwidths)
425 mask |= CGROUP_BLKIO;
427 if (c->memory_accounting ||
428 c->memory_limit != (uint64_t) -1)
429 mask |= CGROUP_MEMORY;
431 if (c->device_allow || c->device_policy != CGROUP_AUTO)
432 mask |= CGROUP_DEVICE;
/* Controller mask required by unit u's own cgroup context: looks the
 * context up with unit_get_cgroup_context() and passes it to
 * cgroup_context_get_mask(). NOTE(review): what happens when no context
 * is found is not visible in this view. */
437 CGroupControllerMask unit_get_cgroup_mask(Unit *u) {
440 c = unit_get_cgroup_context(u);
444 return cgroup_context_get_mask(c);
/* Controller mask needed by the units contained in u, cached in
 * u->cgroup_members_mask. A valid cache is returned as is. Otherwise the
 * mask is rebuilt from zero: for slice units, every unit in
 * dependencies[UNIT_BEFORE] is examined and both its own mask and its
 * (recursively computed) members mask are ORed in. The comparison of
 * member->slice against u presumably skips units that are not actually in
 * this slice (the statement after the test is not visible here). The
 * cache is marked valid before returning. */
447 CGroupControllerMask unit_get_members_mask(Unit *u) {
450 if (u->cgroup_members_mask_valid)
451 return u->cgroup_members_mask;
453 u->cgroup_members_mask = 0;
455 if (u->type == UNIT_SLICE) {
459 SET_FOREACH(member, u->dependencies[UNIT_BEFORE], i) {
464 if (UNIT_DEREF(member->slice) != u)
467 u->cgroup_members_mask |=
468 unit_get_cgroup_mask(member) |
469 unit_get_members_mask(member);
473 u->cgroup_members_mask_valid = true;
474 return u->cgroup_members_mask;
/* Controller mask contributed by u's siblings. When u has a slice this is
 * the members mask of that slice; the second assignment (u's own mask ORed
 * with its members mask) is presumably the else branch for units without a
 * slice (the else line is not visible here). Only the CGROUP_CPU,
 * CGROUP_BLKIO and CGROUP_CPUACCT bits are kept in the result. */
477 CGroupControllerMask unit_get_siblings_mask(Unit *u) {
478 CGroupControllerMask m;
482 if (UNIT_ISSET(u->slice))
483 m = unit_get_members_mask(UNIT_DEREF(u->slice));
485 m = unit_get_cgroup_mask(u) | unit_get_members_mask(u);
487 /* Sibling propagation is only relevant for weight-based
488 * controllers, so let's mask out everything else */
489 return m & (CGROUP_CPU|CGROUP_BLKIO|CGROUP_CPUACCT);
/* Controller mask that should be realized for u: the union of its own
 * mask, its members mask and its siblings mask, restricted to the
 * controllers listed in the manager's cgroup_supported mask.
 * NOTE(review): the return statement is not visible in this view. */
492 CGroupControllerMask unit_get_target_mask(Unit *u) {
493 CGroupControllerMask mask;
495 mask = unit_get_cgroup_mask(u) | unit_get_members_mask(u) | unit_get_siblings_mask(u);
496 mask &= u->manager->cgroup_supported;
/* Recomputes u's subtree mask (own mask ORed with members mask), stores it
 * in u->cgroup_subtree_mask and marks it valid. If u has a slice, the
 * slice's members mask is either extended with the new bits or
 * invalidated, as described by the comments in the body, and the function
 * then recurses into the slice.
 * NOTE(review): the early return, the variable receiving the three-line
 * boolean expression and the if/else choosing between the two slice
 * updates are not visible in this view. */
501 /* Recurse from a unit up through its containing slices, propagating
502 * mask bits upward. A unit is also member of itself. */
503 void unit_update_cgroup_members_masks(Unit *u) {
504 CGroupControllerMask m;
509 /* Calculate subtree mask */
510 m = unit_get_cgroup_mask(u) | unit_get_members_mask(u);
512 /* See if anything changed from the previous invocation. If
513 * not, we're done. */
514 if (u->cgroup_subtree_mask_valid && m == u->cgroup_subtree_mask)
518 u->cgroup_subtree_mask_valid &&
519 ((m & ~u->cgroup_subtree_mask) != 0) &&
520 ((~m & u->cgroup_subtree_mask) == 0);
522 u->cgroup_subtree_mask = m;
523 u->cgroup_subtree_mask_valid = true;
525 if (UNIT_ISSET(u->slice)) {
526 Unit *s = UNIT_DEREF(u->slice);
529 /* There's more set now than before. We
530 * propagate the new mask to the parent's mask
531 * (not caring if it actually was valid or
534 s->cgroup_members_mask |= m;
537 /* There's less set now than before (or we
538 * don't know), we need to recalculate
539 * everything, so let's invalidate the
540 * parent's members mask */
542 s->cgroup_members_mask_valid = false;
544 /* And now make sure that this change also hits our
546 unit_update_cgroup_members_masks(s);
/* Callback handed to cg_migrate_everywhere() by unit_create_cgroups().
 * Returns the cgroup_path of a unit that has a path, is realized and whose
 * realized mask contains every bit of mask; otherwise moves on to the
 * unit's slice. NOTE(review): the derivation of u from userdata, the
 * enclosing loop and the fallback return value are not visible in this
 * view. */
550 static const char *migrate_callback(CGroupControllerMask mask, void *userdata) {
557 if (u->cgroup_path &&
558 u->cgroup_realized &&
559 (u->cgroup_realized_mask & mask) == mask)
560 return u->cgroup_path;
562 u = UNIT_DEREF(u->slice);
/* Create the cgroup for unit u with the controllers in mask.
 * Visible steps: compute the default cgroup path, register it in the
 * manager's cgroup_unit hashmap and store it in u->cgroup_path; create the
 * group with cg_create_everywhere(); record the realized state and mask;
 * finally call cg_migrate_everywhere() with migrate_callback.
 * NOTE(review): the conditions guarding the path setup, the error returns
 * and the hand-over of the _cleanup_free_ path pointer are not visible in
 * this view.
 * NOTE(review): the migrate warning text reads "from to %s" -- the message
 * appears to be missing a word or to contain a stray "from"; it is left
 * untouched here because it is a runtime string. */
568 static int unit_create_cgroups(Unit *u, CGroupControllerMask mask) {
569 _cleanup_free_ char *path = NULL;
574 path = unit_default_cgroup_path(u);
578 r = hashmap_put(u->manager->cgroup_unit, path, u);
580 log_error(r == -EEXIST ? "cgroup %s exists already: %s" : "hashmap_put failed for %s: %s", path, strerror(-r));
584 u->cgroup_path = path;
588 /* First, create our own group */
589 r = cg_create_everywhere(u->manager->cgroup_supported, mask, u->cgroup_path);
591 log_error("Failed to create cgroup %s: %s", u->cgroup_path, strerror(-r));
595 /* Keep track that this is now realized */
596 u->cgroup_realized = true;
597 u->cgroup_realized_mask = mask;
599 /* Then, possibly move things over */
600 r = cg_migrate_everywhere(u->manager->cgroup_supported, u->cgroup_path, u->cgroup_path, migrate_callback, u);
602 log_warning("Failed to migrate cgroup from to %s: %s", u->cgroup_path, strerror(-r));
/* True when u's cgroup is marked realized and its realized mask is exactly
 * equal to mask (not merely a superset). */
607 static bool unit_has_mask_realized(Unit *u, CGroupControllerMask mask) {
610 return u->cgroup_realized && u->cgroup_realized_mask == mask;
/* Order of operations visible below: drop u from the manager's cgroup
 * queue if it is queued, compute the target mask, stop if that mask is
 * already realized, realize the containing slice first (recursively),
 * create u's own cgroups, then apply the context attributes.
 * NOTE(review): the return statements and error checks after the
 * recursive call and after unit_create_cgroups() are not visible in this
 * view. */
613 /* Check if necessary controllers and attributes for a unit are in place.
616 * If not, create paths, move processes over, and set attributes.
618 * Returns 0 on success and < 0 on failure. */
619 static int unit_realize_cgroup_now(Unit *u) {
620 CGroupControllerMask mask;
625 if (u->in_cgroup_queue) {
626 LIST_REMOVE(cgroup_queue, u->manager->cgroup_queue, u);
627 u->in_cgroup_queue = false;
630 mask = unit_get_target_mask(u);
632 if (unit_has_mask_realized(u, mask))
635 /* First, realize parents */
636 if (UNIT_ISSET(u->slice)) {
637 r = unit_realize_cgroup_now(UNIT_DEREF(u->slice));
642 /* And then do the real work */
643 r = unit_create_cgroups(u, mask);
647 /* Finally, apply the necessary attributes. */
648 cgroup_context_apply(unit_get_cgroup_context(u), mask, u->cgroup_path);
/* Put u at the front of the manager's cgroup queue and flag it as queued.
 * The in_cgroup_queue test guards against queuing a unit twice (the
 * statement following the test is not visible here, presumably a
 * return). */
653 static void unit_add_to_cgroup_queue(Unit *u) {
655 if (u->in_cgroup_queue)
658 LIST_PREPEND(cgroup_queue, u->manager->cgroup_queue, u);
659 u->in_cgroup_queue = true;
/* Process the manager's cgroup queue until it is empty, realizing the
 * cgroups of the unit at the head each time. The loop makes progress
 * because unit_realize_cgroup_now() unlinks a queued unit from the queue
 * before doing anything else. Failures are logged as warnings.
 * NOTE(review): the unsigned return value is not visible in this view;
 * presumably a count of processed units -- confirm. */
662 unsigned manager_dispatch_cgroup_queue(Manager *m) {
667 while ((i = m->cgroup_queue)) {
668 assert(i->in_cgroup_queue);
670 r = unit_realize_cgroup_now(i);
672 log_warning("Failed to realize cgroups for queued unit %s: %s", i->id, strerror(-r));
/* Walks from u up through its chain of slices and, for each slice,
 * considers every unit in the slice's dependencies[UNIT_BEFORE] set. The
 * three tests in the loop body are explained by the comments above them;
 * units that get past them are added to the cgroup queue.
 * NOTE(review): the statements following each test (presumably continue)
 * and the step that advances u to its slice are not visible in this
 * view. */
680 static void unit_queue_siblings(Unit *u) {
683 /* This adds the siblings of the specified unit and the
684 * siblings of all parent units to the cgroup queue. (But
685 * neither the specified unit itself nor the parents.) */
687 while ((slice = UNIT_DEREF(u->slice))) {
691 SET_FOREACH(m, slice->dependencies[UNIT_BEFORE], i) {
695 /* Skip units that have a dependency on the slice
696 * but aren't actually in it. */
697 if (UNIT_DEREF(m->slice) != slice)
700 /* No point in doing cgroup application for units
701 * without active processes. */
702 if (UNIT_IS_INACTIVE_OR_FAILED(unit_active_state(m)))
705 /* If the unit doesn't need any new controllers
706 * and has current ones realized, it doesn't need
708 if (unit_has_mask_realized(m, unit_get_target_mask(m)))
711 unit_add_to_cgroup_queue(m);
/* Public entry point for realizing a unit's cgroup: looks up the unit's
 * cgroup context, queues the unit's siblings for deferred processing via
 * unit_queue_siblings(), then realizes u itself synchronously and returns
 * the result of unit_realize_cgroup_now().
 * NOTE(review): the handling of a missing cgroup context is not visible
 * in this view. */
718 int unit_realize_cgroup(Unit *u) {
723 c = unit_get_cgroup_context(u);
727 /* So, here's the deal: when realizing the cgroups for this
728 * unit, we need to first create all parents, but there's more
729 * actually: for the weight-based controllers we also need to
730 * make sure that all our siblings (i.e. units that are in the
731 * same slice as we are) have cgroups, too. Otherwise, things
732 * would become very uneven as each of their processes would
733 * get as much resources as all our group together. This call
734 * will synchronously create the parent cgroups, but will
735 * defer work on the siblings to the next event loop
738 /* Add all sibling slices to the cgroup queue. */
739 unit_queue_siblings(u);
741 /* And realize this one now (and apply the values) */
742 return unit_realize_cgroup_now(u);
/* Tear down the cgroup of unit u: trim it in the supported hierarchies
 * (the last argument to cg_trim_everywhere() is false only for the unit
 * named SPECIAL_ROOT_SLICE), drop the path from the manager's cgroup_unit
 * hashmap, free the path and reset the realized flag and mask. A trim
 * failure is only logged at debug level.
 * NOTE(review): any early return for a unit without a cgroup path is not
 * visible in this view. */
745 void unit_destroy_cgroup(Unit *u) {
753 r = cg_trim_everywhere(u->manager->cgroup_supported, u->cgroup_path, !unit_has_name(u, SPECIAL_ROOT_SLICE));
755 log_debug("Failed to destroy cgroup %s: %s", u->cgroup_path, strerror(-r));
757 hashmap_remove(u->manager->cgroup_unit, u->cgroup_path);
759 free(u->cgroup_path);
760 u->cgroup_path = NULL;
761 u->cgroup_realized = false;
762 u->cgroup_realized_mask = 0;
/* Scan the processes in u's cgroup (systemd hierarchy) for a main process
 * candidate. Processes whose parent could be determined and is not mypid
 * are ignored; per the comment below, finding more than one remaining
 * candidate means the main process cannot be identified.
 * NOTE(review): the initialisation of mypid (presumably getpid()), the
 * bookkeeping of pid and the return statements are not visible in this
 * view. */
766 pid_t unit_search_main_pid(Unit *u) {
767 _cleanup_fclose_ FILE *f = NULL;
768 pid_t pid = 0, npid, mypid;
775 if (cg_enumerate_processes(SYSTEMD_CGROUP_CONTROLLER, u->cgroup_path, &f) < 0)
779 while (cg_read_pid(f, &npid) > 0) {
785 /* Ignore processes that aren't our kids */
786 if (get_parent_of_pid(npid, &ppid) >= 0 && ppid != mypid)
790 /* Dang, there's more than one daemonized PID
791 in this group, so we don't know what process
792 is the main process. */
/* Set up the manager's view of the cgroup tree, following the numbered
 * steps in the body: check that /sys/fs/cgroup/systemd is a mount point,
 * determine the cgroup the manager runs in (m->cgroup_root, with legacy
 * suffixes handled for system instances and "/" stored as the empty
 * string), resolve the file system path of that cgroup, install the
 * release agent for system instances, create and join the root cgroup,
 * open a directory fd on the hierarchy to pin it, record the supported
 * controllers and finally write "1" to memory.use_hierarchy on "/".
 * NOTE(review): the error returns, the handling of the endswith() results
 * and step 2's heading are not visible in this view. The result of the
 * final cg_set_attribute() call is not checked in the visible code. */
803 int manager_setup_cgroup(Manager *m) {
804 _cleanup_free_ char *path = NULL;
810 /* 0. Be nice to Ingo Molnar #628004 */
811 if (path_is_mount_point("/sys/fs/cgroup/systemd", false) <= 0) {
812 log_warning("No control group support available, not creating root group.");
816 /* 1. Determine hierarchy */
817 free(m->cgroup_root);
818 m->cgroup_root = NULL;
820 r = cg_pid_get_path(SYSTEMD_CGROUP_CONTROLLER, 0, &m->cgroup_root);
822 log_error("Cannot determine cgroup we are running in: %s", strerror(-r));
826 /* LEGACY: Already in /system.slice? If so, let's cut this
827 * off. This is to support live upgrades from older systemd
828 * versions where PID 1 was moved there. */
829 if (m->running_as == SYSTEMD_SYSTEM) {
830 e = endswith(m->cgroup_root, "/" SPECIAL_SYSTEM_SLICE);
832 e = endswith(m->cgroup_root, "/system");
837 /* And make sure to store away the root value without trailing
838 * slash, even for the root dir, so that we can easily prepend
840 if (streq(m->cgroup_root, "/"))
841 m->cgroup_root[0] = 0;
844 r = cg_get_path(SYSTEMD_CGROUP_CONTROLLER, m->cgroup_root, NULL, &path);
846 log_error("Cannot find cgroup mount point: %s", strerror(-r));
850 log_debug("Using cgroup controller " SYSTEMD_CGROUP_CONTROLLER ". File system hierarchy is at %s.", path);
852 /* 3. Install agent */
853 if (m->running_as == SYSTEMD_SYSTEM) {
854 r = cg_install_release_agent(SYSTEMD_CGROUP_CONTROLLER, SYSTEMD_CGROUP_AGENT_PATH);
856 log_warning("Failed to install release agent, ignoring: %s", strerror(-r));
858 log_debug("Installed release agent.");
860 log_debug("Release agent already installed.");
863 /* 4. Make sure we are in the root cgroup */
864 r = cg_create_and_attach(SYSTEMD_CGROUP_CONTROLLER, m->cgroup_root, 0);
866 log_error("Failed to create root cgroup hierarchy: %s", strerror(-r));
870 /* 5. And pin it, so that it cannot be unmounted */
871 safe_close(m->pin_cgroupfs_fd);
873 m->pin_cgroupfs_fd = open(path, O_RDONLY|O_CLOEXEC|O_DIRECTORY|O_NOCTTY|O_NONBLOCK);
875 log_error("Failed to open pin file: %m");
879 /* 6. Figure out which controllers are supported */
880 m->cgroup_supported = cg_mask_supported();
882 /* 7. Always enable hierarchical support if it exists... */
883 cg_set_attribute("memory", "/", "memory.use_hierarchy", "1");
/* Undo manager_setup_cgroup(): when delete is set and a cgroup root is
 * known, trim the root cgroup (third argument false); then close the
 * pinning fd, storing the value returned by safe_close() back into the
 * field, and free and clear m->cgroup_root. */
888 void manager_shutdown_cgroup(Manager *m, bool delete) {
891 /* We can't really delete the group, since we are in it. But
893 if (delete && m->cgroup_root)
894 cg_trim(SYSTEMD_CGROUP_CONTROLLER, m->cgroup_root, false);
896 m->pin_cgroupfs_fd = safe_close(m->pin_cgroupfs_fd);
898 free(m->cgroup_root);
899 m->cgroup_root = NULL;
/* Look up the unit registered for a cgroup path in the manager's
 * cgroup_unit hashmap. The first lookup uses cgroup itself; the second
 * uses a derived string p. NOTE(review): how p is derived from cgroup
 * (presumably by walking up parent paths) and the return statements are
 * not visible in this view -- confirm. */
902 Unit* manager_get_unit_by_cgroup(Manager *m, const char *cgroup) {
909 u = hashmap_get(m->cgroup_unit, cgroup);
923 u = hashmap_get(m->cgroup_unit, p);
/* Find the unit a process belongs to: resolve the process's cgroup path in
 * the systemd hierarchy with cg_pid_get_path() and pass it to
 * manager_get_unit_by_cgroup(). The path string is released automatically
 * through _cleanup_free_. NOTE(review): the handling of a failed path
 * lookup is not visible in this view. */
929 Unit *manager_get_unit_by_pid(Manager *m, pid_t pid) {
930 _cleanup_free_ char *cgroup = NULL;
938 r = cg_pid_get_path(SYSTEMD_CGROUP_CONTROLLER, pid, &cgroup);
942 return manager_get_unit_by_cgroup(m, cgroup);
/* React to a notification for the given cgroup path: find the owning unit,
 * query cg_is_empty_recursive() on the unit's cgroup, invoke the unit
 * type's notify_cgroup_empty hook when the vtable provides one, and add
 * the unit to the GC queue.
 * NOTE(review): the checks on u and on r (e.g. whether the hook only runs
 * when the cgroup is really empty) and the return values are not visible
 * in this view. */
945 int manager_notify_cgroup_empty(Manager *m, const char *cgroup) {
952 u = manager_get_unit_by_cgroup(m, cgroup);
954 r = cg_is_empty_recursive(SYSTEMD_CGROUP_CONTROLLER, u->cgroup_path, true);
956 if (UNIT_VTABLE(u)->notify_cgroup_empty)
957 UNIT_VTABLE(u)->notify_cgroup_empty(u);
959 unit_add_to_gc_queue(u);
/* String names for the CGroupDevicePolicy values, indexed by enum value.
 * The macro invocation after the table generates the lookup helpers for
 * it; cgroup_device_policy_to_string() is the one used by
 * cgroup_context_dump(). */
966 static const char* const cgroup_device_policy_table[_CGROUP_DEVICE_POLICY_MAX] = {
967 [CGROUP_AUTO] = "auto",
968 [CGROUP_CLOSED] = "closed",
969 [CGROUP_STRICT] = "strict",
972 DEFINE_STRING_TABLE_LOOKUP(cgroup_device_policy, CGroupDevicePolicy);