1 /*-*- Mode: C; c-basic-offset: 8; indent-tabs-mode: nil -*-*/
/***
  This file is part of systemd.

  Copyright 2013 Lennart Poettering

  systemd is free software; you can redistribute it and/or modify it
  under the terms of the GNU Lesser General Public License as published by
  the Free Software Foundation; either version 2.1 of the License, or
  (at your option) any later version.

  systemd is distributed in the hope that it will be useful, but
  WITHOUT ANY WARRANTY; without even the implied warranty of
  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
  Lesser General Public License for more details.

  You should have received a copy of the GNU Lesser General Public License
  along with systemd; If not, see <http://www.gnu.org/licenses/>.
***/
#include <errno.h>
#include <fcntl.h>
#include <string.h>
#include <sys/stat.h>
#include <unistd.h>

#include "path-util.h"
#include "cgroup-util.h"
29 void cgroup_context_init(CGroupContext *c) {
32 /* Initialize everything to the kernel defaults, assuming the
33 * structure is preinitialized to 0 */
36 c->memory_limit = c->memory_soft_limit = (uint64_t) -1;
37 c->blockio_weight = 1000;
40 void cgroup_context_free_device_allow(CGroupContext *c, CGroupDeviceAllow *a) {
44 LIST_REMOVE(CGroupDeviceAllow, device_allow, c->device_allow, a);
49 void cgroup_context_free_blockio_device_weight(CGroupContext *c, CGroupBlockIODeviceWeight *w) {
53 LIST_REMOVE(CGroupBlockIODeviceWeight, device_weights, c->blockio_device_weights, w);
58 void cgroup_context_free_blockio_device_bandwidth(CGroupContext *c, CGroupBlockIODeviceBandwidth *b) {
62 LIST_REMOVE(CGroupBlockIODeviceBandwidth, device_bandwidths, c->blockio_device_bandwidths, b);
67 void cgroup_context_done(CGroupContext *c) {
70 while (c->blockio_device_weights)
71 cgroup_context_free_blockio_device_weight(c, c->blockio_device_weights);
73 while (c->blockio_device_bandwidths)
74 cgroup_context_free_blockio_device_bandwidth(c, c->blockio_device_bandwidths);
76 while (c->device_allow)
77 cgroup_context_free_device_allow(c, c->device_allow);
80 void cgroup_context_dump(CGroupContext *c, FILE* f, const char *prefix) {
81 CGroupBlockIODeviceBandwidth *b;
82 CGroupBlockIODeviceWeight *w;
88 prefix = strempty(prefix);
91 "%sCPUAccounting=%s\n"
92 "%sBlockIOAccounting=%s\n"
93 "%sMemoryAccounting=%s\n"
95 "%sBlockIOWeight%lu\n"
96 "%sMemoryLimit=%" PRIu64 "\n"
97 "%sMemorySoftLimit=%" PRIu64 "\n"
98 "%sDevicePolicy=%s\n",
99 prefix, yes_no(c->cpu_accounting),
100 prefix, yes_no(c->blockio_accounting),
101 prefix, yes_no(c->memory_accounting),
102 prefix, c->cpu_shares,
103 prefix, c->blockio_weight,
104 prefix, c->memory_limit,
105 prefix, c->memory_soft_limit,
106 prefix, cgroup_device_policy_to_string(c->device_policy));
108 LIST_FOREACH(device_allow, a, c->device_allow)
110 "%sDeviceAllow=%s %s%s%s\n",
113 a->r ? "r" : "", a->w ? "w" : "", a->m ? "m" : "");
115 LIST_FOREACH(device_weights, w, c->blockio_device_weights)
117 "%sBlockIODeviceWeight=%s %lu",
122 LIST_FOREACH(device_bandwidths, b, c->blockio_device_bandwidths) {
123 char buf[FORMAT_BYTES_MAX];
128 b->read ? "BlockIOReadBandwidth" : "BlockIOWriteBandwidth",
130 format_bytes(buf, sizeof(buf), b->bandwidth));
/* Resolves path p to the block device number to use for blkio settings.
 *
 * If p is a block device node, its own device number is stored in *dev.
 * Otherwise the device number of the file system holding p is used, and
 * block_get_whole_disk() is given the chance to replace it in place.
 *
 * Returns 0 on success, a negative errno value if p cannot be stat()ed, or
 * -ENODEV if p is not a block device and lives on a file system without a
 * backing device major number. */
static int lookup_blkio_device(const char *p, dev_t *dev) {
        struct stat st;
        int r;

        assert(p);
        assert(dev);

        if (stat(p, &st) < 0) {
                /* Capture errno first: logging may overwrite it */
                r = -errno;
                log_warning("Couldn't stat device %s: %m", p);
                return r;
        }

        if (S_ISBLK(st.st_mode))
                *dev = st.st_rdev;
        else if (major(st.st_dev) != 0) {
                /* If this is not a device node then find the block
                 * device this file is stored on */
                *dev = st.st_dev;

                /* If this is a partition, try to get the originating
                 * block device */
                block_get_whole_disk(*dev, dev);
        } else {
                log_warning("%s is not a block device and file system block device cannot be determined or is not local.", p);
                return -ENODEV;
        }

        return 0;
}
165 static int whitelist_device(const char *path, const char *node, const char *acc) {
166 char buf[2+DECIMAL_STR_MAX(dev_t)*2+2+4];
173 if (stat(node, &st) < 0) {
174 log_warning("Couldn't stat device %s", node);
178 if (!S_ISCHR(st.st_mode) && !S_ISBLK(st.st_mode)) {
179 log_warning("%s is not a device.", node);
185 S_ISCHR(st.st_mode) ? 'c' : 'b',
186 major(st.st_rdev), minor(st.st_rdev),
189 r = cg_set_attribute("devices", path, "devices.allow", buf);
191 log_warning("Failed to set devices.allow on %s: %s", path, strerror(-r));
196 void cgroup_context_apply(CGroupContext *c, CGroupControllerMask mask, const char *path) {
205 if (mask & CGROUP_CPU) {
206 char buf[DECIMAL_STR_MAX(unsigned long) + 1];
208 sprintf(buf, "%lu\n", c->cpu_shares);
209 r = cg_set_attribute("cpu", path, "cpu.shares", buf);
211 log_warning("Failed to set cpu.shares on %s: %s", path, strerror(-r));
214 if (mask & CGROUP_BLKIO) {
215 char buf[MAX3(DECIMAL_STR_MAX(unsigned long)+1,
216 DECIMAL_STR_MAX(dev_t)*2+2+DECIMAL_STR_MAX(unsigned long)*1,
217 DECIMAL_STR_MAX(dev_t)*2+2+DECIMAL_STR_MAX(uint64_t)+1)];
218 CGroupBlockIODeviceWeight *w;
219 CGroupBlockIODeviceBandwidth *b;
221 sprintf(buf, "%lu\n", c->blockio_weight);
222 r = cg_set_attribute("blkio", path, "blkio.weight", buf);
224 log_warning("Failed to set blkio.weight on %s: %s", path, strerror(-r));
226 /* FIXME: no way to reset this list */
227 LIST_FOREACH(device_weights, w, c->blockio_device_weights) {
230 r = lookup_blkio_device(w->path, &dev);
234 sprintf(buf, "%u:%u %lu", major(dev), minor(dev), w->weight);
235 r = cg_set_attribute("blkio", path, "blkio.weight_device", buf);
237 log_error("Failed to set blkio.weight_device on %s: %s", path, strerror(-r));
240 /* FIXME: no way to reset this list */
241 LIST_FOREACH(device_bandwidths, b, c->blockio_device_bandwidths) {
245 r = lookup_blkio_device(b->path, &dev);
249 a = b->read ? "blkio.throttle.read_bps_device" : "blkio.throttle.write_bps_device";
251 sprintf(buf, "%u:%u %" PRIu64 "\n", major(dev), minor(dev), b->bandwidth);
252 r = cg_set_attribute("blkio", path, a, buf);
254 log_error("Failed to set %s on %s: %s", a, path, strerror(-r));
258 if (mask & CGROUP_MEMORY) {
259 char buf[DECIMAL_STR_MAX(uint64_t) + 1];
260 if (c->memory_limit != (uint64_t) -1) {
261 sprintf(buf, "%" PRIu64 "\n", c->memory_limit);
262 r = cg_set_attribute("memory", path, "memory.limit_in_bytes", buf);
264 r = cg_set_attribute("memory", path, "memory.limit_in_bytes", "-1");
267 log_error("Failed to set memory.limit_in_bytes on %s: %s", path, strerror(-r));
269 if (c->memory_soft_limit != (uint64_t) -1) {
270 sprintf(buf, "%" PRIu64 "\n", c->memory_soft_limit);
271 r = cg_set_attribute("memory", path, "memory.soft_limit_in_bytes", buf);
273 r = cg_set_attribute("memory", path, "memory.soft_limit_in_bytes", "-1");
276 log_error("Failed to set memory.soft_limit_in_bytes on %s: %s", path, strerror(-r));
279 if (mask & CGROUP_DEVICE) {
280 CGroupDeviceAllow *a;
282 if (c->device_allow || c->device_policy != CGROUP_AUTO)
283 r = cg_set_attribute("devices", path, "devices.deny", "a");
285 r = cg_set_attribute("devices", path, "devices.allow", "a");
287 log_error("Failed to reset devices.list on %s: %s", path, strerror(-r));
289 if (c->device_policy == CGROUP_CLOSED ||
290 (c->device_policy == CGROUP_AUTO && c->device_allow)) {
291 static const char auto_devices[] =
295 "/dev/random\0" "rw\0"
296 "/dev/urandom\0" "rw\0";
300 NULSTR_FOREACH_PAIR(x, y, auto_devices)
301 whitelist_device(path, x, y);
304 LIST_FOREACH(device_allow, a, c->device_allow) {
319 whitelist_device(path, a->path, acc);
324 CGroupControllerMask cgroup_context_get_mask(CGroupContext *c) {
325 CGroupControllerMask mask = 0;
327 /* Figure out which controllers we need */
329 if (c->cpu_accounting || c->cpu_shares != 1024)
330 mask |= CGROUP_CPUACCT | CGROUP_CPU;
332 if (c->blockio_accounting ||
333 c->blockio_weight != 1000 ||
334 c->blockio_device_weights ||
335 c->blockio_device_bandwidths)
336 mask |= CGROUP_BLKIO;
338 if (c->memory_accounting ||
339 c->memory_limit != (uint64_t) -1 ||
340 c->memory_soft_limit != (uint64_t) -1)
341 mask |= CGROUP_MEMORY;
343 if (c->device_allow || c->device_policy != CGROUP_AUTO)
344 mask |= CGROUP_DEVICE;
349 static CGroupControllerMask unit_get_cgroup_mask(Unit *u) {
352 c = unit_get_cgroup_context(u);
356 return cgroup_context_get_mask(c);
359 static CGroupControllerMask unit_get_members_mask(Unit *u) {
360 CGroupControllerMask mask = 0;
366 SET_FOREACH(m, u->dependencies[UNIT_BEFORE], i) {
368 if (UNIT_DEREF(m->slice) != u)
371 mask |= unit_get_cgroup_mask(m) | unit_get_members_mask(m);
377 static CGroupControllerMask unit_get_siblings_mask(Unit *u) {
380 if (!UNIT_ISSET(u->slice))
383 /* Sibling propagation is only relevant for weight-based
384 * controllers, so let's mask out everything else */
385 return unit_get_members_mask(UNIT_DEREF(u->slice)) &
386 (CGROUP_CPU|CGROUP_BLKIO|CGROUP_CPUACCT);
389 static int unit_create_cgroups(Unit *u, CGroupControllerMask mask) {
392 bool is_in_hash = false;
396 path = unit_default_cgroup_path(u);
400 r = hashmap_put(u->manager->cgroup_unit, path, u);
405 log_error("cgroup %s exists already: %s", path, strerror(-r));
410 /* First, create our own group */
411 r = cg_create_with_mask(mask, path);
413 log_error("Failed to create cgroup %s: %s", path, strerror(-r));
415 /* Then, possibly move things over */
416 if (u->cgroup_path && !streq(path, u->cgroup_path)) {
417 r = cg_migrate_with_mask(mask, u->cgroup_path, path);
419 log_error("Failed to migrate cgroup %s: %s", path, strerror(-r));
423 /* And remember the new data */
424 free(u->cgroup_path);
425 u->cgroup_path = path;
428 u->cgroup_realized = true;
429 u->cgroup_mask = mask;
434 static int unit_realize_cgroup_now(Unit *u) {
435 CGroupControllerMask mask;
439 if (u->in_cgroup_queue) {
440 LIST_REMOVE(Unit, cgroup_queue, u->manager->cgroup_queue, u);
441 u->in_cgroup_queue = false;
444 mask = unit_get_cgroup_mask(u) | unit_get_members_mask(u) | unit_get_siblings_mask(u);
445 mask &= u->manager->cgroup_supported;
447 if (u->cgroup_realized &&
448 u->cgroup_mask == mask)
451 /* First, realize parents */
452 if (UNIT_ISSET(u->slice))
453 unit_realize_cgroup_now(UNIT_DEREF(u->slice));
455 /* And then do the real work */
456 return unit_create_cgroups(u, mask);
459 static void unit_add_to_cgroup_queue(Unit *u) {
461 if (u->in_cgroup_queue)
464 LIST_PREPEND(Unit, cgroup_queue, u->manager->cgroup_queue, u);
465 u->in_cgroup_queue = true;
468 unsigned manager_dispatch_cgroup_queue(Manager *m) {
472 while ((i = m->cgroup_queue)) {
473 assert(i->in_cgroup_queue);
475 if (unit_realize_cgroup_now(i) >= 0)
476 cgroup_context_apply(unit_get_cgroup_context(i), i->cgroup_mask, i->cgroup_path);
484 static void unit_queue_siblings(Unit *u) {
487 /* This adds the siblings of the specified unit and the
488 * siblings of all parent units to the cgroup queue. (But
489 * neither the specified unit itself nor the parents.) */
491 while ((slice = UNIT_DEREF(u->slice))) {
495 SET_FOREACH(m, slice->dependencies[UNIT_BEFORE], i) {
499 if (UNIT_DEREF(m->slice) != slice)
502 unit_add_to_cgroup_queue(m);
509 int unit_realize_cgroup(Unit *u) {
515 c = unit_get_cgroup_context(u);
519 /* So, here's the deal: when realizing the cgroups for this
520 * unit, we need to first create all parents, but there's more
521 * actually: for the weight-based controllers we also need to
522 * make sure that all our siblings (i.e. units that are in the
523 * same slice as we are) have cgroup too. Otherwise things
524 * would become very uneven as each of their processes would
525 * get as much resources as all our group together. This call
526 * will synchronously create the parent cgroups, but will
527 * defer work on the siblings to the next event loop
530 /* Add all sibling slices to the cgroup queue. */
531 unit_queue_siblings(u);
533 /* And realize this one now */
534 r = unit_realize_cgroup_now(u);
536 /* And apply the values */
538 cgroup_context_apply(c, u->cgroup_mask, u->cgroup_path);
543 void unit_destroy_cgroup(Unit *u) {
551 r = cg_trim_with_mask(u->cgroup_mask, u->cgroup_path, !unit_has_name(u, SPECIAL_ROOT_SLICE));
553 log_debug("Failed to destroy cgroup %s: %s", u->cgroup_path, strerror(-r));
555 hashmap_remove(u->manager->cgroup_unit, u->cgroup_path);
557 free(u->cgroup_path);
558 u->cgroup_path = NULL;
559 u->cgroup_realized = false;
564 pid_t unit_search_main_pid(Unit *u) {
565 _cleanup_fclose_ FILE *f = NULL;
566 pid_t pid = 0, npid, mypid;
573 if (cg_enumerate_processes(SYSTEMD_CGROUP_CONTROLLER, u->cgroup_path, &f) < 0)
577 while (cg_read_pid(f, &npid) > 0) {
583 /* Ignore processes that aren't our kids */
584 if (get_parent_of_pid(npid, &ppid) >= 0 && ppid != mypid)
588 /* Dang, there's more than one daemonized PID
589 in this group, so we don't know what process
590 is the main process. */
601 int manager_setup_cgroup(Manager *m) {
602 _cleanup_free_ char *path = NULL;
608 /* 0. Be nice to Ingo Molnar #628004 */
609 if (path_is_mount_point("/sys/fs/cgroup/systemd", false) <= 0) {
610 log_warning("No control group support available, not creating root group.");
614 /* 1. Determine hierarchy */
615 free(m->cgroup_root);
616 m->cgroup_root = NULL;
618 r = cg_pid_get_path(SYSTEMD_CGROUP_CONTROLLER, 0, &m->cgroup_root);
620 log_error("Cannot determine cgroup we are running in: %s", strerror(-r));
624 /* Already in /system.slice? If so, let's cut this off again */
625 if (m->running_as == SYSTEMD_SYSTEM) {
626 e = endswith(m->cgroup_root, "/" SPECIAL_SYSTEM_SLICE);
631 /* And make sure to store away the root value without trailing
632 * slash, even for the root dir, so that we can easily prepend
634 if (streq(m->cgroup_root, "/"))
635 m->cgroup_root[0] = 0;
638 r = cg_get_path(SYSTEMD_CGROUP_CONTROLLER, m->cgroup_root, NULL, &path);
640 log_error("Cannot find cgroup mount point: %s", strerror(-r));
644 log_debug("Using cgroup controller " SYSTEMD_CGROUP_CONTROLLER ". File system hierarchy is at %s.", path);
646 /* 3. Install agent */
647 if (m->running_as == SYSTEMD_SYSTEM) {
648 r = cg_install_release_agent(SYSTEMD_CGROUP_CONTROLLER, SYSTEMD_CGROUP_AGENT_PATH);
650 log_warning("Failed to install release agent, ignoring: %s", strerror(-r));
652 log_debug("Installed release agent.");
654 log_debug("Release agent already installed.");
657 /* 4. Realize the system slice and put us in there */
658 if (m->running_as == SYSTEMD_SYSTEM) {
659 a = strappenda(m->cgroup_root, "/" SPECIAL_SYSTEM_SLICE);
660 r = cg_create_and_attach(SYSTEMD_CGROUP_CONTROLLER, a, 0);
662 r = cg_create_and_attach(SYSTEMD_CGROUP_CONTROLLER, m->cgroup_root, 0);
664 log_error("Failed to create root cgroup hierarchy: %s", strerror(-r));
668 /* 5. And pin it, so that it cannot be unmounted */
669 if (m->pin_cgroupfs_fd >= 0)
670 close_nointr_nofail(m->pin_cgroupfs_fd);
672 m->pin_cgroupfs_fd = open(path, O_RDONLY|O_CLOEXEC|O_DIRECTORY|O_NOCTTY|O_NONBLOCK);
674 log_error("Failed to open pin file: %m");
678 /* 6. Figure out which controllers are supported */
679 m->cgroup_supported = cg_mask_supported();
684 void manager_shutdown_cgroup(Manager *m, bool delete) {
687 /* We can't really delete the group, since we are in it. But
689 if (delete && m->cgroup_root)
690 cg_trim(SYSTEMD_CGROUP_CONTROLLER, m->cgroup_root, false);
692 if (m->pin_cgroupfs_fd >= 0) {
693 close_nointr_nofail(m->pin_cgroupfs_fd);
694 m->pin_cgroupfs_fd = -1;
697 free(m->cgroup_root);
698 m->cgroup_root = NULL;
701 Unit* manager_get_unit_by_cgroup(Manager *m, const char *cgroup) {
708 u = hashmap_get(m->cgroup_unit, cgroup);
722 u = hashmap_get(m->cgroup_unit, p);
728 Unit *manager_get_unit_by_pid(Manager *m, pid_t pid) {
729 _cleanup_free_ char *cgroup = NULL;
737 r = cg_pid_get_path(SYSTEMD_CGROUP_CONTROLLER, pid, &cgroup);
741 return manager_get_unit_by_cgroup(m, cgroup);
744 int manager_notify_cgroup_empty(Manager *m, const char *cgroup) {
751 u = manager_get_unit_by_cgroup(m, cgroup);
753 r = cg_is_empty_recursive(SYSTEMD_CGROUP_CONTROLLER, u->cgroup_path, true);
755 if (UNIT_VTABLE(u)->notify_cgroup_empty)
756 UNIT_VTABLE(u)->notify_cgroup_empty(u);
758 unit_add_to_gc_queue(u);
765 static const char* const cgroup_device_policy_table[_CGROUP_DEVICE_POLICY_MAX] = {
766 [CGROUP_AUTO] = "auto",
767 [CGROUP_CLOSED] = "closed",
768 [CGROUP_STRICT] = "strict",
771 DEFINE_STRING_TABLE_LOOKUP(cgroup_device_policy, CGroupDevicePolicy);