1 /*-*- Mode: C; c-basic-offset: 8; indent-tabs-mode: nil -*-*/
4 This file is part of systemd.
6 Copyright 2013 Lennart Poettering
8 systemd is free software; you can redistribute it and/or modify it
9 under the terms of the GNU Lesser General Public License as published by
10 the Free Software Foundation; either version 2.1 of the License, or
11 (at your option) any later version.
13 systemd is distributed in the hope that it will be useful, but
14 WITHOUT ANY WARRANTY; without even the implied warranty of
15 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
16 Lesser General Public License for more details.
18 You should have received a copy of the GNU Lesser General Public License
19 along with systemd; If not, see <http://www.gnu.org/licenses/>.
24 #include "path-util.h"
26 #include "cgroup-util.h"
/* Set the resource-control fields of *c to their default values.
 *
 * The assignments visible here set memory_limit to (uint64_t) -1 and
 * blockio_weight to 1000. Other code in this file compares against
 * exactly these values to decide whether a setting was changed (see
 * cgroup_context_get_mask()) and treats (uint64_t) -1 as "no memory
 * limit" (see cgroup_context_apply()). */
29 void cgroup_context_init(CGroupContext *c) {
32 /* Initialize everything to the kernel defaults, assuming the
33 * structure is preinitialized to 0 */
36 c->memory_limit = (uint64_t) -1;
37 c->blockio_weight = 1000;
/* Unlink the device-allow entry a from the device_allow list of c.
 * NOTE(review): only the list removal is visible here; presumably the
 * entry itself is released as well -- confirm. */
40 void cgroup_context_free_device_allow(CGroupContext *c, CGroupDeviceAllow *a) {
44 LIST_REMOVE(CGroupDeviceAllow, device_allow, c->device_allow, a);
/* Unlink the per-device weight entry w from the blockio_device_weights
 * list of c.
 * NOTE(review): only the list removal is visible here; presumably the
 * entry itself is released as well -- confirm. */
49 void cgroup_context_free_blockio_device_weight(CGroupContext *c, CGroupBlockIODeviceWeight *w) {
53 LIST_REMOVE(CGroupBlockIODeviceWeight, device_weights, c->blockio_device_weights, w);
/* Unlink the per-device bandwidth entry b from the
 * blockio_device_bandwidths list of c.
 * NOTE(review): only the list removal is visible here; presumably the
 * entry itself is released as well -- confirm. */
58 void cgroup_context_free_blockio_device_bandwidth(CGroupContext *c, CGroupBlockIODeviceBandwidth *b) {
62 LIST_REMOVE(CGroupBlockIODeviceBandwidth, device_bandwidths, c->blockio_device_bandwidths, b);
/* Empty the three lists hanging off c: per-device block I/O weights,
 * per-device block I/O bandwidths and the device allow list.
 *
 * Each loop hands the current list head to the matching
 * cgroup_context_free_*() helper until the head pointer becomes NULL;
 * the loops terminate because each helper removes the entry it is given
 * from the list. */
67 void cgroup_context_done(CGroupContext *c) {
70 while (c->blockio_device_weights)
71 cgroup_context_free_blockio_device_weight(c, c->blockio_device_weights);
73 while (c->blockio_device_bandwidths)
74 cgroup_context_free_blockio_device_bandwidth(c, c->blockio_device_bandwidths);
76 while (c->device_allow)
77 cgroup_context_free_device_allow(c, c->device_allow);
/* Format the settings held in c as "<prefix>Key=value" entries.
 *
 * c:      context to dump.
 * f:      output stream (NOTE(review): the output calls themselves are
 *         not visible in this excerpt; presumably fprintf() to f).
 * prefix: string put in front of every entry; passed through
 *         strempty() first (presumably so that NULL is accepted --
 *         confirm against the helper).
 *
 * Scalar settings are emitted first, followed by one entry per element
 * of the device_allow, blockio_device_weights and
 * blockio_device_bandwidths lists. */
80 void cgroup_context_dump(CGroupContext *c, FILE* f, const char *prefix) {
81 CGroupBlockIODeviceBandwidth *b;
82 CGroupBlockIODeviceWeight *w;
88 prefix = strempty(prefix);
91 "%sCPUAccounting=%s\n"
92 "%sBlockIOAccounting=%s\n"
93 "%sMemoryAccounting=%s\n"
95 "%sBlockIOWeight=%lu\n"
96 "%sMemoryLimit=%" PRIu64 "\n"
97 "%sDevicePolicy=%s\n",
98 prefix, yes_no(c->cpu_accounting),
99 prefix, yes_no(c->blockio_accounting),
100 prefix, yes_no(c->memory_accounting),
101 prefix, c->cpu_shares,
102 prefix, c->blockio_weight,
103 prefix, c->memory_limit,
104 prefix, cgroup_device_policy_to_string(c->device_policy));
/* One entry per allowed device: path plus the granted access letters. */
106 LIST_FOREACH(device_allow, a, c->device_allow)
108 "%sDeviceAllow=%s %s%s%s\n",
111 a->r ? "r" : "", a->w ? "w" : "", a->m ? "m" : "");
113 LIST_FOREACH(device_weights, w, c->blockio_device_weights)
/* NOTE(review): unlike every other format string in this function,
 * the one below has no trailing "\n", so consecutive entries run
 * together on one output line. Looks like an oversight. */
115 "%sBlockIODeviceWeight=%s %lu",
120 LIST_FOREACH(device_bandwidths, b, c->blockio_device_bandwidths) {
121 char buf[FORMAT_BYTES_MAX];
/* The key name depends on whether the entry limits reads or writes;
 * the value is rendered via format_bytes(). */
126 b->read ? "BlockIOReadBandwidth" : "BlockIOWriteBandwidth",
128 format_bytes(buf, sizeof(buf), b->bandwidth));
/* Determine the block device number to use for path p and store it in
 * *dev.
 *
 * Visible behavior:
 *  - if stat'ing p fails, a warning including the errno text (%m) is
 *    logged;
 *  - if p is itself a block device node (S_ISBLK), that case is handled
 *    first;
 *  - otherwise, if the file lives on a file system with a non-zero major
 *    device number, the backing block device is used and
 *    block_get_whole_disk() is asked to replace a partition by its
 *    whole disk (its return value is not checked here);
 *  - otherwise a warning is logged that no block device could be
 *    determined.
 *
 * NOTE(review): the return statements are not visible in this excerpt;
 * callers in this file store the result in r, presumably following the
 * "negative on failure" convention used by the neighbouring calls --
 * confirm. */
132 static int lookup_blkio_device(const char *p, dev_t *dev) {
141 log_warning("Couldn't stat device %s: %m", p);
145 if (S_ISBLK(st.st_mode))
147 else if (major(st.st_dev) != 0) {
148 /* If this is not a device node then find the block
149 * device this file is stored on */
152 /* If this is a partition, try to get the originating
154 block_get_whole_disk(*dev, dev);
156 log_warning("%s is not a block device and file system block device cannot be determined or is not local.", p);
/* Grant the cgroup at path access to the device node `node` by writing
 * an entry to the "devices.allow" attribute of the "devices" controller.
 *
 * path: cgroup path the attribute is written to.
 * node: file system path of the device node; it is stat'ed and must be
 *       a character or block device, otherwise a warning is logged.
 * acc:  access string for the entry (callers in this file pass strings
 *       such as "rw").
 *
 * The entry is built from the device type ('c' or 'b') and the major and
 * minor numbers taken from st_rdev. Failures are logged as warnings.
 *
 * NOTE(review): the stat failure message below does not include the
 * errno text (no "%m"), unlike the equivalent message in
 * lookup_blkio_device(). */
163 static int whitelist_device(const char *path, const char *node, const char *acc) {
164 char buf[2+DECIMAL_STR_MAX(dev_t)*2+2+4];
171 if (stat(node, &st) < 0) {
172 log_warning("Couldn't stat device %s", node);
176 if (!S_ISCHR(st.st_mode) && !S_ISBLK(st.st_mode)) {
177 log_warning("%s is not a device.", node);
183 S_ISCHR(st.st_mode) ? 'c' : 'b',
184 major(st.st_rdev), minor(st.st_rdev),
187 r = cg_set_attribute("devices", path, "devices.allow", buf);
189 log_warning("Failed to set devices.allow on %s: %s", path, strerror(-r));
/* Write the settings stored in c to the cgroup at path, one controller
 * at a time, for every controller whose bit is set in mask.
 *
 *  CGROUP_CPU:    cpu.shares
 *  CGROUP_BLKIO:  blkio.weight, then one blkio.weight_device entry per
 *                 element of blockio_device_weights, then one
 *                 blkio.throttle.{read,write}_bps_device entry per
 *                 element of blockio_device_bandwidths
 *  CGROUP_MEMORY: memory.limit_in_bytes (the literal "-1" when
 *                 memory_limit is (uint64_t) -1)
 *  CGROUP_DEVICE: resets the device list, then whitelists devices
 *
 * The function returns void: attribute write failures are only logged
 * and do not stop the remaining settings from being applied. */
194 void cgroup_context_apply(CGroupContext *c, CGroupControllerMask mask, const char *path) {
203 if (mask & CGROUP_CPU) {
204 char buf[DECIMAL_STR_MAX(unsigned long) + 1];
206 sprintf(buf, "%lu\n", c->cpu_shares);
207 r = cg_set_attribute("cpu", path, "cpu.shares", buf);
209 log_warning("Failed to set cpu.shares on %s: %s", path, strerror(-r));
212 if (mask & CGROUP_BLKIO) {
/* One buffer sized for the largest of the three strings formatted
 * below: plain weight, "maj:min weight", "maj:min bandwidth". */
213 char buf[MAX3(DECIMAL_STR_MAX(unsigned long)+1,
214 DECIMAL_STR_MAX(dev_t)*2+2+DECIMAL_STR_MAX(unsigned long)*1,
215 DECIMAL_STR_MAX(dev_t)*2+2+DECIMAL_STR_MAX(uint64_t)+1)];
216 CGroupBlockIODeviceWeight *w;
217 CGroupBlockIODeviceBandwidth *b;
219 sprintf(buf, "%lu\n", c->blockio_weight);
220 r = cg_set_attribute("blkio", path, "blkio.weight", buf);
222 log_warning("Failed to set blkio.weight on %s: %s", path, strerror(-r));
224 /* FIXME: no way to reset this list */
225 LIST_FOREACH(device_weights, w, c->blockio_device_weights) {
228 r = lookup_blkio_device(w->path, &dev);
/* NOTE(review): this is the only value in this function formatted
 * without a trailing "\n"; the other sprintf() calls all end in
 * "\n". Confirm whether the difference is intended. */
232 sprintf(buf, "%u:%u %lu", major(dev), minor(dev), w->weight);
233 r = cg_set_attribute("blkio", path, "blkio.weight_device", buf);
235 log_error("Failed to set blkio.weight_device on %s: %s", path, strerror(-r));
238 /* FIXME: no way to reset this list */
239 LIST_FOREACH(device_bandwidths, b, c->blockio_device_bandwidths) {
243 r = lookup_blkio_device(b->path, &dev);
/* Pick the throttle attribute matching the direction of the limit. */
247 a = b->read ? "blkio.throttle.read_bps_device" : "blkio.throttle.write_bps_device";
249 sprintf(buf, "%u:%u %" PRIu64 "\n", major(dev), minor(dev), b->bandwidth);
250 r = cg_set_attribute("blkio", path, a, buf);
252 log_error("Failed to set %s on %s: %s", a, path, strerror(-r));
256 if (mask & CGROUP_MEMORY) {
257 char buf[DECIMAL_STR_MAX(uint64_t) + 1];
/* (uint64_t) -1 means "no limit configured"; in that case the
 * literal string "-1" is written instead of a byte count. */
258 if (c->memory_limit != (uint64_t) -1) {
259 sprintf(buf, "%" PRIu64 "\n", c->memory_limit);
260 r = cg_set_attribute("memory", path, "memory.limit_in_bytes", buf);
262 r = cg_set_attribute("memory", path, "memory.limit_in_bytes", "-1");
265 log_error("Failed to set memory.limit_in_bytes on %s: %s", path, strerror(-r));
268 if (mask & CGROUP_DEVICE) {
269 CGroupDeviceAllow *a;
/* Start from a known state: deny everything when any restriction is
 * configured (explicit allow list or a non-auto policy), otherwise
 * allow everything. */
271 if (c->device_allow || c->device_policy != CGROUP_AUTO)
272 r = cg_set_attribute("devices", path, "devices.deny", "a");
274 r = cg_set_attribute("devices", path, "devices.allow", "a");
276 log_error("Failed to reset devices.list on %s: %s", path, strerror(-r));
/* For the "closed" policy, or "auto" with an explicit allow list,
 * a built-in set of device nodes is whitelisted first. */
278 if (c->device_policy == CGROUP_CLOSED ||
279 (c->device_policy == CGROUP_AUTO && c->device_allow)) {
/* NUL-separated (device node, access string) pairs. */
280 static const char auto_devices[] =
284 "/dev/random\0" "rw\0"
285 "/dev/urandom\0" "rw\0";
289 NULSTR_FOREACH_PAIR(x, y, auto_devices)
290 whitelist_device(path, x, y);
/* Then whitelist every explicitly configured device. */
293 LIST_FOREACH(device_allow, a, c->device_allow) {
308 whitelist_device(path, a->path, acc);
/* Compute the set of cgroup controllers required by the settings in c.
 *
 * A controller is added to the mask when accounting for it is enabled or
 * when one of its settings differs from the default:
 *  - CGROUP_CPUACCT | CGROUP_CPU: cpu_accounting, or cpu_shares != 1024
 *  - CGROUP_BLKIO: blockio_accounting, blockio_weight != 1000, or any
 *    per-device weight/bandwidth entry
 *  - CGROUP_MEMORY: memory_accounting, or memory_limit != (uint64_t) -1
 *  - CGROUP_DEVICE: any device_allow entry, or a device policy other
 *    than CGROUP_AUTO */
313 CGroupControllerMask cgroup_context_get_mask(CGroupContext *c) {
314 CGroupControllerMask mask = 0;
316 /* Figure out which controllers we need */
318 if (c->cpu_accounting || c->cpu_shares != 1024)
319 mask |= CGROUP_CPUACCT | CGROUP_CPU;
321 if (c->blockio_accounting ||
322 c->blockio_weight != 1000 ||
323 c->blockio_device_weights ||
324 c->blockio_device_bandwidths)
325 mask |= CGROUP_BLKIO;
327 if (c->memory_accounting ||
328 c->memory_limit != (uint64_t) -1)
329 mask |= CGROUP_MEMORY;
331 if (c->device_allow || c->device_policy != CGROUP_AUTO)
332 mask |= CGROUP_DEVICE;
/* Return the controller mask required by unit u's own cgroup context,
 * as computed by cgroup_context_get_mask().
 * NOTE(review): how a unit without a cgroup context is handled is not
 * visible in this excerpt -- confirm. */
337 static CGroupControllerMask unit_get_cgroup_mask(Unit *u) {
340 c = unit_get_cgroup_context(u);
344 return cgroup_context_get_mask(c);
/* Return the union of the controller masks needed by the members of u,
 * recursively.
 *
 * Candidates are the units in u->dependencies[UNIT_BEFORE]; the
 * comparison of each candidate's slice with u selects those that
 * actually belong to u. For each member both its own mask and the mask
 * of its members are OR'ed in. */
347 static CGroupControllerMask unit_get_members_mask(Unit *u) {
348 CGroupControllerMask mask = 0;
354 SET_FOREACH(m, u->dependencies[UNIT_BEFORE], i) {
356 if (UNIT_DEREF(m->slice) != u)
359 mask |= unit_get_cgroup_mask(m) | unit_get_members_mask(m);
/* Return the controllers that u needs because of the units sharing its
 * slice: the members mask of u's slice, restricted to CGROUP_CPU,
 * CGROUP_BLKIO and CGROUP_CPUACCT.
 * NOTE(review): the value returned when u has no slice is not visible in
 * this excerpt -- confirm. */
365 static CGroupControllerMask unit_get_siblings_mask(Unit *u) {
368 if (!UNIT_ISSET(u->slice))
371 /* Sibling propagation is only relevant for weight-based
372 * controllers, so let's mask out everything else */
373 return unit_get_members_mask(UNIT_DEREF(u->slice)) &
374 (CGROUP_CPU|CGROUP_BLKIO|CGROUP_CPUACCT);
/* Create the cgroup for unit u in the controllers selected by mask and
 * record the result in u.
 *
 * Visible steps:
 *  1. compute the unit's default cgroup path;
 *  2. register path -> u in the manager's cgroup_unit hashmap (an error
 *     is logged if the cgroup "exists already");
 *  3. create the group with cg_create_with_mask();
 *  4. if the unit already had a different cgroup_path, migrate from the
 *     old group to the new one;
 *  5. replace u->cgroup_path with the new path, set cgroup_realized and
 *     store mask in cgroup_mask.
 *
 * NOTE(review): the conditions guarding the error logs and the return
 * statements are not visible in this excerpt; the role of is_in_hash is
 * likewise not shown. */
377 static int unit_create_cgroups(Unit *u, CGroupControllerMask mask) {
380 bool is_in_hash = false;
384 path = unit_default_cgroup_path(u);
388 r = hashmap_put(u->manager->cgroup_unit, path, u);
393 log_error("cgroup %s exists already: %s", path, strerror(-r));
398 /* First, create our own group */
399 r = cg_create_with_mask(mask, path);
401 log_error("Failed to create cgroup %s: %s", path, strerror(-r));
403 /* Then, possibly move things over */
404 if (u->cgroup_path && !streq(path, u->cgroup_path)) {
405 r = cg_migrate_with_mask(mask, u->cgroup_path, path);
407 log_error("Failed to migrate cgroup %s: %s", path, strerror(-r));
411 /* And remember the new data */
412 free(u->cgroup_path);
413 u->cgroup_path = path;
416 u->cgroup_realized = true;
417 u->cgroup_mask = mask;
/* Synchronously realize the cgroup of unit u, parents first.
 *
 * The unit is first taken off the manager's cgroup queue if it is on it.
 * The required controller mask is the union of the unit's own mask, its
 * members' mask and its siblings' mask, limited to the controllers the
 * manager reports as supported. The check on cgroup_realized and
 * cgroup_mask detects the case where the unit is already realized with
 * exactly that mask. Otherwise the parent slice is realized recursively
 * and the unit's own group is created.
 *
 * Returns the result of unit_create_cgroups() on the creation path. */
422 static int unit_realize_cgroup_now(Unit *u) {
423 CGroupControllerMask mask;
427 if (u->in_cgroup_queue) {
428 LIST_REMOVE(Unit, cgroup_queue, u->manager->cgroup_queue, u);
429 u->in_cgroup_queue = false;
432 mask = unit_get_cgroup_mask(u) | unit_get_members_mask(u) | unit_get_siblings_mask(u);
433 mask &= u->manager->cgroup_supported;
435 if (u->cgroup_realized &&
436 u->cgroup_mask == mask)
439 /* First, realize parents */
440 if (UNIT_ISSET(u->slice))
/* NOTE(review): the result of realizing the parent is ignored, so a
 * failure there does not stop creation of this unit's group. */
441 unit_realize_cgroup_now(UNIT_DEREF(u->slice));
443 /* And then do the real work */
444 return unit_create_cgroups(u, mask);
/* Put unit u at the front of the manager's cgroup queue and mark it as
 * queued. The in_cgroup_queue flag is checked first so that a unit is
 * not linked into the list twice. */
447 static void unit_add_to_cgroup_queue(Unit *u) {
449 if (u->in_cgroup_queue)
452 LIST_PREPEND(Unit, cgroup_queue, u->manager->cgroup_queue, u);
453 u->in_cgroup_queue = true;
/* Process every unit on m's cgroup queue: realize its cgroup and, when
 * that did not fail, apply the unit's cgroup context to the realized
 * group.
 *
 * The loop always works on the current queue head; it makes progress
 * because unit_realize_cgroup_now() removes the unit from the queue.
 *
 * NOTE(review): the returned unsigned value is not visible in this
 * excerpt; presumably the number of units processed -- confirm. */
456 unsigned manager_dispatch_cgroup_queue(Manager *m) {
460 while ((i = m->cgroup_queue)) {
461 assert(i->in_cgroup_queue);
463 if (unit_realize_cgroup_now(i) >= 0)
464 cgroup_context_apply(unit_get_cgroup_context(i), i->cgroup_mask, i->cgroup_path);
/* Queue the siblings of u, and the siblings of each of its ancestors,
 * for deferred cgroup realization.
 *
 * Walks up the slice chain starting at u. At each level the candidates
 * are the units in the slice's dependencies[UNIT_BEFORE] set, filtered
 * by a comparison of their slice with the current slice; matching units
 * are added to the cgroup queue. */
472 static void unit_queue_siblings(Unit *u) {
475 /* This adds the siblings of the specified unit and the
476 * siblings of all parent units to the cgroup queue. (But
477 * neither the specified unit itself nor the parents.) */
479 while ((slice = UNIT_DEREF(u->slice))) {
483 SET_FOREACH(m, slice->dependencies[UNIT_BEFORE], i) {
487 if (UNIT_DEREF(m->slice) != slice)
490 unit_add_to_cgroup_queue(m);
/* Public entry point for realizing the cgroup of unit u.
 *
 * Order of operations visible here: queue the unit's siblings (and its
 * ancestors' siblings) for deferred realization, realize this unit and
 * its parents synchronously, then apply the unit's cgroup context to the
 * realized group using the mask and path stored in u.
 *
 * NOTE(review): the handling of a missing cgroup context and of a failed
 * realization, as well as the return value, are not visible in this
 * excerpt. */
497 int unit_realize_cgroup(Unit *u) {
503 c = unit_get_cgroup_context(u);
507 /* So, here's the deal: when realizing the cgroups for this
508 * unit, we need to first create all parents, but there's more
509 * actually: for the weight-based controllers we also need to
510 * make sure that all our siblings (i.e. units that are in the
511 * same slice as we are) have cgroup too. Otherwise things
512 * would become very uneven as each of their processes would
513 * get as much resources as all our group together. This call
514 * will synchronously create the parent cgroups, but will
515 * defer work on the siblings to the next event loop
518 /* Add all sibling slices to the cgroup queue. */
519 unit_queue_siblings(u);
521 /* And realize this one now */
522 r = unit_realize_cgroup_now(u);
524 /* And apply the values */
526 cgroup_context_apply(c, u->cgroup_mask, u->cgroup_path);
/* Tear down the cgroup of unit u and forget about it.
 *
 * The group is trimmed in the controllers recorded in u->cgroup_mask;
 * the last argument to cg_trim_with_mask() is false only for the unit
 * named SPECIAL_ROOT_SLICE (presumably so that the root group itself is
 * kept -- confirm against cg_trim_with_mask()). A trim failure is logged
 * at debug level only. Afterwards the path is removed from the manager's
 * cgroup_unit hashmap, freed and reset to NULL, and cgroup_realized is
 * cleared. */
531 void unit_destroy_cgroup(Unit *u) {
539 r = cg_trim_with_mask(u->cgroup_mask, u->cgroup_path, !unit_has_name(u, SPECIAL_ROOT_SLICE));
541 log_debug("Failed to destroy cgroup %s: %s", u->cgroup_path, strerror(-r));
543 hashmap_remove(u->manager->cgroup_unit, u->cgroup_path);
545 free(u->cgroup_path);
546 u->cgroup_path = NULL;
547 u->cgroup_realized = false;
/* Try to identify the main process of unit u by scanning the processes
 * in its cgroup (SYSTEMD_CGROUP_CONTROLLER hierarchy).
 *
 * Processes whose parent could be determined and is not mypid are
 * skipped. According to the comment below, finding more than one
 * remaining candidate means the main process cannot be determined.
 *
 * NOTE(review): the initialization of mypid and the return statements
 * are not visible in this excerpt; pid starts out as 0, presumably the
 * "not found" result -- confirm. */
552 pid_t unit_search_main_pid(Unit *u) {
553 _cleanup_fclose_ FILE *f = NULL;
554 pid_t pid = 0, npid, mypid;
561 if (cg_enumerate_processes(SYSTEMD_CGROUP_CONTROLLER, u->cgroup_path, &f) < 0)
565 while (cg_read_pid(f, &npid) > 0) {
571 /* Ignore processes that aren't our kids */
572 if (get_parent_of_pid(npid, &ppid) >= 0 && ppid != mypid)
576 /* Dang, there's more than one daemonized PID
577 in this group, so we don't know what process
578 is the main process. */
/* Set up the manager's root cgroup.
 *
 * Steps, following the numbered comments in the body:
 *  0. check that /sys/fs/cgroup/systemd is a mount point; otherwise warn
 *     that no control group support is available;
 *  1. determine the cgroup this process runs in and store it in
 *     m->cgroup_root; for a system instance a trailing
 *     "/" SPECIAL_SYSTEM_SLICE is looked for so it can be cut off, and a
 *     root of "/" is stored as the empty string;
 *  2. resolve the file system path of that cgroup;
 *  3. for a system instance, install the release agent (failure is only
 *     a warning);
 *  4. create the target group and attach this process to it: the system
 *     slice below the root for a system instance, the root itself
 *     otherwise;
 *  5. keep a directory fd open on the hierarchy in m->pin_cgroupfs_fd,
 *     closing any previously held one first;
 *  6. record the supported controllers in m->cgroup_supported.
 *
 * NOTE(review): the error checks around each step and the return values
 * are not visible in this excerpt. */
589 int manager_setup_cgroup(Manager *m) {
590 _cleanup_free_ char *path = NULL;
596 /* 0. Be nice to Ingo Molnar #628004 */
597 if (path_is_mount_point("/sys/fs/cgroup/systemd", false) <= 0) {
598 log_warning("No control group support available, not creating root group.");
602 /* 1. Determine hierarchy */
603 free(m->cgroup_root);
604 m->cgroup_root = NULL;
606 r = cg_pid_get_path(SYSTEMD_CGROUP_CONTROLLER, 0, &m->cgroup_root);
608 log_error("Cannot determine cgroup we are running in: %s", strerror(-r));
612 /* Already in /system.slice? If so, let's cut this off again */
613 if (m->running_as == SYSTEMD_SYSTEM) {
614 e = endswith(m->cgroup_root, "/" SPECIAL_SYSTEM_SLICE);
619 /* And make sure to store away the root value without trailing
620 * slash, even for the root dir, so that we can easily prepend
622 if (streq(m->cgroup_root, "/"))
623 m->cgroup_root[0] = 0;
626 r = cg_get_path(SYSTEMD_CGROUP_CONTROLLER, m->cgroup_root, NULL, &path);
628 log_error("Cannot find cgroup mount point: %s", strerror(-r));
632 log_debug("Using cgroup controller " SYSTEMD_CGROUP_CONTROLLER ". File system hierarchy is at %s.", path);
634 /* 3. Install agent */
635 if (m->running_as == SYSTEMD_SYSTEM) {
636 r = cg_install_release_agent(SYSTEMD_CGROUP_CONTROLLER, SYSTEMD_CGROUP_AGENT_PATH);
638 log_warning("Failed to install release agent, ignoring: %s", strerror(-r));
640 log_debug("Installed release agent.");
642 log_debug("Release agent already installed.");
645 /* 4. Realize the system slice and put us in there */
646 if (m->running_as == SYSTEMD_SYSTEM) {
647 a = strappenda(m->cgroup_root, "/" SPECIAL_SYSTEM_SLICE);
648 r = cg_create_and_attach(SYSTEMD_CGROUP_CONTROLLER, a, 0);
650 r = cg_create_and_attach(SYSTEMD_CGROUP_CONTROLLER, m->cgroup_root, 0);
652 log_error("Failed to create root cgroup hierarchy: %s", strerror(-r));
656 /* 5. And pin it, so that it cannot be unmounted */
657 if (m->pin_cgroupfs_fd >= 0)
658 close_nointr_nofail(m->pin_cgroupfs_fd);
660 m->pin_cgroupfs_fd = open(path, O_RDONLY|O_CLOEXEC|O_DIRECTORY|O_NOCTTY|O_NONBLOCK);
662 log_error("Failed to open pin file: %m");
666 /* 6. Figure out which controllers are supported */
667 m->cgroup_supported = cg_mask_supported();
/* Release the manager's cgroup state.
 *
 * When delete is true and a root is known, the root cgroup is trimmed
 * with cg_trim() (last argument false). The pinned cgroupfs fd is then
 * closed and reset to -1 if it was open, and m->cgroup_root is freed and
 * reset to NULL. */
672 void manager_shutdown_cgroup(Manager *m, bool delete) {
675 /* We can't really delete the group, since we are in it. But
677 if (delete && m->cgroup_root)
678 cg_trim(SYSTEMD_CGROUP_CONTROLLER, m->cgroup_root, false);
680 if (m->pin_cgroupfs_fd >= 0) {
681 close_nointr_nofail(m->pin_cgroupfs_fd);
682 m->pin_cgroupfs_fd = -1;
685 free(m->cgroup_root);
686 m->cgroup_root = NULL;
/* Look up the unit that owns the cgroup path `cgroup` in the manager's
 * cgroup_unit hashmap.
 *
 * The exact path is tried first; a second lookup is then made with a
 * derived path p.
 * NOTE(review): how p is derived is not visible in this excerpt;
 * presumably parent paths of `cgroup` are tried in turn -- confirm. */
689 Unit* manager_get_unit_by_cgroup(Manager *m, const char *cgroup) {
696 u = hashmap_get(m->cgroup_unit, cgroup);
710 u = hashmap_get(m->cgroup_unit, p);
/* Find the unit that process pid belongs to: resolve the process's
 * cgroup path in the SYSTEMD_CGROUP_CONTROLLER hierarchy and map that
 * path to a unit with manager_get_unit_by_cgroup(). The temporary path
 * string is declared _cleanup_free_.
 * NOTE(review): the result returned when the cgroup path cannot be
 * determined is not visible in this excerpt. */
716 Unit *manager_get_unit_by_pid(Manager *m, pid_t pid) {
717 _cleanup_free_ char *cgroup = NULL;
725 r = cg_pid_get_path(SYSTEMD_CGROUP_CONTROLLER, pid, &cgroup);
729 return manager_get_unit_by_cgroup(m, cgroup);
/* Handle a notification that the cgroup `cgroup` has run empty.
 *
 * The owning unit is looked up, its cgroup is checked with
 * cg_is_empty_recursive(), the unit type's notify_cgroup_empty callback
 * is invoked when the vtable provides one, and the unit is added to the
 * GC queue.
 * NOTE(review): the checks on u and on the emptiness result r, and the
 * return value, are not visible in this excerpt. */
732 int manager_notify_cgroup_empty(Manager *m, const char *cgroup) {
739 u = manager_get_unit_by_cgroup(m, cgroup);
741 r = cg_is_empty_recursive(SYSTEMD_CGROUP_CONTROLLER, u->cgroup_path, true);
743 if (UNIT_VTABLE(u)->notify_cgroup_empty)
744 UNIT_VTABLE(u)->notify_cgroup_empty(u);
746 unit_add_to_gc_queue(u);
/* Names of the CGroupDevicePolicy values, indexed by the enum value.
 * DEFINE_STRING_TABLE_LOOKUP() below generates the lookup helpers for
 * this table; cgroup_device_policy_to_string(), used by
 * cgroup_context_dump(), is presumably one of them -- confirm against
 * the macro definition. */
753 static const char* const cgroup_device_policy_table[_CGROUP_DEVICE_POLICY_MAX] = {
754 [CGROUP_AUTO] = "auto",
755 [CGROUP_CLOSED] = "closed",
756 [CGROUP_STRICT] = "strict",
759 DEFINE_STRING_TABLE_LOOKUP(cgroup_device_policy, CGroupDevicePolicy);