src/core/cgroup.c

   1 /*-*- Mode: C; c-basic-offset: 8; indent-tabs-mode: nil -*-*/
   2
   3 /***
   4   This file is part of systemd.
   5
   6   Copyright 2013 Lennart Poettering
   7
   8   systemd is free software; you can redistribute it and/or modify it
   9   under the terms of the GNU Lesser General Public License as published by
  10   the Free Software Foundation; either version 2.1 of the License, or
  11   (at your option) any later version.
  12
  13   systemd is distributed in the hope that it will be useful, but
  14   WITHOUT ANY WARRANTY; without even the implied warranty of
  15   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
  16   Lesser General Public License for more details.
  17
  18   You should have received a copy of the GNU Lesser General Public License
  19   along with systemd; If not, see <http://www.gnu.org/licenses/>.
  20 ***/
  21
  22 #include <fcntl.h>
  23
  24 #include "path-util.h"
  25 #include "special.h"
  26 #include "cgroup-util.h"
  27 #include "cgroup.h"
  28
  29 void cgroup_context_init(CGroupContext *c) {
  30         assert(c);
  31
  32         /* Initialize everything to the kernel defaults, assuming the
  33          * structure is preinitialized to 0 */
  34
  35         c->cpu_shares = 1024;
  36         c->memory_limit = (uint64_t) -1;
  37         c->blockio_weight = 1000;
  38 }
  39
  40 void cgroup_context_free_device_allow(CGroupContext *c, CGroupDeviceAllow *a) {
  41         assert(c);
  42         assert(a);
  43
  44         LIST_REMOVE(device_allow, c->device_allow, a);
  45         free(a->path);
  46         free(a);
  47 }
  48
  49 void cgroup_context_free_blockio_device_weight(CGroupContext *c, CGroupBlockIODeviceWeight *w) {
  50         assert(c);
  51         assert(w);
  52
  53         LIST_REMOVE(device_weights, c->blockio_device_weights, w);
  54         free(w->path);
  55         free(w);
  56 }
  57
  58 void cgroup_context_free_blockio_device_bandwidth(CGroupContext *c, CGroupBlockIODeviceBandwidth *b) {
  59         assert(c);
  60         assert(b);
  61
  62         LIST_REMOVE(device_bandwidths, c->blockio_device_bandwidths, b);
  63         free(b->path);
  64         free(b);
  65 }
  66
  67 void cgroup_context_done(CGroupContext *c) {
  68         assert(c);
  69
  70         while (c->blockio_device_weights)
  71                 cgroup_context_free_blockio_device_weight(c, c->blockio_device_weights);
  72
  73         while (c->blockio_device_bandwidths)
  74                 cgroup_context_free_blockio_device_bandwidth(c, c->blockio_device_bandwidths);
  75
  76         while (c->device_allow)
  77                 cgroup_context_free_device_allow(c, c->device_allow);
  78 }
  79
  80 void cgroup_context_dump(CGroupContext *c, FILE* f, const char *prefix) {
  81         CGroupBlockIODeviceBandwidth *b;
  82         CGroupBlockIODeviceWeight *w;
  83         CGroupDeviceAllow *a;
  84
  85         assert(c);
  86         assert(f);
  87
  88         prefix = strempty(prefix);
  89
  90         fprintf(f,
  91                 "%sCPUAccounting=%s\n"
  92                 "%sBlockIOAccounting=%s\n"
  93                 "%sMemoryAccounting=%s\n"
  94                 "%sCPUShares=%lu\n"
  95                 "%sBlockIOWeight=%lu\n"
  96                 "%sMemoryLimit=%" PRIu64 "\n"
  97                 "%sDevicePolicy=%s\n",
  98                 prefix, yes_no(c->cpu_accounting),
  99                 prefix, yes_no(c->blockio_accounting),
 100                 prefix, yes_no(c->memory_accounting),
 101                 prefix, c->cpu_shares,
 102                 prefix, c->blockio_weight,
 103                 prefix, c->memory_limit,
 104                 prefix, cgroup_device_policy_to_string(c->device_policy));
 105
 106         LIST_FOREACH(device_allow, a, c->device_allow)
 107                 fprintf(f,
 108                         "%sDeviceAllow=%s %s%s%s\n",
 109                         prefix,
 110                         a->path,
 111                         a->r ? "r" : "", a->w ? "w" : "", a->m ? "m" : "");
 112
 113         LIST_FOREACH(device_weights, w, c->blockio_device_weights)
 114                 fprintf(f,
 115                         "%sBlockIODeviceWeight=%s %lu",
 116                         prefix,
 117                         w->path,
 118                         w->weight);
 119
 120         LIST_FOREACH(device_bandwidths, b, c->blockio_device_bandwidths) {
 121                 char buf[FORMAT_BYTES_MAX];
 122
 123                 fprintf(f,
 124                         "%s%s=%s %s\n",
 125                         prefix,
 126                         b->read ? "BlockIOReadBandwidth" : "BlockIOWriteBandwidth",
 127                         b->path,
 128                         format_bytes(buf, sizeof(buf), b->bandwidth));
 129         }
 130 }
 131
 132 static int lookup_blkio_device(const char *p, dev_t *dev) {
 133         struct stat st;
 134         int r;
 135
 136         assert(p);
 137         assert(dev);
 138
 139         r = stat(p, &st);
 140         if (r < 0) {
 141                 log_warning("Couldn't stat device %s: %m", p);
 142                 return -errno;
 143         }
 144
 145         if (S_ISBLK(st.st_mode))
 146                 *dev = st.st_rdev;
 147         else if (major(st.st_dev) != 0) {
 148                 /* If this is not a device node then find the block
 149                  * device this file is stored on */
 150                 *dev = st.st_dev;
 151
 152                 /* If this is a partition, try to get the originating
 153                  * block device */
 154                 block_get_whole_disk(*dev, dev);
 155         } else {
 156                 log_warning("%s is not a block device and file system block device cannot be determined or is not local.", p);
 157                 return -ENODEV;
 158         }
 159
 160         return 0;
 161 }
 162
 163 static int whitelist_device(const char *path, const char *node, const char *acc) {
 164         char buf[2+DECIMAL_STR_MAX(dev_t)*2+2+4];
 165         struct stat st;
 166         int r;
 167
 168         assert(path);
 169         assert(acc);
 170
 171         if (stat(node, &st) < 0) {
 172                 log_warning("Couldn't stat device %s", node);
 173                 return -errno;
 174         }
 175
 176         if (!S_ISCHR(st.st_mode) && !S_ISBLK(st.st_mode)) {
 177                 log_warning("%s is not a device.", node);
 178                 return -ENODEV;
 179         }
 180
 181         sprintf(buf,
 182                 "%c %u:%u %s",
 183                 S_ISCHR(st.st_mode) ? 'c' : 'b',
 184                 major(st.st_rdev), minor(st.st_rdev),
 185                 acc);
 186
 187         r = cg_set_attribute("devices", path, "devices.allow", buf);
 188         if (r < 0)
 189                 log_warning("Failed to set devices.allow on %s: %s", path, strerror(-r));
 190
 191         return r;
 192 }
 193
/* Grants the cgroup 'path' access to all devices of the driver called
 * 'name', by looking up its major number(s) in /proc/devices and writing
 * a "<type> <major>:* <acc>" entry to devices.allow for each match.
 *
 * 'type' selects the section of /proc/devices to search: 'c' for
 * character devices, 'b' for block devices.
 *
 * Returns -errno if /proc/devices cannot be opened or read, 0 otherwise.
 * Note that 0 is returned even if no line matched or if writing the
 * attribute failed (the latter is only logged). */
static int whitelist_major(const char *path, const char *name, char type, const char *acc) {
        _cleanup_fclose_ FILE *f = NULL;
        char line[LINE_MAX];
        bool good = false; /* true while we are inside the section matching 'type' */
        int r;

        assert(path);
        assert(acc);
        assert(type == 'b' || type == 'c');

        f = fopen("/proc/devices", "re");
        if (!f) {
                log_warning("Cannot open /proc/devices to resolve %s (%c): %m", name, type);
                return -errno;
        }

        /* NOTE(review): FOREACH_LINE is defined elsewhere; presumably it
         * runs the third argument (here: goto fail) on a read error. */
        FOREACH_LINE(line, f, goto fail) {
                char buf[2+DECIMAL_STR_MAX(unsigned)+3+4], *p, *w;
                unsigned maj;

                truncate_nl(line);

                /* A section header of the requested type starts the
                 * region we are interested in... */
                if (type == 'c' && streq(line, "Character devices:")) {
                        good = true;
                        continue;
                }

                if (type == 'b' && streq(line, "Block devices:")) {
                        good = true;
                        continue;
                }

                /* ...and an empty line ends it. */
                if (isempty(line)) {
                        good = false;
                        continue;
                }

                if (!good)
                        continue;

                /* Split the line into the major number (p) and the
                 * driver name (w) at the first whitespace. */
                p = strstrip(line);

                w = strpbrk(p, WHITESPACE);
                if (!w)
                        continue;
                *w = 0;

                /* Skip lines whose first field is not a positive number */
                r = safe_atou(p, &maj);
                if (r < 0)
                        continue;
                if (maj <= 0)
                        continue;

                w++;
                w += strspn(w, WHITESPACE);
                if (!streq(w, name))
                        continue;

                sprintf(buf,
                        "%c %u:* %s",
                        type,
                        maj,
                        acc);

                /* Failure is logged only; we keep scanning for further
                 * lines carrying the same name. */
                r = cg_set_attribute("devices", path, "devices.allow", buf);
                if (r < 0)
                        log_warning("Failed to set devices.allow on %s: %s", path, strerror(-r));
        }

        return 0;

fail:
        log_warning("Failed to read /proc/devices: %m");
        return -errno;
}
 269
/* Writes the settings stored in 'c' to the cgroup attributes of the
 * cgroup 'path', for each controller whose bit is set in 'mask'.
 *
 * Nothing is returned: failures to write an attribute are logged and
 * otherwise ignored. If 'mask' is 0 this is a NOP. */
void cgroup_context_apply(CGroupContext *c, CGroupControllerMask mask, const char *path) {
        bool is_root;
        int r;

        assert(c);
        assert(path);

        if (mask == 0)
                return;

        /* Some cgroup attributes are not supported on the root cgroup,
         * hence silently ignore */
        is_root = isempty(path) || path_equal(path, "/");

        /* "cpu" controller: cpu.shares (skipped for the root cgroup) */
        if ((mask & CGROUP_CPU) && !is_root) {
                char buf[DECIMAL_STR_MAX(unsigned long) + 1];

                sprintf(buf, "%lu\n", c->cpu_shares);
                r = cg_set_attribute("cpu", path, "cpu.shares", buf);
                if (r < 0)
                        log_warning("Failed to set cpu.shares on %s: %s", path, strerror(-r));
        }

        /* "blkio" controller: the overall weight and the per-device
         * weights are skipped for the root cgroup, the per-device
         * bandwidth limits are applied in any case. */
        if (mask & CGROUP_BLKIO) {
                /* One buffer, sized for the largest of the three strings
                 * formatted into it below */
                char buf[MAX3(DECIMAL_STR_MAX(unsigned long)+1,
                              DECIMAL_STR_MAX(dev_t)*2+2+DECIMAL_STR_MAX(unsigned long)*1,
                              DECIMAL_STR_MAX(dev_t)*2+2+DECIMAL_STR_MAX(uint64_t)+1)];
                CGroupBlockIODeviceWeight *w;
                CGroupBlockIODeviceBandwidth *b;

                if (!is_root) {
                        sprintf(buf, "%lu\n", c->blockio_weight);
                        r = cg_set_attribute("blkio", path, "blkio.weight", buf);
                        if (r < 0)
                                log_warning("Failed to set blkio.weight on %s: %s", path, strerror(-r));

                        /* FIXME: no way to reset this list */
                        LIST_FOREACH(device_weights, w, c->blockio_device_weights) {
                                dev_t dev;

                                /* Entries whose device cannot be resolved
                                 * are skipped (the lookup logs why) */
                                r = lookup_blkio_device(w->path, &dev);
                                if (r < 0)
                                        continue;

                                sprintf(buf, "%u:%u %lu", major(dev), minor(dev), w->weight);
                                r = cg_set_attribute("blkio", path, "blkio.weight_device", buf);
                                if (r < 0)
                                        log_error("Failed to set blkio.weight_device on %s: %s", path, strerror(-r));
                        }
                }

                /* FIXME: no way to reset this list */
                LIST_FOREACH(device_bandwidths, b, c->blockio_device_bandwidths) {
                        const char *a;
                        dev_t dev;

                        r = lookup_blkio_device(b->path, &dev);
                        if (r < 0)
                                continue;

                        /* Read and write limits live in separate attributes */
                        a = b->read ? "blkio.throttle.read_bps_device" : "blkio.throttle.write_bps_device";

                        sprintf(buf, "%u:%u %" PRIu64 "\n", major(dev), minor(dev), b->bandwidth);
                        r = cg_set_attribute("blkio", path, a, buf);
                        if (r < 0)
                                log_error("Failed to set %s on %s: %s", a, path, strerror(-r));
                }
        }

        /* "memory" controller: a limit of (uint64_t) -1 is written out
         * as "-1" */
        if (mask & CGROUP_MEMORY) {
                if (c->memory_limit != (uint64_t) -1) {
                        char buf[DECIMAL_STR_MAX(uint64_t) + 1];

                        sprintf(buf, "%" PRIu64 "\n", c->memory_limit);
                        r = cg_set_attribute("memory", path, "memory.limit_in_bytes", buf);
                } else
                        r = cg_set_attribute("memory", path, "memory.limit_in_bytes", "-1");

                if (r < 0)
                        log_error("Failed to set memory.limit_in_bytes on %s: %s", path, strerror(-r));
        }

        /* "devices" controller (skipped for the root cgroup) */
        if ((mask & CGROUP_DEVICE) && !is_root) {
                CGroupDeviceAllow *a;

                /* Reset the list first: deny everything if any restriction
                 * is configured, otherwise allow everything. */
                if (c->device_allow || c->device_policy != CGROUP_AUTO)
                        r = cg_set_attribute("devices", path, "devices.deny", "a");
                else
                        r = cg_set_attribute("devices", path, "devices.allow", "a");
                if (r < 0)
                        log_warning("Failed to reset devices.list on %s: %s", path, strerror(-r));

                /* With the "closed" policy, or the "auto" policy combined
                 * with explicit entries, a fixed set of device nodes is
                 * allowed in addition. */
                if (c->device_policy == CGROUP_CLOSED ||
                    (c->device_policy == CGROUP_AUTO && c->device_allow)) {
                        /* Pairs of device node and access string */
                        static const char auto_devices[] =
                                "/dev/null\0" "rw\0"
                                "/dev/zero\0" "rw\0"
                                "/dev/full\0" "rw\0"
                                "/dev/random\0" "rw\0"
                                "/dev/urandom\0" "rw\0";

                        const char *x, *y;

                        NULSTR_FOREACH_PAIR(x, y, auto_devices)
                                whitelist_device(path, x, y);
                }

                /* Finally, the explicitly configured entries */
                LIST_FOREACH(device_allow, a, c->device_allow) {
                        char acc[4];
                        unsigned k = 0;

                        /* Build the access string from the entry's flags */
                        if (a->r)
                                acc[k++] = 'r';
                        if (a->w)
                                acc[k++] = 'w';
                        if (a->m)
                                acc[k++] = 'm';

                        /* No access requested at all, nothing to write */
                        if (k == 0)
                                continue;

                        acc[k++] = 0;

                        /* An entry is either a device node path, or a
                         * "block-"/"char-" prefixed name that is resolved
                         * via whitelist_major(). */
                        if (startswith(a->path, "/dev/"))
                                whitelist_device(path, a->path, acc);
                        else if (startswith(a->path, "block-"))
                                whitelist_major(path, a->path + 6, 'b', acc);
                        else if (startswith(a->path, "char-"))
                                whitelist_major(path, a->path + 5, 'c', acc);
                        else
                                log_debug("Ignoring device %s while writing cgroup attribute.", a->path);
                }
        }
}
 404
 405 CGroupControllerMask cgroup_context_get_mask(CGroupContext *c) {
 406         CGroupControllerMask mask = 0;
 407
 408         /* Figure out which controllers we need */
 409
 410         if (c->cpu_accounting || c->cpu_shares != 1024)
 411                 mask |= CGROUP_CPUACCT | CGROUP_CPU;
 412
 413         if (c->blockio_accounting ||
 414             c->blockio_weight != 1000 ||
 415             c->blockio_device_weights ||
 416             c->blockio_device_bandwidths)
 417                 mask |= CGROUP_BLKIO;
 418
 419         if (c->memory_accounting ||
 420             c->memory_limit != (uint64_t) -1)
 421                 mask |= CGROUP_MEMORY;
 422
 423         if (c->device_allow || c->device_policy != CGROUP_AUTO)
 424                 mask |= CGROUP_DEVICE;
 425
 426         return mask;
 427 }
 428
 429 CGroupControllerMask unit_get_cgroup_mask(Unit *u) {
 430         CGroupContext *c;
 431
 432         c = unit_get_cgroup_context(u);
 433         if (!c)
 434                 return 0;
 435
 436         return cgroup_context_get_mask(c);
 437 }
 438
 439 CGroupControllerMask unit_get_members_mask(Unit *u) {
 440         assert(u);
 441
 442         if (u->cgroup_members_mask_valid)
 443                 return u->cgroup_members_mask;
 444
 445         u->cgroup_members_mask = 0;
 446
 447         if (u->type == UNIT_SLICE) {
 448                 Unit *member;
 449                 Iterator i;
 450
 451                 SET_FOREACH(member, u->dependencies[UNIT_BEFORE], i) {
 452
 453                         if (member == u)
 454                                 continue;
 455
 456                         if (UNIT_DEREF(member->slice) != u)
 457                                 continue;
 458
 459                         u->cgroup_members_mask |=
 460                                 unit_get_cgroup_mask(member) |
 461                                 unit_get_members_mask(member);
 462                 }
 463         }
 464
 465         u->cgroup_members_mask_valid = true;
 466         return u->cgroup_members_mask;
 467 }
 468
 469 CGroupControllerMask unit_get_siblings_mask(Unit *u) {
 470         CGroupControllerMask m;
 471
 472         assert(u);
 473
 474         if (UNIT_ISSET(u->slice))
 475                 m = unit_get_members_mask(UNIT_DEREF(u->slice));
 476         else
 477                 m = unit_get_cgroup_mask(u) | unit_get_members_mask(u);
 478
 479         /* Sibling propagation is only relevant for weight-based
 480          * controllers, so let's mask out everything else */
 481         return m & (CGROUP_CPU|CGROUP_BLKIO|CGROUP_CPUACCT);
 482 }
 483
 484 CGroupControllerMask unit_get_target_mask(Unit *u) {
 485         CGroupControllerMask mask;
 486
 487         mask = unit_get_cgroup_mask(u) | unit_get_members_mask(u) | unit_get_siblings_mask(u);
 488         mask &= u->manager->cgroup_supported;
 489
 490         return mask;
 491 }
 492
 493 /* Recurse from a unit up through its containing slices, propagating
 494  * mask bits upward. A unit is also member of itself. */
 495 void unit_update_cgroup_members_masks(Unit *u) {
 496         CGroupControllerMask m;
 497         bool more;
 498
 499         assert(u);
 500
 501         /* Calculate subtree mask */
 502         m = unit_get_cgroup_mask(u) | unit_get_members_mask(u);
 503
 504         /* See if anything changed from the previous invocation. If
 505          * not, we're done. */
 506         if (u->cgroup_subtree_mask_valid && m == u->cgroup_subtree_mask)
 507                 return;
 508
 509         more =
 510                 u->cgroup_subtree_mask_valid &&
 511                 ((m & ~u->cgroup_subtree_mask) != 0) &&
 512                 ((~m & u->cgroup_subtree_mask) == 0);
 513
 514         u->cgroup_subtree_mask = m;
 515         u->cgroup_subtree_mask_valid = true;
 516
 517         if (UNIT_ISSET(u->slice)) {
 518                 Unit *s = UNIT_DEREF(u->slice);
 519
 520                 if (more)
 521                         /* There's more set now than before. We
 522                          * propagate the new mask to the parent's mask
 523                          * (not caring if it actually was valid or
 524                          * not). */
 525
 526                         s->cgroup_members_mask |= m;
 527
 528                 else
 529                         /* There's less set now than before (or we
 530                          * don't know), we need to recalculate
 531                          * everything, so let's invalidate the
 532                          * parent's members mask */
 533
 534                         s->cgroup_members_mask_valid = false;
 535
 536                 /* And now make sure that this change also hits our
 537                  * grandparents */
 538                 unit_update_cgroup_members_masks(s);
 539         }
 540 }
 541
 542 static const char *migrate_callback(CGroupControllerMask mask, void *userdata) {
 543         Unit *u = userdata;
 544
 545         assert(mask != 0);
 546         assert(u);
 547
 548         while (u) {
 549                 if (u->cgroup_path &&
 550                     u->cgroup_realized &&
 551                     (u->cgroup_realized_mask & mask) == mask)
 552                         return u->cgroup_path;
 553
 554                 u = UNIT_DEREF(u->slice);
 555         }
 556
 557         return NULL;
 558 }
 559
 560 static int unit_create_cgroups(Unit *u, CGroupControllerMask mask) {
 561         _cleanup_free_ char *path = NULL;
 562         int r;
 563
 564         assert(u);
 565
 566         path = unit_default_cgroup_path(u);
 567         if (!path)
 568                 return log_oom();
 569
 570         r = hashmap_put(u->manager->cgroup_unit, path, u);
 571         if (r < 0) {
 572                 log_error(r == -EEXIST ? "cgroup %s exists already: %s" : "hashmap_put failed for %s: %s", path, strerror(-r));
 573                 return r;
 574         }
 575         if (r > 0) {
 576                 u->cgroup_path = path;
 577                 path = NULL;
 578         }
 579
 580         /* First, create our own group */
 581         r = cg_create_everywhere(u->manager->cgroup_supported, mask, u->cgroup_path);
 582         if (r < 0) {
 583                 log_error("Failed to create cgroup %s: %s", u->cgroup_path, strerror(-r));
 584                 return r;
 585         }
 586
 587         /* Keep track that this is now realized */
 588         u->cgroup_realized = true;
 589         u->cgroup_realized_mask = mask;
 590
 591         /* Then, possibly move things over */
 592         r = cg_migrate_everywhere(u->manager->cgroup_supported, u->cgroup_path, u->cgroup_path, migrate_callback, u);
 593         if (r < 0)
 594                 log_warning("Failed to migrate cgroup from to %s: %s", u->cgroup_path, strerror(-r));
 595
 596         return 0;
 597 }
 598
 599 static bool unit_has_mask_realized(Unit *u, CGroupControllerMask mask) {
 600         assert(u);
 601
 602         return u->cgroup_realized && u->cgroup_realized_mask == mask;
 603 }
 604
 605 /* Check if necessary controllers and attributes for a unit are in place.
 606  *
 607  * If so, do nothing.
 608  * If not, create paths, move processes over, and set attributes.
 609  *
 610  * Returns 0 on success and < 0 on failure. */
 611 static int unit_realize_cgroup_now(Unit *u) {
 612         CGroupControllerMask mask;
 613         int r;
 614
 615         assert(u);
 616
 617         if (u->in_cgroup_queue) {
 618                 LIST_REMOVE(cgroup_queue, u->manager->cgroup_queue, u);
 619                 u->in_cgroup_queue = false;
 620         }
 621
 622         mask = unit_get_target_mask(u);
 623
 624         if (unit_has_mask_realized(u, mask))
 625                 return 0;
 626
 627         /* First, realize parents */
 628         if (UNIT_ISSET(u->slice)) {
 629                 r = unit_realize_cgroup_now(UNIT_DEREF(u->slice));
 630                 if (r < 0)
 631                         return r;
 632         }
 633
 634         /* And then do the real work */
 635         r = unit_create_cgroups(u, mask);
 636         if (r < 0)
 637                 return r;
 638
 639         /* Finally, apply the necessary attributes. */
 640         cgroup_context_apply(unit_get_cgroup_context(u), mask, u->cgroup_path);
 641
 642         return 0;
 643 }
 644
 645 static void unit_add_to_cgroup_queue(Unit *u) {
 646
 647         if (u->in_cgroup_queue)
 648                 return;
 649
 650         LIST_PREPEND(cgroup_queue, u->manager->cgroup_queue, u);
 651         u->in_cgroup_queue = true;
 652 }
 653
 654 unsigned manager_dispatch_cgroup_queue(Manager *m) {
 655         Unit *i;
 656         unsigned n = 0;
 657         int r;
 658
 659         while ((i = m->cgroup_queue)) {
 660                 assert(i->in_cgroup_queue);
 661
 662                 r = unit_realize_cgroup_now(i);
 663                 if (r < 0)
 664                         log_warning("Failed to realize cgroups for queued unit %s: %s", i->id, strerror(-r));
 665
 666                 n++;
 667         }
 668
 669         return n;
 670 }
 671
 672 static void unit_queue_siblings(Unit *u) {
 673         Unit *slice;
 674
 675         /* This adds the siblings of the specified unit and the
 676          * siblings of all parent units to the cgroup queue. (But
 677          * neither the specified unit itself nor the parents.) */
 678
 679         while ((slice = UNIT_DEREF(u->slice))) {
 680                 Iterator i;
 681                 Unit *m;
 682
 683                 SET_FOREACH(m, slice->dependencies[UNIT_BEFORE], i) {
 684                         if (m == u)
 685                                 continue;
 686
 687                         /* Skip units that have a dependency on the slice
 688                          * but aren't actually in it. */
 689                         if (UNIT_DEREF(m->slice) != slice)
 690                                 continue;
 691
 692                         /* No point in doing cgroup application for units
 693                          * without active processes. */
 694                         if (UNIT_IS_INACTIVE_OR_FAILED(unit_active_state(m)))
 695                                 continue;
 696
 697                         /* If the unit doesn't need any new controllers
 698                          * and has current ones realized, it doesn't need
 699                          * any changes. */
 700                         if (unit_has_mask_realized(m, unit_get_target_mask(m)))
 701                                 continue;
 702
 703                         unit_add_to_cgroup_queue(m);
 704                 }
 705
 706                 u = slice;
 707         }
 708 }
 709
 710 int unit_realize_cgroup(Unit *u) {
 711         CGroupContext *c;
 712
 713         assert(u);
 714
 715         c = unit_get_cgroup_context(u);
 716         if (!c)
 717                 return 0;
 718
 719         /* So, here's the deal: when realizing the cgroups for this
 720          * unit, we need to first create all parents, but there's more
 721          * actually: for the weight-based controllers we also need to
 722          * make sure that all our siblings (i.e. units that are in the
 723          * same slice as we are) have cgroups, too. Otherwise, things
 724          * would become very uneven as each of their processes would
 725          * get as much resources as all our group together. This call
 726          * will synchronously create the parent cgroups, but will
 727          * defer work on the siblings to the next event loop
 728          * iteration. */
 729
 730         /* Add all sibling slices to the cgroup queue. */
 731         unit_queue_siblings(u);
 732
 733         /* And realize this one now (and apply the values) */
 734         return unit_realize_cgroup_now(u);
 735 }
 736
 737 void unit_destroy_cgroup(Unit *u) {
 738         int r;
 739
 740         assert(u);
 741
 742         if (!u->cgroup_path)
 743                 return;
 744
 745         r = cg_trim_everywhere(u->manager->cgroup_supported, u->cgroup_path, !unit_has_name(u, SPECIAL_ROOT_SLICE));
 746         if (r < 0)
 747                 log_debug("Failed to destroy cgroup %s: %s", u->cgroup_path, strerror(-r));
 748
 749         hashmap_remove(u->manager->cgroup_unit, u->cgroup_path);
 750
 751         free(u->cgroup_path);
 752         u->cgroup_path = NULL;
 753         u->cgroup_realized = false;
 754         u->cgroup_realized_mask = 0;
 755
 756 }
 757
 758 pid_t unit_search_main_pid(Unit *u) {
 759         _cleanup_fclose_ FILE *f = NULL;
 760         pid_t pid = 0, npid, mypid;
 761
 762         assert(u);
 763
 764         if (!u->cgroup_path)
 765                 return 0;
 766
 767         if (cg_enumerate_processes(SYSTEMD_CGROUP_CONTROLLER, u->cgroup_path, &f) < 0)
 768                 return 0;
 769
 770         mypid = getpid();
 771         while (cg_read_pid(f, &npid) > 0)  {
 772                 pid_t ppid;
 773
 774                 if (npid == pid)
 775                         continue;
 776
 777                 /* Ignore processes that aren't our kids */
 778                 if (get_parent_of_pid(npid, &ppid) >= 0 && ppid != mypid)
 779                         continue;
 780
 781                 if (pid != 0) {
 782                         /* Dang, there's more than one daemonized PID
 783                         in this group, so we don't know what process
 784                         is the main process. */
 785                         pid = 0;
 786                         break;
 787                 }
 788
 789                 pid = npid;
 790         }
 791
 792         return pid;
 793 }
 794
 795 int manager_setup_cgroup(Manager *m) {
 796         _cleanup_free_ char *path = NULL;
 797         char *e;
 798         int r;
 799
 800         assert(m);
 801
 802         /* 0. Be nice to Ingo Molnar #628004 */
 803         if (path_is_mount_point("/sys/fs/cgroup/systemd", false) <= 0) {
 804                 log_warning("No control group support available, not creating root group.");
 805                 return 0;
 806         }
 807
 808         /* 1. Determine hierarchy */
 809         free(m->cgroup_root);
 810         m->cgroup_root = NULL;
 811
 812         r = cg_pid_get_path(SYSTEMD_CGROUP_CONTROLLER, 0, &m->cgroup_root);
 813         if (r < 0) {
 814                 log_error("Cannot determine cgroup we are running in: %s", strerror(-r));
 815                 return r;
 816         }
 817
 818         /* LEGACY: Already in /system.slice? If so, let's cut this
 819          * off. This is to support live upgrades from older systemd
 820          * versions where PID 1 was moved there. */
 821         if (m->running_as == SYSTEMD_SYSTEM) {
 822                 e = endswith(m->cgroup_root, "/" SPECIAL_SYSTEM_SLICE);
 823                 if (!e)
 824                         e = endswith(m->cgroup_root, "/system");
 825                 if (e)
 826                         *e = 0;
 827         }
 828
 829         /* And make sure to store away the root value without trailing
 830          * slash, even for the root dir, so that we can easily prepend
 831          * it everywhere. */
 832         if (streq(m->cgroup_root, "/"))
 833                 m->cgroup_root[0] = 0;
 834
 835         /* 2. Show data */
 836         r = cg_get_path(SYSTEMD_CGROUP_CONTROLLER, m->cgroup_root, NULL, &path);
 837         if (r < 0) {
 838                 log_error("Cannot find cgroup mount point: %s", strerror(-r));
 839                 return r;
 840         }
 841
 842         log_debug("Using cgroup controller " SYSTEMD_CGROUP_CONTROLLER ". File system hierarchy is at %s.", path);
 843
 844         /* 3. Install agent */
 845         if (m->running_as == SYSTEMD_SYSTEM) {
 846                 r = cg_install_release_agent(SYSTEMD_CGROUP_CONTROLLER, SYSTEMD_CGROUP_AGENT_PATH);
 847                 if (r < 0)
 848                         log_warning("Failed to install release agent, ignoring: %s", strerror(-r));
 849                 else if (r > 0)
 850                         log_debug("Installed release agent.");
 851                 else
 852                         log_debug("Release agent already installed.");
 853         }
 854
 855         /* 4. Make sure we are in the root cgroup */
 856         r = cg_create_and_attach(SYSTEMD_CGROUP_CONTROLLER, m->cgroup_root, 0);
 857         if (r < 0) {
 858                 log_error("Failed to create root cgroup hierarchy: %s", strerror(-r));
 859                 return r;
 860         }
 861
 862         /* 5. And pin it, so that it cannot be unmounted */
 863         if (m->pin_cgroupfs_fd >= 0)
 864                 close_nointr_nofail(m->pin_cgroupfs_fd);
 865
 866         m->pin_cgroupfs_fd = open(path, O_RDONLY|O_CLOEXEC|O_DIRECTORY|O_NOCTTY|O_NONBLOCK);
 867         if (r < 0) {
 868                 log_error("Failed to open pin file: %m");
 869                 return -errno;
 870         }
 871
 872         /* 6. Figure out which controllers are supported */
 873         m->cgroup_supported = cg_mask_supported();
 874
 875         /* 7.  Always enable hierarchial support if it exists... */
 876         cg_set_attribute("memory", "/", "memory.use_hierarchy", "1");
 877
 878         return 0;
 879 }
 880
 881 void manager_shutdown_cgroup(Manager *m, bool delete) {
 882         assert(m);
 883
 884         /* We can't really delete the group, since we are in it. But
 885          * let's trim it. */
 886         if (delete && m->cgroup_root)
 887                 cg_trim(SYSTEMD_CGROUP_CONTROLLER, m->cgroup_root, false);
 888
 889         if (m->pin_cgroupfs_fd >= 0) {
 890                 close_nointr_nofail(m->pin_cgroupfs_fd);
 891                 m->pin_cgroupfs_fd = -1;
 892         }
 893
 894         free(m->cgroup_root);
 895         m->cgroup_root = NULL;
 896 }
 897
 898 Unit* manager_get_unit_by_cgroup(Manager *m, const char *cgroup) {
 899         char *p;
 900         Unit *u;
 901
 902         assert(m);
 903         assert(cgroup);
 904
 905         u = hashmap_get(m->cgroup_unit, cgroup);
 906         if (u)
 907                 return u;
 908
 909         p = strdupa(cgroup);
 910         for (;;) {
 911                 char *e;
 912
 913                 e = strrchr(p, '/');
 914                 if (e == p || !e)
 915                         return NULL;
 916
 917                 *e = 0;
 918
 919                 u = hashmap_get(m->cgroup_unit, p);
 920                 if (u)
 921                         return u;
 922         }
 923 }
 924
 925 Unit *manager_get_unit_by_pid(Manager *m, pid_t pid) {
 926         _cleanup_free_ char *cgroup = NULL;
 927         int r;
 928
 929         assert(m);
 930
 931         if (pid <= 1)
 932                 return NULL;
 933
 934         r = cg_pid_get_path(SYSTEMD_CGROUP_CONTROLLER, pid, &cgroup);
 935         if (r < 0)
 936                 return NULL;
 937
 938         return manager_get_unit_by_cgroup(m, cgroup);
 939 }
 940
 941 int manager_notify_cgroup_empty(Manager *m, const char *cgroup) {
 942         Unit *u;
 943         int r;
 944
 945         assert(m);
 946         assert(cgroup);
 947
 948         u = manager_get_unit_by_cgroup(m, cgroup);
 949         if (u) {
 950                 r = cg_is_empty_recursive(SYSTEMD_CGROUP_CONTROLLER, u->cgroup_path, true);
 951                 if (r > 0) {
 952                         if (UNIT_VTABLE(u)->notify_cgroup_empty)
 953                                 UNIT_VTABLE(u)->notify_cgroup_empty(u);
 954
 955                         unit_add_to_gc_queue(u);
 956                 }
 957         }
 958
 959         return 0;
 960 }
 961
/* String names of the CGroupDevicePolicy values, indexed by the enum
 * value. */
static const char* const cgroup_device_policy_table[_CGROUP_DEVICE_POLICY_MAX] = {
        [CGROUP_AUTO] = "auto",
        [CGROUP_CLOSED] = "closed",
        [CGROUP_STRICT] = "strict",
};

/* NOTE(review): the macro is defined elsewhere; presumably it generates
 * cgroup_device_policy_to_string() (used by cgroup_context_dump()) and
 * the matching from-string lookup on top of the table above. */
DEFINE_STRING_TABLE_LOOKUP(cgroup_device_policy, CGroupDevicePolicy);