src/core/cgroup.c

   1 /*-*- Mode: C; c-basic-offset: 8; indent-tabs-mode: nil -*-*/
   2
   3 /***
   4   This file is part of systemd.
   5
   6   Copyright 2013 Lennart Poettering
   7
   8   systemd is free software; you can redistribute it and/or modify it
   9   under the terms of the GNU Lesser General Public License as published by
  10   the Free Software Foundation; either version 2.1 of the License, or
  11   (at your option) any later version.
  12
  13   systemd is distributed in the hope that it will be useful, but
  14   WITHOUT ANY WARRANTY; without even the implied warranty of
  15   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
  16   Lesser General Public License for more details.
  17
  18   You should have received a copy of the GNU Lesser General Public License
  19   along with systemd; If not, see <http://www.gnu.org/licenses/>.
  20 ***/
  21
  22 #include <fcntl.h>
  23
  24 #include "path-util.h"
  25 #include "special.h"
  26 #include "cgroup-util.h"
  27 #include "cgroup.h"
  28
  29 void cgroup_context_init(CGroupContext *c) {
  30         assert(c);
  31
  32         /* Initialize everything to the kernel defaults, assuming the
  33          * structure is preinitialized to 0 */
  34
  35         c->cpu_shares = 1024;
  36         c->memory_limit = (uint64_t) -1;
  37         c->blockio_weight = 1000;
  38 }
  39
  40 void cgroup_context_free_device_allow(CGroupContext *c, CGroupDeviceAllow *a) {
  41         assert(c);
  42         assert(a);
  43
  44         LIST_REMOVE(device_allow, c->device_allow, a);
  45         free(a->path);
  46         free(a);
  47 }
  48
  49 void cgroup_context_free_blockio_device_weight(CGroupContext *c, CGroupBlockIODeviceWeight *w) {
  50         assert(c);
  51         assert(w);
  52
  53         LIST_REMOVE(device_weights, c->blockio_device_weights, w);
  54         free(w->path);
  55         free(w);
  56 }
  57
  58 void cgroup_context_free_blockio_device_bandwidth(CGroupContext *c, CGroupBlockIODeviceBandwidth *b) {
  59         assert(c);
  60         assert(b);
  61
  62         LIST_REMOVE(device_bandwidths, c->blockio_device_bandwidths, b);
  63         free(b->path);
  64         free(b);
  65 }
  66
  67 void cgroup_context_done(CGroupContext *c) {
  68         assert(c);
  69
  70         while (c->blockio_device_weights)
  71                 cgroup_context_free_blockio_device_weight(c, c->blockio_device_weights);
  72
  73         while (c->blockio_device_bandwidths)
  74                 cgroup_context_free_blockio_device_bandwidth(c, c->blockio_device_bandwidths);
  75
  76         while (c->device_allow)
  77                 cgroup_context_free_device_allow(c, c->device_allow);
  78 }
  79
  80 void cgroup_context_dump(CGroupContext *c, FILE* f, const char *prefix) {
  81         CGroupBlockIODeviceBandwidth *b;
  82         CGroupBlockIODeviceWeight *w;
  83         CGroupDeviceAllow *a;
  84
  85         assert(c);
  86         assert(f);
  87
  88         prefix = strempty(prefix);
  89
  90         fprintf(f,
  91                 "%sCPUAccounting=%s\n"
  92                 "%sBlockIOAccounting=%s\n"
  93                 "%sMemoryAccounting=%s\n"
  94                 "%sCPUShares=%lu\n"
  95                 "%sBlockIOWeight=%lu\n"
  96                 "%sMemoryLimit=%" PRIu64 "\n"
  97                 "%sDevicePolicy=%s\n",
  98                 prefix, yes_no(c->cpu_accounting),
  99                 prefix, yes_no(c->blockio_accounting),
 100                 prefix, yes_no(c->memory_accounting),
 101                 prefix, c->cpu_shares,
 102                 prefix, c->blockio_weight,
 103                 prefix, c->memory_limit,
 104                 prefix, cgroup_device_policy_to_string(c->device_policy));
 105
 106         LIST_FOREACH(device_allow, a, c->device_allow)
 107                 fprintf(f,
 108                         "%sDeviceAllow=%s %s%s%s\n",
 109                         prefix,
 110                         a->path,
 111                         a->r ? "r" : "", a->w ? "w" : "", a->m ? "m" : "");
 112
 113         LIST_FOREACH(device_weights, w, c->blockio_device_weights)
 114                 fprintf(f,
 115                         "%sBlockIODeviceWeight=%s %lu",
 116                         prefix,
 117                         w->path,
 118                         w->weight);
 119
 120         LIST_FOREACH(device_bandwidths, b, c->blockio_device_bandwidths) {
 121                 char buf[FORMAT_BYTES_MAX];
 122
 123                 fprintf(f,
 124                         "%s%s=%s %s\n",
 125                         prefix,
 126                         b->read ? "BlockIOReadBandwidth" : "BlockIOWriteBandwidth",
 127                         b->path,
 128                         format_bytes(buf, sizeof(buf), b->bandwidth));
 129         }
 130 }
 131
 132 static int lookup_blkio_device(const char *p, dev_t *dev) {
 133         struct stat st;
 134         int r;
 135
 136         assert(p);
 137         assert(dev);
 138
 139         r = stat(p, &st);
 140         if (r < 0) {
 141                 log_warning("Couldn't stat device %s: %m", p);
 142                 return -errno;
 143         }
 144
 145         if (S_ISBLK(st.st_mode))
 146                 *dev = st.st_rdev;
 147         else if (major(st.st_dev) != 0) {
 148                 /* If this is not a device node then find the block
 149                  * device this file is stored on */
 150                 *dev = st.st_dev;
 151
 152                 /* If this is a partition, try to get the originating
 153                  * block device */
 154                 block_get_whole_disk(*dev, dev);
 155         } else {
 156                 log_warning("%s is not a block device and file system block device cannot be determined or is not local.", p);
 157                 return -ENODEV;
 158         }
 159
 160         return 0;
 161 }
 162
 163 static int whitelist_device(const char *path, const char *node, const char *acc) {
 164         char buf[2+DECIMAL_STR_MAX(dev_t)*2+2+4];
 165         struct stat st;
 166         int r;
 167
 168         assert(path);
 169         assert(acc);
 170
 171         if (stat(node, &st) < 0) {
 172                 log_warning("Couldn't stat device %s", node);
 173                 return -errno;
 174         }
 175
 176         if (!S_ISCHR(st.st_mode) && !S_ISBLK(st.st_mode)) {
 177                 log_warning("%s is not a device.", node);
 178                 return -ENODEV;
 179         }
 180
 181         sprintf(buf,
 182                 "%c %u:%u %s",
 183                 S_ISCHR(st.st_mode) ? 'c' : 'b',
 184                 major(st.st_rdev), minor(st.st_rdev),
 185                 acc);
 186
 187         r = cg_set_attribute("devices", path, "devices.allow", buf);
 188         if (r < 0)
 189                 log_warning("Failed to set devices.allow on %s: %s", path, strerror(-r));
 190
 191         return r;
 192 }
 193
 194 void cgroup_context_apply(CGroupContext *c, CGroupControllerMask mask, const char *path) {
 195         int r;
 196
 197         assert(c);
 198         assert(path);
 199
 200         if (mask == 0)
 201                 return;
 202
 203         if (mask & CGROUP_CPU) {
 204                 char buf[DECIMAL_STR_MAX(unsigned long) + 1];
 205
 206                 sprintf(buf, "%lu\n", c->cpu_shares);
 207                 r = cg_set_attribute("cpu", path, "cpu.shares", buf);
 208                 if (r < 0)
 209                         log_warning("Failed to set cpu.shares on %s: %s", path, strerror(-r));
 210         }
 211
 212         if (mask & CGROUP_BLKIO) {
 213                 char buf[MAX3(DECIMAL_STR_MAX(unsigned long)+1,
 214                               DECIMAL_STR_MAX(dev_t)*2+2+DECIMAL_STR_MAX(unsigned long)*1,
 215                               DECIMAL_STR_MAX(dev_t)*2+2+DECIMAL_STR_MAX(uint64_t)+1)];
 216                 CGroupBlockIODeviceWeight *w;
 217                 CGroupBlockIODeviceBandwidth *b;
 218
 219                 sprintf(buf, "%lu\n", c->blockio_weight);
 220                 r = cg_set_attribute("blkio", path, "blkio.weight", buf);
 221                 if (r < 0)
 222                         log_warning("Failed to set blkio.weight on %s: %s", path, strerror(-r));
 223
 224                 /* FIXME: no way to reset this list */
 225                 LIST_FOREACH(device_weights, w, c->blockio_device_weights) {
 226                         dev_t dev;
 227
 228                         r = lookup_blkio_device(w->path, &dev);
 229                         if (r < 0)
 230                                 continue;
 231
 232                         sprintf(buf, "%u:%u %lu", major(dev), minor(dev), w->weight);
 233                         r = cg_set_attribute("blkio", path, "blkio.weight_device", buf);
 234                         if (r < 0)
 235                                 log_error("Failed to set blkio.weight_device on %s: %s", path, strerror(-r));
 236                 }
 237
 238                 /* FIXME: no way to reset this list */
 239                 LIST_FOREACH(device_bandwidths, b, c->blockio_device_bandwidths) {
 240                         const char *a;
 241                         dev_t dev;
 242
 243                         r = lookup_blkio_device(b->path, &dev);
 244                         if (r < 0)
 245                                 continue;
 246
 247                         a = b->read ? "blkio.throttle.read_bps_device" : "blkio.throttle.write_bps_device";
 248
 249                         sprintf(buf, "%u:%u %" PRIu64 "\n", major(dev), minor(dev), b->bandwidth);
 250                         r = cg_set_attribute("blkio", path, a, buf);
 251                         if (r < 0)
 252                                 log_error("Failed to set %s on %s: %s", a, path, strerror(-r));
 253                 }
 254         }
 255
 256         if (mask & CGROUP_MEMORY) {
 257                 if (c->memory_limit != (uint64_t) -1) {
 258                         char buf[DECIMAL_STR_MAX(uint64_t) + 1];
 259
 260                         sprintf(buf, "%" PRIu64 "\n", c->memory_limit);
 261                         r = cg_set_attribute("memory", path, "memory.limit_in_bytes", buf);
 262                 } else
 263                         r = cg_set_attribute("memory", path, "memory.limit_in_bytes", "-1");
 264
 265                 if (r < 0)
 266                         log_error("Failed to set memory.limit_in_bytes on %s: %s", path, strerror(-r));
 267         }
 268
 269         if (mask & CGROUP_DEVICE) {
 270                 CGroupDeviceAllow *a;
 271
 272                 if (c->device_allow || c->device_policy != CGROUP_AUTO)
 273                         r = cg_set_attribute("devices", path, "devices.deny", "a");
 274                 else
 275                         r = cg_set_attribute("devices", path, "devices.allow", "a");
 276                 if (r < 0)
 277                         log_error("Failed to reset devices.list on %s: %s", path, strerror(-r));
 278
 279                 if (c->device_policy == CGROUP_CLOSED ||
 280                     (c->device_policy == CGROUP_AUTO && c->device_allow)) {
 281                         static const char auto_devices[] =
 282                                 "/dev/null\0" "rw\0"
 283                                 "/dev/zero\0" "rw\0"
 284                                 "/dev/full\0" "rw\0"
 285                                 "/dev/random\0" "rw\0"
 286                                 "/dev/urandom\0" "rw\0";
 287
 288                         const char *x, *y;
 289
 290                         NULSTR_FOREACH_PAIR(x, y, auto_devices)
 291                                 whitelist_device(path, x, y);
 292                 }
 293
 294                 LIST_FOREACH(device_allow, a, c->device_allow) {
 295                         char acc[4];
 296                         unsigned k = 0;
 297
 298                         if (a->r)
 299                                 acc[k++] = 'r';
 300                         if (a->w)
 301                                 acc[k++] = 'w';
 302                         if (a->m)
 303                                 acc[k++] = 'm';
 304
 305                         if (k == 0)
 306                                 continue;
 307
 308                         acc[k++] = 0;
 309                         whitelist_device(path, a->path, acc);
 310                 }
 311         }
 312 }
 313
 314 CGroupControllerMask cgroup_context_get_mask(CGroupContext *c) {
 315         CGroupControllerMask mask = 0;
 316
 317         /* Figure out which controllers we need */
 318
 319         if (c->cpu_accounting || c->cpu_shares != 1024)
 320                 mask |= CGROUP_CPUACCT | CGROUP_CPU;
 321
 322         if (c->blockio_accounting ||
 323             c->blockio_weight != 1000 ||
 324             c->blockio_device_weights ||
 325             c->blockio_device_bandwidths)
 326                 mask |= CGROUP_BLKIO;
 327
 328         if (c->memory_accounting ||
 329             c->memory_limit != (uint64_t) -1)
 330                 mask |= CGROUP_MEMORY;
 331
 332         if (c->device_allow || c->device_policy != CGROUP_AUTO)
 333                 mask |= CGROUP_DEVICE;
 334
 335         return mask;
 336 }
 337
 338 CGroupControllerMask unit_get_cgroup_mask(Unit *u) {
 339         CGroupContext *c;
 340
 341         c = unit_get_cgroup_context(u);
 342         if (!c)
 343                 return 0;
 344
 345         return cgroup_context_get_mask(c);
 346 }
 347
 348 CGroupControllerMask unit_get_members_mask(Unit *u) {
 349         assert(u);
 350
 351         if (u->cgroup_members_mask_valid)
 352                 return u->cgroup_members_mask;
 353
 354         u->cgroup_members_mask = 0;
 355
 356         if (u->type == UNIT_SLICE) {
 357                 Unit *member;
 358                 Iterator i;
 359
 360                 SET_FOREACH(member, u->dependencies[UNIT_BEFORE], i) {
 361
 362                         if (member == u)
 363                                 continue;
 364
 365                          if (UNIT_DEREF(member->slice) != u)
 366                                 continue;
 367
 368                         u->cgroup_members_mask |=
 369                                 unit_get_cgroup_mask(member) |
 370                                 unit_get_members_mask(member);
 371                 }
 372         }
 373
 374         u->cgroup_members_mask_valid = true;
 375         return u->cgroup_members_mask;
 376 }
 377
 378 CGroupControllerMask unit_get_siblings_mask(Unit *u) {
 379         CGroupControllerMask m;
 380
 381         assert(u);
 382
 383         if (UNIT_ISSET(u->slice))
 384                 m = unit_get_members_mask(UNIT_DEREF(u->slice));
 385         else
 386                 m = unit_get_cgroup_mask(u) | unit_get_members_mask(u);
 387
 388         /* Sibling propagation is only relevant for weight-based
 389          * controllers, so let's mask out everything else */
 390         return m & (CGROUP_CPU|CGROUP_BLKIO|CGROUP_CPUACCT);
 391 }
 392
 393 CGroupControllerMask unit_get_target_mask(Unit *u) {
 394         CGroupControllerMask mask;
 395
 396         mask = unit_get_cgroup_mask(u) | unit_get_members_mask(u) | unit_get_siblings_mask(u);
 397         mask &= u->manager->cgroup_supported;
 398
 399         return mask;
 400 }
 401
 402 /* Recurse from a unit up through its containing slices, propagating
 403  * mask bits upward. A unit is also member of itself. */
 404 void unit_update_cgroup_members_masks(Unit *u) {
 405         CGroupControllerMask m;
 406         bool more;
 407
 408         assert(u);
 409
 410         /* Calculate subtree mask */
 411         m = unit_get_cgroup_mask(u) | unit_get_members_mask(u);
 412
 413         /* See if anything changed from the previous invocation. If
 414          * not, we're done. */
 415         if (u->cgroup_subtree_mask_valid && m == u->cgroup_subtree_mask)
 416                 return;
 417
 418         more =
 419                 u->cgroup_subtree_mask_valid &&
 420                 ((m & ~u->cgroup_subtree_mask) != 0) &&
 421                 ((~m & u->cgroup_subtree_mask) == 0);
 422
 423         u->cgroup_subtree_mask = m;
 424         u->cgroup_subtree_mask_valid = true;
 425
 426         if (UNIT_ISSET(u->slice)) {
 427                 Unit *s = UNIT_DEREF(u->slice);
 428
 429                 if (more)
 430                         /* There's more set now than before. We
 431                          * propagate the new mask to the parent's mask
 432                          * (not caring if it actually was valid or
 433                          * not). */
 434
 435                         s->cgroup_members_mask |= m;
 436
 437                 else
 438                         /* There's less set now than before (or we
 439                          * don't know), we need to recalculate
 440                          * everything, so let's invalidate the
 441                          * parent's members mask */
 442
 443                         s->cgroup_members_mask_valid = false;
 444
 445                 /* And now make sure that this change also hits our
 446                  * grandparents */
 447                 unit_update_cgroup_members_masks(s);
 448         }
 449 }
 450
 451 static const char *migrate_callback(CGroupControllerMask mask, void *userdata) {
 452         Unit *u = userdata;
 453
 454         assert(mask != 0);
 455         assert(u);
 456
 457         while (u) {
 458                 if (u->cgroup_path &&
 459                     u->cgroup_realized &&
 460                     (u->cgroup_realized_mask & mask) == mask)
 461                         return u->cgroup_path;
 462
 463                 u = UNIT_DEREF(u->slice);
 464         }
 465
 466         return NULL;
 467 }
 468
 469 static int unit_create_cgroups(Unit *u, CGroupControllerMask mask) {
 470         _cleanup_free_ char *path = NULL;
 471         int r;
 472
 473         assert(u);
 474
 475         path = unit_default_cgroup_path(u);
 476         if (!path)
 477                 return log_oom();
 478
 479         r = hashmap_put(u->manager->cgroup_unit, path, u);
 480         if (r < 0) {
 481                 log_error(r == -EEXIST ? "cgroup %s exists already: %s" : "hashmap_put failed for %s: %s", path, strerror(-r));
 482                 return r;
 483         }
 484         if (r > 0) {
 485                 u->cgroup_path = path;
 486                 path = NULL;
 487         }
 488
 489         /* First, create our own group */
 490         r = cg_create_everywhere(u->manager->cgroup_supported, mask, u->cgroup_path);
 491         if (r < 0) {
 492                 log_error("Failed to create cgroup %s: %s", u->cgroup_path, strerror(-r));
 493                 return r;
 494         }
 495
 496         /* Keep track that this is now realized */
 497         u->cgroup_realized = true;
 498         u->cgroup_realized_mask = mask;
 499
 500         /* Then, possibly move things over */
 501         r = cg_migrate_everywhere(u->manager->cgroup_supported, u->cgroup_path, u->cgroup_path, migrate_callback, u);
 502         if (r < 0)
 503                 log_warning("Failed to migrate cgroup from to %s: %s", u->cgroup_path, strerror(-r));
 504
 505         return 0;
 506 }
 507
 508 static bool unit_has_mask_realized(Unit *u, CGroupControllerMask mask) {
 509         assert(u);
 510
 511         return u->cgroup_realized && u->cgroup_realized_mask == mask;
 512 }
 513
 514 /* Check if necessary controllers and attributes for a unit are in place.
 515  *
 516  * If so, do nothing.
 517  * If not, create paths, move processes over, and set attributes.
 518  *
 519  * Returns 0 on success and < 0 on failure. */
 520 static int unit_realize_cgroup_now(Unit *u) {
 521         CGroupControllerMask mask;
 522         int r;
 523
 524         assert(u);
 525
 526         if (u->in_cgroup_queue) {
 527                 LIST_REMOVE(cgroup_queue, u->manager->cgroup_queue, u);
 528                 u->in_cgroup_queue = false;
 529         }
 530
 531         mask = unit_get_target_mask(u);
 532
 533         if (unit_has_mask_realized(u, mask))
 534                 return 0;
 535
 536         /* First, realize parents */
 537         if (UNIT_ISSET(u->slice)) {
 538                 r = unit_realize_cgroup_now(UNIT_DEREF(u->slice));
 539                 if (r < 0)
 540                         return r;
 541         }
 542
 543         /* And then do the real work */
 544         r = unit_create_cgroups(u, mask);
 545         if (r < 0)
 546                 return r;
 547
 548         /* Finally, apply the necessary attributes. */
 549         cgroup_context_apply(unit_get_cgroup_context(u), mask, u->cgroup_path);
 550
 551         return 0;
 552 }
 553
 554 static void unit_add_to_cgroup_queue(Unit *u) {
 555
 556         if (u->in_cgroup_queue)
 557                 return;
 558
 559         LIST_PREPEND(cgroup_queue, u->manager->cgroup_queue, u);
 560         u->in_cgroup_queue = true;
 561 }
 562
 563 unsigned manager_dispatch_cgroup_queue(Manager *m) {
 564         Unit *i;
 565         unsigned n = 0;
 566         int r;
 567
 568         while ((i = m->cgroup_queue)) {
 569                 assert(i->in_cgroup_queue);
 570
 571                 r = unit_realize_cgroup_now(i);
 572                 if (r < 0)
 573                         log_warning("Failed to realize cgroups for queued unit %s: %s", i->id, strerror(-r));
 574
 575                 n++;
 576         }
 577
 578         return n;
 579 }
 580
 581 static void unit_queue_siblings(Unit *u) {
 582         Unit *slice;
 583
 584         /* This adds the siblings of the specified unit and the
 585          * siblings of all parent units to the cgroup queue. (But
 586          * neither the specified unit itself nor the parents.) */
 587
 588         while ((slice = UNIT_DEREF(u->slice))) {
 589                 Iterator i;
 590                 Unit *m;
 591
 592                 SET_FOREACH(m, slice->dependencies[UNIT_BEFORE], i) {
 593                         if (m == u)
 594                                 continue;
 595
 596                         /* Skip units that have a dependency on the slice
 597                          * but aren't actually in it. */
 598                         if (UNIT_DEREF(m->slice) != slice)
 599                                 continue;
 600
 601                         /* No point in doing cgroup application for units
 602                          * without active processes. */
 603                         if (UNIT_IS_INACTIVE_OR_FAILED(unit_active_state(m)))
 604                                 continue;
 605
 606                         /* If the unit doesn't need any new controllers
 607                          * and has current ones realized, it doesn't need
 608                          * any changes. */
 609                         if (unit_has_mask_realized(m, unit_get_target_mask(m)))
 610                                 continue;
 611
 612                         unit_add_to_cgroup_queue(m);
 613                 }
 614
 615                 u = slice;
 616         }
 617 }
 618
 619 int unit_realize_cgroup(Unit *u) {
 620         CGroupContext *c;
 621
 622         assert(u);
 623
 624         c = unit_get_cgroup_context(u);
 625         if (!c)
 626                 return 0;
 627
 628         /* So, here's the deal: when realizing the cgroups for this
 629          * unit, we need to first create all parents, but there's more
 630          * actually: for the weight-based controllers we also need to
 631          * make sure that all our siblings (i.e. units that are in the
 632          * same slice as we are) have cgroups, too. Otherwise things
 633          * would become very uneven as each of their processes would
 634          * get as much resources as all our group together. This call
 635          * will synchronously create the parent cgroups, but will
 636          * defer work on the siblings to the next event loop
 637          * iteration. */
 638
 639         /* Add all sibling slices to the cgroup queue. */
 640         unit_queue_siblings(u);
 641
 642         /* And realize this one now (and apply the values) */
 643         return unit_realize_cgroup_now(u);
 644 }
 645
 646 void unit_destroy_cgroup(Unit *u) {
 647         int r;
 648
 649         assert(u);
 650
 651         if (!u->cgroup_path)
 652                 return;
 653
 654         r = cg_trim_everywhere(u->manager->cgroup_supported, u->cgroup_path, !unit_has_name(u, SPECIAL_ROOT_SLICE));
 655         if (r < 0)
 656                 log_debug("Failed to destroy cgroup %s: %s", u->cgroup_path, strerror(-r));
 657
 658         hashmap_remove(u->manager->cgroup_unit, u->cgroup_path);
 659
 660         free(u->cgroup_path);
 661         u->cgroup_path = NULL;
 662         u->cgroup_realized = false;
 663         u->cgroup_realized_mask = 0;
 664
 665 }
 666
 667 pid_t unit_search_main_pid(Unit *u) {
 668         _cleanup_fclose_ FILE *f = NULL;
 669         pid_t pid = 0, npid, mypid;
 670
 671         assert(u);
 672
 673         if (!u->cgroup_path)
 674                 return 0;
 675
 676         if (cg_enumerate_processes(SYSTEMD_CGROUP_CONTROLLER, u->cgroup_path, &f) < 0)
 677                 return 0;
 678
 679         mypid = getpid();
 680         while (cg_read_pid(f, &npid) > 0)  {
 681                 pid_t ppid;
 682
 683                 if (npid == pid)
 684                         continue;
 685
 686                 /* Ignore processes that aren't our kids */
 687                 if (get_parent_of_pid(npid, &ppid) >= 0 && ppid != mypid)
 688                         continue;
 689
 690                 if (pid != 0) {
 691                         /* Dang, there's more than one daemonized PID
 692                         in this group, so we don't know what process
 693                         is the main process. */
 694                         pid = 0;
 695                         break;
 696                 }
 697
 698                 pid = npid;
 699         }
 700
 701         return pid;
 702 }
 703
 704 int manager_setup_cgroup(Manager *m) {
 705         _cleanup_free_ char *path = NULL;
 706         char *e;
 707         int r;
 708
 709         assert(m);
 710
 711         /* 0. Be nice to Ingo Molnar #628004 */
 712         if (path_is_mount_point("/sys/fs/cgroup/systemd", false) <= 0) {
 713                 log_warning("No control group support available, not creating root group.");
 714                 return 0;
 715         }
 716
 717         /* 1. Determine hierarchy */
 718         free(m->cgroup_root);
 719         m->cgroup_root = NULL;
 720
 721         r = cg_pid_get_path(SYSTEMD_CGROUP_CONTROLLER, 0, &m->cgroup_root);
 722         if (r < 0) {
 723                 log_error("Cannot determine cgroup we are running in: %s", strerror(-r));
 724                 return r;
 725         }
 726
 727         /* LEGACY: Already in /system.slice? If so, let's cut this
 728          * off. This is to support live upgrades from older systemd
 729          * versions where PID 1 was moved there. */
 730         if (m->running_as == SYSTEMD_SYSTEM) {
 731                 e = endswith(m->cgroup_root, "/" SPECIAL_SYSTEM_SLICE);
 732                 if (!e)
 733                         e = endswith(m->cgroup_root, "/system");
 734                 if (e)
 735                         *e = 0;
 736         }
 737
 738         /* And make sure to store away the root value without trailing
 739          * slash, even for the root dir, so that we can easily prepend
 740          * it everywhere. */
 741         if (streq(m->cgroup_root, "/"))
 742                 m->cgroup_root[0] = 0;
 743
 744         /* 2. Show data */
 745         r = cg_get_path(SYSTEMD_CGROUP_CONTROLLER, m->cgroup_root, NULL, &path);
 746         if (r < 0) {
 747                 log_error("Cannot find cgroup mount point: %s", strerror(-r));
 748                 return r;
 749         }
 750
 751         log_debug("Using cgroup controller " SYSTEMD_CGROUP_CONTROLLER ". File system hierarchy is at %s.", path);
 752
 753         /* 3. Install agent */
 754         if (m->running_as == SYSTEMD_SYSTEM) {
 755                 r = cg_install_release_agent(SYSTEMD_CGROUP_CONTROLLER, SYSTEMD_CGROUP_AGENT_PATH);
 756                 if (r < 0)
 757                         log_warning("Failed to install release agent, ignoring: %s", strerror(-r));
 758                 else if (r > 0)
 759                         log_debug("Installed release agent.");
 760                 else
 761                         log_debug("Release agent already installed.");
 762         }
 763
 764         /* 4. Make sure we are in the root cgroup */
 765         r = cg_create_and_attach(SYSTEMD_CGROUP_CONTROLLER, m->cgroup_root, 0);
 766         if (r < 0) {
 767                 log_error("Failed to create root cgroup hierarchy: %s", strerror(-r));
 768                 return r;
 769         }
 770
 771         /* 5. And pin it, so that it cannot be unmounted */
 772         if (m->pin_cgroupfs_fd >= 0)
 773                 close_nointr_nofail(m->pin_cgroupfs_fd);
 774
 775         m->pin_cgroupfs_fd = open(path, O_RDONLY|O_CLOEXEC|O_DIRECTORY|O_NOCTTY|O_NONBLOCK);
 776         if (r < 0) {
 777                 log_error("Failed to open pin file: %m");
 778                 return -errno;
 779         }
 780
 781         /* 6. Figure out which controllers are supported */
 782         m->cgroup_supported = cg_mask_supported();
 783
 784         /* 7.  Always enable hierarchial support if it exists... */
 785         cg_set_attribute("memory", "/", "memory.use_hierarchy", "1");
 786
 787         return 0;
 788 }
 789
 790 void manager_shutdown_cgroup(Manager *m, bool delete) {
 791         assert(m);
 792
 793         /* We can't really delete the group, since we are in it. But
 794          * let's trim it. */
 795         if (delete && m->cgroup_root)
 796                 cg_trim(SYSTEMD_CGROUP_CONTROLLER, m->cgroup_root, false);
 797
 798         if (m->pin_cgroupfs_fd >= 0) {
 799                 close_nointr_nofail(m->pin_cgroupfs_fd);
 800                 m->pin_cgroupfs_fd = -1;
 801         }
 802
 803         free(m->cgroup_root);
 804         m->cgroup_root = NULL;
 805 }
 806
 807 Unit* manager_get_unit_by_cgroup(Manager *m, const char *cgroup) {
 808         char *p;
 809         Unit *u;
 810
 811         assert(m);
 812         assert(cgroup);
 813
 814         u = hashmap_get(m->cgroup_unit, cgroup);
 815         if (u)
 816                 return u;
 817
 818         p = strdupa(cgroup);
 819         for (;;) {
 820                 char *e;
 821
 822                 e = strrchr(p, '/');
 823                 if (e == p || !e)
 824                         return NULL;
 825
 826                 *e = 0;
 827
 828                 u = hashmap_get(m->cgroup_unit, p);
 829                 if (u)
 830                         return u;
 831         }
 832 }
 833
 834 Unit *manager_get_unit_by_pid(Manager *m, pid_t pid) {
 835         _cleanup_free_ char *cgroup = NULL;
 836         int r;
 837
 838         assert(m);
 839
 840         if (pid <= 1)
 841                 return NULL;
 842
 843         r = cg_pid_get_path(SYSTEMD_CGROUP_CONTROLLER, pid, &cgroup);
 844         if (r < 0)
 845                 return NULL;
 846
 847         return manager_get_unit_by_cgroup(m, cgroup);
 848 }
 849
 850 int manager_notify_cgroup_empty(Manager *m, const char *cgroup) {
 851         Unit *u;
 852         int r;
 853
 854         assert(m);
 855         assert(cgroup);
 856
 857         u = manager_get_unit_by_cgroup(m, cgroup);
 858         if (u) {
 859                 r = cg_is_empty_recursive(SYSTEMD_CGROUP_CONTROLLER, u->cgroup_path, true);
 860                 if (r > 0) {
 861                         if (UNIT_VTABLE(u)->notify_cgroup_empty)
 862                                 UNIT_VTABLE(u)->notify_cgroup_empty(u);
 863
 864                         unit_add_to_gc_queue(u);
 865                 }
 866         }
 867
 868         return 0;
 869 }
 870
 871 static const char* const cgroup_device_policy_table[_CGROUP_DEVICE_POLICY_MAX] = {
 872         [CGROUP_AUTO] = "auto",
 873         [CGROUP_CLOSED] = "closed",
 874         [CGROUP_STRICT] = "strict",
 875 };
 876
 877 DEFINE_STRING_TABLE_LOOKUP(cgroup_device_policy, CGroupDevicePolicy);