src/core/cgroup.c

   1 /*-*- Mode: C; c-basic-offset: 8; indent-tabs-mode: nil -*-*/
   2
   3 /***
   4   This file is part of systemd.
   5
   6   Copyright 2013 Lennart Poettering
   7
   8   systemd is free software; you can redistribute it and/or modify it
   9   under the terms of the GNU Lesser General Public License as published by
  10   the Free Software Foundation; either version 2.1 of the License, or
  11   (at your option) any later version.
  12
  13   systemd is distributed in the hope that it will be useful, but
  14   WITHOUT ANY WARRANTY; without even the implied warranty of
  15   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
  16   Lesser General Public License for more details.
  17
  18   You should have received a copy of the GNU Lesser General Public License
  19   along with systemd; If not, see <http://www.gnu.org/licenses/>.
  20 ***/
  21
  22 #include <fcntl.h>
  23
  24 #include "path-util.h"
  25 #include "special.h"
  26 #include "cgroup-util.h"
  27 #include "cgroup.h"
  28
  29 void cgroup_context_init(CGroupContext *c) {
  30         assert(c);
  31
  32         /* Initialize everything to the kernel defaults, assuming the
  33          * structure is preinitialized to 0 */
  34
  35         c->cpu_shares = 1024;
  36         c->memory_limit = (uint64_t) -1;
  37         c->blockio_weight = 1000;
  38 }
  39
  40 void cgroup_context_free_device_allow(CGroupContext *c, CGroupDeviceAllow *a) {
  41         assert(c);
  42         assert(a);
  43
  44         LIST_REMOVE(device_allow, c->device_allow, a);
  45         free(a->path);
  46         free(a);
  47 }
  48
  49 void cgroup_context_free_blockio_device_weight(CGroupContext *c, CGroupBlockIODeviceWeight *w) {
  50         assert(c);
  51         assert(w);
  52
  53         LIST_REMOVE(device_weights, c->blockio_device_weights, w);
  54         free(w->path);
  55         free(w);
  56 }
  57
  58 void cgroup_context_free_blockio_device_bandwidth(CGroupContext *c, CGroupBlockIODeviceBandwidth *b) {
  59         assert(c);
  60         assert(b);
  61
  62         LIST_REMOVE(device_bandwidths, c->blockio_device_bandwidths, b);
  63         free(b->path);
  64         free(b);
  65 }
  66
  67 void cgroup_context_done(CGroupContext *c) {
  68         assert(c);
  69
  70         while (c->blockio_device_weights)
  71                 cgroup_context_free_blockio_device_weight(c, c->blockio_device_weights);
  72
  73         while (c->blockio_device_bandwidths)
  74                 cgroup_context_free_blockio_device_bandwidth(c, c->blockio_device_bandwidths);
  75
  76         while (c->device_allow)
  77                 cgroup_context_free_device_allow(c, c->device_allow);
  78 }
  79
  80 void cgroup_context_dump(CGroupContext *c, FILE* f, const char *prefix) {
  81         CGroupBlockIODeviceBandwidth *b;
  82         CGroupBlockIODeviceWeight *w;
  83         CGroupDeviceAllow *a;
  84
  85         assert(c);
  86         assert(f);
  87
  88         prefix = strempty(prefix);
  89
  90         fprintf(f,
  91                 "%sCPUAccounting=%s\n"
  92                 "%sBlockIOAccounting=%s\n"
  93                 "%sMemoryAccounting=%s\n"
  94                 "%sCPUShares=%lu\n"
  95                 "%sBlockIOWeight=%lu\n"
  96                 "%sMemoryLimit=%" PRIu64 "\n"
  97                 "%sDevicePolicy=%s\n",
  98                 prefix, yes_no(c->cpu_accounting),
  99                 prefix, yes_no(c->blockio_accounting),
 100                 prefix, yes_no(c->memory_accounting),
 101                 prefix, c->cpu_shares,
 102                 prefix, c->blockio_weight,
 103                 prefix, c->memory_limit,
 104                 prefix, cgroup_device_policy_to_string(c->device_policy));
 105
 106         LIST_FOREACH(device_allow, a, c->device_allow)
 107                 fprintf(f,
 108                         "%sDeviceAllow=%s %s%s%s\n",
 109                         prefix,
 110                         a->path,
 111                         a->r ? "r" : "", a->w ? "w" : "", a->m ? "m" : "");
 112
 113         LIST_FOREACH(device_weights, w, c->blockio_device_weights)
 114                 fprintf(f,
 115                         "%sBlockIODeviceWeight=%s %lu",
 116                         prefix,
 117                         w->path,
 118                         w->weight);
 119
 120         LIST_FOREACH(device_bandwidths, b, c->blockio_device_bandwidths) {
 121                 char buf[FORMAT_BYTES_MAX];
 122
 123                 fprintf(f,
 124                         "%s%s=%s %s\n",
 125                         prefix,
 126                         b->read ? "BlockIOReadBandwidth" : "BlockIOWriteBandwidth",
 127                         b->path,
 128                         format_bytes(buf, sizeof(buf), b->bandwidth));
 129         }
 130 }
 131
 132 static int lookup_blkio_device(const char *p, dev_t *dev) {
 133         struct stat st;
 134         int r;
 135
 136         assert(p);
 137         assert(dev);
 138
 139         r = stat(p, &st);
 140         if (r < 0) {
 141                 log_warning("Couldn't stat device %s: %m", p);
 142                 return -errno;
 143         }
 144
 145         if (S_ISBLK(st.st_mode))
 146                 *dev = st.st_rdev;
 147         else if (major(st.st_dev) != 0) {
 148                 /* If this is not a device node then find the block
 149                  * device this file is stored on */
 150                 *dev = st.st_dev;
 151
 152                 /* If this is a partition, try to get the originating
 153                  * block device */
 154                 block_get_whole_disk(*dev, dev);
 155         } else {
 156                 log_warning("%s is not a block device and file system block device cannot be determined or is not local.", p);
 157                 return -ENODEV;
 158         }
 159
 160         return 0;
 161 }
 162
 163 static int whitelist_device(const char *path, const char *node, const char *acc) {
 164         char buf[2+DECIMAL_STR_MAX(dev_t)*2+2+4];
 165         struct stat st;
 166         int r;
 167
 168         assert(path);
 169         assert(acc);
 170
 171         if (stat(node, &st) < 0) {
 172                 log_warning("Couldn't stat device %s", node);
 173                 return -errno;
 174         }
 175
 176         if (!S_ISCHR(st.st_mode) && !S_ISBLK(st.st_mode)) {
 177                 log_warning("%s is not a device.", node);
 178                 return -ENODEV;
 179         }
 180
 181         sprintf(buf,
 182                 "%c %u:%u %s",
 183                 S_ISCHR(st.st_mode) ? 'c' : 'b',
 184                 major(st.st_rdev), minor(st.st_rdev),
 185                 acc);
 186
 187         r = cg_set_attribute("devices", path, "devices.allow", buf);
 188         if (r < 0)
 189                 log_warning("Failed to set devices.allow on %s: %s", path, strerror(-r));
 190
 191         return r;
 192 }
 193
 194 static int whitelist_major(const char *path, const char *name, char type, const char *acc) {
 195         _cleanup_fclose_ FILE *f = NULL;
 196         char line[LINE_MAX];
 197         bool good = false;
 198         int r;
 199
 200         assert(path);
 201         assert(acc);
 202         assert(type == 'b' || type == 'c');
 203
 204         f = fopen("/proc/devices", "re");
 205         if (!f) {
 206                 log_warning("Cannot open /proc/devices to resolve %s (%c): %m", name, type);
 207                 return -errno;
 208         }
 209
 210         FOREACH_LINE(line, f, goto fail) {
 211                 char buf[2+DECIMAL_STR_MAX(unsigned)+3+4], *p, *w;
 212                 unsigned maj;
 213
 214                 truncate_nl(line);
 215
 216                 if (type == 'c' && streq(line, "Character devices:")) {
 217                         good = true;
 218                         continue;
 219                 }
 220
 221                 if (type == 'b' && streq(line, "Block devices:")) {
 222                         good = true;
 223                         continue;
 224                 }
 225
 226                 if (isempty(line)) {
 227                         good = false;
 228                         continue;
 229                 }
 230
 231                 if (!good)
 232                         continue;
 233
 234                 p = strstrip(line);
 235
 236                 w = strpbrk(p, WHITESPACE);
 237                 if (!w)
 238                         continue;
 239                 *w = 0;
 240
 241                 r = safe_atou(p, &maj);
 242                 if (r < 0)
 243                         continue;
 244                 if (maj <= 0)
 245                         continue;
 246
 247                 w++;
 248                 w += strspn(w, WHITESPACE);
 249                 if (!streq(w, name))
 250                         continue;
 251
 252                 sprintf(buf,
 253                         "%c %u:* %s",
 254                         type,
 255                         maj,
 256                         acc);
 257
 258                 r = cg_set_attribute("devices", path, "devices.allow", buf);
 259                 if (r < 0)
 260                         log_warning("Failed to set devices.allow on %s: %s", path, strerror(-r));
 261         }
 262
 263         return 0;
 264
 265 fail:
 266         log_warning("Failed to read /proc/devices: %m");
 267         return -errno;
 268 }
 269
 270 void cgroup_context_apply(CGroupContext *c, CGroupControllerMask mask, const char *path) {
 271         int r;
 272
 273         assert(c);
 274         assert(path);
 275
 276         if (mask == 0)
 277                 return;
 278
 279         if (mask & CGROUP_CPU) {
 280                 char buf[DECIMAL_STR_MAX(unsigned long) + 1];
 281
 282                 sprintf(buf, "%lu\n", c->cpu_shares);
 283                 r = cg_set_attribute("cpu", path, "cpu.shares", buf);
 284                 if (r < 0)
 285                         log_warning("Failed to set cpu.shares on %s: %s", path, strerror(-r));
 286         }
 287
 288         if (mask & CGROUP_BLKIO) {
 289                 char buf[MAX3(DECIMAL_STR_MAX(unsigned long)+1,
 290                               DECIMAL_STR_MAX(dev_t)*2+2+DECIMAL_STR_MAX(unsigned long)*1,
 291                               DECIMAL_STR_MAX(dev_t)*2+2+DECIMAL_STR_MAX(uint64_t)+1)];
 292                 CGroupBlockIODeviceWeight *w;
 293                 CGroupBlockIODeviceBandwidth *b;
 294
 295                 sprintf(buf, "%lu\n", c->blockio_weight);
 296                 r = cg_set_attribute("blkio", path, "blkio.weight", buf);
 297                 if (r < 0)
 298                         log_warning("Failed to set blkio.weight on %s: %s", path, strerror(-r));
 299
 300                 /* FIXME: no way to reset this list */
 301                 LIST_FOREACH(device_weights, w, c->blockio_device_weights) {
 302                         dev_t dev;
 303
 304                         r = lookup_blkio_device(w->path, &dev);
 305                         if (r < 0)
 306                                 continue;
 307
 308                         sprintf(buf, "%u:%u %lu", major(dev), minor(dev), w->weight);
 309                         r = cg_set_attribute("blkio", path, "blkio.weight_device", buf);
 310                         if (r < 0)
 311                                 log_error("Failed to set blkio.weight_device on %s: %s", path, strerror(-r));
 312                 }
 313
 314                 /* FIXME: no way to reset this list */
 315                 LIST_FOREACH(device_bandwidths, b, c->blockio_device_bandwidths) {
 316                         const char *a;
 317                         dev_t dev;
 318
 319                         r = lookup_blkio_device(b->path, &dev);
 320                         if (r < 0)
 321                                 continue;
 322
 323                         a = b->read ? "blkio.throttle.read_bps_device" : "blkio.throttle.write_bps_device";
 324
 325                         sprintf(buf, "%u:%u %" PRIu64 "\n", major(dev), minor(dev), b->bandwidth);
 326                         r = cg_set_attribute("blkio", path, a, buf);
 327                         if (r < 0)
 328                                 log_error("Failed to set %s on %s: %s", a, path, strerror(-r));
 329                 }
 330         }
 331
 332         if (mask & CGROUP_MEMORY) {
 333                 if (c->memory_limit != (uint64_t) -1) {
 334                         char buf[DECIMAL_STR_MAX(uint64_t) + 1];
 335
 336                         sprintf(buf, "%" PRIu64 "\n", c->memory_limit);
 337                         r = cg_set_attribute("memory", path, "memory.limit_in_bytes", buf);
 338                 } else
 339                         r = cg_set_attribute("memory", path, "memory.limit_in_bytes", "-1");
 340
 341                 if (r < 0)
 342                         log_error("Failed to set memory.limit_in_bytes on %s: %s", path, strerror(-r));
 343         }
 344
 345         if (mask & CGROUP_DEVICE) {
 346                 CGroupDeviceAllow *a;
 347
 348                 if (c->device_allow || c->device_policy != CGROUP_AUTO)
 349                         r = cg_set_attribute("devices", path, "devices.deny", "a");
 350                 else
 351                         r = cg_set_attribute("devices", path, "devices.allow", "a");
 352                 if (r < 0)
 353                         log_error("Failed to reset devices.list on %s: %s", path, strerror(-r));
 354
 355                 if (c->device_policy == CGROUP_CLOSED ||
 356                     (c->device_policy == CGROUP_AUTO && c->device_allow)) {
 357                         static const char auto_devices[] =
 358                                 "/dev/null\0" "rw\0"
 359                                 "/dev/zero\0" "rw\0"
 360                                 "/dev/full\0" "rw\0"
 361                                 "/dev/random\0" "rw\0"
 362                                 "/dev/urandom\0" "rw\0";
 363
 364                         const char *x, *y;
 365
 366                         NULSTR_FOREACH_PAIR(x, y, auto_devices)
 367                                 whitelist_device(path, x, y);
 368                 }
 369
 370                 LIST_FOREACH(device_allow, a, c->device_allow) {
 371                         char acc[4];
 372                         unsigned k = 0;
 373
 374                         if (a->r)
 375                                 acc[k++] = 'r';
 376                         if (a->w)
 377                                 acc[k++] = 'w';
 378                         if (a->m)
 379                                 acc[k++] = 'm';
 380
 381                         if (k == 0)
 382                                 continue;
 383
 384                         acc[k++] = 0;
 385
 386                         if (startswith(a->path, "/dev/"))
 387                                 whitelist_device(path, a->path, acc);
 388                         else if (startswith(a->path, "block-"))
 389                                 whitelist_major(path, a->path + 6, 'b', acc);
 390                         else if (startswith(a->path, "char-"))
 391                                 whitelist_major(path, a->path + 5, 'c', acc);
 392                         else
 393                                 log_debug("Ignoring device %s while writing cgroup attribute.", a->path);
 394                 }
 395         }
 396 }
 397
 398 CGroupControllerMask cgroup_context_get_mask(CGroupContext *c) {
 399         CGroupControllerMask mask = 0;
 400
 401         /* Figure out which controllers we need */
 402
 403         if (c->cpu_accounting || c->cpu_shares != 1024)
 404                 mask |= CGROUP_CPUACCT | CGROUP_CPU;
 405
 406         if (c->blockio_accounting ||
 407             c->blockio_weight != 1000 ||
 408             c->blockio_device_weights ||
 409             c->blockio_device_bandwidths)
 410                 mask |= CGROUP_BLKIO;
 411
 412         if (c->memory_accounting ||
 413             c->memory_limit != (uint64_t) -1)
 414                 mask |= CGROUP_MEMORY;
 415
 416         if (c->device_allow || c->device_policy != CGROUP_AUTO)
 417                 mask |= CGROUP_DEVICE;
 418
 419         return mask;
 420 }
 421
 422 CGroupControllerMask unit_get_cgroup_mask(Unit *u) {
 423         CGroupContext *c;
 424
 425         c = unit_get_cgroup_context(u);
 426         if (!c)
 427                 return 0;
 428
 429         return cgroup_context_get_mask(c);
 430 }
 431
 432 CGroupControllerMask unit_get_members_mask(Unit *u) {
 433         assert(u);
 434
 435         if (u->cgroup_members_mask_valid)
 436                 return u->cgroup_members_mask;
 437
 438         u->cgroup_members_mask = 0;
 439
 440         if (u->type == UNIT_SLICE) {
 441                 Unit *member;
 442                 Iterator i;
 443
 444                 SET_FOREACH(member, u->dependencies[UNIT_BEFORE], i) {
 445
 446                         if (member == u)
 447                                 continue;
 448
 449                         if (UNIT_DEREF(member->slice) != u)
 450                                 continue;
 451
 452                         u->cgroup_members_mask |=
 453                                 unit_get_cgroup_mask(member) |
 454                                 unit_get_members_mask(member);
 455                 }
 456         }
 457
 458         u->cgroup_members_mask_valid = true;
 459         return u->cgroup_members_mask;
 460 }
 461
 462 CGroupControllerMask unit_get_siblings_mask(Unit *u) {
 463         CGroupControllerMask m;
 464
 465         assert(u);
 466
 467         if (UNIT_ISSET(u->slice))
 468                 m = unit_get_members_mask(UNIT_DEREF(u->slice));
 469         else
 470                 m = unit_get_cgroup_mask(u) | unit_get_members_mask(u);
 471
 472         /* Sibling propagation is only relevant for weight-based
 473          * controllers, so let's mask out everything else */
 474         return m & (CGROUP_CPU|CGROUP_BLKIO|CGROUP_CPUACCT);
 475 }
 476
 477 CGroupControllerMask unit_get_target_mask(Unit *u) {
 478         CGroupControllerMask mask;
 479
 480         mask = unit_get_cgroup_mask(u) | unit_get_members_mask(u) | unit_get_siblings_mask(u);
 481         mask &= u->manager->cgroup_supported;
 482
 483         return mask;
 484 }
 485
 486 /* Recurse from a unit up through its containing slices, propagating
 487  * mask bits upward. A unit is also member of itself. */
 488 void unit_update_cgroup_members_masks(Unit *u) {
 489         CGroupControllerMask m;
 490         bool more;
 491
 492         assert(u);
 493
 494         /* Calculate subtree mask */
 495         m = unit_get_cgroup_mask(u) | unit_get_members_mask(u);
 496
 497         /* See if anything changed from the previous invocation. If
 498          * not, we're done. */
 499         if (u->cgroup_subtree_mask_valid && m == u->cgroup_subtree_mask)
 500                 return;
 501
 502         more =
 503                 u->cgroup_subtree_mask_valid &&
 504                 ((m & ~u->cgroup_subtree_mask) != 0) &&
 505                 ((~m & u->cgroup_subtree_mask) == 0);
 506
 507         u->cgroup_subtree_mask = m;
 508         u->cgroup_subtree_mask_valid = true;
 509
 510         if (UNIT_ISSET(u->slice)) {
 511                 Unit *s = UNIT_DEREF(u->slice);
 512
 513                 if (more)
 514                         /* There's more set now than before. We
 515                          * propagate the new mask to the parent's mask
 516                          * (not caring if it actually was valid or
 517                          * not). */
 518
 519                         s->cgroup_members_mask |= m;
 520
 521                 else
 522                         /* There's less set now than before (or we
 523                          * don't know), we need to recalculate
 524                          * everything, so let's invalidate the
 525                          * parent's members mask */
 526
 527                         s->cgroup_members_mask_valid = false;
 528
 529                 /* And now make sure that this change also hits our
 530                  * grandparents */
 531                 unit_update_cgroup_members_masks(s);
 532         }
 533 }
 534
 535 static const char *migrate_callback(CGroupControllerMask mask, void *userdata) {
 536         Unit *u = userdata;
 537
 538         assert(mask != 0);
 539         assert(u);
 540
 541         while (u) {
 542                 if (u->cgroup_path &&
 543                     u->cgroup_realized &&
 544                     (u->cgroup_realized_mask & mask) == mask)
 545                         return u->cgroup_path;
 546
 547                 u = UNIT_DEREF(u->slice);
 548         }
 549
 550         return NULL;
 551 }
 552
 553 static int unit_create_cgroups(Unit *u, CGroupControllerMask mask) {
 554         _cleanup_free_ char *path = NULL;
 555         int r;
 556
 557         assert(u);
 558
 559         path = unit_default_cgroup_path(u);
 560         if (!path)
 561                 return log_oom();
 562
 563         r = hashmap_put(u->manager->cgroup_unit, path, u);
 564         if (r < 0) {
 565                 log_error(r == -EEXIST ? "cgroup %s exists already: %s" : "hashmap_put failed for %s: %s", path, strerror(-r));
 566                 return r;
 567         }
 568         if (r > 0) {
 569                 u->cgroup_path = path;
 570                 path = NULL;
 571         }
 572
 573         /* First, create our own group */
 574         r = cg_create_everywhere(u->manager->cgroup_supported, mask, u->cgroup_path);
 575         if (r < 0) {
 576                 log_error("Failed to create cgroup %s: %s", u->cgroup_path, strerror(-r));
 577                 return r;
 578         }
 579
 580         /* Keep track that this is now realized */
 581         u->cgroup_realized = true;
 582         u->cgroup_realized_mask = mask;
 583
 584         /* Then, possibly move things over */
 585         r = cg_migrate_everywhere(u->manager->cgroup_supported, u->cgroup_path, u->cgroup_path, migrate_callback, u);
 586         if (r < 0)
 587                 log_warning("Failed to migrate cgroup from to %s: %s", u->cgroup_path, strerror(-r));
 588
 589         return 0;
 590 }
 591
 592 static bool unit_has_mask_realized(Unit *u, CGroupControllerMask mask) {
 593         assert(u);
 594
 595         return u->cgroup_realized && u->cgroup_realized_mask == mask;
 596 }
 597
 598 /* Check if necessary controllers and attributes for a unit are in place.
 599  *
 600  * If so, do nothing.
 601  * If not, create paths, move processes over, and set attributes.
 602  *
 603  * Returns 0 on success and < 0 on failure. */
 604 static int unit_realize_cgroup_now(Unit *u) {
 605         CGroupControllerMask mask;
 606         int r;
 607
 608         assert(u);
 609
 610         if (u->in_cgroup_queue) {
 611                 LIST_REMOVE(cgroup_queue, u->manager->cgroup_queue, u);
 612                 u->in_cgroup_queue = false;
 613         }
 614
 615         mask = unit_get_target_mask(u);
 616
 617         if (unit_has_mask_realized(u, mask))
 618                 return 0;
 619
 620         /* First, realize parents */
 621         if (UNIT_ISSET(u->slice)) {
 622                 r = unit_realize_cgroup_now(UNIT_DEREF(u->slice));
 623                 if (r < 0)
 624                         return r;
 625         }
 626
 627         /* And then do the real work */
 628         r = unit_create_cgroups(u, mask);
 629         if (r < 0)
 630                 return r;
 631
 632         /* Finally, apply the necessary attributes. */
 633         cgroup_context_apply(unit_get_cgroup_context(u), mask, u->cgroup_path);
 634
 635         return 0;
 636 }
 637
 638 static void unit_add_to_cgroup_queue(Unit *u) {
 639
 640         if (u->in_cgroup_queue)
 641                 return;
 642
 643         LIST_PREPEND(cgroup_queue, u->manager->cgroup_queue, u);
 644         u->in_cgroup_queue = true;
 645 }
 646
 647 unsigned manager_dispatch_cgroup_queue(Manager *m) {
 648         Unit *i;
 649         unsigned n = 0;
 650         int r;
 651
 652         while ((i = m->cgroup_queue)) {
 653                 assert(i->in_cgroup_queue);
 654
 655                 r = unit_realize_cgroup_now(i);
 656                 if (r < 0)
 657                         log_warning("Failed to realize cgroups for queued unit %s: %s", i->id, strerror(-r));
 658
 659                 n++;
 660         }
 661
 662         return n;
 663 }
 664
 665 static void unit_queue_siblings(Unit *u) {
 666         Unit *slice;
 667
 668         /* This adds the siblings of the specified unit and the
 669          * siblings of all parent units to the cgroup queue. (But
 670          * neither the specified unit itself nor the parents.) */
 671
 672         while ((slice = UNIT_DEREF(u->slice))) {
 673                 Iterator i;
 674                 Unit *m;
 675
 676                 SET_FOREACH(m, slice->dependencies[UNIT_BEFORE], i) {
 677                         if (m == u)
 678                                 continue;
 679
 680                         /* Skip units that have a dependency on the slice
 681                          * but aren't actually in it. */
 682                         if (UNIT_DEREF(m->slice) != slice)
 683                                 continue;
 684
 685                         /* No point in doing cgroup application for units
 686                          * without active processes. */
 687                         if (UNIT_IS_INACTIVE_OR_FAILED(unit_active_state(m)))
 688                                 continue;
 689
 690                         /* If the unit doesn't need any new controllers
 691                          * and has current ones realized, it doesn't need
 692                          * any changes. */
 693                         if (unit_has_mask_realized(m, unit_get_target_mask(m)))
 694                                 continue;
 695
 696                         unit_add_to_cgroup_queue(m);
 697                 }
 698
 699                 u = slice;
 700         }
 701 }
 702
 703 int unit_realize_cgroup(Unit *u) {
 704         CGroupContext *c;
 705
 706         assert(u);
 707
 708         c = unit_get_cgroup_context(u);
 709         if (!c)
 710                 return 0;
 711
 712         /* So, here's the deal: when realizing the cgroups for this
 713          * unit, we need to first create all parents, but there's more
 714          * actually: for the weight-based controllers we also need to
 715          * make sure that all our siblings (i.e. units that are in the
 716          * same slice as we are) have cgroups, too. Otherwise, things
 717          * would become very uneven as each of their processes would
 718          * get as much resources as all our group together. This call
 719          * will synchronously create the parent cgroups, but will
 720          * defer work on the siblings to the next event loop
 721          * iteration. */
 722
 723         /* Add all sibling slices to the cgroup queue. */
 724         unit_queue_siblings(u);
 725
 726         /* And realize this one now (and apply the values) */
 727         return unit_realize_cgroup_now(u);
 728 }
 729
 730 void unit_destroy_cgroup(Unit *u) {
 731         int r;
 732
 733         assert(u);
 734
 735         if (!u->cgroup_path)
 736                 return;
 737
 738         r = cg_trim_everywhere(u->manager->cgroup_supported, u->cgroup_path, !unit_has_name(u, SPECIAL_ROOT_SLICE));
 739         if (r < 0)
 740                 log_debug("Failed to destroy cgroup %s: %s", u->cgroup_path, strerror(-r));
 741
 742         hashmap_remove(u->manager->cgroup_unit, u->cgroup_path);
 743
 744         free(u->cgroup_path);
 745         u->cgroup_path = NULL;
 746         u->cgroup_realized = false;
 747         u->cgroup_realized_mask = 0;
 748
 749 }
 750
 751 pid_t unit_search_main_pid(Unit *u) {
 752         _cleanup_fclose_ FILE *f = NULL;
 753         pid_t pid = 0, npid, mypid;
 754
 755         assert(u);
 756
 757         if (!u->cgroup_path)
 758                 return 0;
 759
 760         if (cg_enumerate_processes(SYSTEMD_CGROUP_CONTROLLER, u->cgroup_path, &f) < 0)
 761                 return 0;
 762
 763         mypid = getpid();
 764         while (cg_read_pid(f, &npid) > 0)  {
 765                 pid_t ppid;
 766
 767                 if (npid == pid)
 768                         continue;
 769
 770                 /* Ignore processes that aren't our kids */
 771                 if (get_parent_of_pid(npid, &ppid) >= 0 && ppid != mypid)
 772                         continue;
 773
 774                 if (pid != 0) {
 775                         /* Dang, there's more than one daemonized PID
 776                         in this group, so we don't know what process
 777                         is the main process. */
 778                         pid = 0;
 779                         break;
 780                 }
 781
 782                 pid = npid;
 783         }
 784
 785         return pid;
 786 }
 787
 788 int manager_setup_cgroup(Manager *m) {
 789         _cleanup_free_ char *path = NULL;
 790         char *e;
 791         int r;
 792
 793         assert(m);
 794
 795         /* 0. Be nice to Ingo Molnar #628004 */
 796         if (path_is_mount_point("/sys/fs/cgroup/systemd", false) <= 0) {
 797                 log_warning("No control group support available, not creating root group.");
 798                 return 0;
 799         }
 800
 801         /* 1. Determine hierarchy */
 802         free(m->cgroup_root);
 803         m->cgroup_root = NULL;
 804
 805         r = cg_pid_get_path(SYSTEMD_CGROUP_CONTROLLER, 0, &m->cgroup_root);
 806         if (r < 0) {
 807                 log_error("Cannot determine cgroup we are running in: %s", strerror(-r));
 808                 return r;
 809         }
 810
 811         /* LEGACY: Already in /system.slice? If so, let's cut this
 812          * off. This is to support live upgrades from older systemd
 813          * versions where PID 1 was moved there. */
 814         if (m->running_as == SYSTEMD_SYSTEM) {
 815                 e = endswith(m->cgroup_root, "/" SPECIAL_SYSTEM_SLICE);
 816                 if (!e)
 817                         e = endswith(m->cgroup_root, "/system");
 818                 if (e)
 819                         *e = 0;
 820         }
 821
 822         /* And make sure to store away the root value without trailing
 823          * slash, even for the root dir, so that we can easily prepend
 824          * it everywhere. */
 825         if (streq(m->cgroup_root, "/"))
 826                 m->cgroup_root[0] = 0;
 827
 828         /* 2. Show data */
 829         r = cg_get_path(SYSTEMD_CGROUP_CONTROLLER, m->cgroup_root, NULL, &path);
 830         if (r < 0) {
 831                 log_error("Cannot find cgroup mount point: %s", strerror(-r));
 832                 return r;
 833         }
 834
 835         log_debug("Using cgroup controller " SYSTEMD_CGROUP_CONTROLLER ". File system hierarchy is at %s.", path);
 836
 837         /* 3. Install agent */
 838         if (m->running_as == SYSTEMD_SYSTEM) {
 839                 r = cg_install_release_agent(SYSTEMD_CGROUP_CONTROLLER, SYSTEMD_CGROUP_AGENT_PATH);
 840                 if (r < 0)
 841                         log_warning("Failed to install release agent, ignoring: %s", strerror(-r));
 842                 else if (r > 0)
 843                         log_debug("Installed release agent.");
 844                 else
 845                         log_debug("Release agent already installed.");
 846         }
 847
 848         /* 4. Make sure we are in the root cgroup */
 849         r = cg_create_and_attach(SYSTEMD_CGROUP_CONTROLLER, m->cgroup_root, 0);
 850         if (r < 0) {
 851                 log_error("Failed to create root cgroup hierarchy: %s", strerror(-r));
 852                 return r;
 853         }
 854
 855         /* 5. And pin it, so that it cannot be unmounted */
 856         if (m->pin_cgroupfs_fd >= 0)
 857                 close_nointr_nofail(m->pin_cgroupfs_fd);
 858
 859         m->pin_cgroupfs_fd = open(path, O_RDONLY|O_CLOEXEC|O_DIRECTORY|O_NOCTTY|O_NONBLOCK);
 860         if (r < 0) {
 861                 log_error("Failed to open pin file: %m");
 862                 return -errno;
 863         }
 864
 865         /* 6. Figure out which controllers are supported */
 866         m->cgroup_supported = cg_mask_supported();
 867
 868         /* 7.  Always enable hierarchial support if it exists... */
 869         cg_set_attribute("memory", "/", "memory.use_hierarchy", "1");
 870
 871         return 0;
 872 }
 873
 874 void manager_shutdown_cgroup(Manager *m, bool delete) {
 875         assert(m);
 876
 877         /* We can't really delete the group, since we are in it. But
 878          * let's trim it. */
 879         if (delete && m->cgroup_root)
 880                 cg_trim(SYSTEMD_CGROUP_CONTROLLER, m->cgroup_root, false);
 881
 882         if (m->pin_cgroupfs_fd >= 0) {
 883                 close_nointr_nofail(m->pin_cgroupfs_fd);
 884                 m->pin_cgroupfs_fd = -1;
 885         }
 886
 887         free(m->cgroup_root);
 888         m->cgroup_root = NULL;
 889 }
 890
 891 Unit* manager_get_unit_by_cgroup(Manager *m, const char *cgroup) {
 892         char *p;
 893         Unit *u;
 894
 895         assert(m);
 896         assert(cgroup);
 897
 898         u = hashmap_get(m->cgroup_unit, cgroup);
 899         if (u)
 900                 return u;
 901
 902         p = strdupa(cgroup);
 903         for (;;) {
 904                 char *e;
 905
 906                 e = strrchr(p, '/');
 907                 if (e == p || !e)
 908                         return NULL;
 909
 910                 *e = 0;
 911
 912                 u = hashmap_get(m->cgroup_unit, p);
 913                 if (u)
 914                         return u;
 915         }
 916 }
 917
 918 Unit *manager_get_unit_by_pid(Manager *m, pid_t pid) {
 919         _cleanup_free_ char *cgroup = NULL;
 920         int r;
 921
 922         assert(m);
 923
 924         if (pid <= 1)
 925                 return NULL;
 926
 927         r = cg_pid_get_path(SYSTEMD_CGROUP_CONTROLLER, pid, &cgroup);
 928         if (r < 0)
 929                 return NULL;
 930
 931         return manager_get_unit_by_cgroup(m, cgroup);
 932 }
 933
 934 int manager_notify_cgroup_empty(Manager *m, const char *cgroup) {
 935         Unit *u;
 936         int r;
 937
 938         assert(m);
 939         assert(cgroup);
 940
 941         u = manager_get_unit_by_cgroup(m, cgroup);
 942         if (u) {
 943                 r = cg_is_empty_recursive(SYSTEMD_CGROUP_CONTROLLER, u->cgroup_path, true);
 944                 if (r > 0) {
 945                         if (UNIT_VTABLE(u)->notify_cgroup_empty)
 946                                 UNIT_VTABLE(u)->notify_cgroup_empty(u);
 947
 948                         unit_add_to_gc_queue(u);
 949                 }
 950         }
 951
 952         return 0;
 953 }
 954
/* String names of the CGroupDevicePolicy values, indexed by enum value. */
static const char* const cgroup_device_policy_table[_CGROUP_DEVICE_POLICY_MAX] = {
        [CGROUP_AUTO] = "auto",
        [CGROUP_CLOSED] = "closed",
        [CGROUP_STRICT] = "strict",
};

/* Expands to the lookup functions for the table above; this file uses
 * cgroup_device_policy_to_string() from it. */
DEFINE_STRING_TABLE_LOOKUP(cgroup_device_policy, CGroupDevicePolicy);