src/core/cgroup.c

   1 /*-*- Mode: C; c-basic-offset: 8; indent-tabs-mode: nil -*-*/
   2
   3 /***
   4   This file is part of systemd.
   5
   6   Copyright 2013 Lennart Poettering
   7
   8   systemd is free software; you can redistribute it and/or modify it
   9   under the terms of the GNU Lesser General Public License as published by
  10   the Free Software Foundation; either version 2.1 of the License, or
  11   (at your option) any later version.
  12
  13   systemd is distributed in the hope that it will be useful, but
  14   WITHOUT ANY WARRANTY; without even the implied warranty of
  15   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
  16   Lesser General Public License for more details.
  17
  18   You should have received a copy of the GNU Lesser General Public License
  19   along with systemd; If not, see <http://www.gnu.org/licenses/>.
  20 ***/
  21
  22 #include <fcntl.h>
  23 #include <fnmatch.h>
  24
  25 #include "path-util.h"
  26 #include "special.h"
  27 #include "cgroup-util.h"
  28 #include "cgroup.h"
  29
  30 void cgroup_context_init(CGroupContext *c) {
  31         assert(c);
  32
  33         /* Initialize everything to the kernel defaults, assuming the
  34          * structure is preinitialized to 0 */
  35
  36         c->cpu_shares = 1024;
  37         c->memory_limit = (uint64_t) -1;
  38         c->blockio_weight = 1000;
  39 }
  40
  41 void cgroup_context_free_device_allow(CGroupContext *c, CGroupDeviceAllow *a) {
  42         assert(c);
  43         assert(a);
  44
  45         LIST_REMOVE(device_allow, c->device_allow, a);
  46         free(a->path);
  47         free(a);
  48 }
  49
  50 void cgroup_context_free_blockio_device_weight(CGroupContext *c, CGroupBlockIODeviceWeight *w) {
  51         assert(c);
  52         assert(w);
  53
  54         LIST_REMOVE(device_weights, c->blockio_device_weights, w);
  55         free(w->path);
  56         free(w);
  57 }
  58
  59 void cgroup_context_free_blockio_device_bandwidth(CGroupContext *c, CGroupBlockIODeviceBandwidth *b) {
  60         assert(c);
  61         assert(b);
  62
  63         LIST_REMOVE(device_bandwidths, c->blockio_device_bandwidths, b);
  64         free(b->path);
  65         free(b);
  66 }
  67
  68 void cgroup_context_done(CGroupContext *c) {
  69         assert(c);
  70
  71         while (c->blockio_device_weights)
  72                 cgroup_context_free_blockio_device_weight(c, c->blockio_device_weights);
  73
  74         while (c->blockio_device_bandwidths)
  75                 cgroup_context_free_blockio_device_bandwidth(c, c->blockio_device_bandwidths);
  76
  77         while (c->device_allow)
  78                 cgroup_context_free_device_allow(c, c->device_allow);
  79 }
  80
  81 void cgroup_context_dump(CGroupContext *c, FILE* f, const char *prefix) {
  82         CGroupBlockIODeviceBandwidth *b;
  83         CGroupBlockIODeviceWeight *w;
  84         CGroupDeviceAllow *a;
  85
  86         assert(c);
  87         assert(f);
  88
  89         prefix = strempty(prefix);
  90
  91         fprintf(f,
  92                 "%sCPUAccounting=%s\n"
  93                 "%sBlockIOAccounting=%s\n"
  94                 "%sMemoryAccounting=%s\n"
  95                 "%sCPUShares=%lu\n"
  96                 "%sBlockIOWeight=%lu\n"
  97                 "%sMemoryLimit=%" PRIu64 "\n"
  98                 "%sDevicePolicy=%s\n",
  99                 prefix, yes_no(c->cpu_accounting),
 100                 prefix, yes_no(c->blockio_accounting),
 101                 prefix, yes_no(c->memory_accounting),
 102                 prefix, c->cpu_shares,
 103                 prefix, c->blockio_weight,
 104                 prefix, c->memory_limit,
 105                 prefix, cgroup_device_policy_to_string(c->device_policy));
 106
 107         LIST_FOREACH(device_allow, a, c->device_allow)
 108                 fprintf(f,
 109                         "%sDeviceAllow=%s %s%s%s\n",
 110                         prefix,
 111                         a->path,
 112                         a->r ? "r" : "", a->w ? "w" : "", a->m ? "m" : "");
 113
 114         LIST_FOREACH(device_weights, w, c->blockio_device_weights)
 115                 fprintf(f,
 116                         "%sBlockIODeviceWeight=%s %lu",
 117                         prefix,
 118                         w->path,
 119                         w->weight);
 120
 121         LIST_FOREACH(device_bandwidths, b, c->blockio_device_bandwidths) {
 122                 char buf[FORMAT_BYTES_MAX];
 123
 124                 fprintf(f,
 125                         "%s%s=%s %s\n",
 126                         prefix,
 127                         b->read ? "BlockIOReadBandwidth" : "BlockIOWriteBandwidth",
 128                         b->path,
 129                         format_bytes(buf, sizeof(buf), b->bandwidth));
 130         }
 131 }
 132
 /* Resolves a path to the block device number to use for blkio settings.
  *
  * p:   a block device node, or any file on a block-device backed file system
  * dev: receives the device number on success
  *
  * Returns 0 on success, -errno if the path cannot be stat()ed, and
  * -ENODEV if no block device can be derived from it. */
 static int lookup_blkio_device(const char *p, dev_t *dev) {
         struct stat st;

         assert(p);
         assert(dev);

         if (stat(p, &st) < 0) {
                 log_warning("Couldn't stat device %s: %m", p);
                 return -errno;
         }

         /* A block device node names its device directly */
         if (S_ISBLK(st.st_mode)) {
                 *dev = st.st_rdev;
                 return 0;
         }

         if (major(st.st_dev) == 0) {
                 log_warning("%s is not a block device and file system block device cannot be determined or is not local.", p);
                 return -ENODEV;
         }

         /* Not a device node: fall back to the block device the file
          * is stored on ... */
         *dev = st.st_dev;

         /* ... and, should that be a partition, try to move on to the
          * originating block device. The result of that attempt is not
          * checked. */
         block_get_whole_disk(*dev, dev);

         return 0;
 }
 163
 164 static int whitelist_device(const char *path, const char *node, const char *acc) {
 165         char buf[2+DECIMAL_STR_MAX(dev_t)*2+2+4];
 166         struct stat st;
 167         int r;
 168
 169         assert(path);
 170         assert(acc);
 171
 172         if (stat(node, &st) < 0) {
 173                 log_warning("Couldn't stat device %s", node);
 174                 return -errno;
 175         }
 176
 177         if (!S_ISCHR(st.st_mode) && !S_ISBLK(st.st_mode)) {
 178                 log_warning("%s is not a device.", node);
 179                 return -ENODEV;
 180         }
 181
 182         sprintf(buf,
 183                 "%c %u:%u %s",
 184                 S_ISCHR(st.st_mode) ? 'c' : 'b',
 185                 major(st.st_rdev), minor(st.st_rdev),
 186                 acc);
 187
 188         r = cg_set_attribute("devices", path, "devices.allow", buf);
 189         if (r < 0)
 190                 log_warning("Failed to set devices.allow on %s: %s", path, strerror(-r));
 191
 192         return r;
 193 }
 194
 195 static int whitelist_major(const char *path, const char *name, char type, const char *acc) {
 196         _cleanup_fclose_ FILE *f = NULL;
 197         char line[LINE_MAX];
 198         bool good = false;
 199         int r;
 200
 201         assert(path);
 202         assert(acc);
 203         assert(type == 'b' || type == 'c');
 204
 205         f = fopen("/proc/devices", "re");
 206         if (!f) {
 207                 log_warning("Cannot open /proc/devices to resolve %s (%c): %m", name, type);
 208                 return -errno;
 209         }
 210
 211         FOREACH_LINE(line, f, goto fail) {
 212                 char buf[2+DECIMAL_STR_MAX(unsigned)+3+4], *p, *w;
 213                 unsigned maj;
 214
 215                 truncate_nl(line);
 216
 217                 if (type == 'c' && streq(line, "Character devices:")) {
 218                         good = true;
 219                         continue;
 220                 }
 221
 222                 if (type == 'b' && streq(line, "Block devices:")) {
 223                         good = true;
 224                         continue;
 225                 }
 226
 227                 if (isempty(line)) {
 228                         good = false;
 229                         continue;
 230                 }
 231
 232                 if (!good)
 233                         continue;
 234
 235                 p = strstrip(line);
 236
 237                 w = strpbrk(p, WHITESPACE);
 238                 if (!w)
 239                         continue;
 240                 *w = 0;
 241
 242                 r = safe_atou(p, &maj);
 243                 if (r < 0)
 244                         continue;
 245                 if (maj <= 0)
 246                         continue;
 247
 248                 w++;
 249                 w += strspn(w, WHITESPACE);
 250
 251                 if (fnmatch(name, w, 0) != 0)
 252                         continue;
 253
 254                 sprintf(buf,
 255                         "%c %u:* %s",
 256                         type,
 257                         maj,
 258                         acc);
 259
 260                 r = cg_set_attribute("devices", path, "devices.allow", buf);
 261                 if (r < 0)
 262                         log_warning("Failed to set devices.allow on %s: %s", path, strerror(-r));
 263         }
 264
 265         return 0;
 266
 267 fail:
 268         log_warning("Failed to read /proc/devices: %m");
 269         return -errno;
 270 }
 271
 272 void cgroup_context_apply(CGroupContext *c, CGroupControllerMask mask, const char *path) {
 273         bool is_root;
 274         int r;
 275
 276         assert(c);
 277         assert(path);
 278
 279         if (mask == 0)
 280                 return;
 281
 282         /* Some cgroup attributes are not support on the root cgroup,
 283          * hence silently ignore */
 284         is_root = isempty(path) || path_equal(path, "/");
 285
 286         if ((mask & CGROUP_CPU) && !is_root) {
 287                 char buf[DECIMAL_STR_MAX(unsigned long) + 1];
 288
 289                 sprintf(buf, "%lu\n", c->cpu_shares);
 290                 r = cg_set_attribute("cpu", path, "cpu.shares", buf);
 291                 if (r < 0)
 292                         log_warning("Failed to set cpu.shares on %s: %s", path, strerror(-r));
 293         }
 294
 295         if (mask & CGROUP_BLKIO) {
 296                 char buf[MAX3(DECIMAL_STR_MAX(unsigned long)+1,
 297                               DECIMAL_STR_MAX(dev_t)*2+2+DECIMAL_STR_MAX(unsigned long)*1,
 298                               DECIMAL_STR_MAX(dev_t)*2+2+DECIMAL_STR_MAX(uint64_t)+1)];
 299                 CGroupBlockIODeviceWeight *w;
 300                 CGroupBlockIODeviceBandwidth *b;
 301
 302                 if (!is_root) {
 303                         sprintf(buf, "%lu\n", c->blockio_weight);
 304                         r = cg_set_attribute("blkio", path, "blkio.weight", buf);
 305                         if (r < 0)
 306                                 log_warning("Failed to set blkio.weight on %s: %s", path, strerror(-r));
 307
 308                         /* FIXME: no way to reset this list */
 309                         LIST_FOREACH(device_weights, w, c->blockio_device_weights) {
 310                                 dev_t dev;
 311
 312                                 r = lookup_blkio_device(w->path, &dev);
 313                                 if (r < 0)
 314                                         continue;
 315
 316                                 sprintf(buf, "%u:%u %lu", major(dev), minor(dev), w->weight);
 317                                 r = cg_set_attribute("blkio", path, "blkio.weight_device", buf);
 318                                 if (r < 0)
 319                                         log_error("Failed to set blkio.weight_device on %s: %s", path, strerror(-r));
 320                         }
 321                 }
 322
 323                 /* FIXME: no way to reset this list */
 324                 LIST_FOREACH(device_bandwidths, b, c->blockio_device_bandwidths) {
 325                         const char *a;
 326                         dev_t dev;
 327
 328                         r = lookup_blkio_device(b->path, &dev);
 329                         if (r < 0)
 330                                 continue;
 331
 332                         a = b->read ? "blkio.throttle.read_bps_device" : "blkio.throttle.write_bps_device";
 333
 334                         sprintf(buf, "%u:%u %" PRIu64 "\n", major(dev), minor(dev), b->bandwidth);
 335                         r = cg_set_attribute("blkio", path, a, buf);
 336                         if (r < 0)
 337                                 log_error("Failed to set %s on %s: %s", a, path, strerror(-r));
 338                 }
 339         }
 340
 341         if (mask & CGROUP_MEMORY) {
 342                 if (c->memory_limit != (uint64_t) -1) {
 343                         char buf[DECIMAL_STR_MAX(uint64_t) + 1];
 344
 345                         sprintf(buf, "%" PRIu64 "\n", c->memory_limit);
 346                         r = cg_set_attribute("memory", path, "memory.limit_in_bytes", buf);
 347                 } else
 348                         r = cg_set_attribute("memory", path, "memory.limit_in_bytes", "-1");
 349
 350                 if (r < 0)
 351                         log_error("Failed to set memory.limit_in_bytes on %s: %s", path, strerror(-r));
 352         }
 353
 354         if ((mask & CGROUP_DEVICE) && !is_root) {
 355                 CGroupDeviceAllow *a;
 356
 357                 if (c->device_allow || c->device_policy != CGROUP_AUTO)
 358                         r = cg_set_attribute("devices", path, "devices.deny", "a");
 359                 else
 360                         r = cg_set_attribute("devices", path, "devices.allow", "a");
 361                 if (r < 0)
 362                         log_warning("Failed to reset devices.list on %s: %s", path, strerror(-r));
 363
 364                 if (c->device_policy == CGROUP_CLOSED ||
 365                     (c->device_policy == CGROUP_AUTO && c->device_allow)) {
 366                         static const char auto_devices[] =
 367                                 "/dev/null\0" "rw\0"
 368                                 "/dev/zero\0" "rw\0"
 369                                 "/dev/full\0" "rw\0"
 370                                 "/dev/random\0" "rw\0"
 371                                 "/dev/urandom\0" "rw\0";
 372
 373                         const char *x, *y;
 374
 375                         NULSTR_FOREACH_PAIR(x, y, auto_devices)
 376                                 whitelist_device(path, x, y);
 377                 }
 378
 379                 LIST_FOREACH(device_allow, a, c->device_allow) {
 380                         char acc[4];
 381                         unsigned k = 0;
 382
 383                         if (a->r)
 384                                 acc[k++] = 'r';
 385                         if (a->w)
 386                                 acc[k++] = 'w';
 387                         if (a->m)
 388                                 acc[k++] = 'm';
 389
 390                         if (k == 0)
 391                                 continue;
 392
 393                         acc[k++] = 0;
 394
 395                         if (startswith(a->path, "/dev/"))
 396                                 whitelist_device(path, a->path, acc);
 397                         else if (startswith(a->path, "block-"))
 398                                 whitelist_major(path, a->path + 6, 'b', acc);
 399                         else if (startswith(a->path, "char-"))
 400                                 whitelist_major(path, a->path + 5, 'c', acc);
 401                         else
 402                                 log_debug("Ignoring device %s while writing cgroup attribute.", a->path);
 403                 }
 404         }
 405 }
 406
 407 CGroupControllerMask cgroup_context_get_mask(CGroupContext *c) {
 408         CGroupControllerMask mask = 0;
 409
 410         /* Figure out which controllers we need */
 411
 412         if (c->cpu_accounting || c->cpu_shares != 1024)
 413                 mask |= CGROUP_CPUACCT | CGROUP_CPU;
 414
 415         if (c->blockio_accounting ||
 416             c->blockio_weight != 1000 ||
 417             c->blockio_device_weights ||
 418             c->blockio_device_bandwidths)
 419                 mask |= CGROUP_BLKIO;
 420
 421         if (c->memory_accounting ||
 422             c->memory_limit != (uint64_t) -1)
 423                 mask |= CGROUP_MEMORY;
 424
 425         if (c->device_allow || c->device_policy != CGROUP_AUTO)
 426                 mask |= CGROUP_DEVICE;
 427
 428         return mask;
 429 }
 430
 431 CGroupControllerMask unit_get_cgroup_mask(Unit *u) {
 432         CGroupContext *c;
 433
 434         c = unit_get_cgroup_context(u);
 435         if (!c)
 436                 return 0;
 437
 438         return cgroup_context_get_mask(c);
 439 }
 440
 441 CGroupControllerMask unit_get_members_mask(Unit *u) {
 442         assert(u);
 443
 444         if (u->cgroup_members_mask_valid)
 445                 return u->cgroup_members_mask;
 446
 447         u->cgroup_members_mask = 0;
 448
 449         if (u->type == UNIT_SLICE) {
 450                 Unit *member;
 451                 Iterator i;
 452
 453                 SET_FOREACH(member, u->dependencies[UNIT_BEFORE], i) {
 454
 455                         if (member == u)
 456                                 continue;
 457
 458                         if (UNIT_DEREF(member->slice) != u)
 459                                 continue;
 460
 461                         u->cgroup_members_mask |=
 462                                 unit_get_cgroup_mask(member) |
 463                                 unit_get_members_mask(member);
 464                 }
 465         }
 466
 467         u->cgroup_members_mask_valid = true;
 468         return u->cgroup_members_mask;
 469 }
 470
 471 CGroupControllerMask unit_get_siblings_mask(Unit *u) {
 472         CGroupControllerMask m;
 473
 474         assert(u);
 475
 476         if (UNIT_ISSET(u->slice))
 477                 m = unit_get_members_mask(UNIT_DEREF(u->slice));
 478         else
 479                 m = unit_get_cgroup_mask(u) | unit_get_members_mask(u);
 480
 481         /* Sibling propagation is only relevant for weight-based
 482          * controllers, so let's mask out everything else */
 483         return m & (CGROUP_CPU|CGROUP_BLKIO|CGROUP_CPUACCT);
 484 }
 485
 486 CGroupControllerMask unit_get_target_mask(Unit *u) {
 487         CGroupControllerMask mask;
 488
 489         mask = unit_get_cgroup_mask(u) | unit_get_members_mask(u) | unit_get_siblings_mask(u);
 490         mask &= u->manager->cgroup_supported;
 491
 492         return mask;
 493 }
 494
 495 /* Recurse from a unit up through its containing slices, propagating
 496  * mask bits upward. A unit is also member of itself. */
 497 void unit_update_cgroup_members_masks(Unit *u) {
 498         CGroupControllerMask m;
 499         bool more;
 500
 501         assert(u);
 502
 503         /* Calculate subtree mask */
 504         m = unit_get_cgroup_mask(u) | unit_get_members_mask(u);
 505
 506         /* See if anything changed from the previous invocation. If
 507          * not, we're done. */
 508         if (u->cgroup_subtree_mask_valid && m == u->cgroup_subtree_mask)
 509                 return;
 510
 511         more =
 512                 u->cgroup_subtree_mask_valid &&
 513                 ((m & ~u->cgroup_subtree_mask) != 0) &&
 514                 ((~m & u->cgroup_subtree_mask) == 0);
 515
 516         u->cgroup_subtree_mask = m;
 517         u->cgroup_subtree_mask_valid = true;
 518
 519         if (UNIT_ISSET(u->slice)) {
 520                 Unit *s = UNIT_DEREF(u->slice);
 521
 522                 if (more)
 523                         /* There's more set now than before. We
 524                          * propagate the new mask to the parent's mask
 525                          * (not caring if it actually was valid or
 526                          * not). */
 527
 528                         s->cgroup_members_mask |= m;
 529
 530                 else
 531                         /* There's less set now than before (or we
 532                          * don't know), we need to recalculate
 533                          * everything, so let's invalidate the
 534                          * parent's members mask */
 535
 536                         s->cgroup_members_mask_valid = false;
 537
 538                 /* And now make sure that this change also hits our
 539                  * grandparents */
 540                 unit_update_cgroup_members_masks(s);
 541         }
 542 }
 543
 544 static const char *migrate_callback(CGroupControllerMask mask, void *userdata) {
 545         Unit *u = userdata;
 546
 547         assert(mask != 0);
 548         assert(u);
 549
 550         while (u) {
 551                 if (u->cgroup_path &&
 552                     u->cgroup_realized &&
 553                     (u->cgroup_realized_mask & mask) == mask)
 554                         return u->cgroup_path;
 555
 556                 u = UNIT_DEREF(u->slice);
 557         }
 558
 559         return NULL;
 560 }
 561
 562 static int unit_create_cgroups(Unit *u, CGroupControllerMask mask) {
 563         _cleanup_free_ char *path = NULL;
 564         int r;
 565
 566         assert(u);
 567
 568         path = unit_default_cgroup_path(u);
 569         if (!path)
 570                 return log_oom();
 571
 572         r = hashmap_put(u->manager->cgroup_unit, path, u);
 573         if (r < 0) {
 574                 log_error(r == -EEXIST ? "cgroup %s exists already: %s" : "hashmap_put failed for %s: %s", path, strerror(-r));
 575                 return r;
 576         }
 577         if (r > 0) {
 578                 u->cgroup_path = path;
 579                 path = NULL;
 580         }
 581
 582         /* First, create our own group */
 583         r = cg_create_everywhere(u->manager->cgroup_supported, mask, u->cgroup_path);
 584         if (r < 0) {
 585                 log_error("Failed to create cgroup %s: %s", u->cgroup_path, strerror(-r));
 586                 return r;
 587         }
 588
 589         /* Keep track that this is now realized */
 590         u->cgroup_realized = true;
 591         u->cgroup_realized_mask = mask;
 592
 593         /* Then, possibly move things over */
 594         r = cg_migrate_everywhere(u->manager->cgroup_supported, u->cgroup_path, u->cgroup_path, migrate_callback, u);
 595         if (r < 0)
 596                 log_warning("Failed to migrate cgroup from to %s: %s", u->cgroup_path, strerror(-r));
 597
 598         return 0;
 599 }
 600
 601 static bool unit_has_mask_realized(Unit *u, CGroupControllerMask mask) {
 602         assert(u);
 603
 604         return u->cgroup_realized && u->cgroup_realized_mask == mask;
 605 }
 606
 607 /* Check if necessary controllers and attributes for a unit are in place.
 608  *
 609  * If so, do nothing.
 610  * If not, create paths, move processes over, and set attributes.
 611  *
 612  * Returns 0 on success and < 0 on failure. */
 613 static int unit_realize_cgroup_now(Unit *u) {
 614         CGroupControllerMask mask;
 615         int r;
 616
 617         assert(u);
 618
 619         if (u->in_cgroup_queue) {
 620                 LIST_REMOVE(cgroup_queue, u->manager->cgroup_queue, u);
 621                 u->in_cgroup_queue = false;
 622         }
 623
 624         mask = unit_get_target_mask(u);
 625
 626         if (unit_has_mask_realized(u, mask))
 627                 return 0;
 628
 629         /* First, realize parents */
 630         if (UNIT_ISSET(u->slice)) {
 631                 r = unit_realize_cgroup_now(UNIT_DEREF(u->slice));
 632                 if (r < 0)
 633                         return r;
 634         }
 635
 636         /* And then do the real work */
 637         r = unit_create_cgroups(u, mask);
 638         if (r < 0)
 639                 return r;
 640
 641         /* Finally, apply the necessary attributes. */
 642         cgroup_context_apply(unit_get_cgroup_context(u), mask, u->cgroup_path);
 643
 644         return 0;
 645 }
 646
 647 static void unit_add_to_cgroup_queue(Unit *u) {
 648
 649         if (u->in_cgroup_queue)
 650                 return;
 651
 652         LIST_PREPEND(cgroup_queue, u->manager->cgroup_queue, u);
 653         u->in_cgroup_queue = true;
 654 }
 655
 656 unsigned manager_dispatch_cgroup_queue(Manager *m) {
 657         Unit *i;
 658         unsigned n = 0;
 659         int r;
 660
 661         while ((i = m->cgroup_queue)) {
 662                 assert(i->in_cgroup_queue);
 663
 664                 r = unit_realize_cgroup_now(i);
 665                 if (r < 0)
 666                         log_warning("Failed to realize cgroups for queued unit %s: %s", i->id, strerror(-r));
 667
 668                 n++;
 669         }
 670
 671         return n;
 672 }
 673
 674 static void unit_queue_siblings(Unit *u) {
 675         Unit *slice;
 676
 677         /* This adds the siblings of the specified unit and the
 678          * siblings of all parent units to the cgroup queue. (But
 679          * neither the specified unit itself nor the parents.) */
 680
 681         while ((slice = UNIT_DEREF(u->slice))) {
 682                 Iterator i;
 683                 Unit *m;
 684
 685                 SET_FOREACH(m, slice->dependencies[UNIT_BEFORE], i) {
 686                         if (m == u)
 687                                 continue;
 688
 689                         /* Skip units that have a dependency on the slice
 690                          * but aren't actually in it. */
 691                         if (UNIT_DEREF(m->slice) != slice)
 692                                 continue;
 693
 694                         /* No point in doing cgroup application for units
 695                          * without active processes. */
 696                         if (UNIT_IS_INACTIVE_OR_FAILED(unit_active_state(m)))
 697                                 continue;
 698
 699                         /* If the unit doesn't need any new controllers
 700                          * and has current ones realized, it doesn't need
 701                          * any changes. */
 702                         if (unit_has_mask_realized(m, unit_get_target_mask(m)))
 703                                 continue;
 704
 705                         unit_add_to_cgroup_queue(m);
 706                 }
 707
 708                 u = slice;
 709         }
 710 }
 711
 712 int unit_realize_cgroup(Unit *u) {
 713         CGroupContext *c;
 714
 715         assert(u);
 716
 717         c = unit_get_cgroup_context(u);
 718         if (!c)
 719                 return 0;
 720
 721         /* So, here's the deal: when realizing the cgroups for this
 722          * unit, we need to first create all parents, but there's more
 723          * actually: for the weight-based controllers we also need to
 724          * make sure that all our siblings (i.e. units that are in the
 725          * same slice as we are) have cgroups, too. Otherwise, things
 726          * would become very uneven as each of their processes would
 727          * get as much resources as all our group together. This call
 728          * will synchronously create the parent cgroups, but will
 729          * defer work on the siblings to the next event loop
 730          * iteration. */
 731
 732         /* Add all sibling slices to the cgroup queue. */
 733         unit_queue_siblings(u);
 734
 735         /* And realize this one now (and apply the values) */
 736         return unit_realize_cgroup_now(u);
 737 }
 738
 739 void unit_destroy_cgroup(Unit *u) {
 740         int r;
 741
 742         assert(u);
 743
 744         if (!u->cgroup_path)
 745                 return;
 746
 747         r = cg_trim_everywhere(u->manager->cgroup_supported, u->cgroup_path, !unit_has_name(u, SPECIAL_ROOT_SLICE));
 748         if (r < 0)
 749                 log_debug("Failed to destroy cgroup %s: %s", u->cgroup_path, strerror(-r));
 750
 751         hashmap_remove(u->manager->cgroup_unit, u->cgroup_path);
 752
 753         free(u->cgroup_path);
 754         u->cgroup_path = NULL;
 755         u->cgroup_realized = false;
 756         u->cgroup_realized_mask = 0;
 757
 758 }
 759
 760 pid_t unit_search_main_pid(Unit *u) {
 761         _cleanup_fclose_ FILE *f = NULL;
 762         pid_t pid = 0, npid, mypid;
 763
 764         assert(u);
 765
 766         if (!u->cgroup_path)
 767                 return 0;
 768
 769         if (cg_enumerate_processes(SYSTEMD_CGROUP_CONTROLLER, u->cgroup_path, &f) < 0)
 770                 return 0;
 771
 772         mypid = getpid();
 773         while (cg_read_pid(f, &npid) > 0)  {
 774                 pid_t ppid;
 775
 776                 if (npid == pid)
 777                         continue;
 778
 779                 /* Ignore processes that aren't our kids */
 780                 if (get_parent_of_pid(npid, &ppid) >= 0 && ppid != mypid)
 781                         continue;
 782
 783                 if (pid != 0) {
 784                         /* Dang, there's more than one daemonized PID
 785                         in this group, so we don't know what process
 786                         is the main process. */
 787                         pid = 0;
 788                         break;
 789                 }
 790
 791                 pid = npid;
 792         }
 793
 794         return pid;
 795 }
 796
 797 int manager_setup_cgroup(Manager *m) {
 798         _cleanup_free_ char *path = NULL;
 799         char *e;
 800         int r;
 801
 802         assert(m);
 803
 804         /* 0. Be nice to Ingo Molnar #628004 */
 805         if (path_is_mount_point("/sys/fs/cgroup/systemd", false) <= 0) {
 806                 log_warning("No control group support available, not creating root group.");
 807                 return 0;
 808         }
 809
 810         /* 1. Determine hierarchy */
 811         free(m->cgroup_root);
 812         m->cgroup_root = NULL;
 813
 814         r = cg_pid_get_path(SYSTEMD_CGROUP_CONTROLLER, 0, &m->cgroup_root);
 815         if (r < 0) {
 816                 log_error("Cannot determine cgroup we are running in: %s", strerror(-r));
 817                 return r;
 818         }
 819
 820         /* LEGACY: Already in /system.slice? If so, let's cut this
 821          * off. This is to support live upgrades from older systemd
 822          * versions where PID 1 was moved there. */
 823         if (m->running_as == SYSTEMD_SYSTEM) {
 824                 e = endswith(m->cgroup_root, "/" SPECIAL_SYSTEM_SLICE);
 825                 if (!e)
 826                         e = endswith(m->cgroup_root, "/system");
 827                 if (e)
 828                         *e = 0;
 829         }
 830
 831         /* And make sure to store away the root value without trailing
 832          * slash, even for the root dir, so that we can easily prepend
 833          * it everywhere. */
 834         if (streq(m->cgroup_root, "/"))
 835                 m->cgroup_root[0] = 0;
 836
 837         /* 2. Show data */
 838         r = cg_get_path(SYSTEMD_CGROUP_CONTROLLER, m->cgroup_root, NULL, &path);
 839         if (r < 0) {
 840                 log_error("Cannot find cgroup mount point: %s", strerror(-r));
 841                 return r;
 842         }
 843
 844         log_debug("Using cgroup controller " SYSTEMD_CGROUP_CONTROLLER ". File system hierarchy is at %s.", path);
 845
 846         /* 3. Install agent */
 847         if (m->running_as == SYSTEMD_SYSTEM) {
 848                 r = cg_install_release_agent(SYSTEMD_CGROUP_CONTROLLER, SYSTEMD_CGROUP_AGENT_PATH);
 849                 if (r < 0)
 850                         log_warning("Failed to install release agent, ignoring: %s", strerror(-r));
 851                 else if (r > 0)
 852                         log_debug("Installed release agent.");
 853                 else
 854                         log_debug("Release agent already installed.");
 855         }
 856
 857         /* 4. Make sure we are in the root cgroup */
 858         r = cg_create_and_attach(SYSTEMD_CGROUP_CONTROLLER, m->cgroup_root, 0);
 859         if (r < 0) {
 860                 log_error("Failed to create root cgroup hierarchy: %s", strerror(-r));
 861                 return r;
 862         }
 863
 864         /* 5. And pin it, so that it cannot be unmounted */
 865         if (m->pin_cgroupfs_fd >= 0)
 866                 close_nointr_nofail(m->pin_cgroupfs_fd);
 867
 868         m->pin_cgroupfs_fd = open(path, O_RDONLY|O_CLOEXEC|O_DIRECTORY|O_NOCTTY|O_NONBLOCK);
 869         if (r < 0) {
 870                 log_error("Failed to open pin file: %m");
 871                 return -errno;
 872         }
 873
 874         /* 6. Figure out which controllers are supported */
 875         m->cgroup_supported = cg_mask_supported();
 876
 877         /* 7.  Always enable hierarchial support if it exists... */
 878         cg_set_attribute("memory", "/", "memory.use_hierarchy", "1");
 879
 880         return 0;
 881 }
 882
 883 void manager_shutdown_cgroup(Manager *m, bool delete) {
 884         assert(m);
 885
 886         /* We can't really delete the group, since we are in it. But
 887          * let's trim it. */
 888         if (delete && m->cgroup_root)
 889                 cg_trim(SYSTEMD_CGROUP_CONTROLLER, m->cgroup_root, false);
 890
 891         if (m->pin_cgroupfs_fd >= 0) {
 892                 close_nointr_nofail(m->pin_cgroupfs_fd);
 893                 m->pin_cgroupfs_fd = -1;
 894         }
 895
 896         free(m->cgroup_root);
 897         m->cgroup_root = NULL;
 898 }
 899
 900 Unit* manager_get_unit_by_cgroup(Manager *m, const char *cgroup) {
 901         char *p;
 902         Unit *u;
 903
 904         assert(m);
 905         assert(cgroup);
 906
 907         u = hashmap_get(m->cgroup_unit, cgroup);
 908         if (u)
 909                 return u;
 910
 911         p = strdupa(cgroup);
 912         for (;;) {
 913                 char *e;
 914
 915                 e = strrchr(p, '/');
 916                 if (e == p || !e)
 917                         return NULL;
 918
 919                 *e = 0;
 920
 921                 u = hashmap_get(m->cgroup_unit, p);
 922                 if (u)
 923                         return u;
 924         }
 925 }
 926
 927 Unit *manager_get_unit_by_pid(Manager *m, pid_t pid) {
 928         _cleanup_free_ char *cgroup = NULL;
 929         int r;
 930
 931         assert(m);
 932
 933         if (pid <= 1)
 934                 return NULL;
 935
 936         r = cg_pid_get_path(SYSTEMD_CGROUP_CONTROLLER, pid, &cgroup);
 937         if (r < 0)
 938                 return NULL;
 939
 940         return manager_get_unit_by_cgroup(m, cgroup);
 941 }
 942
 943 int manager_notify_cgroup_empty(Manager *m, const char *cgroup) {
 944         Unit *u;
 945         int r;
 946
 947         assert(m);
 948         assert(cgroup);
 949
 950         u = manager_get_unit_by_cgroup(m, cgroup);
 951         if (u) {
 952                 r = cg_is_empty_recursive(SYSTEMD_CGROUP_CONTROLLER, u->cgroup_path, true);
 953                 if (r > 0) {
 954                         if (UNIT_VTABLE(u)->notify_cgroup_empty)
 955                                 UNIT_VTABLE(u)->notify_cgroup_empty(u);
 956
 957                         unit_add_to_gc_queue(u);
 958                 }
 959         }
 960
 961         return 0;
 962 }
 963
 964 static const char* const cgroup_device_policy_table[_CGROUP_DEVICE_POLICY_MAX] = {
 965         [CGROUP_AUTO] = "auto",
 966         [CGROUP_CLOSED] = "closed",
 967         [CGROUP_STRICT] = "strict",
 968 };
 969
 970 DEFINE_STRING_TABLE_LOOKUP(cgroup_device_policy, CGroupDevicePolicy);