src/core/cgroup.c

   1 /*-*- Mode: C; c-basic-offset: 8; indent-tabs-mode: nil -*-*/
   2
   3 /***
   4   This file is part of systemd.
   5
   6   Copyright 2013 Lennart Poettering
   7
   8   systemd is free software; you can redistribute it and/or modify it
   9   under the terms of the GNU Lesser General Public License as published by
  10   the Free Software Foundation; either version 2.1 of the License, or
  11   (at your option) any later version.
  12
  13   systemd is distributed in the hope that it will be useful, but
  14   WITHOUT ANY WARRANTY; without even the implied warranty of
  15   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
  16   Lesser General Public License for more details.
  17
  18   You should have received a copy of the GNU Lesser General Public License
  19   along with systemd; If not, see <http://www.gnu.org/licenses/>.
  20 ***/
  21
  22 #include <fcntl.h>
  23 #include <fnmatch.h>
  24
  25 #include "path-util.h"
  26 #include "special.h"
  27 #include "cgroup-util.h"
  28 #include "cgroup.h"
  29
  30 #define CGROUP_CPU_QUOTA_PERIOD_USEC ((usec_t) 100 * USEC_PER_MSEC)
  31
  32 void cgroup_context_init(CGroupContext *c) {
  33         assert(c);
  34
  35         /* Initialize everything to the kernel defaults, assuming the
  36          * structure is preinitialized to 0 */
  37
  38         c->cpu_shares = (unsigned long) -1;
  39         c->startup_cpu_shares = (unsigned long) -1;
  40         c->memory_limit = (uint64_t) -1;
  41         c->blockio_weight = (unsigned long) -1;
  42         c->startup_blockio_weight = (unsigned long) -1;
  43
  44         c->cpu_quota_per_sec_usec = USEC_INFINITY;
  45 }
  46
  47 void cgroup_context_free_device_allow(CGroupContext *c, CGroupDeviceAllow *a) {
  48         assert(c);
  49         assert(a);
  50
  51         LIST_REMOVE(device_allow, c->device_allow, a);
  52         free(a->path);
  53         free(a);
  54 }
  55
  56 void cgroup_context_free_blockio_device_weight(CGroupContext *c, CGroupBlockIODeviceWeight *w) {
  57         assert(c);
  58         assert(w);
  59
  60         LIST_REMOVE(device_weights, c->blockio_device_weights, w);
  61         free(w->path);
  62         free(w);
  63 }
  64
  65 void cgroup_context_free_blockio_device_bandwidth(CGroupContext *c, CGroupBlockIODeviceBandwidth *b) {
  66         assert(c);
  67         assert(b);
  68
  69         LIST_REMOVE(device_bandwidths, c->blockio_device_bandwidths, b);
  70         free(b->path);
  71         free(b);
  72 }
  73
  74 void cgroup_context_done(CGroupContext *c) {
  75         assert(c);
  76
  77         while (c->blockio_device_weights)
  78                 cgroup_context_free_blockio_device_weight(c, c->blockio_device_weights);
  79
  80         while (c->blockio_device_bandwidths)
  81                 cgroup_context_free_blockio_device_bandwidth(c, c->blockio_device_bandwidths);
  82
  83         while (c->device_allow)
  84                 cgroup_context_free_device_allow(c, c->device_allow);
  85 }
  86
  87 void cgroup_context_dump(CGroupContext *c, FILE* f, const char *prefix) {
  88         CGroupBlockIODeviceBandwidth *b;
  89         CGroupBlockIODeviceWeight *w;
  90         CGroupDeviceAllow *a;
  91         char u[FORMAT_TIMESPAN_MAX];
  92
  93         assert(c);
  94         assert(f);
  95
  96         prefix = strempty(prefix);
  97
  98         fprintf(f,
  99                 "%sCPUAccounting=%s\n"
 100                 "%sBlockIOAccounting=%s\n"
 101                 "%sMemoryAccounting=%s\n"
 102                 "%sCPUShares=%lu\n"
 103                 "%sStartupCPUShares=%lu\n"
 104                 "%sCPUQuotaPerSecSec=%s\n"
 105                 "%sBlockIOWeight=%lu\n"
 106                 "%sStartupBlockIOWeight=%lu\n"
 107                 "%sMemoryLimit=%" PRIu64 "\n"
 108                 "%sDevicePolicy=%s\n"
 109                 "%sDelegate=%s\n",
 110                 prefix, yes_no(c->cpu_accounting),
 111                 prefix, yes_no(c->blockio_accounting),
 112                 prefix, yes_no(c->memory_accounting),
 113                 prefix, c->cpu_shares,
 114                 prefix, c->startup_cpu_shares,
 115                 prefix, format_timespan(u, sizeof(u), c->cpu_quota_per_sec_usec, 1),
 116                 prefix, c->blockio_weight,
 117                 prefix, c->startup_blockio_weight,
 118                 prefix, c->memory_limit,
 119                 prefix, cgroup_device_policy_to_string(c->device_policy),
 120                 prefix, yes_no(c->delegate));
 121
 122         LIST_FOREACH(device_allow, a, c->device_allow)
 123                 fprintf(f,
 124                         "%sDeviceAllow=%s %s%s%s\n",
 125                         prefix,
 126                         a->path,
 127                         a->r ? "r" : "", a->w ? "w" : "", a->m ? "m" : "");
 128
 129         LIST_FOREACH(device_weights, w, c->blockio_device_weights)
 130                 fprintf(f,
 131                         "%sBlockIODeviceWeight=%s %lu",
 132                         prefix,
 133                         w->path,
 134                         w->weight);
 135
 136         LIST_FOREACH(device_bandwidths, b, c->blockio_device_bandwidths) {
 137                 char buf[FORMAT_BYTES_MAX];
 138
 139                 fprintf(f,
 140                         "%s%s=%s %s\n",
 141                         prefix,
 142                         b->read ? "BlockIOReadBandwidth" : "BlockIOWriteBandwidth",
 143                         b->path,
 144                         format_bytes(buf, sizeof(buf), b->bandwidth));
 145         }
 146 }
 147
 148 static int lookup_blkio_device(const char *p, dev_t *dev) {
 149         struct stat st;
 150         int r;
 151
 152         assert(p);
 153         assert(dev);
 154
 155         r = stat(p, &st);
 156         if (r < 0)
 157                 return log_warning_errno(errno, "Couldn't stat device %s: %m", p);
 158
 159         if (S_ISBLK(st.st_mode))
 160                 *dev = st.st_rdev;
 161         else if (major(st.st_dev) != 0) {
 162                 /* If this is not a device node then find the block
 163                  * device this file is stored on */
 164                 *dev = st.st_dev;
 165
 166                 /* If this is a partition, try to get the originating
 167                  * block device */
 168                 block_get_whole_disk(*dev, dev);
 169         } else {
 170                 log_warning("%s is not a block device and file system block device cannot be determined or is not local.", p);
 171                 return -ENODEV;
 172         }
 173
 174         return 0;
 175 }
 176
 177 static int whitelist_device(const char *path, const char *node, const char *acc) {
 178         char buf[2+DECIMAL_STR_MAX(dev_t)*2+2+4];
 179         struct stat st;
 180         int r;
 181
 182         assert(path);
 183         assert(acc);
 184
 185         if (stat(node, &st) < 0) {
 186                 log_warning("Couldn't stat device %s", node);
 187                 return -errno;
 188         }
 189
 190         if (!S_ISCHR(st.st_mode) && !S_ISBLK(st.st_mode)) {
 191                 log_warning("%s is not a device.", node);
 192                 return -ENODEV;
 193         }
 194
 195         sprintf(buf,
 196                 "%c %u:%u %s",
 197                 S_ISCHR(st.st_mode) ? 'c' : 'b',
 198                 major(st.st_rdev), minor(st.st_rdev),
 199                 acc);
 200
 201         r = cg_set_attribute("devices", path, "devices.allow", buf);
 202         if (r < 0)
 203                 log_full_errno(IN_SET(r, -ENOENT, -EROFS, -EINVAL) ? LOG_DEBUG : LOG_WARNING, r,
 204                                "Failed to set devices.allow on %s: %m", path);
 205
 206         return r;
 207 }
 208
 209 static int whitelist_major(const char *path, const char *name, char type, const char *acc) {
 210         _cleanup_fclose_ FILE *f = NULL;
 211         char line[LINE_MAX];
 212         bool good = false;
 213         int r;
 214
 215         assert(path);
 216         assert(acc);
 217         assert(type == 'b' || type == 'c');
 218
 219         f = fopen("/proc/devices", "re");
 220         if (!f)
 221                 return log_warning_errno(errno, "Cannot open /proc/devices to resolve %s (%c): %m", name, type);
 222
 223         FOREACH_LINE(line, f, goto fail) {
 224                 char buf[2+DECIMAL_STR_MAX(unsigned)+3+4], *p, *w;
 225                 unsigned maj;
 226
 227                 truncate_nl(line);
 228
 229                 if (type == 'c' && streq(line, "Character devices:")) {
 230                         good = true;
 231                         continue;
 232                 }
 233
 234                 if (type == 'b' && streq(line, "Block devices:")) {
 235                         good = true;
 236                         continue;
 237                 }
 238
 239                 if (isempty(line)) {
 240                         good = false;
 241                         continue;
 242                 }
 243
 244                 if (!good)
 245                         continue;
 246
 247                 p = strstrip(line);
 248
 249                 w = strpbrk(p, WHITESPACE);
 250                 if (!w)
 251                         continue;
 252                 *w = 0;
 253
 254                 r = safe_atou(p, &maj);
 255                 if (r < 0)
 256                         continue;
 257                 if (maj <= 0)
 258                         continue;
 259
 260                 w++;
 261                 w += strspn(w, WHITESPACE);
 262
 263                 if (fnmatch(name, w, 0) != 0)
 264                         continue;
 265
 266                 sprintf(buf,
 267                         "%c %u:* %s",
 268                         type,
 269                         maj,
 270                         acc);
 271
 272                 r = cg_set_attribute("devices", path, "devices.allow", buf);
 273                 if (r < 0)
 274                         log_full_errno(IN_SET(r, -ENOENT, -EROFS, -EINVAL) ? LOG_DEBUG : LOG_WARNING, r,
 275                                        "Failed to set devices.allow on %s: %m", path);
 276         }
 277
 278         return 0;
 279
 280 fail:
 281         log_warning_errno(errno, "Failed to read /proc/devices: %m");
 282         return -errno;
 283 }
 284
 285 void cgroup_context_apply(CGroupContext *c, CGroupControllerMask mask, const char *path, ManagerState state) {
 286         bool is_root;
 287         int r;
 288
 289         assert(c);
 290         assert(path);
 291
 292         if (mask == 0)
 293                 return;
 294
 295         /* Some cgroup attributes are not support on the root cgroup,
 296          * hence silently ignore */
 297         is_root = isempty(path) || path_equal(path, "/");
 298
 299         /* We generally ignore errors caused by read-only mounted
 300          * cgroup trees (assuming we are running in a container then),
 301          * and missing cgroups, i.e. EROFS and ENOENT. */
 302
 303         if ((mask & CGROUP_CPU) && !is_root) {
 304                 char buf[MAX(DECIMAL_STR_MAX(unsigned long), DECIMAL_STR_MAX(usec_t)) + 1];
 305
 306                 sprintf(buf, "%lu\n",
 307                         IN_SET(state, MANAGER_STARTING, MANAGER_INITIALIZING) && c->startup_cpu_shares != (unsigned long) -1 ? c->startup_cpu_shares :
 308                         c->cpu_shares != (unsigned long) -1 ? c->cpu_shares : 1024);
 309                 r = cg_set_attribute("cpu", path, "cpu.shares", buf);
 310                 if (r < 0)
 311                         log_full_errno(IN_SET(r, -ENOENT, -EROFS) ? LOG_DEBUG : LOG_WARNING, r,
 312                                        "Failed to set cpu.shares on %s: %m", path);
 313
 314                 sprintf(buf, USEC_FMT "\n", CGROUP_CPU_QUOTA_PERIOD_USEC);
 315                 r = cg_set_attribute("cpu", path, "cpu.cfs_period_us", buf);
 316                 if (r < 0)
 317                         log_full_errno(IN_SET(r, -ENOENT, -EROFS) ? LOG_DEBUG : LOG_WARNING, r,
 318                                        "Failed to set cpu.cfs_period_us on %s: %m", path);
 319
 320                 if (c->cpu_quota_per_sec_usec != USEC_INFINITY) {
 321                         sprintf(buf, USEC_FMT "\n", c->cpu_quota_per_sec_usec * CGROUP_CPU_QUOTA_PERIOD_USEC / USEC_PER_SEC);
 322                         r = cg_set_attribute("cpu", path, "cpu.cfs_quota_us", buf);
 323                 } else
 324                         r = cg_set_attribute("cpu", path, "cpu.cfs_quota_us", "-1");
 325                 if (r < 0)
 326                         log_full_errno(IN_SET(r, -ENOENT, -EROFS) ? LOG_DEBUG : LOG_WARNING, r,
 327                                        "Failed to set cpu.cfs_quota_us on %s: %m", path);
 328         }
 329
 330         if (mask & CGROUP_BLKIO) {
 331                 char buf[MAX3(DECIMAL_STR_MAX(unsigned long)+1,
 332                               DECIMAL_STR_MAX(dev_t)*2+2+DECIMAL_STR_MAX(unsigned long)*1,
 333                               DECIMAL_STR_MAX(dev_t)*2+2+DECIMAL_STR_MAX(uint64_t)+1)];
 334                 CGroupBlockIODeviceWeight *w;
 335                 CGroupBlockIODeviceBandwidth *b;
 336
 337                 if (!is_root) {
 338                         sprintf(buf, "%lu\n", IN_SET(state, MANAGER_STARTING, MANAGER_INITIALIZING) && c->startup_blockio_weight != (unsigned long) -1 ? c->startup_blockio_weight :
 339                                 c->blockio_weight != (unsigned long) -1 ? c->blockio_weight : 1000);
 340                         r = cg_set_attribute("blkio", path, "blkio.weight", buf);
 341                         if (r < 0)
 342                                 log_full_errno(IN_SET(r, -ENOENT, -EROFS) ? LOG_DEBUG : LOG_WARNING, r,
 343                                                "Failed to set blkio.weight on %s: %m", path);
 344
 345                         /* FIXME: no way to reset this list */
 346                         LIST_FOREACH(device_weights, w, c->blockio_device_weights) {
 347                                 dev_t dev;
 348
 349                                 r = lookup_blkio_device(w->path, &dev);
 350                                 if (r < 0)
 351                                         continue;
 352
 353                                 sprintf(buf, "%u:%u %lu", major(dev), minor(dev), w->weight);
 354                                 r = cg_set_attribute("blkio", path, "blkio.weight_device", buf);
 355                                 if (r < 0)
 356                                         log_full_errno(IN_SET(r, -ENOENT, -EROFS) ? LOG_DEBUG : LOG_WARNING, r,
 357                                                        "Failed to set blkio.weight_device on %s: %m", path);
 358                         }
 359                 }
 360
 361                 /* FIXME: no way to reset this list */
 362                 LIST_FOREACH(device_bandwidths, b, c->blockio_device_bandwidths) {
 363                         const char *a;
 364                         dev_t dev;
 365
 366                         r = lookup_blkio_device(b->path, &dev);
 367                         if (r < 0)
 368                                 continue;
 369
 370                         a = b->read ? "blkio.throttle.read_bps_device" : "blkio.throttle.write_bps_device";
 371
 372                         sprintf(buf, "%u:%u %" PRIu64 "\n", major(dev), minor(dev), b->bandwidth);
 373                         r = cg_set_attribute("blkio", path, a, buf);
 374                         if (r < 0)
 375                                 log_full_errno(IN_SET(r, -ENOENT, -EROFS) ? LOG_DEBUG : LOG_WARNING, r,
 376                                                "Failed to set %s on %s: %m", a, path);
 377                 }
 378         }
 379
 380         if (mask & CGROUP_MEMORY) {
 381                 if (c->memory_limit != (uint64_t) -1) {
 382                         char buf[DECIMAL_STR_MAX(uint64_t) + 1];
 383
 384                         sprintf(buf, "%" PRIu64 "\n", c->memory_limit);
 385                         r = cg_set_attribute("memory", path, "memory.limit_in_bytes", buf);
 386                 } else
 387                         r = cg_set_attribute("memory", path, "memory.limit_in_bytes", "-1");
 388
 389                 if (r < 0)
 390                         log_full_errno(IN_SET(r, -ENOENT, -EROFS) ? LOG_DEBUG : LOG_WARNING, r,
 391                                        "Failed to set memory.limit_in_bytes on %s: %m", path);
 392         }
 393
 394         if ((mask & CGROUP_DEVICE) && !is_root) {
 395                 CGroupDeviceAllow *a;
 396
 397                 /* Changing the devices list of a populated cgroup
 398                  * might result in EINVAL, hence ignore EINVAL
 399                  * here. */
 400
 401                 if (c->device_allow || c->device_policy != CGROUP_AUTO)
 402                         r = cg_set_attribute("devices", path, "devices.deny", "a");
 403                 else
 404                         r = cg_set_attribute("devices", path, "devices.allow", "a");
 405                 if (r < 0)
 406                         log_full_errno(IN_SET(r, -ENOENT, -EROFS, -EINVAL) ? LOG_DEBUG : LOG_WARNING, r,
 407                                        "Failed to reset devices.list on %s: %m", path);
 408
 409                 if (c->device_policy == CGROUP_CLOSED ||
 410                     (c->device_policy == CGROUP_AUTO && c->device_allow)) {
 411                         static const char auto_devices[] =
 412                                 "/dev/null\0" "rwm\0"
 413                                 "/dev/zero\0" "rwm\0"
 414                                 "/dev/full\0" "rwm\0"
 415                                 "/dev/random\0" "rwm\0"
 416                                 "/dev/urandom\0" "rwm\0"
 417                                 "/dev/tty\0" "rwm\0"
 418                                 "/dev/pts/ptmx\0" "rw\0"; /* /dev/pts/ptmx may not be duplicated, but accessed */
 419
 420                         const char *x, *y;
 421
 422                         NULSTR_FOREACH_PAIR(x, y, auto_devices)
 423                                 whitelist_device(path, x, y);
 424
 425                         whitelist_major(path, "pts", 'c', "rw");
 426                         whitelist_major(path, "kdbus", 'c', "rw");
 427                         whitelist_major(path, "kdbus/*", 'c', "rw");
 428                 }
 429
 430                 LIST_FOREACH(device_allow, a, c->device_allow) {
 431                         char acc[4];
 432                         unsigned k = 0;
 433
 434                         if (a->r)
 435                                 acc[k++] = 'r';
 436                         if (a->w)
 437                                 acc[k++] = 'w';
 438                         if (a->m)
 439                                 acc[k++] = 'm';
 440
 441                         if (k == 0)
 442                                 continue;
 443
 444                         acc[k++] = 0;
 445
 446                         if (startswith(a->path, "/dev/"))
 447                                 whitelist_device(path, a->path, acc);
 448                         else if (startswith(a->path, "block-"))
 449                                 whitelist_major(path, a->path + 6, 'b', acc);
 450                         else if (startswith(a->path, "char-"))
 451                                 whitelist_major(path, a->path + 5, 'c', acc);
 452                         else
 453                                 log_debug("Ignoring device %s while writing cgroup attribute.", a->path);
 454                 }
 455         }
 456 }
 457
 458 CGroupControllerMask cgroup_context_get_mask(CGroupContext *c) {
 459         CGroupControllerMask mask = 0;
 460
 461         /* Figure out which controllers we need */
 462
 463         if (c->cpu_accounting ||
 464             c->cpu_shares != (unsigned long) -1 ||
 465             c->startup_cpu_shares != (unsigned long) -1 ||
 466             c->cpu_quota_per_sec_usec != USEC_INFINITY)
 467                 mask |= CGROUP_CPUACCT | CGROUP_CPU;
 468
 469         if (c->blockio_accounting ||
 470             c->blockio_weight != (unsigned long) -1 ||
 471             c->startup_blockio_weight != (unsigned long) -1 ||
 472             c->blockio_device_weights ||
 473             c->blockio_device_bandwidths)
 474                 mask |= CGROUP_BLKIO;
 475
 476         if (c->memory_accounting ||
 477             c->memory_limit != (uint64_t) -1)
 478                 mask |= CGROUP_MEMORY;
 479
 480         if (c->device_allow ||
 481             c->device_policy != CGROUP_AUTO)
 482                 mask |= CGROUP_DEVICE;
 483
 484         return mask;
 485 }
 486
 487 CGroupControllerMask unit_get_cgroup_mask(Unit *u) {
 488         CGroupContext *c;
 489
 490         c = unit_get_cgroup_context(u);
 491         if (!c)
 492                 return 0;
 493
 494         /* If delegation is turned on, then turn on all cgroups,
 495          * unless the process we fork into it is known to drop
 496          * privileges anyway, and shouldn't get access to the
 497          * controllers anyway. */
 498
 499         if (c->delegate) {
 500                 ExecContext *e;
 501
 502                 e = unit_get_exec_context(u);
 503                 if (!e || exec_context_maintains_privileges(e))
 504                         return _CGROUP_CONTROLLER_MASK_ALL;
 505         }
 506
 507         return cgroup_context_get_mask(c);
 508 }
 509
 510 CGroupControllerMask unit_get_members_mask(Unit *u) {
 511         assert(u);
 512
 513         if (u->cgroup_members_mask_valid)
 514                 return u->cgroup_members_mask;
 515
 516         u->cgroup_members_mask = 0;
 517
 518         if (u->type == UNIT_SLICE) {
 519                 Unit *member;
 520                 Iterator i;
 521
 522                 SET_FOREACH(member, u->dependencies[UNIT_BEFORE], i) {
 523
 524                         if (member == u)
 525                                 continue;
 526
 527                         if (UNIT_DEREF(member->slice) != u)
 528                                 continue;
 529
 530                         u->cgroup_members_mask |=
 531                                 unit_get_cgroup_mask(member) |
 532                                 unit_get_members_mask(member);
 533                 }
 534         }
 535
 536         u->cgroup_members_mask_valid = true;
 537         return u->cgroup_members_mask;
 538 }
 539
 540 CGroupControllerMask unit_get_siblings_mask(Unit *u) {
 541         assert(u);
 542
 543         if (UNIT_ISSET(u->slice))
 544                 return unit_get_members_mask(UNIT_DEREF(u->slice));
 545
 546         return unit_get_cgroup_mask(u) | unit_get_members_mask(u);
 547 }
 548
 549 CGroupControllerMask unit_get_target_mask(Unit *u) {
 550         CGroupControllerMask mask;
 551
 552         mask = unit_get_cgroup_mask(u) | unit_get_members_mask(u) | unit_get_siblings_mask(u);
 553         mask &= u->manager->cgroup_supported;
 554
 555         return mask;
 556 }
 557
 558 /* Recurse from a unit up through its containing slices, propagating
 559  * mask bits upward. A unit is also member of itself. */
 560 void unit_update_cgroup_members_masks(Unit *u) {
 561         CGroupControllerMask m;
 562         bool more;
 563
 564         assert(u);
 565
 566         /* Calculate subtree mask */
 567         m = unit_get_cgroup_mask(u) | unit_get_members_mask(u);
 568
 569         /* See if anything changed from the previous invocation. If
 570          * not, we're done. */
 571         if (u->cgroup_subtree_mask_valid && m == u->cgroup_subtree_mask)
 572                 return;
 573
 574         more =
 575                 u->cgroup_subtree_mask_valid &&
 576                 ((m & ~u->cgroup_subtree_mask) != 0) &&
 577                 ((~m & u->cgroup_subtree_mask) == 0);
 578
 579         u->cgroup_subtree_mask = m;
 580         u->cgroup_subtree_mask_valid = true;
 581
 582         if (UNIT_ISSET(u->slice)) {
 583                 Unit *s = UNIT_DEREF(u->slice);
 584
 585                 if (more)
 586                         /* There's more set now than before. We
 587                          * propagate the new mask to the parent's mask
 588                          * (not caring if it actually was valid or
 589                          * not). */
 590
 591                         s->cgroup_members_mask |= m;
 592
 593                 else
 594                         /* There's less set now than before (or we
 595                          * don't know), we need to recalculate
 596                          * everything, so let's invalidate the
 597                          * parent's members mask */
 598
 599                         s->cgroup_members_mask_valid = false;
 600
 601                 /* And now make sure that this change also hits our
 602                  * grandparents */
 603                 unit_update_cgroup_members_masks(s);
 604         }
 605 }
 606
 607 static const char *migrate_callback(CGroupControllerMask mask, void *userdata) {
 608         Unit *u = userdata;
 609
 610         assert(mask != 0);
 611         assert(u);
 612
 613         while (u) {
 614                 if (u->cgroup_path &&
 615                     u->cgroup_realized &&
 616                     (u->cgroup_realized_mask & mask) == mask)
 617                         return u->cgroup_path;
 618
 619                 u = UNIT_DEREF(u->slice);
 620         }
 621
 622         return NULL;
 623 }
 624
 625 static int unit_create_cgroups(Unit *u, CGroupControllerMask mask) {
 626         CGroupContext *c;
 627         int r;
 628
 629         assert(u);
 630
 631         c = unit_get_cgroup_context(u);
 632         if (!c)
 633                 return 0;
 634
 635         if (!u->cgroup_path) {
 636                 _cleanup_free_ char *path = NULL;
 637
 638                 path = unit_default_cgroup_path(u);
 639                 if (!path)
 640                         return log_oom();
 641
 642                 r = hashmap_put(u->manager->cgroup_unit, path, u);
 643                 if (r < 0) {
 644                         log_error(r == -EEXIST ? "cgroup %s exists already: %s" : "hashmap_put failed for %s: %s", path, strerror(-r));
 645                         return r;
 646                 }
 647                 if (r > 0) {
 648                         u->cgroup_path = path;
 649                         path = NULL;
 650                 }
 651         }
 652
 653         /* First, create our own group */
 654         r = cg_create_everywhere(u->manager->cgroup_supported, mask, u->cgroup_path);
 655         if (r < 0)
 656                 return log_error_errno(r, "Failed to create cgroup %s: %m", u->cgroup_path);
 657
 658         /* Keep track that this is now realized */
 659         u->cgroup_realized = true;
 660         u->cgroup_realized_mask = mask;
 661
 662         if (u->type != UNIT_SLICE && !c->delegate) {
 663
 664                 /* Then, possibly move things over, but not if
 665                  * subgroups may contain processes, which is the case
 666                  * for slice and delegation units. */
 667                 r = cg_migrate_everywhere(u->manager->cgroup_supported, u->cgroup_path, u->cgroup_path, migrate_callback, u);
 668                 if (r < 0)
 669                         log_warning_errno(r, "Failed to migrate cgroup from to %s: %m", u->cgroup_path);
 670         }
 671
 672         return 0;
 673 }
 674
 675 int unit_attach_pids_to_cgroup(Unit *u) {
 676         int r;
 677         assert(u);
 678
 679         r = unit_realize_cgroup(u);
 680         if (r < 0)
 681                 return r;
 682
 683         r = cg_attach_many_everywhere(u->manager->cgroup_supported, u->cgroup_path, u->pids, migrate_callback, u);
 684         if (r < 0)
 685                 return r;
 686
 687         return 0;
 688 }
 689
 690 static bool unit_has_mask_realized(Unit *u, CGroupControllerMask mask) {
 691         assert(u);
 692
 693         return u->cgroup_realized && u->cgroup_realized_mask == mask;
 694 }
 695
 696 /* Check if necessary controllers and attributes for a unit are in place.
 697  *
 698  * If so, do nothing.
 699  * If not, create paths, move processes over, and set attributes.
 700  *
 701  * Returns 0 on success and < 0 on failure. */
 702 static int unit_realize_cgroup_now(Unit *u, ManagerState state) {
 703         CGroupControllerMask mask;
 704         int r;
 705
 706         assert(u);
 707
 708         if (u->in_cgroup_queue) {
 709                 LIST_REMOVE(cgroup_queue, u->manager->cgroup_queue, u);
 710                 u->in_cgroup_queue = false;
 711         }
 712
 713         mask = unit_get_target_mask(u);
 714
 715         if (unit_has_mask_realized(u, mask))
 716                 return 0;
 717
 718         /* First, realize parents */
 719         if (UNIT_ISSET(u->slice)) {
 720                 r = unit_realize_cgroup_now(UNIT_DEREF(u->slice), state);
 721                 if (r < 0)
 722                         return r;
 723         }
 724
 725         /* And then do the real work */
 726         r = unit_create_cgroups(u, mask);
 727         if (r < 0)
 728                 return r;
 729
 730         /* Finally, apply the necessary attributes. */
 731         cgroup_context_apply(unit_get_cgroup_context(u), mask, u->cgroup_path, state);
 732
 733         return 0;
 734 }
 735
 736 static void unit_add_to_cgroup_queue(Unit *u) {
 737
 738         if (u->in_cgroup_queue)
 739                 return;
 740
 741         LIST_PREPEND(cgroup_queue, u->manager->cgroup_queue, u);
 742         u->in_cgroup_queue = true;
 743 }
 744
 745 unsigned manager_dispatch_cgroup_queue(Manager *m) {
 746         ManagerState state;
 747         unsigned n = 0;
 748         Unit *i;
 749         int r;
 750
 751         state = manager_state(m);
 752
 753         while ((i = m->cgroup_queue)) {
 754                 assert(i->in_cgroup_queue);
 755
 756                 r = unit_realize_cgroup_now(i, state);
 757                 if (r < 0)
 758                         log_warning_errno(r, "Failed to realize cgroups for queued unit %s: %m", i->id);
 759
 760                 n++;
 761         }
 762
 763         return n;
 764 }
 765
 766 static void unit_queue_siblings(Unit *u) {
 767         Unit *slice;
 768
 769         /* This adds the siblings of the specified unit and the
 770          * siblings of all parent units to the cgroup queue. (But
 771          * neither the specified unit itself nor the parents.) */
 772
 773         while ((slice = UNIT_DEREF(u->slice))) {
 774                 Iterator i;
 775                 Unit *m;
 776
 777                 SET_FOREACH(m, slice->dependencies[UNIT_BEFORE], i) {
 778                         if (m == u)
 779                                 continue;
 780
 781                         /* Skip units that have a dependency on the slice
 782                          * but aren't actually in it. */
 783                         if (UNIT_DEREF(m->slice) != slice)
 784                                 continue;
 785
 786                         /* No point in doing cgroup application for units
 787                          * without active processes. */
 788                         if (UNIT_IS_INACTIVE_OR_FAILED(unit_active_state(m)))
 789                                 continue;
 790
 791                         /* If the unit doesn't need any new controllers
 792                          * and has current ones realized, it doesn't need
 793                          * any changes. */
 794                         if (unit_has_mask_realized(m, unit_get_target_mask(m)))
 795                                 continue;
 796
 797                         unit_add_to_cgroup_queue(m);
 798                 }
 799
 800                 u = slice;
 801         }
 802 }
 803
 804 int unit_realize_cgroup(Unit *u) {
 805         CGroupContext *c;
 806
 807         assert(u);
 808
 809         c = unit_get_cgroup_context(u);
 810         if (!c)
 811                 return 0;
 812
 813         /* So, here's the deal: when realizing the cgroups for this
 814          * unit, we need to first create all parents, but there's more
 815          * actually: for the weight-based controllers we also need to
 816          * make sure that all our siblings (i.e. units that are in the
 817          * same slice as we are) have cgroups, too. Otherwise, things
 818          * would become very uneven as each of their processes would
 819          * get as much resources as all our group together. This call
 820          * will synchronously create the parent cgroups, but will
 821          * defer work on the siblings to the next event loop
 822          * iteration. */
 823
 824         /* Add all sibling slices to the cgroup queue. */
 825         unit_queue_siblings(u);
 826
 827         /* And realize this one now (and apply the values) */
 828         return unit_realize_cgroup_now(u, manager_state(u->manager));
 829 }
 830
 831 void unit_destroy_cgroup_if_empty(Unit *u) {
 832         int r;
 833
 834         assert(u);
 835
 836         if (!u->cgroup_path)
 837                 return;
 838
 839         r = cg_trim_everywhere(u->manager->cgroup_supported, u->cgroup_path, !unit_has_name(u, SPECIAL_ROOT_SLICE));
 840         if (r < 0) {
 841                 log_debug_errno(r, "Failed to destroy cgroup %s: %m", u->cgroup_path);
 842                 return;
 843         }
 844
 845         hashmap_remove(u->manager->cgroup_unit, u->cgroup_path);
 846
 847         free(u->cgroup_path);
 848         u->cgroup_path = NULL;
 849         u->cgroup_realized = false;
 850         u->cgroup_realized_mask = 0;
 851 }
 852
 853 pid_t unit_search_main_pid(Unit *u) {
 854         _cleanup_fclose_ FILE *f = NULL;
 855         pid_t pid = 0, npid, mypid;
 856
 857         assert(u);
 858
 859         if (!u->cgroup_path)
 860                 return 0;
 861
 862         if (cg_enumerate_processes(SYSTEMD_CGROUP_CONTROLLER, u->cgroup_path, &f) < 0)
 863                 return 0;
 864
 865         mypid = getpid();
 866         while (cg_read_pid(f, &npid) > 0)  {
 867                 pid_t ppid;
 868
 869                 if (npid == pid)
 870                         continue;
 871
 872                 /* Ignore processes that aren't our kids */
 873                 if (get_parent_of_pid(npid, &ppid) >= 0 && ppid != mypid)
 874                         continue;
 875
 876                 if (pid != 0) {
 877                         /* Dang, there's more than one daemonized PID
 878                         in this group, so we don't know what process
 879                         is the main process. */
 880                         pid = 0;
 881                         break;
 882                 }
 883
 884                 pid = npid;
 885         }
 886
 887         return pid;
 888 }
 889
 890 int manager_setup_cgroup(Manager *m) {
 891         _cleanup_free_ char *path = NULL;
 892         int r;
 893
 894         assert(m);
 895
 896         /* 1. Determine hierarchy */
 897         free(m->cgroup_root);
 898         m->cgroup_root = NULL;
 899
 900         r = cg_pid_get_path(SYSTEMD_CGROUP_CONTROLLER, 0, &m->cgroup_root);
 901         if (r < 0)
 902                 return log_error_errno(r, "Cannot determine cgroup we are running in: %m");
 903
 904         /* LEGACY: Already in /system.slice? If so, let's cut this
 905          * off. This is to support live upgrades from older systemd
 906          * versions where PID 1 was moved there. */
 907         if (m->running_as == SYSTEMD_SYSTEM) {
 908                 char *e;
 909
 910                 e = endswith(m->cgroup_root, "/" SPECIAL_SYSTEM_SLICE);
 911                 if (!e)
 912                         e = endswith(m->cgroup_root, "/system");
 913                 if (e)
 914                         *e = 0;
 915         }
 916
 917         /* And make sure to store away the root value without trailing
 918          * slash, even for the root dir, so that we can easily prepend
 919          * it everywhere. */
 920         if (streq(m->cgroup_root, "/"))
 921                 m->cgroup_root[0] = 0;
 922
 923         /* 2. Show data */
 924         r = cg_get_path(SYSTEMD_CGROUP_CONTROLLER, m->cgroup_root, NULL, &path);
 925         if (r < 0)
 926                 return log_error_errno(r, "Cannot find cgroup mount point: %m");
 927
 928         log_debug("Using cgroup controller " SYSTEMD_CGROUP_CONTROLLER ". File system hierarchy is at %s.", path);
 929         if (!m->test_run) {
 930
 931                 /* 3. Install agent */
 932                 if (m->running_as == SYSTEMD_SYSTEM) {
 933                         r = cg_install_release_agent(SYSTEMD_CGROUP_CONTROLLER, SYSTEMD_CGROUP_AGENT_PATH);
 934                         if (r < 0)
 935                                 log_warning_errno(r, "Failed to install release agent, ignoring: %m");
 936                         else if (r > 0)
 937                                 log_debug("Installed release agent.");
 938                         else
 939                                 log_debug("Release agent already installed.");
 940                 }
 941
 942                 /* 4. Make sure we are in the root cgroup */
 943                 r = cg_create_and_attach(SYSTEMD_CGROUP_CONTROLLER, m->cgroup_root, 0);
 944                 if (r < 0)
 945                         return log_error_errno(r, "Failed to create root cgroup hierarchy: %m");
 946
 947                 /* 5. And pin it, so that it cannot be unmounted */
 948                 safe_close(m->pin_cgroupfs_fd);
 949
 950                 m->pin_cgroupfs_fd = open(path, O_RDONLY|O_CLOEXEC|O_DIRECTORY|O_NOCTTY|O_NONBLOCK);
 951                 if (m->pin_cgroupfs_fd < 0)
 952                         return log_error_errno(errno, "Failed to open pin file: %m");
 953
 954                 /* 6.  Always enable hierarchial support if it exists... */
 955                 cg_set_attribute("memory", "/", "memory.use_hierarchy", "1");
 956         }
 957
 958         /* 7. Figure out which controllers are supported */
 959         m->cgroup_supported = cg_mask_supported();
 960
 961         return 0;
 962 }
 963
 964 void manager_shutdown_cgroup(Manager *m, bool delete) {
 965         assert(m);
 966
 967         /* We can't really delete the group, since we are in it. But
 968          * let's trim it. */
 969         if (delete && m->cgroup_root)
 970                 cg_trim(SYSTEMD_CGROUP_CONTROLLER, m->cgroup_root, false);
 971
 972         m->pin_cgroupfs_fd = safe_close(m->pin_cgroupfs_fd);
 973
 974         free(m->cgroup_root);
 975         m->cgroup_root = NULL;
 976 }
 977
 978 Unit* manager_get_unit_by_cgroup(Manager *m, const char *cgroup) {
 979         char *p;
 980         Unit *u;
 981
 982         assert(m);
 983         assert(cgroup);
 984
 985         u = hashmap_get(m->cgroup_unit, cgroup);
 986         if (u)
 987                 return u;
 988
 989         p = strdupa(cgroup);
 990         for (;;) {
 991                 char *e;
 992
 993                 e = strrchr(p, '/');
 994                 if (e == p || !e)
 995                         return NULL;
 996
 997                 *e = 0;
 998
 999                 u = hashmap_get(m->cgroup_unit, p);
1000                 if (u)
1001                         return u;
1002         }
1003 }
1004
1005 Unit *manager_get_unit_by_pid(Manager *m, pid_t pid) {
1006         _cleanup_free_ char *cgroup = NULL;
1007         int r;
1008
1009         assert(m);
1010
1011         if (pid <= 1)
1012                 return NULL;
1013
1014         r = cg_pid_get_path(SYSTEMD_CGROUP_CONTROLLER, pid, &cgroup);
1015         if (r < 0)
1016                 return NULL;
1017
1018         return manager_get_unit_by_cgroup(m, cgroup);
1019 }
1020
1021 int manager_notify_cgroup_empty(Manager *m, const char *cgroup) {
1022         Unit *u;
1023         int r;
1024
1025         assert(m);
1026         assert(cgroup);
1027
1028         u = manager_get_unit_by_cgroup(m, cgroup);
1029         if (u) {
1030                 r = cg_is_empty_recursive(SYSTEMD_CGROUP_CONTROLLER, u->cgroup_path, true);
1031                 if (r > 0) {
1032                         if (UNIT_VTABLE(u)->notify_cgroup_empty)
1033                                 UNIT_VTABLE(u)->notify_cgroup_empty(u);
1034
1035                         unit_add_to_gc_queue(u);
1036                 }
1037         }
1038
1039         return 0;
1040 }
1041
1042 static const char* const cgroup_device_policy_table[_CGROUP_DEVICE_POLICY_MAX] = {
1043         [CGROUP_AUTO] = "auto",
1044         [CGROUP_CLOSED] = "closed",
1045         [CGROUP_STRICT] = "strict",
1046 };
1047
1048 DEFINE_STRING_TABLE_LOOKUP(cgroup_device_policy, CGroupDevicePolicy);