src/core/cgroup.c

   1 /*-*- Mode: C; c-basic-offset: 8; indent-tabs-mode: nil -*-*/
   2
   3 /***
   4   This file is part of systemd.
   5
   6   Copyright 2013 Lennart Poettering
   7
   8   systemd is free software; you can redistribute it and/or modify it
   9   under the terms of the GNU Lesser General Public License as published by
  10   the Free Software Foundation; either version 2.1 of the License, or
  11   (at your option) any later version.
  12
  13   systemd is distributed in the hope that it will be useful, but
  14   WITHOUT ANY WARRANTY; without even the implied warranty of
  15   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
  16   Lesser General Public License for more details.
  17
  18   You should have received a copy of the GNU Lesser General Public License
  19   along with systemd; If not, see <http://www.gnu.org/licenses/>.
  20 ***/
  21
  22 #include <fcntl.h>
  23 #include <fnmatch.h>
  24
  25 #include "process-util.h"
  26 #include "path-util.h"
  27 #include "special.h"
  28 #include "cgroup-util.h"
  29 #include "cgroup.h"
  30
  31 #define CGROUP_CPU_QUOTA_PERIOD_USEC ((usec_t) 100 * USEC_PER_MSEC)
  32
  33 void cgroup_context_init(CGroupContext *c) {
  34         assert(c);
  35
  36         /* Initialize everything to the kernel defaults, assuming the
  37          * structure is preinitialized to 0 */
  38
  39         c->cpu_shares = (unsigned long) -1;
  40         c->startup_cpu_shares = (unsigned long) -1;
  41         c->memory_limit = (uint64_t) -1;
  42         c->blockio_weight = (unsigned long) -1;
  43         c->startup_blockio_weight = (unsigned long) -1;
  44
  45         c->cpu_quota_per_sec_usec = USEC_INFINITY;
  46 }
  47
  48 void cgroup_context_free_device_allow(CGroupContext *c, CGroupDeviceAllow *a) {
  49         assert(c);
  50         assert(a);
  51
  52         LIST_REMOVE(device_allow, c->device_allow, a);
  53         free(a->path);
  54         free(a);
  55 }
  56
  57 void cgroup_context_free_blockio_device_weight(CGroupContext *c, CGroupBlockIODeviceWeight *w) {
  58         assert(c);
  59         assert(w);
  60
  61         LIST_REMOVE(device_weights, c->blockio_device_weights, w);
  62         free(w->path);
  63         free(w);
  64 }
  65
  66 void cgroup_context_free_blockio_device_bandwidth(CGroupContext *c, CGroupBlockIODeviceBandwidth *b) {
  67         assert(c);
  68         assert(b);
  69
  70         LIST_REMOVE(device_bandwidths, c->blockio_device_bandwidths, b);
  71         free(b->path);
  72         free(b);
  73 }
  74
  75 void cgroup_context_done(CGroupContext *c) {
  76         assert(c);
  77
  78         while (c->blockio_device_weights)
  79                 cgroup_context_free_blockio_device_weight(c, c->blockio_device_weights);
  80
  81         while (c->blockio_device_bandwidths)
  82                 cgroup_context_free_blockio_device_bandwidth(c, c->blockio_device_bandwidths);
  83
  84         while (c->device_allow)
  85                 cgroup_context_free_device_allow(c, c->device_allow);
  86 }
  87
  88 /// UNNEEDED by elogind
  89 #if 0
  90 void cgroup_context_dump(CGroupContext *c, FILE* f, const char *prefix) {
  91         CGroupBlockIODeviceBandwidth *b;
  92         CGroupBlockIODeviceWeight *w;
  93         CGroupDeviceAllow *a;
  94         char u[FORMAT_TIMESPAN_MAX];
  95
  96         assert(c);
  97         assert(f);
  98
  99         prefix = strempty(prefix);
 100
 101         fprintf(f,
 102                 "%sCPUAccounting=%s\n"
 103                 "%sBlockIOAccounting=%s\n"
 104                 "%sMemoryAccounting=%s\n"
 105                 "%sCPUShares=%lu\n"
 106                 "%sStartupCPUShares=%lu\n"
 107                 "%sCPUQuotaPerSecSec=%s\n"
 108                 "%sBlockIOWeight=%lu\n"
 109                 "%sStartupBlockIOWeight=%lu\n"
 110                 "%sMemoryLimit=%" PRIu64 "\n"
 111                 "%sDevicePolicy=%s\n"
 112                 "%sDelegate=%s\n",
 113                 prefix, yes_no(c->cpu_accounting),
 114                 prefix, yes_no(c->blockio_accounting),
 115                 prefix, yes_no(c->memory_accounting),
 116                 prefix, c->cpu_shares,
 117                 prefix, c->startup_cpu_shares,
 118                 prefix, format_timespan(u, sizeof(u), c->cpu_quota_per_sec_usec, 1),
 119                 prefix, c->blockio_weight,
 120                 prefix, c->startup_blockio_weight,
 121                 prefix, c->memory_limit,
 122                 prefix, cgroup_device_policy_to_string(c->device_policy),
 123                 prefix, yes_no(c->delegate));
 124
 125         LIST_FOREACH(device_allow, a, c->device_allow)
 126                 fprintf(f,
 127                         "%sDeviceAllow=%s %s%s%s\n",
 128                         prefix,
 129                         a->path,
 130                         a->r ? "r" : "", a->w ? "w" : "", a->m ? "m" : "");
 131
 132         LIST_FOREACH(device_weights, w, c->blockio_device_weights)
 133                 fprintf(f,
 134                         "%sBlockIODeviceWeight=%s %lu",
 135                         prefix,
 136                         w->path,
 137                         w->weight);
 138
 139         LIST_FOREACH(device_bandwidths, b, c->blockio_device_bandwidths) {
 140                 char buf[FORMAT_BYTES_MAX];
 141
 142                 fprintf(f,
 143                         "%s%s=%s %s\n",
 144                         prefix,
 145                         b->read ? "BlockIOReadBandwidth" : "BlockIOWriteBandwidth",
 146                         b->path,
 147                         format_bytes(buf, sizeof(buf), b->bandwidth));
 148         }
 149 }
 150
 151 static int lookup_blkio_device(const char *p, dev_t *dev) {
 152         struct stat st;
 153         int r;
 154
 155         assert(p);
 156         assert(dev);
 157
 158         r = stat(p, &st);
 159         if (r < 0)
 160                 return log_warning_errno(errno, "Couldn't stat device %s: %m", p);
 161
 162         if (S_ISBLK(st.st_mode))
 163                 *dev = st.st_rdev;
 164         else if (major(st.st_dev) != 0) {
 165                 /* If this is not a device node then find the block
 166                  * device this file is stored on */
 167                 *dev = st.st_dev;
 168
 169                 /* If this is a partition, try to get the originating
 170                  * block device */
 171                 block_get_whole_disk(*dev, dev);
 172         } else {
 173                 log_warning("%s is not a block device and file system block device cannot be determined or is not local.", p);
 174                 return -ENODEV;
 175         }
 176
 177         return 0;
 178 }
 179
 180 static int whitelist_device(const char *path, const char *node, const char *acc) {
 181         char buf[2+DECIMAL_STR_MAX(dev_t)*2+2+4];
 182         struct stat st;
 183         int r;
 184
 185         assert(path);
 186         assert(acc);
 187
 188         if (stat(node, &st) < 0) {
 189                 log_warning("Couldn't stat device %s", node);
 190                 return -errno;
 191         }
 192
 193         if (!S_ISCHR(st.st_mode) && !S_ISBLK(st.st_mode)) {
 194                 log_warning("%s is not a device.", node);
 195                 return -ENODEV;
 196         }
 197
 198         sprintf(buf,
 199                 "%c %u:%u %s",
 200                 S_ISCHR(st.st_mode) ? 'c' : 'b',
 201                 major(st.st_rdev), minor(st.st_rdev),
 202                 acc);
 203
 204         r = cg_set_attribute("devices", path, "devices.allow", buf);
 205         if (r < 0)
 206                 log_full_errno(IN_SET(r, -ENOENT, -EROFS, -EINVAL) ? LOG_DEBUG : LOG_WARNING, r,
 207                                "Failed to set devices.allow on %s: %m", path);
 208
 209         return r;
 210 }
 211
 212 static int whitelist_major(const char *path, const char *name, char type, const char *acc) {
 213         _cleanup_fclose_ FILE *f = NULL;
 214         char line[LINE_MAX];
 215         bool good = false;
 216         int r;
 217
 218         assert(path);
 219         assert(acc);
 220         assert(type == 'b' || type == 'c');
 221
 222         f = fopen("/proc/devices", "re");
 223         if (!f)
 224                 return log_warning_errno(errno, "Cannot open /proc/devices to resolve %s (%c): %m", name, type);
 225
 226         FOREACH_LINE(line, f, goto fail) {
 227                 char buf[2+DECIMAL_STR_MAX(unsigned)+3+4], *p, *w;
 228                 unsigned maj;
 229
 230                 truncate_nl(line);
 231
 232                 if (type == 'c' && streq(line, "Character devices:")) {
 233                         good = true;
 234                         continue;
 235                 }
 236
 237                 if (type == 'b' && streq(line, "Block devices:")) {
 238                         good = true;
 239                         continue;
 240                 }
 241
 242                 if (isempty(line)) {
 243                         good = false;
 244                         continue;
 245                 }
 246
 247                 if (!good)
 248                         continue;
 249
 250                 p = strstrip(line);
 251
 252                 w = strpbrk(p, WHITESPACE);
 253                 if (!w)
 254                         continue;
 255                 *w = 0;
 256
 257                 r = safe_atou(p, &maj);
 258                 if (r < 0)
 259                         continue;
 260                 if (maj <= 0)
 261                         continue;
 262
 263                 w++;
 264                 w += strspn(w, WHITESPACE);
 265
 266                 if (fnmatch(name, w, 0) != 0)
 267                         continue;
 268
 269                 sprintf(buf,
 270                         "%c %u:* %s",
 271                         type,
 272                         maj,
 273                         acc);
 274
 275                 r = cg_set_attribute("devices", path, "devices.allow", buf);
 276                 if (r < 0)
 277                         log_full_errno(IN_SET(r, -ENOENT, -EROFS, -EINVAL) ? LOG_DEBUG : LOG_WARNING, r,
 278                                        "Failed to set devices.allow on %s: %m", path);
 279         }
 280
 281         return 0;
 282
 283 fail:
 284         log_warning_errno(errno, "Failed to read /proc/devices: %m");
 285         return -errno;
 286 }
 287
 288 void cgroup_context_apply(CGroupContext *c, CGroupMask mask, const char *path, ManagerState state) {
 289         bool is_root;
 290         int r;
 291
 292         assert(c);
 293         assert(path);
 294
 295         if (mask == 0)
 296                 return;
 297
 298         /* Some cgroup attributes are not supported on the root cgroup,
 299          * hence silently ignore */
 300         is_root = isempty(path) || path_equal(path, "/");
 301         if (is_root)
 302                 /* Make sure we don't try to display messages with an empty path. */
 303                 path = "/";
 304
 305         /* We generally ignore errors caused by read-only mounted
 306          * cgroup trees (assuming we are running in a container then),
 307          * and missing cgroups, i.e. EROFS and ENOENT. */
 308
 309         if ((mask & CGROUP_MASK_CPU) && !is_root) {
 310                 char buf[MAX(DECIMAL_STR_MAX(unsigned long), DECIMAL_STR_MAX(usec_t)) + 1];
 311
 312                 sprintf(buf, "%lu\n",
 313                         IN_SET(state, MANAGER_STARTING, MANAGER_INITIALIZING) && c->startup_cpu_shares != (unsigned long) -1 ? c->startup_cpu_shares :
 314                         c->cpu_shares != (unsigned long) -1 ? c->cpu_shares : 1024);
 315                 r = cg_set_attribute("cpu", path, "cpu.shares", buf);
 316                 if (r < 0)
 317                         log_full_errno(IN_SET(r, -ENOENT, -EROFS) ? LOG_DEBUG : LOG_WARNING, r,
 318                                        "Failed to set cpu.shares on %s: %m", path);
 319
 320                 sprintf(buf, USEC_FMT "\n", CGROUP_CPU_QUOTA_PERIOD_USEC);
 321                 r = cg_set_attribute("cpu", path, "cpu.cfs_period_us", buf);
 322                 if (r < 0)
 323                         log_full_errno(IN_SET(r, -ENOENT, -EROFS) ? LOG_DEBUG : LOG_WARNING, r,
 324                                        "Failed to set cpu.cfs_period_us on %s: %m", path);
 325
 326                 if (c->cpu_quota_per_sec_usec != USEC_INFINITY) {
 327                         sprintf(buf, USEC_FMT "\n", c->cpu_quota_per_sec_usec * CGROUP_CPU_QUOTA_PERIOD_USEC / USEC_PER_SEC);
 328                         r = cg_set_attribute("cpu", path, "cpu.cfs_quota_us", buf);
 329                 } else
 330                         r = cg_set_attribute("cpu", path, "cpu.cfs_quota_us", "-1");
 331                 if (r < 0)
 332                         log_full_errno(IN_SET(r, -ENOENT, -EROFS) ? LOG_DEBUG : LOG_WARNING, r,
 333                                        "Failed to set cpu.cfs_quota_us on %s: %m", path);
 334         }
 335
 336         if (mask & CGROUP_MASK_BLKIO) {
 337                 char buf[MAX3(DECIMAL_STR_MAX(unsigned long)+1,
 338                               DECIMAL_STR_MAX(dev_t)*2+2+DECIMAL_STR_MAX(unsigned long)*1,
 339                               DECIMAL_STR_MAX(dev_t)*2+2+DECIMAL_STR_MAX(uint64_t)+1)];
 340                 CGroupBlockIODeviceWeight *w;
 341                 CGroupBlockIODeviceBandwidth *b;
 342
 343                 if (!is_root) {
 344                         sprintf(buf, "%lu\n", IN_SET(state, MANAGER_STARTING, MANAGER_INITIALIZING) && c->startup_blockio_weight != (unsigned long) -1 ? c->startup_blockio_weight :
 345                                 c->blockio_weight != (unsigned long) -1 ? c->blockio_weight : 1000);
 346                         r = cg_set_attribute("blkio", path, "blkio.weight", buf);
 347                         if (r < 0)
 348                                 log_full_errno(IN_SET(r, -ENOENT, -EROFS) ? LOG_DEBUG : LOG_WARNING, r,
 349                                                "Failed to set blkio.weight on %s: %m", path);
 350
 351                         /* FIXME: no way to reset this list */
 352                         LIST_FOREACH(device_weights, w, c->blockio_device_weights) {
 353                                 dev_t dev;
 354
 355                                 r = lookup_blkio_device(w->path, &dev);
 356                                 if (r < 0)
 357                                         continue;
 358
 359                                 sprintf(buf, "%u:%u %lu", major(dev), minor(dev), w->weight);
 360                                 r = cg_set_attribute("blkio", path, "blkio.weight_device", buf);
 361                                 if (r < 0)
 362                                         log_full_errno(IN_SET(r, -ENOENT, -EROFS) ? LOG_DEBUG : LOG_WARNING, r,
 363                                                        "Failed to set blkio.weight_device on %s: %m", path);
 364                         }
 365                 }
 366
 367                 /* FIXME: no way to reset this list */
 368                 LIST_FOREACH(device_bandwidths, b, c->blockio_device_bandwidths) {
 369                         const char *a;
 370                         dev_t dev;
 371
 372                         r = lookup_blkio_device(b->path, &dev);
 373                         if (r < 0)
 374                                 continue;
 375
 376                         a = b->read ? "blkio.throttle.read_bps_device" : "blkio.throttle.write_bps_device";
 377
 378                         sprintf(buf, "%u:%u %" PRIu64 "\n", major(dev), minor(dev), b->bandwidth);
 379                         r = cg_set_attribute("blkio", path, a, buf);
 380                         if (r < 0)
 381                                 log_full_errno(IN_SET(r, -ENOENT, -EROFS) ? LOG_DEBUG : LOG_WARNING, r,
 382                                                "Failed to set %s on %s: %m", a, path);
 383                 }
 384         }
 385
 386         if ((mask & CGROUP_MASK_MEMORY) && !is_root) {
 387                 if (c->memory_limit != (uint64_t) -1) {
 388                         char buf[DECIMAL_STR_MAX(uint64_t) + 1];
 389
 390                         sprintf(buf, "%" PRIu64 "\n", c->memory_limit);
 391
 392                         if (cg_unified() <= 0)
 393                                 r = cg_set_attribute("memory", path, "memory.limit_in_bytes", buf);
 394                         else
 395                                 r = cg_set_attribute("memory", path, "memory.max", buf);
 396
 397                 } else {
 398                         if (cg_unified() <= 0)
 399                                 r = cg_set_attribute("memory", path, "memory.limit_in_bytes", "-1");
 400                         else
 401                                 r = cg_set_attribute("memory", path, "memory.max", "max");
 402                 }
 403
 404                 if (r < 0)
 405                         log_full_errno(IN_SET(r, -ENOENT, -EROFS) ? LOG_DEBUG : LOG_WARNING, r,
 406                                        "Failed to set memory.limit_in_bytes/memory.max on %s: %m", path);
 407         }
 408
 409         if ((mask & CGROUP_MASK_DEVICE) && !is_root) {
 410                 CGroupDeviceAllow *a;
 411
 412                 /* Changing the devices list of a populated cgroup
 413                  * might result in EINVAL, hence ignore EINVAL
 414                  * here. */
 415
 416                 if (c->device_allow || c->device_policy != CGROUP_AUTO)
 417                         r = cg_set_attribute("devices", path, "devices.deny", "a");
 418                 else
 419                         r = cg_set_attribute("devices", path, "devices.allow", "a");
 420                 if (r < 0)
 421                         log_full_errno(IN_SET(r, -ENOENT, -EROFS, -EINVAL) ? LOG_DEBUG : LOG_WARNING, r,
 422                                        "Failed to reset devices.list on %s: %m", path);
 423
 424                 if (c->device_policy == CGROUP_CLOSED ||
 425                     (c->device_policy == CGROUP_AUTO && c->device_allow)) {
 426                         static const char auto_devices[] =
 427                                 "/dev/null\0" "rwm\0"
 428                                 "/dev/zero\0" "rwm\0"
 429                                 "/dev/full\0" "rwm\0"
 430                                 "/dev/random\0" "rwm\0"
 431                                 "/dev/urandom\0" "rwm\0"
 432                                 "/dev/tty\0" "rwm\0"
 433                                 "/dev/pts/ptmx\0" "rw\0"; /* /dev/pts/ptmx may not be duplicated, but accessed */
 434
 435                         const char *x, *y;
 436
 437                         NULSTR_FOREACH_PAIR(x, y, auto_devices)
 438                                 whitelist_device(path, x, y);
 439
 440                         whitelist_major(path, "pts", 'c', "rw");
 441                         whitelist_major(path, "kdbus", 'c', "rw");
 442                         whitelist_major(path, "kdbus/*", 'c', "rw");
 443                 }
 444
 445                 LIST_FOREACH(device_allow, a, c->device_allow) {
 446                         char acc[4];
 447                         unsigned k = 0;
 448
 449                         if (a->r)
 450                                 acc[k++] = 'r';
 451                         if (a->w)
 452                                 acc[k++] = 'w';
 453                         if (a->m)
 454                                 acc[k++] = 'm';
 455
 456                         if (k == 0)
 457                                 continue;
 458
 459                         acc[k++] = 0;
 460
 461                         if (startswith(a->path, "/dev/"))
 462                                 whitelist_device(path, a->path, acc);
 463                         else if (startswith(a->path, "block-"))
 464                                 whitelist_major(path, a->path + 6, 'b', acc);
 465                         else if (startswith(a->path, "char-"))
 466                                 whitelist_major(path, a->path + 5, 'c', acc);
 467                         else
 468                                 log_debug("Ignoring device %s while writing cgroup attribute.", a->path);
 469                 }
 470         }
 471 }
 472 #endif // 0
 473
 474 CGroupMask cgroup_context_get_mask(CGroupContext *c) {
 475         CGroupMask mask = 0;
 476
 477         /* Figure out which controllers we need */
 478
 479         if (c->cpu_accounting ||
 480             c->cpu_shares != (unsigned long) -1 ||
 481             c->startup_cpu_shares != (unsigned long) -1 ||
 482             c->cpu_quota_per_sec_usec != USEC_INFINITY)
 483                 mask |= CGROUP_MASK_CPUACCT | CGROUP_MASK_CPU;
 484
 485         if (c->blockio_accounting ||
 486             c->blockio_weight != (unsigned long) -1 ||
 487             c->startup_blockio_weight != (unsigned long) -1 ||
 488             c->blockio_device_weights ||
 489             c->blockio_device_bandwidths)
 490                 mask |= CGROUP_MASK_BLKIO;
 491
 492         if (c->memory_accounting ||
 493             c->memory_limit != (uint64_t) -1)
 494                 mask |= CGROUP_MASK_MEMORY;
 495
 496         if (c->device_allow ||
 497             c->device_policy != CGROUP_AUTO)
 498                 mask |= CGROUP_MASK_DEVICE;
 499
 500         return mask;
 501 }
 502
 503 /// UNNEEDED by elogind
 504 #if 0
 505 CGroupMask unit_get_own_mask(Unit *u) {
 506         CGroupContext *c;
 507
 508         /* Returns the mask of controllers the unit needs for itself */
 509
 510         c = unit_get_cgroup_context(u);
 511         if (!c)
 512                 return 0;
 513
 514         /* If delegation is turned on, then turn on all cgroups,
 515          * unless we are on the legacy hierarchy and the process we
 516          * fork into it is known to drop privileges, and hence
 517          * shouldn't get access to the controllers.
 518          *
 519          * Note that on the unified hierarchy it is safe to delegate
 520          * controllers to unprivileged services. */
 521
 522         if (c->delegate) {
 523                 ExecContext *e;
 524
 525                 e = unit_get_exec_context(u);
 526                 if (!e ||
 527                     exec_context_maintains_privileges(e) ||
 528                     cg_unified() > 0)
 529                         return _CGROUP_MASK_ALL;
 530         }
 531
 532         return cgroup_context_get_mask(c);
 533 }
 534
 535 CGroupMask unit_get_members_mask(Unit *u) {
 536         assert(u);
 537
 538         /* Returns the mask of controllers all of the unit's children
 539          * require, merged */
 540
 541         if (u->cgroup_members_mask_valid)
 542                 return u->cgroup_members_mask;
 543
 544         u->cgroup_members_mask = 0;
 545
 546         if (u->type == UNIT_SLICE) {
 547                 Unit *member;
 548                 Iterator i;
 549
 550                 SET_FOREACH(member, u->dependencies[UNIT_BEFORE], i) {
 551
 552                         if (member == u)
 553                                 continue;
 554
 555                         if (UNIT_DEREF(member->slice) != u)
 556                                 continue;
 557
 558                         u->cgroup_members_mask |=
 559                                 unit_get_own_mask(member) |
 560                                 unit_get_members_mask(member);
 561                 }
 562         }
 563
 564         u->cgroup_members_mask_valid = true;
 565         return u->cgroup_members_mask;
 566 }
 567
 568 CGroupMask unit_get_siblings_mask(Unit *u) {
 569         assert(u);
 570
 571         /* Returns the mask of controllers all of the unit's siblings
 572          * require, i.e. the members mask of the unit's parent slice
 573          * if there is one. */
 574
 575         if (UNIT_ISSET(u->slice))
 576                 return unit_get_members_mask(UNIT_DEREF(u->slice));
 577
 578         return unit_get_own_mask(u) | unit_get_members_mask(u);
 579 }
 580
 581 CGroupMask unit_get_subtree_mask(Unit *u) {
 582
 583         /* Returns the mask of this subtree, meaning of the group
 584          * itself and its children. */
 585
 586         return unit_get_own_mask(u) | unit_get_members_mask(u);
 587 }
 588
 589 CGroupMask unit_get_target_mask(Unit *u) {
 590         CGroupMask mask;
 591
 592         /* This returns the cgroup mask of all controllers to enable
 593          * for a specific cgroup, i.e. everything it needs itself,
 594          * plus all that its children need, plus all that its siblings
 595          * need. This is primarily useful on the legacy cgroup
 596          * hierarchy, where we need to duplicate each cgroup in each
 597          * hierarchy that shall be enabled for it. */
 598
 599         mask = unit_get_own_mask(u) | unit_get_members_mask(u) | unit_get_siblings_mask(u);
 600         mask &= u->manager->cgroup_supported;
 601
 602         return mask;
 603 }
 604
 605 CGroupMask unit_get_enable_mask(Unit *u) {
 606         CGroupMask mask;
 607
 608         /* This returns the cgroup mask of all controllers to enable
 609          * for the children of a specific cgroup. This is primarily
 610          * useful for the unified cgroup hierarchy, where each cgroup
 611          * controls which controllers are enabled for its children. */
 612
 613         mask = unit_get_members_mask(u);
 614         mask &= u->manager->cgroup_supported;
 615
 616         return mask;
 617 }
 618
 619 /* Recurse from a unit up through its containing slices, propagating
 620  * mask bits upward. A unit is also member of itself. */
 621 void unit_update_cgroup_members_masks(Unit *u) {
 622         CGroupMask m;
 623         bool more;
 624
 625         assert(u);
 626
 627         /* Calculate subtree mask */
 628         m = unit_get_subtree_mask(u);
 629
 630         /* See if anything changed from the previous invocation. If
 631          * not, we're done. */
 632         if (u->cgroup_subtree_mask_valid && m == u->cgroup_subtree_mask)
 633                 return;
 634
 635         more =
 636                 u->cgroup_subtree_mask_valid &&
 637                 ((m & ~u->cgroup_subtree_mask) != 0) &&
 638                 ((~m & u->cgroup_subtree_mask) == 0);
 639
 640         u->cgroup_subtree_mask = m;
 641         u->cgroup_subtree_mask_valid = true;
 642
 643         if (UNIT_ISSET(u->slice)) {
 644                 Unit *s = UNIT_DEREF(u->slice);
 645
 646                 if (more)
 647                         /* There's more set now than before. We
 648                          * propagate the new mask to the parent's mask
 649                          * (not caring if it actually was valid or
 650                          * not). */
 651
 652                         s->cgroup_members_mask |= m;
 653
 654                 else
 655                         /* There's less set now than before (or we
 656                          * don't know), we need to recalculate
 657                          * everything, so let's invalidate the
 658                          * parent's members mask */
 659
 660                         s->cgroup_members_mask_valid = false;
 661
 662                 /* And now make sure that this change also hits our
 663                  * grandparents */
 664                 unit_update_cgroup_members_masks(s);
 665         }
 666 }
 667
 668 static const char *migrate_callback(CGroupMask mask, void *userdata) {
 669         Unit *u = userdata;
 670
 671         assert(mask != 0);
 672         assert(u);
 673
 674         while (u) {
 675                 if (u->cgroup_path &&
 676                     u->cgroup_realized &&
 677                     (u->cgroup_realized_mask & mask) == mask)
 678                         return u->cgroup_path;
 679
 680                 u = UNIT_DEREF(u->slice);
 681         }
 682
 683         return NULL;
 684 }
 685
 686 char *unit_default_cgroup_path(Unit *u) {
 687         _cleanup_free_ char *escaped = NULL, *slice = NULL;
 688         int r;
 689
 690         assert(u);
 691
 692         if (unit_has_name(u, SPECIAL_ROOT_SLICE))
 693                 return strdup(u->manager->cgroup_root);
 694
 695         if (UNIT_ISSET(u->slice) && !unit_has_name(UNIT_DEREF(u->slice), SPECIAL_ROOT_SLICE)) {
 696                 r = cg_slice_to_path(UNIT_DEREF(u->slice)->id, &slice);
 697                 if (r < 0)
 698                         return NULL;
 699         }
 700
 701         escaped = cg_escape(u->id);
 702         if (!escaped)
 703                 return NULL;
 704
 705         if (slice)
 706                 return strjoin(u->manager->cgroup_root, "/", slice, "/", escaped, NULL);
 707         else
 708                 return strjoin(u->manager->cgroup_root, "/", escaped, NULL);
 709 }
 710
 711 int unit_set_cgroup_path(Unit *u, const char *path) {
 712         _cleanup_free_ char *p = NULL;
 713         int r;
 714
 715         assert(u);
 716
 717         if (path) {
 718                 p = strdup(path);
 719                 if (!p)
 720                         return -ENOMEM;
 721         } else
 722                 p = NULL;
 723
 724         if (streq_ptr(u->cgroup_path, p))
 725                 return 0;
 726
 727         if (p) {
 728                 r = hashmap_put(u->manager->cgroup_unit, p, u);
 729                 if (r < 0)
 730                         return r;
 731         }
 732
 733         unit_release_cgroup(u);
 734
 735         u->cgroup_path = p;
 736         p = NULL;
 737
 738         return 1;
 739 }
 740
 741 int unit_watch_cgroup(Unit *u) {
 742         _cleanup_free_ char *populated = NULL;
 743         int r;
 744
 745         assert(u);
 746
 747         if (!u->cgroup_path)
 748                 return 0;
 749
 750         if (u->cgroup_inotify_wd >= 0)
 751                 return 0;
 752
 753         /* Only applies to the unified hierarchy */
 754         r = cg_unified();
 755         if (r < 0)
 756                 return log_unit_error_errno(u, r, "Failed detect wether the unified hierarchy is used: %m");
 757         if (r == 0)
 758                 return 0;
 759
 760         /* Don't watch the root slice, it's pointless. */
 761         if (unit_has_name(u, SPECIAL_ROOT_SLICE))
 762                 return 0;
 763
 764         r = hashmap_ensure_allocated(&u->manager->cgroup_inotify_wd_unit, &trivial_hash_ops);
 765         if (r < 0)
 766                 return log_oom();
 767
 768         r = cg_get_path(ELOGIND_CGROUP_CONTROLLER, u->cgroup_path, "cgroup.populated", &populated);
 769         if (r < 0)
 770                 return log_oom();
 771
 772         u->cgroup_inotify_wd = inotify_add_watch(u->manager->cgroup_inotify_fd, populated, IN_MODIFY);
 773         if (u->cgroup_inotify_wd < 0) {
 774
 775                 if (errno == ENOENT) /* If the directory is already
 776                                       * gone we don't need to track
 777                                       * it, so this is not an error */
 778                         return 0;
 779
 780                 return log_unit_error_errno(u, errno, "Failed to add inotify watch descriptor for control group %s: %m", u->cgroup_path);
 781         }
 782
 783         r = hashmap_put(u->manager->cgroup_inotify_wd_unit, INT_TO_PTR(u->cgroup_inotify_wd), u);
 784         if (r < 0)
 785                 return log_unit_error_errno(u, r, "Failed to add inotify watch descriptor to hash map: %m");
 786
 787         return 0;
 788 }
 789
 790 static int unit_create_cgroup(
 791                 Unit *u,
 792                 CGroupMask target_mask,
 793                 CGroupMask enable_mask) {
 794
 795         CGroupContext *c;
 796         int r;
 797
 798         assert(u);
 799
 800         c = unit_get_cgroup_context(u);
 801         if (!c)
 802                 return 0;
 803
 804         if (!u->cgroup_path) {
 805                 _cleanup_free_ char *path = NULL;
 806
 807                 path = unit_default_cgroup_path(u);
 808                 if (!path)
 809                         return log_oom();
 810
 811                 r = unit_set_cgroup_path(u, path);
 812                 if (r == -EEXIST)
 813                         return log_unit_error_errno(u, r, "Control group %s exists already.", path);
 814                 if (r < 0)
 815                         return log_unit_error_errno(u, r, "Failed to set unit's control group path to %s: %m", path);
 816         }
 817
 818         /* First, create our own group */
 819         r = cg_create_everywhere(u->manager->cgroup_supported, target_mask, u->cgroup_path);
 820         if (r < 0)
 821                 return log_unit_error_errno(u, r, "Failed to create cgroup %s: %m", u->cgroup_path);
 822
 823         /* Start watching it */
 824         (void) unit_watch_cgroup(u);
 825
 826         /* Enable all controllers we need */
 827         r = cg_enable_everywhere(u->manager->cgroup_supported, enable_mask, u->cgroup_path);
 828         if (r < 0)
 829                 log_unit_warning_errno(u, r, "Failed to enable controllers on cgroup %s, ignoring: %m", u->cgroup_path);
 830
 831         /* Keep track that this is now realized */
 832         u->cgroup_realized = true;
 833         u->cgroup_realized_mask = target_mask;
 834
 835         if (u->type != UNIT_SLICE && !c->delegate) {
 836
 837                 /* Then, possibly move things over, but not if
 838                  * subgroups may contain processes, which is the case
 839                  * for slice and delegation units. */
 840                 r = cg_migrate_everywhere(u->manager->cgroup_supported, u->cgroup_path, u->cgroup_path, migrate_callback, u);
 841                 if (r < 0)
 842                         log_unit_warning_errno(u, r, "Failed to migrate cgroup from to %s, ignoring: %m", u->cgroup_path);
 843         }
 844
 845         return 0;
 846 }
 847
 848 int unit_attach_pids_to_cgroup(Unit *u) {
 849         int r;
 850         assert(u);
 851
 852         r = unit_realize_cgroup(u);
 853         if (r < 0)
 854                 return r;
 855
 856         r = cg_attach_many_everywhere(u->manager->cgroup_supported, u->cgroup_path, u->pids, migrate_callback, u);
 857         if (r < 0)
 858                 return r;
 859
 860         return 0;
 861 }
 862
 863 static bool unit_has_mask_realized(Unit *u, CGroupMask target_mask) {
 864         assert(u);
 865
 866         return u->cgroup_realized && u->cgroup_realized_mask == target_mask;
 867 }
 868
 869 /* Check if necessary controllers and attributes for a unit are in place.
 870  *
 871  * If so, do nothing.
 872  * If not, create paths, move processes over, and set attributes.
 873  *
 874  * Returns 0 on success and < 0 on failure. */
 875 static int unit_realize_cgroup_now(Unit *u, ManagerState state) {
 876         CGroupMask target_mask, enable_mask;
 877         int r;
 878
 879         assert(u);
 880
 881         if (u->in_cgroup_queue) {
 882                 LIST_REMOVE(cgroup_queue, u->manager->cgroup_queue, u);
 883                 u->in_cgroup_queue = false;
 884         }
 885
 886         target_mask = unit_get_target_mask(u);
 887         if (unit_has_mask_realized(u, target_mask))
 888                 return 0;
 889
 890         /* First, realize parents */
 891         if (UNIT_ISSET(u->slice)) {
 892                 r = unit_realize_cgroup_now(UNIT_DEREF(u->slice), state);
 893                 if (r < 0)
 894                         return r;
 895         }
 896
 897         /* And then do the real work */
 898         enable_mask = unit_get_enable_mask(u);
 899         r = unit_create_cgroup(u, target_mask, enable_mask);
 900         if (r < 0)
 901                 return r;
 902
 903         /* Finally, apply the necessary attributes. */
 904         cgroup_context_apply(unit_get_cgroup_context(u), target_mask, u->cgroup_path, state);
 905
 906         return 0;
 907 }
 908
 909 static void unit_add_to_cgroup_queue(Unit *u) {
 910
 911         if (u->in_cgroup_queue)
 912                 return;
 913
 914         LIST_PREPEND(cgroup_queue, u->manager->cgroup_queue, u);
 915         u->in_cgroup_queue = true;
 916 }
 917
 918 unsigned manager_dispatch_cgroup_queue(Manager *m) {
 919         ManagerState state;
 920         unsigned n = 0;
 921         Unit *i;
 922         int r;
 923
 924         state = manager_state(m);
 925
 926         while ((i = m->cgroup_queue)) {
 927                 assert(i->in_cgroup_queue);
 928
 929                 r = unit_realize_cgroup_now(i, state);
 930                 if (r < 0)
 931                         log_warning_errno(r, "Failed to realize cgroups for queued unit %s, ignoring: %m", i->id);
 932
 933                 n++;
 934         }
 935
 936         return n;
 937 }
 938
 939 static void unit_queue_siblings(Unit *u) {
 940         Unit *slice;
 941
 942         /* This adds the siblings of the specified unit and the
 943          * siblings of all parent units to the cgroup queue. (But
 944          * neither the specified unit itself nor the parents.) */
 945
 946         while ((slice = UNIT_DEREF(u->slice))) {
 947                 Iterator i;
 948                 Unit *m;
 949
 950                 SET_FOREACH(m, slice->dependencies[UNIT_BEFORE], i) {
 951                         if (m == u)
 952                                 continue;
 953
 954                         /* Skip units that have a dependency on the slice
 955                          * but aren't actually in it. */
 956                         if (UNIT_DEREF(m->slice) != slice)
 957                                 continue;
 958
 959                         /* No point in doing cgroup application for units
 960                          * without active processes. */
 961                         if (UNIT_IS_INACTIVE_OR_FAILED(unit_active_state(m)))
 962                                 continue;
 963
 964                         /* If the unit doesn't need any new controllers
 965                          * and has current ones realized, it doesn't need
 966                          * any changes. */
 967                         if (unit_has_mask_realized(m, unit_get_target_mask(m)))
 968                                 continue;
 969
 970                         unit_add_to_cgroup_queue(m);
 971                 }
 972
 973                 u = slice;
 974         }
 975 }
 976
 977 int unit_realize_cgroup(Unit *u) {
 978         assert(u);
 979
 980         if (!UNIT_HAS_CGROUP_CONTEXT(u))
 981                 return 0;
 982
 983         /* So, here's the deal: when realizing the cgroups for this
 984          * unit, we need to first create all parents, but there's more
 985          * actually: for the weight-based controllers we also need to
 986          * make sure that all our siblings (i.e. units that are in the
 987          * same slice as we are) have cgroups, too. Otherwise, things
 988          * would become very uneven as each of their processes would
 989          * get as much resources as all our group together. This call
 990          * will synchronously create the parent cgroups, but will
 991          * defer work on the siblings to the next event loop
 992          * iteration. */
 993
 994         /* Add all sibling slices to the cgroup queue. */
 995         unit_queue_siblings(u);
 996
 997         /* And realize this one now (and apply the values) */
 998         return unit_realize_cgroup_now(u, manager_state(u->manager));
 999 }
1000
1001 void unit_release_cgroup(Unit *u) {
1002         assert(u);
1003
1004         /* Forgets all cgroup details for this cgroup */
1005
1006         if (u->cgroup_path) {
1007                 (void) hashmap_remove(u->manager->cgroup_unit, u->cgroup_path);
1008                 u->cgroup_path = mfree(u->cgroup_path);
1009         }
1010
1011         if (u->cgroup_inotify_wd >= 0) {
1012                 if (inotify_rm_watch(u->manager->cgroup_inotify_fd, u->cgroup_inotify_wd) < 0)
1013                         log_unit_debug_errno(u, errno, "Failed to remove cgroup inotify watch %i for %s, ignoring", u->cgroup_inotify_wd, u->id);
1014
1015                 (void) hashmap_remove(u->manager->cgroup_inotify_wd_unit, INT_TO_PTR(u->cgroup_inotify_wd));
1016                 u->cgroup_inotify_wd = -1;
1017         }
1018 }
1019
1020 void unit_prune_cgroup(Unit *u) {
1021         int r;
1022         bool is_root_slice;
1023
1024         assert(u);
1025
1026         /* Removes the cgroup, if empty and possible, and stops watching it. */
1027
1028         if (!u->cgroup_path)
1029                 return;
1030
1031         is_root_slice = unit_has_name(u, SPECIAL_ROOT_SLICE);
1032
1033         r = cg_trim_everywhere(u->manager->cgroup_supported, u->cgroup_path, !is_root_slice);
1034         if (r < 0) {
1035                 log_debug_errno(r, "Failed to destroy cgroup %s, ignoring: %m", u->cgroup_path);
1036                 return;
1037         }
1038
1039         if (is_root_slice)
1040                 return;
1041
1042         unit_release_cgroup(u);
1043
1044         u->cgroup_realized = false;
1045         u->cgroup_realized_mask = 0;
1046 }
1047
1048 int unit_search_main_pid(Unit *u, pid_t *ret) {
1049         _cleanup_fclose_ FILE *f = NULL;
1050         pid_t pid = 0, npid, mypid;
1051         int r;
1052
1053         assert(u);
1054         assert(ret);
1055
1056         if (!u->cgroup_path)
1057                 return -ENXIO;
1058
1059         r = cg_enumerate_processes(ELOGIND_CGROUP_CONTROLLER, u->cgroup_path, &f);
1060         if (r < 0)
1061                 return r;
1062
1063         mypid = getpid();
1064         while (cg_read_pid(f, &npid) > 0)  {
1065                 pid_t ppid;
1066
1067                 if (npid == pid)
1068                         continue;
1069
1070                 /* Ignore processes that aren't our kids */
1071                 if (get_parent_of_pid(npid, &ppid) >= 0 && ppid != mypid)
1072                         continue;
1073
1074                 if (pid != 0)
1075                         /* Dang, there's more than one daemonized PID
1076                         in this group, so we don't know what process
1077                         is the main process. */
1078
1079                         return -ENODATA;
1080
1081                 pid = npid;
1082         }
1083
1084         *ret = pid;
1085         return 0;
1086 }
1087
1088 static int unit_watch_pids_in_path(Unit *u, const char *path) {
1089         _cleanup_closedir_ DIR *d = NULL;
1090         _cleanup_fclose_ FILE *f = NULL;
1091         int ret = 0, r;
1092
1093         assert(u);
1094         assert(path);
1095
1096         r = cg_enumerate_processes(ELOGIND_CGROUP_CONTROLLER, path, &f);
1097         if (r < 0)
1098                 ret = r;
1099         else {
1100                 pid_t pid;
1101
1102                 while ((r = cg_read_pid(f, &pid)) > 0) {
1103                         r = unit_watch_pid(u, pid);
1104                         if (r < 0 && ret >= 0)
1105                                 ret = r;
1106                 }
1107
1108                 if (r < 0 && ret >= 0)
1109                         ret = r;
1110         }
1111
1112         r = cg_enumerate_subgroups(ELOGIND_CGROUP_CONTROLLER, path, &d);
1113         if (r < 0) {
1114                 if (ret >= 0)
1115                         ret = r;
1116         } else {
1117                 char *fn;
1118
1119                 while ((r = cg_read_subgroup(d, &fn)) > 0) {
1120                         _cleanup_free_ char *p = NULL;
1121
1122                         p = strjoin(path, "/", fn, NULL);
1123                         free(fn);
1124
1125                         if (!p)
1126                                 return -ENOMEM;
1127
1128                         r = unit_watch_pids_in_path(u, p);
1129                         if (r < 0 && ret >= 0)
1130                                 ret = r;
1131                 }
1132
1133                 if (r < 0 && ret >= 0)
1134                         ret = r;
1135         }
1136
1137         return ret;
1138 }
1139
1140 int unit_watch_all_pids(Unit *u) {
1141         assert(u);
1142
1143         /* Adds all PIDs from our cgroup to the set of PIDs we
1144          * watch. This is a fallback logic for cases where we do not
1145          * get reliable cgroup empty notifications: we try to use
1146          * SIGCHLD as replacement. */
1147
1148         if (!u->cgroup_path)
1149                 return -ENOENT;
1150
1151         if (cg_unified() > 0) /* On unified we can use proper notifications */
1152                 return 0;
1153
1154         return unit_watch_pids_in_path(u, u->cgroup_path);
1155 }
1156
1157 int unit_notify_cgroup_empty(Unit *u) {
1158         int r;
1159
1160         assert(u);
1161
1162         if (!u->cgroup_path)
1163                 return 0;
1164
1165         r = cg_is_empty_recursive(ELOGIND_CGROUP_CONTROLLER, u->cgroup_path);
1166         if (r <= 0)
1167                 return r;
1168
1169         unit_add_to_gc_queue(u);
1170
1171         if (UNIT_VTABLE(u)->notify_cgroup_empty)
1172                 UNIT_VTABLE(u)->notify_cgroup_empty(u);
1173
1174         return 0;
1175 }
1176
1177 static int on_cgroup_inotify_event(sd_event_source *s, int fd, uint32_t revents, void *userdata) {
1178         Manager *m = userdata;
1179
1180         assert(s);
1181         assert(fd >= 0);
1182         assert(m);
1183
1184         for (;;) {
1185                 union inotify_event_buffer buffer;
1186                 struct inotify_event *e;
1187                 ssize_t l;
1188
1189                 l = read(fd, &buffer, sizeof(buffer));
1190                 if (l < 0) {
1191                         if (errno == EINTR || errno == EAGAIN)
1192                                 return 0;
1193
1194                         return log_error_errno(errno, "Failed to read control group inotify events: %m");
1195                 }
1196
1197                 FOREACH_INOTIFY_EVENT(e, buffer, l) {
1198                         Unit *u;
1199
1200                         if (e->wd < 0)
1201                                 /* Queue overflow has no watch descriptor */
1202                                 continue;
1203
1204                         if (e->mask & IN_IGNORED)
1205                                 /* The watch was just removed */
1206                                 continue;
1207
1208                         u = hashmap_get(m->cgroup_inotify_wd_unit, INT_TO_PTR(e->wd));
1209                         if (!u) /* Not that inotify might deliver
1210                                  * events for a watch even after it
1211                                  * was removed, because it was queued
1212                                  * before the removal. Let's ignore
1213                                  * this here safely. */
1214                                 continue;
1215
1216                         (void) unit_notify_cgroup_empty(u);
1217                 }
1218         }
1219 }
1220 #endif // 0
1221
1222 int manager_setup_cgroup(Manager *m) {
1223         _cleanup_free_ char *path = NULL;
1224         CGroupController c;
1225         int r, unified;
1226         char *e;
1227
1228         assert(m);
1229
1230         /* 1. Determine hierarchy */
1231         m->cgroup_root = mfree(m->cgroup_root);
1232         r = cg_pid_get_path(ELOGIND_CGROUP_CONTROLLER, 0, &m->cgroup_root);
1233         if (r < 0)
1234                 return log_error_errno(r, "Cannot determine cgroup we are running in: %m");
1235
1236         /* Chop off the init scope, if we are already located in it */
1237         e = endswith(m->cgroup_root, "/" SPECIAL_INIT_SCOPE);
1238
1239         /* LEGACY: Also chop off the system slice if we are in
1240          * it. This is to support live upgrades from older systemd
1241          * versions where PID 1 was moved there. Also see
1242          * cg_get_root_path(). */
1243         if (!e) {
1244                 e = endswith(m->cgroup_root, "/" SPECIAL_SYSTEM_SLICE);
1245                 if (!e)
1246                         e = endswith(m->cgroup_root, "/system"); /* even more legacy */
1247         }
1248         if (e)
1249                 *e = 0;
1250
1251         /* And make sure to store away the root value without trailing
1252          * slash, even for the root dir, so that we can easily prepend
1253          * it everywhere. */
1254         while ((e = endswith(m->cgroup_root, "/")))
1255                 *e = 0;
1256
1257         /* 2. Show data */
1258         r = cg_get_path(ELOGIND_CGROUP_CONTROLLER, m->cgroup_root, NULL, &path);
1259         if (r < 0)
1260                 return log_error_errno(r, "Cannot find cgroup mount point: %m");
1261
1262         unified = cg_unified();
1263         if (unified < 0)
1264                 return log_error_errno(r, "Couldn't determine if we are running in the unified hierarchy: %m");
1265         if (unified > 0)
1266                 log_debug("Unified cgroup hierarchy is located at %s.", path);
1267         else
1268                 log_debug("Using cgroup controller " ELOGIND_CGROUP_CONTROLLER ". File system hierarchy is at %s.", path);
1269
1270         if (!m->test_run) {
1271                 const char *scope_path;
1272
1273                 /* 3. Install agent */
1274                 if (unified) {
1275
1276                         /* In the unified hierarchy we can can get
1277                          * cgroup empty notifications via inotify. */
1278 /// elogind does not support the unified hierarchy, yet.
1279 #if 0
1280                         m->cgroup_inotify_event_source = sd_event_source_unref(m->cgroup_inotify_event_source);
1281                         safe_close(m->cgroup_inotify_fd);
1282
1283                         m->cgroup_inotify_fd = inotify_init1(IN_NONBLOCK|IN_CLOEXEC);
1284                         if (m->cgroup_inotify_fd < 0)
1285                                 return log_error_errno(errno, "Failed to create control group inotify object: %m");
1286
1287                         r = sd_event_add_io(m->event, &m->cgroup_inotify_event_source, m->cgroup_inotify_fd, EPOLLIN, on_cgroup_inotify_event, m);
1288                         if (r < 0)
1289                                 return log_error_errno(r, "Failed to watch control group inotify object: %m");
1290
1291                         r = sd_event_source_set_priority(m->cgroup_inotify_event_source, SD_EVENT_PRIORITY_IDLE - 5);
1292                         if (r < 0)
1293                                 return log_error_errno(r, "Failed to set priority of inotify event source: %m");
1294
1295                         (void) sd_event_source_set_description(m->cgroup_inotify_event_source, "cgroup-inotify");
1296
1297 #else
1298                         return log_error_errno(EOPNOTSUPP, "Unified cgroup hierarchy not supported: %m");
1299 #endif // elogind
1300                 } else if (m->running_as == MANAGER_SYSTEM) {
1301                         /* On the legacy hierarchy we only get
1302                          * notifications via cgroup agents. (Which
1303                          * isn't really reliable, since it does not
1304                          * generate events when control groups with
1305                          * children run empty. */
1306
1307                         r = cg_install_release_agent(ELOGIND_CGROUP_CONTROLLER, ELOGIND_CGROUP_AGENT_PATH);
1308                         if (r < 0)
1309                                 log_warning_errno(r, "Failed to install release agent, ignoring: %m");
1310                         else if (r > 0)
1311                                 log_debug("Installed release agent.");
1312                         else if (r == 0)
1313                                 log_debug("Release agent already installed.");
1314                 }
1315
1316                 /* 4. Make sure we are in the special "init.scope" unit in the root slice. */
1317                 scope_path = strjoina(m->cgroup_root, "/" SPECIAL_INIT_SCOPE);
1318                 r = cg_create_and_attach(ELOGIND_CGROUP_CONTROLLER, scope_path, 0);
1319                 if (r < 0)
1320                         return log_error_errno(r, "Failed to create %s control group: %m", scope_path);
1321
1322                 /* also, move all other userspace processes remaining
1323                  * in the root cgroup into that scope. */
1324                 r = cg_migrate(ELOGIND_CGROUP_CONTROLLER, m->cgroup_root, ELOGIND_CGROUP_CONTROLLER, scope_path, false);
1325                 if (r < 0)
1326                         log_warning_errno(r, "Couldn't move remaining userspace processes, ignoring: %m");
1327
1328                 /* 5. And pin it, so that it cannot be unmounted */
1329                 safe_close(m->pin_cgroupfs_fd);
1330                 m->pin_cgroupfs_fd = open(path, O_RDONLY|O_CLOEXEC|O_DIRECTORY|O_NOCTTY|O_NONBLOCK);
1331                 if (m->pin_cgroupfs_fd < 0)
1332                         return log_error_errno(errno, "Failed to open pin file: %m");
1333
1334                 /* 6.  Always enable hierarchical support if it exists... */
1335                 if (!unified)
1336                         (void) cg_set_attribute("memory", "/", "memory.use_hierarchy", "1");
1337         }
1338
1339         /* 7. Figure out which controllers are supported */
1340         r = cg_mask_supported(&m->cgroup_supported);
1341         if (r < 0)
1342                 return log_error_errno(r, "Failed to determine supported controllers: %m");
1343
1344         for (c = 0; c < _CGROUP_CONTROLLER_MAX; c++)
1345                 log_debug("Controller '%s' supported: %s", cgroup_controller_to_string(c), yes_no(m->cgroup_supported & c));
1346
1347         return 0;
1348 }
1349
1350 void manager_shutdown_cgroup(Manager *m, bool delete) {
1351         assert(m);
1352
1353         /* We can't really delete the group, since we are in it. But
1354          * let's trim it. */
1355         if (delete && m->cgroup_root)
1356                 (void) cg_trim(ELOGIND_CGROUP_CONTROLLER, m->cgroup_root, false);
1357
1358 /// elogind does not support the unified hierarchy, yet.
1359 #if 0
1360         m->cgroup_inotify_wd_unit = hashmap_free(m->cgroup_inotify_wd_unit);
1361
1362         m->cgroup_inotify_event_source = sd_event_source_unref(m->cgroup_inotify_event_source);
1363         m->cgroup_inotify_fd = safe_close(m->cgroup_inotify_fd);
1364 #endif // 0
1365
1366         m->pin_cgroupfs_fd = safe_close(m->pin_cgroupfs_fd);
1367
1368         m->cgroup_root = mfree(m->cgroup_root);
1369 }
1370
1371 /// UNNEEDED by elogind
1372 #if 0
1373 Unit* manager_get_unit_by_cgroup(Manager *m, const char *cgroup) {
1374         char *p;
1375         Unit *u;
1376
1377         assert(m);
1378         assert(cgroup);
1379
1380         u = hashmap_get(m->cgroup_unit, cgroup);
1381         if (u)
1382                 return u;
1383
1384         p = strdupa(cgroup);
1385         for (;;) {
1386                 char *e;
1387
1388                 e = strrchr(p, '/');
1389                 if (!e || e == p)
1390                         return hashmap_get(m->cgroup_unit, SPECIAL_ROOT_SLICE);
1391
1392                 *e = 0;
1393
1394                 u = hashmap_get(m->cgroup_unit, p);
1395                 if (u)
1396                         return u;
1397         }
1398 }
1399
1400 Unit *manager_get_unit_by_pid_cgroup(Manager *m, pid_t pid) {
1401         _cleanup_free_ char *cgroup = NULL;
1402         int r;
1403
1404         assert(m);
1405
1406         if (pid <= 0)
1407                 return NULL;
1408
1409         r = cg_pid_get_path(ELOGIND_CGROUP_CONTROLLER, pid, &cgroup);
1410         if (r < 0)
1411                 return NULL;
1412
1413         return manager_get_unit_by_cgroup(m, cgroup);
1414 }
1415
1416 Unit *manager_get_unit_by_pid(Manager *m, pid_t pid) {
1417         Unit *u;
1418
1419         assert(m);
1420
1421         if (pid <= 0)
1422                 return NULL;
1423
1424         if (pid == 1)
1425                 return hashmap_get(m->units, SPECIAL_INIT_SCOPE);
1426
1427         u = hashmap_get(m->watch_pids1, PID_TO_PTR(pid));
1428         if (u)
1429                 return u;
1430
1431         u = hashmap_get(m->watch_pids2, PID_TO_PTR(pid));
1432         if (u)
1433                 return u;
1434
1435         return manager_get_unit_by_pid_cgroup(m, pid);
1436 }
1437
1438 int manager_notify_cgroup_empty(Manager *m, const char *cgroup) {
1439         Unit *u;
1440
1441         assert(m);
1442         assert(cgroup);
1443
1444         u = manager_get_unit_by_cgroup(m, cgroup);
1445         if (!u)
1446                 return 0;
1447
1448         return unit_notify_cgroup_empty(u);
1449 }
1450
1451 int unit_get_memory_current(Unit *u, uint64_t *ret) {
1452         _cleanup_free_ char *v = NULL;
1453         int r;
1454
1455         assert(u);
1456         assert(ret);
1457
1458         if (!u->cgroup_path)
1459                 return -ENODATA;
1460
1461         if ((u->cgroup_realized_mask & CGROUP_MASK_MEMORY) == 0)
1462                 return -ENODATA;
1463
1464         if (cg_unified() <= 0)
1465                 r = cg_get_attribute("memory", u->cgroup_path, "memory.usage_in_bytes", &v);
1466         else
1467                 r = cg_get_attribute("memory", u->cgroup_path, "memory.current", &v);
1468         if (r == -ENOENT)
1469                 return -ENODATA;
1470         if (r < 0)
1471                 return r;
1472
1473         return safe_atou64(v, ret);
1474 }
1475
1476 static int unit_get_cpu_usage_raw(Unit *u, nsec_t *ret) {
1477         _cleanup_free_ char *v = NULL;
1478         uint64_t ns;
1479         int r;
1480
1481         assert(u);
1482         assert(ret);
1483
1484         if (!u->cgroup_path)
1485                 return -ENODATA;
1486
1487         if ((u->cgroup_realized_mask & CGROUP_MASK_CPUACCT) == 0)
1488                 return -ENODATA;
1489
1490         r = cg_get_attribute("cpuacct", u->cgroup_path, "cpuacct.usage", &v);
1491         if (r == -ENOENT)
1492                 return -ENODATA;
1493         if (r < 0)
1494                 return r;
1495
1496         r = safe_atou64(v, &ns);
1497         if (r < 0)
1498                 return r;
1499
1500         *ret = ns;
1501         return 0;
1502 }
1503
1504 int unit_get_cpu_usage(Unit *u, nsec_t *ret) {
1505         nsec_t ns;
1506         int r;
1507
1508         r = unit_get_cpu_usage_raw(u, &ns);
1509         if (r < 0)
1510                 return r;
1511
1512         if (ns > u->cpuacct_usage_base)
1513                 ns -= u->cpuacct_usage_base;
1514         else
1515                 ns = 0;
1516
1517         *ret = ns;
1518         return 0;
1519 }
1520
1521 int unit_reset_cpu_usage(Unit *u) {
1522         nsec_t ns;
1523         int r;
1524
1525         assert(u);
1526
1527         r = unit_get_cpu_usage_raw(u, &ns);
1528         if (r < 0) {
1529                 u->cpuacct_usage_base = 0;
1530                 return r;
1531         }
1532
1533         u->cpuacct_usage_base = ns;
1534         return 0;
1535 }
1536
1537 bool unit_cgroup_delegate(Unit *u) {
1538         CGroupContext *c;
1539
1540         assert(u);
1541
1542         c = unit_get_cgroup_context(u);
1543         if (!c)
1544                 return false;
1545
1546         return c->delegate;
1547 }
1548 #endif // 0
1549
/* Names of the CGroupDevicePolicy values, indexed by the enum value.
 * The macro invocation below generates the string lookup helpers for
 * this table. */
static const char* const cgroup_device_policy_table[_CGROUP_DEVICE_POLICY_MAX] = {
        [CGROUP_AUTO] = "auto",
        [CGROUP_CLOSED] = "closed",
        [CGROUP_STRICT] = "strict",
};

DEFINE_STRING_TABLE_LOOKUP(cgroup_device_policy, CGroupDevicePolicy);