src/core/cgroup.c

   1 /*-*- Mode: C; c-basic-offset: 8; indent-tabs-mode: nil -*-*/
   2
   3 /***
   4   This file is part of systemd.
   5
   6   Copyright 2013 Lennart Poettering
   7
   8   systemd is free software; you can redistribute it and/or modify it
   9   under the terms of the GNU Lesser General Public License as published by
  10   the Free Software Foundation; either version 2.1 of the License, or
  11   (at your option) any later version.
  12
  13   systemd is distributed in the hope that it will be useful, but
  14   WITHOUT ANY WARRANTY; without even the implied warranty of
  15   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
  16   Lesser General Public License for more details.
  17
  18   You should have received a copy of the GNU Lesser General Public License
  19   along with systemd; If not, see <http://www.gnu.org/licenses/>.
  20 ***/
  21
  22 #include <fcntl.h>
  23 #include <fnmatch.h>
  24
  25 #include "process-util.h"
  26 #include "path-util.h"
  27 // #include "special.h"
  28 #include "cgroup-util.h"
  29 #include "cgroup.h"
  30
  31 #define CGROUP_CPU_QUOTA_PERIOD_USEC ((usec_t) 100 * USEC_PER_MSEC)
  32
  33 // UNNEEDED by elogind
  34 #if 0
  35 void cgroup_context_init(CGroupContext *c) {
  36         assert(c);
  37
  38         /* Initialize everything to the kernel defaults, assuming the
  39          * structure is preinitialized to 0 */
  40
  41         c->cpu_shares = (unsigned long) -1;
  42         c->startup_cpu_shares = (unsigned long) -1;
  43         c->memory_limit = (uint64_t) -1;
  44         c->blockio_weight = (unsigned long) -1;
  45         c->startup_blockio_weight = (unsigned long) -1;
  46
  47         c->cpu_quota_per_sec_usec = USEC_INFINITY;
  48 }
  49
  50 void cgroup_context_free_device_allow(CGroupContext *c, CGroupDeviceAllow *a) {
  51         assert(c);
  52         assert(a);
  53
  54         LIST_REMOVE(device_allow, c->device_allow, a);
  55         free(a->path);
  56         free(a);
  57 }
  58
  59 void cgroup_context_free_blockio_device_weight(CGroupContext *c, CGroupBlockIODeviceWeight *w) {
  60         assert(c);
  61         assert(w);
  62
  63         LIST_REMOVE(device_weights, c->blockio_device_weights, w);
  64         free(w->path);
  65         free(w);
  66 }
  67
  68 void cgroup_context_free_blockio_device_bandwidth(CGroupContext *c, CGroupBlockIODeviceBandwidth *b) {
  69         assert(c);
  70         assert(b);
  71
  72         LIST_REMOVE(device_bandwidths, c->blockio_device_bandwidths, b);
  73         free(b->path);
  74         free(b);
  75 }
  76
  77 void cgroup_context_done(CGroupContext *c) {
  78         assert(c);
  79
  80         while (c->blockio_device_weights)
  81                 cgroup_context_free_blockio_device_weight(c, c->blockio_device_weights);
  82
  83         while (c->blockio_device_bandwidths)
  84                 cgroup_context_free_blockio_device_bandwidth(c, c->blockio_device_bandwidths);
  85
  86         while (c->device_allow)
  87                 cgroup_context_free_device_allow(c, c->device_allow);
  88 }
  89
  90 void cgroup_context_dump(CGroupContext *c, FILE* f, const char *prefix) {
  91         CGroupBlockIODeviceBandwidth *b;
  92         CGroupBlockIODeviceWeight *w;
  93         CGroupDeviceAllow *a;
  94         char u[FORMAT_TIMESPAN_MAX];
  95
  96         assert(c);
  97         assert(f);
  98
  99         prefix = strempty(prefix);
 100
 101         fprintf(f,
 102                 "%sCPUAccounting=%s\n"
 103                 "%sBlockIOAccounting=%s\n"
 104                 "%sMemoryAccounting=%s\n"
 105                 "%sCPUShares=%lu\n"
 106                 "%sStartupCPUShares=%lu\n"
 107                 "%sCPUQuotaPerSecSec=%s\n"
 108                 "%sBlockIOWeight=%lu\n"
 109                 "%sStartupBlockIOWeight=%lu\n"
 110                 "%sMemoryLimit=%" PRIu64 "\n"
 111                 "%sDevicePolicy=%s\n"
 112                 "%sDelegate=%s\n",
 113                 prefix, yes_no(c->cpu_accounting),
 114                 prefix, yes_no(c->blockio_accounting),
 115                 prefix, yes_no(c->memory_accounting),
 116                 prefix, c->cpu_shares,
 117                 prefix, c->startup_cpu_shares,
 118                 prefix, format_timespan(u, sizeof(u), c->cpu_quota_per_sec_usec, 1),
 119                 prefix, c->blockio_weight,
 120                 prefix, c->startup_blockio_weight,
 121                 prefix, c->memory_limit,
 122                 prefix, cgroup_device_policy_to_string(c->device_policy),
 123                 prefix, yes_no(c->delegate));
 124
 125         LIST_FOREACH(device_allow, a, c->device_allow)
 126                 fprintf(f,
 127                         "%sDeviceAllow=%s %s%s%s\n",
 128                         prefix,
 129                         a->path,
 130                         a->r ? "r" : "", a->w ? "w" : "", a->m ? "m" : "");
 131
 132         LIST_FOREACH(device_weights, w, c->blockio_device_weights)
 133                 fprintf(f,
 134                         "%sBlockIODeviceWeight=%s %lu",
 135                         prefix,
 136                         w->path,
 137                         w->weight);
 138
 139         LIST_FOREACH(device_bandwidths, b, c->blockio_device_bandwidths) {
 140                 char buf[FORMAT_BYTES_MAX];
 141
 142                 fprintf(f,
 143                         "%s%s=%s %s\n",
 144                         prefix,
 145                         b->read ? "BlockIOReadBandwidth" : "BlockIOWriteBandwidth",
 146                         b->path,
 147                         format_bytes(buf, sizeof(buf), b->bandwidth));
 148         }
 149 }
 150
 151 static int lookup_blkio_device(const char *p, dev_t *dev) {
 152         struct stat st;
 153         int r;
 154
 155         assert(p);
 156         assert(dev);
 157
 158         r = stat(p, &st);
 159         if (r < 0)
 160                 return log_warning_errno(errno, "Couldn't stat device %s: %m", p);
 161
 162         if (S_ISBLK(st.st_mode))
 163                 *dev = st.st_rdev;
 164         else if (major(st.st_dev) != 0) {
 165                 /* If this is not a device node then find the block
 166                  * device this file is stored on */
 167                 *dev = st.st_dev;
 168
 169                 /* If this is a partition, try to get the originating
 170                  * block device */
 171                 block_get_whole_disk(*dev, dev);
 172         } else {
 173                 log_warning("%s is not a block device and file system block device cannot be determined or is not local.", p);
 174                 return -ENODEV;
 175         }
 176
 177         return 0;
 178 }
 179
 180 static int whitelist_device(const char *path, const char *node, const char *acc) {
 181         char buf[2+DECIMAL_STR_MAX(dev_t)*2+2+4];
 182         struct stat st;
 183         int r;
 184
 185         assert(path);
 186         assert(acc);
 187
 188         if (stat(node, &st) < 0) {
 189                 log_warning("Couldn't stat device %s", node);
 190                 return -errno;
 191         }
 192
 193         if (!S_ISCHR(st.st_mode) && !S_ISBLK(st.st_mode)) {
 194                 log_warning("%s is not a device.", node);
 195                 return -ENODEV;
 196         }
 197
 198         sprintf(buf,
 199                 "%c %u:%u %s",
 200                 S_ISCHR(st.st_mode) ? 'c' : 'b',
 201                 major(st.st_rdev), minor(st.st_rdev),
 202                 acc);
 203
 204         r = cg_set_attribute("devices", path, "devices.allow", buf);
 205         if (r < 0)
 206                 log_full_errno(IN_SET(r, -ENOENT, -EROFS, -EINVAL) ? LOG_DEBUG : LOG_WARNING, r,
 207                                "Failed to set devices.allow on %s: %m", path);
 208
 209         return r;
 210 }
 211
 212 static int whitelist_major(const char *path, const char *name, char type, const char *acc) {
 213         _cleanup_fclose_ FILE *f = NULL;
 214         char line[LINE_MAX];
 215         bool good = false;
 216         int r;
 217
 218         assert(path);
 219         assert(acc);
 220         assert(type == 'b' || type == 'c');
 221
 222         f = fopen("/proc/devices", "re");
 223         if (!f)
 224                 return log_warning_errno(errno, "Cannot open /proc/devices to resolve %s (%c): %m", name, type);
 225
 226         FOREACH_LINE(line, f, goto fail) {
 227                 char buf[2+DECIMAL_STR_MAX(unsigned)+3+4], *p, *w;
 228                 unsigned maj;
 229
 230                 truncate_nl(line);
 231
 232                 if (type == 'c' && streq(line, "Character devices:")) {
 233                         good = true;
 234                         continue;
 235                 }
 236
 237                 if (type == 'b' && streq(line, "Block devices:")) {
 238                         good = true;
 239                         continue;
 240                 }
 241
 242                 if (isempty(line)) {
 243                         good = false;
 244                         continue;
 245                 }
 246
 247                 if (!good)
 248                         continue;
 249
 250                 p = strstrip(line);
 251
 252                 w = strpbrk(p, WHITESPACE);
 253                 if (!w)
 254                         continue;
 255                 *w = 0;
 256
 257                 r = safe_atou(p, &maj);
 258                 if (r < 0)
 259                         continue;
 260                 if (maj <= 0)
 261                         continue;
 262
 263                 w++;
 264                 w += strspn(w, WHITESPACE);
 265
 266                 if (fnmatch(name, w, 0) != 0)
 267                         continue;
 268
 269                 sprintf(buf,
 270                         "%c %u:* %s",
 271                         type,
 272                         maj,
 273                         acc);
 274
 275                 r = cg_set_attribute("devices", path, "devices.allow", buf);
 276                 if (r < 0)
 277                         log_full_errno(IN_SET(r, -ENOENT, -EROFS, -EINVAL) ? LOG_DEBUG : LOG_WARNING, r,
 278                                        "Failed to set devices.allow on %s: %m", path);
 279         }
 280
 281         return 0;
 282
 283 fail:
 284         log_warning_errno(errno, "Failed to read /proc/devices: %m");
 285         return -errno;
 286 }
 287
 288 void cgroup_context_apply(CGroupContext *c, CGroupMask mask, const char *path, ManagerState state) {
 289         bool is_root;
 290         int r;
 291
 292         assert(c);
 293         assert(path);
 294
 295         if (mask == 0)
 296                 return;
 297
 298         /* Some cgroup attributes are not supported on the root cgroup,
 299          * hence silently ignore */
 300         is_root = isempty(path) || path_equal(path, "/");
 301         if (is_root)
 302                 /* Make sure we don't try to display messages with an empty path. */
 303                 path = "/";
 304
 305         /* We generally ignore errors caused by read-only mounted
 306          * cgroup trees (assuming we are running in a container then),
 307          * and missing cgroups, i.e. EROFS and ENOENT. */
 308
 309         if ((mask & CGROUP_MASK_CPU) && !is_root) {
 310                 char buf[MAX(DECIMAL_STR_MAX(unsigned long), DECIMAL_STR_MAX(usec_t)) + 1];
 311
 312                 sprintf(buf, "%lu\n",
 313                         IN_SET(state, MANAGER_STARTING, MANAGER_INITIALIZING) && c->startup_cpu_shares != (unsigned long) -1 ? c->startup_cpu_shares :
 314                         c->cpu_shares != (unsigned long) -1 ? c->cpu_shares : 1024);
 315                 r = cg_set_attribute("cpu", path, "cpu.shares", buf);
 316                 if (r < 0)
 317                         log_full_errno(IN_SET(r, -ENOENT, -EROFS) ? LOG_DEBUG : LOG_WARNING, r,
 318                                        "Failed to set cpu.shares on %s: %m", path);
 319
 320                 sprintf(buf, USEC_FMT "\n", CGROUP_CPU_QUOTA_PERIOD_USEC);
 321                 r = cg_set_attribute("cpu", path, "cpu.cfs_period_us", buf);
 322                 if (r < 0)
 323                         log_full_errno(IN_SET(r, -ENOENT, -EROFS) ? LOG_DEBUG : LOG_WARNING, r,
 324                                        "Failed to set cpu.cfs_period_us on %s: %m", path);
 325
 326                 if (c->cpu_quota_per_sec_usec != USEC_INFINITY) {
 327                         sprintf(buf, USEC_FMT "\n", c->cpu_quota_per_sec_usec * CGROUP_CPU_QUOTA_PERIOD_USEC / USEC_PER_SEC);
 328                         r = cg_set_attribute("cpu", path, "cpu.cfs_quota_us", buf);
 329                 } else
 330                         r = cg_set_attribute("cpu", path, "cpu.cfs_quota_us", "-1");
 331                 if (r < 0)
 332                         log_full_errno(IN_SET(r, -ENOENT, -EROFS) ? LOG_DEBUG : LOG_WARNING, r,
 333                                        "Failed to set cpu.cfs_quota_us on %s: %m", path);
 334         }
 335
 336         if (mask & CGROUP_MASK_BLKIO) {
 337                 char buf[MAX3(DECIMAL_STR_MAX(unsigned long)+1,
 338                               DECIMAL_STR_MAX(dev_t)*2+2+DECIMAL_STR_MAX(unsigned long)*1,
 339                               DECIMAL_STR_MAX(dev_t)*2+2+DECIMAL_STR_MAX(uint64_t)+1)];
 340                 CGroupBlockIODeviceWeight *w;
 341                 CGroupBlockIODeviceBandwidth *b;
 342
 343                 if (!is_root) {
 344                         sprintf(buf, "%lu\n", IN_SET(state, MANAGER_STARTING, MANAGER_INITIALIZING) && c->startup_blockio_weight != (unsigned long) -1 ? c->startup_blockio_weight :
 345                                 c->blockio_weight != (unsigned long) -1 ? c->blockio_weight : 1000);
 346                         r = cg_set_attribute("blkio", path, "blkio.weight", buf);
 347                         if (r < 0)
 348                                 log_full_errno(IN_SET(r, -ENOENT, -EROFS) ? LOG_DEBUG : LOG_WARNING, r,
 349                                                "Failed to set blkio.weight on %s: %m", path);
 350
 351                         /* FIXME: no way to reset this list */
 352                         LIST_FOREACH(device_weights, w, c->blockio_device_weights) {
 353                                 dev_t dev;
 354
 355                                 r = lookup_blkio_device(w->path, &dev);
 356                                 if (r < 0)
 357                                         continue;
 358
 359                                 sprintf(buf, "%u:%u %lu", major(dev), minor(dev), w->weight);
 360                                 r = cg_set_attribute("blkio", path, "blkio.weight_device", buf);
 361                                 if (r < 0)
 362                                         log_full_errno(IN_SET(r, -ENOENT, -EROFS) ? LOG_DEBUG : LOG_WARNING, r,
 363                                                        "Failed to set blkio.weight_device on %s: %m", path);
 364                         }
 365                 }
 366
 367                 /* FIXME: no way to reset this list */
 368                 LIST_FOREACH(device_bandwidths, b, c->blockio_device_bandwidths) {
 369                         const char *a;
 370                         dev_t dev;
 371
 372                         r = lookup_blkio_device(b->path, &dev);
 373                         if (r < 0)
 374                                 continue;
 375
 376                         a = b->read ? "blkio.throttle.read_bps_device" : "blkio.throttle.write_bps_device";
 377
 378                         sprintf(buf, "%u:%u %" PRIu64 "\n", major(dev), minor(dev), b->bandwidth);
 379                         r = cg_set_attribute("blkio", path, a, buf);
 380                         if (r < 0)
 381                                 log_full_errno(IN_SET(r, -ENOENT, -EROFS) ? LOG_DEBUG : LOG_WARNING, r,
 382                                                "Failed to set %s on %s: %m", a, path);
 383                 }
 384         }
 385
 386         if ((mask & CGROUP_MASK_MEMORY) && !is_root) {
 387                 if (c->memory_limit != (uint64_t) -1) {
 388                         char buf[DECIMAL_STR_MAX(uint64_t) + 1];
 389
 390                         sprintf(buf, "%" PRIu64 "\n", c->memory_limit);
 391
 392                         if (cg_unified() <= 0)
 393                                 r = cg_set_attribute("memory", path, "memory.limit_in_bytes", buf);
 394                         else
 395                                 r = cg_set_attribute("memory", path, "memory.max", buf);
 396
 397                 } else {
 398                         if (cg_unified() <= 0)
 399                                 r = cg_set_attribute("memory", path, "memory.limit_in_bytes", "-1");
 400                         else
 401                                 r = cg_set_attribute("memory", path, "memory.max", "max");
 402                 }
 403
 404                 if (r < 0)
 405                         log_full_errno(IN_SET(r, -ENOENT, -EROFS) ? LOG_DEBUG : LOG_WARNING, r,
 406                                        "Failed to set memory.limit_in_bytes/memory.max on %s: %m", path);
 407         }
 408
 409         if ((mask & CGROUP_MASK_DEVICE) && !is_root) {
 410                 CGroupDeviceAllow *a;
 411
 412                 /* Changing the devices list of a populated cgroup
 413                  * might result in EINVAL, hence ignore EINVAL
 414                  * here. */
 415
 416                 if (c->device_allow || c->device_policy != CGROUP_AUTO)
 417                         r = cg_set_attribute("devices", path, "devices.deny", "a");
 418                 else
 419                         r = cg_set_attribute("devices", path, "devices.allow", "a");
 420                 if (r < 0)
 421                         log_full_errno(IN_SET(r, -ENOENT, -EROFS, -EINVAL) ? LOG_DEBUG : LOG_WARNING, r,
 422                                        "Failed to reset devices.list on %s: %m", path);
 423
 424                 if (c->device_policy == CGROUP_CLOSED ||
 425                     (c->device_policy == CGROUP_AUTO && c->device_allow)) {
 426                         static const char auto_devices[] =
 427                                 "/dev/null\0" "rwm\0"
 428                                 "/dev/zero\0" "rwm\0"
 429                                 "/dev/full\0" "rwm\0"
 430                                 "/dev/random\0" "rwm\0"
 431                                 "/dev/urandom\0" "rwm\0"
 432                                 "/dev/tty\0" "rwm\0"
 433                                 "/dev/pts/ptmx\0" "rw\0"; /* /dev/pts/ptmx may not be duplicated, but accessed */
 434
 435                         const char *x, *y;
 436
 437                         NULSTR_FOREACH_PAIR(x, y, auto_devices)
 438                                 whitelist_device(path, x, y);
 439
 440                         whitelist_major(path, "pts", 'c', "rw");
 441                         whitelist_major(path, "kdbus", 'c', "rw");
 442                         whitelist_major(path, "kdbus/*", 'c', "rw");
 443                 }
 444
 445                 LIST_FOREACH(device_allow, a, c->device_allow) {
 446                         char acc[4];
 447                         unsigned k = 0;
 448
 449                         if (a->r)
 450                                 acc[k++] = 'r';
 451                         if (a->w)
 452                                 acc[k++] = 'w';
 453                         if (a->m)
 454                                 acc[k++] = 'm';
 455
 456                         if (k == 0)
 457                                 continue;
 458
 459                         acc[k++] = 0;
 460
 461                         if (startswith(a->path, "/dev/"))
 462                                 whitelist_device(path, a->path, acc);
 463                         else if (startswith(a->path, "block-"))
 464                                 whitelist_major(path, a->path + 6, 'b', acc);
 465                         else if (startswith(a->path, "char-"))
 466                                 whitelist_major(path, a->path + 5, 'c', acc);
 467                         else
 468                                 log_debug("Ignoring device %s while writing cgroup attribute.", a->path);
 469                 }
 470         }
 471 }
 472
 473 CGroupMask cgroup_context_get_mask(CGroupContext *c) {
 474         CGroupMask mask = 0;
 475
 476         /* Figure out which controllers we need */
 477
 478         if (c->cpu_accounting ||
 479             c->cpu_shares != (unsigned long) -1 ||
 480             c->startup_cpu_shares != (unsigned long) -1 ||
 481             c->cpu_quota_per_sec_usec != USEC_INFINITY)
 482                 mask |= CGROUP_MASK_CPUACCT | CGROUP_MASK_CPU;
 483
 484         if (c->blockio_accounting ||
 485             c->blockio_weight != (unsigned long) -1 ||
 486             c->startup_blockio_weight != (unsigned long) -1 ||
 487             c->blockio_device_weights ||
 488             c->blockio_device_bandwidths)
 489                 mask |= CGROUP_MASK_BLKIO;
 490
 491         if (c->memory_accounting ||
 492             c->memory_limit != (uint64_t) -1)
 493                 mask |= CGROUP_MASK_MEMORY;
 494
 495         if (c->device_allow ||
 496             c->device_policy != CGROUP_AUTO)
 497                 mask |= CGROUP_MASK_DEVICE;
 498
 499         return mask;
 500 }
 501
 502 CGroupMask unit_get_own_mask(Unit *u) {
 503         CGroupContext *c;
 504
 505         /* Returns the mask of controllers the unit needs for itself */
 506
 507         c = unit_get_cgroup_context(u);
 508         if (!c)
 509                 return 0;
 510
 511         /* If delegation is turned on, then turn on all cgroups,
 512          * unless we are on the legacy hierarchy and the process we
 513          * fork into it is known to drop privileges, and hence
 514          * shouldn't get access to the controllers.
 515          *
 516          * Note that on the unified hierarchy it is safe to delegate
 517          * controllers to unprivileged services. */
 518
 519         if (c->delegate) {
 520                 ExecContext *e;
 521
 522                 e = unit_get_exec_context(u);
 523                 if (!e ||
 524                     exec_context_maintains_privileges(e) ||
 525                     cg_unified() > 0)
 526                         return _CGROUP_MASK_ALL;
 527         }
 528
 529         return cgroup_context_get_mask(c);
 530 }
 531
 532 CGroupMask unit_get_members_mask(Unit *u) {
 533         assert(u);
 534
 535         /* Returns the mask of controllers all of the unit's children
 536          * require, merged */
 537
 538         if (u->cgroup_members_mask_valid)
 539                 return u->cgroup_members_mask;
 540
 541         u->cgroup_members_mask = 0;
 542
 543         if (u->type == UNIT_SLICE) {
 544                 Unit *member;
 545                 Iterator i;
 546
 547                 SET_FOREACH(member, u->dependencies[UNIT_BEFORE], i) {
 548
 549                         if (member == u)
 550                                 continue;
 551
 552                         if (UNIT_DEREF(member->slice) != u)
 553                                 continue;
 554
 555                         u->cgroup_members_mask |=
 556                                 unit_get_own_mask(member) |
 557                                 unit_get_members_mask(member);
 558                 }
 559         }
 560
 561         u->cgroup_members_mask_valid = true;
 562         return u->cgroup_members_mask;
 563 }
 564
 565 CGroupMask unit_get_siblings_mask(Unit *u) {
 566         assert(u);
 567
 568         /* Returns the mask of controllers all of the unit's siblings
 569          * require, i.e. the members mask of the unit's parent slice
 570          * if there is one. */
 571
 572         if (UNIT_ISSET(u->slice))
 573                 return unit_get_members_mask(UNIT_DEREF(u->slice));
 574
 575         return unit_get_own_mask(u) | unit_get_members_mask(u);
 576 }
 577
 578 CGroupMask unit_get_subtree_mask(Unit *u) {
 579
 580         /* Returns the mask of this subtree, meaning of the group
 581          * itself and its children. */
 582
 583         return unit_get_own_mask(u) | unit_get_members_mask(u);
 584 }
 585
 586 CGroupMask unit_get_target_mask(Unit *u) {
 587         CGroupMask mask;
 588
 589         /* This returns the cgroup mask of all controllers to enable
 590          * for a specific cgroup, i.e. everything it needs itself,
 591          * plus all that its children need, plus all that its siblings
 592          * need. This is primarily useful on the legacy cgroup
 593          * hierarchy, where we need to duplicate each cgroup in each
 594          * hierarchy that shall be enabled for it. */
 595
 596         mask = unit_get_own_mask(u) | unit_get_members_mask(u) | unit_get_siblings_mask(u);
 597         mask &= u->manager->cgroup_supported;
 598
 599         return mask;
 600 }
 601
 602 CGroupMask unit_get_enable_mask(Unit *u) {
 603         CGroupMask mask;
 604
 605         /* This returns the cgroup mask of all controllers to enable
 606          * for the children of a specific cgroup. This is primarily
 607          * useful for the unified cgroup hierarchy, where each cgroup
 608          * controls which controllers are enabled for its children. */
 609
 610         mask = unit_get_members_mask(u);
 611         mask &= u->manager->cgroup_supported;
 612
 613         return mask;
 614 }
 615
 616 /* Recurse from a unit up through its containing slices, propagating
 617  * mask bits upward. A unit is also member of itself. */
 618 void unit_update_cgroup_members_masks(Unit *u) {
 619         CGroupMask m;
 620         bool more;
 621
 622         assert(u);
 623
 624         /* Calculate subtree mask */
 625         m = unit_get_subtree_mask(u);
 626
 627         /* See if anything changed from the previous invocation. If
 628          * not, we're done. */
 629         if (u->cgroup_subtree_mask_valid && m == u->cgroup_subtree_mask)
 630                 return;
 631
 632         more =
 633                 u->cgroup_subtree_mask_valid &&
 634                 ((m & ~u->cgroup_subtree_mask) != 0) &&
 635                 ((~m & u->cgroup_subtree_mask) == 0);
 636
 637         u->cgroup_subtree_mask = m;
 638         u->cgroup_subtree_mask_valid = true;
 639
 640         if (UNIT_ISSET(u->slice)) {
 641                 Unit *s = UNIT_DEREF(u->slice);
 642
 643                 if (more)
 644                         /* There's more set now than before. We
 645                          * propagate the new mask to the parent's mask
 646                          * (not caring if it actually was valid or
 647                          * not). */
 648
 649                         s->cgroup_members_mask |= m;
 650
 651                 else
 652                         /* There's less set now than before (or we
 653                          * don't know), we need to recalculate
 654                          * everything, so let's invalidate the
 655                          * parent's members mask */
 656
 657                         s->cgroup_members_mask_valid = false;
 658
 659                 /* And now make sure that this change also hits our
 660                  * grandparents */
 661                 unit_update_cgroup_members_masks(s);
 662         }
 663 }
 664
 665 static const char *migrate_callback(CGroupMask mask, void *userdata) {
 666         Unit *u = userdata;
 667
 668         assert(mask != 0);
 669         assert(u);
 670
 671         while (u) {
 672                 if (u->cgroup_path &&
 673                     u->cgroup_realized &&
 674                     (u->cgroup_realized_mask & mask) == mask)
 675                         return u->cgroup_path;
 676
 677                 u = UNIT_DEREF(u->slice);
 678         }
 679
 680         return NULL;
 681 }
 682
 683 char *unit_default_cgroup_path(Unit *u) {
 684         _cleanup_free_ char *escaped = NULL, *slice = NULL;
 685         int r;
 686
 687         assert(u);
 688
 689         if (unit_has_name(u, SPECIAL_ROOT_SLICE))
 690                 return strdup(u->manager->cgroup_root);
 691
 692         if (UNIT_ISSET(u->slice) && !unit_has_name(UNIT_DEREF(u->slice), SPECIAL_ROOT_SLICE)) {
 693                 r = cg_slice_to_path(UNIT_DEREF(u->slice)->id, &slice);
 694                 if (r < 0)
 695                         return NULL;
 696         }
 697
 698         escaped = cg_escape(u->id);
 699         if (!escaped)
 700                 return NULL;
 701
 702         if (slice)
 703                 return strjoin(u->manager->cgroup_root, "/", slice, "/", escaped, NULL);
 704         else
 705                 return strjoin(u->manager->cgroup_root, "/", escaped, NULL);
 706 }
 707
 708 int unit_set_cgroup_path(Unit *u, const char *path) {
 709         _cleanup_free_ char *p = NULL;
 710         int r;
 711
 712         assert(u);
 713
 714         if (path) {
 715                 p = strdup(path);
 716                 if (!p)
 717                         return -ENOMEM;
 718         } else
 719                 p = NULL;
 720
 721         if (streq_ptr(u->cgroup_path, p))
 722                 return 0;
 723
 724         if (p) {
 725                 r = hashmap_put(u->manager->cgroup_unit, p, u);
 726                 if (r < 0)
 727                         return r;
 728         }
 729
 730         unit_release_cgroup(u);
 731
 732         u->cgroup_path = p;
 733         p = NULL;
 734
 735         return 1;
 736 }
 737
 738 int unit_watch_cgroup(Unit *u) {
 739         _cleanup_free_ char *populated = NULL;
 740         int r;
 741
 742         assert(u);
 743
 744         if (!u->cgroup_path)
 745                 return 0;
 746
 747         if (u->cgroup_inotify_wd >= 0)
 748                 return 0;
 749
 750         /* Only applies to the unified hierarchy */
 751         r = cg_unified();
 752         if (r < 0)
 753                 return log_unit_error_errno(u, r, "Failed detect wether the unified hierarchy is used: %m");
 754         if (r == 0)
 755                 return 0;
 756
 757         /* Don't watch the root slice, it's pointless. */
 758         if (unit_has_name(u, SPECIAL_ROOT_SLICE))
 759                 return 0;
 760
 761         r = hashmap_ensure_allocated(&u->manager->cgroup_inotify_wd_unit, &trivial_hash_ops);
 762         if (r < 0)
 763                 return log_oom();
 764
 765         r = cg_get_path(SYSTEMD_CGROUP_CONTROLLER, u->cgroup_path, "cgroup.populated", &populated);
 766         if (r < 0)
 767                 return log_oom();
 768
 769         u->cgroup_inotify_wd = inotify_add_watch(u->manager->cgroup_inotify_fd, populated, IN_MODIFY);
 770         if (u->cgroup_inotify_wd < 0) {
 771
 772                 if (errno == ENOENT) /* If the directory is already
 773                                       * gone we don't need to track
 774                                       * it, so this is not an error */
 775                         return 0;
 776
 777                 return log_unit_error_errno(u, errno, "Failed to add inotify watch descriptor for control group %s: %m", u->cgroup_path);
 778         }
 779
 780         r = hashmap_put(u->manager->cgroup_inotify_wd_unit, INT_TO_PTR(u->cgroup_inotify_wd), u);
 781         if (r < 0)
 782                 return log_unit_error_errno(u, r, "Failed to add inotify watch descriptor to hash map: %m");
 783
 784         return 0;
 785 }
 786
 787 static int unit_create_cgroup(
 788                 Unit *u,
 789                 CGroupMask target_mask,
 790                 CGroupMask enable_mask) {
 791
 792         CGroupContext *c;
 793         int r;
 794
 795         assert(u);
 796
 797         c = unit_get_cgroup_context(u);
 798         if (!c)
 799                 return 0;
 800
 801         if (!u->cgroup_path) {
 802                 _cleanup_free_ char *path = NULL;
 803
 804                 path = unit_default_cgroup_path(u);
 805                 if (!path)
 806                         return log_oom();
 807
 808                 r = unit_set_cgroup_path(u, path);
 809                 if (r == -EEXIST)
 810                         return log_unit_error_errno(u, r, "Control group %s exists already.", path);
 811                 if (r < 0)
 812                         return log_unit_error_errno(u, r, "Failed to set unit's control group path to %s: %m", path);
 813         }
 814
 815         /* First, create our own group */
 816         r = cg_create_everywhere(u->manager->cgroup_supported, target_mask, u->cgroup_path);
 817         if (r < 0)
 818                 return log_unit_error_errno(u, r, "Failed to create cgroup %s: %m", u->cgroup_path);
 819
 820         /* Start watching it */
 821         (void) unit_watch_cgroup(u);
 822
 823         /* Enable all controllers we need */
 824         r = cg_enable_everywhere(u->manager->cgroup_supported, enable_mask, u->cgroup_path);
 825         if (r < 0)
 826                 log_unit_warning_errno(u, r, "Failed to enable controllers on cgroup %s, ignoring: %m", u->cgroup_path);
 827
 828         /* Keep track that this is now realized */
 829         u->cgroup_realized = true;
 830         u->cgroup_realized_mask = target_mask;
 831
 832         if (u->type != UNIT_SLICE && !c->delegate) {
 833
 834                 /* Then, possibly move things over, but not if
 835                  * subgroups may contain processes, which is the case
 836                  * for slice and delegation units. */
 837                 r = cg_migrate_everywhere(u->manager->cgroup_supported, u->cgroup_path, u->cgroup_path, migrate_callback, u);
 838                 if (r < 0)
 839                         log_unit_warning_errno(u, r, "Failed to migrate cgroup from to %s, ignoring: %m", u->cgroup_path);
 840         }
 841
 842         return 0;
 843 }
 844
 845 int unit_attach_pids_to_cgroup(Unit *u) {
 846         int r;
 847         assert(u);
 848
 849         r = unit_realize_cgroup(u);
 850         if (r < 0)
 851                 return r;
 852
 853         r = cg_attach_many_everywhere(u->manager->cgroup_supported, u->cgroup_path, u->pids, migrate_callback, u);
 854         if (r < 0)
 855                 return r;
 856
 857         return 0;
 858 }
 859
 860 static bool unit_has_mask_realized(Unit *u, CGroupMask target_mask) {
 861         assert(u);
 862
 863         return u->cgroup_realized && u->cgroup_realized_mask == target_mask;
 864 }
 865
 866 /* Check if necessary controllers and attributes for a unit are in place.
 867  *
 868  * If so, do nothing.
 869  * If not, create paths, move processes over, and set attributes.
 870  *
 871  * Returns 0 on success and < 0 on failure. */
 872 static int unit_realize_cgroup_now(Unit *u, ManagerState state) {
 873         CGroupMask target_mask, enable_mask;
 874         int r;
 875
 876         assert(u);
 877
 878         if (u->in_cgroup_queue) {
 879                 LIST_REMOVE(cgroup_queue, u->manager->cgroup_queue, u);
 880                 u->in_cgroup_queue = false;
 881         }
 882
 883         target_mask = unit_get_target_mask(u);
 884         if (unit_has_mask_realized(u, target_mask))
 885                 return 0;
 886
 887         /* First, realize parents */
 888         if (UNIT_ISSET(u->slice)) {
 889                 r = unit_realize_cgroup_now(UNIT_DEREF(u->slice), state);
 890                 if (r < 0)
 891                         return r;
 892         }
 893
 894         /* And then do the real work */
 895         enable_mask = unit_get_enable_mask(u);
 896         r = unit_create_cgroup(u, target_mask, enable_mask);
 897         if (r < 0)
 898                 return r;
 899
 900         /* Finally, apply the necessary attributes. */
 901         cgroup_context_apply(unit_get_cgroup_context(u), target_mask, u->cgroup_path, state);
 902
 903         return 0;
 904 }
 905
 906 static void unit_add_to_cgroup_queue(Unit *u) {
 907
 908         if (u->in_cgroup_queue)
 909                 return;
 910
 911         LIST_PREPEND(cgroup_queue, u->manager->cgroup_queue, u);
 912         u->in_cgroup_queue = true;
 913 }
 914
 915 unsigned manager_dispatch_cgroup_queue(Manager *m) {
 916         ManagerState state;
 917         unsigned n = 0;
 918         Unit *i;
 919         int r;
 920
 921         state = manager_state(m);
 922
 923         while ((i = m->cgroup_queue)) {
 924                 assert(i->in_cgroup_queue);
 925
 926                 r = unit_realize_cgroup_now(i, state);
 927                 if (r < 0)
 928                         log_warning_errno(r, "Failed to realize cgroups for queued unit %s, ignoring: %m", i->id);
 929
 930                 n++;
 931         }
 932
 933         return n;
 934 }
 935
 936 static void unit_queue_siblings(Unit *u) {
 937         Unit *slice;
 938
 939         /* This adds the siblings of the specified unit and the
 940          * siblings of all parent units to the cgroup queue. (But
 941          * neither the specified unit itself nor the parents.) */
 942
 943         while ((slice = UNIT_DEREF(u->slice))) {
 944                 Iterator i;
 945                 Unit *m;
 946
 947                 SET_FOREACH(m, slice->dependencies[UNIT_BEFORE], i) {
 948                         if (m == u)
 949                                 continue;
 950
 951                         /* Skip units that have a dependency on the slice
 952                          * but aren't actually in it. */
 953                         if (UNIT_DEREF(m->slice) != slice)
 954                                 continue;
 955
 956                         /* No point in doing cgroup application for units
 957                          * without active processes. */
 958                         if (UNIT_IS_INACTIVE_OR_FAILED(unit_active_state(m)))
 959                                 continue;
 960
 961                         /* If the unit doesn't need any new controllers
 962                          * and has current ones realized, it doesn't need
 963                          * any changes. */
 964                         if (unit_has_mask_realized(m, unit_get_target_mask(m)))
 965                                 continue;
 966
 967                         unit_add_to_cgroup_queue(m);
 968                 }
 969
 970                 u = slice;
 971         }
 972 }
 973
 974 int unit_realize_cgroup(Unit *u) {
 975         assert(u);
 976
 977         if (!UNIT_HAS_CGROUP_CONTEXT(u))
 978                 return 0;
 979
 980         /* So, here's the deal: when realizing the cgroups for this
 981          * unit, we need to first create all parents, but there's more
 982          * actually: for the weight-based controllers we also need to
 983          * make sure that all our siblings (i.e. units that are in the
 984          * same slice as we are) have cgroups, too. Otherwise, things
 985          * would become very uneven as each of their processes would
 986          * get as much resources as all our group together. This call
 987          * will synchronously create the parent cgroups, but will
 988          * defer work on the siblings to the next event loop
 989          * iteration. */
 990
 991         /* Add all sibling slices to the cgroup queue. */
 992         unit_queue_siblings(u);
 993
 994         /* And realize this one now (and apply the values) */
 995         return unit_realize_cgroup_now(u, manager_state(u->manager));
 996 }
 997
 998 void unit_release_cgroup(Unit *u) {
 999         assert(u);
1000
1001         /* Forgets all cgroup details for this cgroup */
1002
1003         if (u->cgroup_path) {
1004                 (void) hashmap_remove(u->manager->cgroup_unit, u->cgroup_path);
1005                 u->cgroup_path = mfree(u->cgroup_path);
1006         }
1007
1008         if (u->cgroup_inotify_wd >= 0) {
1009                 if (inotify_rm_watch(u->manager->cgroup_inotify_fd, u->cgroup_inotify_wd) < 0)
1010                         log_unit_debug_errno(u, errno, "Failed to remove cgroup inotify watch %i for %s, ignoring", u->cgroup_inotify_wd, u->id);
1011
1012                 (void) hashmap_remove(u->manager->cgroup_inotify_wd_unit, INT_TO_PTR(u->cgroup_inotify_wd));
1013                 u->cgroup_inotify_wd = -1;
1014         }
1015 }
1016
1017 void unit_prune_cgroup(Unit *u) {
1018         int r;
1019         bool is_root_slice;
1020
1021         assert(u);
1022
1023         /* Removes the cgroup, if empty and possible, and stops watching it. */
1024
1025         if (!u->cgroup_path)
1026                 return;
1027
1028         is_root_slice = unit_has_name(u, SPECIAL_ROOT_SLICE);
1029
1030         r = cg_trim_everywhere(u->manager->cgroup_supported, u->cgroup_path, !is_root_slice);
1031         if (r < 0) {
1032                 log_debug_errno(r, "Failed to destroy cgroup %s, ignoring: %m", u->cgroup_path);
1033                 return;
1034         }
1035
1036         if (is_root_slice)
1037                 return;
1038
1039         unit_release_cgroup(u);
1040
1041         u->cgroup_realized = false;
1042         u->cgroup_realized_mask = 0;
1043 }
1044
1045 int unit_search_main_pid(Unit *u, pid_t *ret) {
1046         _cleanup_fclose_ FILE *f = NULL;
1047         pid_t pid = 0, npid, mypid;
1048         int r;
1049
1050         assert(u);
1051         assert(ret);
1052
1053         if (!u->cgroup_path)
1054                 return -ENXIO;
1055
1056         r = cg_enumerate_processes(SYSTEMD_CGROUP_CONTROLLER, u->cgroup_path, &f);
1057         if (r < 0)
1058                 return r;
1059
1060         mypid = getpid();
1061         while (cg_read_pid(f, &npid) > 0)  {
1062                 pid_t ppid;
1063
1064                 if (npid == pid)
1065                         continue;
1066
1067                 /* Ignore processes that aren't our kids */
1068                 if (get_parent_of_pid(npid, &ppid) >= 0 && ppid != mypid)
1069                         continue;
1070
1071                 if (pid != 0)
1072                         /* Dang, there's more than one daemonized PID
1073                         in this group, so we don't know what process
1074                         is the main process. */
1075
1076                         return -ENODATA;
1077
1078                 pid = npid;
1079         }
1080
1081         *ret = pid;
1082         return 0;
1083 }
1084
1085 static int unit_watch_pids_in_path(Unit *u, const char *path) {
1086         _cleanup_closedir_ DIR *d = NULL;
1087         _cleanup_fclose_ FILE *f = NULL;
1088         int ret = 0, r;
1089
1090         assert(u);
1091         assert(path);
1092
1093         r = cg_enumerate_processes(SYSTEMD_CGROUP_CONTROLLER, path, &f);
1094         if (r < 0)
1095                 ret = r;
1096         else {
1097                 pid_t pid;
1098
1099                 while ((r = cg_read_pid(f, &pid)) > 0) {
1100                         r = unit_watch_pid(u, pid);
1101                         if (r < 0 && ret >= 0)
1102                                 ret = r;
1103                 }
1104
1105                 if (r < 0 && ret >= 0)
1106                         ret = r;
1107         }
1108
1109         r = cg_enumerate_subgroups(SYSTEMD_CGROUP_CONTROLLER, path, &d);
1110         if (r < 0) {
1111                 if (ret >= 0)
1112                         ret = r;
1113         } else {
1114                 char *fn;
1115
1116                 while ((r = cg_read_subgroup(d, &fn)) > 0) {
1117                         _cleanup_free_ char *p = NULL;
1118
1119                         p = strjoin(path, "/", fn, NULL);
1120                         free(fn);
1121
1122                         if (!p)
1123                                 return -ENOMEM;
1124
1125                         r = unit_watch_pids_in_path(u, p);
1126                         if (r < 0 && ret >= 0)
1127                                 ret = r;
1128                 }
1129
1130                 if (r < 0 && ret >= 0)
1131                         ret = r;
1132         }
1133
1134         return ret;
1135 }
1136
1137 int unit_watch_all_pids(Unit *u) {
1138         assert(u);
1139
1140         /* Adds all PIDs from our cgroup to the set of PIDs we
1141          * watch. This is a fallback logic for cases where we do not
1142          * get reliable cgroup empty notifications: we try to use
1143          * SIGCHLD as replacement. */
1144
1145         if (!u->cgroup_path)
1146                 return -ENOENT;
1147
1148         if (cg_unified() > 0) /* On unified we can use proper notifications */
1149                 return 0;
1150
1151         return unit_watch_pids_in_path(u, u->cgroup_path);
1152 }
1153
1154 int unit_notify_cgroup_empty(Unit *u) {
1155         int r;
1156
1157         assert(u);
1158
1159         if (!u->cgroup_path)
1160                 return 0;
1161
1162         r = cg_is_empty_recursive(SYSTEMD_CGROUP_CONTROLLER, u->cgroup_path);
1163         if (r <= 0)
1164                 return r;
1165
1166         unit_add_to_gc_queue(u);
1167
1168         if (UNIT_VTABLE(u)->notify_cgroup_empty)
1169                 UNIT_VTABLE(u)->notify_cgroup_empty(u);
1170
1171         return 0;
1172 }
1173
1174 static int on_cgroup_inotify_event(sd_event_source *s, int fd, uint32_t revents, void *userdata) {
1175         Manager *m = userdata;
1176
1177         assert(s);
1178         assert(fd >= 0);
1179         assert(m);
1180
1181         for (;;) {
1182                 union inotify_event_buffer buffer;
1183                 struct inotify_event *e;
1184                 ssize_t l;
1185
1186                 l = read(fd, &buffer, sizeof(buffer));
1187                 if (l < 0) {
1188                         if (errno == EINTR || errno == EAGAIN)
1189                                 return 0;
1190
1191                         return log_error_errno(errno, "Failed to read control group inotify events: %m");
1192                 }
1193
1194                 FOREACH_INOTIFY_EVENT(e, buffer, l) {
1195                         Unit *u;
1196
1197                         if (e->wd < 0)
1198                                 /* Queue overflow has no watch descriptor */
1199                                 continue;
1200
1201                         if (e->mask & IN_IGNORED)
1202                                 /* The watch was just removed */
1203                                 continue;
1204
1205                         u = hashmap_get(m->cgroup_inotify_wd_unit, INT_TO_PTR(e->wd));
1206                         if (!u) /* Not that inotify might deliver
1207                                  * events for a watch even after it
1208                                  * was removed, because it was queued
1209                                  * before the removal. Let's ignore
1210                                  * this here safely. */
1211                                 continue;
1212
1213                         (void) unit_notify_cgroup_empty(u);
1214                 }
1215         }
1216 }
1217 #endif // 0
1218
1219 int manager_setup_cgroup(Manager *m) {
1220         _cleanup_free_ char *path = NULL;
1221         CGroupController c;
1222         int r, unified;
1223         char *e;
1224
1225         assert(m);
1226
1227         /* 1. Determine hierarchy */
1228         m->cgroup_root = mfree(m->cgroup_root);
1229         r = cg_pid_get_path(SYSTEMD_CGROUP_CONTROLLER, 0, &m->cgroup_root);
1230         if (r < 0)
1231                 return log_error_errno(r, "Cannot determine cgroup we are running in: %m");
1232
1233 /// elogind does not support systemd scopes and slices
1234 #if 0
1235         /* Chop off the init scope, if we are already located in it */
1236         e = endswith(m->cgroup_root, "/" SPECIAL_INIT_SCOPE);
1237
1238         /* LEGACY: Also chop off the system slice if we are in
1239          * it. This is to support live upgrades from older systemd
1240          * versions where PID 1 was moved there. Also see
1241          * cg_get_root_path(). */
1242         if (!e) {
1243                 e = endswith(m->cgroup_root, "/" SPECIAL_SYSTEM_SLICE);
1244                 if (!e)
1245                         e = endswith(m->cgroup_root, "/system"); /* even more legacy */
1246         }
1247         if (e)
1248                 *e = 0;
1249 #endif // 0
1250
1251         /* And make sure to store away the root value without trailing
1252          * slash, even for the root dir, so that we can easily prepend
1253          * it everywhere. */
1254         while ((e = endswith(m->cgroup_root, "/")))
1255                 *e = 0;
1256         log_debug_elogind("Cgroup Controller \"%s\" -> root \"%s\"",
1257                           SYSTEMD_CGROUP_CONTROLLER, m->cgroup_root);
1258
1259         /* 2. Show data */
1260         r = cg_get_path(SYSTEMD_CGROUP_CONTROLLER, m->cgroup_root, NULL, &path);
1261         if (r < 0)
1262                 return log_error_errno(r, "Cannot find cgroup mount point: %m");
1263
1264         unified = cg_unified();
1265         if (unified < 0)
1266                 return log_error_errno(r, "Couldn't determine if we are running in the unified hierarchy: %m");
1267         if (unified > 0)
1268                 log_debug("Unified cgroup hierarchy is located at %s.", path);
1269         else
1270                 log_debug("Using cgroup controller " SYSTEMD_CGROUP_CONTROLLER ". File system hierarchy is at %s.", path);
1271
1272         if (!m->test_run) {
1273                 const char *scope_path;
1274
1275                 /* 3. Install agent */
1276                 if (unified) {
1277
1278                         /* In the unified hierarchy we can can get
1279                          * cgroup empty notifications via inotify. */
1280 /// elogind does not support the unified hierarchy, yet.
1281 #if 0
1282                         m->cgroup_inotify_event_source = sd_event_source_unref(m->cgroup_inotify_event_source);
1283                         safe_close(m->cgroup_inotify_fd);
1284
1285                         m->cgroup_inotify_fd = inotify_init1(IN_NONBLOCK|IN_CLOEXEC);
1286                         if (m->cgroup_inotify_fd < 0)
1287                                 return log_error_errno(errno, "Failed to create control group inotify object: %m");
1288
1289                         r = sd_event_add_io(m->event, &m->cgroup_inotify_event_source, m->cgroup_inotify_fd, EPOLLIN, on_cgroup_inotify_event, m);
1290                         if (r < 0)
1291                                 return log_error_errno(r, "Failed to watch control group inotify object: %m");
1292
1293                         r = sd_event_source_set_priority(m->cgroup_inotify_event_source, SD_EVENT_PRIORITY_IDLE - 5);
1294                         if (r < 0)
1295                                 return log_error_errno(r, "Failed to set priority of inotify event source: %m");
1296
1297                         (void) sd_event_source_set_description(m->cgroup_inotify_event_source, "cgroup-inotify");
1298
1299 #else
1300                         return log_error_errno(EOPNOTSUPP, "Unified cgroup hierarchy not supported: %m");
1301 #endif // 0
1302                 } else if (m->running_as == MANAGER_SYSTEM) {
1303                         /* On the legacy hierarchy we only get
1304                          * notifications via cgroup agents. (Which
1305                          * isn't really reliable, since it does not
1306                          * generate events when control groups with
1307                          * children run empty. */
1308
1309                         r = cg_install_release_agent(SYSTEMD_CGROUP_CONTROLLER, ELOGIND_CGROUP_AGENT_PATH);
1310                         if (r < 0)
1311                                 log_warning_errno(r, "Failed to install release agent, ignoring: %m");
1312                         else if (r > 0)
1313                                 log_debug("Installed release agent.");
1314                         else if (r == 0)
1315                                 log_debug("Release agent already installed.");
1316                 }
1317
1318 /// elogind is not meant to run in systemd init scope
1319 #if 0
1320                 /* 4. Make sure we are in the special "init.scope" unit in the root slice. */
1321                 scope_path = strjoina(m->cgroup_root, "/" SPECIAL_INIT_SCOPE);
1322                 r = cg_create_and_attach(SYSTEMD_CGROUP_CONTROLLER, scope_path, 0);
1323 #else
1324                 if (streq(SYSTEMD_CGROUP_CONTROLLER, "name=elogind"))
1325                         // we are our own cgroup controller
1326                         scope_path = strjoina("");
1327                 else if (streq(m->cgroup_root, "/elogind"))
1328                         // root already is our cgroup
1329                         scope_path = strjoina(m->cgroup_root);
1330                 else
1331                         // we have to create our own group
1332                         scope_path = strjoina(m->cgroup_root, "/elogind");
1333                 r = cg_create_and_attach(SYSTEMD_CGROUP_CONTROLLER, scope_path, 0);
1334 #endif // 0
1335                 if (r < 0)
1336                         return log_error_errno(r, "Failed to create %s control group: %m", scope_path);
1337                 log_debug_elogind("Created control group \"%s\"", scope_path);
1338
1339                 /* also, move all other userspace processes remaining
1340                  * in the root cgroup into that scope. */
1341                 if (!streq(m->cgroup_root, scope_path)) {
1342                         r = cg_migrate(SYSTEMD_CGROUP_CONTROLLER, m->cgroup_root, SYSTEMD_CGROUP_CONTROLLER, scope_path, false);
1343                         if (r < 0)
1344                                 log_warning_errno(r, "Couldn't move remaining userspace processes, ignoring: %m");
1345                 }
1346
1347                 /* 5. And pin it, so that it cannot be unmounted */
1348                 safe_close(m->pin_cgroupfs_fd);
1349                 m->pin_cgroupfs_fd = open(path, O_RDONLY|O_CLOEXEC|O_DIRECTORY|O_NOCTTY|O_NONBLOCK);
1350                 if (m->pin_cgroupfs_fd < 0)
1351                         return log_error_errno(errno, "Failed to open pin file: %m");
1352
1353                 /* 6.  Always enable hierarchical support if it exists... */
1354                 if (!unified)
1355                         (void) cg_set_attribute("memory", "/", "memory.use_hierarchy", "1");
1356         }
1357
1358         /* 7. Figure out which controllers are supported */
1359         r = cg_mask_supported(&m->cgroup_supported);
1360         if (r < 0)
1361                 return log_error_errno(r, "Failed to determine supported controllers: %m");
1362
1363         for (c = 0; c < _CGROUP_CONTROLLER_MAX; c++)
1364                 log_debug("Controller '%s' supported: %s", cgroup_controller_to_string(c), yes_no(m->cgroup_supported & c));
1365
1366         return 0;
1367 }
1368
1369 void manager_shutdown_cgroup(Manager *m, bool delete) {
1370         assert(m);
1371
1372         /* We can't really delete the group, since we are in it. But
1373          * let's trim it. */
1374         if (delete && m->cgroup_root)
1375                 (void) cg_trim(SYSTEMD_CGROUP_CONTROLLER, m->cgroup_root, false);
1376
1377 /// elogind does not support the unified hierarchy, yet.
1378 #if 0
1379         m->cgroup_inotify_wd_unit = hashmap_free(m->cgroup_inotify_wd_unit);
1380
1381         m->cgroup_inotify_event_source = sd_event_source_unref(m->cgroup_inotify_event_source);
1382         m->cgroup_inotify_fd = safe_close(m->cgroup_inotify_fd);
1383 #endif // 0
1384
1385         m->pin_cgroupfs_fd = safe_close(m->pin_cgroupfs_fd);
1386
1387         m->cgroup_root = mfree(m->cgroup_root);
1388 }
1389
1390 /// UNNEEDED by elogind
1391 #if 0
1392 Unit* manager_get_unit_by_cgroup(Manager *m, const char *cgroup) {
1393         char *p;
1394         Unit *u;
1395
1396         assert(m);
1397         assert(cgroup);
1398
1399         u = hashmap_get(m->cgroup_unit, cgroup);
1400         if (u)
1401                 return u;
1402
1403         p = strdupa(cgroup);
1404         for (;;) {
1405                 char *e;
1406
1407                 e = strrchr(p, '/');
1408                 if (!e || e == p)
1409                         return hashmap_get(m->cgroup_unit, SPECIAL_ROOT_SLICE);
1410
1411                 *e = 0;
1412
1413                 u = hashmap_get(m->cgroup_unit, p);
1414                 if (u)
1415                         return u;
1416         }
1417 }
1418
1419 Unit *manager_get_unit_by_pid_cgroup(Manager *m, pid_t pid) {
1420         _cleanup_free_ char *cgroup = NULL;
1421         int r;
1422
1423         assert(m);
1424
1425         if (pid <= 0)
1426                 return NULL;
1427
1428         r = cg_pid_get_path(SYSTEMD_CGROUP_CONTROLLER, pid, &cgroup);
1429         if (r < 0)
1430                 return NULL;
1431
1432         return manager_get_unit_by_cgroup(m, cgroup);
1433 }
1434
1435 Unit *manager_get_unit_by_pid(Manager *m, pid_t pid) {
1436         Unit *u;
1437
1438         assert(m);
1439
1440         if (pid <= 0)
1441                 return NULL;
1442
1443         if (pid == 1)
1444                 return hashmap_get(m->units, SPECIAL_INIT_SCOPE);
1445
1446         u = hashmap_get(m->watch_pids1, PID_TO_PTR(pid));
1447         if (u)
1448                 return u;
1449
1450         u = hashmap_get(m->watch_pids2, PID_TO_PTR(pid));
1451         if (u)
1452                 return u;
1453
1454         return manager_get_unit_by_pid_cgroup(m, pid);
1455 }
1456
1457 int manager_notify_cgroup_empty(Manager *m, const char *cgroup) {
1458         Unit *u;
1459
1460         assert(m);
1461         assert(cgroup);
1462
1463         u = manager_get_unit_by_cgroup(m, cgroup);
1464         if (!u)
1465                 return 0;
1466
1467         return unit_notify_cgroup_empty(u);
1468 }
1469
1470 int unit_get_memory_current(Unit *u, uint64_t *ret) {
1471         _cleanup_free_ char *v = NULL;
1472         int r;
1473
1474         assert(u);
1475         assert(ret);
1476
1477         if (!u->cgroup_path)
1478                 return -ENODATA;
1479
1480         if ((u->cgroup_realized_mask & CGROUP_MASK_MEMORY) == 0)
1481                 return -ENODATA;
1482
1483         if (cg_unified() <= 0)
1484                 r = cg_get_attribute("memory", u->cgroup_path, "memory.usage_in_bytes", &v);
1485         else
1486                 r = cg_get_attribute("memory", u->cgroup_path, "memory.current", &v);
1487         if (r == -ENOENT)
1488                 return -ENODATA;
1489         if (r < 0)
1490                 return r;
1491
1492         return safe_atou64(v, ret);
1493 }
1494
1495 static int unit_get_cpu_usage_raw(Unit *u, nsec_t *ret) {
1496         _cleanup_free_ char *v = NULL;
1497         uint64_t ns;
1498         int r;
1499
1500         assert(u);
1501         assert(ret);
1502
1503         if (!u->cgroup_path)
1504                 return -ENODATA;
1505
1506         if ((u->cgroup_realized_mask & CGROUP_MASK_CPUACCT) == 0)
1507                 return -ENODATA;
1508
1509         r = cg_get_attribute("cpuacct", u->cgroup_path, "cpuacct.usage", &v);
1510         if (r == -ENOENT)
1511                 return -ENODATA;
1512         if (r < 0)
1513                 return r;
1514
1515         r = safe_atou64(v, &ns);
1516         if (r < 0)
1517                 return r;
1518
1519         *ret = ns;
1520         return 0;
1521 }
1522
1523 int unit_get_cpu_usage(Unit *u, nsec_t *ret) {
1524         nsec_t ns;
1525         int r;
1526
1527         r = unit_get_cpu_usage_raw(u, &ns);
1528         if (r < 0)
1529                 return r;
1530
1531         if (ns > u->cpuacct_usage_base)
1532                 ns -= u->cpuacct_usage_base;
1533         else
1534                 ns = 0;
1535
1536         *ret = ns;
1537         return 0;
1538 }
1539
1540 int unit_reset_cpu_usage(Unit *u) {
1541         nsec_t ns;
1542         int r;
1543
1544         assert(u);
1545
1546         r = unit_get_cpu_usage_raw(u, &ns);
1547         if (r < 0) {
1548                 u->cpuacct_usage_base = 0;
1549                 return r;
1550         }
1551
1552         u->cpuacct_usage_base = ns;
1553         return 0;
1554 }
1555
1556 bool unit_cgroup_delegate(Unit *u) {
1557         CGroupContext *c;
1558
1559         assert(u);
1560
1561         c = unit_get_cgroup_context(u);
1562         if (!c)
1563                 return false;
1564
1565         return c->delegate;
1566 }
1567
1568 static const char* const cgroup_device_policy_table[_CGROUP_DEVICE_POLICY_MAX] = {
1569         [CGROUP_AUTO] = "auto",
1570         [CGROUP_CLOSED] = "closed",
1571         [CGROUP_STRICT] = "strict",
1572 };
1573
1574 DEFINE_STRING_TABLE_LOOKUP(cgroup_device_policy, CGroupDevicePolicy);
1575 #endif // 0